////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
getContrib.py
# -*- coding: utf-8 -*-
import wikipedia
import sgmllib
class getContrib(sgmllib.SGMLParser):
"A simple parser class."
def parse(self, s):
"Parse the given string 's'."
self.cpt_div = -1
self.open_li = 0
self.cpt_a = 0
self.href = ''
self.feed(s)
self.close()
def __init__(self, user, step=5000, condition=''):
"Initialise an object, passing 'verbose' to the superclass."
sgmllib.SGMLParser.__init__(self)
self.site = wikipedia.getSite()
self.name = user
self.unique_page = {}
self.condition = condition
self.data = ''
self.address = self.site.contribs_address(self.name,limit=step)
self.do(self.address)
def do(self, address):
self.data = self.site.getUrl(address)
self.address = ''
self.parse(self.data)
if address != self.address:
self.do(self.address)
def start_li(self, attributes):
if self.cpt_div == 0:
self.open_li = 1
self.cpt_a = 0
def end_li(self):
if self.cpt_div == 1:
self.open_li = 0
def start_a(self, attributes):
self.cpt_a += 1
sp = 0
href = ''
for name, value in attributes:
if name == "href" and value.find("dir=prev") == -1 and value.find("offset") != -1:
href = value
if name == "title":
Ok = 1
if self.cpt_a == 3:
Ok = Ok * 1
else:
Ok = 0
if self.condition == '' or self.data.find(self.condition) != -1:
Ok = Ok * 1
else:
Ok = 0
if Ok == 1:
print u"Page : %s" % value
if self.unique_page.has_key(value):
self.unique_page[value] += 1
else:
self.unique_page[value] = 1
if value == "Special:Contributions":
sp = 1
if sp == 1 and href != '' and self.address == '':
self.address = href
def start_div(self, attributes):
if self.cpt_div == -1:
for name, value in attributes:
if name == "id":
if value.find("bodyContent") != -1:
self.cpt_div = 0
else:
self.cpt_div += 1
def end_div(self):
if self.cpt_div > -1:
self.cpt_div -= 1
def write_unique_page(self, NomFic, write_num=0):
f = open(NomFic, "wt")
print "nb_elem = %d" % len(self.unique_page)
for k, v in self.unique_page.iteritems():
P = wikipedia.Page(self.site, k)
f.write(P.urlname())
if write_num == 1:
f.write(" : %d" % v)
f.write("\n")
f.close();
def main():
    """Scan ILJR's contributions filtered on the Legifrance template and
    dump the unique page names (without counts) to UCL.txt."""
    contribs = getContrib("ILJR", 500, u"Mod%C3%A8le:L%C3%A9gifrance")
    contribs.write_unique_page("UCL.txt", 0)
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always release pywikipedia's throttle/lock, even when main() fails.
        wikipedia.stopme()
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
getPagesLiees.py
# -*- coding: utf-8 -*-
import wikipedia
def main():
    """Write the URL-encoded names of every page that transcludes
    Modele:Legifrance on the current wiki to modelelegifrance.txt,
    one name per line."""
    nomDePage = u"Mod%C3%A8le:L%C3%A9gifrance"
    site = wikipedia.getSite()
    pageL = wikipedia.Page(site, nomDePage)
    f = open("modelelegifrance.txt", "wt")
    try:
        # onlyTemplateInclusion: keep transclusions only, not plain links.
        for page in pageL.getReferences(follow_redirects=False, onlyTemplateInclusion=True):
            f.write(page.urlname() + "\n")
    finally:
        # BUG FIX: close the file even if getReferences()/urlname() raises.
        f.close()
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always release pywikipedia's throttle/lock, even when main() fails.
        wikipedia.stopme()
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ModeleLegifrance.py
# -*- coding: utf-8 -*-
import datetime
import time
import urllib
import sgmllib
class ParseOldURL(sgmllib.SGMLParser):
"A simple parser class."
def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()
def __init__(self, url_l, cherche_old=0, print_s=0, verbose=0):
"Initialise an object, passing 'verbose' to the superclass."
self.newURL = ''
self.oldURL = 0
self.trouve = 0
self.cidTexte = ''
self.idTexte = ''
opener = urllib.FancyURLopener({})
try:
urlopener = opener.open(url_l)
self.newURL = urlopener.geturl()
if self.is_newURL_OK() == 0:
print "KO : " + url_l + " -> " + self.newURL + " KO"
# print "wait 1 second"
# time.sleep(1)
# print "try again"
urlopener2 = opener.open(url_l)
self.newURL = urlopener2.geturl()
dateNow = datetime.date.today()
self.s_dateNow = "%04d%02d%02d" % (dateNow.year, dateNow.month, dateNow.day)
start_idTexte = self.newURL.find("dateTexte="+self.s_dateNow)
if start_idTexte > -1:
end_idTexte = self.newURL.find('&', start_idTexte+1)
if end_idTexte == -1:
end_idTexte = len(self.newURL)
self.newURL = self.newURL[:start_idTexte]+"dateTexte=vig"+self.newURL[end_idTexte:]
if cherche_old == 1:
sgmllib.SGMLParser.__init__(self, verbose)
s = urlopener.read()
self.parse(s)
if print_s == 1:
print s
except UnicodeError:
print "URL " + repr(url_l) + " contains non-ASCII characters"
def start_a(self, attributes):
if self.trouve == 0:
if self.oldURL > 0:
for name, value in attributes:
if name == "href":
self.newURL = self.newURL + value
self.trouve = 1
def start_div(self, attributes):
if self.trouve == 0:
if self.oldURL == 0:
for name, value in attributes:
if name == "id":
if value.find("oldURL") != -1:
self.oldURL = 1
else :
self.oldURL = self.oldURL + 1
def end_div(self):
if self.trouve == 0:
if self.oldURL > 0:
self.oldURL = self.oldURL - 1
def is_newURL_OK(self):
if len(self.get_cidTexte())+len(self.get_idTexte()) > 0:
return 1
else:
return 0
def get_newURL(self):
return self.newURL
def get_param(self, param):
_idTexte = param + '='
start_idTexte = self.newURL.find(_idTexte)
if start_idTexte > -1:
end_idTexte = self.newURL.find('&', start_idTexte+len(_idTexte))
if end_idTexte == -1:
end_idTexte = len(self.newURL)
return self.newURL[start_idTexte+len(_idTexte):end_idTexte]
return ''
def get_cidTexte(self):
if len(self.cidTexte) == 0:
self.cidTexte = self.get_param('cidTexte').strip()
return self.cidTexte
def get_idTexte(self):
if len(self.idTexte) == 0:
self.idTexte = self.get_param('idTexte').strip()
return self.idTexte
def get_idArticle(self):
return self.get_param('idArticle')
def get_dateTexte(self):
s_dateTexte = self.get_param('dateTexte')
if self.s_dateNow == s_dateTexte:
return "vig"
else:
return s_dateTexte
class ModeleLegifrance:
    """Parsed representation of one {{Legifrance}} template call.

    The constructor extracts the template parameters (named or positional)
    from the wikitext `ml`; doOldURL() then rebuilds the legacy
    legifrance.gouv.fr URL from the base/numero pair.
    """

    def __init__(self, ml, nomModele=u"Légifrance", test=0):
        """Parse the template call contained in wikitext `ml`.

        ml        -- wikitext starting at (or containing) the template call
        nomModele -- template name expected in the first cell
        test      -- 1 to print each parameter as it is parsed
        """
        # Maps a template base alias to the Legifrance database/code
        # identifier used when rebuilding the old URL.
        self.Code = {
            u"constitution" : "constitution",
            u"CASS" : "CASS",
            u"INCA" : "INCA",
            u"JADE" : "JADE",
            u"CONSTIT" : "CONSTIT",
            u"LEGI" : "LEGI",
            u"LEX" : "LEX",
            u"LEX_SIMPLE_AV90" : "LEX_SIMPLE_AV90",
            u"avant90" : "LEX_SIMPLE_AV90",
            u"consolidé" : "texteconsolide",
            u"texteconsolide" : "texteconsolide",
            u"consolide" : "texteconsolide",
            u"JORF" : "JORF",
            u"CC" : "CCIVILL0",
            u"CCIVILL0" : "CCIVILL0",
            u"CCOM" : "CCOMMERL",
            u"COM" : "CCOMMERL",
            u"CCOM(R)" : "CCOMMERM",
            u"COM(R)" : "CCOMMERM",
            u"CGCT" : "CGCTERRL",
            u"CGCT(R)" : "CGCTERRM",
            u"CEDU" : "CEDUCATL",
            u"CEDU(R)" : "CEDUCATM",
            u"CELE" : "CELECTOL",
            u"CELE(R)" : "CELECTOM",
            u"CESEDA(L)" : "CENTGERL",
            u"CESEDA(R)" : "CENTGERM",
            u"CE" : "CENVIROL",
            u"CE(R)" : "CENVIROM",
            u"CJA" : "CJUSADML",
            u"CJA(R)" : "CJUSADMR",
            u"CJF(L)" : "CJURFINL",
            u"CJF(R)" : "CJURFINR",
            u"CMONFIL" : "CMONFIL",
            u"CMONFIR" : "CMONFIR",
            u"COJ(L)" : "CORGJUDL",
            u"COJ(R)" : "CORGJUDR",
            u"COJ" : "CORGJUNL",
            u"CPAT" : "CPATRIML",
            u"CP" : "CPENALLL",
            u"CPOSTESL" : "CPOSTESL",
            u"CPOSTESR" : "CPOSTESR",
            u"CP(R)" : "CPENALLR",
            u"CPC" : "CPROCIA0",
            u"NCPC" : "CPROCIV0",
            u"CPROCIV0" : "CPROCIV0",
            u"CPP" : "CPROCPEL",
            u"CPP(R)" : "CPROCPER",
            u"CPP(D)" : "CPROCPED",
            u"CPP(A)" : "CPROCPEA",
            u"CGPPP" : "CGPROPPL",
            u"CPROINTL" : "CPROINTL",
            u"CPROINTR" : "CPROINTR",
            u"CPI" : "CPROINTL",
            u"CESEDA" : "CENTGERL",
            u"CRO" : "CROUTENL",
            u"CRO(R)" : "CROUTENM",
            u"CR" : "CRURALNL",
            u"CR(R)" : "CRURALNM",
            u"CSP" : "CSANPUNL",
            u"CSP(NR)" : "CSANPUNR",
            u"CSP(L)" : "CSANPUBL",
            u"CSP(R)" : "CSANPUBR",
            u"CSS(L)" : "CSECSOCL",
            u"CSS(D)" : "CSECSOCD",
            u"CSS(R)" : "CSECSOCR",
            u"CT(NL)" : "CTRAVANL",
            u"CT" : "CTRAVAIL",
            u"CT(R)" : "CTRAVAIR",
            u"CT(D)" : "CTRAVAID",
            u"CONSO" : "CCONSOML",
            u"CONSO(R)" : "CCONSOMR",
            u"CONSO(D)" : "CCONSOMD",
            u"URBA(L)" : "CURBANIL",
            u"URBA(R)" : "CURBANIR",
            u"CGI" : "CGIMPO00",
            u"CGLIVP" : "CGLIVPFL",
            u"CGLIVPFM" : "CGLIVPFM",
            u"CGLIVPFA" : "CGLIVPFA",
            u"ASS" : "CASSURAL",
            u"ASS(R)" : "CASSURAM",
            u"ASS(A)" : "CASSURAA",
            u"CDEF" : "CDAFENSL",
            u"CDEF(R)" : "CDAFENSM"
        }
        # Maps a base alias to a small integer used by doOldURL()/get_texte()
        # to pick the URL family: 0 constitution, 1-8 jurisprudence bases,
        # 9-11 consolidated texts, 12 JORF, >= 13 article of a code.
        # NOTE(review): must stay in sync with self.Code above.
        self.iCode = {
            u"constitution" : 0,
            u"CASS" : 1,
            u"INCA" : 2,
            u"JADE" : 3,
            u"CONSTIT" : 4,
            u"LEGI" : 5,
            u"LEX" : 6,
            u"LEX_SIMPLE_AV90" : 7,
            u"avant90" : 8,
            u"consolidé" : 9,
            u"texteconsolide" : 10,
            u"consolide" : 11,
            u"JORF" : 12,
            u"CC" : 13,
            u"CCIVILL0" : 14,
            u"CCOM" : 15,
            u"COM" : 16,
            u"CCOM(R)" : 17,
            u"COM(R)" : 18,
            u"CGCT" : 19,
            u"CGCT(R)" : 20,
            u"CEDU" : 21,
            u"CEDU(R)" : 22,
            u"CELE" : 23,
            u"CELE(R)" : 24,
            u"CESEDA(L)" : 25,
            u"CESEDA(R)" : 26,
            u"CE" : 27,
            u"CE(R)" : 28,
            u"CJA" : 29,
            u"CJA(R)" : 30,
            u"CJF(L)" : 31,
            u"CJF(R)" : 32,
            u"CMONFIL" : 33,
            u"CMONFIR" : 34,
            u"COJ(L)" : 35,
            u"COJ(R)" : 36,
            u"COJ" : 37,
            u"CPAT" : 38,
            u"CP" : 39,
            u"CPOSTESL" : 40,
            u"CPOSTESR" : 41,
            u"CP(R)" : 42,
            u"CPC" : 43,
            u"NCPC" : 44,
            u"CPROCIV0" : 45,
            u"CPP" : 46,
            u"CPP(R)" : 47,
            u"CPP(D)" : 48,
            u"CPP(A)" : 49,
            u"CGPPP" : 50,
            u"CPROINTL" : 51,
            u"CPROINTR" : 52,
            u"CPI" : 53,
            u"CESEDA" : 54,
            u"CRO" : 55,
            u"CRO(R)" : 56,
            u"CR" : 57,
            u"CR(R)" : 58,
            u"CSP" : 59,
            u"CSP(NR)" : 60,
            u"CSP(L)" : 61,
            u"CSP(R)" : 62,
            u"CSS(L)" : 63,
            u"CSS(D)" : 64,
            u"CSS(R)" : 65,
            u"CT(NL)" : 66,
            u"CT" : 67,
            u"CT(R)" : 68,
            u"CT(D)" : 69,
            u"CONSO" : 70,
            u"CONSO(R)" : 71,
            u"CONSO(D)" : 72,
            u"URBA(L)" : 73,
            u"URBA(R)" : 74,
            u"CGI" : 75,
            u"CGLIVP" : 76,
            u"CGLIVPFM" : 77,
            u"CGLIVPFA" : 78,
            u"ASS" : 79,
            u"ASS(R)" : 80,
            u"ASS(A)" : 81,
            u"CDEF" : 82,
            u"CDEF(R)" : 83
        }
        # Parsed template fields (empty string = not supplied).
        self.Modele = ""
        self.Base = ""
        self.Numero = ""
        self.Texte = ""
        self.oldURL = ""
        self.url = ""
        self.cidTexte = ""
        self.idTexte = ""
        self.idArticle = ""
        self.dateTexte = ""
        # Trim the wikitext to the template body: from "{{" up to "}}".
        p = ml.find("{{")
        if p != -1:
            ml = ml[p:]
        p = ml.find("}}")
        if p != -1:
            ml = ml[:p]
        # At most 9 cells: template name plus up to 8 parameters.
        self.listParametres = ml.split("|", 8)
        l = len(self.listParametres)
        if l > 0:
            # Only parse parameters when the first cell really names the
            # expected template.
            p = self.listParametres[0].find(nomModele)
            if p != -1:
                self.Modele = nomModele
                for cpt in range(1, l):
                    p = self.listParametres[cpt].find("=")
                    if test == 1:
                        self.Aff_mess("param[] = " + self.listParametres[cpt])
                    if p > -1:
                        # Named parameter: "name=value".
                        self.put_param(cpt, self.listParametres[cpt][:p], self.listParametres[cpt][p+1:], test)
                    else:
                        # Positional parameter: value only.
                        self.put_param(cpt, "", self.listParametres[cpt], test)

    def put_param(self, num, param, value, test=0):
        """Store template parameter number `num`.

        Named parameters (`param` non-empty) win; otherwise positions 1-3
        map to base, numero, texte respectively.
        """
        value = value.strip()
        if param == "base":
            self.Base = value
        elif param == u"numéro":
            self.Numero = value
        elif param == "texte":
            self.Texte = value
        elif param == "url":
            self.url = value
        elif param == "cidTexte":
            self.cidTexte = value
        elif param == "idTexte":
            self.idTexte = value
        elif param == "idArticle":
            self.idArticle = value
        elif param == "dateTexte":
            self.dateTexte = value
        elif num == 1:
            self.Base = value
        elif num == 2:
            self.Numero = value
        elif num == 3:
            self.Texte = value

    def Aff_mess(self, texte):
        """Print a trace message, tolerating console encoding failures."""
        try:
            print texte.encode("utf-8")
        except UnicodeEncodeError:
            print "impossible d'afficher le message de trace"

    def Debug(self):
        """Dump the parsed fields to the console."""
        self.Aff_mess("Modele = " + self.Modele)
        self.Aff_mess("Base = " + self.get_newBase())
        self.Aff_mess("Numero = " + self.Numero)
        self.Aff_mess("Texte = " + self.Texte)

    def get_newBase(self):
        """Return the canonical Legifrance base for self.Base
        (unknown aliases pass through unchanged)."""
        if self.Base in self.Code:
            return self.Code[self.Base]
        else:
            return self.Base

    def get_texte(self):
        """Return the display text: explicit texte parameter, else the
        numero for code-article bases (iCode >= 13), else ''."""
        if self.Texte == "":
            if self.get_iCode() < 13:
                return ""
            else:
                return self.Numero
        else:
            return self.Texte

    def get_iCode(self):
        """Return the URL-family index for self.Base, or -1 when unknown."""
        if self.Base in self.iCode:
            return self.iCode[self.Base]
        else:
            return -1

    def isOldURL_OK(self):
        """Return 1 when enough data (known base + numero) is present to
        rebuild the legacy URL, else 0."""
        i = self.get_iCode()
        if i > -1:
            if len(self.Numero) > 0:
                return 1
        return 0

    def doOldURL(self, test=0):
        """Rebuild and return the legacy legifrance.gouv.fr URL, or ''
        when base/numero are insufficient.  Also stored in self.oldURL."""
        if self.isOldURL_OK() == 0:
            return ""
        else:
            self.oldURL = "http://www.legifrance.gouv.fr/"
            i = self.get_iCode()
            if i == 0:
                # Constitution: fixed page, no numero appended.
                self.oldURL += "html/constitution/constitution2.htm"
            elif i > 0:
                if i < 9:
                    # Jurisprudence databases (CASS, INCA, JADE, ...).
                    self.oldURL += "WAspad/UnDocument?base=" + self.get_newBase() + "&nod="
                elif i < 12:
                    # Consolidated texts.
                    self.oldURL += "texteconsolide/"
                elif i == 12:
                    # Journal officiel entry.
                    self.oldURL += "WAspad/UnTexteDeJorf?numjo="
                else:
                    # Article of a legal code.
                    self.oldURL += "WAspad/UnArticleDeCode?code=" + self.get_newBase() + ".rcv&art="
                self.oldURL = self.oldURL + self.Numero
            return self.oldURL
#a1 = u"{{Légifrance|base=CASS|numéro=CXRXAX1997X06X06X00236X000|texte=Crim. 17 juin 1997, pourvoi n94-85126, ''Guionnet''|cidTexte=|idArticle=|dateTexte=}}"
#Légifrance|Base=CC|numéro=1116|Texte=Le Texte fondateur du dol en droit civil}}"
#a2 = u"{{Légifrance|base=INCA|numéro=IXRXCX2005X12X06X00813X012|texte=Arrêt de la Cour de Cassation}}"
#a3 = u"{{Légifrance|base=consolidé|numéro=PPEDY.htm}}"
#a4 = u"{{Légifrance|base=avant90|numéro=1LX978742|texte=Loi}}"
#a5 = u"{{Légifrance|CC|1116}}"
#
#m1 = ModeleLegifrance(a1)
#m2 = ModeleLegifrance(a2)
#m3 = ModeleLegifrance(a3)
#m4 = ModeleLegifrance(a4)
#m5 = ModeleLegifrance(a5)
#
#m1.Debug()
#s = m1.doOldURL()
#P = ParseOldURL(s)
#print "oldURL = " + s
#print "newURL = " + P.get_newURL()
#print
#m2.Debug()
#s = m2.doOldURL()
#P = ParseOldURL(s)
#print "oldURL = " + s
#print "newURL = " + P.get_newURL()
#print
#m3.Debug()
#s = m3.doOldURL()
#P = ParseOldURL(s)
#print "oldURL = " + s
#print "newURL = " + P.get_newURL()
#print
#m4.Debug()
#s = m4.doOldURL()
#P = ParseOldURL(s)
#print "oldURL = " + s
#print "newURL = " + P.get_newURL()
#print
#m5.Debug()
#s = m5.doOldURL()
#P = ParseOldURL(s)
#print "oldURL = " + s
#print "newURL = " + P.get_newURL()
#
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
test_PL.py
# -*- coding: iso-8859-1 -*-
import getPagesLiees
from os.path import join
import sys
sys.path = [join(sys.prefix, 'pywikipedia')] + sys.path
url_l = 'http://fr.wikipedia.org/w/index.php'
predata_l = 'title=Special:Pages_li%C3%A9es/Mod%C3%A8le:L%C3%A9gifrance&limit=500&from=0'
P = getPagesLiees(url_l, predata_l, 1)
P.save_list()
print P.nbody
print P.ndiv
print P.nul
print P.nli
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
traitePagesLiees.py
# -*- coding: utf-8 -*-
import wikipedia
import ModeleLegifrance
def save_texte(nom_fichier, texte):
    """Write unicode string `texte` to file `nom_fichier`, UTF-8 encoded,
    truncating any existing content."""
    f = open(nom_fichier, "wt")
    try:
        f.write(texte.encode("utf-8"))
    finally:
        # BUG FIX: close the file even if encode()/write() raises.
        f.close()
def traite_modele(nomModele, old, test=0):
    """Rewrite every {{nomModele}} call found in wikitext `old`.

    For each template occurrence: re-emit it with normalised base/numero,
    add a |url= parameter resolved through ParseOldURL, and append (or
    de-duplicate) a maintenance category when the link cannot be resolved.

    Returns (new_wikitext, modif) where modif is 1 when the page changed.
    """
    categ_KO = u"[[Catégorie:Page avec modèle Légifrance incomplet]]"
    modif = 0
    new = ""
    l = len(old)   # NOTE(review): unused; kept as-is.
    p = 0
    while p > -1:
        p = old.find(u"{{" + nomModele)
        if p == -1:
            # No more template calls: copy the remainder verbatim.
            new += old
        else:
            # Copy text before the template, then work on the template.
            new += old[:p]
            old = old[p:]
            p = old.find("}}")
            if p == -1:
                # Unclosed template: copy as-is and stop rewriting it.
                new += old
            else:
                putCategKO = 0
                a = ModeleLegifrance.ModeleLegifrance(old, nomModele, test)
                new += "{{" + a.Modele
                s = a.doOldURL(test)
                if len(s) > 0:
                    "anciens param糲es OK"
                    # Old-style parameters are usable: rebuild base/numero.
                    new += "|base=" + a.Base
                    if a.get_iCode() == 0:
                        # Constitution has no numero; use the placeholder.
                        new += u"|numéro=vide"
                    else:
                        new += u"|numéro=" + a.Numero
                    # Resolve the legacy URL to its new location.
                    P = ModeleLegifrance.ParseOldURL(s)
                    if a.get_iCode() < 12:
                        if P.is_newURL_OK() == 1:
                            new += "|url=" + P.get_newURL()
                            if P.get_newURL() != a.url:
                                modif = 1
                        if test == 1:
                            print a.get_iCode()
                            print P.get_newURL()
                    if P.is_newURL_OK() == 0:
                        # Resolution failed: flag the page.
                        print "KO"
                        putCategKO = 1
                        print P.get_cidTexte()
                        print P.get_idTexte()
                elif len(a.url) > 0:
                    "anciens param糲es KO mais url pr貥nte"
                    "il faut tester que url est OK"
                    "si url OK, je ne mets que url et pas de message"
                    # No usable old parameters, but an url= is present:
                    # keep it if it still resolves.
                    P = ModeleLegifrance.ParseOldURL(a.url)
                    if P.is_newURL_OK() == 1:
                        new += "|url=" + P.get_newURL()
                        if P.get_newURL() != a.url:
                            modif = 1
                    else:
                        "si url KO, je mets les anciens param糲es et la cat覯rie ad hoc"
                        putCategKO = 1
                        if test == 1:
                            print "KO2"
                            print P.get_cidTexte()
                            print P.get_idTexte()
                else:
                    # Neither old parameters nor url: flag the page.
                    putCategKO = 1
                new += "|texte=" + a.get_texte()
                new += u"}}"
                if putCategKO == 1:
                    # Emit exactly one maintenance category after the
                    # template; reuse an existing one when already present.
                    if old[p+2:p+2+len(categ_KO)] != categ_KO:
                        modif = 1
                    else:
                        p += len(categ_KO)
                    new += categ_KO
                # Strip any (remaining/duplicate) category occurrences from
                # the input so they are not copied over.
                while old[p+2:p+2+len(categ_KO)] == categ_KO:
                    p += len(categ_KO)
                    modif = 1
                # Consume the template (and any skipped categories).
                old = old[p+2:]
    return new, modif
def main(test=0):
listeArticles = []
f = open("test_Jbot_ML.lst", "rt")
listeArticles = f.readlines()
f.close();
nomModele = u"Légifrance"
#nomModele = u"Utilisateur:ILJR/bac_a_sable/ModèleLégifrance"
site = wikipedia.getSite()
for nomDePageURL in listeArticles:
nomDePageURL = nomDePageURL[:len(nomDePageURL)-1]
pageL = wikipedia.Page(site, nomDePageURL)
if pageL.exists():
if not pageL.isRedirectPage():
if pageL.botMayEdit():
print "***** Article = " + nomDePageURL + " *****"
old = pageL.get()
new, modif = traite_modele(nomModele, old, test)
#if len(new) == 0:
# new = traite_modele(u"légifrance", old)
#else:
# new = traite_modele(u"légifrance", new)
if new != old and modif == 1:
if test == 1:
save_texte("ref/" + nomDePageURL, old)
save_texte("new/" + nomDePageURL, new)
else:
pageL.put(new, u"Traitement du nouveau paramètre du [[Modèle:Légifrance]] + test liens HS")
if __name__ == "__main__":
    try:
        # Run in test mode (1): dump diffs to ref/ and new/ instead of saving.
        main(1)
    finally:
        # Always release pywikipedia's throttle/lock, even when main() fails.
        wikipedia.stopme()
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
user-config.py
# pywikipedia bot configuration (user-config.py).
# Language code of the wiki the bot works on.
mylang = 'fr'
# Bot account per family/language; the `usernames` dict is predefined by
# pywikipedia's config machinery before this file is executed.
usernames['wikipedia']['fr'] = 'Jbot'
# Encoding used for console output.
console_encoding = 'utf-8'
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////