# -*- coding: utf-8 -*-
#
# (C) Rob W.W. Hooft, 2004
# (C) Daniel Herding, 2004
# Modified by Dake.
#
__version__ = '$Id: category.py,v 1.107 2006/04/30 18:38:49 cydeweys Exp $'
#
# Distributed under the terms of the MIT license.
#
import re, sys, pickle, bz2
import wikipedia, catlib, config, pagegenerators
# Summary messages
# Edit summaries for adding a category, keyed by language code.
# add_category() picks one via wikipedia.translate() and substitutes the
# title of the new category for %s.
msg_add = {
    'da': u'Robot: Tilføjer [[%s]]',
    'de': u'Bot: Ergänze [[%s]]',
    'en': u'Robot: Adding [[%s]]',
    'es': u'Bot: Añadida [[%s]]',
    'fi': u'Botti lisäsi luokkaan [[%s]]',
    'fr': u'Robot : ajoute [[%s]]',
    'ia': u'Robot: Addition de [[%s]]',
    'is': u'Vélmenni: Bæti við [[%s]]',
    'no': u'Robot: Legger til [[%s]]',
    'pt': u'Bot: Adicionando [[Categoria:%s]]',
    'sr': u'Бот: Додаје [[Категорија:%s]]',
}
# Edit summaries for changing a category, keyed by language code; each one
# takes a single %s substitution.
# NOTE(review): not referenced in the visible part of this file - presumably
# used by a category-moving robot elsewhere; confirm before removing.
msg_change = {
    'da': u'Robot: Ændrer %s',
    'de': u'Bot: Ändere %s',
    'en': u'Robot: Changing %s',
    'es': u'Bot: Cambiada %s',
    'fi': u'Botti vaihtoi luokan %s',
    'fr': u'Robot : modifie %s',
    'ia': u'Robot: Modification de %s',
    'is': u'Vélmenni: Breyti flokknum [[%s]]',
    'nl': u'Bot: Wijziging %s',
    'no': u'Robot: Endrer %s',
    'pt': u'Bot: Modificando [[Categoria:%s]]',
    'sr': u'Бот: Измена категорије %s',
}
# Deletion reasons for a category that has been moved, keyed by language
# code; each one takes a single %s substitution.
# NOTE(review): not referenced in the visible part of this file; what %s is
# filled with (presumably the new category title) must be checked at the caller.
deletion_reason_move = {
    'de': u'Bot: Kategorie wurde nach %s verschoben',
    'en': u'Robot: Category was moved to %s',
    'fr': u'Robot : catégorie déplacée sur %s',
    'ia': u'Robot: Categoria transferite a %s',
    'no': u'Robot: Kategorien ble flyttet til %s',
    'pt': u'Bot: Categoria %s foi movida',
    'sr': u'Бот: Категорија премештена у %s',
}
# Template names per language code.
# NOTE(review): not referenced in the visible part of this file; judging by
# the names these look like "categories for deletion/renaming" templates -
# confirm at the point of use.
cfd_templates = {
    'en': [
        'cfd',
        'cfr',
        'cfru',
        'cfr-speedy',
        'cfm',
        'cfdu',
    ],
}
class CategoryDatabase:
    '''
    This is a temporary knowledge base saving for each category the contained
    subcategories and articles, so that category pages don't need to
    be loaded over and over again.

    Attributes:
        * catContentDB - dictionary mapping a category to a 2-tuple
                         (list of subcategories, list of articles)
        * superclassDB - dictionary mapping a category to the list of its
                         supercategories
    '''

    def __init__(self, rebuild = False, filename = 'category.dump.bz2'):
        '''
        Parameters:
            * rebuild  - If True, start with empty dictionaries and do not
                         read the dump file.
            * filename - The bz2-compressed pickle file written by dump().

        If the dump cannot be read for any reason (missing file, corrupt
        data, missing keys), the database silently starts out empty.
        '''
        if rebuild:
            self.rebuild()
        else:
            try:
                f = bz2.BZ2File(filename, 'r')
                try:
                    wikipedia.output(u'Reading dump from %s' % filename)
                    # NOTE(security): unpickling executes code embedded in
                    # the file; only load dumps written by this bot itself.
                    databases = pickle.load(f)
                finally:
                    # close the file even when unpickling fails
                    f.close()
                # keys are categories, values are 2-tuples with lists as entries.
                self.catContentDB = databases['catContentDB']
                # like the above, but for supercategories
                self.superclassDB = databases['superclassDB']
                del databases
            except Exception:
                # If something goes wrong, just rebuild the database.
                # Deliberately not a bare except, so that Ctrl-C and
                # sys.exit() are not swallowed here.
                self.rebuild()

    def rebuild(self):
        '''
        Throws away all cached knowledge and starts with empty dictionaries.
        '''
        self.catContentDB = {}
        self.superclassDB = {}

    def _getContents(self, cat):
        '''
        Returns the 2-tuple (subcategories, articles) for cat. On the first
        request for a category both lists are retrieved from the category
        object and remembered in catContentDB.
        '''
        if cat not in self.catContentDB:
            subcatlist = cat.subcategories()
            articlelist = cat.articles()
            # add to dictionary
            self.catContentDB[cat] = (subcatlist, articlelist)
        return self.catContentDB[cat]

    def getSubcats(self, supercat):
        '''
        For a given supercategory, return a list of Categorys for all its
        subcategories.
        Saves this list in a temporary database so that it won't be loaded from the
        server next time it's required.
        '''
        return self._getContents(supercat)[0]

    def getArticles(self, cat):
        '''
        For a given category, return a list of Pages for all its articles.
        Saves this list in a temporary database so that it won't be loaded from the
        server next time it's required.
        '''
        return self._getContents(cat)[1]

    def getSupercats(self, subcat):
        '''
        For a given subcategory, return the result of its supercategories()
        method. The result is remembered in superclassDB, so the category
        object is only asked once. Note that the cached object itself is
        returned; callers that want to modify it should work on a copy.
        '''
        if subcat not in self.superclassDB:
            # add to dictionary
            self.superclassDB[subcat] = subcat.supercategories()
        return self.superclassDB[subcat]

    def dump(self, filename = 'category.dump.bz2'):
        '''
        Saves the contents of the dictionaries superclassDB and catContentDB to disk.

        Parameters:
            * filename - The bz2-compressed pickle file to (over)write.
        '''
        wikipedia.output(u'Dumping to %s, please wait...' % filename)
        f = bz2.BZ2File(filename, 'w')
        try:
            databases = {
                'catContentDB': self.catContentDB,
                'superclassDB': self.superclassDB
            }
            # store dump to disk in binary format
            pickle.dump(databases, f, protocol=pickle.HIGHEST_PROTOCOL)
        finally:
            # close the file even when pickling fails
            f.close()
# Regular expression that matches a name followed by a space and trailing
# disambiguation brackets. Group 1 is the name without the rest.
# The pattern is anchored at the end so that brackets in the middle of a
# title (e.g. "John (Jack) Smith") do not cut off the words after them.
_trailingBracketsR = re.compile(r'(.*) \(.+?\)$')

def sorted_by_last_name(catlink, pagelink):
    '''
    Given a category link and the page it is going to be put on, returns a
    new wikipedia.Page whose title is the title of catlink plus an explicit
    sort key which sorts persons by their last names.
    Trailing words in brackets will be removed.

    Parameters:
        * catlink  - object whose title() is the full category title
        * pagelink - the page that is categorized; its title() is turned
                     into the sort key and its site() is used for the result

    Example: If catlink is [[Category:Author]] and pagelink is a Page to
    [[Alexandre Dumas (senior)]], this function will return a Page for
    [[Category:Author|Dumas, Alexandre]]

    If the page title consists of a single word, no sort key is added.
    '''
    page_name = pagelink.title()
    site = pagelink.site()
    match_object = _trailingBracketsR.match(page_name)
    if match_object:
        page_name = match_object.group(1)
    split_string = page_name.split(' ')
    if len(split_string) > 1:
        # pull last part of the name to the beginning, and append the rest after a comma
        # e.g. "John von Neumann" becomes "Neumann, John von"
        sorted_key = split_string[-1] + ', ' + ' '.join(split_string[:-1])
        # give explicit sort key
        return wikipedia.Page(site, catlink.title() + '|' + sorted_key)
    else:
        return wikipedia.Page(site, catlink.title())
def _collect_pages_to_categorize(site, listpageTitle):
    '''
    Returns the pages that add_category() should work on.

    If listpageTitle is non-empty, the result of linkedPages() of that page
    is returned; when the page is missing or is a redirect, a message is
    printed and an empty list is returned.
    If listpageTitle is empty, the user is asked for a page and the result
    of its getReferences() is returned as a list.
    '''
    if listpageTitle:
        # created outside the try block so that the handlers can rely on it
        listpage = wikipedia.Page(site, listpageTitle)
        try:
            return listpage.linkedPages()
        except wikipedia.NoPage:
            wikipedia.output(u'%s could not be loaded from the server.' % listpage.aslink())
        except wikipedia.IsRedirectPage:
            wikipedia.output(u'%s is a redirect to %s.' % (listpage.aslink(), listpage.getRedirectTarget()))
        return []
    referredPage = wikipedia.input(u'Wikipedia page that is now linked to:')
    page = wikipedia.Page(site, referredPage)
    return list(page.getReferences())

def _ask_whether_to_add(page):
    '''
    Asks the user what to do with page and returns 'y' (yes), 'n' (no) or
    'a' (yes for this and all following pages). 'a' is only returned after
    the user has explicitly confirmed it; otherwise the question is repeated.
    '''
    answer = ''
    while answer not in ('y', 'n', 'a'):
        answer = wikipedia.input(u'%s [y/n/a(ll)]:' % (page.aslink()))
        if answer == 'a':
            confirm = ''
            while confirm not in ('y', 'n'):
                confirm = wikipedia.input(u'This should be used if and only if you are sure that your links are correct! Are you sure? [y/n]:')
            if confirm == 'n':
                # not confirmed: ask about this page again
                answer = ''
    return answer

def _add_category_to_page(site, page, fullCatTitle, sort_by_last_name):
    '''
    Adds the category fullCatTitle (title including namespace) to page and
    saves the page, unless the page is missing, is a redirect, or already
    lists that category. Edit conflicts are reported and skipped.
    '''
    try:
        cats = page.categories()
    except wikipedia.NoPage:
        wikipedia.output(u"%s doesn't exist yet. Ignoring." % (page.title()))
        return
    except wikipedia.IsRedirectPage:
        # sys.exc_info() instead of the "except X, e" form, which only
        # parses on Python 2
        arg = sys.exc_info()[1]
        redirTarget = wikipedia.Page(site, arg.args[0])
        wikipedia.output(u"WARNING: %s is redirect to %s. Ignoring." % (page.title(), redirTarget.title()))
        return
    wikipedia.output(u"Current categories:")
    for cat in cats:
        wikipedia.output(u"* %s" % cat.title())
    catpl = wikipedia.Page(site, fullCatTitle)
    if sort_by_last_name:
        catpl = sorted_by_last_name(catpl, page)
    if catpl in cats:
        wikipedia.output(u"%s is already in %s." % (page.title(), catpl.title()))
        return
    wikipedia.output(u'Adding %s' % catpl.aslink())
    cats.append(catpl)
    text = page.get()
    text = wikipedia.replaceCategoryLinks(text, cats)
    try:
        page.put(text)
    except wikipedia.EditConflict:
        wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))

def add_category(sort_by_last_name = False):
    '''
    A robot to mass-add a category to a list of pages.

    Interactively asks for the pages to work on (either the pages linked
    from a list page, or the pages referring to a given page) and for the
    category to add, then asks for every page whether the category should
    be added to it.

    Parameters:
        * sort_by_last_name - If True, the category link gets an explicit
                              sort key built by sorted_by_last_name().
    '''
    # single-argument print(...) behaves the same as a statement and as a function
    print("This bot has two modes: you can add a category link to all")
    print("pages mentioned in a List that is now in another wikipedia page")
    print("or you can add a category link to all pages that link to a")
    print("specific page. If you want the second, please give an empty")
    print("answer to the first question.")
    listpageTitle = wikipedia.input(u'Wiki page with list of pages to change:')
    site = wikipedia.getSite()
    pages = _collect_pages_to_categorize(site, listpageTitle)
    wikipedia.output(u' ==> %i pages to process\n' % len(pages))
    if len(pages) == 0:
        return
    newcatTitle = wikipedia.input(u'Category to add (do not give namespace):')
    # upper-case the first letter only; the rest of the title is left as typed
    newcatTitle = newcatTitle[:1].capitalize() + newcatTitle[1:]
    # set edit summary message
    wikipedia.setAction(wikipedia.translate(site, msg_add) % newcatTitle)
    cat_namespace = site.category_namespaces()[0]
    fullCatTitle = cat_namespace + ':' + newcatTitle
    answer = ''
    for page in pages:
        # once the user has chosen 'a', no further questions are asked
        if answer != 'a':
            answer = _ask_whether_to_add(page)
        if answer in ('y', 'a'):
            _add_category_to_page(site, page, fullCatTitle, sort_by_last_name)
class CategoryCounterRobot:
    '''
    Robot which walks through the category structure below a root category
    and counts the distinct articles found on the way.

    Parameters:
        * catTitle - The title (without namespace) of the category which
                     will be the tree's root.
        * catDB    - A CategoryDatabase object
        * maxDepth - The limit beyond which no subcategories will be listed.
                     This also guarantees that loops in the category structure
                     won't be a problem.
    '''

    # Translations to say that the current category is in more categories than
    # the one we're coming from. Keys are language codes as expected by
    # wikipedia.translate().
    also_in_cats = {
        'da': u'(også i %s)',
        'de': u'(auch in %s)',
        'en': u'(also in %s)',
        'fr': u'(également dans %s)',
        'ia': u'(equalmente in %s)',
        'is': u'(einnig í %s)',
        'pt': u'(também em %s)',
        'sr': u'(такође у %s)',
    }

    def __init__(self, catTitle, catDB, maxDepth = 10):
        self.catTitle = catTitle
        self.catDB = catDB
        # TODO: make maxDepth changeable with a parameter or config file entry
        self.maxDepth = maxDepth
        # Used as a set: maps the _title of every article seen to itself
        self.articlesMap = {}

    def _catLink(self, cat):
        '''
        Returns a wiki link to cat which displays the title without the
        namespace prefix, e.g. [[:Category:Foo|Foo]].
        '''
        title = cat.title()
        return '[[:%s|%s]]' % (title, title.split(':', 1)[1])

    def treeview(self, cat, currentDepth = 0, parent = None):
        '''
        Returns a multi-line string which contains a tree view of all subcategories
        of cat, up to level maxDepth. Recursively calls itself.
        As a side effect, every article of each visited category is recorded
        in self.articlesMap.

        Parameters:
            * cat - the Category of the node we're currently opening
            * currentDepth - the current level in the tree (for recursion)
            * parent - the Category of the category we're coming from
        '''
        allArticles = self.catDB.getArticles(cat)
        for a in allArticles:
            self.articlesMap[a._title] = a._title

        result = u'#' * currentDepth
        result += self._catLink(cat)
        result += ' (%d)' % len(allArticles)
        # We will remove an element of this list, but the database still
        # needs the original one, so we work on a copy
        supercats = list(self.catDB.getSupercats(cat))
        # Find out which other cats are supercats of the current cat
        try:
            supercats.remove(parent)
        except ValueError:
            # parent is not among the supercategories (e.g. None at the root)
            pass
        if supercats:
            # create a list of wiki links to the supercategories
            supercat_names = [self._catLink(supercat) for supercat in supercats]
            # print this list, separated with commas, using translations given in also_in_cats
            result += ' ' + wikipedia.translate(wikipedia.getSite(), self.also_in_cats) % ', '.join(supercat_names)
        result += '\n'
        subcats = self.catDB.getSubcats(cat)
        if currentDepth < self.maxDepth:
            for subcat in subcats:
                # recurse into subcategories
                result += self.treeview(subcat, currentDepth + 1, parent = cat)
        elif subcats:
            # show that there are more categories beyond the depth limit
            result += '#' * (currentDepth + 1) + '[...]\n'
        return result

    def run(self):
        """
        Walks the category tree below the category named self.catTitle and
        prints the number of distinct articles found in it. The tree view
        generated by treeview is only used to drive the walk; it is neither
        printed nor saved.
        """
        cat = catlib.Category(wikipedia.getSite(), 'Category:' + self.catTitle)
        self.treeview(cat)
        # single-argument print(...) behaves the same as a statement and as a function
        print("Articles -------------> %d" % len(self.articlesMap))
# Script entry point: asks for a category, counts the articles in its tree
# and finally writes the category database back to disk.
if __name__ == "__main__":
    # NOTE(review): the following settings are not read anywhere in the
    # visible part of this file; they are kept in case other code relies on them.
    fromGiven = False
    toGiven = False
    batchMode = False
    customSummary = False
    editSummary = ''
    action = None
    sort_by_last_name = False
    restore = False
    try:
        # Created before the inner try block, so that the dump in its
        # finally clause never refers to an undefined name.
        catDB = CategoryDatabase()
        try:
            catTitle = wikipedia.input(u'For which category do you want to count the articles?')
            bot = CategoryCounterRobot(catTitle, catDB)
            bot.run()
        finally:
            # keep what has been learned so far, even after an error or Ctrl-C
            catDB.dump()
    finally:
        # must run even if writing the dump fails
        wikipedia.stopme()