# Utilisateur:Probot/CatCounter.py

# -*- coding: utf-8 -*-
#
# (C) Rob W.W. Hooft, 2004
# (C) Daniel Herding, 2004
# Modified by Dake.
#
__version__ = '$Id: category.py,v 1.107 2006/04/30 18:38:49 cydeweys Exp $'
#
# Distributed under the terms of the MIT license.
# 
import re, sys, pickle, bz2
import wikipedia, catlib, config, pagegenerators

# Summary messages
# Edit summary used when adding a category; %s is the category title.
msg_add={
    'da':u'Robot: Tilføjer [[%s]]',
    'de':u'Bot: Ergänze [[%s]]',
    'en':u'Robot: Adding [[%s]]',
    'es':u'Bot: Añadida [[%s]]',
    'fi':u'Botti lisäsi luokkaan [[%s]]',
    'fr':u'Robot : ajoute [[%s]]',
    'ia':u'Robot: Addition de [[%s]]',
    'is':u'Vélmenni: Bæti við [[%s]]',
    'no':u'Robot: Legger til [[%s]]',
    'pt':u'Bot: Adicionando [[Categoria:%s]]',
    'sr':u'Бот: Додаје [[Категорија:%s]]',
    }

# Edit summary used when changing a page's category; %s describes the change.
msg_change={
    'da':u'Robot: Ændrer %s',
    'de':u'Bot: Ändere %s',
    'en':u'Robot: Changing %s',
    'es':u'Bot: Cambiada %s',
    'fi':u'Botti vaihtoi luokan %s',
    'fr':u'Robot : modifie %s',
    'ia':u'Robot: Modification de %s',
    'is':u'Vélmenni: Breyti flokknum [[%s]]',
    'nl':u'Bot: Wijziging %s',
    'no':u'Robot: Endrer %s',
    'pt':u'Bot: Modificando [[Categoria:%s]]',
    'sr':u'Бот: Измена категорије %s',
    }

# Deletion reason used when a category page is deleted after a move;
# %s is the target category.
deletion_reason_move = {
    'de':u'Bot: Kategorie wurde nach %s verschoben',
    'en':u'Robot: Category was moved to %s',
    'fr':u'Robot : catégorie déplacée sur %s',
    'ia':u'Robot: Categoria transferite a %s',
    'no':u'Robot: Kategorien ble flyttet til %s',
    'pt':u'Bot: Categoria %s foi movida',
    'sr':u'Бот: Категорија премештена у %s',
    }

# Per-language names of "categories for discussion/deletion" templates.
# NOTE(review): only defined for 'en' and not referenced in the code
# visible here — presumably used by the original category.py this file
# was derived from.
cfd_templates = {
    'en':['cfd', 'cfr', 'cfru', 'cfr-speedy', 'cfm', 'cfdu'],
    }

class CategoryDatabase:
    '''
    This is a temporary knowledge base saving for each category the contained
    subcategories and articles, so that category pages don't need to
    be loaded over and over again.

    Attributes:
        * catContentDB  - maps each category to a (subcategories, articles)
                          2-tuple of lists
        * superclassDB  - maps each category to a list of its supercategories
    '''
    def __init__(self, rebuild = False, filename = 'category.dump.bz2'):
        '''
        Loads the cached databases from a bz2-compressed pickle dump, or
        starts with empty ones if rebuild is True or the dump is unusable.
        '''
        if rebuild:
            self.rebuild()
        else:
            try:
                f = bz2.BZ2File(filename, 'r')
                try:
                    wikipedia.output(u'Reading dump from %s' % filename)
                    databases = pickle.load(f)
                finally:
                    # close the file even if unpickling fails
                    f.close()
                # keys are categories, values are 2-tuples with lists as entries.
                self.catContentDB = databases['catContentDB']
                # like the above, but for supercategories
                self.superclassDB = databases['superclassDB']
                del databases
            except Exception:
                # If anything goes wrong (missing file, corrupt dump, ...),
                # just rebuild the database. Catch Exception rather than a
                # bare except so KeyboardInterrupt/SystemExit still propagate.
                self.rebuild()

    def rebuild(self):
        '''Discards any cached data and starts with empty databases.'''
        self.catContentDB={}
        self.superclassDB={}

    def getSubcats(self, supercat):
        '''
        For a given supercategory, return a list of Categorys for all its
        subcategories.
        Saves this list in a temporary database so that it won't be loaded from the
        server next time it's required.
        '''
        # if we already know which subcategories exist here
        if supercat in self.catContentDB:
            return self.catContentDB[supercat][0]
        subcatlist = supercat.subcategories()
        articlelist = supercat.articles()
        # cache both lists so a later getArticles() call is free as well
        self.catContentDB[supercat] = (subcatlist, articlelist)
        return subcatlist

    def getArticles(self, cat):
        '''
        For a given category, return a list of Pages for all its articles.
        Saves this list in a temporary database so that it won't be loaded from the
        server next time it's required.
        '''
        # if we already know which articles exist here
        if cat in self.catContentDB:
            return self.catContentDB[cat][1]
        subcatlist = cat.subcategories()
        articlelist = cat.articles()
        # cache both lists so a later getSubcats() call is free as well
        self.catContentDB[cat] = (subcatlist, articlelist)
        return articlelist

    def getSupercats(self, subcat):
        '''
        For a given category, return a list of all its supercategories,
        using the cache when possible.
        '''
        if subcat in self.superclassDB:
            return self.superclassDB[subcat]
        supercatlist = subcat.supercategories()
        # add to dictionary
        self.superclassDB[subcat] = supercatlist
        return supercatlist

    def dump(self, filename = 'category.dump.bz2'):
        '''
        Saves the contents of the dictionaries superclassDB and catContentDB to disk.
        '''
        wikipedia.output(u'Dumping to %s, please wait...' % filename)
        f = bz2.BZ2File(filename, 'w')
        try:
            databases = {
                'catContentDB': self.catContentDB,
                'superclassDB': self.superclassDB
            }
            # store dump to disk in binary format
            pickle.dump(databases, f, protocol=pickle.HIGHEST_PROTOCOL)
        finally:
            # close the file even if pickling fails
            f.close()
        
def sorted_by_last_name(catlink, pagelink):
    '''
    given a Category, returns a Category which has an explicit sort key which
    sorts persons by their last names.
    Trailing words in brackets will be removed.
    Example: If category_name is 'Author' and pl is a Page to
    [[Alexandre Dumas (senior)]], this function will return this Category:
    [[Category:Author|Dumas, Alexandre]]
    '''
    site = pagelink.site()
    page_name = pagelink.title()
    # Strip a trailing disambiguation suffix such as ' (senior)';
    # group 1 captures everything before the bracketed part.
    match_object = re.match('(.*) \(.+?\)', page_name)
    if match_object:
        page_name = match_object.group(1)
    if ' ' in page_name:
        # pull the last part of the name to the beginning and append
        # the rest after a comma, e.g. "John von Neumann" becomes
        # "Neumann, John von"
        rest, last = page_name.rsplit(' ', 1)
        sort_key = last + ', ' + rest
        return wikipedia.Page(site, catlink.title() + '|' + sort_key)
    # single-word name: no explicit sort key needed
    return wikipedia.Page(site, catlink.title())

def add_category(sort_by_last_name = False):
    '''
    A robot to mass-add a category to a list of pages.
    '''
    print "This bot has two modes: you can add a category link to all"
    print "pages mentioned in a List that is now in another wikipedia page"
    print "or you can add a category link to all pages that link to a"
    print "specific page. If you want the second, please give an empty"
    print "answer to the first question."
    listpageTitle = wikipedia.input(u'Wiki page with list of pages to change:')
    site = wikipedia.getSite()
    pages = []
    if listpageTitle:
        try:
            listpage = wikipedia.Page(site, listpageTitle)
            pages = listpage.linkedPages()
        except wikipedia.NoPage:
            wikipedia.output(u'%s could not be loaded from the server.' % listpage.aslink())
        except wikipedia.IsRedirectPage:
            wikipedia.output(u'%s is a redirect to %s.' % (listpage.aslink(), listpage.getRedirectTarget()))
    else:
        referredPage = wikipedia.input(u'Wikipedia page that is now linked to:')
        page = wikipedia.Page(wikipedia.getSite(), referredPage)
        pages = list(page.getReferences())
    wikipedia.output(u'  ==> %i pages to process\n' % len(pages))
    if len(pages) > 0:
        newcatTitle = wikipedia.input(u'Category to add (do not give namespace):')
        newcatTitle = newcatTitle[:1].capitalize() + newcatTitle[1:]

        # set edit summary message
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg_add) % newcatTitle)

        cat_namespace = wikipedia.getSite().category_namespaces()[0]

        answer = ''
        for page in pages:
            if answer != 'a':
                answer = ''

            while answer not in ('y','n','a'):
                answer = wikipedia.input(u'%s [y/n/a(ll)]:' % (page.aslink()))
                if answer == 'a':
                    confirm = ''
                    while confirm not in ('y','n'):
                        confirm = wikipedia.input(u'This should be used if and only if you are sure that your links are correct! Are you sure? [y/n]:')
                    if confirm == 'n':
                        answer = ''

            if answer == 'y' or answer == 'a':
                try:
                    cats = page.categories()
                except wikipedia.NoPage:
                    wikipedia.output(u"%s doesn't exist yet. Ignoring." % (page.title()))
                    pass
                except wikipedia.IsRedirectPage,arg:
                    redirTarget = wikipedia.Page(site,arg.args[0])
                    wikipedia.output(u"WARNING: %s is redirect to %s. Ignoring." % (page.title(), redirTarget.title()))
                else:
                    wikipedia.output(u"Current categories:")
                    for cat in cats:
                        wikipedia.output(u"* %s" % cat.title())
                    catpl = wikipedia.Page(site, cat_namespace + ':' + newcatTitle)
                    if sort_by_last_name:
                        catpl = sorted_by_last_name(catpl, page) 
                    if catpl in cats:
                        wikipedia.output(u"%s is already in %s." % (page.title(), catpl.title()))
                    else:
                        wikipedia.output(u'Adding %s' % catpl.aslink())
                        cats.append(catpl)
                        text = page.get()
                        text = wikipedia.replaceCategoryLinks(text, cats)
			try:
                            page.put(text)
                        except wikipedia.EditConflict:
                            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
class CategoryCounterRobot:
    '''
    Robot to create tree overviews of the category structure.
    
    Parameters:
        * catTitle - The category which will be the tree's root.
        * catDB    - A CategoryDatabase object
        * maxDepth - The limit beyond which no subcategories will be listed.
                     This also guarantees that loops in the category structure
                     won't be a problem.
        * filename - The textfile where the tree should be saved; None to print
                     the tree to stdout.
    '''
    
    def __init__(self, catTitle, catDB, maxDepth = 10):
        self.catTitle = catTitle
        self.catDB = catDB
        # TODO: make maxDepth changeable with a parameter or config file entry
        self.maxDepth = maxDepth
        self.articlesMap = {}
        
    def treeview(self, cat, currentDepth = 0, parent = None):
        '''
        Returns a multi-line string which contains a tree view of all subcategories
        of cat, up to level maxDepth. Recursively calls itself.
        
        Parameters:
            * cat - the Category of the node we're currently opening
            * currentDepth - the current level in the tree (for recursion)
            * parent - the Category of the category we're coming from
        '''
        
        # Translations to say that the current category is in more categories than
        # the one we're coming from
        also_in_cats = {
            'da': u'(også i %s)',
            'de': u'(auch in %s)',
            'en': u'(also in %s)',
            'fr': u'(également dans %s)',
            'ia': u'(equalmente in %s)',
            'is': u'(einnig í %s)',
            'pt': u'(também em %s)',
            'ср': u'(такође у %s)',
            }
            
        allArticles = self.catDB.getArticles(cat)
        for a in allArticles:
          self.articlesMap[a._title] = a._title
        result = u'#' * currentDepth
        result += '[[:%s|%s]]' % (cat.title(), cat.title().split(':', 1)[1])
        result += ' (%d)' % len(allArticles)
        # We will remove an element of this array, but will need the original array
        # later, so we create a shallow copy with [:]
        supercats = self.catDB.getSupercats(cat)[:]
        # Find out which other cats are supercats of the current cat
        try:
            supercats.remove(parent)
        except:
            pass
        if supercats != []:
            supercat_names = []
            for i in range(len(supercats)):
                # create a list of wiki links to the supercategories
                supercat_names.append('[[:%s|%s]]' % (supercats[i].title(), supercats[i].title().split(':', 1)[1]))
                # print this list, separated with commas, using translations given in also_in_cats
            result += ' ' + wikipedia.translate(wikipedia.getSite(), also_in_cats) % ', '.join(supercat_names)
        result += '\n'
        if currentDepth < self.maxDepth:
            for subcat in self.catDB.getSubcats(cat):
                # recurse into subdirectories
                result += self.treeview(subcat, currentDepth + 1, parent = cat)
        else:
            if self.catDB.getSubcats(cat) != []:
                # show that there are more categories beyond the depth limit
                result += '#' * (currentDepth + 1) + '[...]\n'
        return result

    def run(self):
        """
        Prints the multi-line string generated by treeview or saves it to a file.
    
        Parameters:
            * catTitle - the title of the category which will be the tree's root
            * maxDepth - the limit beyond which no subcategories will be listed
        """
        cat = catlib.Category(wikipedia.getSite(), 'Category:' + self.catTitle)
        tree = self.treeview(cat)
        print "Articles -------------> %d" % len(self.articlesMap)

if __name__ == "__main__":
    # catDB starts as None so the finally clause can tell whether the
    # database was actually created before trying to dump it (the original
    # code raised a NameError here if CategoryDatabase() failed).
    catDB = None
    try:
        catDB = CategoryDatabase()
        catTitle = wikipedia.input(u'For which category do you want to count the articles?')
        bot = CategoryCounterRobot(catTitle, catDB)
        bot.run()
    finally:
        if catDB is not None:
            catDB.dump()
        wikipedia.stopme()