Utilisateur:Lachaume/list wiki authors.pl

Lister les auteurs d'un article

modifier

Le programme informatique ci-dessous liste les auteurs d'un article de wikipédia par nombre de modifications non mineures, hors restaurations et interventions d'IP. Les trois derniers critères peuvent être changés (options en ligne de commande).

Il s'agit d'un script perl qui devrait fonctionner sur tout système avec une version récente de perl et une configuration du jeu de caractères en UTF-8.

Le script

modifier

Recopier le script ci-dessus en supprimant l'espace de début de ligne (important devant le EOF ou le #!/usr/bin/perl...).

 #!/usr/bin/perl
 
 # Name:    list_wiki_authors 
 # Purpose: list authors of an article in Wikipedia 
 # Author:  Régis Lachaume
 # Contact: lachaume [AT] MPIfR-Bonn [DOT] MPG [DOT] de
 # Date:    2005
 # Licence: public domain.
 # System:  recent version, UTF8-compliant system
    
 #### declarations ####
    
 use strict;
 use warnings;
 use utf8;
 use Getopt::Long qw{:config posix_default no_auto_abbrev no_ignorecase};
    
 # %history = fetch_wiki_history $article_name, $language_code;
 sub fetch_wiki_history ($$);
    
 # print_wiki_history %history
 sub print_wiki_history (%);
    
 # help($exit_code)
 sub help ($);  
    
 #### programme ####
    
 our ($help, $list_ips, $list_minor, $list_reverts, $short, $lang) = (0, 0, 0, 0, 0, 'fr');
 our $Nmax = 99999; # $Nmax last modifications...
       
 GetOptions(
    'help|h'                    => \$help,
    'ip|list-ip-contribs'       => \$list_ips,
    'minor|list-minor-contribs' => \$list_minor,
    'revert|list-reverts'       => \$list_reverts,
    'short'                     => \$short,
    'lang|wiki-language-code'   => \$lang,
 );
             
 $help && help(0);
 @ARGV >=1 && @ARGV <=  2 || help(1);
       
 our %history = fetch_wiki_history $ARGV[0], $lang;
 print_wiki_history %history;
 
 #### implementation ####
 
 use LWP::UserAgent;
 use HTML::TreeBuilder;
 sub fetch_wiki_history ($$) {
 
    my ($page, $lang) = @_;
 
    # retrieve HTML source
 
    my $ua = LWP::UserAgent->new();
    $ua->agent('Mozilla/5.0'); # don't tell wiki I'm a bot :-)
    my $url = "http://$lang.wikipedia.org/w/index.php?title=$page\&action=history\&limit=$Nmax\&offset=0";
    my $response = $ua->get($url);
    $response->is_success() || die $response->status_line();
    my $contents = $response->content;
 
    open my $save, ">list.save";
    print $save $contents;
    close $save;
 
    # parse HTML (find items of the contribution list)
    my $htmltree = HTML::TreeBuilder->new_from_content($contents);
    my $body    = $htmltree->look_down('id', 'bodyContent');
    defined $body || die "no page contents: wikipedia problem?\n";
    my $history = $body->look_down('id', 'pagehistory');
    
    defined $history || die "no contribution list: inexistent article?\n"; 
       # BUG. any error page of wikipedia will produce this error.  we need
       # to match given sentences in the text to distinguish them (e.g. no article,
       # wikipedia down, etc.), which depends on the language served!
    
    my @list    = $history->look_down('_tag', 'li');
    my %history = ( );
 
    foreach my $item (@list) {
 
       # ignore minor modifications (unless asked for)
       # BUG: should count the number of bytes of the difference!
       my $minor = defined $item->look_down('class', 'minor');
       next if !$list_minor && $minor;
 
       # user name and discard IPs (unless asked for)
       my $user = $item->look_down('class', 'user');
       next unless defined $user;
       $user = $user->as_text();
       next if !$list_ips && $user =~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/;
 
       # get comment and discard the entry if a revert is mentioned (unless
       # asked for)
       # BUG: vandalism itself is not detected...
       my $em = $item->look_down('_tag', 'em');
       my $comment = defined $em? $em->as_text(): "";
       next if !$list_reverts && $comment =~ /^\b(rv|r.vert|r.version|restauration)\b/i;
 
       # date (one of the anchors, the first matching a year and a time)
       my $date = "";
       my @anchors = $item->look_down('_tag', 'a');
       foreach my $anchor (@anchors) {
          my $text = $anchor->as_text();
          $date = $text, last if $text =~ /[0-9]{2}:[0-9]{2}/ && $text =~ /[0-9]{4}/;
       }
    
       # history is a hash { author1 => author_history1, ...}
       # author_history is an array [ modif1, ... ]
       # modif is a hash { date => <date>, comment => <comment>, minor => 0/1 }
       my $histline = { date => $date, comment => $comment, minor => $minor };
       if (defined $history{$user}) {
          push @{$history{$user}}, $histline;
       } else { 
          $history{$user} = [ $histline ];
       };
    
    }
    
    return %history;
    
 }  
    
 sub print_wiki_history (%) {
    
    my %history = @_;
       
    foreach my $author (sort {@{$history{$b}}<=>@{$history{$a}}} keys %history) {
       
       my $history = $history{$author};
       printf "%s - %i modification%s\n", $author, scalar(@$history),
          @$history>1? "s": "";
       
       if (!$short) {
          foreach my $histline (@$history) {
             my ($date, $comment, $minor) = @$histline{'date','comment','minor'};
             $date = " $date" if $date !~ /^[0-9]{2}/;
             printf "   %s [%s] %s\n", $date, ($minor? "min.": "maj."), $comment;
          }
       }
       
    }  
 
 }     
 
 sub help ($) {
    print<<EOF
 
 use: $0 article_title [-lang|-wiki-language-code language_code] 
    [-ip|-list-ip-contribs] [-minor|-list-minor-contribs]
    [-revert|-list-reverts] [-short]
 
 DESCRIPTION
 
 Fetch the authors of a Wikipedia article along with a summary of their
 contributions.  Author with the largest number of contributions are
 listed first; an author's contributions are sorted by date (most
 recent first).
 
 COMMAND-LINE OPTIONS
 
 -lang <language_code>
    Wikipedia to be used (default: fr, meaning fr.wikipedia.org)
 -ip
    List IP authors (default: no)
 -revert
    List reverts (default: no)
 -minor 
    List minor modifications (default: no)
 -short
    Only list authors and number of contributions (default: full information)
 
 BUGS
 
 * The selection of relevant contributions is based on the authors' 
   comments, not on their true nature (number of bytes, suppression
   or addition of text, etc.)
 * Vandalism--even its most evident manifestations-is not directly dealt 
   with, though filtering out IP contributions and reverts limits 
   the number of vandals listed.
 * Only UTF8 is supported.  (Should be trivial to change, but I'm lazy.)
 * Wikipedia errors are wrongly reported as 'no existent article'
 
 EOF
    ;
    exit shift;
 }