Utilisateur:Lachaume/list wiki authors.pl
Lister les auteurs d'un article
modifierLe programme informatique ci-dessous liste les auteurs d'un article de wikipédia par nombre de modifications non mineures, hors restaurations et interventions d'IP. Les trois derniers critères peuvent être changés (options en ligne de commande).
Il s'agit d'un script perl qui devrait fonctionner sur tout système avec une version récente de perl et une configuration du jeu de caractères en UTF-8.
Le script
modifierRecopier le script ci-dessus en supprimant l'espace de début de ligne (important devant le EOF ou le #!/usr/bin/perl...).
#!/usr/bin/perl # Name: list_wiki_authors # Purpose: list authors of an article in Wikipedia # Author: Régis Lachaume # Contact: lachaume [AT] MPIfR-Bonn [DOT] MPG [DOT] de # Date: 2005 # Licence: public domain. # System: recent version, UTF8-compliant system #### declarations #### use strict; use warnings; use utf8; use Getopt::Long qw{:config posix_default no_auto_abbrev no_ignorecase}; # %history = fetch_wiki_history $article_name, $language_code; sub fetch_wiki_history ($$); # print_wiki_history %history sub print_wiki_history (%); # help($exit_code) sub help ($); #### programme #### our ($help, $list_ips, $list_minor, $list_reverts, $short, $lang) = (0, 0, 0, 0, 0, 'fr'); our $Nmax = 99999; # $Nmax last modifications... GetOptions( 'help|h' => \$help, 'ip|list-ip-contribs' => \$list_ips, 'minor|list-minor-contribs' => \$list_minor, 'revert|list-reverts' => \$list_reverts, 'short' => \$short, 'lang|wiki-language-code' => \$lang, ); $help && help(0); @ARGV >=1 && @ARGV <= 2 || help(1); our %history = fetch_wiki_history $ARGV[0], $lang; print_wiki_history %history; #### implementation #### use LWP::UserAgent; use HTML::TreeBuilder; sub fetch_wiki_history ($$) { my ($page, $lang) = @_; # retrieve HTML source my $ua = LWP::UserAgent->new(); $ua->agent('Mozilla/5.0'); # don't tell wiki I'm a bot :-) my $url = "http://$lang.wikipedia.org/w/index.php?title=$page\&action=history\&limit=$Nmax\&offset=0"; my $response = $ua->get($url); $response->is_success() || die $response->status_line(); my $contents = $response->content; open my $save, ">list.save"; print $save $contents; close $save; # parse HTML (find items of the contribution list) my $htmltree = HTML::TreeBuilder->new_from_content($contents); my $body = $htmltree->look_down('id', 'bodyContent'); defined $body || die "no page contents: wikipedia problem?\n"; my $history = $body->look_down('id', 'pagehistory'); defined $history || die "no contribution list: inexistent article?\n"; # BUG. any error page of wikipedia will produce this error. we need # to match given sentences in the text to distinguish them (e.g. no article, # wikipedia down, etc.), which depends on the language served! my @list = $history->look_down('_tag', 'li'); my %history = ( ); foreach my $item (@list) { # ignore minor modifications (unless asked for) # BUG: should count the number of bytes of the difference! my $minor = defined $item->look_down('class', 'minor'); next if !$list_minor && $minor; # user name and discard IPs (unless asked for) my $user = $item->look_down('class', 'user'); next unless defined $user; $user = $user->as_text(); next if !$list_ips && $user =~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/; # get comment and discard the entry if a revert is mentioned (unless # asked for) # BUG: vandalism itself is not detected... my $em = $item->look_down('_tag', 'em'); my $comment = defined $em? $em->as_text(): ""; next if !$list_reverts && $comment =~ /^\b(rv|r.vert|r.version|restauration)\b/i; # date (one of the anchors, the first matching a year and a time) my $date = ""; my @anchors = $item->look_down('_tag', 'a'); foreach my $anchor (@anchors) { my $text = $anchor->as_text(); $date = $text, last if $text =~ /[0-9]{2}:[0-9]{2}/ && $text =~ /[0-9]{4}/; } # history is a hash { author1 => author_history1, ...} # author_history is an array [ modif1, ... ] # modif is a hash { date => <date>, comment => <comment>, minor => 0/1 } my $histline = { date => $date, comment => $comment, minor => $minor }; if (defined $history{$user}) { push @{$history{$user}}, $histline; } else { $history{$user} = [ $histline ]; }; } return %history; } sub print_wiki_history (%) { my %history = @_; foreach my $author (sort {@{$history{$b}}<=>@{$history{$a}}} keys %history) { my $history = $history{$author}; printf "%s - %i modification%s\n", $author, scalar(@$history), @$history>1? "s": ""; if (!$short) { foreach my $histline (@$history) { my ($date, $comment, $minor) = @$histline{'date','comment','minor'}; $date = " $date" if $date !~ /^[0-9]{2}/; printf " %s [%s] %s\n", $date, ($minor? "min.": "maj."), $comment; } } } } sub help ($) { print<<EOF use: $0 article_title [-lang|-wiki-language-code language_code] [-ip|-list-ip-contribs] [-minor|-list-minor-contribs] [-revert|-list-reverts] [-short] DESCRIPTION Fetch the authors of a Wikipedia article along with a summary of their contributions. Author with the largest number of contributions are listed first; an author's contributions are sorted by date (most recent first). COMMAND-LINE OPTIONS -lang <language_code> Wikipedia to be used (default: fr, meaning fr.wikipedia.org) -ip List IP authors (default: no) -revert List reverts (default: no) -minor List minor modifications (default: no) -short Only list authors and number of contributions (default: full information) BUGS * The selection of relevant contributions is based on the authors' comments, not on their true nature (number of bytes, suppression or addition of text, etc.) * Vandalism--even its most evident manifestations-is not directly dealt with, though filtering out IP contributions and reverts limits the number of vandals listed. * Only UTF8 is supported. (Should be trivial to change, but I'm lazy.) * Wikipedia errors are wrongly reported as 'no existent article' EOF ; exit shift; }