User:AnomieBOT/source/tasks/ReplaceExternalLinks4.pm
Approved 2011-11-02. Wikipedia:Bots/Requests for approval/AnomieBOT 58 |
package tasks::ReplaceExternalLinks4;
=pod
=begin metadata
Bot: AnomieBOT
Task: ReplaceExternalLinks4
BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 58
Status: Approved 2011-11-02
Created: 2011-10-21
OnDemand: true
Replace URL redirector links with direct links to the target URL.
=end metadata
=cut
use utf8;
use strict;
use Data::Dumper;
use URI;
use URI::Escape;
use AnomieBOT::Task qw/:time/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;
# Maps euquery values to replacement functions
my %replacements=(
);
# Youtube shortener
if(0){ # disable for now, all fixed/logged and currently blacklisted
$replacements{'youtu.be/'}=sub {
my $url=shift;
my $u1=URI->new($url);
my $u2=URI->new("//youtube.com/watch");
$u2->scheme($u1->scheme);
my $p=$u1->path;
$p=~s!^/*([^/]+)(?:/.*)?$!$1!;
$u2->query_form(v=>uri_unescape($p), $u1->query_form);
my $ret=$u2->as_iri;
return ($ret);
};
}
# Google has lots of patterns, construct programmatically
if(0){ # disable for now, all fixed/logged and currently blacklisted
my @domains=qw(
www.google.com
books.google.com
books.google.co.uk
encrypted.google.com
images.google.ca
images.google.com
images.google.co.uk
images.google.ie
news.google.ca
news.google.co.in
news.google.com
news.google.com.au
news.google.com.br
news.google.com.co
news.google.com.hk
news.google.co.uk
news.google.co.za
news.google.de
news.google.ie
news.google.it
news.google.nl
news.google.ru
scholar.google.com
scholar.google.de
scholar.google.se
translate.google.com
www.google.at
www.google.az
www.google.be
www.google.bg
www.google.ca
www.google.ch
www.google.cl
www.google.cm
www.google.co.id
www.google.co.il
www.google.co.in
www.google.co.jp
www.google.co.ke
www.google.co.kr
www.google.co.ma
www.google.com.ar
www.google.com.au
www.google.com.br
www.google.com.co
www.google.com.ec
www.google.com.fj
www.google.com.gh
www.google.com.hk
www.google.com.lb
www.google.com.mx
www.google.com.my
www.google.com.ng
www.google.com.np
www.google.com.om
www.google.com.pe
www.google.com.ph
www.google.com.pk
www.google.com.pr
www.google.com.sg
www.google.com.tr
www.google.com.tw
www.google.com.ua
www.google.com.uy
www.google.co.nz
www.google.co.th
www.google.co.uk
www.google.co.za
www.google.co.zw
www.google.cz
www.google.de
www.google.dk
www.google.ee
www.google.es
www.google.fi
www.google.fr
www.google.gr
www.google.hr
www.google.hu
www.google.ie
www.google.it
www.google.jo
www.google.lk
www.google.lv
www.google.md
www.google.nl
www.google.no
www.google.pl
www.google.pt
www.google.ro
www.google.ru
www.google.se
www.google.si
www.google.sk
www.google.sm
);
my @suffixes=qw(
/url?
/archivesearch/url?
/bookmarks/url?
/history/url?
/m/url?
/newspapers/url?
/news/url?
);
my $repl=sub {
my $url=shift;
my %q=URI->new($url)->query_form;
my $ret=undef;
$ret=$q{'q'} if ($q{'q'}//'')=~/^(?:http|ftp)/;
$ret=$q{'url'} if ($q{'url'}//'')=~/^(?:http|ftp)/;
if(!defined($ret)){
return ($ret, "Could not find 'url' or 'q' parameter in Google $url", "Invalid/obfuscated Google redirect", "The link <code><nowiki>$url</nowiki></code> does not contain a <code>q</code> or <code>url</code> parameter containing the target URL. Please fix manually.");
}
return ($ret)
};
for my $domain (@domains) {
for my $suffix (@suffixes) {
$replacements{$domain.$suffix}=$repl;
}
}
}
###########################
my $chars='[^][<>"\x00-\x20\x7F\p{Zs}]';
sub new {
my $class=shift;
my $self=$class->SUPER::new();
$self->{'proto'}=undef;
$self->{'iter'}=undef;
my %remap=();
my @re=();
while(my ($k,$v)=each %replacements){
my $re=quotemeta($k);
$re=~s!\\/!/!g;
$re=~s/\\\*/$chars*/g;
$re=~s!^(.*?)($|/)!(?i:$1)$2!;
push @re, $re;
$remap{$k}=qr!//$re!;
}
$self->{'remap'}=\%remap;
my $re='//(?:'.join('|', @re).')'.$chars.'*';
$self->{'re'}=qr/$re/;
bless $self, $class;
return $self;
}
=pod
=for info
Approved 2011-11-02.<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 58]]
=cut
sub approved {
return -1;
}
sub run {
my ($self, $api)=@_;
my $res;
$api->task('ReplaceExternalLinks4', 0, 10, qw/d::Templates d::Talk/);
my $screwup='Errors? [[User:'.$api->user.'/shutoff/ReplaceExternalLinks4]]';
# Spend a max of 5 minutes on this task before restarting
my $endtime=time()+300;
my $re=$self->{'re'};
my %remap=%{$self->{'remap'}};
my $fix=0;
my $page;
my $checkExtLink=sub {
my ($fmt,$url,$txt)=@_;
my $prefix;
if($fmt==2){
# Duplicate Mediawiki post-processing of bare external links
$txt=$1.$txt if $url=~s/((?:[<>]|&[lg]t;).*$)//;
my $sep=',;\.:!?';
$sep.=')' unless $url=~/\(/;
$txt=$1.$txt if $url=~s/([$sep]+$)//;
# There shouldn't be a template inside the url
$txt=$1.$txt if $url=~s/(\{\{.*$)//;
$prefix=qr/https?:/;
} else {
$prefix=qr/(?:https?:)?/;
}
return $url.$txt unless $url=~/^$prefix$re$/;
keys %remap;
while(my ($k,$r)=each %remap){
next unless $url=~/^$prefix$r/;
my ($ret,$log,$errs,$errb)=$replacements{$k}($url);
if(defined($ret)){
$fix++;
$ret=~s/([][<>"\x00-\x20\x7F\p{Zs}])/ uri_escape_utf8($1,'\x00-\xff') /ge;
return $ret.$txt;
}
$api->warn("$log in $page") if defined($log);
$api->whine("$errs in [[:$page]]", $errb, Pagename=>'User:AnomieBOT/ReplaceExternalLinks4 problems', NoSmallPrint=>1) if(defined($errs) && defined($errb));
}
return $url.$txt;
};
my $fixLinks=sub {
my $txt=shift;
my $nowiki;
# Hide bits we shouldn't process
($txt,$nowiki)=$api->strip_nowiki($txt);
($txt,$nowiki)=$api->strip_templates($txt, sub { return 1; }, {}, $nowiki);
# Hide XLinkBot notices
if($page=~/^User talk:/){
($txt,$nowiki)=$api->strip_regex(qr/[^\n]*\[\[User:XLinkBot(?:\||\]\])[^\n]*/, $txt, $nowiki);
}
# First, fix any bracketed external link
$txt=~s{\[((?:https?:)?$re)( *[^\]\x00-\x08\x0a-\x1F]*?)\]}{ '['.($checkExtLink->(1,$1,$2)).']' }ge;
# Now hide the bracketed external links
($txt,$nowiki)=$api->strip_regex(qr{\[(?:https?:)?//[^][<>\x22\x00-\x20\x7F]+ *[^\]\x00-\x08\x0a-\x1F]*?\]}, $txt, $nowiki);
# Fix any bare external links
$txt=~s{\b(https?:$re)}{ $checkExtLink->(2,$1,'') }ge;
# Unstrip
$txt=$api->replace_stripped($txt,$nowiki);
return $txt;
};
$self->{'proto'}=['http','https'] unless @{$self->{'proto'}//[]};
while(@{$self->{'proto'}}){
if(!defined($self->{'iter'})){
$self->{'iter'}=$api->iterator(
generator => 'exturlusage',
geuprotocol => shift @{$self->{'proto'}},
geuquery => [ keys %replacements ],
geulimit => '1000', # exturlusage has issues with big lists
);
}
while(my $pg=$self->{'iter'}->next){
if(!$pg->{'_ok_'}){
$api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");
return 60;
}
return 0 if $api->halting;
$page=$pg->{'title'};
my $tok=$api->edittoken($page, EditRedir => 1);
if($tok->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$tok->{'content'}."\n");
return 300;
}
if($tok->{'code'} eq 'pageprotected'){
$api->whine("[[:$page]] is protected", "Please fix manually.", Pagename=>'User:AnomieBOT/ReplaceExternalLinks4 problems', NoSmallPrint=>1);
next;
}
if($tok->{'code'} eq 'botexcluded'){
$api->whine("Bot excluded from [[:$page]]", "<nowiki>".$tok->{'error'}."</nowiki>. Please fix manually or adjust the exclusion.", Pagename=>'User:AnomieBOT/ReplaceExternalLinks4 problems', NoSmallPrint=>1);
next;
}
if($tok->{'code'} ne 'success'){
$api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");
next;
}
if(exists($tok->{'missing'})){
$api->warn("WTF? $page does not exist?\n");
next;
}
my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
$fix=0;
# First, process links in templates
my $outtxt=$api->process_templates($intxt, sub {
shift; #$name
my $params=shift;
shift; #$wikitext
shift; #$data
my $oname=shift;
my $ret="{{$oname";
for my $p (@$params){
$ret.='|'.($fixLinks->($p));
}
$ret.="}}";
return $ret;
});
# Now clean up the rest of the page.
$outtxt=$fixLinks->($outtxt);
if($outtxt ne $intxt){
my @summary=();
push @summary, "bypassing $fix redirection URL".($fix==1?'':'s') if $fix;
unless(@summary){
$api->warn("Changes made with no summary for $page, not editing");
next;
}
$summary[$#summary]='and '.$summary[$#summary] if @summary>1;
my $summary=ucfirst(join((@summary>2)?', ':' ', @summary));
$api->log("$summary in $page");
my $r=$api->edit($tok, $outtxt, "$summary. $screwup", 1, 1);
if(lc($r->{'code'}) eq 'failure' && exists($r->{'edit'}{'spamblacklist'})){
my $bl=$r->{'edit'}{'spamblacklist'};
$api->log("Write failed on $page: Blacklisted link $bl");
$api->warn("Write failed on $page: Blacklisted link $bl\n");
$api->whine("Redirect to blacklisted URL in [[:$page]]", "MediaWiki's [[MediaWiki:Spam-blacklist|spam blacklist]] complained about <nowiki>$bl</nowiki>. Note there may be more than one blacklisted link in the page. Please fix manually.", Pagename=>'User:AnomieBOT/ReplaceExternalLinks4 problems', NoSmallPrint=>1);
next;
}
if($r->{'code'} ne 'success'){
$api->warn("Write failed on $page: ".$r->{'error'}."\n");
next;
}
}
# If we've been at it long enough, let another task have a go.
return 0 if time()>=$endtime;
}
$self->{'iter'}=undef;
}
$api->log("May be DONE!");
return 3600;
}
1;