Source code for StaleReqphotoBot, which identifies {{reqphoto}} instances that appear likely to be "stale" (where the article page has one or more images) and adds them to a category of articles which may need to have the reqphoto template removed.
#! /usr/bin/python
# StaleReqphotoBot
#
# Examine each article that transcludes {{reqphoto}}. If the
# main article page has at least one image, add the article's talk
# page to Category:Articles which may no longer need images.
#
# Do not revisit any article more often than once every six
# months. Skip articles with {{reqphoto|of=...}}. Skip
# articles titled "National Register of Historic Places listing..."
import wikipedia, catlib, pagegenerators
import sqlite3
import time, sys
import re
import wikitemplate
import socket
from datetime import datetime, timedelta
# Category whose members are examined; main() walks it recursively.
startCatName = "Category:Wikipedia requested photographs"

# Passed as the `start` argument of the category page generator.
# NOTE(review): presumably a title to resume the walk from -- confirm.
startCatAfter = None

# Category appended to the talk page of an article that already has images.
hasImageCatName = "Category:Articles which may no longer need images"

# Edit summary for the talk-page save.
editComment = (
    "[[User:PhotoCatBot|PhotoCatBot]] thinks this article may no longer "
    "need a photo request. Please check and update the talk page!"
)
def main():
    """Process every page found under the photo-request category.

    Opens the diary database, walks `startCatName` recursively and hands
    each page to update_stale_reqphotos(). Framework errors and socket
    timeouts raised for a single page are logged and do not stop the run.
    The diary connection is closed when the walk ends, whether it finished
    normally or was aborted by an exception.
    """
    diary = initialize_diary()
    try:
        # The original bound this to an unused local; the call is kept in
        # case the framework relies on it for site/login initialisation.
        wikipedia.getSite()

        # Enumerate the pages categorised as requesting a photograph.
        photoreq_cat = catlib.Category(None, startCatName)
        photoreq_pages = pagegenerators.CategorizedPageGenerator(
            photoreq_cat, recurse=True, start=startCatAfter)

        for p in photoreq_pages:
            try:
                update_stale_reqphotos(diary, p)
            except (wikipedia.Error, socket.timeout):
                wikipedia.output("%s raised on %s" % (sys.exc_info(), p.title()))
    finally:
        # Release the SQLite handle instead of leaking it until exit.
        diary.close()
def update_stale_reqphotos(diary, page):
    """Decide whether one page's photo request looks stale and flag it.

    `page` may be either the article or its talk page; the counterpart is
    derived with toggleTalkPage(). The page is skipped when any of the
    following holds:

      * the article title starts with "National Register of Historic
        Places listing";
      * the diary says the talk page was handled within the last 6 months;
      * a {{reqphoto}} on the talk page has an "of=" parameter;
      * the article has an infobox without an image parameter;
      * the article has no qualifying images;
      * the talk page is already in `hasImageCatName`.

    Otherwise the talk page text is rebuilt with `hasImageCatName` added.

    NOTE: the save (talk.put) and the diary update are commented out, so
    the function currently only shows the diff it would apply.

    diary -- open sqlite3 connection returned by initialize_diary()
    page  -- framework page object taken from the category walk
    """
    if page.isTalkPage():
        talk = page
        article = page.toggleTalkPage()
    else:
        article = page
        talk = page.toggleTalkPage()

    # Skip NRHP Listing articles per doncram.
    if article.title().startswith('National Register of Historic Places listing'):
        wikipedia.output("%s: skipping" % article.title())
        return

    # Skip this page if we have modified it in the last 6 months.
    if recently_updated(diary, talk):
        wikipedia.output("%s was updated within 6 months" % talk.title())
        return

    # If the page has a {{reqphoto}} with the "of=" parameter,
    # we'll assume it's a very specific photo request and
    # ignore it even if the page has images. This is the way
    # to short-circuit the bot from re-adding a page inappropriately
    # to category 'Articles which may no longer need images'.
    reqphotos = find_reqphotos_on(talk)
    reqphotos_have_of = any(param.startswith("of=")
                            for req in reqphotos
                            for param in req[1])
    if reqphotos_have_of:
        wikipedia.output("%s has {{reqphoto|of=}}, skipping" % article.title())
        return

    # If the article has an infobox *and* any infobox
    # does not have an image, skip it -- the image request
    # is assumed to still be legitimate in this case.
    # Suggestion by {{user|Emperor}}.
    # A list (not filter()) is built so that the emptiness test and the
    # [0] lookup below do not depend on filter() returning a list.
    infoboxes_lacking_image = [box for box in find_infoboxes(article)
                               if infobox_lacks_image(box)]
    if infoboxes_lacking_image:
        wikipedia.output("skipping %s: {{%s}} lacks an image"
                         % (article.title(), infoboxes_lacking_image[0][0]))
        return

    # Only articles that appear to contain images are candidates for
    # 'Articles which may no longer need images'.
    if not has_images(article):
        return

    text = talk.get()
    cats = talk.categories()
    hasImageCat = catlib.Category(None, hasImageCatName, sortKey=article.title())
    if hasImageCat in cats:
        wikipedia.output("%s already in %s, skipping" % (talk.title(), hasImageCatName))
        return

    newtext = wikipedia.replaceCategoryLinks(text, cats + [hasImageCat])
    if text == newtext:
        return

    try:
        # Dry run: the real save is disabled, only the diff is shown.
        #talk.put(newtext, editComment)
        wikipedia.showDiff(text, newtext)
    except Exception:
        # Narrower than a bare except so Ctrl-C / SystemExit still work.
        wikipedia.output("could not save %s: %s" % (talk.title(), sys.exc_info()))
    # Disabled together with the save above; re-enable both at once.
    #update_modification_time(diary, talk)
def find_reqphotos_on(page):
    """Return the template entries of *page* whose name is exactly 'Reqphoto'.

    Entries come from page.templatesWithParams(); each is indexable with
    the template name at position 0. Order is preserved.
    """
    matches = []
    for entry in page.templatesWithParams():
        if entry[0] == 'Reqphoto':
            matches.append(entry)
    return matches
def find_infoboxes(page):
    """Return the template entries of *page* whose name starts with 'Infobox'.

    Entries come from page.templatesWithParams(); each is indexable with
    the template name at position 0. Order is preserved.
    """
    found = []
    for entry in page.templatesWithParams():
        if entry[0].startswith('Infobox'):
            found.append(entry)
    return found
def nonempty_image_param(param):
    """Tell whether a template parameter is an "image=" naming a raster file.

    Matches, case-insensitively, a parameter that starts with "image",
    then "=", and whose value contains a .jpg, .jpeg, .gif or .png name.
    These are the same extensions has_images() counts; .svg is left out
    on purpose. Leading whitespace before the parameter name is tolerated.

    param -- one "name=value" parameter string of a template

    Returns the match object (truthy) on success, otherwise None.
    """
    return re.match(r'\s*image\s*=.*\.(jpe?g|gif|png)', param, re.I)
def infobox_lacks_image(template):
    """Return True when no parameter of *template* is a non-empty image.

    template -- indexable entry whose item 1 is the list of parameter
                strings; each is tested with nonempty_image_param().
    """
    for candidate in template[1]:
        if nonempty_image_param(candidate):
            return False
    return True
# Check to see if the page includes a JPG, GIF or PNG image.
# Skip .SVG because it is so often used for maps, logos, icons,
# placeholders and other small art that is not intended by the reqphoto
# template.
def has_images(page):
    """Report whether *page* links to at least one JPG/JPEG/GIF/PNG image.

    page -- object providing imagelinks() and title()

    Returns True or False for the extension test (case-insensitive, on
    each image's title). Returns None, after logging, when fetching the
    image links raises an ordinary exception; callers treat that as
    "no images". KeyboardInterrupt and SystemExit are not swallowed.
    """
    try:
        images = page.imagelinks()
    except Exception:
        wikipedia.output("%s raised on %s" % (sys.exc_info(), page.title()))
        return None
    return any(re.match(r'.*\.(jpg|jpeg|gif|png)$', img.title(), re.I)
               for img in images)
def initialize_diary(path='StaleReqphotoBot.sqlite3'):
    """Open the diary database, creating its table on first use.

    The diary has one table, update_times, mapping a page title to the
    time it was recorded; the time defaults to SQLite's CURRENT_TIMESTAMP.
    The connection is opened with PARSE_DECLTYPES so that the TIMESTAMP
    column is converted according to its declared type when read back.

    path -- database file to open; defaults to the bot's usual file in
            the current directory (":memory:" is handy for testing)

    Returns the open sqlite3 connection. If creating the table fails the
    connection is closed before the exception propagates.
    """
    db = sqlite3.connect(path, detect_types=sqlite3.PARSE_DECLTYPES)
    try:
        c = db.cursor()
        try:
            c.execute("""CREATE TABLE IF NOT EXISTS update_times (
                title TEXT PRIMARY KEY,
                update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )""")
            db.commit()
        finally:
            c.close()
    except Exception:
        # Do not hand back (or leak) a half-initialised connection.
        db.close()
        raise
    return db
def update_modification_time(db, page):
    """Record in the diary that *page* has just been handled.

    The row keyed by the page title is inserted, or replaced if present,
    without an explicit time so the column default supplies it. The
    change is committed before returning.

    db   -- open sqlite3 connection to the diary
    page -- object providing title()
    """
    statement = 'INSERT OR REPLACE INTO update_times (title) VALUES (?)'
    cursor = db.cursor()
    cursor.execute(statement, (page.title(),))
    db.commit()
    cursor.close()
def recently_updated(db, page):
    """Tell whether the diary recorded *page* within the last 180 days.

    db   -- open sqlite3 connection to the diary; the update_time column
            must come back as a datetime (the diary is opened with
            PARSE_DECLTYPES for that purpose)
    page -- object providing title()

    Returns True if a row exists for the title and is less than 180 days
    old, False if it is older or if there is no row at all.
    """
    c = db.cursor()
    try:
        c.execute('SELECT update_time FROM update_times WHERE title = ?',
                  (page.title(),))
        r = c.fetchone()
    finally:
        c.close()

    if r is None:
        return False

    expire_time = r[0] + timedelta(180)
    # The stored value comes from SQLite's CURRENT_TIMESTAMP, which is UTC,
    # so it must be compared with the current UTC time, not local time.
    return datetime.utcnow() < expire_time
def close_diary(db):
    """Close the diary's sqlite3 connection.

    db -- connection returned by initialize_diary()
    """
    # sqlite3 connections have close(), not disconnect(); the old call
    # raised AttributeError.
    db.close()
# Script entry: run the bot. This executes whenever the module is loaded.
try:
    main()
finally:
    # Always executed, even when main() raises.
    # NOTE(review): presumably tells the framework this bot has finished
    # (e.g. releases its throttle slot) -- confirm in the framework docs.
    wikipedia.stopme()