#! /usr/bin/env python
# Public domain; MZMcBride; 2018, 2019
import pymysql
import wikitools
import settings
# Title of the wiki page the report is written to: the configured root page
# followed by a fixed subpage name.
report_title = settings.rootpage + 'Potentially untagged misspellings'
# Wikitext skeleton of the whole report page. The single %s placeholder is
# filled with the already-rendered report sections.
report_template = u'''\
Potentially untagged misspellings, limited to 4000 entries; data as of
<onlyinclude>~~~~~</onlyinclude>.
<strong>NOTE: USE CAUTION WITH THIS LIST.</strong>
Instead of a misspelling, a redirect might more suitably be categorized as
an alternative name!!!!!!!!!!!!!!!!!
Possible templates to use:
* {{tlx|R from alternative name}}
* {{tlx|R from alternative spelling}}
* {{tlx|R from misspelling}}
%s
'''
def get_categorized_redirects(cursor, category):
    """Return the IDs of namespace-0 pages that belong to ``category``.

    ``cursor`` is a DB-API cursor; ``category`` is the category title as it
    is stored in ``categorylinks.cl_to`` and is passed as a bound query
    parameter. The result is a ``set`` of ``int`` page IDs (empty when the
    query returns no rows).
    """
    query = '''\
/* potenshuntaggedmisspellings.py SLOW_OK */
SELECT
page_id
FROM page
JOIN categorylinks
ON cl_from = page_id
WHERE cl_to = %s
AND page_namespace = 0;
'''
    cursor.execute(query, (category,))
    # Each row carries a single column; coerce it to int so membership tests
    # against integer IDs behave consistently.
    return {int(record[0]) for record in cursor.fetchall()}
def get_incoming_article_links_count(cursor, page_id):
    """Return how many namespace-0 pages link to the page with ``page_id``.

    ``cursor`` is a DB-API cursor. ``page_id`` may be an ``int`` or a numeric
    string; it is converted with ``int()`` (raising ``ValueError`` for
    non-numeric input) and sent as a bound parameter rather than being
    spliced into the SQL text. Returns the ``COUNT(*)`` result as an ``int``.
    """
    cursor.execute('''\
/* potenshuntaggedmisspellings.py SLOW_OK */
SELECT
COUNT(*)
FROM page AS p1
JOIN pagelinks
ON pl_from = p1.page_id
JOIN page AS p2
ON p2.page_namespace = pl_namespace
AND p2.page_title = pl_title
WHERE p2.page_id = %s
AND p1.page_namespace = 0;
''', (int(page_id),))
    # COUNT(*) without GROUP BY always yields exactly one row.
    return int(cursor.fetchone()[0])
# Authenticate against the wiki API; the session is used later to save the
# report page.
wiki = wikitools.Wiki(settings.apiurl)
wiki.login(settings.username, settings.password)

# Open the database connection. No user/password is given here; connection
# options are read from ~/.my.cnf.
conn = pymysql.connect(
    read_default_file='~/.my.cnf',
    db=settings.dbname,
    host=settings.host,
)
cursor = conn.cursor()

# Union of the page IDs found in each of the three categories; pages in this
# set are excluded from the report.
categorized_redirects = set()
for category_name in (
    'Redirects_from_misspellings',
    'Redirects_from_alternative_names',
    'Redirects_from_alternative_spellings',
):
    categorized_redirects |= get_categorized_redirects(cursor, category_name)
# Pairs of easily confused words. A redirect is reported when one word of a
# pair appears in the redirect title and the other word appears in the target
# title, in either direction. Both words are lower-cased before comparison and
# titles are split on underscores, so matching is per whole word and ignores
# case.
potential_misspellings = [
['Schultz', 'Shultz'],
['Fogel', 'Fogle'],
['Jacobsen', 'Jacobson'],
['Fraiser', 'Frasier'],
['Japenese', 'Japanese'],
['Jarrod', 'Jared'],
['Pennslyvania', 'Pennsylvania'],
['Craig', 'Criag'],
['Buffet', 'Buffett'],
['Historic', 'Historical'],
['Ginsburg', 'Ginsberg'],
['Allen', 'Alan'],
['Allan', 'Alan'],
['Alen', 'Allan'],
['Allen', 'Alen'],
['Steven', 'Stephen'],
['Stevens', 'Stephens'],
['Daniel', 'Dannel'],
['Dannel', 'Dannell'],
['Stefan', 'Stephan'],
['Stefanie', 'Stephanie'],
['William', 'Willem'],
['Lilly', 'Lily'],
['Jonathan', 'Johnathan'],
['Jon', 'John'],
['Johnny', 'Jonny'],
['Carey', 'Cary'],
['Mill', 'Mills'],
['Scarlet', 'Scarlett'],
['Phillip', 'Philip'],
['Seymore', 'Seymour'],
['Hoffman', 'Hoffmann'],
['Ferrell', 'Farrell'],
['McEntire', 'McIntire'],
['McEntire', 'McEntyre'],
['McEntire', 'MacEntire'],
['Selma', 'Salma'],
['Rogen', 'Rogan'],
['Cohn', 'Cohen'],
['Donny', 'Donnie'],
['Diana', 'Dianna'],
['Reed', 'Reid'],
['Laurence', 'Lawrence'],
['Mathias', 'Matthias'],
['Mathew', 'Matthew'],
['Clingon', 'Klingon'],
['Independance', 'Independence'],
# NOTE(review): the scan lower-cases both words, so this entry and the one
# above reduce to the same pair.
['independance', 'independence'], # case sensitivity is tricky; mills...
['Hellen', 'Helen'],
['Brittany', 'Britney'],
['Katherine', 'Catherine'],
['Katherine', 'Katharine'],
['Catharine', 'Katharine'],
['Marissa', 'Marisa'],
['Tomei', 'Tomie'],
['Constanza', 'Costanza'],
['Costansa', 'Costanza'],
['Jayson', 'Jason'],
['Zweig', 'Zwieg'],
['Raimond', 'Raymond'],
['Sean', 'Shawn'],
['Corey', 'Cory'],
['Rodgers', 'Rogers'],
['Rodger', 'Roger'],
['Elizabeth', 'Elisabeth'],
['Kris', 'Chris'],
['Kristy', 'Kristie'],
['Kristy', 'Christie'],
['Christy', 'Christie'],
['Christy', 'Kristie'],
['Dennis', 'Denis'],
['Hanna', 'Hannah'],
['Tolkein', 'Tolkien'],
['Henri', 'Henry'],
['Herman', 'Hermann'],
['Thompson', 'Thomson'],
['Jeffrey', 'Geoffrey'],
['Jeffery', 'Geoffrey'],
['Jeffery', 'Jeffrey'],
['Jeff', 'Geoff'],
['Geoffry', 'Geoffrey'],
['Cusak', 'Cusack'],
['Terrel', 'Terrell'],
['Anahiem', 'Anaheim'],
['Fisher', 'Fischer'],
['Nicky', 'Nikki'],
['Untied', 'United'],
['Cincinnati', 'Cinncinatti'],
['Cincinnati', 'Cinncinati'],
['Cincinnati', 'Cinncinnatti'],
['Cincinnati', 'Cinncinnati'],
['Cincinnati', 'Cincinati'],
['Cincinnati', 'Cincinatti'],
['Cincinnati', 'Cincinnatti'],
['Connecticut', 'Connecticuit'],
['Connecticut', 'Conetticut'],
['Connecticut', 'Connetecuit'],
['Connecticut', 'Conneticut'],
['Connecticut', 'Conecticut'],
['Connecticut', 'Connecticutt'],
['Connecticut', 'Conneticuit'],
['Connecticut', 'Connetecut'],
['Connecticut', 'Connectecut'],
['Connecticut', 'Connnecticut'],
['Connecticut', 'Connectecuit'],
['Minnesota', 'Minnasota'],
['Minnesota', 'Minnestoa'],
['Minnesota', 'Minesotta'],
['Minnesota', 'Minesota'],
['Minnesota', 'Minasota'],
['Minnesota', 'Minnessotta'],
['Minnesota', 'Minnesotta'],
['Minnesota', 'Minessota'],
['Minnesota', 'Minnessota'],
['Minnesota', 'Minnesoda'],
['Minnesota', 'Minessotta'],
['Holliday', 'Holiday'],
['Inquirer', 'Enquirer'],
['Abby', 'Abbey'],
['Abbi', 'Abby'],
['Abbie', 'Abby'],
['Derrick', 'Derek'],
['Jenifer', 'Jennifer'],
['Jeniffer', 'Jennifer'],
['House', 'Hosue'],
['Jarrett', 'Jarret'],
['Stamford', 'Stanford'],
['Millennial', 'Millenial'],
['Millennium', 'Millenium'],
['Emmett', 'Emmitt'],
['Emmett', 'Emmet'],
['Emmett', 'Emmit'],
['Confrence', 'Conference'],
['Coference', 'Conference'],
['Ann', 'Anne'],
['Sara', 'Sarah'],
['Lyn', 'Lynn'],
['Lyn', 'Lynne'],
['Lynn', 'Lynne'],
['Megan', 'Meagan'],
['Megan', 'Meghan'],
['Meagan', 'Meghan'],
['Eric', 'Erik'],
['Ariana', 'Arianna'],
['Ana', 'Anna'],
['Karl', 'Carl'],
['Weird', 'Wierd'],
['Alec', 'Alex'],
['Susie', 'Suzie'],
['Nic', 'Nick'],
['Forrester', 'Forester'],
['Fahrenheit', 'Farenheit'],
['Louie', 'Louis'],
['Joseph', 'Josef'],
['Rogue', 'Rouge'],
['Dwayne', 'Dwyane'],
['Nicolas', 'Nicholas'],
['Nicolas', 'Nickolas'],
['Nicolas', 'Nikolas'],
['Nickolas', 'Nikolas'],
['Nicholas', 'Nickolas'],
['Nicholas', 'Nikolas'],
['Peterson', 'Petersen'],
['Adrienne', 'Adrian'],
['Adrienne', 'Adrianne'],
['Adrianne', 'Adrian'],
['Lindsey', 'Lindsay'],
['Hillary', 'Hilary'],
['Margo', 'Margot'],
['Brendan', 'Brandon'],
['Christine', 'Christina'],
['Kristen', 'Kirsten'],
['Katarina', 'Katrina'],
]
# Scan the tab-separated redirect dump (redirect_id, redirect_title,
# target_title per line) and collect one wikitext table row for every redirect
# whose title and target differ by one of the configured word pairs.
#
# ``i`` is the 1-based number of the next row; ``output`` maps a
# "source / target" grouping label to the list of rendered rows for that pair.
i = 1
output = {}

# Lower-case every pair once up front. Pairs that become identical after
# lower-casing are kept only once, otherwise the same redirect would be listed
# (and counted) twice under the same grouping.
normalized_pairs = []
for pm in potential_misspellings:
    pair = (pm[0].lower(), pm[1].lower())
    grouping = '%s / %s' % pair
    if grouping in output:
        continue
    output[grouping] = []
    normalized_pairs.append(pair)

with open('/data/scratch/enwiki-namespace-0-redirects-2019-06-03.txt') as file_:
    for line in file_:
        # The report is capped at 4000 rows.
        if i > 4000:
            break
        line = line.rstrip('\n')
        # Skip blank lines (which cannot be unpacked into three fields) and
        # the column-header line.
        if not line or line.startswith('redirect_id'):
            continue
        (redirect_id, redirect_title, target_title) = line.split('\t')
        # Redirects that already carry one of the known categories are not
        # reported; this does not depend on the word pair, so test it once.
        if int(redirect_id) in categorized_redirects:
            continue
        # Titles use underscores between words; split once per line instead
        # of once per word pair.
        redirect_words = set(redirect_title.lower().split('_'))
        target_words = set(target_title.lower().split('_'))
        for source, target in normalized_pairs:
            forward = source in redirect_words and target in target_words
            backward = target in redirect_words and source in target_words
            if not (forward or backward):
                continue
            # unicode() is the Python 2 builtin: the dump is read as bytes
            # and decoded as UTF-8 for the unicode row template.
            output['%s / %s' % (source, target)].append(u'''\
| %d
| [[Special:WhatLinksHere/%s|%d]]
| {{plhnr|1=%s}}
| [[%s]]
|-''' % (
                i,
                unicode(redirect_title, 'utf-8'),
                get_incoming_article_links_count(cursor, redirect_id),
                unicode(redirect_title.replace('_', ' '), 'utf-8'),
                unicode(target_title.replace('_', ' '), 'utf-8'),
            ))
            i += 1
            # Stop as soon as the cap is reached so that one line matching
            # several pairs cannot push the report past the limit.
            if i > 4000:
                break
# Wikitext for one report section: a heading followed by a sortable table.
# The first %s is the section heading, the second %s is the block of
# already-rendered table rows.
report_section = u'''\
== %s ==
{| class="wikitable sortable plainlinks"
|- style="white-space:nowrap;"
! No.
! Article links
! Source
! Target
|-
%s
|}
'''
# Render one section per grouping that has at least one row, save the report
# page, and release the database resources.
try:
    # Iterate the groupings in sorted order so the section order is stable
    # from run to run instead of following dict iteration order.
    final = [
        report_section % (grouping, '\n'.join(rows))
        for grouping, rows in sorted(output.items())
        if rows
    ]
    report = wikitools.Page(wiki, report_title)
    report_text = report_template % ('\n'.join(final))
    # The page text is handed to the wiki client as UTF-8 encoded bytes.
    report_text = report_text.encode('utf-8')
    report.edit(report_text, summary=settings.editsumm, bot=1)
finally:
    # Close the cursor and connection even when rendering or saving fails.
    cursor.close()
    conn.close()