Wikipedia:Database reports/Potentially untagged misspellings/Configuration

potenshuntaggedmisspellings.py

edit
#! /usr/bin/env python
# Public domain; MZMcBride; 2018, 2019

import pymysql
import wikitools

import settings

# Title of the wiki page that the finished report is saved to.
report_title = settings.rootpage + 'Potentially untagged misspellings'

# Wikitext skeleton of the report page.  The single %s placeholder receives
# the per-pair sections joined with newlines.  The "4000" in the text mirrors
# the entry cap applied while scanning the redirect dump; keep them in sync.
report_template = u'''\
Potentially untagged misspellings, limited to 4000 entries; data as of
<onlyinclude>~~~~~</onlyinclude>.

<strong>NOTE: USE CAUTION WITH THIS LIST.</strong>

Instead of a misspelling, a redirect might more suitably be categorized as
an alternative name!!!!!!!!!!!!!!!!!

Possible templates to use:

* {{tlx|R from alternative name}}
* {{tlx|R from alternative spelling}}
* {{tlx|R from misspelling}}

%s
'''


def get_categorized_redirects(cursor, category):
    """Return the IDs of main-namespace pages that belong to ``category``.

    ``cursor`` is a DB-API cursor on the wiki database; ``category`` is the
    category title as stored in ``categorylinks.cl_to``.  The result is a
    ``set`` of ``int`` page IDs (empty when the category has no such members).
    """
    query = '''\
    /* potenshuntaggedmisspellings.py SLOW_OK */
    SELECT
      page_id
    FROM page
    JOIN categorylinks
    ON cl_from = page_id
    WHERE cl_to = %s
    AND page_namespace = 0;
    '''
    # The category name is passed as a bound parameter, not interpolated.
    cursor.execute(query, (category,))
    return set(int(record[0]) for record in cursor.fetchall())


def get_incoming_article_links_count(cursor, page_id):
    """Return how many main-namespace pages link to the page ``page_id``.

    The count is the number of ``pagelinks`` rows whose source page is in
    namespace 0 and whose target namespace/title match the page identified by
    ``page_id``.

    ``cursor`` is a DB-API cursor on the wiki database.  ``page_id`` may be an
    ``int`` or a numeric string (callers pass the ID as text read from a dump
    file); anything else raises ``ValueError`` before the database is touched.
    """
    # Bind the ID as a query parameter instead of formatting it into the SQL
    # text, and coerce it to int so it is compared numerically against
    # page_id rather than being sent as a quoted string.
    cursor.execute('''\
    /* potenshuntaggedmisspellings.py SLOW_OK */
    SELECT
      COUNT(*)
    FROM page AS p1
    JOIN pagelinks
    ON pl_from = p1.page_id
    JOIN page AS p2
    ON p2.page_namespace = pl_namespace
    AND p2.page_title = pl_title
    WHERE p2.page_id = %s
    AND p1.page_namespace = 0;
    ''', (int(page_id),))
    return int(cursor.fetchone()[0])


# Authenticate against the wiki API; the report is saved through this session.
wiki = wikitools.Wiki(settings.apiurl)
wiki.login(settings.username, settings.password)

# Connect to the database replica.  Host and database name come from
# settings; the remaining client options are read from ~/.my.cnf.
conn = pymysql.connect(
    read_default_file='~/.my.cnf',
    host=settings.host,
    db=settings.dbname,
)
cursor = conn.cursor()

# Page IDs of redirects that already carry one of the relevant categories;
# these are excluded from the report.
categorized_redirects = set()
for category_name in (
    'Redirects_from_misspellings',
    'Redirects_from_alternative_names',
    'Redirects_from_alternative_spellings',
):
    categorized_redirects |= get_categorized_redirects(cursor, category_name)

# Pairs of words that are easily confused for one another.  A redirect is
# reported when one word of a pair appears in its title and the other word
# appears in its target's title (in either direction); the comparison is done
# on lower-cased, underscore-separated words.
potential_misspellings = [
    ['Schultz', 'Shultz'],
    ['Fogel', 'Fogle'],
    ['Jacobsen', 'Jacobson'],
    ['Fraiser', 'Frasier'],
    ['Japenese', 'Japanese'],
    ['Jarrod', 'Jared'],
    ['Pennslyvania', 'Pennsylvania'],
    ['Craig', 'Criag'],
    ['Buffet', 'Buffett'],
    ['Historic', 'Historical'],
    ['Ginsburg', 'Ginsberg'],
    ['Allen', 'Alan'],
    ['Allan', 'Alan'],
    ['Alen', 'Allan'],
    ['Allen', 'Alen'],
    ['Steven', 'Stephen'],
    ['Stevens', 'Stephens'],
    ['Daniel', 'Dannel'],
    ['Dannel', 'Dannell'],
    ['Stefan', 'Stephan'],
    ['Stefanie', 'Stephanie'],
    ['William', 'Willem'],
    ['Lilly', 'Lily'],
    ['Jonathan', 'Johnathan'],
    ['Jon', 'John'],
    ['Johnny', 'Jonny'],
    ['Carey', 'Cary'],
    ['Mill', 'Mills'],
    ['Scarlet', 'Scarlett'],
    ['Phillip', 'Philip'],
    ['Seymore', 'Seymour'],
    ['Hoffman', 'Hoffmann'],
    ['Ferrell', 'Farrell'],
    ['McEntire', 'McIntire'],
    ['McEntire', 'McEntyre'],
    ['McEntire', 'MacEntire'],
    ['Selma', 'Salma'],
    ['Rogen', 'Rogan'],
    ['Cohn', 'Cohen'],
    ['Donny', 'Donnie'],
    ['Diana', 'Dianna'],
    ['Reed', 'Reid'],
    ['Laurence', 'Lawrence'],
    ['Mathias', 'Matthias'],
    ['Mathew', 'Matthew'],
    ['Clingon', 'Klingon'],
    ['Independance', 'Independence'],
    # NOTE(review): both words are lower-cased before matching, so the entry
    # below is equivalent to the one above.
    ['independance', 'independence'], # case sensitivity is tricky; mills...
    ['Hellen', 'Helen'],
    ['Brittany', 'Britney'],
    ['Katherine', 'Catherine'],
    ['Katherine', 'Katharine'],
    ['Catharine', 'Katharine'],
    ['Marissa', 'Marisa'],
    ['Tomei', 'Tomie'],
    ['Constanza', 'Costanza'],
    ['Costansa', 'Costanza'],
    ['Jayson', 'Jason'],
    ['Zweig', 'Zwieg'],
    ['Raimond', 'Raymond'],
    ['Sean', 'Shawn'],
    ['Corey', 'Cory'],
    ['Rodgers', 'Rogers'],
    ['Rodger', 'Roger'],
    ['Elizabeth', 'Elisabeth'],
    ['Kris', 'Chris'],
    ['Kristy', 'Kristie'],
    ['Kristy', 'Christie'],
    ['Christy', 'Christie'],
    ['Christy', 'Kristie'],
    ['Dennis', 'Denis'],
    ['Hanna', 'Hannah'],
    ['Tolkein', 'Tolkien'],
    ['Henri', 'Henry'],
    ['Herman', 'Hermann'],
    ['Thompson', 'Thomson'],
    ['Jeffrey', 'Geoffrey'],
    ['Jeffery', 'Geoffrey'],
    ['Jeffery', 'Jeffrey'],
    ['Jeff', 'Geoff'],
    ['Geoffry', 'Geoffrey'],
    ['Cusak', 'Cusack'],
    ['Terrel', 'Terrell'],
    ['Anahiem', 'Anaheim'],
    ['Fisher', 'Fischer'],
    ['Nicky', 'Nikki'],
    ['Untied', 'United'],
    ['Cincinnati', 'Cinncinatti'],
    ['Cincinnati', 'Cinncinati'],
    ['Cincinnati', 'Cinncinnatti'],
    ['Cincinnati', 'Cinncinnati'],
    ['Cincinnati', 'Cincinati'],
    ['Cincinnati', 'Cincinatti'],
    ['Cincinnati', 'Cincinnatti'],
    ['Connecticut', 'Connecticuit'],
    ['Connecticut', 'Conetticut'],
    ['Connecticut', 'Connetecuit'],
    ['Connecticut', 'Conneticut'],
    ['Connecticut', 'Conecticut'],
    ['Connecticut', 'Connecticutt'],
    ['Connecticut', 'Conneticuit'],
    ['Connecticut', 'Connetecut'],
    ['Connecticut', 'Connectecut'],
    ['Connecticut', 'Connnecticut'],
    ['Connecticut', 'Connectecuit'],
    ['Minnesota', 'Minnasota'],
    ['Minnesota', 'Minnestoa'],
    ['Minnesota', 'Minesotta'],
    ['Minnesota', 'Minesota'],
    ['Minnesota', 'Minasota'],
    ['Minnesota', 'Minnessotta'],
    ['Minnesota', 'Minnesotta'],
    ['Minnesota', 'Minessota'],
    ['Minnesota', 'Minnessota'],
    ['Minnesota', 'Minnesoda'],
    ['Minnesota', 'Minessotta'],
    ['Holliday', 'Holiday'],
    ['Inquirer', 'Enquirer'],
    ['Abby', 'Abbey'],
    ['Abbi', 'Abby'],
    ['Abbie', 'Abby'],
    ['Derrick', 'Derek'],
    ['Jenifer', 'Jennifer'],
    ['Jeniffer', 'Jennifer'],
    ['House', 'Hosue'],
    ['Jarrett', 'Jarret'],
    ['Stamford', 'Stanford'],
    ['Millennial', 'Millenial'],
    ['Millennium', 'Millenium'],
    ['Emmett', 'Emmitt'],
    ['Emmett', 'Emmet'],
    ['Emmett', 'Emmit'],
    ['Confrence', 'Conference'],
    ['Coference', 'Conference'],
    ['Ann', 'Anne'],
    ['Sara', 'Sarah'],
    ['Lyn', 'Lynn'],
    ['Lyn', 'Lynne'],
    ['Lynn', 'Lynne'],
    ['Megan', 'Meagan'],
    ['Megan', 'Meghan'],
    ['Meagan', 'Meghan'],
    ['Eric', 'Erik'],
    ['Ariana', 'Arianna'],
    ['Ana', 'Anna'],
    ['Karl', 'Carl'],
    ['Weird', 'Wierd'],
    ['Alec', 'Alex'],
    ['Susie', 'Suzie'],
    ['Nic', 'Nick'],
    ['Forrester', 'Forester'],
    ['Fahrenheit', 'Farenheit'],
    ['Louie', 'Louis'],
    ['Joseph', 'Josef'],
    ['Rogue', 'Rouge'],
    ['Dwayne', 'Dwyane'],
    ['Nicolas', 'Nicholas'],
    ['Nicolas', 'Nickolas'],
    ['Nicolas', 'Nikolas'],
    ['Nickolas', 'Nikolas'],
    ['Nicholas', 'Nickolas'],
    ['Nicholas', 'Nikolas'],
    ['Peterson', 'Petersen'],
    ['Adrienne', 'Adrian'],
    ['Adrienne', 'Adrianne'],
    ['Adrianne', 'Adrian'],
    ['Lindsey', 'Lindsay'],
    ['Hillary', 'Hilary'],
    ['Margo', 'Margot'],
    ['Brendan', 'Brandon'],
    ['Christine', 'Christina'],
    ['Kristen', 'Kirsten'],
    ['Katarina', 'Katrina'],
]

# Wikitext for one table row: running number, incoming article-link count
# (linked to Special:WhatLinksHere), redirect title, and target title.
row_template = u'''\
| %d
| [[Special:WhatLinksHere/%s|%d]]
| {{plhnr|1=%s}}
| [[%s]]
|-'''

# Lower-case every pair once, up front, and drop pairs that collapse to the
# same grouping after lower-casing (e.g. 'Independance' / 'independance');
# otherwise every matching redirect would be listed twice under one heading.
word_pairs = []
seen_groupings = set()
for pm in potential_misspellings:
    source = pm[0].lower()
    target = pm[1].lower()
    grouping = '%s / %s' % (source, target)
    if grouping not in seen_groupings:
        seen_groupings.add(grouping)
        word_pairs.append((grouping, source, target))

# Cap on the number of report rows; the report text announces this limit.
max_entries = 4000

# i is the running row number (1-based); output maps a grouping heading
# ('source / target') to the list of wikitext rows collected for it.
i = 1
output = {}
with open('/data/scratch/enwiki-namespace-0-redirects-2019-06-03.txt') as file_:
    for line in file_:
        if i > max_entries:
            break
        # Strip the newline first so that blank lines are really detected,
        # and skip the column-header line before trying to unpack it.
        line = line.rstrip('\n')
        if not line or line.startswith('redirect_id'):
            continue
        (redirect_id, redirect_title, target_title) = line.split('\t')
        # Redirects that are already categorized never appear in the report.
        if int(redirect_id) in categorized_redirects:
            continue
        # Titles use underscores between words; compare whole words only.
        redirect_words = redirect_title.lower().split('_')
        target_words = target_title.lower().split('_')
        for grouping, source, target in word_pairs:
            # A single redirect can match several pairs; re-check the cap so
            # the report never exceeds the advertised number of entries.
            if i > max_entries:
                break
            matches = (
                (source in redirect_words and target in target_words) or
                (target in redirect_words and source in target_words)
            )
            if not matches:
                continue
            output.setdefault(grouping, []).append(row_template % (
                i,
                unicode(redirect_title, 'utf-8'),
                get_incoming_article_links_count(cursor, redirect_id),
                unicode(redirect_title.replace('_', ' '), 'utf-8'),
                unicode(target_title.replace('_', ' '), 'utf-8'),
            ))
            i += 1

# Wikitext for one report section: a heading (first %s) followed by a
# sortable table whose body rows are supplied by the second %s.
report_section = u'''\
== %s ==

{| class="wikitable sortable plainlinks"
|- style="white-space:nowrap;"
! No.
! Article links
! Source
! Target
|-
%s
|}
'''

try:
    # Emit sections in alphabetical order of their heading so the page layout
    # does not depend on dict iteration order; groupings without rows are
    # left out.
    final = [
        report_section % (heading, '\n'.join(rows))
        for heading, rows in sorted(output.items())
        if rows
    ]
    report = wikitools.Page(wiki, report_title)
    report_text = report_template % ('\n'.join(final),)
    # The page text is handed to wikitools as UTF-8 encoded bytes.
    report_text = report_text.encode('utf-8')
    report.edit(report_text, summary=settings.editsumm, bot=1)
finally:
    # Release the database handles even if building or saving the report
    # fails.
    cursor.close()
    conn.close()