Участник:LankLinkBot/reflinks.py

# -*- coding: utf-8 -*-
"""
This bot will search for references which consist only of a link
without a title (i.e. <ref>[http://www.google.fr/]</ref> or
<ref>http://www.google.fr/</ref>) and will fetch the html title from
the link, to use it as the title of the wiki link in the reference, i.e.
<ref>[http://www.google.fr/search?q=test test - Google Search]</ref>

Every 20 edits, the bot checks a special stop page: if
the page has been edited, it stops.

DumZiBoT runs this script on en: and fr: at every new dump; running it on
de: is not allowed anymore.

Because this script relies on noreferences.py, you need to configure
noreferences.py for your wiki, or it will not work.

pdfinfo is needed for parsing PDF titles.

See [[:en:User:DumZiBoT/refLinks]] for more information on the bot.

&params;

-limit:n                Stops after n edits

-xml:dump.xml           Use an XML dump instead of fetching pages one by
                        one from pagegenerators.py (faster, and lighter on
                        the servers)

-xmlstart               Page to start with when using an XML dump

-ignorepdf              Do not handle PDF files (handy if you use Windows and
                        can't get pdfinfo)

Basic pagegenerators commands, -page, etc...
"""
# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
#
# Distributed under the terms of the GPL

__version__ = '$Id: reflinks.py 8180 2010-05-15 13:14:44Z amir $'

import sys, re, urllib2, httplib, socket, codecs, ftplib, urllib
import wikipedia, pagegenerators, noreferences
import subprocess, tempfile, os, gzip, StringIO
import traceback
try:
    from wikificator import wikify
except ImportError:
    print 'WARNING: wikificator.py not found'
    wikify = None

try:
    import citeweb
except ImportError:
    citeweb = None

stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
            'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe',
            'fa':u'کاربر:Amirobot/EditThisPageToStopMe',
            'it':u'Utente:Marco27Bot/EditThisPageToStopMe',
            'ko':u'사용자:GrassnBreadRefBot/EditThisPageToStopMe1',
            'hu':u'User:Damibot/EditThisPageToStopMe',
            'en':u'User:DumZiBoT/EditThisPageToStopMe',
            'ru':u'Участник:LankLinkBot/EditThisPageToStopMe',
            'pl':u'Wikipedysta:MastiBot/EditThisPageToStopMe',
            'zh':u'User:Sz-iwbot',
}

msg = {
    'de':u'Bot: Korrektes Referenzformat (siehe [[:en:User:DumZiBoT/refLinks]])',
    'en':u'Bot: Converting bare references, using ref names to avoid duplicates, see [[User:DumZiBoT/refLinks|FAQ]]',
    'es':u'Formateando las referencias que no tuvieran títulos (FAQ : [[:en:User:DumZiBoT/refLinks]] )',
    'fa':u'ربات:تصحیح پيوند به بيرون يا عنوان پيوند. [[:en:User:DumZiBoT/refLinks|اطلاعات بیشتر]]',
    'fr':u'Bot: Correction des refs. mal formatées, suppression doublons en utilisant des références nommées (cf. [[Utilisateur:DumZiBoT/liensRefs|explications]])',
    'hu':u'Robot: Forráshivatkozások kibővítése a hivatkozott oldal címével',
    'it':u'Bot: Sistemo note con collegamenti esterni senza titolo ([[Utente:Marco27Bot/refLinks.py|documentazione]])',
    'ko':u'봇: url만 있는 주석을 보강, (영문)[[:en:User:DumZiBoT/refLinks]] 참조',
    'pl':u'Bot: Dodanie tytułów do linków w przypisach (patrz [[Wikipedysta:MastiBot/refLinks|FAQ]])',
    'ru':u'Bot: добавление заголовков в сноски; исправление дублирующихся сносок',
    }
convmsg = {
    'en': u'',
    'ru': u'; исправление ссылок на википедию',
    }

deadLinkTagName = {'de':u'{{dead link}}',
                   'en':u'{{dead link}}',
                   'es':u'{{enlace roto2|',
                   'fr':u'{{lien mort}}',
                   'fa':u'{{پیوند مرده}}',
                   'hu':u'{{halott link}}',
                   'it':u'{{Collegamento interrotto|',
                   'ko':u'{{죽은 바깥 고리}}',
                   'pl':u'{{Martwy link}}',
                   }

deadLinkTag = {'de':u'',
               'en':u'[%s] {{dead link}}',
               'es':u'{{enlace roto2|%s}}',
               'fa':u'[%s] {{پیوند مرده}}',
               'fr':u'[%s] {{lien mort}}',
               'hu':u'[%s] {{halott link}}',
               'it':u'{{Collegamento interrotto|%s}}',
               'ko':u'[%s] {{죽은 바깥 고리}}',
               'pl':u'[%s] {{Martwy link}}',
               }

comment = {'ar':u'عنوان مولد بالبوت',
           'de':u'Automatisch generierter titel',
           'en':u'Bot generated title',
           'es':u'Título generado por un bot',
           'fa':u'عنوان تصحیح شده توسط ربات',
           'fr':u'Titre généré automatiquement',
           'hu':u'Robot generálta cím',
           'it':u'Titolo generato automaticamente',
           'ko':u'봇이 따온 제목',
           'pl':u'Tytuł wygenerowany przez bota',
           'ru':u'Заголовок добавлен ботом',
           }

autogen = { 'en': 'autogenerated',
            'ar': 'مولد تلقائيا',
            'it': 'autogenerato',
            'pl': 'autonazwa',
            }

soft404 = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE)
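# Illustrative: soft404 matches e.g. 'http://example.com/404.html' and any
# URL containing 'error' or 'NotFound' (case-insensitive)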
# Extracts the domain name
domain = re.compile(ur'^(\w+)://(?:www\.|)([^/]+)')

badtitles = ur"""
# is
(test|
# starts with
    ^\W*(
        register
        |registration
        |(sign|log)[ \-]?in
        |subscribe
        |sign[ \-]?up
        |log[ \-]?on
        |untitled[ ]?(document|page|\d+|$)
        |404[ ]
        ).*
# anywhere
    |.*(
        403[ ]forbidden
        |(404|page|file|information|resource).*not([ ]*be)?[ ]*(available|found)
        |site.*disabled
        |error[ ]404
        |error.+not[ ]found
        |not[ ]found.+error
        |404[ ]error
        |\D404\D
        |check[ ]browser[ ]settings
        |log[ \-]?(on|in)[ ]to
        |site[ ]redirection
        # see www.antonov.com
        |reflinks
        # fr
        |(404|page|site).*en[ ]travaux
        # es
        |sitio.*no +disponible
        # it
        |((pagina|sito)[ ](non[ ]trovata|inesistente)|accedi)
        # ru
        |сайт[ ]прода[её]тся
        |сайт[ ]закрыт
        |сайт.*заблокирован
        |устаревший[ ]адрес[ ]страницы
        |страница.*(не[ ]найдена|отсутствует|недоступна)
        |доступ.*ограничен
        |обслуживание[ ]сайта.*приостановлено
        |не[ ]?возможно[ ]отобразить[ ]страницу
        |добавление[ ]новости
        |это[ ]наилучший[ ]источник[ ]информации
        |временно[ ]заблокирован
        |срок[ ]регистрации[ ]домена[ ]закончился
     ).*
# ends with
    |.*(
        register
        |registration
        |(sign|log)[ \-]?in
        |subscribe|sign[ \-]?up
        |log[ \-]?on
        )\W*$
# fill text
    |^(
        report
        |blog
        |new[ ]page([ ]\d+)?
        |press
        |details
        |search[ ]results
        |ban[ ]page
        |home[ ]page
        |redirector
        |youtube[ ]-[ ]broadcast[ ]yourself\.?
        |yahoo![ ]news
        |\d+
        |\W*sorry\W*
        |invalid[ ]id
        |uid=\d+[ ]genome[ ]result # ???
        |[\? ]+
        |новости
        |новая[ ]страница[ ]?\d*
        |(коммерсантъ\.[ ])?версия[ ]для[ ]печати
        |армс-тасс
        |cnews:
        |итар-тасс
        |главная([ ]страница)?
        |ошибка
        |гостевая[ ]книга
        |добавление[ ]новости
        |архив[ ]новостей
        |президент[ ]россии
        |пресс-центр
        |MetallZone[ ]—[ ]место[ ]в[ ]рунете[ ]где[ ]узнавать[ ]больше[ ]о[ ]тяжелой[ ]музыке[ ]стало[ ]проще!
    )$
)"""

# ignore these references in DuplicateReferences
dupignorerefs = { 'en': ur'^\s*ibid',
                  'ru': ur'^\s*(ibid|там же|указ\. соч\.)',
                  }

# Regex that matches bare references
linksInRef = re.compile(
    ur'(?i)<ref(?P<name>[^>]*)>\s*\[?(?:'
    ur'(?P<url1>(?:http|https|ftp)://[^\[\]\s<>"{]+)\s*(?P<templ1>{{[-\w {}|]+}})|'
    ur'(?P<url2>(?:http|https|ftp)://[^\[\]\s<>"]+)'
    ur')\s*\]?\s*(?P<templ2>{{[-\w {}|]+}})?</ref>',
    re.UNICODE)
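# Doctest-style illustration (not executed):
#   >>> m = linksInRef.search(u'<ref>[http://www.example.org/page]</ref>')
#   >>> m.group('url2'), m.group('templ2')
#   (u'http://www.example.org/page', None)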

splitRef = re.compile(
    ur'<ref>'
    ur'((?:http|https|ftp)://[^\s,;<\[\]"]+)'
    ur'\s*[,;]?\s*'
    ur'((?:http|https|ftp)://[^\s,;<\[\]"]+)'
    ur'(?:\s*[,;]?\s*'
    ur'((?:http|https|ftp)://[^\s,;<\[\]"]+)'
    ur')?'
    ur'(?:\s*[,;]?\s*'
    ur'((?:http|https|ftp)://[^\s,;<\[\]"]+)'
    ur')?'
    ur'</ref>',
    re.UNICODE)
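# Illustrative: splitRef splits a ref holding several links, e.g.
#   >>> splitRef.search(u'<ref>http://a.example; http://b.example</ref>').groups()
#   (u'http://a.example', u'http://b.example', None, None)
# (only used by the disabled ref-splitting block in ReferencesRobot.run)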

# ignore references when we are inside one of these constructs
ignoreInside = (
    ('<nowiki', '</nowiki'),            # <nowiki>
    ('<pre>', '</pre>'),
    ('<source', '</source'),
    ('<code', '</code'),
    ('<!--', '-->'),                    # html comment
    )

# ignore duplicated references when we are inside one of these constructs
dupIgnoreInside = (
    #(re.compile(r'[^{]{\|'), re.compile(r'\|}[^}]')), # table
    ('{|', '|}'),                       # table
    ('<table', '</table'),
    ('{{', '}}'),                       # template
    ('<nowiki', '</nowiki'),            # <nowiki>
    ('<pre>', '</pre>'),
    ('<source', '</source'),
    ('<code', '</code'),
    ('<!--', '-->'),                    # html comment
    )

# Matches a URL that points at the index (root) page of a website
dirIndex = re.compile(ur'^\w+://[^/]+/?((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE)
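# e.g. it matches 'http://example.com/' and 'http://example.com/index.html',
# but not 'http://example.com/news/index.html'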
# known useless redirect targets (soft 404s, site front pages)
badRedirs = (
    # NB: should be string, not unicode
    re.compile(r'^http://global.nytimes.com/\?iht$'),
    re.compile(r'^http://www.youtube.com/index\?ytsession=.*'),
    re.compile(r'^http://search.yahoo.com/web\?fr=404.*'),
    re.compile(r'^http://slovari.yandex.ru/~книги/$'),
    re.compile(r'^http://slovari.yandex.ru/~книги/[^/]+/$'),
    re.compile(r'^http://www.billboard.com/#/$'),
    )

encodingsMap = (
    ('gb2312', 'gbk'),
    ('shiftjis', 'shift jis 2004'),
    ('xeucjp', 'euc-jp'),
    ('win1251', 'cp1251'),
    ('cp1251', 'cp1251'),
    ('windows1251', 'cp1251'),
    ('windowscp1251', 'cp1251'),
    ('microsoftcp1251', 'cp1251'),
    ('88591', 'iso8859-1'),
    ('bi5', 'big5'),
    ('macintosh', 'mac-roman'),
    ('windows31j', 'shift-jis'),
    ('xsjis', 'shift-jis'),
    ('iso88598i', 'iso-8859-8'),
    ('koi8', 'koi8-r'),
    ('window1251', 'cp1251'),
    )
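# The charset name found in the page is lowercased and stripped of ' ', '_'
# and '-' before the lookup above (see ReferencesRobot.getUrl), so e.g.
# 'Windows-1251' -> 'windows1251' -> 'cp1251'.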

# Download this file :
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# ( maintained by User:Dispenser )
listof404pages = '404-links.txt'

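# templatesList maps a URL prefix to a tuple:
#   (title regex, template format string, optional URL regex extracting the
#    template argument[, alternate title regex used by fixPrev's fix mode]).
# RefLink.refTempl() fills the format string with (link, title match).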
templatesList = {
    'en': {},
    'ru': {
        # slovari.yandex.ru
        u'http://slovari.yandex.ru/dict/olympic/article/olymp/': (
            re.compile(u'(.*) — Олимпийская энциклопедия — Яндекс.Словари'),
            u'{{Из БОЭ|1=%s|title=%s}}',
            None,
            ),
        u'http://slovari.yandex.ru/~книги/Олимпийская энциклопедия/': (
            re.compile(u'(.*) — Олимпийская энциклопедия — Яндекс.Словари'),
            u'{{Из БОЭ|1=%s|title=%s}}',
            None,
            ),
        u'http://slovari.yandex.ru/dict/bse/article/': (
            re.compile(u'(.*) — БСЭ — Яндекс.Словари'),
            u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
            None,
            ),
        u'http://slovari.yandex.ru/~книги/БСЭ/': (
            re.compile(u'(.*) — БСЭ — Яндекс.Словари'),
            u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
            None,
            ),
        u'http://slovari.yandex.ru/dict/brokminor/article/': (
            re.compile(u'(.*) — Брокгауз и Ефрон — Яндекс.Словари'),
            u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        u'http://slovari.yandex.ru/~книги/Брокгауз и Ефрон/': (
            re.compile(u'(.*) — Брокгауз и Ефрон — Яндекс.Словари'),
            u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        u'http://slovari.yandex.ru/dict/geography/article/geo/': (
            re.compile(u'(.*) — Географические названия — Яндекс.Словари'),
            u'{{Из|Словаря современных географических названий|ссылка=%s|заглавие=%s}}',
            None,
            ),
        u'http://slovari.yandex.ru/~книги/Географические названия/': (
            re.compile(u'(.*) — Географические названия — Яндекс.Словари'),
            u'{{Из|Словаря современных географических названий|ссылка=%s|заглавие=%s}}',
            None,
            ),
        # dic.academic.ru
        u'http://dic.academic.ru/dic.nsf/brokgauz_efron/': (
            re.compile(u'(.*)'),
            u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        u'http://dic.academic.ru/dic.nsf/bse/': (
            re.compile(u'(.*)'),
            u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
            None,
            ),
        u'http://dic.academic.ru/dic.nsf/enc2p/': (
            re.compile(u'(.*)'),
            u'{{Из|Даль|ссылка=%s|заглавие=%s}}',
            None,
            ),
        u'http://dic.academic.ru/dic.nsf/vasmer/': (
            re.compile(u'(.*)'),
            u'{{Из|[[Этимологический словарь русского языка (М. Фасмер)|Этимологического словаря русского языка Макса Фасмера]]|ссылка=%s|заглавие=%s}}',
            None,
            ),
        # www.vehi.net
        u'http://vehi.net/brokgauz/all/': (
            re.compile(u'(.*)'),
            u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        # gatchina3000.ru
        u'http://gatchina3000.ru/brockhaus-and-efron-encyclopedic-dictionary/': (
            re.compile(u'(.*) / Энциклопедия Брокгауза и Эфрона'),
            u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        # www.cultinfo.ru
        u'http://cultinfo.ru/fulltext/1/001/008/': (
            re.compile(u'(.*)'),
            u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
            None,
            ),
        # sci-lib.com
        u'http://be.sci-lib.com/article': (
            re.compile(u'(.*)'),
            u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        u'http://bse.sci-lib.com/article': (
            re.compile(u'(.*)'),
            u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
            None,
            ),
        # femto.com.ua
        u'http://femto.com.ua/articles/': (
            re.compile(u'(.*) - Физическая энциклопедия'),
            u'{{Из|ФЭ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        # wikisource.org
        u'http://ru.wikisource.org/wiki/ЭСБЕ/': (
            re.compile(u'ЭСБЕ/(.*) — Викитека'),
            u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        # feb-web.ru
        u'http://feb-web.ru/feb/litenc/encyclop/': (
            re.compile(u'(.*) // Литературная энциклопедия.*'),
            u'{{Из ЛЭ|1=%s|title=%s}}',
            None,
            ),
        u'http://feb-web.ru/feb/kle/': (
            re.compile(u'(.*) // Краткая литературная энциклопедия.*'),
            u'{{Из КЛЭ|1=%s|title=%s}}',
            None,
            ),
        # krugosvet.ru
        u'http://krugosvet.ru/articles/': (
            re.compile(u'(.*)'),
            u'{{Из|Кругосвет|ссылка=%s|заглавие=%s}}',
            None,
            ),
        # eleven.co.il
        u'http://eleven.co.il/article/': (
            re.compile(ur'(.*)\. Электронная еврейская энциклопедия'),
            u'{{Из|ЭЕЭ|ссылка=%s|заглавие=%s}}',
            None,
            ),
        # imdb
        u'http://imdb.com/title/tt': (
            re.compile(u'(.*)'),
            u'{{Imdb title|%s|%s}}',
            re.compile(r'http://imdb.com/title/tt(\d+)/?$')
            ),
        u'http://imdb.com/name/nm': (
            re.compile(u'(.*)'),
            u'{{Imdb name|%s|%s}}',
            re.compile(r'http://imdb.com/name/nm(\d+)/?$')
            ),
        # britannica.com
        u'http://britannica.com/ebchecked/topic/': (
            re.compile(u'(.*) -- Britannica Online Encyclopedia'),
            u'{{britannica-link|%s|%s}}',
            re.compile(r'http://britannica.com/ebchecked/topic/(\d+).*'),
            re.compile(u'^(.*) — Britannica Online Encyclopedia$'),
            ),
        }
    }

fixUrl = (
    re.compile(r'(http://www.kommersant.ru/.*)print=true$'),
    re.compile(r'(http://kommersant.ru/.*)print=true$'),
    re.compile(r'(http://top.rbc.ru/.*)\?print$'),
    )
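# getUrl() truncates a matching URL to group(1), e.g. the hypothetical
# 'http://top.rbc.ru/society/1.shtml?print' loses its '?print' suffix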

class DeadLinkException(Exception):
    pass

def checkInside(text, ignore=dupIgnoreInside):
    '''return True when text contains an unclosed table/template/html-comment (i.e. we are inside one)'''
    for start, end in ignore:
        if isinstance(start, str):
            cnt1 = text.count(start)
        else:
            cnt1 = len(start.findall(text))
        if isinstance(end, str):
            cnt2 = text.count(end)
        else:
            cnt2 = len(end.findall(text))
        if cnt1 != cnt2:
            # we are inside
            return True
    return False
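# Doctest-style illustration (not executed):
#   >>> checkInside(u'text {| table row')
#   True
#   >>> checkInside(u'{{done}} and {| x |}')
#   False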

def safeReplace(text, old, new, count=0, ignore=dupIgnoreInside):
    '''replace old with new, skipping occurrences inside tables/templates/html-comments'''
    newtext = ''
    i = 0
    j = 0
    while True:
        i = text.find(old, j)
        if i < 0:
            newtext = newtext + text[j:]
            break
        k = i + len(old)
        if checkInside(text[:i], ignore):
            newtext = newtext + text[j:k]
            j = k
            continue
        newtext = newtext + text[j:k].replace(old, new)
        j = k
        count -= 1
        if count == 0:
            newtext = newtext + text[k:]
            break
    return newtext
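# Doctest-style illustration (not executed) -- the occurrence inside the
# HTML comment is left alone:
#   >>> safeReplace(u'a <!-- a --> a', u'a', u'b')
#   u'b <!-- a --> b'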

def safeReplaceRegexp(text, old, new, count=0):
    '''like safeReplace, but uses a regexp'''
    newtext = ''
    i = 0
    j = 0
    while True:
        m = re.search(old, text[j:], re.U)
        if not m:
            newtext = newtext + text[j:]
            break
        i = j + m.start()
        k = j + m.end()
        if checkInside(text[:i]):
            newtext = newtext + text[j:k]
            j = k
            continue
        newtext = newtext + re.sub(old, new, text[j:k], 1)
        j = k
        count -= 1
        if count == 0:
            newtext = newtext + text[k:]
            break
    return newtext

class XmlDumpPageGenerator:
    """Xml generator that yiels pages containing bare references"""

    def __init__(self, xmlFilename, xmlStart, namespaces):
        self.xmlStart = xmlStart
        self.namespaces = namespaces
        self.skipping = bool(xmlStart)
        self.site = wikipedia.getSite()

        import xmlreader
        dump = xmlreader.XmlDump(xmlFilename)
        self.parser = dump.parse()

    def __iter__(self):
        return self

    def next(self):
        while True:
            entry = self.parser.next()
            if self.skipping:
                if entry.title != self.xmlStart:
                    continue
                self.skipping = False
            page = wikipedia.Page(self.site, entry.title)
            if self.namespaces:
                if page.namespace() not in self.namespaces:
                    continue
            if linksInRef.search(entry.text):
                return page

class RefLink:
    """Container to handle a single bare reference"""

    def __init__(self, link, name, templ):
        self.refname = name or ''
        self.link = link
        self.site = wikipedia.getSite()
        self.linkComment = wikipedia.translate(self.site, comment)
        self.url = re.sub(u'#.*', '', self.link)
        self.title = None
        self.templ = templ
        self.templatesList = wikipedia.translate(wikipedia.getSite(),
                                                 templatesList)

    WikipediaUrl = re.compile(r'http://([a-z][a-z][a-z]?)\.wikipedia\.org/(?:wiki/|w/index\.php\?title=)([^#]+)(.*)')
    def refWikipedia(self):
        '''convert a URL into a wikipedia link,
        i.e. http://de.wikipedia.org/wiki/Fernuniversit%C3%A4t_in_Hagen#cite_note-0 -> [[:de:Fernuniversität in Hagen#cite_note-0]]'''
        url = self.link.encode('utf-8')
        match = self.WikipediaUrl.match(url)
        if not match:
            return False
        lang, title, frag = match.group(1), match.group(2), match.group(3)
        title = urllib.unquote(title).replace('_', ' ')
        if lang == wikipedia.getSite().lang:
            text = '[[%s%s]]' % (title, frag)
        else:
            text = '[[:%s:%s%s]]' % (lang, title, frag)
        text = unicode(text, 'utf-8')
        if self.templ:
            text = '<ref%s>%s%s</ref>' % (self.refname, text, self.templ)
        else:
            text = '<ref%s>%s</ref>' % (self.refname, text)
        return text

    def refTempl(self, fix=False):
        '''return the formatted template text, None if the link or title failed to match, or False if no appropriate template was found'''
        normlink = re.sub(r'^http://www\.', 'http://', self.link).lower()
        for t in self.templatesList:
            if normlink.startswith(t):
                if len(self.templatesList[t]) == 3:
                    pat, templ, linkpat = self.templatesList[t]
                    fixpat = pat
                else:
                    pat, templ, linkpat, fixpat = self.templatesList[t]
                if fix:
                    pat = fixpat
                match = pat.match(self.title)
                if match:
                    self.transform()
                    link = self.link
                    if linkpat:
                        m = linkpat.match(normlink)
                        if not m:
                            return None
                        link = m.group(1)
                    text = templ % (link, match.group(1))
                    if self.templ:
                        text = '<ref%s>%s %s</ref>' % (
                            self.refname, text, self.templ)
                    else:
                        text = '<ref%s>%s</ref>' % (
                            self.refname, text)
                    wikipedia.output(u'*** \03{lightred}%s\03{default} ***' %
                                     text)
                    return text
                elif fix:
                    print '*** refTempl:', self.link, self.title
                    return None
        return False

    def refCiteWeb(self, html, enc):
        if not citeweb:
            return False
        t = citeweb.citeweb(self.link, html, enc)
        if not t:
            return False
        if self.templ:
            return '<ref%s>%s %s</ref>' % (self.refname, t, self.templ)
        return '<ref%s>%s</ref>' % (self.refname, t)

    def refTitle(self):
        """Returns the <ref> with its new title"""
        self.transform()
        if len(self.title) <= 5:
            raise DeadLinkException('title too short: "%s"' % self.title.encode('utf-8'))
        wikipedia.output(u'\03{lightgreen}%s\03{default}' % self.title)
        if self.templ:
            text = '<ref%s>{{cite web | url = %s | title = %s}}<!-- %s --> %s</ref>' % (
                self.refname, self.link, self.title, self.linkComment, self.templ)
        else:
            text = '<ref%s>{{cite web | url = %s | title = %s}}<!-- %s --></ref>' % (
                self.refname, self.link, self.title, self.linkComment)
        return text

    def refLink(self):
        """No title has been found, return the unbracketed link"""
        link = self.link
        if len(link) > 200:
            link = '['+link+']'
        if self.templ:
            return '<ref%s>%s %s</ref>' % (self.refname, link, self.templ)
        return '<ref%s>%s</ref>' % (self.refname, link)

    def refDead(self):
        """Dead link, tag it with a {{dead link}}"""
        tag = wikipedia.translate(self.site, deadLinkTag) % self.link
        if self.templ:
            tagname = wikipedia.translate(self.site, deadLinkTagName)
            if tagname in self.templ:
                # do not add yet another {{dead link}}
                # FIXME
                return '<ref%s>%s %s</ref>' % (self.refname, self.link, self.templ)
            return '<ref%s>%s %s</ref>' % (self.refname, tag, self.templ)
        return '<ref%s>%s</ref>' % (self.refname, tag)

    def transform(self, ispdf=False):
        """Normalize the title"""
        if wikify:
            self.title = wikify(self.title)
        # Truncate long titles (threshold 180; keep 175 chars plus an ellipsis)
        if len(self.title) > 180:
            self.title = self.title[:175] + u" …"
        #convert html entities
        if not ispdf:
            self.title = wikipedia.html2unicode(self.title)
        self.title = re.sub(r'-+', '-', self.title)
        #remove formatting, i.e. long runs of ./+/-/=
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        #remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        #remove leading and trailing =/./;/,/-/+/_/spaces
        self.title = self.title.strip(r'=.;,-+_ ')

        #self.avoid_uppercase()
        #avoid pipe being interpreted as template parameters
        self.title = self.title.replace('|', '&#124;')
        #avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        #avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        #prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
        self.title = self.title.replace('<', '&lt;').replace('>', '&gt;')
        self.title = wikipedia.unicode2html(self.title, self.site.encoding())
        # TODO : remove HTML when both opening and closing tags are included

    def avoid_uppercase(self):
        """
        If title has more than 6 characters and has 60% of uppercase
        characters, capitalize() it
        """
        if len(self.title) <= 6:
            return
        nb_upper = 0
        nb_letter = 0
        for letter in self.title:
            if letter.isupper():
                nb_upper += 1
            if letter.isalpha():
                nb_letter += 1
            if letter.isdigit():
                return
        if float(nb_upper)/(nb_letter+1) > .70:
            self.title = self.title.title()

class DuplicateReferences:
    """
    When some references are duplicated in an article,
    name the first, and remove the content of the others
    """
    def __init__(self):
        # Match references
        self.REFS = re.compile(u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
        self.NAMES = re.compile(u'(?is).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
        self.SELF_CLOSED_NAMES = re.compile(u'(?i)<ref[^>]+name\s*=\s*(?P<quote>"?)\s*(?P<name>[^>]+)\s*(?P=quote)[^>]*/>')
        self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+?)\s*(?P=quote).*')
        self.autogen = wikipedia.translate(wikipedia.getSite(), autogen)
        self.IGNOREREFS = re.compile(
            wikipedia.translate(wikipedia.getSite(), dupignorerefs),
            re.I|re.U)

    def _cmpContent(self, s1, s2):
        '''compare two strings, ignoring whitespace'''
        s1 = re.sub(u'[\s\xa0]+', '', s1)
        s2 = re.sub(u'[\s\xa0]+', '', s2)
        return s1 == s2

    def _inDict(self, s, dct):
        for k in dct:
            if self._cmpContent(s, k):
                return dct[k]
        return False

    def process(self, text, page):
        # keys are ref groups
        # values are a dict where :
        #   keys are ref content
        #   values are [name, [list of full ref matches], quoted, need_to_change]
        foundRefs = {}
        foundRefNames = {}
        # Replace key by [value, quoted]
        namedRepl = {}
        # list of self-closed tags name
        selfClosedTags = []
        for match in self.SELF_CLOSED_NAMES.finditer(text):
            name = match.group('name')
            if not match.group('quote'):
                # FIXME
                name = name.strip()
            selfClosedTags.append(name)

        for match in self.REFS.finditer(text):
            content = match.group('content')
            if not content.strip():
                continue
            # check if ref inside table/template/html-comment
            prev_text = text[:match.start()]
            if checkInside(prev_text):
                #print 'ref inside', match.group()
                continue
            if self.IGNOREREFS.match(content):
                continue

            params = match.group('params')
            group = self.GROUPS.match(params)
            if group:
                # key on the group name itself: match objects never compare
                # equal, so refs in the same group would not share a dict
                group = group.group('group')
            if group not in foundRefs:
                foundRefs[group] = {}

            groupdict = foundRefs[group]
            indict = self._inDict(content, groupdict)
            if indict:
                v = indict
                v[1].append(match.group())
            else:
                v = [None, [match.group()], False, False]
            name = self.NAMES.match(params)
            if name:
                quoted = name.group('quote') == '"'
                name = name.group('name')
                if not quoted:
                    name = name.strip()
                if v[0]:
                    if v[0] != name:
                        namedRepl[name] = [v[0], v[2]]
                else:
                    #First name associated with this content
                    d = self._inDict(name, foundRefNames)
                    if d and name in selfClosedTags:
                        wikipedia.output('*** refs ambiguity: '+page.aslink())
                        d = False
                    if not d:
                        # first time ever we meet this name
                        v[2] = quoted
                        v[0] = name
                    else:
                        # this name is used with another content.
                        # We'll need to change it
                        v[3] = True

                foundRefNames[name] = 1
            groupdict[content] = v

        id = 1
        while self.autogen + str(id) in foundRefNames:
            id += 1
        for (g, d) in foundRefs.iteritems():
            if g:
                group = u'group="%s" ' % g
            else:
                group = u''

            for (k, v) in d.iteritems():
                if len(v[1]) == 1 and not v[3]:
                    continue
                name = v[0]
                if not name:
                    name = self.autogen + str(id)
                    id += 1
                else:
                    if name != u'""':
                        name = name.replace(u'"', '')
                    if v[2]:
                        name = u'"%s"' % name
                named = u'<ref %sname=%s>%s</ref>' % (group, name, k)

                text = safeReplace(text, v[1][0], named, 1)

                # make sure that the first (named ref) is not
                # removed later :
                if named in text:
                    pos = text.index(named) + len(named)
                    header = text[:pos]
                    end = text[pos:]
                    unnamed = u'<ref %sname=%s />' % (group, name)
                    for ref in v[1][1:]:
                        end = safeReplace(end, ref, unnamed)
                    text = header + end

        for (k,v) in namedRepl.iteritems():
            # TODO : Support ref groups
            name = v[0]
            if v[1]:
                name = u'"%s"' % name
            text = re.sub(ur'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name=%s />' % name, text)
        return text

class ReferencesRobot:
    def __init__(self, generator, acceptall=False, limit=None, ignorepdf=False):
        """
        - generator : Page generator
        - acceptall : boolean, is -always on ?
        - limit : int, stop after n modified pages
        - ignorepdf : boolean
        """
        self.generator = generator
        self.acceptall = acceptall
        self.limit = limit
        self.ignorepdf = ignorepdf
        self.site = wikipedia.getSite()
        self.stopPage = wikipedia.Page(self.site, wikipedia.translate(self.site, stopPage))

        self.titleBlackList = re.compile(badtitles, re.I | re.S | re.X | re.U)
        self.norefbot = noreferences.NoReferencesBot(None)

        self.deduplicator = DuplicateReferences()

        try:
            self.stopPageRevId = self.stopPage.latestRevision()
        except wikipedia.NoPage :
            wikipedia.output(u'The stop page %s does not exist'
                                % self.stopPage.aslink())
            raise
        try:
            self.deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
        except IOError:
            wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory')
            raise

        # Regex to grasp content-type meta HTML tag in HTML source
        self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
        # Extract the encoding from a charset property (from content-type !)
        self.CHARSET = re.compile(
            ur'(?i)charset\s*=\s*["\']?(?P<enc>[-_a-zA-Z0-9]+)')
        # Extract html title from page
        self.TITLE = re.compile(r'(?is)<title[^>]*>(.*?)</title\s*>')
        # Matches content inside <script>/<style>/HTML comments
        self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')

        # Authorized mime types for HTML pages
        self.MIME = re.compile(ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml',
                               re.I)

        # cache
        self.title_cache = {}

    def setAction(self, addconv=False):
        if addconv:
            m = wikipedia.translate(self.site, msg) + wikipedia.translate(self.site, convmsg)
        else:
            m = wikipedia.translate(self.site, msg)
        wikipedia.setAction(m)

    def put_page(self, page, new, showdiff=True):
        """
        Prints diffs between original and new text, then puts the new text
        """
        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        if not self.acceptall:
            if showdiff:
                wikipedia.showDiff(page.get(), new)
            choice = wikipedia.inputChoice(u'Do you want to accept ' +
                                           u'these changes?',
                                           ['Yes', 'No', 'All', 'See'],
                                           ['y', 'N', 'a', 's'], 'N')
            if choice == 'a':
                self.acceptall = True
            if choice == 'y':
                page.put_async(new)
            if choice == 's':
                wikipedia.output('='*72)
                wikipedia.output(new)
                wikipedia.output('='*72)
                self.put_page(page, new, showdiff=False)
                return
        if self.acceptall:
            #wikipedia.showDiff(page.get(), new)
            try:
                page.put(new)
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict'
                                  % (page.title(),))
            except wikipedia.SpamfilterError, e:
                wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
            except wikipedia.PageNotSaved, error:
                wikipedia.output(u'Error putting page: %s' % (error.args,))
            except wikipedia.LockedPage:
                wikipedia.output(u'Skipping %s (locked page)'
                                  % (page.title(),))
            except wikipedia.ServerError, e:
                wikipedia.output(u'Server Error : %s' % e)
            print '...done'

    def httpError(self, err_num, link, pagetitleaslink):
        """Log HTTP Error"""
        wikipedia.output(u'HTTP error (%s) for %s on %s'
                          % (err_num, link, pagetitleaslink),
                         toStdout = True)

    def getPDFTitle(self, f):
        """
        Use pdfinfo to retrieve title from a PDF.
        Unix-only, I'm afraid.
        """
        wikipedia.output( u'PDF file.' )
        fd, infile = tempfile.mkstemp()
        urlobj = os.fdopen(fd, 'w+b')   # binary read/write ('r+w' is not a valid mode)
        urlobj.write(f.read())
        urlobj.seek(0)  # rewind so pdfinfo reads the PDF from the start
        title = ''
        try:
            pdfinfo_out = subprocess.Popen(
                ["pdfinfo","/dev/stdin"],
                stdin=urlobj, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                shell=False).communicate()[0]
            for aline in pdfinfo_out.splitlines():
                if aline.lower().startswith('title'):
                    title = aline.split(None)[1:]
                    title = ' '.join(title)
                    if title != '': wikipedia.output(u'title: ' +title )
            wikipedia.output( u'PDF done.' )
        except ValueError:
            wikipedia.output( u'pdfinfo value error.' )
        except OSError:
            wikipedia.output( u'pdfinfo OS error.' )
        except:    # Ignore all other errors
            wikipedia.output( u'PDF processing error.' )
        finally:
            urlobj.close()
            os.unlink(infile)
        return title

    def getUrl(self, url, fullpage=False):
        url = url.encode('utf-8') # urllib2 wants a byte string; unicode URLs raise UnicodeError
        for pat in fixUrl:
            m = pat.match(url)
            if m:
                url = m.group(1)
                break
        f = None
        try:
            socket.setdefaulttimeout(20)
            req = urllib2.Request(url)
            req.add_header('Referer', url)
            req.add_header('Accept-Encoding', 'gzip')
            req.add_header('User-Agent', 'pywikipedia bot (reflinks.py)')
            f = urllib2.urlopen(req)
            #Try to get Content-Type from server
            headers = f.info()
            contentType = headers.getheader('Content-Type')
            if contentType and not self.MIME.search(contentType):
            ##     title = ''
            ##     if url.lower().endswith('.pdf') and not self.ignorepdf:
            ##         # If file has a PDF suffix
            ##         title = self.getPDFTitle(f)
            ##         if not re.match('(?i) *microsoft (word|excel|visio)', title):
            ##             title = wikipedia.html2unicode(title)
            ##             return title
            ##         else:
            ##             raise DeadLinkException('PDF title blacklisted')
                raise DeadLinkException('media')

            # Get the real url where we end (http redirects !)
            redir = f.geturl()
            if redir != url and domain.findall(redir) == domain.findall(url):
                if soft404.search(redir) and not soft404.search(url):
                    raise DeadLinkException('redirect 404')
            if redir != url:
                if (dirIndex.match(redir) and not dirIndex.match(url)):
                    #print 'WARNING: Redirect to root:', url, '->', redir
                    raise DeadLinkException('redirect to root: %s' % redir)
                for p in badRedirs:
                    if p.match(redir):
                        wikipedia.output(u'\03{lightred}WARNING Redirect to root\03{default}')
                        #print 'WARNING: Redirect to root:', url, '->', redir
                        raise DeadLinkException('bad redirect')

            # uncompress if necessary
            if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
                # Read the first 25,000 bytes (roughly 3x that when ungzipped)
                if fullpage:
                    compressed = StringIO.StringIO(f.read())
                else:
                    compressed = StringIO.StringIO(f.read(25000))
                f = gzip.GzipFile(fileobj=compressed)
                linkedpagetext = ''
                while True:
                    try:
                        chunk = f.read(512)
                    except: # (IOError, struct.error): truncated gzip stream
                        break
                    if not chunk:
                        break
                    linkedpagetext += chunk
            else:
                # Read the first 50,000 bytes
                # (HTML headers rarely exceed 10,000 bytes)
                if fullpage:
                    linkedpagetext = f.read()
                else:
                    linkedpagetext = f.read(50000)

            socket.setdefaulttimeout(None)

        except urllib2.HTTPError, e:
            # 410 Gone, indicates that the resource has been purposely removed
            # FIXME: RefLink.refDead should be fixed
            ##if e.code == 410 or (e.code == 404 and (u'\t%s\t' % url in self.deadLinks)):
            ##    return '{{dead-link}}'
            raise DeadLinkException('HTTP error %s' % e.code)
        except (urllib2.URLError,
                socket.error,
                IOError,
                httplib.error), e:
        #except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e:
            # FIXME
            #print 'Can\'t retrieve page:', url, str(e)
            raise DeadLinkException('connect error: %s' % e)
        except ValueError:
            # Known bug of httplib, google for :
            # "httplib raises ValueError reading chunked content"
            raise DeadLinkException('connect error')
        finally:
            if f:
                f.close()

        #remove <script>/<style>/comments/CDATA tags
        linkedpagetext = self.NON_HTML.sub('', linkedpagetext)

        meta_content = self.META_CONTENT.search(linkedpagetext)
        # use charset from http header
        s = None
        if contentType:
            s = self.CHARSET.search(contentType)
        if meta_content:
            tag = meta_content.group()
            # Prefer the contentType from the HTTP header :
            if not contentType:
                contentType = tag
            if not s:
                # use charset from html
                s = self.CHARSET.search(tag)
        if s:
            tmp = s.group('enc').strip().lower()
            naked = re.sub('[ _\-]', '', tmp)
            # Convert to the correct Python encoding name
            for encfrom, encto in encodingsMap:
                if naked == encfrom:
                    enc = encto
                    break
            else:
                enc = tmp
        else:
            enc = 'ascii'

        if not contentType:
            raise DeadLinkException('content-type not found')
        elif not self.MIME.search(contentType):
            raise DeadLinkException('media')

        return linkedpagetext, enc

    def getTitle(self, url):
        #print 'getTitle:', url

        linkedpagetext, enc = self.getUrl(url)
        # Retrieve the first non-empty string inside <title> tags
        title = ''
        for m in self.TITLE.finditer(linkedpagetext):
            title = m.group(1)
            title = ' '.join(title.split())
            if title:
                break

        try:
            title = unicode(title, enc)
        except UnicodeDecodeError:
            if enc == 'ascii':
                raise DeadLinkException('unknown encoding')
            else:
                raise DeadLinkException('decode error: %s' % enc)
        except LookupError:
            raise DeadLinkException('bad encoding: %s' % enc)

        # XXX Ugly hack: catch mojibake -- UTF-8 bytes mis-decoded as
        # latin-1 or cp1251 yield sequences such as u'Ð' or u'Рµ'
        if u'é' in title or u'Ð' in title or u'Рµ' in title:
            raise DeadLinkException('hybrid encoding')

        if self.titleBlackList.match(title):
            raise DeadLinkException('blacklisted title: "%s"' %
                                    title.encode('utf-8'))
        if not title:
            raise DeadLinkException('empty title')
        if title.lower() in url.lower():
            raise DeadLinkException('title in url: %s' % title.encode('utf-8'))

        return title

    prevRef = re.compile(
        ur'<ref(?P<name>[^>]*)>\['
        ur'(?P<url>(?:http|https|ftp)://[^\[\]\s<>"{]+) '
        ur'(?P<title>[^<]+)'
        ur'<!-- Заголовок добавлен ботом -->\]\s*'
        ur'(?P<templ>{{[-\w {}|]+}})?</ref>',
        re.I|re.U)
    def fixPrev(self, new_text):
        for match in self.prevRef.finditer(new_text):
            ref = RefLink(match.group('url'), match.group('name'),
                          match.group('templ'))
            if citeweb and citeweb.checksite(ref.link):
                try:
                    linkedpagetext, enc = self.getUrl(ref.link, fullpage=True)
                except DeadLinkException, err:
                    continue
                repl = ref.refCiteWeb(linkedpagetext, enc)
                if repl:
                    wikipedia.output(u'\03{lightred}%s\03{default}' % repl)
                    new_text = safeReplace(new_text, match.group(), repl,
                                           ignore=ignoreInside)
                    continue
            ref.title = match.group('title')
            repl = ref.refTempl(fix=True)
            if repl:
                pass
            elif repl is None:
                try:
                    title = self.getTitle(ref.url)
                except DeadLinkException:
                    continue
                ref.title = title
                repl = ref.refTempl()
                if not repl:
                    continue
            else:
                continue
            new_text = safeReplace(new_text, match.group(), repl,
                                   ignore=ignoreInside)
        return new_text

    def run(self):
        """
        Runs the Bot
        """
        socket.setdefaulttimeout(30)
        editedpages = 0
        for page in self.generator:
            self.setAction()
            #print '***', page.aslink(), '***'
            try:
                # Load the page's text from the wiki
                new_text = page.get()
                if not page.canBeEdited():
                    wikipedia.output(u"You can't edit page %s" % page.aslink())
                    continue
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page.aslink())
                continue
            except wikipedia.IsRedirectPage:
                wikipedia.output(u'Page %s is a redirect' % page.aslink())
                continue

            new_text = self.fixPrev(new_text)
            p = wikipedia.removeDisabledParts(page.get())

            if 0:
                # disabled: split refs containing several links,
                # i.e. "<ref>http://aaa; http://bbb</ref>" ->
                # "<ref>http://aaa</ref><ref>http://bbb</ref>"
                for match in splitRef.finditer(p):
                    groups = filter(None, match.groups())
                    r = '<ref>' + '</ref><ref>'.join(groups) + '</ref>'
                    p = p.replace(match.group(), r)

            for match in linksInRef.finditer(p):
                #print '>>>', match.group()
                #for each link to change
                link = match.group('url1') or match.group('url2')
                assert link
                templ = (match.group('templ1') or '').strip() + \
                        (match.group('templ2') or '').strip()

                if u'jstor.org' in link:
                    #TODO: Clean URL blacklist
                    continue

                ref = RefLink(link, match.group('name'), templ)
                wikipedia.output(':: %s - %s - %s' %
                                 (link, match.group('name'), templ))

                repl = ref.refWikipedia()
                if repl:
                    wikipedia.output(u'%s : converted to wiki-link: \03{lightred}%s\03{default}' % (link, repl))
                    new_text = safeReplace(new_text, match.group(), repl,
                                           ignore=ignoreInside)
                    self.setAction(addconv=True)
                    continue

                if citeweb and citeweb.checksite(ref.link):
                    try:
                        linkedpagetext, enc = self.getUrl(ref.link, fullpage=True)
                    except DeadLinkException, err:
                        print err, ref.link
                    else:
                        repl = ref.refCiteWeb(linkedpagetext, enc)
                        if repl:
                            wikipedia.output(u'\03{lightred}%s\03{default}' % repl)
                            new_text = safeReplace(new_text, match.group(), repl,
                                                   ignore=ignoreInside)
                            continue
                if ref.url in self.title_cache:
                    title = self.title_cache[ref.url]
                else:
                    try:
                        title = self.getTitle(ref.url)
                    except DeadLinkException, err:
                        err = unicode(str(err), 'utf-8')
                        if err == 'dead link':   # see the FIXME in getUrl()
                            repl = ref.refDead()
                            wikipedia.output(u'\03{lightred}Dead link\03{default} : %s' % ref.link)
                            new_text = safeReplace(new_text, match.group(), repl,
                                                   ignore=ignoreInside)
                            continue
                        else:
                            repl = ref.refLink()
                            wikipedia.output(u'%s : no title found: \03{lightpurple}%s\03{default}' % (ref.link, err))
                            new_text = safeReplace(new_text, match.group(), repl,
                                                   ignore=ignoreInside)
                            continue
                    # cache
                    self.title_cache[ref.url] = title
                ref.title = title
                repl = ref.refTempl()   # try to use template
                if not repl:
                    try:
                        repl = ref.refTitle()
                    except DeadLinkException, err:
                        err = unicode(str(err), 'utf-8')
                        repl = ref.refLink()
                        wikipedia.output(u'%s : \03{lightpurple}%s\03{default}' % (ref.link, err))
                        new_text = safeReplace(new_text, match.group(), repl,
                                               ignore=ignoreInside)
                        continue

                new_text = safeReplace(new_text, match.group(), repl,
                                       ignore=ignoreInside)

            # Add <references/> when needed, but ignore templates !
            if page.namespace() != 10:
                if self.norefbot.lacksReferences(new_text, verbose=False):
                    new_text = self.norefbot.addReferences(new_text)

            # Find duplicated refs
            new_text = self.deduplicator.process(new_text, page)

            if new_text.replace(' ', '') == page.get().replace(' ', ''):
                wikipedia.output('No changes were necessary in %s'
                                 % page.aslink())
                continue

            editedpages += 1
            self.put_page(page, new_text)

            if self.limit and editedpages >= self.limit:
                wikipedia.output('Edited %s pages, stopping.' % self.limit)
                return

            if editedpages % 20 == 0:
                wikipedia.output('\03{lightgreen}Checking stop page...\03{default}')
                actualRev = self.stopPage.latestRevision()
                if actualRev != self.stopPageRevId:
                    wikipedia.output(u'[[%s]] has been edited : Someone wants us to stop.' % self.stopPage)
                    return

def main():
    genFactory = pagegenerators.GeneratorFactory()

    xmlFilename = None
    xmlStart = None
    always = False
    ignorepdf = False
    limit = None
    namespaces = []
    generator = None
    for arg in wikipedia.handleArgs():
        if arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg.startswith('-summary:'):
            wikipedia.setAction(arg[9:])
        elif arg == '-always':
            always = True
        elif arg == '-ignorepdf':
            ignorepdf = True
        elif arg.startswith('-limit:'):
            limit = int(arg[7:])
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = wikipedia.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        else:
            genFactory.handleArg(arg)

    if xmlFilename:
        generator = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
    if not generator:
        generator = genFactory.getCombinedGenerator()
    if not generator:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('reflinks')
        return
    generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50)
    generator = pagegenerators.RedirectFilterPageGenerator(generator)
    if namespaces:
        generator = pagegenerators.NamespaceFilterPageGenerator(generator, namespaces)
    bot = ReferencesRobot(generator, always, limit, ignorepdf)
    bot.run()

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()