# -*- coding: utf-8 -*-
"""
This bot searches for references that consist only of a bare link
without a title (i.e. <ref>[http://www.google.fr/]</ref> or
<ref>http://www.google.fr/</ref>) and fetches the HTML title from
the link to use as the title of the wiki link in the reference, i.e.
<ref>[http://www.google.fr/search?q=test test - Google Search]</ref>
Every 20 edits, the bot checks a special stop page: if
that page has been edited, the bot stops.
DumZiBoT runs this script on en: & fr: at every new dump; running it on de: is not allowed anymore.
Since this script relies on noreferences.py, you need to configure noreferences.py for your wiki, or it will not work.
pdfinfo is needed for parsing PDF titles.
See [[:en:User:DumZiBoT/refLinks]] for more information on the bot.
&params;
-limit:n Stops after n edits
-xml:dump.xml Should be used instead of a simple page fetching
method from pagegenerators.py for performance and
load issues
-xmlstart Page to start with when using an XML dump
-ignorepdf Do not handle PDF files (handy if you use Windows and
can't get pdfinfo)
Basic pagegenerators commands, -page, etc...
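Typical invocations (illustrative only; the available generator options
depend on your pywikipedia installation):
python reflinks.py -lang:en -cat:SomeCategory -limit:10
python reflinks.py -xml:dump.xml -xmlstart:"Some article" -always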
"""
# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
#
# Distributed under the terms of the GPL
__version__ = '$Id: reflinks.py 8180 2010-05-15 13:14:44Z amir $'
import sys, re, urllib2, httplib, socket, codecs, ftplib, urllib
import wikipedia, pagegenerators, noreferences
import subprocess, tempfile, os, gzip, StringIO
import traceback
try:
from wikificator import wikify
except ImportError:
print 'WARNING: wikificator.py not found'
wikify = None
try:
import citeweb
except ImportError:
citeweb = None
stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe',
'fa':u'کاربر:Amirobot/EditThisPageToStopMe',
'it':u'Utente:Marco27Bot/EditThisPageToStopMe',
'ko':u'사용자:GrassnBreadRefBot/EditThisPageToStopMe1',
'hu':u'User:Damibot/EditThisPageToStopMe',
'en':u'User:DumZiBoT/EditThisPageToStopMe',
'ru':u'Участник:LankLinkBot/EditThisPageToStopMe',
'pl':u'Wikipedysta:MastiBot/EditThisPageToStopMe',
'zh':u'User:Sz-iwbot',
}
msg = {
'de':u'Bot: Korrektes Referenzformat (siehe [[:en:User:DumZiBoT/refLinks]])',
'en':u'Bot: Converting bare references, using ref names to avoid duplicates, see [[User:DumZiBoT/refLinks|FAQ]]',
'es':u'Formateando las referencias que no tuvieran títulos (FAQ : [[:en:User:DumZiBoT/refLinks]] )',
'fa':u'ربات:تصحیح پيوند به بيرون يا عنوان پيوند. [[:en:User:DumZiBoT/refLinks|اطلاعات بیشتر]]',
'fr':u'Bot: Correction des refs. mal formatées, suppression doublons en utilisant des références nommées (cf. [[Utilisateur:DumZiBoT/liensRefs|explications]])',
'hu':u'Robot: Forráshivatkozások kibővítése a hivatkozott oldal címével',
'it':u'Bot: Sistemo note con collegamenti esterni senza titolo ([[Utente:Marco27Bot/refLinks.py|documentazione]])',
'ko':u'봇: url만 있는 주석을 보강, (영문)[[:en:User:DumZiBoT/refLinks]] 참조',
'pl':u'Bot: Dodanie tytułów do linków w przypisach (patrz [[Wikipedysta:MastiBot/refLinks|FAQ]])',
'ru':u'Bot: добавление заголовков в сноски; исправление дублирующихся сносок',
}
convmsg = {
'en': u'',
'ru': u'; исправление ссылок на википедию',
}
deadLinkTagName = {'de':u'{{dead link}}',
'en':u'{{dead link}}',
'es':u'{{enlace roto2|',
'fr':u'{{lien mort}}',
'fa':u'{{پیوند مرده}}',
'hu':u'{{halott link}}',
'it':u'{{Collegamento interrotto|',
'ko':u'{{죽은 바깥 고리}}',
'pl':u'{{Martwy link}}',
}
deadLinkTag = {'de':u'',
'en':u'[%s] {{dead link}}',
'es':u'{{enlace roto2|%s}}',
'fa':u'[%s] {{پیوند مرده}}',
'fr':u'[%s] {{lien mort}}',
'hu':u'[%s] {{halott link}}',
'it':u'{{Collegamento interrotto|%s}}',
'ko':u'[%s] {{죽은 바깥 고리}}',
'pl':u'[%s] {{Martwy link}}',
}
comment = {'ar':u'عنوان مولد بالبوت',
'de':u'Automatisch generierter Titel',
'en':u'Bot generated title',
'es':u'Título generado por un bot',
'fa':u'عنوان تصحیح شده توسط ربات',
'fr':u'Titre généré automatiquement',
'hu':u'Robot generálta cím',
'it':u'Titolo generato automaticamente',
'ko':u'봇이 따온 제목',
'pl':u'Tytuł wygenerowany przez bota',
'ru':u'Заголовок добавлен ботом',
}
autogen = { 'en': 'autogenerated',
'ar': 'مولد تلقائيا',
'it': 'autogenerato',
'pl': 'autonazwa',
}
soft404 = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE)
# Extracts the domain name
domain = re.compile(ur'^(\w+)://(?:www\.|)([^/]+)')
badtitles = ur"""
# is
(test|
# starts with
^\W*(
register
|registration
|(sign|log)[ \-]?in
|subscribe
|sign[ \-]?up
|log[ \-]?on
|untitled[ ]?(document|page|\d+|$)
|404[ ]
).*
# anywhere
|.*(
403[ ]forbidden
|(404|page|file|information|resource).*not([ ]*be)?[ ]*(available|found)
|site.*disabled
|error[ ]404
|error.+not[ ]found
|not[ ]found.+error
|404[ ]error
|\D404\D
|check[ ]browser[ ]settings
|log[ \-]?(on|in)[ ]to
|site[ ]redirection
# see www.antonov.com
|reflinks
# fr
|(404|page|site).*en[ ]travaux
# es
|sitio.*no +disponible
# it
|((pagina|sito)[ ](non[ ]trovata|inesistente)|accedi)
# ru
|сайт[ ]прода[её]тся
|сайт[ ]закрыт
|сайт.*заблокирован
|устаревший[ ]адрес[ ]страницы
|страница.*(не[ ]найдена|осутствует|недоступна)
|доступ.*ограничен
|обслуживание[ ]сайта.*приостановлено
|не[ ]?возможно[ ]отобразить[ ]страницу
|добавление[ ]новости
|это[ ]наилучший[ ]источник[ ]информации
|временно[ ]заблокирован
|срок[ ]регистрации[ ]домена[ ]закончился
).*
# ends with
|.*(
register
|registration
|(sign|log)[ \-]?in
|subscribe|sign[ \-]?up
|log[ \-]?on
)\W*$
# fill text
|^(
report
|blog
|new[ ]page([ ]\d+)?
|press
|details
|search[ ]results
|ban[ ]page
|home[ ]page
|redirector
|youtube[ ]-[ ]broadcast[ ]yourself\.?
|yahoo![ ]news
|\d+
|\W*sorry\W*
|invalid[ ]id
|uid=\d+[ ]genome[ ]result # ???
|[\? ]+
|новости
|новая[ ]страница[ ]?\d*
|(коммерсантъ\.[ ])?версия[ ]для[ ]печати
|армс-тасс
|cnews:
|итар-тасс
|главная([ ]страница)?
|ошибка
|гостевая[ ]книга
|добавление[ ]новости
|архив[ ]новостей
|президент[ ]россии
|пресс-центр
|MetallZone[ ]—[ ]место[ ]в[ ]рунете[ ]где[ ]узнавать[ ]больше[ ]о[ ]тяжелой[ ]музыке[ ]стало[ ]проще!
)$
)"""
# ignore these references in DuplicateReferences
dupignorerefs = { 'en': ur'^\s*ibid',
'ru': ur'^\s*(ibid|там же|указ\. соч\.)',
}
# Regexp that matches bare references
linksInRef = re.compile(
ur'(?i)<ref(?P<name>[^>]*)>\s*\[?(?:'
ur'(?P<url1>(?:http|https|ftp)://[^\[\]\s<>"{]+)\s*(?P<templ1>{{[-\w {}|]+}})|'
ur'(?P<url2>(?:http|https|ftp)://[^\[\]\s<>"]+)'
ur')\s*\]?\s*(?P<templ2>{{[-\w {}|]+}})?</ref>',
re.UNICODE)
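# Illustrative examples of references matched by linksInRef:
#   <ref>http://www.example.com/page</ref>
#   <ref>[http://www.example.com/page]</ref>
#   <ref name="foo">[http://www.example.com/page] {{dead link}}</ref>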
splitRef = re.compile(
ur'<ref>'
ur'((?:http|https|ftp)://[^\s,;<\[\]"]+)'
ur'\s*[,;]?\s*'
ur'((?:http|https|ftp)://[^\s,;<\[\]"]+)'
ur'(?:\s*[,;]?\s*'
ur'((?:http|https|ftp)://[^\s,;<\[\]"]+)'
ur')?'
ur'(?:\s*[,;]?\s*'
ur'((?:http|https|ftp)://[^\s,;<\[\]"]+)'
ur')?'
ur'</ref>',
re.UNICODE)
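# splitRef matches a single <ref> holding several bare links, e.g.
# <ref>http://aaa.example.com; http://bbb.example.com</ref> (illustrative),
# so that it can be split into separate references.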
# ignore references if we are inside
ignoreInside = (
('<nowiki', '</nowiki'), # <nowiki>
('<pre>', '</pre>'),
('<source', '</source'),
('<code', '</code'),
('<!--', '-->'), # html comment
)
# ignore duplicated references if we are inside
dupIgnoreInside = (
#(re.compile(r'[^{]{\|'), re.compile(r'\|}[^}]')), # table
('{|', '|}'), # table
('<table', '</table'),
('{{', '}}'), # template
('<nowiki', '</nowiki'), # <nowiki>
('<pre>', '</pre>'),
('<source', '</source'),
('<code', '</code'),
('<!--', '-->'), # html comment
)
# matches a URL that points to the index (root) page of a website
dirIndex = re.compile(ur'^\w+://[^/]+/?((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE)
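# e.g. http://www.example.com/ or http://example.com/index.html (illustrative);
# used to detect redirects that land on a site's front page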
#
badRedirs = (
# NB: should be string, not unicode
re.compile(r'^http://global.nytimes.com/\?iht$'),
re.compile(r'^http://www.youtube.com/index\?ytsession=.*'),
re.compile(r'^http://search.yahoo.com/web\?fr=404.*'),
re.compile(r'^http://slovari.yandex.ru/~книги/$'),
re.compile(r'^http://slovari.yandex.ru/~книги/[^/]+/$'),
re.compile(r'^http://www.billboard.com/#/$'),
)
encodingsMap = (
('gb2312', 'gbk'),
('shiftjis', 'shift jis 2004'),
('xeucjp', 'euc-jp'),
('win1251', 'cp1251'),
('cp1251', 'cp1251'),
('windows1251', 'cp1251'),
('windowscp1251', 'cp1251'),
('microsoftcp1251', 'cp1251'),
('88591', 'iso8859-1'),
('bi5', 'big5'),
('macintosh', 'mac-roman'),
('windows31j', 'shift-jis'),
('xsjis', 'shift-jis'),
('iso88598i', 'iso-8859-8'),
('koi8', 'koi8-r'),
('window1251', 'cp1251'),
)
# Download this file :
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# ( maintained by User:Dispenser )
listof404pages = '404-links.txt'
templatesList = {
'en': {},
'ru': {
# slovari.yandex.ru
u'http://slovari.yandex.ru/dict/olympic/article/olymp/': (
re.compile(u'(.*) — Олимпийская энциклопедия — Яндекс.Словари'),
u'{{Из БОЭ|1=%s|title=%s}}',
None,
),
u'http://slovari.yandex.ru/~книги/Олимпийская энциклопедия/': (
re.compile(u'(.*) — Олимпийская энциклопедия — Яндекс.Словари'),
u'{{Из БОЭ|1=%s|title=%s}}',
None,
),
u'http://slovari.yandex.ru/dict/bse/article/': (
re.compile(u'(.*) — БСЭ — Яндекс.Словари'),
u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
None,
),
u'http://slovari.yandex.ru/~книги/БСЭ/': (
re.compile(u'(.*) — БСЭ — Яндекс.Словари'),
u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
None,
),
u'http://slovari.yandex.ru/dict/brokminor/article/': (
re.compile(u'(.*) — Брокгауз и Ефрон — Яндекс.Словари'),
u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
None,
),
u'http://slovari.yandex.ru/~книги/Брокгауз и Ефрон/': (
re.compile(u'(.*) — Брокгауз и Ефрон — Яндекс.Словари'),
u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
None,
),
u'http://slovari.yandex.ru/dict/geography/article/geo/': (
re.compile(u'(.*) — Географические названия — Яндекс.Словари'),
u'{{Из|Словаря современных географических названий|ссылка=%s|заглавие=%s}}',
None,
),
u'http://slovari.yandex.ru/~книги/Географические названия/': (
re.compile(u'(.*) — Географические названия — Яндекс.Словари'),
u'{{Из|Словаря современных географических названий|ссылка=%s|заглавие=%s}}',
None,
),
# dic.academic.ru
u'http://dic.academic.ru/dic.nsf/brokgauz_efron/': (
re.compile(u'(.*)'),
u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
None,
),
u'http://dic.academic.ru/dic.nsf/bse/': (
re.compile(u'(.*)'),
u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
None,
),
u'http://dic.academic.ru/dic.nsf/enc2p/': (
re.compile(u'(.*)'),
u'{{Из|Даль|ссылка=%s|заглавие=%s}}',
None,
),
u'http://dic.academic.ru/dic.nsf/vasmer/': (
re.compile(u'(.*)'),
u'{{Из|[[Этимологический словарь русского языка (М. Фасмер)|Этимологического словаря русского языка Макса Фасмера]]|ссылка=%s|заглавие=%s}}',
None,
),
# www.vehi.net
u'http://vehi.net/brokgauz/all/': (
re.compile(u'(.*)'),
u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
None,
),
# gatchina3000.ru
u'http://gatchina3000.ru/brockhaus-and-efron-encyclopedic-dictionary/': (
re.compile(u'(.*) / Энциклопедия Брокгауза и Эфрона'),
u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
None,
),
# www.cultinfo.ru
u'http://cultinfo.ru/fulltext/1/001/008/': (
re.compile(u'(.*)'),
u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
None,
),
# sci-lib.com
u'http://be.sci-lib.com/article': (
re.compile(u'(.*)'),
u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
None,
),
u'http://bse.sci-lib.com/article': (
re.compile(u'(.*)'),
u'{{Из|БСЭ|ссылка=%s|заглавие=%s|издание=3-е}}',
None,
),
# femto.com.ua
u'http://femto.com.ua/articles/': (
re.compile(u'(.*) - Физическая энциклопедия'),
u'{{Из|ФЭ|ссылка=%s|заглавие=%s}}',
None,
),
# wikisource.org
u'http://ru.wikisource.org/wiki/ЭСБЕ/': (
re.compile(u'ЭСБЕ/(.*) — Викитека'),
u'{{Из|ЭСБЕ|ссылка=%s|заглавие=%s}}',
None,
),
# feb-web.ru
u'http://feb-web.ru/feb/litenc/encyclop/': (
re.compile(u'(.*) // Литературная энциклопедия.*'),
u'{{Из ЛЭ|1=%s|title=%s}}',
None,
),
u'http://feb-web.ru/feb/kle/': (
re.compile(u'(.*) // Краткая литературная энциклопедия.*'),
u'{{Из КЛЭ|1=%s|title=%s}}',
None,
),
# krugosvet.ru
u'http://krugosvet.ru/articles/': (
re.compile(u'(.*)'),
u'{{Из|Кругосвет|ссылка=%s|заглавие=%s}}',
None,
),
# eleven.co.il
u'http://eleven.co.il/article/': (
re.compile(ur'(.*)\. Электронная еврейская энциклопедия'),
u'{{Из|ЭЕЭ|ссылка=%s|заглавие=%s}}',
None,
),
# imdb
u'http://imdb.com/title/tt': (
re.compile(u'(.*)'),
u'{{Imdb title|%s|%s}}',
re.compile(r'http://imdb.com/title/tt(\d+)/?$')
),
u'http://imdb.com/name/nm': (
re.compile(u'(.*)'),
u'{{Imdb title|%s|%s}}',
re.compile(r'http://imdb.com/name/nm(\d+)/?$')
),
# britannica.com
u'http://britannica.com/ebchecked/topic/': (
re.compile(u'(.*) -- Britannica Online Encyclopedia'),
u'{{britannica-link|%s|%s}}',
re.compile(r'http://britannica.com/ebchecked/topic/(\d+).*'),
re.compile(u'^(.*) — Britannica Online Encyclopedia$'),
),
}
}
fixUrl = (
re.compile(r'(http://www.kommersant.ru/.*)print=true$'),
re.compile(r'(http://kommersant.ru/.*)print=true$'),
re.compile(r'(http://top.rbc.ru/.*)\?print$'),
)
class DeadLinkException(Exception):
pass
def checkInside(text, ignore=dupIgnoreInside):
'''check if we are inside table/template/html-comment'''
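# Heuristic: if the counts of opening and closing markers in the text
# preceding a reference differ, the reference is assumed to be inside
# one of them, e.g. checkInside(u'... {{cite web |url=') -> True (illustrative)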
for start, end in ignore:
if isinstance(start, str):
cnt1 = text.count(start)
else:
cnt1 = len(start.findall(text))
if isinstance(end, str):
cnt2 = text.count(end)
else:
cnt2 = len(end.findall(text))
if cnt1 != cnt2:
# we are inside
return True
return False
def safeReplace(text, old, new, count=0, ignore=dupIgnoreInside):
'''do not replace inside table/template/html-comment'''
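# Illustrative use: safeReplace(text, u'<ref>x</ref>', u'<ref name=a>x</ref>', 1)
# replaces the first occurrence of the old string that is not inside a
# table, template or HTML comment and leaves any later occurrences untouched.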
newtext = ''
i = 0
j = 0
while True:
i = text.find(old, j)
if i < 0:
newtext = newtext + text[j:]
break
k = i + len(old)
if checkInside(text[:i], ignore):
newtext = newtext + text[j:k]
j = k
continue
newtext = newtext + text[j:k].replace(old, new)
j = k
count -= 1
if count == 0:
newtext = newtext + text[k:]
break
return newtext
def safeReplaceRegexp(text, old, new, count=0):
'''like safeReplace, but uses a regexp'''
newtext = ''
i = 0
j = 0
while True:
m = re.search(old, text[j:], re.U)
if not m:
newtext = newtext + text[j:]
break
i = j + m.start()
k = j + m.end()
if checkInside(text[:i]):
newtext = newtext + text[j:k]
j = k
continue
newtext = newtext + re.sub(old, new, text[j:k], 1)
j = k
count -= 1
if count == 0:
newtext = newtext + text[k:]
break
return newtext
class XmlDumpPageGenerator:
"""Xml generator that yiels pages containing bare references"""
def __init__(self, xmlFilename, xmlStart, namespaces):
self.xmlStart = xmlStart
self.namespaces = namespaces
self.skipping = bool(xmlStart)
self.site = wikipedia.getSite()
import xmlreader
dump = xmlreader.XmlDump(xmlFilename)
self.parser = dump.parse()
def __iter__(self):
return self
def next(self):
while True:
try:
entry = self.parser.next()
except StopIteration:
raise
if self.skipping:
if entry.title != self.xmlStart:
continue
self.skipping = False
page=wikipedia.Page(self.site, entry.title)
if self.namespaces and page.namespace() not in self.namespaces:
continue
if linksInRef.search(entry.text):
return page
class RefLink:
"""Container to handle a single bare reference"""
def __init__(self, link, name, templ):
self.refname = name or ''
self.link = link
self.site = wikipedia.getSite()
self.linkComment = wikipedia.translate(self.site, comment)
self.url = re.sub(u'#.*', '', self.link)
self.title = None
self.templ = templ
self.templatesList = wikipedia.translate(wikipedia.getSite(),
templatesList)
WikipediaUrl = re.compile(r'http://([a-z][a-z][a-z]?)\.wikipedia\.org/(?:wiki/|w/index\.php\?title=)([^#]+)(.*)')
def refWikipedia(self):
'''convert a URL to a wikipedia (interwiki) link,
i.e. http://de.wikipedia.org/wiki/Fernuniversit%C3%A4t_in_Hagen#cite_note-0 -> [[:de:Fernuniversität in Hagen#cite_note-0]]'''
url = self.link.encode('utf-8')
match = self.WikipediaUrl.match(url)
if not match:
return False
lang, title, frag = match.group(1), match.group(2), match.group(3)
title = urllib.unquote(title).replace('_', ' ')
if lang == wikipedia.getSite().lang:
text = '[[%s%s]]' % (title, frag)
else:
text = '[[:%s:%s%s]]' % (lang, title, frag)
text = unicode(text, 'utf-8')
if self.templ:
text = '<ref%s>%s%s</ref>' % (self.refname, text, self.templ)
else:
text = '<ref%s>%s</ref>' % (self.refname, text)
return text
def refTempl(self, fix=False):
'''return the reference formatted with a site-specific template, None if the link matched a known site but could not be converted, or False if no matching template was found'''
for t in self.templatesList:
normlink = re.sub(r'^http://www\.', 'http://', self.link).lower()
if normlink.startswith(t):
if len(self.templatesList[t]) == 3:
pat, templ, linkpat = self.templatesList[t]
fixpat = pat
else:
pat, templ, linkpat, fixpat = self.templatesList[t]
if fix:
pat = fixpat
match = pat.match(self.title)
if match:
#
self.transform()
link = self.link
if linkpat:
m = linkpat.match(normlink)
if not m:
return None
link = m.group(1)
text = templ % (link, match.group(1))
if self.templ:
text = '<ref%s>%s %s</ref>' % (
self.refname, text, self.templ)
else:
text = '<ref%s>%s</ref>' % (
self.refname, text)
wikipedia.output(u'*** \03{lightred}%s\03{default} ***' %
text)
return text
elif fix:
print '*** refTempl:', self.link, self.title
return None
return False
def refCiteWeb(self, html, enc):
if not citeweb:
return False
t = citeweb.citeweb(self.link, html, enc)
if not t:
return False
if self.templ:
return '<ref%s>%s %s</ref>' % (self.refname, t, self.templ)
return '<ref%s>%s</ref>' % (self.refname, t)
def refTitle(self):
"""Returns the <ref> with its new title"""
self.transform()
if len(self.title) <= 5:
raise DeadLinkException('title too short: "%s"' % self.title.encode('utf-8'))
wikipedia.output(u'\03{lightgreen}%s\03{default}' % self.title)
if self.templ:
text = '<ref%s>{{cite web | url = %s | title = %s}}<!-- %s --> %s</ref>' % (
self.refname, self.link, self.title, self.linkComment, self.templ)
else:
text = '<ref%s>{{cite web | url = %s | title = %s}}<!-- %s --></ref>' % (
self.refname, self.link, self.title, self.linkComment)
return text
def refLink(self):
"""No title has been found, return the unbracketed link"""
link = self.link
if len(link) > 200:
link = '['+link+']'
if self.templ:
return '<ref%s>%s %s</ref>' % (self.refname, link, self.templ)
return '<ref%s>%s</ref>' % (self.refname, link)
def refDead(self):
"""Dead link, tag it with a {{dead link}}"""
tag = wikipedia.translate(self.site, deadLinkTag) % self.link
if self.templ:
tagname = wikipedia.translate(self.site, deadLinkTagName)
if tagname in self.templ:
# do not add yet another {{dead link}}
# FIXME
return '<ref%s>%s %s</ref>' % (self.refname, self.link, self.templ)
return '<ref%s>%s %s</ref>' % (self.refname, tag, self.templ)
return '<ref%s>%s</ref>' % (self.refname, tag)
def transform(self, ispdf = False):
"""Normalize the title"""
if wikify:
self.title = wikify(self.title)
# Truncate overly long titles; the 180/175 limits are arbitrary
if len(self.title) > 180:
self.title = self.title[:175] + u" …"
#convert html entities
if not ispdf:
self.title = wikipedia.html2unicode(self.title)
self.title = re.sub(r'-+', '-', self.title)
#remove formatting, i.e. long runs of useless characters
self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
#remove \n and \r and Unicode spaces from titles
self.title = re.sub(r'(?u)\s', ' ', self.title)
#self.title = re.sub(r'[\n\r\t]', ' ', self.title)
#remove extra whitespaces
#remove leading and trailing ./;/,/-/_/+/ /
self.title = self.title.strip(r'=.;,-+_ ')
#self.avoid_uppercase()
#avoid pipe being interpreted as template parameters
self.title = self.title.replace('|', '&#124;')
#avoid closing the link before the end
self.title = self.title.replace(']', '&#93;')
#avoid multiple } being interpreted as a template inclusion
self.title = self.title.replace('}}', '}&#125;')
#prevent multiple quotes being interpreted as '' or '''
self.title = self.title.replace('\'\'', '\'&#39;')
self.title = self.title.replace('<', '&lt;').replace('>', '&gt;')
self.title = wikipedia.unicode2html(self.title, self.site.encoding())
# TODO : remove HTML when both opening and closing tags are included
def avoid_uppercase(self):
"""
If title has more than 6 characters and has 60% of uppercase
characters, capitalize() it
"""
if len(self.title) <= 6:
return
nb_upper = 0
nb_letter = 0
for letter in self.title:
if letter.isupper():
nb_upper += 1
if letter.isalpha():
nb_letter += 1
if letter.isdigit():
return
if float(nb_upper)/(nb_letter+1) > .70:
self.title = self.title.title()
class DuplicateReferences:
"""
When some references are duplicated in an article,
name the first, and remove the content of the others
"""
def __init__(self):
# Match references
self.REFS = re.compile(u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
self.NAMES = re.compile(u'(?is).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
self.SELF_CLOSED_NAMES = re.compile(u'(?i)<ref[^>]+name\s*=\s*(?P<quote>"?)\s*(?P<name>[^>]+)\s*(?P=quote)[^>]*/>')
self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+?)\s*(?P=quote).*')
self.autogen = wikipedia.translate(wikipedia.getSite(), autogen)
self.IGNOREREFS = re.compile(
wikipedia.translate(wikipedia.getSite(), dupignorerefs),
re.I|re.U)
def _cmpContent(self, s1, s2):
'''compare two strings, ignoring whitespace'''
s1 = re.sub(u'[\s\xa0]+', '', s1)
s2 = re.sub(u'[\s\xa0]+', '', s2)
return s1 == s2
def _inDict(self, s, dct):
for k in dct:
if self._cmpContent(s, k):
return dct[k]
return False
def process(self, text, page):
# keys are ref groups
# values are a dict where :
# keys are ref content
# values are [name, [list of full ref matches], quoted, need_to_change]
foundRefs = {}
foundRefNames = {}
# names to replace: key is the old name, value is [new name, quoted]
namedRepl = {}
# list of self-closed tags name
selfClosedTags = []
for match in self.SELF_CLOSED_NAMES.finditer(text):
name = match.group('name')
if not match.group('quote'):
# FIXME
name = name.strip()
selfClosedTags.append(name)
for match in self.REFS.finditer(text):
content = match.group('content')
if not content.strip():
continue
# check if ref inside table/template/html-comment
prev_text = text[:match.start()]
if checkInside(prev_text):
#print 'ref inside', match.group()
continue
if self.IGNOREREFS.match(content):
continue
params = match.group('params')
group = self.GROUPS.match(params)
if not group in foundRefs:
foundRefs[group] = {}
groupdict = foundRefs[group]
indict = self._inDict(content, groupdict)
if indict:
v = indict
v[1].append(match.group())
else:
v = [None, [match.group()], False, False]
name = self.NAMES.match(params)
if name:
quoted = name.group('quote') == '"'
name = name.group('name')
if not quoted:
name = name.strip()
if v[0]:
if v[0] != name:
namedRepl[name] = [v[0], v[2]]
else:
#First name associated with this content
d = self._inDict(name, foundRefNames)
if d and name in selfClosedTags:
wikipedia.output('*** refs ambiguity: '+page.aslink())
d = False
if not d:
# first time ever we meet this name
v[2] = quoted
v[0] = name
else:
# this name is used with another content.
# We'll need to change it
v[3] = True
foundRefNames[name] = 1
groupdict[content] = v
id = 1
while self.autogen + str(id) in foundRefNames:
id += 1
for (g, d) in foundRefs.iteritems():
if g:
group = "group=\"%s\" " % g.group('group')
else:
group = ""
for (k, v) in d.iteritems():
if len(v[1]) == 1 and not v[3]:
continue
name = v[0]
if not name:
name = self.autogen + str(id)
id += 1
else:
if name != u'""':
name = name.replace(u'"', '')
if v[2]:
name = u'"%s"' % name
named = u'<ref %sname=%s>%s</ref>' % (group, name, k)
text = safeReplace(text, v[1][0], named, 1)
# make sure that the first (named ref) is not
# removed later :
if named in text:
pos = text.index(named) + len(named)
header = text[:pos]
end = text[pos:]
unnamed = u'<ref %sname=%s />' % (group, name)
for ref in v[1][1:]:
end = safeReplace(end, ref, unnamed)
text = header + end
for (k,v) in namedRepl.iteritems():
# TODO : Support ref groups
name = v[0]
if v[1]:
name = u'"%s"' % name
text = re.sub(ur'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name=%s />' % name, text)
return text
class ReferencesRobot:
def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ):
"""
- generator : Page generator
- acceptall : boolean, is -always on ?
- limit : int, stop after n modified pages
- ignorepdf : boolean
"""
self.generator = generator
self.acceptall = acceptall
self.limit = limit
self.ignorepdf = ignorepdf
self.site = wikipedia.getSite()
self.stopPage = wikipedia.Page(self.site, wikipedia.translate(self.site, stopPage))
self.titleBlackList = re.compile(badtitles, re.I | re.S | re.X | re.U)
self.norefbot = noreferences.NoReferencesBot(None)
self.deduplicator = DuplicateReferences()
try:
self.stopPageRevId = self.stopPage.latestRevision()
except wikipedia.NoPage :
wikipedia.output(u'The stop page %s does not exist'
% self.stopPage.aslink())
raise
try:
self.deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
except IOError:
wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory')
raise
# Regex to grasp content-type meta HTML tag in HTML source
self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
# Extract the encoding from a charset property (from content-type !)
self.CHARSET = re.compile(
ur'(?i)charset\s*=\s*["\']?(?P<enc>[-_a-zA-Z0-9]+)')
# Extract html title from page
#?self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
self.TITLE = re.compile(r'(?is)<title[^>]*>(.*?)</title\s*>')
# Matches content inside <script>/<style>/HTML comments
self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
# Authorized mime types for HTML pages
self.MIME = re.compile(ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml',
re.I)
# cache
self.title_cache = {}
def setAction(self, addconv=False):
if addconv:
m = wikipedia.translate(self.site, msg) + wikipedia.translate(self.site, convmsg)
else:
m = wikipedia.translate(self.site, msg)
wikipedia.setAction(m)
def put_page(self, page, new, showdiff=True):
"""
Print the diff between the original and the new text, then put the new text for the page
"""
wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
% page.title())
if not self.acceptall:
if showdiff:
wikipedia.showDiff(page.get(), new)
choice = wikipedia.inputChoice(u'Do you want to accept ' +
u'these changes?',
['Yes', 'No', 'All', 'See'],
['y', 'N', 'a', 's'], 'N')
if choice == 'a':
self.acceptall = True
if choice == 'y':
page.put_async(new)
if choice == 's':
wikipedia.output('='*72)
wikipedia.output(new)
wikipedia.output('='*72)
self.put_page(page, new, showdiff=False)
return
if self.acceptall:
#wikipedia.showDiff(page.get(), new)
try:
page.put(new)
except wikipedia.EditConflict:
wikipedia.output(u'Skipping %s because of edit conflict'
% (page.title(),))
except wikipedia.SpamfilterError, e:
wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
except wikipedia.PageNotSaved, error:
wikipedia.output(u'Error putting page: %s' % (error.args,))
except wikipedia.LockedPage:
wikipedia.output(u'Skipping %s (locked page)'
% (page.title(),))
except wikipedia.ServerError, e:
wikipedia.output(u'Server Error : %s' % e)
print '...done'
def httpError(self, err_num, link, pagetitleaslink):
"""Log HTTP Error"""
wikipedia.output(u'HTTP error (%s) for %s on %s'
% (err_num, link, pagetitleaslink),
toStdout = True)
def getPDFTitle(self, f):
"""
Use pdfinfo to retrieve title from a PDF.
Unix-only, I'm afraid.
"""
wikipedia.output( u'PDF file.' )
fd, infile = tempfile.mkstemp()
urlobj = os.fdopen(fd, 'r+w')
urlobj.write(f.read())
title = ''
try:
pdfinfo_out = subprocess.Popen(
["pdfinfo","/dev/stdin"],
stdin=urlobj, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
shell=False).communicate()[0]
for aline in pdfinfo_out.splitlines():
if aline.lower().startswith('title'):
title = aline.split(None)[1:]
title = ' '.join(title)
if title != '': wikipedia.output(u'title: ' +title )
wikipedia.output( u'PDF done.' )
except ValueError:
wikipedia.output( u'pdfinfo value error.' )
except OSError:
wikipedia.output( u'pdfinfo OS error.' )
except: # Ignore errors
wikipedia.output( u'PDF processing error.' )
pass
finally:
urlobj.close()
os.unlink(infile)
return title
def getUrl(self, url, fullpage=False):
url = url.encode('utf-8') # use a byte string: urllib2 may raise UnicodeError on unicode URLs
for pat in fixUrl:
m = pat.match(url)
if m:
url = m.group(1)
break
f = None
try:
socket.setdefaulttimeout(20)
req = urllib2.Request(url)
req.add_header('Referer', url)
req.add_header('Accept-Encoding', 'gzip')
req.add_header('User-Agent', 'pywikipedia bot (reflinks.py)')
f = urllib2.urlopen(req)
#Try to get Content-Type from server
headers = f.info()
contentType = headers.getheader('Content-Type')
if contentType and not self.MIME.search(contentType):
## title = ''
## if url.lower().endswith('.pdf') and not self.ignorepdf:
## # If file has a PDF suffix
## title = self.getPDFTitle(f)
## if not re.match('(?i) *microsoft (word|excel|visio)', title):
## title = wikipedia.html2unicode(title)
## return title
## else:
## raise DeadLinkException('PDF title blacklisted')
raise DeadLinkException('media')
# Get the real url where we end (http redirects !)
redir = f.geturl()
if redir != url and domain.findall(redir) == domain.findall(url):
if soft404.search(redir) and not soft404.search(url):
raise DeadLinkException('redirect 404')
if redir != url:
if (dirIndex.match(redir) and not dirIndex.match(url)):
#print 'WARNING: Redirect to root:', url, '->', redir
raise DeadLinkException('redirect to root: %s' % redir)
for p in badRedirs:
if p.match(redir):
wikipedia.output(u'\03{lightred}WARNING Redirect to root\03{default}')
#print 'WARNING: Redirect to root:', url, '->', redir
raise DeadLinkException('bad redirect')
# uncompress if necessary
if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
# Read the first 25,000 compressed bytes (or everything if fullpage)
if fullpage:
compressed = StringIO.StringIO(f.read())
else:
compressed = StringIO.StringIO(f.read(25000))
f = gzip.GzipFile(fileobj=compressed)
linkedpagetext = ''
while True:
try:
chunk = f.read(512)
except: # (IOError, struct.error): truncated gzip data
break
if not chunk:
break
linkedpagetext += chunk
else:
# Read the first 50,000 bytes (or everything if fullpage);
# the html header, and thus the title, should fit well within that
if fullpage:
linkedpagetext = f.read()
else:
linkedpagetext = f.read(50000)
socket.setdefaulttimeout(None)
except urllib2.HTTPError, e:
# 410 Gone, indicates that the resource has been purposely removed
# FIXME: RefLink.refDead should be fixed
##if e.code == 410 or (e.code == 404 and (u'\t%s\t' % url in self.deadLinks)):
## return '{{dead-link}}'
raise DeadLinkException('HTTP error %s' % e.code)
except (urllib2.URLError,
socket.error,
IOError,
httplib.error), e:
#except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e:
# FIXME
#print 'Can\'t retrieve page:', url, str(e)
raise DeadLinkException('connect error: %s' % e)
except ValueError:
# Known bug of httplib, google for :
# "httplib raises ValueError reading chunked content"
raise DeadLinkException('connect error')
finally:
if f:
f.close()
#remove <script>/<style>/comments/CDATA tags
linkedpagetext = self.NON_HTML.sub('', linkedpagetext)
meta_content = self.META_CONTENT.search(linkedpagetext)
# use charset from http header
s = None
if contentType:
s = self.CHARSET.search(contentType)
if meta_content:
tag = meta_content.group()
# Prefer the contentType from the HTTP header :
if not contentType:
contentType = tag
if not s:
# use charset from html
s = self.CHARSET.search(tag)
if s:
tmp = s.group('enc').strip().lower()
naked = re.sub('[ _\-]', '', tmp)
# Convert to python correct encoding names
for encfrom, encto in encodingsMap:
if naked == encfrom:
enc = encto
break
else:
enc = tmp
else:
enc = 'ascii'
if not contentType:
raise DeadLinkException('content-type not found')
elif not self.MIME.search(contentType):
raise DeadLinkException('media')
return linkedpagetext, enc
def getTitle(self, url):
#print 'getTitle:', url
linkedpagetext, enc = self.getUrl(url)
# Retrieve the first non-empty string inside <title> tags
title = ''
## for m in self.TITLE.finditer(u.unicode):
for m in self.TITLE.finditer(linkedpagetext):
title = m.group(1)
title = ' '.join(title.split())
if title:
break
try:
title = unicode(title, enc)
except UnicodeDecodeError:
if enc == 'ascii':
raise DeadLinkException('unknown encoding')
else:
raise DeadLinkException('decode error: %s' % enc)
except LookupError:
raise DeadLinkException('bad encoding: %s' % enc)
# XXX Ugly hack: detect mojibake (UTF-8 text decoded with a wrong single-byte encoding)
if u'é' in title or u'Ð' in title or u'Рµ' in title:
raise DeadLinkException('hybrid encoding')
if self.titleBlackList.match(title):
raise DeadLinkException('blacklisted title: "%s"' %
title.encode('utf-8'))
if not title:
raise DeadLinkException('empty title')
if title.lower() in url.lower():
raise DeadLinkException('title in url: %s' % title.encode('utf-8'))
return title
prevRef = re.compile(
ur'<ref(?P<name>[^>]*)>\['
ur'(?P<url>(?:http|https|ftp)://[^\[\]\s<>"{]+) '
ur'(?P<title>[^<]+)'
ur'<!-- Заголовок добавлен ботом -->\]\s*'
ur'(?P<templ>{{[-\w {}|]+}})?</ref>',
re.I|re.U)
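# prevRef matches references that were titled by a previous run of this bot
# in the older "[url title<!-- comment -->]" format (the comment is the
# Russian "title added by bot" marker); illustrative example:
#   <ref>[http://example.com/page Page Title<!-- Заголовок добавлен ботом -->]</ref>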
def fixPrev(self, new_text):
for match in self.prevRef.finditer(new_text):
ref = RefLink(match.group('url'), match.group('name'),
match.group('templ'))
if citeweb and citeweb.checksite(ref.link):
try:
linkedpagetext, enc = self.getUrl(ref.link, fullpage=True)
except DeadLinkException, err:
continue
repl = ref.refCiteWeb(linkedpagetext, enc)
if repl:
wikipedia.output(u'\03{lightred}%s\03{default}' % repl)
new_text = safeReplace(new_text, match.group(), repl,
ignore=ignoreInside)
continue
ref.title = match.group('title')
repl = ref.refTempl(fix=True)
if repl:
pass
elif repl is None:
try:
title = self.getTitle(ref.url)
except DeadLinkException:
continue
ref.title = title
repl = ref.refTempl()
if not repl:
continue
else:
continue
new_text = safeReplace(new_text, match.group(), repl,
ignore=ignoreInside)
return new_text
def run(self):
"""
Runs the Bot
"""
socket.setdefaulttimeout(30)
editedpages = 0
for page in self.generator:
self.setAction()
#print '***', page.aslink(), '***'
try:
# Load the page's text from the wiki
new_text = page.get()
if not page.canBeEdited():
wikipedia.output(u"You can't edit page %s" % page.aslink())
continue
except wikipedia.NoPage:
wikipedia.output(u'Page %s not found' % page.aslink())
continue
except wikipedia.IsRedirectPage:
wikipedia.output(u'Page %s is a redirect' % page.aslink())
continue
new_text = self.fixPrev(new_text)
p = wikipedia.removeDisabledParts(page.get())
if 0:
# split ref
# i.e. "<ref>http://aaa; http://bbb</ref>" ->
# "<ref>http://aaa</ref><ref>http://bbb</ref>"
for match in splitRef.finditer(p):
groups = filter(None, match.groups())
r = '<ref>' + '</ref><ref>'.join(groups) + '</ref>'
p = p.replace(match.group(), r)
for match in linksInRef.finditer(p):
#print '>>>', match.group()
#for each link to change
link = match.group('url1') or match.group('url2')
assert link
templ = (match.group('templ1') or '').strip() + \
(match.group('templ2') or '').strip()
if u'jstor.org' in link:
#TODO: Clean URL blacklist
continue
ref = RefLink(link, match.group('name'), templ)
wikipedia.output(':: %s - %s - %s' %
(link, match.group('name'), templ))
repl = ref.refWikipedia()
if repl:
wikipedia.output(u'%s : converted to wiki-link: \03{lightred}%s\03{default}' % (link, repl))
new_text = safeReplace(new_text, match.group(), repl,
ignore=ignoreInside)
self.setAction(addconv=True)
continue
if citeweb and citeweb.checksite(ref.link):
try:
linkedpagetext, enc = self.getUrl(ref.link, fullpage=True)
except DeadLinkException, err:
print err, ref.link
else:
repl = ref.refCiteWeb(linkedpagetext, enc)
if repl:
wikipedia.output(u'\03{lightred}%s\03{default}' % repl)
new_text = safeReplace(new_text, match.group(), repl,
ignore=ignoreInside)
continue
if ref.url in self.title_cache:
title = self.title_cache[ref.url]
else:
try:
title = self.getTitle(ref.url)
except DeadLinkException, err:
err = unicode(str(err), 'utf-8')
if err == 'dead link':
repl = ref.refDead()
wikipedia.output(u'\03{lightred}Dead link\03{default} : %s' % ref.link)
new_text = safeReplace(new_text, match.group(), repl,
ignore=ignoreInside)
continue
else:
repl = ref.refLink()
wikipedia.output(u'%s : no title found: \03{lightpurple}%s\03{default}' % (ref.link, err))
new_text = safeReplace(new_text, match.group(), repl,
ignore=ignoreInside)
continue
# cache
self.title_cache[ref.url] = title
ref.title = title
repl = ref.refTempl() # try to use template
if not repl:
try:
repl = ref.refTitle()
except DeadLinkException, err:
err = unicode(str(err), 'utf-8')
repl = ref.refLink()
wikipedia.output(u'%s : \03{lightpurple}%s\03{default}' % (ref.link, err))
new_text = safeReplace(new_text, match.group(), repl,
ignore=ignoreInside)
continue
new_text = safeReplace(new_text, match.group(), repl,
ignore=ignoreInside)
# Add <references/> when needed, but ignore templates !
if page.namespace() != 10: # namespace() is a method; compare its value, not the bound method
if self.norefbot.lacksReferences(new_text, verbose=False):
new_text = self.norefbot.addReferences(new_text)
# Find duplicated refs
new_text = self.deduplicator.process(new_text, page)
if new_text.replace(' ', '') == page.get().replace(' ', ''):
wikipedia.output('No changes were necessary in %s'
% page.aslink())
continue
editedpages += 1
self.put_page(page, new_text)
if self.limit and editedpages >= self.limit:
wikipedia.output('Edited %s pages, stopping.' % self.limit)
return
if editedpages % 20 == 0:
wikipedia.output('\03{lightgreen}Checking stop page...\03{default}')
actualRev = self.stopPage.latestRevision()
if actualRev != self.stopPageRevId:
wikipedia.output(u'[[%s]] has been edited : Someone wants us to stop.' % self.stopPage)
return
def main():
genFactory = pagegenerators.GeneratorFactory()
PageTitles = []
xmlFilename = None
always = False
ignorepdf = False
limit = None
namespaces = []
generator = None
for arg in wikipedia.handleArgs():
if arg.startswith('-namespace:'):
try:
namespaces.append(int(arg[11:]))
except ValueError:
namespaces.append(arg[11:])
elif arg.startswith('-summary:'):
wikipedia.setAction(arg[9:])
elif arg == '-always':
always = True
elif arg == '-ignorepdf':
ignorepdf = True
elif arg.startswith('-limit:'):
limit = int(arg[7:])
elif arg.startswith('-xmlstart'):
if len(arg) == 9:
xmlStart = wikipedia.input(
u'Please enter the dumped article to start with:')
else:
xmlStart = arg[10:]
elif arg.startswith('-xml'):
if len(arg) == 4:
xmlFilename = wikipedia.input(
u'Please enter the XML dump\'s filename:')
else:
xmlFilename = arg[5:]
else:
genFactory.handleArg(arg)
if xmlFilename:
try:
xmlStart
except NameError:
xmlStart = None
generator = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
if not generator:
generator = genFactory.getCombinedGenerator()
if not generator:
# syntax error, show help text from the top of this file
wikipedia.showHelp('reflinks')
return
generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50)
generator = pagegenerators.RedirectFilterPageGenerator(generator)
if namespaces:
generator = pagegenerators.NamespaceFilterPageGenerator(generator, namespaces)
bot = ReferencesRobot(generator, always, limit, ignorepdf)
bot.run()
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()