Add support to backup pages using API:Query instead of Special:Export #280
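
In short, the patch teaches dumpgenerator.py to fetch page histories through api.php (action=query&prop=revisions) instead of scraping Special:Export. A rough sketch of the two request shapes, with a placeholder wiki URL and page title and only standard MediaWiki parameters:

import requests

session = requests.Session()
title = 'Main_Page'  # placeholder

# old path: Special:Export returns a ready-made <mediawiki> XML dump
r_export = session.post('https://wiki.example.org/w/index.php',
                        params={'title': 'Special:Export', 'pages': title, 'history': '1'})

# new path: API:Query returns <rev> elements that the patch reassembles into <revision> blocks
r_api = session.get('https://wiki.example.org/w/api.php',
                    params={'action': 'query', 'format': 'xml', 'prop': 'revisions',
                            'titles': title, 'rvlimit': 10,
                            'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags'})
print r_export.status_code, r_api.status_code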

246 changes: 241 additions & 5 deletions dumpgenerator.py
@@ -23,10 +23,20 @@
from kitchen.text.converters import getwriter
except ImportError:
print "Please install the kitchen module."

try:
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET

import xml.dom.minidom as MD

import cookielib
import cPickle
import datetime
import sys
import io
import traceback
try:
import argparse
except ImportError:
@@ -436,8 +446,12 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
# when config['curonly'] is not set, the API does not return the namespace info;
# since we do not need the page history here, set curonly temporarily
curonly = config['curonly']
config['curonly'] = True
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
@@ -458,12 +472,12 @@
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass

config['curonly'] = curonly
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
print 'XML export on this wiki is broken, quitting.'
@@ -475,7 +489,7 @@
def getXMLFileDesc(config={}, title='', session=None):
""" Get XML for image description page """
config['curonly'] = 1 # tricky to get only the most recent desc
return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
return("".join([x for x in getXMLPage_( config=config, title=title, verbose=False, session=session)]))


def getUserAgent():
@@ -496,6 +510,218 @@ def logerror(config={}, text=''):
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
outfile.write(output.encode('utf-8'))

def reconstructRevisions(root=None):
""" Rebuild Special:Export-style <revision> elements from an API:Query response """
#print ET.tostring(rev)
page = ET.Element('stub')
edits = 0
for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
try:
rev_ = ET.SubElement(page,'revision')
ET.SubElement(rev_,'id').text = rev.attrib['revid']
ET.SubElement(rev_,'timestamp').text = rev.attrib['timestamp']
contributor = ET.SubElement(rev_,'contributor')
if not rev.attrib.has_key('userhidden'):
ET.SubElement(contributor,'username').text = rev.attrib['user']
ET.SubElement(contributor,'id').text = rev.attrib['userid']
else:
contributor.set('deleted','deleted')
comment = ET.SubElement(rev_,'comment')
if not rev.attrib.has_key('commenthidden'):
comment.text = rev.attrib['comment']
else:
comment.set('deleted','deleted')

# some revisions do not return model and format, so just hard-code them
ET.SubElement(rev_,'model').text = 'wikitext'
ET.SubElement(rev_,'format').text = 'text/x-wiki'
text = ET.SubElement(rev_,'text')
if not rev.attrib.has_key('texthidden'):
text.attrib['xml:space'] = "preserve"
text.attrib['bytes'] = rev.attrib['size']
text.text = rev.text
else:
text.set('deleted','deleted')
# delete sha1 here :)
#sha1 = ET.SubElement(rev_,'sha1')
#if not rev.attrib.has_key('sha1missing'):
#sha1.text = rev.attrib['sha1']
if rev.attrib.has_key('minor'):
ET.SubElement(rev_,'minor')
edits += 1
except Exception as e:
#logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
print ET.tostring(rev)
traceback.print_exc()
page = None
edits = 0
raise e
return page,edits
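
To see what reconstructRevisions emits, it can be fed a hand-written API response. A minimal sketch, assuming this patched dumpgenerator.py is importable as a module and using a made-up single-revision reply:

import xml.etree.cElementTree as ET
from dumpgenerator import reconstructRevisions  # assumes the patched file is on the path

sample = ('<api><query><pages><page><revisions>'
          '<rev revid="1" timestamp="2015-01-01T00:00:00Z" user="Admin" userid="1"'
          ' comment="init" size="11">Hello world</rev>'
          '</revisions></page></pages></query></api>')
root = ET.fromstring(sample)
page, edits = reconstructRevisions(root=root)
print ET.tostring(page)  # a <stub> element wrapping one reconstructed <revision>
print edits              # 1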

def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
""" """
# just send the API request
# if it fails, it will reduce params['rvlimit']
xml = ''
c = 0
maxseconds = 100 # max seconds to wait in a single sleeping
maxretries = config['retries'] # x retries and skip
increment = 20 # increment every retry

while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiexport'] else 'pages'], wait)
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# rvlimit = 1 from mother function)
if params['rvlimit'] > 1:
params['rvlimit'] = params['rvlimit'] / 2 # half
if c >= maxretries:
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
# fallback, because it's set by the following if and passed to
# getXMLPageCore
# TODO: save only the last version when failed
print ' Saving in the errors log, and skipping...'
logerror(
config=config,
text=u'Error while retrieving the last revision of "%s". Skipping.' %
(params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
#raise ExportAbortedError(config['index'])
return '' # empty xml

# FIXME HANDLE HTTP Errors HERE
try:
r = session.get(url=config['api'], params=params, headers=headers)
handleStatusCode(r)
xml = fixBOM(r)
#print xml
except requests.exceptions.ConnectionError as e:
print ' Connection error: %s'%(str(e[0]))
xml = ''
c += 1
return xml
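
The retry loop above waits increment seconds per failed attempt, capped at maxseconds, and halves rvlimit to lighten the next request. A standalone restatement of that policy (backoff_and_shrink is a hypothetical helper, not part of the patch):

def backoff_and_shrink(attempt, rvlimit, increment=20, maxseconds=100):
    # the wait grows linearly with the attempt number but never exceeds maxseconds
    wait = min(increment * attempt, maxseconds)
    # ask for half as many revisions next time, but never fewer than one
    rvlimit = max(1, rvlimit // 2)
    return wait, rvlimit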


def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
""" Get the full history (or current only) of a page using API:Query
if params['curonly'] is set, then using export&exportwrap to export
"""

title_ = title
title_ = re.sub(' ', '_', title_)
# do not convert & into %26, title_ = re.sub('&', '%26', title_)
# action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
# &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
#print 'current:%s' % (title_)
if not config['curonly']:
params = {'titles': title_, 'action': 'query','format':'xml',
'prop':'revisions',
'rvprop' : 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
'rvcontinue' : None,
'rvlimit' : 10 # TODO: set this by commandline
}
else:
params = {'titles': title_, 'action': 'query','format':'xml','export':1,'exportnowrap':1}
#print 'params:%s' % (params)
if not config['curonly']:
firstpartok = False
lastcontinue = None
numberofedits = 0
ret = ''
while True:
# in case the last request fails, remember the previous continuation point
if not firstpartok:
try:
lastcontinue = params['rvcontinue']
except:
lastcontinue = None

xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
if xml == "":
#just return so that we can continue, and getXMLPageCoreWithApi will log the error
return
try:
root = ET.fromstring(xml.encode('utf-8'))
except:
continue
try:
retpage = root.find('query').find('pages').find('page')
except:
continue
if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
print 'Page not found'
raise PageMissingError(params['titles'], xml)
if not firstpartok:
try:
# build the first part ourselves to reduce memory usage
ret = ' <page>\n'
ret += ' <title>%s</title>\n' %(retpage.attrib['title'])
ret += ' <ns>%s</ns>\n' % (retpage.attrib['ns'])
ret += ' <id>%s</id>\n' % (retpage.attrib['pageid'])
except:
firstpartok = False
continue
else:
firstpartok = True
yield ret
try:
ret = ''
edits = 0
if config['curonly'] or root.find('continue') == None:
# transform the revision
rev_,edits = reconstructRevisions(root=root)
xmldom = MD.parseString('<stub1>'+ET.tostring(rev_)+'</stub1>')
# convert it into text in case it throws MemoryError
# drop the first three lines and the last two lines, which only set the indentation
ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2])
yield ret
numberofedits += edits
break
else:
rev_,edits = reconstructRevisions(root=root)
xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2])
params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
numberofedits += edits
yield ret
except:
traceback.print_exc()
params['rvcontinue'] = lastcontinue
ret = ''
yield ' </page>\n'
else:
xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
if xml == "":
raise ExportAbortedError(config['index'])
if not "</page>" in xml:
raise PageMissingError(params['titles'], xml)
else:
# strip these sha1 sums which keep showing up in the export and
# which are invalid for the XML schema (they only apply to
# revisions)
xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)

yield xml.split("</page>")[0]

# just for looking good :)
r_timestamp = r'<timestamp>([^<]+)</timestamp>'

numberofedits = 0
numberofedits += len(re.findall(r_timestamp, xml))

yield "</page>\n"

if verbose:
if (numberofedits == 1):
print ' %s, 1 edit' % (title.strip())
else:
print ' %s, %d edits' % (title.strip(), numberofedits)
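
For reference, the full-history branch above pages through the revision list with rvcontinue. Stripped of the error handling and XML reassembly, the continuation pattern it relies on looks roughly like this (placeholder URL and title):

import requests
import xml.etree.cElementTree as ET

session = requests.Session()
params = {'action': 'query', 'format': 'xml', 'prop': 'revisions',
          'titles': 'Main_Page', 'rvlimit': 10,
          'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags'}
while True:
    r = session.get('https://wiki.example.org/w/api.php', params=params)
    root = ET.fromstring(r.text.encode('utf-8'))
    # ...convert the <rev> elements of this chunk into <revision> blocks here...
    cont = root.find('continue')
    if cont is None:
        break  # no more revisions for this page
    params['rvcontinue'] = cont.attrib['rvcontinue']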

def getXMLPageCore(headers={}, params={}, config={}, session=None):
""" """
@@ -663,6 +889,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
else:
print ' %s, %d edits' % (title.strip(), numberofedits)

def getXMLPage_(config={}, title='', verbose=True, session=None):
""" Dispatch to the API exporter or the Special:Export exporter, depending on config['apiexport'] """
#print config
if config['apiexport']:
return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
else:
return getXMLPage(config=config, title=title, verbose=verbose, session=session)
return ''

def cleanXML(xml=''):
""" Trim redundant info """
@@ -710,7 +943,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
for xml in getXMLPage_(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
@@ -1311,6 +1544,8 @@ def getParameters(params=[]):
'--exnamespaces',
metavar="1,2,3",
help='comma-separated value of namespaces to exclude')
groupDownload.add_argument(
'--apiexport', action='store_true', help="Use API:Query instead of Special:Export to export pages")
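# With this flag the new path can be exercised with something like
#   python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --apiexport
# (placeholder URL; --api and --xml are the pre-existing options)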

# Meta info params
groupMeta = parser.add_argument_group(
@@ -1494,6 +1729,7 @@ def getParameters(params=[]):
'cookies': args.cookies or '',
'delay': args.delay,
'retries': int(args.retries),
'apiexport' : args.apiexport
}

other = {