diff --git a/dumpgenerator.py b/dumpgenerator.py
index bd27ff17..a2d5fabc 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -23,10 +23,21 @@
    from kitchen.text.converters import getwriter, to_unicode
except ImportError:
    print "Please install the kitchen module."
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
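+# (cElementTree is the C-accelerated implementation; the pure-Python
+# ElementTree is only used as a fallback when it is unavailable.)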
+
+import xml.dom.minidom as MD
+
import cookielib
import cPickle
import datetime
import sys
+import io
+import traceback
+
try:
    import argparse
except ImportError:
@@ -63,7 +74,7 @@
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
-__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
+__VERSION__ = '0.5.0-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
    def __init__(self, title, xml):
@@ -164,7 +175,7 @@ def getNamespacesScraper(config={}, session=None):
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        r = session.post(
-            url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
+            url=config['index'], params={'title': 'Special:Allpages'}, timeout=120)
        raw = r.text
        delay(config=config, session=session)
@@ -206,7 +217,7 @@ def getNamespacesAPI(config={}, session=None):
                'meta': 'siteinfo',
                'siprop': 'namespaces',
                'format': 'json'},
-            timeout=30
+            timeout=120
        )
        result = getJSON(r)
        delay(config=config, session=session)
@@ -281,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None):
        print '    Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (
            config['index'], namespace)
-        r = session.get(url=url, timeout=30)
+        r = session.get(url=url, timeout=120)
        raw = r.text
        raw = cleanHTML(raw)
@@ -455,7 +466,7 @@ def getXMLHeader(config={}, session=None):
    else:
        try:
-            xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+            xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
        except PageMissingError as pme:
            # The <page> does not exist. Not a problem, if we get the <siteinfo>.
            xml = pme.xml
@@ -477,7 +488,7 @@ def getXMLHeader(config={}, session=None):
                    )
                    config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                        + ':Export'
-                    xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+                    xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
            except PageMissingError as pme:
                xml = pme.xml
            except ExportAbortedError:
@@ -500,7 +511,7 @@ def getXMLHeader(config={}, session=None):
def getXMLFileDesc(config={}, title='', session=None):
""" Get XML for image description page """
config['curonly'] = 1 # tricky to get only the most recent desc
- return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
+ return("".join([x for x in getXMLPage_( config=config, title=title, verbose=False, session=session)]))
def getUserAgent():
@@ -521,7 +532,216 @@ def logerror(config={}, text=''):
            output = u'%s: %s\n' % (
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
            outfile.write(output.encode('utf-8'))
+def reconstructRevisions(root=None):
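+    """ Rebuild <revision> elements (in Special:Export layout) from the <rev>
+        nodes of an API prop=revisions response; sha1 is deliberately skipped. """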
+    #print ET.tostring(rev)
+    page = ET.Element('stub')
+    edits = 0
+    for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
+        try:
+            rev_ = ET.SubElement(page,'revision')
+            ET.SubElement(rev_,'id').text = rev.attrib['revid']
+            ET.SubElement(rev_,'timestamp').text = rev.attrib['timestamp']
+            contributor = ET.SubElement(rev_,'contributor')
+            if not rev.attrib.has_key('userhidden'):
+                ET.SubElement(contributor,'username').text = rev.attrib['user']
+                ET.SubElement(contributor,'id').text = rev.attrib['userid']
+            else:
+                contributor.set('deleted','deleted')
+            comment = ET.SubElement(rev_,'comment')
+            if not rev.attrib.has_key('commenthidden'):
+                comment.text = rev.attrib['comment']
+            else:
+                comment.set('deleted','deleted')
+
+            # some revisions do not return model and format, so just hard-code them
+            ET.SubElement(rev_,'model').text = 'wikitext'
+            ET.SubElement(rev_,'format').text = 'text/x-wiki'
+            text = ET.SubElement(rev_,'text')
+            if not rev.attrib.has_key('texthidden'):
+                text.attrib['xml:space'] = "preserve"
+                text.attrib['bytes'] = rev.attrib['size']
+                text.text = rev.text
+            else:
+                text.set('deleted','deleted')
+            # delete sha1 here :)
+            #sha1 = ET.SubElement(rev_,'sha1')
+            #if not rev.attrib.has_key('sha1missing'):
+            #    sha1.text = rev.attrib['sha1']
+            if rev.attrib.has_key('minor'):
+                ET.SubElement(rev_,'minor')
+            edits += 1
+        except Exception as e:
+            #logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
+            print ET.tostring(rev)
+            traceback.print_exc()
+            page = None
+            edits = 0
+            raise e
+    return page, edits
+
+def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
+ """ """
+ # just send the API request
+ # if it fails, it will reduce params['rvlimit']
+ xml = ''
+ c = 0
+ maxseconds = 100 # max seconds to wait in a single sleeping
+ maxretries = config['retries'] # x retries and skip
+ increment = 20 # increment every retry
+ while not re.search(r'' if not config['curonly'] else r'', xml) or re.search(r'', xml):
+ if c > 0 and c < maxretries:
+ wait = increment * c < maxseconds and increment * \
+ c or maxseconds # incremental until maxseconds
+ print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiexport'] else 'pages'], wait)
+ time.sleep(wait)
+ # reducing server load requesting smallest chunks (if curonly then
+ # rvlimit = 1 from mother function)
+ if params['rvlimit'] > 1:
+ params['rvlimit'] = params['rvlimit'] / 2 # half
+ if c >= maxretries:
+ print ' We have retried %d times' % (c)
+ print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
+ # If it's not already what we tried: our last chance, preserve only the last revision...
+ # config['curonly'] means that the whole dump is configured to save only the last,
+ # params['curonly'] should mean that we've already tried this
+ # fallback, because it's set by the following if and passed to
+ # getXMLPageCore
+ # TODO: save only the last version when failed
+ print ' Saving in the errors log, and skipping...'
+ logerror(
+ config=config,
+ text=u'Error while retrieving the last revision of "%s". Skipping.' %
+ (params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
+ #raise ExportAbortedError(config['index'])
+ return '' # empty xml
+
+ # FIXME HANDLE HTTP Errors HERE
+ try:
+ r = session.get(url=config['api'], params=params, headers=headers)
+ handleStatusCode(r)
+ xml = fixBOM(r)
+ #print xml
+ except requests.exceptions.ConnectionError as e:
+ print ' Connection error: %s'%(str(e[0]))
+ xml = ''
+ c += 1
+ return xml
+def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
+ """ Get the full history (or current only) of a page using API:Query
+ if params['curonly'] is set, then using export&exportwrap to export
+ """
+
+    title_ = title
+    title_ = re.sub(' ', '_', title_)
+    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
+    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
+    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
+    #print 'current:%s' % (title_)
+    if not config['curonly']:
+        params = {'titles': title_, 'action': 'query', 'format': 'xml',
+                  'prop': 'revisions',
+                  'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
+                  'rvcontinue': None,
+                  'rvlimit': 10  # TODO: set this by commandline
+                  }
+    else:
+        params = {'titles': title_, 'action': 'query', 'format': 'xml', 'export': 1, 'exportnowrap': 1}
+    #print 'params:%s' % (params)
+    if not config['curonly']:
+        firstpartok = False
+        lastcontinue = None
+        numberofedits = 0
+        ret = ''
+        while True:
+            # in case the last request is not right, saving last time's progress
+            if not firstpartok:
+                try:
+                    lastcontinue = params['rvcontinue']
+                except:
+                    lastcontinue = None
+
+            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+            if xml == "":
+                # just return so that we can continue, and getXMLPageCoreWithApi will log the error
+                return
+            try:
+                root = ET.fromstring(xml.encode('utf-8'))
+            except:
+                continue
+            try:
+                retpage = root.find('query').find('pages').find('page')
+            except:
+                continue
+            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
+                print 'Page not found'
+                raise PageMissingError(params['titles'], xml)
+            if not firstpartok:
+                try:
+                    # build the firstpart by ourselves to improve the memory usage
+                    ret = '  <page>\n'
+                    ret += '    <title>%s</title>\n' % (retpage.attrib['title'])
+                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
+                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
+                except:
+                    firstpartok = False
+                    continue
+                else:
+                    firstpartok = True
+                    yield ret
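+            # Convert this chunk of <rev> nodes into <revision> XML and stream it
+            # out; on failure, rvcontinue is reset to the saved lastcontinue so a
+            # later pass can retry from the previous continuation point.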
+            try:
+                ret = ''
+                edits = 0
+                if config['curonly'] or root.find('continue') == None:
+                    # transform the revision
+                    rev_, edits = reconstructRevisions(root=root)
+                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
+                    # convert it into text in case it throws MemoryError
+                    # delete the first three lines and the last two, which only set the indent
+                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
+                    yield ret
+                    numberofedits += edits
+                    break
+                else:
+                    rev_, edits = reconstructRevisions(root=root)
+                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
+                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
+                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
+                    numberofedits += edits
+                    yield ret
+            except:
+                traceback.print_exc()
+                params['rvcontinue'] = lastcontinue
+                ret = ''
+        yield '  </page>\n'
+    else:
+        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+        if xml == "":
+            raise ExportAbortedError(config['index'])
+        if not "</page>" in xml:
+            raise PageMissingError(params['titles'], xml)
+        else:
+            # strip these sha1 sums which keep showing up in the export and
+            # which are invalid for the XML schema (they only apply to
+            # revisions)
+            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+
+        yield xml.split("</page>")[0]
+
+        # just for looking good :)
+        r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+        numberofedits = 0
+        numberofedits += len(re.findall(r_timestamp, xml))
+
+        yield "  </page>\n"
+
+    if verbose:
+        if (numberofedits == 1):
+            print '    %s, 1 edit' % (title.strip())
+        else:
+            print '    %s, %d edits' % (title.strip(), numberofedits)
def getXMLPageCore(headers={}, params={}, config={}, session=None):
""" """
@@ -694,7 +914,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
            print '    %s, 1 edit' % (title.strip())
        else:
            print '    %s, %d edits' % (title.strip(), numberofedits)
-
+def getXMLPage_(config={}, title='', verbose=True, session=None):
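+    """ Return the page XML via the API exporter when --apiexport is set,
+        otherwise via the Special:Export-based getXMLPage() """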
+    #print config
+    if config['apiexport']:
+        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
+    else:
+        return getXMLPage(config=config, title=title, verbose=verbose, session=session)
+    return ''
def makeXmlPageFromRaw(xml):
""" Discard the metadata around a element in string"""
@@ -775,7 +1001,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
        if c % 10 == 0:
            print 'Downloaded %d pages' % (c)
        try:
-            for xml in getXMLPage(config=config, title=title, session=session):
+            for xml in getXMLPage_(config=config, title=title, session=session):
                xml = cleanXML(xml=xml)
                xmlfile.write(xml.encode('utf-8'))
        except PageMissingError:
@@ -1680,6 +1906,7 @@ def getParameters(params=[]):
        action='store_true',
        help='resumes previous incomplete dump (requires --path)')
    parser.add_argument('--force', action='store_true', help='')
+    parser.add_argument('--ignore-api-check', action='store_true', help='ignore a failed API check and continue the dump anyway')
    parser.add_argument(
        '--user', help='Username if authentication is required.')
    parser.add_argument(
@@ -1723,6 +1950,10 @@ def getParameters(params=[]):
        '--exnamespaces',
        metavar="1,2,3",
        help='comma-separated value of namespaces to exclude')
+    groupDownload.add_argument(
+        '--apiexport',
+        action='store_true',
+        help="Use the API instead of Special:Export to export pages")
    # Meta info params
    groupMeta = parser.add_argument_group(
@@ -1824,6 +2055,8 @@ def getParameters(params=[]):
            index2 = check[1]
            api = checkedapi
            print 'API is OK: ' + checkedapi
+        elif args.ignore_api_check:
+            print 'Error in API. Ignoring.'
        else:
            if index and not args.wiki:
                print 'API not available. Trying with index.php only.'
@@ -1921,6 +2154,7 @@ def getParameters(params=[]):
        'cookies': args.cookies or '',
        'delay': args.delay,
        'retries': int(args.retries),
+        'apiexport': args.apiexport
    }
    other = {