Add support to backup pages using API:Query instead of Special:Export #280
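
In short, the patch teaches dumpgenerator.py to fetch page histories through api.php (action=query&prop=revisions) instead of scraping Special:Export. A rough sketch of the two request shapes, with a placeholder wiki URL and page title and only standard MediaWiki parameters:

import requests

session = requests.Session()
title = 'Main_Page'  # placeholder

# old path: Special:Export returns a ready-made <mediawiki> XML dump
r_export = session.post('https://wiki.example.org/w/index.php',
                        params={'title': 'Special:Export', 'pages': title, 'history': '1'})

# new path: API:Query returns <rev> elements that the patch reassembles into <revision> blocks
r_api = session.get('https://wiki.example.org/w/api.php',
                    params={'action': 'query', 'format': 'xml', 'prop': 'revisions',
                            'titles': title, 'rvlimit': 10,
                            'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags'})
print r_export.status_code, r_api.status_code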

246 changes: 241 additions & 5 deletions dumpgenerator.py
@@ -23,10 +23,20 @@
from kitchen.text.converters import getwriter
except ImportError:
print "Please install the kitchen module."

try:
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET

import xml.dom.minidom as MD

import cookielib
import cPickle
import datetime
import sys
import io
import traceback
try:
import argparse
except ImportError:
@@ -436,8 +446,12 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
# when config['curonly'] is not set, the API does not return the namespace info;
# since we do not need the page history here, set curonly temporarily
curonly = config['curonly']
config['curonly'] = True
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
@@ -458,12 +472,12 @@
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass

config['curonly'] = curonly
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
print 'XML export on this wiki is broken, quitting.'
@@ -475,7 +489,7 @@
def getXMLFileDesc(config={}, title='', session=None):
""" Get XML for image description page """
config['curonly'] = 1 # tricky to get only the most recent desc
return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
return("".join([x for x in getXMLPage_( config=config, title=title, verbose=False, session=session)]))


def getUserAgent():
@@ -496,6 +510,218 @@ def logerror(config={}, text=''):
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
outfile.write(output.encode('utf-8'))

def reconstructRevisions(root=None):
""" Rebuild Special:Export-style <revision> elements from an API:Query response """
#print ET.tostring(rev)
page = ET.Element('stub')
edits = 0
for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
try:
rev_ = ET.SubElement(page,'revision')
ET.SubElement(rev_,'id').text = rev.attrib['revid']
ET.SubElement(rev_,'timestamp').text = rev.attrib['timestamp']
contributor = ET.SubElement(rev_,'contributor')
if not rev.attrib.has_key('userhidden'):
ET.SubElement(contributor,'username').text = rev.attrib['user']
ET.SubElement(contributor,'id').text = rev.attrib['userid']
else:
contributor.set('deleted','deleted')
comment = ET.SubElement(rev_,'comment')
if not rev.attrib.has_key('commenthidden'):
comment.text = rev.attrib['comment']
else:
comment.set('deleted','deleted')

# some revisions do not return model and format, so just hard-code them
ET.SubElement(rev_,'model').text = 'wikitext'
ET.SubElement(rev_,'format').text = 'text/x-wiki'
text = ET.SubElement(rev_,'text')
if not rev.attrib.has_key('texthidden'):
text.attrib['xml:space'] = "preserve"
text.attrib['bytes'] = rev.attrib['size']
text.text = rev.text
else:
text.set('deleted','deleted')
# delete sha1 here :)
#sha1 = ET.SubElement(rev_,'sha1')
#if not rev.attrib.has_key('sha1missing'):
#sha1.text = rev.attrib['sha1']
if rev.attrib.has_key('minor'):
ET.SubElement(rev_,'minor')
edits += 1
except Exception as e:
#logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
print ET.tostring(rev)
traceback.print_exc()
page = None
edits = 0
raise e
return page,edits
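
To see what reconstructRevisions emits, it can be fed a hand-written API response. A minimal sketch, assuming this patched dumpgenerator.py is importable as a module and using a made-up single-revision reply:

import xml.etree.cElementTree as ET
from dumpgenerator import reconstructRevisions  # assumes the patched file is on the path

sample = ('<api><query><pages><page><revisions>'
          '<rev revid="1" timestamp="2015-01-01T00:00:00Z" user="Admin" userid="1"'
          ' comment="init" size="11">Hello world</rev>'
          '</revisions></page></pages></query></api>')
root = ET.fromstring(sample)
page, edits = reconstructRevisions(root=root)
print ET.tostring(page)  # a <stub> element wrapping one reconstructed <revision>
print edits              # 1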

def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
""" """
# just send the API request
# if it fails, it will reduce params['rvlimit']
xml = ''
c = 0
maxseconds = 100 # max seconds to wait in a single sleeping
maxretries = config['retries'] # x retries and skip
increment = 20 # increment every retry

while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiexport'] else 'pages'], wait)
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# rvlimit = 1 from mother function)
if params['rvlimit'] > 1:
params['rvlimit'] = params['rvlimit'] / 2 # half
if c >= maxretries:
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
# fallback, because it's set by the following if and passed to
# getXMLPageCore
# TODO: save only the last version when failed
print ' Saving in the errors log, and skipping...'
logerror(
config=config,
text=u'Error while retrieving the last revision of "%s". Skipping.' %
(params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
#raise ExportAbortedError(config['index'])
return '' # empty xml

# FIXME HANDLE HTTP Errors HERE
try:
r = session.get(url=config['api'], params=params, headers=headers)
handleStatusCode(r)
xml = fixBOM(r)
#print xml
except requests.exceptions.ConnectionError as e:
print ' Connection error: %s'%(str(e[0]))
xml = ''
c += 1
return xml
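
The retry loop above waits increment seconds per failed attempt, capped at maxseconds, and halves rvlimit to lighten the next request. A standalone restatement of that policy (backoff_and_shrink is a hypothetical helper, not part of the patch):

def backoff_and_shrink(attempt, rvlimit, increment=20, maxseconds=100):
    # the wait grows linearly with the attempt number but never exceeds maxseconds
    wait = min(increment * attempt, maxseconds)
    # ask for half as many revisions next time, but never fewer than one
    rvlimit = max(1, rvlimit // 2)
    return wait, rvlimit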


def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
""" Get the full history (or current only) of a page using API:Query
if params['curonly'] is set, then using export&exportwrap to export
"""

title_ = title
title_ = re.sub(' ', '_', title_)
# do not convert & into %26, title_ = re.sub('&', '%26', title_)
# action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
# &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
#print 'current:%s' % (title_)
if not config['curonly']:
params = {'titles': title_, 'action': 'query','format':'xml',
'prop':'revisions',
'rvprop' : 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
'rvcontinue' : None,
'rvlimit' : 10 # TODO: set this by commandline
}
else:
params = {'titles': title_, 'action': 'query','format':'xml','export':1,'exportnowrap':1}
#print 'params:%s' % (params)
if not config['curonly']:
firstpartok = False
lastcontinue = None
numberofedits = 0
ret = ''
while True:
# in case the last request fails, remember the previous continuation point
if not firstpartok:
try:
lastcontinue = params['rvcontinue']
except:
lastcontinue = None

xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
if xml == "":
#just return so that we can continue, and getXMLPageCoreWithApi will log the error
return
try:
root = ET.fromstring(xml.encode('utf-8'))
except:
continue
try:
retpage = root.find('query').find('pages').find('page')
except:
continue
if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
print 'Page not found'
raise PageMissingError(params['titles'], xml)
if not firstpartok:
try:
# build the first part ourselves to reduce memory usage
ret = ' <page>\n'
ret += ' <title>%s</title>\n' %(retpage.attrib['title'])
ret += ' <ns>%s</ns>\n' % (retpage.attrib['ns'])
ret += ' <id>%s</id>\n' % (retpage.attrib['pageid'])
except:
firstpartok = False
continue
else:
firstpartok = True
yield ret
try:
ret = ''
edits = 0
if config['curonly'] or root.find('continue') == None:
# transform the revision
rev_,edits = reconstructRevisions(root=root)
xmldom = MD.parseString('<stub1>'+ET.tostring(rev_)+'</stub1>')
# convert it into text in case it throws MemoryError
# drop the first three lines and the last two lines, which only set the indentation
ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2])
yield ret
numberofedits += edits
break
else:
rev_,edits = reconstructRevisions(root=root)
xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2])
params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
numberofedits += edits
yield ret
except:
traceback.print_exc()
params['rvcontinue'] = lastcontinue
ret = ''
yield ' </page>\n'
else:
xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
if xml == "":
raise ExportAbortedError(config['index'])
if not "</page>" in xml:
raise PageMissingError(params['titles'], xml)
else:
# strip these sha1 sums which keep showing up in the export and
# which are invalid for the XML schema (they only apply to
# revisions)
xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)

yield xml.split("</page>")[0]

# just for looking good :)
r_timestamp = r'<timestamp>([^<]+)</timestamp>'

numberofedits = 0
numberofedits += len(re.findall(r_timestamp, xml))

yield "</page>\n"

if verbose:
if (numberofedits == 1):
print ' %s, 1 edit' % (title.strip())
else:
print ' %s, %d edits' % (title.strip(), numberofedits)
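
For reference, the full-history branch above pages through the revision list with rvcontinue. Stripped of the error handling and XML reassembly, the continuation pattern it relies on looks roughly like this (placeholder URL and title):

import requests
import xml.etree.cElementTree as ET

session = requests.Session()
params = {'action': 'query', 'format': 'xml', 'prop': 'revisions',
          'titles': 'Main_Page', 'rvlimit': 10,
          'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags'}
while True:
    r = session.get('https://wiki.example.org/w/api.php', params=params)
    root = ET.fromstring(r.text.encode('utf-8'))
    # ...convert the <rev> elements of this chunk into <revision> blocks here...
    cont = root.find('continue')
    if cont is None:
        break  # no more revisions for this page
    params['rvcontinue'] = cont.attrib['rvcontinue']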

def getXMLPageCore(headers={}, params={}, config={}, session=None):
""" """
@@ -663,6 +889,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
else:
print ' %s, %d edits' % (title.strip(), numberofedits)

def getXMLPage_(config={}, title='', verbose=True, session=None):
""" Dispatch to the API exporter or the Special:Export exporter, depending on config['apiexport'] """
#print config
if config['apiexport']:
return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
else:
return getXMLPage(config=config, title=title, verbose=verbose, session=session)
return ''

def cleanXML(xml=''):
""" Trim redundant info """
@@ -710,7 +943,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
for xml in getXMLPage_(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
@@ -1311,6 +1544,8 @@ def getParameters(params=[]):
'--exnamespaces',
metavar="1,2,3",
help='comma-separated value of namespaces to exclude')
groupDownload.add_argument(
'--apiexport', action='store_true', help="Use API:Query instead of Special:Export to export pages")
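# With this flag the new path can be exercised with something like
#   python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --apiexport
# (placeholder URL; --api and --xml are the pre-existing options)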

# Meta info params
groupMeta = parser.add_argument_group(
@@ -1494,6 +1729,7 @@ def getParameters(params=[]):
'cookies': args.cookies or '',
'delay': args.delay,
'retries': int(args.retries),
'apiexport' : args.apiexport
}

other = {