diff --git a/dumpgenerator.py b/dumpgenerator.py
index bd27ff17..a2d5fabc 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -23,10 +23,21 @@ from kitchen.text.converters import getwriter, to_unicode
 except ImportError:
     print "Please install the kitchen module."

+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+import xml.dom.minidom as MD
+
 import cookielib
 import cPickle
 import datetime
 import sys
+import io
+import traceback
+
 try:
     import argparse
 except ImportError:
@@ -63,7 +74,7 @@ UTF8Writer = getwriter('utf8')
 sys.stdout = UTF8Writer(sys.stdout)

-__VERSION__ = '0.4.0-alpha'  # major, minor, micro: semver.org
+__VERSION__ = '0.5.0-alpha'  # major, minor, micro: semver.org


 class PageMissingError(Exception):
     def __init__(self, title, xml):
@@ -164,7 +175,7 @@ def getNamespacesScraper(config={}, session=None):
     namespacenames = {0: ''}  # main is 0, no prefix
     if namespaces:
         r = session.post(
-            url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
+            url=config['index'], params={'title': 'Special:Allpages'}, timeout=120)
         raw = r.text
         delay(config=config, session=session)
@@ -206,7 +217,7 @@ def getNamespacesAPI(config={}, session=None):
                 'meta': 'siteinfo',
                 'siprop': 'namespaces',
                 'format': 'json'},
-            timeout=30
+            timeout=120
         )
         result = getJSON(r)
         delay(config=config, session=session)
@@ -281,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None):
         print '    Retrieving titles in the namespace', namespace
         url = '%s?title=Special:Allpages&namespace=%s' % (
             config['index'], namespace)
-        r = session.get(url=url, timeout=30)
+        r = session.get(url=url, timeout=120)
         raw = r.text
         raw = cleanHTML(raw)
@@ -455,7 +466,7 @@ def getXMLHeader(config={}, session=None):
     else:
         try:
-            xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+            xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
         except PageMissingError as pme:
             # The <page> does not exist. Not a problem, if we get the <mediawiki>.
             xml = pme.xml
@@ -477,7 +488,7 @@ def getXMLHeader(config={}, session=None):
                     )
                     config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                         + ':Export'
-                    xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+                    xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
             except PageMissingError as pme:
                 xml = pme.xml
             except ExportAbortedError:
@@ -500,7 +511,7 @@ def getXMLFileDesc(config={}, title='', session=None):
     """ Get XML for image description page """
     config['curonly'] = 1  # tricky to get only the most recent desc
-    return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
+    return("".join([x for x in getXMLPage_( config=config, title=title, verbose=False, session=session)]))


 def getUserAgent():
@@ -521,7 +532,216 @@ def logerror(config={}, text=''):
         output = u'%s: %s\n' % (
             datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
         outfile.write(output.encode('utf-8'))
+def reconstructRevisions(root=None):
+    # build <revision> elements (XML export schema) from an API prop=revisions response
+    #print ET.tostring(rev)
+    page = ET.Element('stub')
+    edits = 0
+    for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
+        try:
+            rev_ = ET.SubElement(page, 'revision')
+            ET.SubElement(rev_, 'id').text = rev.attrib['revid']
+            ET.SubElement(rev_, 'timestamp').text = rev.attrib['timestamp']
+            contributor = ET.SubElement(rev_, 'contributor')
+            if not rev.attrib.has_key('userhidden'):
+                ET.SubElement(contributor, 'username').text = rev.attrib['user']
+                ET.SubElement(contributor, 'id').text = rev.attrib['userid']
+            else:
+                contributor.set('deleted', 'deleted')
+            comment = ET.SubElement(rev_, 'comment')
+            if not rev.attrib.has_key('commenthidden'):
+                comment.text = rev.attrib['comment']
+            else:
+                comment.set('deleted', 'deleted')
+
+            # some revisions do not return model and format, so just hard-code them
+            ET.SubElement(rev_, 'model').text = 'wikitext'
+            ET.SubElement(rev_, 'format').text = 'text/x-wiki'
+            text = ET.SubElement(rev_, 'text')
+            if not rev.attrib.has_key('texthidden'):
+                text.attrib['xml:space'] = "preserve"
+                text.attrib['bytes'] = rev.attrib['size']
+                text.text = rev.text
+            else:
+                text.set('deleted', 'deleted')
+            # delete sha1 here :)
+            #sha1 = ET.SubElement(rev_, 'sha1')
+            #if not rev.attrib.has_key('sha1missing'):
+                #sha1.text = rev.attrib['sha1']
+            if rev.attrib.has_key('minor'):
+                ET.SubElement(rev_, 'minor')
+            edits += 1
+        except Exception as e:
+            #logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
+            print ET.tostring(rev)
+            traceback.print_exc()
+            page = None
+            edits = 0
+            raise e
+    return page, edits
+
+def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
+    """  """
+    # just send the API request
+    # if it fails, it will reduce params['rvlimit']
+    xml = ''
+    c = 0
+    maxseconds = 100  # max seconds to wait in a single sleeping
+    maxretries = config['retries']  # x retries and skip
+    increment = 20  # increment every retry
+    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
+        if c > 0 and c < maxretries:
+            wait = increment * c < maxseconds and increment * \
+                c or maxseconds  # incremental until maxseconds
+            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' % (c, params['titles' if config['apiexport'] else 'pages'], wait)
+            time.sleep(wait)
+            # reducing server load requesting smallest chunks (if curonly then
+            # rvlimit = 1 from mother function)
+            if params['rvlimit'] > 1:
+                params['rvlimit'] = params['rvlimit'] / 2  # half
+        if c >= maxretries:
+            print '    We have retried %d times' % (c)
+            print '    MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
+            # If it's not already what we tried: our last chance, preserve only the last revision...
+            # config['curonly'] means that the whole dump is configured to save only the last,
+            # params['curonly'] should mean that we've already tried this
+            # fallback, because it's set by the following if and passed to
+            # getXMLPageCore
+            # TODO: save only the last version when failed
+            print '    Saving in the errors log, and skipping...'
+            logerror(
+                config=config,
+                text=u'Error while retrieving the last revision of "%s". Skipping.' %
+                (params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
+            #raise ExportAbortedError(config['index'])
+            return ''  # empty xml
+
+        # FIXME HANDLE HTTP Errors HERE
+        try:
+            r = session.get(url=config['api'], params=params, headers=headers)
+            handleStatusCode(r)
+            xml = fixBOM(r)
+            #print xml
+        except requests.exceptions.ConnectionError as e:
+            print '    Connection error: %s' % (str(e[0]))
+            xml = ''
+        c += 1
+    return xml
+def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
+    """ Get the full history (or current only) of a page using API:Query;
+    if config['curonly'] is set, use export&exportnowrap to export instead.
+    """
+
+    title_ = title
+    title_ = re.sub(' ', '_', title_)
+    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
+    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
+    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
+    #print 'current:%s' % (title_)
+    if not config['curonly']:
+        params = {'titles': title_, 'action': 'query', 'format': 'xml',
+                  'prop': 'revisions',
+                  'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
+                  'rvcontinue': None,
+                  'rvlimit': 10  # TODO: set this by commandline
+                  }
+    else:
+        params = {'titles': title_, 'action': 'query', 'format': 'xml', 'export': 1, 'exportnowrap': 1}
+    #print 'params:%s' % (params)
+    if not config['curonly']:
+        firstpartok = False
+        lastcontinue = None
+        numberofedits = 0
+        ret = ''
+        while True:
+            # in case the last request is not right, saving last time's progress
+            if not firstpartok:
+                try:
+                    lastcontinue = params['rvcontinue']
+                except:
+                    lastcontinue = None
+
+            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+            if xml == "":
+                # just return so that we can continue, and getXMLPageCoreWithApi will log the error
+                return
+            try:
+                root = ET.fromstring(xml.encode('utf-8'))
+            except:
+                continue
+            try:
+                retpage = root.find('query').find('pages').find('page')
+            except:
+                continue
+            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
+                print 'Page not found'
+                raise PageMissingError(params['titles'], xml)
+            if not firstpartok:
+                try:
+                    # build the first part ourselves to reduce memory usage
+                    ret = '  <page>\n'
+                    ret += '    <title>%s</title>\n' % (retpage.attrib['title'])
+                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
+                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
+                except:
+                    firstpartok = False
+                    continue
+                else:
+                    firstpartok = True
+                    yield ret
+            try:
+                ret = ''
+                edits = 0
+                if config['curonly'] or root.find('continue') == None:
+                    # transform the revisions
+                    rev_, edits = reconstructRevisions(root=root)
+                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
+                    # convert it into text in case it throws MemoryError
+                    # delete the first three lines and last two lines, which only set the indent
+                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
+                    yield ret
+                    numberofedits += edits
+                    break
+                else:
+                    rev_, edits = reconstructRevisions(root=root)
+                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
+                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
+                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
+                    numberofedits += edits
+                    yield ret
+            except:
+                traceback.print_exc()
+                params['rvcontinue'] = lastcontinue
+                ret = ''
+        yield '  </page>\n'
+    else:
+        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+        if xml == "":
+            raise ExportAbortedError(config['index'])
+        if not "</page>" in xml:
+            raise PageMissingError(params['titles'], xml)
+        else:
+            # strip these sha1s sums which keep showing up in the export and
+            # which are invalid for the XML schema (they only apply to
+            # revisions)
+            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+
+        yield xml.split("</page>")[0]
+
+        # just for looking good :)
+        r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+        numberofedits = 0
+        numberofedits += len(re.findall(r_timestamp, xml))
+
+        yield "</page>\n"
+
+    if verbose:
+        if (numberofedits == 1):
+            print '    %s, 1 edit' % (title.strip())
+        else:
+            print '    %s, %d edits' % (title.strip(), numberofedits)


 def getXMLPageCore(headers={}, params={}, config={}, session=None):
     """  """
@@ -694,7 +914,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
             print '    %s, 1 edit' % (title.strip())
         else:
             print '    %s, %d edits' % (title.strip(), numberofedits)
-
+def getXMLPage_(config={}, title='', verbose=True, session=None):
+    #print config
+    if config['apiexport']:
+        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
+    else:
+        return getXMLPage(config=config, title=title, verbose=verbose, session=session)
+    return ''

 def makeXmlPageFromRaw(xml):
     """ Discard the metadata around a <page> element in <mediawiki> string"""
@@ -775,7 +1001,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
             if c % 10 == 0:
                 print 'Downloaded %d pages' % (c)
             try:
-                for xml in getXMLPage(config=config, title=title, session=session):
+                for xml in getXMLPage_(config=config, title=title, session=session):
                     xml = cleanXML(xml=xml)
                     xmlfile.write(xml.encode('utf-8'))
             except PageMissingError:
@@ -1680,6 +1906,7 @@ def getParameters(params=[]):
         action='store_true',
         help='resumes previous incomplete dump (requires --path)')
     parser.add_argument('--force', action='store_true', help='')
+    parser.add_argument('--ignore-api-check', action='store_true', help='do not abort if the API check fails')
     parser.add_argument(
         '--user', help='Username if authentication is required.')
     parser.add_argument(
@@ -1723,6 +1950,10 @@ def getParameters(params=[]):
         '--exnamespaces',
         metavar="1,2,3",
         help='comma-separated value of namespaces to exclude')
+    groupDownload.add_argument(
+        '--apiexport',
+        action='store_true',
+        help="Use the API instead of Special:Export to export pages")

     # Meta info params
     groupMeta = parser.add_argument_group(
@@ -1824,6 +2055,8 @@ def getParameters(params=[]):
         index2 = check[1]
         api = checkedapi
         print 'API is OK: ' + checkedapi
+    elif args.ignore_api_check:
+        print 'Error in API. Ignoring.'
     else:
         if index and not args.wiki:
             print 'API not available. Trying with index.php only.'
@@ -1921,6 +2154,7 @@ def getParameters(params=[]):
         'cookies': args.cookies or '',
         'delay': args.delay,
         'retries': int(args.retries),
+        'apiexport': args.apiexport
     }

     other = {
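
Example invocations of the new option (illustrative only, not part of the patch; the wiki URL is a placeholder, --api/--xml/--curonly are existing dumpgenerator.py flags, and --apiexport is added above):

    python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --apiexport
    python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --curonly --apiexport

The first command exports full page histories through action=query&prop=revisions with rvcontinue pagination; the second relies on export&exportnowrap to fetch only the current revision of each page.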