diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..b2a92142
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,10 @@
+{
+ "python.analysis.extraPaths": [
+ ".",
+ "./poster",
+ "./tests",
+ "./wikitools",
+ "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages"
+ ],
+ "python.pythonPath": "/usr/local/bin/python3.8"
+}
\ No newline at end of file
diff --git a/dumpgenerator.py b/dumpgenerator.py
index 536cd0ed..f13b1a46 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# dumpgenerator.py A generator of dumps for wikis
@@ -22,44 +22,38 @@
try:
from kitchen.text.converters import getwriter, to_unicode
except ImportError:
- print "Please install the kitchen module."
-import cookielib
-import cPickle
+ print ("Please install the kitchen module.")
+import http.cookiejar
+import pickle
import datetime
import sys
try:
import argparse
except ImportError:
- print "Please install the argparse module."
+ print ("Please install the argparse module.")
sys.exit(1)
import json
-try:
- from hashlib import md5
-except ImportError: # Python 2.4 compatibility
- from md5 import new as md5
+from hashlib import md5
import os
import re
-import subprocess
+# import subprocess
try:
import requests
except ImportError:
- print "Please install or update the Requests module."
+ print ("Please install or update the Requests module.")
sys.exit(1)
try:
import mwclient
except ImportError:
- print "Please install the mwclient module if you want to use --xmlrevisions."
+ print ("Please install the mwclient module if you want to use --xmlrevisions.")
try:
from lxml import etree
from lxml.builder import E
except ImportError:
- print "Please install the lxml module if you want to use --xmlrevisions."
+ print ("Please install the lxml module if you want to use --xmlrevisions.")
import time
import urllib
-try:
- from urlparse import urlparse, urlunparse
-except ImportError:
- from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlparse, urlunparse
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
@@ -83,22 +77,22 @@ def getVersion():
def truncateFilename(other={}, filename=''):
- """ Truncate filenames when downloading images with large filenames """
+ """Truncate filenames when downloading images with large filenames"""
return filename[:other['filenamelimit']] + \
md5(filename.encode('utf-8')).hexdigest() + '.' + filename.split('.')[-1]
def delay(config={}, session=None):
- """ Add a delay if configured for that """
+ """Add a delay if configured for that"""
if config['delay'] > 0:
- print 'Sleeping... %d seconds...' % (config['delay'])
+ print ('Sleeping... %d seconds...' % (config['delay']))
time.sleep(config['delay'])
def cleanHTML(raw=''):
- """ Extract only the real wiki content and remove rubbish """
- """ This function is ONLY used to retrieve page titles and file names when no API is available """
- """ DO NOT use this function to extract page content """
+ """Extract only the real wiki content and remove rubbish"""
+ """This function is ONLY used to retrieve page titles and file names when no API is available"""
+ """DO NOT use this function to extract page content"""
# different "tags" used by different MediaWiki versions to mark where
# starts and ends content
    if re.search('<!-- bodytext -->', raw):
@@ -118,8 +112,8 @@ def cleanHTML(raw=''):
        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
else:
- print raw[:250]
- print 'This wiki doesn\'t use marks to split content'
+ print (raw[:250])
+ print ('This wiki doesn\'t use marks to split content')
sys.exit()
return raw
@@ -129,37 +123,37 @@ def handleStatusCode(response):
if statuscode >= 200 and statuscode < 300:
return
- print "HTTP Error %d." % statuscode
+ print ("HTTP Error %d." % statuscode)
if statuscode >= 300 and statuscode < 400:
- print "Redirect should happen automatically: please report this as a bug."
- print response.url
+ print ("Redirect should happen automatically: please report this as a bug.")
+ print (response.url)
elif statuscode == 400:
- print "Bad Request: The wiki may be malfunctioning."
- print "Please try again later."
- print response.url
+ print ("Bad Request: The wiki may be malfunctioning.")
+ print ("Please try again later.")
+ print (response.url)
sys.exit(1)
elif statuscode == 401 or statuscode == 403:
- print "Authentication required."
- print "Please use --user and --pass."
- print response.url
+ print ("Authentication required.")
+ print ("Please use --user and --pass.")
+ print (response.url)
elif statuscode == 404:
- print "Not found. Is Special:Export enabled for this wiki?"
- print response.url
+ print ("Not found. Is Special:Export enabled for this wiki?")
+ print (response.url)
sys.exit(1)
elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
- print "Server error, max retries exceeded."
- print "Please resume the dump later."
- print response.url
+ print ("Server error, max retries exceeded.")
+ print ("Please resume the dump later.")
+ print (response.url)
sys.exit(1)
def getNamespacesScraper(config={}, session=None):
- """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
- """ Function called if no API is available """
+ """Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages"""
+ """Function called if no API is available"""
namespaces = config['namespaces']
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
@@ -190,12 +184,12 @@ def getNamespacesScraper(config={}, session=None):
namespaces = [0]
namespaces = list(set(namespaces)) # uniques
- print '%d namespaces found' % (len(namespaces))
+ print ('%d namespaces found' % (len(namespaces)))
return namespaces, namespacenames
def getNamespacesAPI(config={}, session=None):
- """ Uses the API to get the list of namespaces names and ids """
+ """Uses the API to get the list of namespaces names and ids"""
namespaces = config['namespaces']
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
@@ -213,9 +207,9 @@ def getNamespacesAPI(config={}, session=None):
try:
nsquery = result['query']['namespaces']
except KeyError:
- print "Error: could not get namespaces from the API request."
- print "HTTP %d" % r.status_code
- print r.text
+ print ("Error: could not get namespaces from the API request.")
+ print ("HTTP %d" % r.status_code)
+ print (r.text)
return None
if 'all' in namespaces:
@@ -241,22 +235,22 @@ def getNamespacesAPI(config={}, session=None):
namespaces = [0]
namespaces = list(set(namespaces)) # uniques
- print '%d namespaces found' % (len(namespaces))
+ print ('%d namespaces found' % (len(namespaces)))
return namespaces, namespacenames
def getPageTitlesAPI(config={}, session=None):
- """ Uses the API to get the list of page titles """
+ """Uses the API to get the list of page titles"""
titles = []
namespaces, namespacenames = getNamespacesAPI(
config=config, session=session)
for namespace in namespaces:
if namespace in config['exnamespaces']:
- print ' Skipping namespace = %d' % (namespace)
+ print (' Skipping namespace = %d' % (namespace))
continue
c = 0
- print ' Retrieving titles in the namespace %d' % (namespace)
+ print (' Retrieving titles in the namespace %d' % (namespace))
apiurl = urlparse(config['api'])
site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme)
for page in site.allpages(namespace=namespace):
@@ -266,19 +260,19 @@ def getPageTitlesAPI(config={}, session=None):
yield title
if len(titles) != len(set(titles)):
- print 'Probably a loop, switching to next namespace'
+ print ('Probably a loop, switching to next namespace')
titles = list(set(titles))
delay(config=config, session=session)
- print ' %d titles retrieved in the namespace %d' % (c, namespace)
+ print (' %d titles retrieved in the namespace %d' % (c, namespace))
def getPageTitlesScraper(config={}, session=None):
- """ Scrape the list of page titles from Special:Allpages """
+ """Scrape the list of page titles from Special:Allpages"""
titles = []
namespaces, namespacenames = getNamespacesScraper(
config=config, session=session)
for namespace in namespaces:
- print ' Retrieving titles in the namespace', namespace
+ print (' Retrieving titles in the namespace', namespace)
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
r = session.get(url=url, timeout=30)
@@ -342,13 +336,13 @@ def getPageTitlesScraper(config={}, session=None):
checked_suballpages.append(name)
delay(config=config, session=session)
r = session.get(url=url, timeout=10)
- #print 'Fetching URL: ', url
+ #print ('Fetching URL: ', url)
raw = r.text
raw = cleanHTML(raw)
rawacum += raw # merge it after removed junk
- print ' Reading', name, len(raw), 'bytes', \
+ print (' Reading', name, len(raw), 'bytes', \
len(re.findall(r_suballpages, raw)), 'subpages', \
- len(re.findall(r_title, raw)), 'pages'
+ len(re.findall(r_title, raw)), 'pages')
delay(config=config, session=session)
oldfr = currfr
@@ -362,24 +356,24 @@ def getPageTitlesScraper(config={}, session=None):
if t not in titles:
titles.append(t)
c += 1
- print ' %d titles retrieved in the namespace %d' % (c, namespace)
+ print (' %d titles retrieved in the namespace %d' % (c, namespace))
return titles
def getPageTitles(config={}, session=None):
- """ Get list of page titles """
+ """Get list of page titles"""
# http://en.wikipedia.org/wiki/Special:AllPages
# http://archiveteam.org/index.php?title=Special:AllPages
# http://www.wikanda.es/wiki/Especial:Todas
- print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None')
- print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None')
+ print ('Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None'))
+ print ('Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None'))
titles = []
if 'api' in config and config['api']:
try:
titles = getPageTitlesAPI(config=config, session=session)
except:
- print "Error: could not get page titles from the API"
+ print ("Error: could not get page titles from the API")
titles = getPageTitlesScraper(config=config, session=session)
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
@@ -396,15 +390,15 @@ def getPageTitles(config={}, session=None):
# We can use sort -u in UNIX, but is it worth it?
titlesfile.write(u'--END--\n')
titlesfile.close()
- print 'Titles saved at...', titlesfilename
+ print ('Titles saved at...', titlesfilename)
- print '%d page titles loaded' % (c)
+ print ('%d page titles loaded' % (c))
return titlesfilename
def getImageNames(config={}, session=None):
- """ Get list of image names """
+ """Get list of image names"""
- print 'Retrieving image filenames'
+ print ('Retrieving image filenames')
images = []
if 'api' in config and config['api']:
images = getImageNamesAPI(config=config, session=session)
@@ -414,21 +408,21 @@ def getImageNames(config={}, session=None):
# images = list(set(images)) # it is a list of lists
images.sort()
- print '%d image names loaded' % (len(images))
+ print ('%d image names loaded' % (len(images)))
return images
def getXMLHeader(config={}, session=None):
- """ Retrieve a random page to extract XML headers (namespace info, etc) """
+ """Retrieve a random page to extract XML headers (namespace info, etc)"""
# get the header of a random page, to attach it in the complete XML backup
# similar to:
# if retrieving params['limit'] revisions fails, returns a current only version
# if all fail, returns the empty string
@@ -538,17 +532,17 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
- print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
+ print (' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait))
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
if params['limit'] > 1:
params['limit'] = params['limit'] / 2 # half
if c >= maxretries:
- print ' We have retried %d times' % (c)
- print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
+ print (' We have retried %d times' % (c))
+ print (' MediaWiki error for "%s", network error or whatever...' % (params['pages']))
if config['failfast']:
- print "Exit, it will be for another time"
+ print ("Exit, it will be for another time")
sys.exit()
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
@@ -556,7 +550,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
# fallback, because it's set by the following if and passed to
# getXMLPageCore
if not config['curonly'] and not 'curonly' in params:
- print ' Trying to save only the last revision for this page...'
+ print (' Trying to save only the last revision for this page...')
params['curonly'] = 1
logerror(
config=config,
@@ -570,7 +564,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
session=session
)
else:
- print ' Saving in the errors log, and skipping...'
+ print (' Saving in the errors log, and skipping...')
logerror(
config=config,
text=u'Error while retrieving the last revision of "%s". Skipping.' %
@@ -583,10 +577,10 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
handleStatusCode(r)
xml = fixBOM(r)
except requests.exceptions.ConnectionError as e:
- print ' Connection error: %s'%(str(e[0]))
+ print (' Connection error: %s'%(str(e)))
xml = ''
except requests.exceptions.ReadTimeout as e:
- print ' Read timeout: %s'%(str(e[0]))
+ print (' Read timeout: %s'%(str(e)))
xml = ''
c += 1
@@ -594,7 +588,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
def getXMLPage(config={}, title='', verbose=True, session=None):
- """ Get the full history (or current only) of a page """
+ """Get the full history (or current only) of a page"""
# if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated
# http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
@@ -649,7 +643,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
xml2 = getXMLPageCore(
params=params, config=config, session=session)
except MemoryError:
- print "The page's history exceeds our memory, halving limit."
+ print ("The page's history exceeds our memory, halving limit.")
params['limit'] = params['limit'] / 2
continue
@@ -659,11 +653,11 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
# again the same XML, this wiki does not support params in
# Special:Export, offer complete XML up to X edits (usually
# 1000)
- print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated'
+ print ('ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated')
truncated = True
break
else:
- """
+ """
Main Page
@@ -673,14 +667,14 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
4180098322011-03-09T19:57:06Z
- """
+ """
# offset is OK in this wiki, merge with the previous chunk
# of this page history and continue
try:
                xml2 = xml2.split("</page>")[0]
                yield '  <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
except MemoryError:
- print "The page's history exceeds our memory, halving limit."
+ "The page's history exceeds our memory, halving limit."
params['limit'] = params['limit'] / 2
continue
xml = xml2
@@ -691,13 +685,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if verbose:
if (numberofedits == 1):
- print ' %s, 1 edit' % (title.strip())
+ print (' %s, 1 edit' % (title.strip()))
else:
- print ' %s, %d edits' % (title.strip(), numberofedits)
+ print (' %s, %d edits' % (title.strip(), numberofedits))
def makeXmlPageFromRaw(xml):
- """ Discard the metadata around a element in string"""
+ """Discard the metadata around a element in string"""
root = etree.XML(xml)
find = etree.XPath("//*[local-name() = 'page']")
# The tag will inherit the namespace, like:
@@ -707,7 +701,7 @@ def makeXmlPageFromRaw(xml):
def cleanXML(xml=''):
- """ Trim redundant info from the XML however it comes """
+ """Trim redundant info from the XML however it comes"""
# do not touch XML codification, leave AS IS
    if re.search(r'</siteinfo>\n', xml):
        xml = xml.split('</siteinfo>\n')[1]
@@ -717,7 +711,7 @@ def cleanXML(xml=''):
def generateXMLDump(config={}, titles=[], start=None, session=None):
- """ Generates a XML dump for a list of titles or from revision IDs """
+ """Generates a XML dump for a list of titles or from revision IDs"""
# TODO: titles is now unused.
header, config = getXMLHeader(config=config, session=session)
@@ -733,7 +727,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
print("WARNING: will try to start the download from title: {}".format(start))
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
else:
- print 'Retrieving the XML for every page from the beginning'
+ print ('Retrieving the XML for every page from the beginning')
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
try:
@@ -742,17 +736,17 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, it's expected this may be less
# TODO: get the page title and reuse the usual format "X title, y edits"
- print " %d more revisions exported" % numrevs
+ print (" %d more revisions exported" % numrevs)
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except AttributeError as e:
print(e)
- print "This API library version is not working"
+ print ("This API library version is not working")
sys.exit()
else:
- print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
+ print ('Retrieving the XML for every page from "%s"' % (start and start or 'start'))
if start:
- print "Removing the last chunk of past XML dump: it is probably incomplete."
+ print ("Removing the last chunk of past XML dump: it is probably incomplete.")
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
@@ -773,7 +767,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
continue
delay(config=config, session=session)
if c % 10 == 0:
- print 'Downloaded %d pages' % (c)
+ print ('Downloaded %d pages' % (c))
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
@@ -791,7 +785,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile.write(footer)
xmlfile.close()
- print 'XML dump saved at...', xmlfilename
+ print ('XML dump saved at...', xmlfilename)
def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# FIXME: actually figure out the various strategies for each MediaWiki version
@@ -874,7 +868,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
for page in arvrequest['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
- print " %d more revisions listed, until %s" % (len(revids), revids[-1])
+ print (" %d more revisions listed, until %s" % (len(revids), revids[-1]))
# We can now get the XML for one revision at a time
# FIXME: we can actually get them in batches as we used to
@@ -922,7 +916,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
- print "Warning. Could not use allrevisions. Wiki too old?"
+ print ("Warning. Could not use allrevisions. Wiki too old?")
if config['curonly']:
# The raw XML export in the API gets a title and gives the latest revision.
# We could also use the allpages API as generator but let's be consistent.
@@ -1022,7 +1016,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
if 'continue' in prequest.keys():
print("Getting more revisions for the page")
for key, value in prequest['continue']:
- params[key] = value
+ pparams[key] = value
elif 'query-continue' in prequest.keys():
rvstartid = prequest['query-continue']['revisions']['rvstartid']
pparams['rvstartid'] = rvstartid
@@ -1047,11 +1041,11 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
except mwclient.errors.MwClientError as e:
print(e)
- print "This mwclient version seems not to work for us. Exiting."
+ print ("This mwclient version seems not to work for us. Exiting.")
sys.exit()
def makeXmlFromPage(page):
- """ Output an XML document as a string from a page as in the API JSON """
+ """Output an XML document as a string from a page as in the API JSON"""
try:
p = E.page(
E.title(to_unicode(page['title'])),
@@ -1098,7 +1092,7 @@ def makeXmlFromPage(page):
return etree.tostring(p, pretty_print=True, encoding='unicode')
def readTitles(config={}, start=None, batch=False):
- """ Read title list from a file, from the title "start" """
+ """Read title list from a file, from the title "start" """
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
@@ -1173,7 +1167,7 @@ def reverse_readline(filename, buf_size=8192, truncate=False):
yield segment
def saveImageNames(config={}, images=[], session=None):
- """ Save image list in a file, including filename, url and uploader """
+ """Save image list in a file, including filename, url and uploader"""
imagesfilename = '%s-%s-images.txt' % (
domain2prefix(config=config), config['date'])
@@ -1193,11 +1187,11 @@ def saveImageNames(config={}, images=[], session=None):
imagesfile.write('\n--END--')
imagesfile.close()
- print 'Image filenames and URLs saved at...', imagesfilename
+ print ('Image filenames and URLs saved at...', imagesfilename)
def curateImageURL(config={}, url=''):
- """ Returns an absolute URL for an image, adding the domain if missing """
+ """Returns an absolute URL for an image, adding the domain if missing"""
if 'index' in config and config['index']:
# remove from :// (http or https) until the first / after domain
@@ -1207,7 +1201,7 @@ def curateImageURL(config={}, url=''):
domainalone = config['api'].split(
'://')[0] + '://' + config['api'].split('://')[1].split('/')[0]
else:
- print 'ERROR: no index nor API'
+ print ('ERROR: no index nor API')
sys.exit()
if url.startswith('//'): # Orain wikifarm returns URLs starting with //
@@ -1227,7 +1221,7 @@ def curateImageURL(config={}, url=''):
def getImageNamesScraper(config={}, session=None):
- """ Retrieve file list: filename, url, uploader """
+ """Retrieve file list: filename, url, uploader"""
# (?\d+)&'
@@ -1253,15 +1247,15 @@ def getImageNamesScraper(config={}, session=None):
ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)',
raw):
if limit > 10:
- print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
+ print ('Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit))
limit = limit / 10
continue
elif retries > 0: # waste retries, then exit
retries -= 1
- print 'Retrying...'
+ print ('Retrying...')
continue
else:
- print 'No more retries, exit...'
+ print ('No more retries, exit...')
break
raw = cleanHTML(raw)
@@ -1309,7 +1303,7 @@ def getImageNamesScraper(config={}, session=None):
uploader = undoHTMLEntities(text=uploader)
uploader = urllib.unquote(uploader)
images.append([filename, url, uploader])
- # print filename, url
+ # print (filename, url)
if re.search(r_next, raw):
new_offset = re.findall(r_next, raw)[0]
@@ -1323,16 +1317,16 @@ def getImageNamesScraper(config={}, session=None):
offset = ''
if (len(images) == 1):
- print ' Found 1 image'
+ print (' Found 1 image')
else:
- print ' Found %d images' % (len(images))
+ print (' Found %d images' % (len(images)))
images.sort()
return images
def getImageNamesAPI(config={}, session=None):
- """ Retrieve file list: filename, url, uploader """
+ """Retrieve file list: filename, url, uploader"""
oldAPI = False
aifrom = '!'
images = []
@@ -1366,7 +1360,7 @@ def getImageNamesAPI(config={}, session=None):
aifrom = jsonimages['continue']['aicontinue']
elif 'aifrom' in jsonimages['continue']:
aifrom = jsonimages['continue']['aifrom']
- # print aifrom
+ # print (aifrom)
for image in jsonimages['query']['allimages']:
url = image['url']
@@ -1375,9 +1369,9 @@ def getImageNamesAPI(config={}, session=None):
# http://bugs.python.org/issue8136
if 'api' in config and ('.wikia.' in config['api'] or '.fandom.com' in config['api']):
#to avoid latest?cb=20120816112532 in filenames
- filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')), 'utf-8')
+ filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3]).encode('ascii', 'ignore').decode())
else:
- filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8')
+ filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1]).encode('ascii', 'ignore').decode())
uploader = re.sub('_', ' ', image['user'])
images.append([filename, url, uploader])
else:
@@ -1416,8 +1410,8 @@ def getImageNamesAPI(config={}, session=None):
if 'gapfrom' in jsonimages['query-continue']['allpages']:
gapfrom = jsonimages[
'query-continue']['allpages']['gapfrom']
- # print gapfrom
- # print jsonimages['query']
+ # print (gapfrom)
+ # print (jsonimages['query'])
for image, props in jsonimages['query']['pages'].items():
url = props['imageinfo'][0]['url']
@@ -1433,15 +1427,15 @@ def getImageNamesAPI(config={}, session=None):
break
if (len(images) == 1):
- print ' Found 1 image'
+ print (' Found 1 image')
else:
- print ' Found %d images' % (len(images))
+ print (' Found %d images' % (len(images)))
return images
def undoHTMLEntities(text=''):
- """ Undo some HTML codes """
+ """Undo some HTML codes"""
# i guess only < > & " ' need conversion
# http://www.w3schools.com/html/html_entities.asp
@@ -1455,13 +1449,13 @@ def undoHTMLEntities(text=''):
def generateImageDump(config={}, other={}, images=[], start='', session=None):
- """ Save files and descriptions using a file list """
+ """Save files and descriptions using a file list"""
# fix use subdirectories md5
- print 'Retrieving images from "%s"' % (start and start or 'start')
+ print ('Retrieving images from "%s"' % (start and start or 'start'))
imagepath = '%s/images' % (config['path'])
if not os.path.isdir(imagepath):
- print 'Creating "%s" directory' % (imagepath)
+ print ('Creating "%s" directory' % (imagepath))
os.makedirs(imagepath)
c = 0
@@ -1482,7 +1476,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
if len(filename2) > other['filenamelimit']:
# split last . (extension) and then merge
filename2 = truncateFilename(other=other, filename=filename2)
- print 'Filename is too long, truncating. Now it is:', filename2
+ print ('Filename is too long, truncating. Now it is:', filename2)
filename3 = u'%s/%s' % (imagepath, filename2)
imagefile = open(filename3, 'wb')
@@ -1543,13 +1537,13 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
delay(config=config, session=session)
c += 1
if c % 10 == 0:
- print ' Downloaded %d images' % (c)
+ print (' Downloaded %d images' % (c))
- print 'Downloaded %d images' % (c)
+ print ('Downloaded %d images' % (c))
def saveLogs(config={}, session=None):
- """ Save Special:Log """
+ """Save Special:Log"""
# get all logs from Special:Log
"""parse
- """
+ """
delay(config=config, session=session)
def domain2prefix(config={}, session=None):
- """ Convert domain name to a valid prefix filename. """
+ """Convert domain name to a valid prefix filename."""
# At this point, both api and index are supposed to be defined
domain = ''
@@ -1589,30 +1583,30 @@ def domain2prefix(config={}, session=None):
def loadConfig(config={}, configfilename=''):
- """ Load config file """
+ """Load config file"""
try:
with open('%s/%s' % (config['path'], configfilename), 'r') as infile:
- config = cPickle.load(infile)
+ config = pickle.load(infile)
except:
- print 'There is no config file. we can\'t resume. Start a new dump.'
+ print ('There is no config file. we can\'t resume. Start a new dump.')
sys.exit()
return config
def saveConfig(config={}, configfilename=''):
- """ Save config file """
+ """Save config file"""
with open('%s/%s' % (config['path'], configfilename), 'w') as outfile:
- cPickle.dump(config, outfile)
+ pickle.dump(config, outfile)
def welcome():
message = ''
- """ Opening message """
+ """Opening message"""
message += "#" * 73
- message += """
+ message += """
# Welcome to DumpGenerator %s by WikiTeam (GPL v3) #
# More info at: https://github.com/WikiTeam/wikiteam #""" % (getVersion())
message += "\n"
@@ -1623,7 +1617,7 @@ def welcome():
message += "#" * 73
message += "\n"
message += "# Copyright (C) 2011-%d WikiTeam developers #\n" % (datetime.datetime.now().year)
- message += """
+ message += """
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
@@ -1645,11 +1639,11 @@ def welcome():
def bye():
- """ Closing message """
- print "---> Congratulations! Your dump is complete <---"
- print "If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues"
- print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam"
- print "Good luck! Bye!"
+ """Closing message"""
+ print ("---> Congratulations! Your dump is complete <---")
+ print ("If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues"
+ "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam")
+ print ("Good luck! Bye!")
def getParameters(params=[]):
@@ -1738,37 +1732,37 @@ def getParameters(params=[]):
help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
args = parser.parse_args()
- # print args
+ # print (args)
# Don't mix download params and meta info params
if (args.xml or args.images) and \
(args.get_wiki_engine):
- print 'ERROR: Don\'t mix download params and meta info params'
+ print ('ERROR: Don\'t mix download params and meta info params')
parser.print_help()
sys.exit(1)
# No download params and no meta info params? Exit
if (not args.xml and not args.images) and \
(not args.get_wiki_engine):
- print 'ERROR: Use at least one download param or meta info param'
+ print ('ERROR: Use at least one download param or meta info param')
parser.print_help()
sys.exit(1)
# Execute meta info params
if args.wiki:
if args.get_wiki_engine:
- print getWikiEngine(url=args.wiki)
+ print (getWikiEngine(url=args.wiki))
sys.exit()
# Create session
- cj = cookielib.MozillaCookieJar()
+ cj = http.cookiejar.MozillaCookieJar()
if args.cookies:
cj.load(args.cookies)
- print 'Using cookies from %s' % args.cookies
+ print ('Using cookies from %s' % args.cookies)
session = requests.Session()
try:
- from requests.packages.urllib3.util.retry import Retry
+ from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry(total=5,
@@ -1787,8 +1781,8 @@ def getParameters(params=[]):
# check URLs
for url in [args.api, args.index, args.wiki]:
if url and (not url.startswith('http://') and not url.startswith('https://')):
- print url
- print 'ERROR: URLs must start with http:// or https://\n'
+ print (url)
+ print ('ERROR: URLs must start with http:// or https://\n')
parser.print_help()
sys.exit(1)
@@ -1804,7 +1798,7 @@ def getParameters(params=[]):
if not index:
index = index2
else:
- print 'ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki'
+ print ('ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki')
sys.exit(1)
else:
if api == '':
@@ -1812,8 +1806,8 @@ def getParameters(params=[]):
elif index == '':
index = '/'.join(api.split('/')[:-1]) + '/index.php'
- # print api
- # print index
+ # print (api)
+ # print (index)
index2 = None
if api:
@@ -1823,20 +1817,20 @@ def getParameters(params=[]):
# Replace the index URL we got from the API check
index2 = check[1]
api = checkedapi
- print 'API is OK: ' + checkedapi
+ print ('API is OK: ' + checkedapi)
else:
if index and not args.wiki:
- print 'API not available. Trying with index.php only.'
+ print ('API not available. Trying with index.php only.')
args.api = None
else:
- print 'Error in API. Please, provide a correct path to API'
+ print ('Error in API. Please, provide a correct path to API')
sys.exit(1)
if index and checkIndex(
index=index,
cookies=args.cookies,
session=session):
- print 'index.php is OK'
+ print ('index.php is OK')
else:
index = index2
if index and index.startswith('//'):
@@ -1845,7 +1839,7 @@ def getParameters(params=[]):
index=index,
cookies=args.cookies,
session=session):
- print 'index.php is OK'
+ print ('index.php is OK')
else:
try:
index = '/'.join(index.split('/')[:-1])
@@ -1855,16 +1849,16 @@ def getParameters(params=[]):
index=index,
cookies=args.cookies,
session=session):
- print 'index.php is OK'
+ print ('index.php is OK')
else:
- print 'Error in index.php.'
+ print ('Error in index.php.')
if not args.xmlrevisions:
- print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.'
+ print ('Please, provide a correct path to index.php or use --xmlrevisions. Terminating.')
sys.exit(1)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
- print 'ERROR: Both --user and --pass are required for authentication.'
+ print ('ERROR: Both --user and --pass are required for authentication.')
parser.print_help()
sys.exit(1)
@@ -1876,7 +1870,7 @@ def getParameters(params=[]):
if re.search(
r'[^\d, \-]',
args.namespaces) and args.namespaces.lower() != 'all':
- print "Invalid namespace values.\nValid format is integer(s) separated by commas"
+ print ("Invalid namespace values.\nValid format is integer(s) separated by commas")
sys.exit()
else:
ns = re.sub(' ', '', args.namespaces)
@@ -1888,19 +1882,19 @@ def getParameters(params=[]):
# Process namespace exclusions
if args.exnamespaces:
if re.search(r'[^\d, \-]', args.exnamespaces):
- print "Invalid namespace values.\nValid format is integer(s) separated by commas"
+ print ("Invalid namespace values.\nValid format is integer(s) separated by commas")
sys.exit(1)
else:
ns = re.sub(' ', '', args.exnamespaces)
if ns.lower() == 'all':
- print 'You cannot exclude all namespaces.'
+ print ('You cannot exclude all namespaces.')
sys.exit(1)
else:
exnamespaces = [int(i) for i in ns.split(',')]
# --curonly requires --xml
if args.curonly and not args.xml:
- print "--curonly requires --xml\n"
+ print ("--curonly requires --xml\n")
parser.print_help()
sys.exit(1)
@@ -1938,7 +1932,7 @@ def getParameters(params=[]):
def checkRetryAPI(api=None, retries=5, apiclient=False, session=None):
- """ Call checkAPI and mwclient if necessary """
+ """Call checkAPI and mwclient if necessary"""
retry = 0
retrydelay = 20
check = None
@@ -1947,9 +1941,9 @@ def checkRetryAPI(api=None, retries=5, apiclient=False, session=None):
check = checkAPI(api, session=session)
break
except requests.exceptions.ConnectionError as e:
- print 'Connection error: %s'%(str(e))
+ print ('Connection error: %s'%(str(e)))
retry += 1
- print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
+ print ("Start retry attempt %d in %d seconds."%(retry+1, retrydelay))
time.sleep(retrydelay)
if check and apiclient:
@@ -1974,11 +1968,11 @@ def checkRetryAPI(api=None, retries=5, apiclient=False, session=None):
return check, api
def checkAPI(api=None, session=None):
- """ Checking API availability """
+ """Checking API availability"""
global cj
# handle redirects
for i in range(4):
- print 'Checking API...', api
+ print ('Checking API...', api)
r = session.get(
url=api,
params={
@@ -1993,7 +1987,7 @@ def checkAPI(api=None, session=None):
p = r.url
api = urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
elif r.status_code > 400:
- print "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
+ print ("MediaWiki API URL not found or giving error: HTTP %d" % r.status_code)
return False
if "MediaWiki API is not enabled for this site." in r.text:
return False
@@ -2006,33 +2000,33 @@ def checkAPI(api=None, session=None):
result['query']['general']['script']
return ( True, index, api )
except KeyError:
- print "MediaWiki API seems to work but returned no index URL"
+ print ("MediaWiki API seems to work but returned no index URL")
return (True, None, api)
except ValueError:
- print repr(r.text)
- print "MediaWiki API returned data we could not parse"
+ print (repr(r.text))
+ print ("MediaWiki API returned data we could not parse")
return False
return False
def checkIndex(index=None, cookies=None, session=None):
- """ Checking index.php availability """
+ """Checking index.php availability"""
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
if r.status_code >= 400:
print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))
return False
raw = r.text
- print 'Checking index.php...', index
+ print ('Checking index.php...', index)
# Workaround for issue 71
if re.search(
r'(Special:Badtitle|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required)',
raw) and not cookies:
- print "ERROR: This wiki requires login and we are not authenticated"
+ print ("ERROR: This wiki requires login and we are not authenticated")
return False
if re.search(
r'(page-Index_php|"wgPageName":"Index.php"|"firstHeading">Index.php)',
raw):
- print "Looks like the page called Index.php, not index.php itself"
+ print ("Looks like the page called Index.php, not index.php itself")
return False
if re.search(
r'(This wiki is powered by|
|meta name="generator" content="MediaWiki)',
@@ -2042,7 +2036,7 @@ def checkIndex(index=None, cookies=None, session=None):
def removeIP(raw=''):
- """ Remove IP from HTML comments """
+ """Remove IP from HTML comments """
raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
# http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
@@ -2074,10 +2068,10 @@ def fixBOM(request):
def checkXMLIntegrity(config={}, titles=[], session=None):
- """ Check XML dump integrity, to detect broken XML chunks """
+ """Check XML dump integrity, to detect broken XML chunks"""
return
- print 'Verifying dump...'
+ print ('Verifying dump...')
checktitles = 0
checkpageopen = 0
checkpageclose = 0
@@ -2107,7 +2101,7 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
if (checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose):
pass
else:
- print 'XML dump seems to be corrupted.'
+ print ('XML dump seems to be corrupted.')
reply = ''
if config['failfast']:
reply = 'yes'
@@ -2116,12 +2110,12 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
if reply.lower() in ['yes', 'y']:
generateXMLDump(config=config, titles=titles, session=session)
elif reply.lower() in ['no', 'n']:
- print 'Not generating a new dump.'
+ print ('Not generating a new dump.')
def createNewDump(config={}, other={}):
images = []
- print 'Trying generating a new dump into a new directory...'
+ print ('Trying generating a new dump into a new directory...')
if config['xml']:
getPageTitles(config=config, session=other['session'])
titles=readTitles(config)
@@ -2144,7 +2138,7 @@ def createNewDump(config={}, other={}):
def resumePreviousDump(config={}, other={}):
images = []
- print 'Resuming previous dump process...'
+ print ('Resuming previous dump process...')
if config['xml']:
titles=readTitles(config)
try:
@@ -2160,9 +2154,9 @@ def resumePreviousDump(config={}, other={}):
lasttitle = '' # probably file does not exists
if lasttitle == '--END--':
# titles list is complete
- print 'Title list was completed in the previous session'
+ print ('Title list was completed in the previous session')
else:
- print 'Title list is incomplete. Reloading...'
+ print ('Title list is incomplete. Reloading...')
# do not resume, reload, to avoid inconsistences, deleted pages or
# so
getPageTitles(config=config, session=other['session'])
@@ -2194,10 +2188,10 @@ def resumePreviousDump(config={}, other={}):
pass # probably file does not exists
if xmliscomplete:
- print 'XML dump was completed in the previous session'
+ print ('XML dump was completed in the previous session')
elif lastxmltitle:
# resuming...
- print 'Resuming XML dump from "%s"' % (lastxmltitle)
+ print ('Resuming XML dump from "%s"' % (lastxmltitle))
titles = readTitles(config, start=lastxmltitle)
generateXMLDump(
config=config,
@@ -2206,7 +2200,7 @@ def resumePreviousDump(config={}, other={}):
session=other['session'])
else:
# corrupt? only has XML header?
- print 'XML is corrupt? Regenerating...'
+ print ('XML is corrupt? Regenerating...')
titles = readTitles(config)
generateXMLDump(
config=config, titles=titles, session=other['session'])
@@ -2222,7 +2216,7 @@ def resumePreviousDump(config={}, other={}):
config=config),
config['date']),
'r')
- raw = unicode(f.read(), 'utf-8').strip()
+ raw = f.read().strip()
lines = raw.split('\n')
for l in lines:
if re.search(r'\t', l):
@@ -2232,9 +2226,9 @@ def resumePreviousDump(config={}, other={}):
except:
pass # probably file does not exists
if lastimage == u'--END--':
- print 'Image list was completed in the previous session'
+ print ('Image list was completed in the previous session')
else:
- print 'Image list is incomplete. Reloading...'
+ print ('Image list is incomplete. Reloading...')
# do not resume, reload, to avoid inconsistences, deleted images or
# so
images = getImageNames(config=config, session=other['session'])
@@ -2261,10 +2255,10 @@ def resumePreviousDump(config={}, other={}):
complete = False
break
c += 1
- print '%d images were found in the directory from a previous session' % (c)
+ print ('%d images were found in the directory from a previous session' % (c))
if complete:
# image dump is complete
- print 'Image dump was completed in the previous session'
+ print ('Image dump was completed in the previous session')
else:
# we resume from previous image, which may be corrupted (or missing
# .desc) by the previous session ctrl-c or abort
@@ -2281,12 +2275,12 @@ def resumePreviousDump(config={}, other={}):
def saveSpecialVersion(config={}, session=None):
- """ Save Special:Version as .html, to preserve extensions details """
+ """Save Special:Version as .html, to preserve extensions details"""
if os.path.exists('%s/Special:Version.html' % (config['path'])):
- print 'Special:Version.html exists, do not overwrite'
+ print ('Special:Version.html exists, do not overwrite')
else:
- print 'Downloading Special:Version with extensions and other related info'
+ print ('Downloading Special:Version with extensions and other related info')
r = session.post(
url=config['index'], params={'title': 'Special:Version'}, timeout=10)
raw = r.text
@@ -2297,12 +2291,12 @@ def saveSpecialVersion(config={}, session=None):
def saveIndexPHP(config={}, session=None):
- """ Save index.php as .html, to preserve license details available at the botom of the page """
+ """Save index.php as .html, to preserve license details available at the botom of the page"""
if os.path.exists('%s/index.html' % (config['path'])):
- print 'index.html exists, do not overwrite'
+ print ('index.html exists, do not overwrite')
else:
- print 'Downloading index.php (Main Page) as index.html'
+ print ('Downloading index.php (Main Page) as index.html')
r = session.post(url=config['index'], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
@@ -2311,13 +2305,13 @@ def saveIndexPHP(config={}, session=None):
outfile.write(raw.encode('utf-8'))
def saveSiteInfo(config={}, session=None):
- """ Save a file with site info """
+ """Save a file with site info"""
if config['api']:
if os.path.exists('%s/siteinfo.json' % (config['path'])):
- print 'siteinfo.json exists, do not overwrite'
+ print ('siteinfo.json exists, do not overwrite')
else:
- print 'Downloading site info as siteinfo.json'
+ print ('Downloading site info as siteinfo.json')
# MediaWiki 1.13+
r = session.get(
@@ -2356,7 +2350,7 @@ def saveSiteInfo(config={}, session=None):
def avoidWikimediaProjects(config={}, other={}):
- """ Skip Wikimedia projects and redirect to the dumps website """
+ """Skip Wikimedia projects and redirect to the dumps website"""
# notice about wikipedia dumps
url = ''
@@ -2367,15 +2361,15 @@ def avoidWikimediaProjects(config={}, other={}):
if re.findall(
r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
url):
- print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
- print 'Download the dumps from http://dumps.wikimedia.org'
+ print ('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!')
+ print ('Download the dumps from http://dumps.wikimedia.org')
if not other['force']:
- print 'Thanks!'
+ print ('Thanks!')
sys.exit()
def getWikiEngine(url=''):
- """ Returns the wiki engine of a URL, if known """
+ """Returns the wiki engine of a URL, if known"""
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
@@ -2452,13 +2446,13 @@ def getWikiEngine(url=''):
elif re.search(ur'(?im)(