
Commit 27c7fde

refactor mirrors.py into nsdb
remove dependency on requests; add a check for internet connection before attempting to download a dump
1 parent 385ce48 commit 27c7fde

File tree

4 files changed: +110 -107 lines


DOCUMENTATION.md

+11-21
@@ -41,6 +41,17 @@ Functions
 Execution takes 5-15 minutes as a guideline
 
 
+`findFastestMirror(dump='20200401/', wiki='enwiki/')`
+: Gets a list of the fastest mirrors, downloads a single file from each
+and returns the fastest one.
+
+Execution takes 5-10 seconds as a guideline
+
+Returns
+-------
+fastestMirror: str - the url of the fastest mirror
+
+
 `jobsDone()`
 : Returns True if all jobs are done
 
@@ -266,27 +277,6 @@ Functions
 -----
 
 
-Module [mirrors](nsdb/mirrors.py)
-==============
-This script finds the fastest mirror to download Wikipedia dumps from
-
-Functions
----------
-
-
-`fastest(dump='20200401/', wiki='enwiki/')`
-: Gets a list of the fastest mirrors, downloads a single file from each
-and returns the fastest one.
-
-Execution takes 5-10 seconds as a guideline
-
-Returns
--------
-fastestMirror: str - the url of the fastest mirror
-
------
-
-
 Module [Database](nsdb/Database.py)
 ===============
 This module creates a database connection for other scripts to use.
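
A minimal usage sketch for `findFastestMirror` as it is documented above; the import path assumes nsdb.py and its sibling modules (Database, parse, splitwiki) are importable from the working directory, and the argument values are simply the documented defaults:

```python
# Hypothetical usage sketch; the import layout is an assumption, not part of the commit.
from nsdb import findFastestMirror

# Both arguments keep their documented defaults (note the trailing slashes).
mirror = findFastestMirror(dump="20200401/", wiki="enwiki/")
print("Fastest mirror:", mirror)  # the url of the fastest mirror, per the docstring
```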

nsdb/mirrors.py

-71
This file was deleted.

nsdb/nsdb.py

100755 → 100644
+99-15
@@ -10,20 +10,23 @@
 Please run pip install -r requirements.txt before running this script.
 """
 
+import http.client
 import multiprocessing
 import os
 import re
 import subprocess
 import time
 import traceback
+import urllib
+import urllib.request
 from datetime import datetime
 from sys import argv
 from typing import List
-from urllib import request
+
+from tqdm import tqdm
 
 import Database
 import parse
-from mirrors import fastest
 from splitwiki import split
 
 
@@ -58,7 +61,7 @@ def createDumpsFile(listOfDumps: str, wiki: str = "enwiki", dump: str = "2020040
         pass
 
     url = mirror + wiki + "/" + dump
-    content = request.urlopen(url).read().decode("utf-8")
+    content = urllib.request.urlopen(url).read().decode("utf-8")
     dumps = re.findall('(?<=href="/).*pages-meta-history.*7z(?=")', content)
 
     with open(listOfDumps, "w") as file:
@@ -74,11 +77,90 @@ def countLines(file: str) -> int:
     return lines
 
 
+def findFastestMirror(dump: str = "20200401/", wiki: str = "enwiki/") -> str:
+    """Gets a list of the fastest mirrors, downloads a single file from each
+    and returns the fastest one.
+
+    Execution takes 5-10 seconds as a guideline
+
+    Returns
+    -------
+    fastestMirror: str - the url of the fastest mirror
+    """
+
+    # find a list of mirrors
+    url = "https://dumps.wikimedia.org/mirrors.html"
+
+    html = urllib.request.urlopen(url).read().decode("utf-8")
+
+    # https is always going to be slower than http for download but check in case mirror
+    # is only available over https
+    mirrors = re.findall('href="(https?:.*)"', html)
+    mirrorDownloadTime = []
+
+    # Add main site
+    mirrors.append("https://dumps.wikimedia.org/")
+
+    firstfile = "enwiki-20200401-pages-meta-history5.xml-p564843p565313.7z"
+    print("Finding fastest mirror")
+    for index, mirror in enumerate(tqdm(mirrors, unit=" mirror")):
+        url = mirror + wiki + dump + firstfile
+
+        tick = time.time()
+        try:
+            urllib.request.urlopen(url)
+
+            # add the time to download
+            mirrorDownloadTime.append(time.time() - tick)
+        except urllib.error.HTTPError as err:
+            if str(err.code)[0] in ["4", "5"]:
+                # try other url scheme
+                url = mirror + "dumps/" + wiki + dump + firstfile
+
+                tick = time.time()
+                try:
+                    urllib.request.urlopen(url)
+
+                    mirrorDownloadTime.append(time.time() - tick)
+                except urllib.error.HTTPError as err:
+                    if str(err.code)[0] in ["4", "5"]:
+                        mirrorDownloadTime.append(1000)
+                    else:
+                        raise
+            else:
+                raise
+
+    # return fastest mirror
+    _, index = min((val, index) for (index, val) in enumerate(mirrorDownloadTime))
+
+    # for i in range(len(mirrors)):
+    # print(mirrors[i], mirrorDownloadTime[i])
+    # print("Fastest mirror is " + mirrors[index])
+    if all(time == 1000 for time in mirrorDownloadTime):
+        raise RuntimeError("Dump " + dump + " is no longer hosted on any mirror")
+
+    return mirrors[index]
+
+
 def downloadFirstDump(
     dump: str, listOfDumps: str, archivesDir: str, dumpsDir: str
 ) -> str:
     """Downloads the first dump in dumps.txt if it is not already present
     in the dumps directory"""
+    # check internet connectivity
+    # source https://stackoverflow.com/a/29854274
+    conn = http.client.HTTPConnection("www.fast.com", timeout=5)
+    for _ in range(5):
+        try:
+            conn.request("HEAD", "/")
+        except:
+            conn.close()
+        else:
+            conn.close()
+            break
+    else:
+        print("Internet connection lost")
+        return ""
 
     with open(listOfDumps) as file:
         firstLine = file.readline().strip()
@@ -92,7 +174,7 @@ def downloadFirstDump(
         file.writelines(data)
 
     if not os.path.exists(dumpsDir + fileName[:-3]):
-        fastestMirror = fastest(dump)
+        fastestMirror = findFastestMirror(dump)
 
         subprocess.run(
             ["wget", "-nc", "-nv", "-P", archivesDir, fastestMirror + firstLine]
@@ -341,17 +423,19 @@ def main(
                 % (fileName, time.time() - tick)
             )
 
-            tick = time.time()
-            fileName = extractFile(fileName, archivesDir, dumpsDir)
-            print(
-                "--- Extracting %s took %s seconds ---" % (fileName, time.time() - tick)
-            )
-
-            splitter.apply_async(
-                splitFile,
-                (fileName, queue, dumpsDir, partitionsDir, numPartitions),
-                error_callback=splitError,
-            )
+            if fileName != "":
+                tick = time.time()
+                fileName = extractFile(fileName, archivesDir, dumpsDir)
+                print(
+                    "--- Extracting %s took %s seconds ---"
+                    % (fileName, time.time() - tick)
+                )
+
+                splitter.apply_async(
+                    splitFile,
+                    (fileName, queue, dumpsDir, partitionsDir, numPartitions),
+                    error_callback=splitError,
+                )
 
         numJobs = outstandingJobs()
         diskSpace = checkDiskSpace(dataDir)
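
Two details of the new connectivity check are easy to miss: Python's for/else (the else arm runs only if the loop finishes without hitting break), and the empty-string return value that the new `if fileName != "":` guard in main() skips over. Below is a self-contained sketch of the same retry pattern; the has_connection helper, host, and retry count are illustrative and not part of the commit:

```python
import http.client


def has_connection(host: str = "example.com", retries: int = 3) -> bool:
    """Sketch of the retry pattern used in downloadFirstDump: attempt a HEAD
    request a few times and report whether any attempt got through."""
    conn = http.client.HTTPConnection(host, timeout=5)
    online = False
    for _ in range(retries):
        try:
            conn.request("HEAD", "/")
        except OSError:
            conn.close()  # this attempt failed; request() reconnects on the next try
        else:
            conn.close()
            online = True
            break  # success: skips the for-loop's else clause below
    else:
        # Runs only when every attempt raised, i.e. the loop never hit break.
        print("all %d attempts failed" % retries)
    return online


if __name__ == "__main__":
    # Callers can treat False the way main() treats an empty fileName: skip the work.
    print("online" if has_connection() else "offline")
```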

programflow.png

-78.2 KB
