10
10
Please run pip install -r requirements.txt before running this script.
11
11
"""
12
12
13
+ import http .client
13
14
import multiprocessing
14
15
import os
15
16
import re
16
17
import subprocess
17
18
import time
18
19
import traceback
20
+ import urllib
21
+ import urllib .request
19
22
from datetime import datetime
20
23
from sys import argv
21
24
from typing import List
22
- from urllib import request
25
+
26
+ from tqdm import tqdm
23
27
24
28
import Database
25
29
import parse
26
- from mirrors import fastest
27
30
from splitwiki import split
28
31
29
32
@@ -58,7 +61,7 @@ def createDumpsFile(listOfDumps: str, wiki: str = "enwiki", dump: str = "2020040
58
61
pass
59
62
60
63
url = mirror + wiki + "/" + dump
61
- content = request .urlopen (url ).read ().decode ("utf-8" )
64
+ content = urllib . request .urlopen (url ).read ().decode ("utf-8" )
62
65
dumps = re .findall ('(?<=href="/).*pages-meta-history.*7z(?=")' , content )
63
66
64
67
with open (listOfDumps , "w" ) as file :
@@ -74,11 +77,90 @@ def countLines(file: str) -> int:
74
77
return lines
75
78
76
79
80
def findFastestMirror(dump: str = "20200401/", wiki: str = "enwiki/") -> str:
    """Return the URL of the fastest mirror hosting the given dump.

    Fetches the official Wikimedia mirror list, then times opening a
    connection to a single known archive file on each mirror (plus the
    main site) and returns the mirror that responded quickest.

    Execution takes 5-10 seconds as a guideline.

    Parameters
    ----------
    dump: str - dump date directory including trailing slash, e.g. "20200401/"
    wiki: str - wiki directory including trailing slash, e.g. "enwiki/"

    Returns
    -------
    fastestMirror: str - the url of the fastest mirror

    Raises
    ------
    RuntimeError - if no mirror hosts the requested dump
    """
    # Sentinel recorded for mirrors that do not host the dump (4xx/5xx).
    UNAVAILABLE = 1000.0

    def _timeOpen(url: str) -> float:
        """Seconds to open url; UNAVAILABLE on a 4xx/5xx HTTP error.

        Other errors (network failures, non-HTTP errors) propagate.
        The response is closed immediately - we only measure latency,
        we do not download the file body.
        """
        tick = time.time()
        try:
            # timeout prevents a single stalled mirror from hanging the
            # whole scan; "with" closes the socket instead of leaking it.
            with urllib.request.urlopen(url, timeout=30):
                pass
            return time.time() - tick
        except urllib.error.HTTPError as err:
            if str(err.code)[0] in ["4", "5"]:
                return UNAVAILABLE
            raise

    # find a list of mirrors
    url = "https://dumps.wikimedia.org/mirrors.html"
    with urllib.request.urlopen(url, timeout=30) as response:
        html = response.read().decode("utf-8")

    # https is always going to be slower than http for download but check in
    # case a mirror is only available over https
    mirrors = re.findall('href="(https?:.*)"', html)

    # Add main site as a fallback candidate
    mirrors.append("https://dumps.wikimedia.org/")

    # A small, known archive file used purely to measure response time
    firstfile = "enwiki-20200401-pages-meta-history5.xml-p564843p565313.7z"

    print("Finding fastest mirror")
    mirrorDownloadTime = []
    for mirror in tqdm(mirrors, unit=" mirror"):
        elapsed = _timeOpen(mirror + wiki + dump + firstfile)
        if elapsed == UNAVAILABLE:
            # Some mirrors nest the dumps under an extra "dumps/" prefix;
            # try that alternate url scheme before giving up on the mirror.
            elapsed = _timeOpen(mirror + "dumps/" + wiki + dump + firstfile)
        mirrorDownloadTime.append(elapsed)

    if all(t == UNAVAILABLE for t in mirrorDownloadTime):
        raise RuntimeError("Dump " + dump + " is no longer hosted on any mirror")

    # return fastest mirror
    fastestIndex = min(
        range(len(mirrorDownloadTime)), key=mirrorDownloadTime.__getitem__
    )
    return mirrors[fastestIndex]
143
+
144
+
77
145
def downloadFirstDump (
78
146
dump : str , listOfDumps : str , archivesDir : str , dumpsDir : str
79
147
) -> str :
80
148
"""Downloads the first dump in dumps.txt if it is not already present
81
149
in the dumps directory"""
150
+ # check internet connectivity
151
+ # source https://stackoverflow.com/a/29854274
152
+ conn = http .client .HTTPConnection ("www.fast.com" , timeout = 5 )
153
+ for _ in range (5 ):
154
+ try :
155
+ conn .request ("HEAD" , "/" )
156
+ except :
157
+ conn .close ()
158
+ else :
159
+ conn .close ()
160
+ break
161
+ else :
162
+ print ("Internet connection lost" )
163
+ return ""
82
164
83
165
with open (listOfDumps ) as file :
84
166
firstLine = file .readline ().strip ()
@@ -92,7 +174,7 @@ def downloadFirstDump(
92
174
file .writelines (data )
93
175
94
176
if not os .path .exists (dumpsDir + fileName [:- 3 ]):
95
- fastestMirror = fastest (dump )
177
+ fastestMirror = findFastestMirror (dump )
96
178
97
179
subprocess .run (
98
180
["wget" , "-nc" , "-nv" , "-P" , archivesDir , fastestMirror + firstLine ]
@@ -341,17 +423,19 @@ def main(
341
423
% (fileName , time .time () - tick )
342
424
)
343
425
344
- tick = time .time ()
345
- fileName = extractFile (fileName , archivesDir , dumpsDir )
346
- print (
347
- "--- Extracting %s took %s seconds ---" % (fileName , time .time () - tick )
348
- )
349
-
350
- splitter .apply_async (
351
- splitFile ,
352
- (fileName , queue , dumpsDir , partitionsDir , numPartitions ),
353
- error_callback = splitError ,
354
- )
426
+ if fileName != "" :
427
+ tick = time .time ()
428
+ fileName = extractFile (fileName , archivesDir , dumpsDir )
429
+ print (
430
+ "--- Extracting %s took %s seconds ---"
431
+ % (fileName , time .time () - tick )
432
+ )
433
+
434
+ splitter .apply_async (
435
+ splitFile ,
436
+ (fileName , queue , dumpsDir , partitionsDir , numPartitions ),
437
+ error_callback = splitError ,
438
+ )
355
439
356
440
numJobs = outstandingJobs ()
357
441
diskSpace = checkDiskSpace (dataDir )
0 commit comments