Skip to content

Commit de66b64

Browse files
committed
add types to parameters
1 parent 1f08d3c commit de66b64

File tree

3 files changed: 28 additions and 24 deletions.

nsdb/nsdb.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,15 @@ def createDumpsFile(listOfDumps: str, wiki: str, dump: str):
5353
subprocess.run(["./download.sh", "https://dumps.wikimedia.org/", wiki, dump])
5454

5555

56-
def countLines(file) -> int:
56+
def countLines(file: str) -> int:
5757
"""Returns the number of lines in a file using wc from bash"""
5858
wordCount = subprocess.check_output(["wc", "-l", file]).decode("utf-8")
5959
lines = int(wordCount.split(" ")[0])
6060

6161
return lines
6262

6363

64-
def downloadFirstDump(listOfDumps, archivesDir, dumpsDir) -> str:
64+
def downloadFirstDump(listOfDumps: str, archivesDir: str, dumpsDir: str) -> str:
6565
"""Downloads the first dump in dumps.txt if it is not already present
6666
in the dumps directory"""
6767

@@ -86,7 +86,7 @@ def downloadFirstDump(listOfDumps, archivesDir, dumpsDir) -> str:
8686
return fileName
8787

8888

89-
def extractFile(fileName: str, archivesDir, dumpsDir):
89+
def extractFile(fileName: str, archivesDir: str, dumpsDir: str):
9090
"""Unzip if not already extracted, delete if extracted
9191
9292
Execution takes 5-15 minutes as a guideline"""
@@ -99,7 +99,9 @@ def extractFile(fileName: str, archivesDir, dumpsDir):
9999
return fileName[:-3]
100100

101101

102-
def splitFile(fileName, queue, dumpsDir, partitionsDir, numPartitions):
102+
def splitFile(
103+
fileName: str, queue, dumpsDir: str, partitionsDir: str, numPartitions: int
104+
):
103105
"""Split a dump into a number of partitions"""
104106
database, cursor = Database.connect()
105107

@@ -116,7 +118,7 @@ def splitFile(fileName, queue, dumpsDir, partitionsDir, numPartitions):
116118
database.close()
117119

118120

119-
def checkDiskSpace(dataDir):
121+
def checkDiskSpace(dataDir: str) -> int:
120122
"""Returns the size of the data directory"""
121123
try:
122124
space = int(
@@ -188,7 +190,7 @@ def markLongRunningJobsAsError():
188190
database.close()
189191

190192

191-
def removeDoneJobs(partitionsDir):
193+
def removeDoneJobs(partitionsDir: str):
192194
"""Remove partitions that are completed"""
193195
query = "SELECT file_name FROM partition WHERE status = 'done'"
194196
database, cursor = Database.connect()
@@ -242,7 +244,7 @@ def restartJobs():
242244
database.close()
243245

244246

245-
def main(parallelID=0, numParallel=1, dataDir="/bigtemp/ckm8gz/"):
247+
def main(parallelID: str = 0, numParallel: int = 1, dataDir: str = "/bigtemp/ckm8gz/"):
246248
"""Download a list of dumps if it doesn't exist. If there are no dumps,
247249
download one and split it, then process the dump on multiple threads
248250

nsdb/parse.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import time
1212
import traceback
1313
from datetime import datetime
14-
from typing import Tuple
14+
from typing import Tuple, List
1515

1616
import mwreverts
1717
import mwxml
@@ -21,7 +21,7 @@
2121
import Database
2222

2323

24-
def multiprocess(partitionsDir, namespaces, queue, jobId):
24+
def multiprocess(partitionsDir: str, namespaces: List[int], queue, jobId: str):
2525
"""Wrapper around process to call parse in a multiprocessing pool"""
2626
while True:
2727
i = queue.get()
@@ -35,7 +35,7 @@ def multiprocess(partitionsDir, namespaces, queue, jobId):
3535
time.sleep(10)
3636

3737

38-
def getDump(partitionsDir, cursor):
38+
def getDump(partitionsDir: str, cursor):
3939
"""Returns the next dump to be parsed from the database
4040
4141
Parameters
@@ -179,12 +179,12 @@ def parseNonTargetNamespace(
179179
),
180180
)
181181

182-
183182
query = """UPDATE page (number_of_edits)
184183
VALUES (%s)
185184
WHERE title=%s;"""
186185
cursor.execute(query, (pageEdits, title))
187186

187+
188188
def parseTargetNamespace(page, title: str, namespace: str, cursor, parallel: str):
189189
"""Extracts features from each revision of a page into a database
190190
@@ -431,7 +431,7 @@ def getDiff(old: str, new: str, parallel: str) -> Tuple[str, str]:
431431
return added, deleted
432432

433433

434-
def checkReverted(detector, revision, cursor, undidRevision, target):
434+
def checkReverted(detector, revision, cursor, undidRevision, target: bool):
435435
"""Inserts reverted edits into the database for target namespace, otherwise
436436
returns the user that was reverted"""
437437
reverted = detector.process(
@@ -575,7 +575,9 @@ def containsVulgarity(string: str) -> bool:
575575

576576

577577
def parse(
578-
partitionsDir: str = "/bigtemp/ckm8gz/partitions/", namespaces=[1], parallel=""
578+
partitionsDir: str = "/bigtemp/ckm8gz/partitions/",
579+
namespaces: List[int] = [1],
580+
parallel: str = "",
579581
):
580582
"""Selects the next dump from the database, extracts the features and
581583
imports them into several database tables.

nsdb/splitwiki.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,21 @@
1111
from tqdm import trange
1212

1313

14-
def countLines(file) -> int:
14+
def countLines(file: str) -> int:
1515
"""Returns the estimated number of lines in a dump using wcle.sh"""
1616
print("counting lines: ", file)
1717
lines = int(subprocess.check_output(["./wcle.sh", file]))
1818

1919
return lines
2020

2121

22-
def addJobToDatabase(cursor, partitionName):
22+
def addJobToDatabase(cursor, partitionName: str):
2323
"""Inserts partition into the database"""
2424
query = "INSERT INTO partition (file_name) VALUES (%s)"
2525
cursor.execute(query, (partitionName,))
2626

2727

28-
def addJobToQueue(queue, jobId):
28+
def addJobToQueue(queue, jobId: str):
2929
"""Adds partition to the multiprocessing queue"""
3030
queue.put(jobId)
3131

@@ -41,11 +41,11 @@ def addJobToQueue(queue, jobId):
4141
# "-d", "--deletedump", default=False, is_flag=True,
4242
# )
4343
def split(
44-
number=10,
45-
inputFolder="/bigtemp/ckm8gz/dumps/",
46-
outputFolder="/bigtemp/ckm8gz/partitions/",
47-
deleteDump=True,
48-
fileName="",
44+
number: int = 10,
45+
inputFolder: str = "/bigtemp/ckm8gz/dumps/",
46+
outputFolder: str = "/bigtemp/ckm8gz/partitions/",
47+
deleteDump: bool = True,
48+
fileName: str = "",
4949
queue=0,
5050
cursor=0,
5151
):
@@ -126,10 +126,10 @@ def split(
126126
i = 0
127127
inPage = False
128128
moreFile = True
129-
130-
t = tqdm(total=number, desc=fileName, unit=" partition") # Initialise
129+
130+
t = tqdm(total=number, desc=fileName, unit=" partition") # Initialise
131131
index = 0
132-
132+
133133
with open(file) as inFile:
134134
while True:
135135
if not moreFile:

Comments: 0 commit comments