Skip to content

Commit 1f08d3c

Browse files
committed
add counting of number of page edits
1 parent c99538b commit 1f08d3c

File tree

2 files changed

+19
-0
lines changed

2 files changed

+19
-0
lines changed

nsdb/parse.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,12 +96,16 @@ def parseNonTargetNamespace(
9696

9797
detector = mwreverts.Detector()
9898

99+
pageEdits = 0
100+
99101
for revision in tqdm.tqdm(
100102
page, desc=title, unit=" edits", smoothing=0, disable=parallel
101103
):
102104
if not revision.user:
103105
continue
104106

107+
pageEdits = pageEdits + 1
108+
105109
# Check if not None as there is a user 0, Larry Sanger
106110
if revision.user.id is not None:
107111
userId = revision.user.id
@@ -175,6 +179,11 @@ def parseNonTargetNamespace(
175179
),
176180
)
177181

182+
183+
# Store the per-page edit count. UPDATE requires "SET column = value";
# the "(column) VALUES (...)" form is INSERT-only and is a syntax error here.
query = """UPDATE page
184+
SET number_of_edits = %s
185+
WHERE title=%s;"""
186+
cursor.execute(query, (pageEdits, title))
178187

179188
def parseTargetNamespace(page, title: str, namespace: str, cursor, parallel: str):
180189
"""Extracts features from each revision of a page into a database
@@ -200,13 +209,17 @@ def parseTargetNamespace(page, title: str, namespace: str, cursor, parallel: str
200209

201210
detector = mwreverts.Detector()
202211

212+
pageEdits = 0
213+
203214
## Extract page features from each revision
204215
for revision in tqdm.tqdm(
205216
page, desc=title, unit=" edits", smoothing=0, disable=parallel
206217
):
207218
if not revision.user:
208219
continue
209220

221+
pageEdits = pageEdits + 1
222+
210223
# Check if not None as there is a user 0, Larry Sanger
211224
if revision.user.id is not None:
212225
userId = revision.user.id
@@ -366,6 +379,11 @@ def parseTargetNamespace(page, title: str, namespace: str, cursor, parallel: str
366379
## Insert page features into database
367380
cursor.execute(query, editTuple)
368381

382+
# Store the per-page edit count. UPDATE requires "SET column = value";
# the "(column) VALUES (...)" form is INSERT-only and is a syntax error here.
query = """UPDATE page
383+
SET number_of_edits = %s
384+
WHERE title=%s;"""
385+
cursor.execute(query, (pageEdits, title))
386+
369387

370388
def getDiff(old: str, new: str, parallel: str) -> Tuple[str, str]:
371389
"""Returns the diff between two edits using wdiff

sql/schema.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ CREATE TABLE page (
4242
namespace smallint NOT NULL,
4343
title varchar(255) binary NOT NULL,
4444
file_name varchar(85) NOT NULL,
45+
number_of_edits int unsigned NOT NULL DEFAULT '0',
4546
PRIMARY KEY (page_id)
4647
);
4748

0 commit comments

Comments
 (0)