Skip to content

Commit 2a9c207

Browse files
authored
Merge pull request #269 from opensanctions/wd-dates-filter
Handle imprecise historical dates in Wikidata better
2 parents 3b0e1b1 + 6b764be commit 2a9c207

File tree

5 files changed

+23
-5
lines changed

5 files changed

+23
-5
lines changed

nomenklatura/wikidata/props.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@
6464
# "Q392651": "role.spy",
6565
"Q14886050": "crime.terror",
6666
"Q16533": "role.judge",
67-
"Q17276321": "role.pep", # member of the state duma
6867
"Q189290": "mil", # military officer
6968
"Q47064": "mil", # military personnel
7069
}

nomenklatura/wikidata/value.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414

1515
log = logging.getLogger(__name__)
16+
MIN_DATE = "1001"
1617
PRECISION = {
1718
11: Precision.DAY,
1819
10: Precision.MONTH,
@@ -31,6 +32,13 @@ def snak_value_to_string(
3132
return LangText(None)
3233
time = raw_time.strip("+")
3334
prec_id = cast(int, value.get("precision"))
35+
# cf. https://www.wikidata.org/wiki/Help:Dates#Precision
36+
if prec_id < 9:
37+
if time < "1900":
38+
# Hacky, but set all old dates to the minimum date so persons
39+
# with historical birth dates are filtered out.
40+
return LangText(MIN_DATE, original=raw_time)
41+
return LangText(None, original=raw_time)
3442
prec = PRECISION.get(prec_id, Precision.DAY)
3543
time = time[: prec.value]
3644

@@ -40,7 +48,7 @@ def snak_value_to_string(
4048
time = time[:4]
4149

4250
# Date limit in FtM. These will be removed by the death filter:
43-
time = max("1001", time)
51+
time = max(MIN_DATE, time)
4452
return LangText(time, original=raw_time)
4553
elif value_type == "wikibase-entityid":
4654
qid = value.get("id")

tests/conftest.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from sqlalchemy import MetaData
55
import yaml
66
import pytest
7+
from urllib.error import HTTPError
8+
from urllib.request import Request, urlopen
79
from pathlib import Path
810
from tempfile import mkdtemp
911
from normality import slugify_text
@@ -18,6 +20,10 @@
1820
from nomenklatura.cache import Cache
1921

2022
FIXTURES_PATH = Path(__file__).parent.joinpath("fixtures/")
23+
FIXTURE_FETCH_HEADERS = {
24+
"User-Agent": "followthemoney.tech/nomenklatura (https://github.com/opensanctions/nomenklatura)",
25+
"Accept": "application/json",
26+
}
2127
settings.TESTING = True
2228

2329

@@ -119,9 +125,12 @@ def wd_read_response(request, context):
119125
assert file_name is not None, "Invalid Wikidata URL: %s" % request.url
120126
path = FIXTURES_PATH / f"wikidata/{file_name}.json"
121127
if not path.exists():
122-
import urllib.request
123-
124-
data = json.load(urllib.request.urlopen(request.url))
128+
try:
129+
req = Request(request.url, headers=FIXTURE_FETCH_HEADERS)
130+
data = json.load(urlopen(req))
131+
except HTTPError as exc:
132+
print("URL", request.url, "failed:", exc, exc.read())
133+
raise
125134
for _, value in data["entities"].items():
126135
value.pop("sitelinks", None)
127136
for sect in ["labels", "aliases", "descriptions"]:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"entities": {"P22": {"type": "property", "datatype": "wikibase-item", "id": "P22", "labels": {"en": {"language": "en", "value": "father"}}}}, "success": 1}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"entities": {"P25": {"type": "property", "datatype": "wikibase-item", "id": "P25", "labels": {"en": {"language": "en", "value": "mother"}}}}, "success": 1}

0 commit comments

Comments
 (0)