Skip to content

Commit 81c6617

Browse files
committed
Simplify i18n API
- Single `Language` class that takes a query and handles everything - `get_language()` and `get_language_or_none()` as the go-to calls to get it - kept `find_language_names()` and `is_valid_iso_639_3()`, but now reusing `Language`
1 parent 6d68086 commit 81c6617

File tree

2 files changed

+165
-146
lines changed

2 files changed

+165
-146
lines changed

Diff for: src/zimscraperlib/i18n.py

+154-137
Original file line numberDiff line numberDiff line change
@@ -9,171 +9,188 @@
99
ISO_LEVELS = ["1", "2b", "2t", "3", "5"]
1010

1111

12-
class NotFoundError(ValueError):
13-
pass
12+
class NotFoundError(ValueError): ...
13+
14+
15+
class Language:
16+
"""Qualified ISO-639-3 language"""
17+
18+
def __init__(self, query: str):
19+
"""Instantiate a valid ISO-639-3 Language from query
20+
21+
params: either an ISO-639 code or a locale or an english language name"""
22+
self.iso_639_1: str | None = None
23+
self.iso_639_2b: str | None = None
24+
self.iso_639_2t: str | None = None
25+
self.iso_639_3: str | None = None
26+
self.iso_639_5: str | None = None
27+
self.english: str | None = None
28+
self.native: str | None = None
29+
self.iso_types: list[str] = []
30+
self.query: str = query
31+
self.native_query: str | None = None
32+
self.querytype: str | None = None
33+
34+
def get_adjusted_query(query: str) -> tuple[str, str, str]:
35+
# possibly an iso-639 code
36+
if query.isalpha() and (2 <= len(query) <= 3): # noqa: PLR2004
37+
adjusted_query = query
38+
native_query = query
39+
query_type = "purecode"
40+
# possibly a locale
41+
elif all(x.isalpha() or x in ("-", "_") for x in query) and (
42+
query.count("_") + query.count("-") == 1
43+
):
44+
adjusted_query = re.split("-|_", query)[0]
45+
native_query = query.replace("-", "_")
46+
query_type = "locale"
47+
# possibly an ISO language name
48+
else:
49+
adjusted_query = query.title().replace("Languages", "languages")
50+
native_query = query
51+
query_type = "languagename"
52+
return adjusted_query, native_query, query_type
53+
54+
adjusted_query, self.native_query, self.querytype = get_adjusted_query(query)
1455

15-
16-
class Lang:
17-
18-
def __init__(self, requested_lang: str, iso639_lang_obj: iso639.Lang):
19-
self.iso_639_1 = iso639_lang_obj.pt1 or None
20-
self.iso_639_2b = iso639_lang_obj.pt2b or None
21-
self.iso_639_2t = iso639_lang_obj.pt2t or None
22-
self.iso_639_3 = iso639_lang_obj.pt3 or None
23-
self.iso_639_5 = iso639_lang_obj.pt5 or None
24-
self.english = iso639_lang_obj.name or None
56+
try:
57+
isolang = iso639.Lang(adjusted_query)
58+
except (
59+
iso639.exceptions.InvalidLanguageValue,
60+
iso639.exceptions.DeprecatedLanguageValue,
61+
) as exc:
62+
raise NotFoundError("Not a valid iso language name/code") from exc
63+
64+
parts_keys_map = {
65+
"iso_639_1": "pt1",
66+
"iso_639_2b": "pt2b",
67+
"iso_639_2t": "pt2t",
68+
"iso_639_3": "pt3",
69+
"iso_639_5": "pt5",
70+
"english": "name",
71+
}
72+
73+
self.iso_639_1 = isolang.pt1 or None
74+
self.iso_639_2b = isolang.pt2b or None
75+
self.iso_639_2t = isolang.pt2t or None
76+
self.iso_639_3 = isolang.pt3 or None
77+
self.iso_639_5 = isolang.pt5 or None
78+
self.english = isolang.name or None
2579
self.iso_types = [
2680
part_level
2781
for iso_level, part_level in [
2882
(f"pt{level}", f"part{level}") for level in ISO_LEVELS
2983
]
3084
+ [("name", "name")]
31-
if getattr(iso639_lang_obj, iso_level).lower() == requested_lang.lower()
85+
if getattr(isolang, iso_level).lower() == adjusted_query.lower()
3286
]
3387

34-
35-
class LangAndDetails:
36-
def __init__(
37-
self, lang: Lang, english_name: str, native: str, querytype: str, query: str
38-
):
39-
self.iso_639_1 = lang.iso_639_1
40-
self.iso_639_2b = lang.iso_639_2b
41-
self.iso_639_2t = lang.iso_639_2t
42-
self.iso_639_3 = lang.iso_639_3
43-
self.iso_639_5 = lang.iso_639_5
44-
self.iso_types = lang.iso_types
45-
self.english = english_name
46-
self.native = native
47-
self.querytype = querytype
48-
self.query = query
49-
50-
def __eq__(self, value: object) -> bool:
51-
if not isinstance(value, LangAndDetails):
52-
return False
53-
54-
return (
55-
self.iso_639_1 == value.iso_639_1
56-
and self.iso_639_2b == value.iso_639_2b
57-
and self.iso_639_2t == value.iso_639_2t
58-
and self.iso_639_3 == value.iso_639_3
59-
and self.iso_639_5 == value.iso_639_5
60-
and self.english == value.english
61-
and self.native == value.native
62-
)
63-
64-
def __repr__(self) -> str:
65-
return (
66-
f"iso_639_1:{self.iso_639_1}, iso_639_2b:{self.iso_639_2b}, "
67-
f"iso_639_2t:{self.iso_639_2t}, iso_639_3:{self.iso_639_3}, "
68-
f"iso_639_5:{self.iso_639_5}, iso_639_5:{self.english}, "
69-
f"iso_639_5:{self.native}"
70-
)
71-
72-
73-
def get_iso_lang_data(lang: str) -> tuple[Lang, Lang | None]:
74-
"""ISO-639-x languages details for lang. Raises NotFoundError
75-
76-
Returns a tuple (main_language, macro_language | None)"""
77-
78-
try:
79-
isolang = iso639.Lang(lang)
80-
except (
81-
iso639.exceptions.InvalidLanguageValue,
82-
iso639.exceptions.DeprecatedLanguageValue,
83-
) as exc:
84-
raise NotFoundError("Not a valid iso language name/code") from exc
85-
86-
ourlang = Lang(lang, isolang)
87-
88-
macro = isolang.macro()
89-
90-
return (ourlang, get_iso_lang_data(macro.name)[0] if macro else None)
91-
92-
93-
def find_language_names(
94-
query: str, lang_data: Lang | LangAndDetails | None = None
95-
) -> tuple[str, str]:
96-
"""(native, english) language names for lang with help from lang_data
97-
98-
Falls back to English name if available or query if not"""
99-
if lang_data is None:
100-
lang_data = get_language_details(query, failsafe=True)
101-
if not lang_data:
102-
return query, query
103-
104-
try:
105-
query_locale = babel.Locale.parse(query)
106-
if native_display_name := query_locale.get_display_name():
107-
if english_display_name := query_locale.get_display_name("en"):
108-
return native_display_name, english_display_name
109-
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
110-
pass
111-
112-
# ISO code lookup order matters (most qualified first)!
113-
for iso_level in [f"iso_639_{level}" for level in reversed(ISO_LEVELS)]:
88+
# update if language has a macro
89+
if isolang.macro():
90+
for iso_level in [f"iso_639_{level}" for level in ISO_LEVELS]:
91+
if not getattr(self, iso_level):
92+
setattr(
93+
self,
94+
iso_level,
95+
# we'll get the pt attr for each iso_xxx
96+
getattr(isolang.macro(), parts_keys_map[iso_level], None)
97+
# we want None if value is empty
98+
or None,
99+
)
100+
101+
self.native, self.english = self._get_names_from(self.native_query)
102+
103+
def _get_names_from(self, query: str) -> tuple[str, str]:
104+
"""logic to find language names from babel and fallback"""
114105
try:
115-
query_locale = babel.Locale.parse(getattr(lang_data, iso_level))
106+
query_locale = babel.Locale.parse(query)
116107
if native_display_name := query_locale.get_display_name():
117108
if english_display_name := query_locale.get_display_name("en"):
118109
return native_display_name, english_display_name
119110
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
120111
pass
121-
default = lang_data.english or query
122-
return default, default
123112

113+
# ISO code lookup order matters (most qualified first)!
114+
for iso_level in [f"iso_639_{level}" for level in reversed(ISO_LEVELS)]:
115+
try:
116+
query_locale = babel.Locale.parse(getattr(self, iso_level))
117+
if native_display_name := query_locale.get_display_name():
118+
if english_display_name := query_locale.get_display_name("en"):
119+
return native_display_name, english_display_name
120+
except (
121+
babel.UnknownLocaleError,
122+
TypeError,
123+
ValueError,
124+
AttributeError,
125+
):
126+
pass
127+
default = self.english or query
128+
return default, default
129+
130+
def todict(self) -> dict[str, str | None | list[str]]:
131+
return {
132+
key.replace("_", "-") if key.startswith("iso") else key: getattr(
133+
self, key, None
134+
)
135+
for key in [
136+
"iso_639_1",
137+
"iso_639_2b",
138+
"iso_639_2t",
139+
"iso_639_3",
140+
"iso_639_5",
141+
"english",
142+
"iso_types",
143+
"native",
144+
"querytype",
145+
"query",
146+
]
147+
}
124148

125-
def update_with_macro(lang_data: Lang, macro_data: Lang | None):
126-
"""update empty keys from lang_data with ones of macro_data"""
127-
if not macro_data:
128-
return lang_data
149+
def __repr__(self) -> str:
150+
data_repr = ", ".join(
151+
f'{key.replace("-", "_")}="{value}"' for key, value in self.todict().items()
152+
)
153+
return f"{type(self).__name__}({data_repr})"
129154

130-
for iso_level in [f"iso_639_{level}" for level in ISO_LEVELS]:
131-
if not getattr(lang_data, iso_level):
132-
setattr(lang_data, iso_level, getattr(macro_data, iso_level))
155+
def __str__(self) -> str:
156+
return f"{self.iso_639_3}: {self.english}"
133157

134-
return lang_data
158+
def __eq__(self, value: object) -> bool:
159+
return (
160+
self.iso_639_1 == getattr(value, "iso_639_1", None)
161+
and self.iso_639_2b == getattr(value, "iso_639_2b", None)
162+
and self.iso_639_2t == getattr(value, "iso_639_2t", None)
163+
and self.iso_639_3 == getattr(value, "iso_639_3", None)
164+
and self.iso_639_5 == getattr(value, "iso_639_5", None)
165+
and self.english == getattr(value, "english", None)
166+
and self.native == getattr(value, "native", None)
167+
)
135168

136169

137-
def get_language_details(
138-
query: str, failsafe: bool | None = False # noqa: FBT002
139-
) -> LangAndDetails | None:
140-
"""language details dict from query.
170+
def find_language_names(query: str) -> tuple[str, str]:
171+
"""(native, english) language names for query"""
172+
try:
173+
lang = Language(query)
174+
except NotFoundError:
175+
return query, query
176+
# should be qualified but "None" is as valid as anything if not
177+
return str(lang.native), str(lang.english)
141178

142-
When query fails, either raises NotFoundError or return None, based on failsafe
143179

144-
"""
180+
def get_language(lang_code: str) -> Language:
181+
"""Language from lang_code"""
182+
return Language(lang_code)
145183

146-
if query.isalpha() and (2 <= len(query) <= 3): # noqa: PLR2004
147-
# possibility of iso-639 code
148-
adjusted_query = query
149-
native_query = query
150-
query_type = "purecode"
151-
elif all(x.isalpha() or x in ("-", "_") for x in query) and (
152-
query.count("_") + query.count("-") == 1
153-
):
154-
# possibility of locale
155-
adjusted_query = re.split("-|_", query)[0]
156-
native_query = query.replace("-", "_")
157-
query_type = "locale"
158-
else:
159-
# possibility of iso language name
160-
adjusted_query = query.title().replace("Languages", "languages")
161-
native_query = query
162-
query_type = "languagename"
163184

185+
def get_language_or_none(lang_code: str) -> Language | None:
186+
"""Language from lang_code or None if not found"""
164187
try:
165-
lang_data, macro_data = get_iso_lang_data(adjusted_query)
166-
except NotFoundError as exc:
167-
if failsafe:
168-
return None
169-
raise exc
170-
171-
iso_data = update_with_macro(lang_data, macro_data)
172-
native_name, english_name = find_language_names(native_query, iso_data)
173-
return LangAndDetails(iso_data, english_name, native_name, query_type, query)
188+
return get_language(lang_code)
189+
except NotFoundError:
190+
return None
174191

175192

176193
def is_valid_iso_639_3(code: str) -> bool:
177194
"""whether code is a valid ISO-639-3 code"""
178-
lang = get_language_details(code, failsafe=True)
195+
lang = get_language_or_none(code)
179196
return lang is not None and lang.iso_639_3 == code

Diff for: tests/i18n/test_i18n.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
import pytest
55

66
from zimscraperlib.i18n import (
7+
Language,
78
NotFoundError,
89
find_language_names,
9-
get_language_details,
10+
get_language,
11+
get_language_or_none,
1012
)
1113

1214

@@ -168,11 +170,11 @@
168170
)
169171
def test_lang_details(query: str, expected: dict[str, Any] | None):
170172
if expected is None:
171-
assert get_language_details(query, failsafe=True) == expected
173+
assert get_language_or_none(query) == expected
172174
with pytest.raises(NotFoundError):
173-
get_language_details(query)
175+
get_language(query)
174176
else:
175-
result = get_language_details(query)
177+
result = get_language_or_none(query)
176178
assert result
177179
assert result.iso_639_1 == expected.get("iso-639-1")
178180
assert result.iso_639_2b == expected.get("iso-639-2b")
@@ -236,7 +238,7 @@ def mock_display_name(lang: str | None = None) -> str | None:
236238
],
237239
)
238240
def test_lang_details_equality(query_left: str, query_right: str):
239-
assert get_language_details(query_left) == get_language_details(query_right)
241+
assert Language(query_left) == Language(query_right)
240242

241243

242244
@pytest.mark.parametrize(
@@ -252,9 +254,9 @@ def test_lang_details_equality(query_left: str, query_right: str):
252254
],
253255
)
254256
def test_lang_details_inequality_with_patch(patch_attribute: str):
255-
lang_and_details_patched = get_language_details("arq")
257+
lang_and_details_patched = get_language("arq")
256258
setattr(lang_and_details_patched, patch_attribute, "foo")
257-
assert get_language_details("arq") != lang_and_details_patched
259+
assert get_language("arq") != lang_and_details_patched
258260

259261

260262
@pytest.mark.parametrize(
@@ -265,8 +267,8 @@ def test_lang_details_inequality_with_patch(patch_attribute: str):
265267
],
266268
)
267269
def test_lang_details_inequality(query_left: str, query_right: str):
268-
assert get_language_details(query_left) != get_language_details(query_right)
270+
assert get_language(query_left) != get_language(query_right)
269271

270272

271273
def test_lang_details_inequality_objects():
272-
assert get_language_details("ara") != "ara"
274+
assert get_language("ara") != "ara"

0 commit comments

Comments
 (0)