Skip to content

Commit b7b5e46

Browse files
authored
Merge pull request #229 from openzim/i18n_class
Move to real classes for i18n classes for proper typing in strict mode
2 parents f68d568 + 67844f4 commit b7b5e46

File tree

2 files changed

+359
-242
lines changed

2 files changed

+359
-242
lines changed

Diff for: src/zimscraperlib/i18n.py

+173-184
Original file line numberDiff line numberDiff line change
@@ -1,207 +1,196 @@
1-
#!/usr/bin/env python3
2-
# vim: ai ts=4 sts=4 et sw=4 nu
3-
41
from __future__ import annotations
52

63
import re
74

85
import babel
9-
import iso639
10-
import iso639.exceptions
6+
import iso639 # pyright: ignore[reportMissingTypeStubs]
7+
import iso639.exceptions # pyright: ignore[reportMissingTypeStubs]
118

129
ISO_LEVELS = ["1", "2b", "2t", "3", "5"]
1310

1411

15-
class NotFoundError(ValueError):
16-
pass
17-
18-
19-
class Lang(dict):
20-
21-
@property
22-
def iso_639_1(self) -> str | None:
23-
"""ISO-639-1 language code"""
24-
return self["iso-639-1"]
25-
26-
@property
27-
def iso_639_2b(self) -> str | None:
28-
"""ISO-639-2b language code"""
29-
return self["iso-639-2b"]
30-
31-
@property
32-
def iso_639_2t(self) -> str | None:
33-
"""ISO-639-2t language code"""
34-
return self["iso-639-2t"]
35-
36-
@property
37-
def iso_639_3(self) -> str | None:
38-
"""ISO-639-3 language code"""
39-
return self["iso-639-3"]
40-
41-
@property
42-
def iso_639_5(self) -> str | None:
43-
"""ISO-639-5 language code"""
44-
return self["iso-639-5"]
45-
46-
@property
47-
def english(self) -> str:
48-
"""language name in English"""
49-
return self["english"]
50-
51-
@property
52-
def native(self) -> str:
53-
"""language name in native language"""
54-
return self["native"]
55-
56-
@property
57-
def iso_types(self) -> list[str]:
58-
"""list of supported iso types"""
59-
return self["iso_types"]
60-
61-
@property
62-
def query(self) -> str:
63-
"""Query issued for these language details"""
64-
return self["query"]
65-
66-
@property
67-
def querytype(self) -> str:
68-
"""Type of query issued to retrieve language details"""
69-
return self["querytype"]
70-
71-
72-
def get_iso_lang_data(lang: str) -> tuple[Lang, Lang | None]:
73-
"""ISO-639-x languages details for lang. Raises NotFoundError
12+
class NotFoundError(ValueError): ...
13+
14+
15+
class Language:
16+
"""Qualified ISO-639-3 language"""
17+
18+
def __init__(self, query: str):
19+
"""Instantiate a valid ISO-639-3 Language from query
20+
21+
params: either an ISO-639 code or a locale or an english language name"""
22+
self.iso_639_1: str | None = None
23+
self.iso_639_2b: str | None = None
24+
self.iso_639_2t: str | None = None
25+
self.iso_639_3: str | None = None
26+
self.iso_639_5: str | None = None
27+
self.english: str | None = None
28+
self.native: str | None = None
29+
self.iso_types: list[str] = []
30+
self.query: str = query
31+
self.native_query: str | None = None
32+
self.querytype: str | None = None
33+
34+
def get_adjusted_query(query: str) -> tuple[str, str, str]:
35+
# possibly an iso-639 code
36+
if query.isalpha() and (2 <= len(query) <= 3): # noqa: PLR2004
37+
adjusted_query = query
38+
native_query = query
39+
query_type = "purecode"
40+
# possibly a locale
41+
elif all(x.isalpha() or x in ("-", "_") for x in query) and (
42+
query.count("_") + query.count("-") == 1
43+
):
44+
adjusted_query = re.split("-|_", query)[0]
45+
native_query = query.replace("-", "_")
46+
query_type = "locale"
47+
# possibly an ISO language name
48+
else:
49+
adjusted_query = query.title().replace("Languages", "languages")
50+
native_query = query
51+
query_type = "languagename"
52+
return adjusted_query, native_query, query_type
53+
54+
adjusted_query, self.native_query, self.querytype = get_adjusted_query(query)
7455

75-
Returns a tuple (main_language, macro_language | None)"""
76-
77-
iso_types = []
78-
79-
try:
80-
isolang = iso639.Lang(lang)
81-
except (
82-
iso639.exceptions.InvalidLanguageValue,
83-
iso639.exceptions.DeprecatedLanguageValue,
84-
) as exc:
85-
raise NotFoundError("Not a valid iso language name/code") from exc
86-
87-
def replace_types(new_type: str) -> str:
88-
# convert new iso_types from iso639-lang Pypi package to old iso_types from
89-
# iso-639 package, since we were returning these values for a long time
90-
if new_type == "pt1":
91-
return "part1"
92-
elif new_type == "pt2b":
93-
return "part2b"
94-
elif new_type == "pt2t":
95-
return "part2t"
96-
elif new_type == "pt3":
97-
return "part3"
98-
elif new_type == "pt5":
99-
return "part5"
100-
return new_type
101-
102-
for code_type in [f"pt{lang_}" for lang_ in ISO_LEVELS] + ["name"]:
103-
# the `if` condition below is a bit hackish but it is the only way to know
104-
# if the passed value is matching a code type or not with new python-i639
105-
# library and we do not expect weird things to happen here
106-
if str(getattr(isolang, code_type)).lower() == lang.lower():
107-
iso_types.append(replace_types(code_type))
108-
109-
lang_data = Lang(
110-
**{f"iso-639-{lang_}": getattr(isolang, f"pt{lang_}") for lang_ in ISO_LEVELS}
111-
)
112-
lang_data.update({"english": isolang.name, "iso_types": iso_types})
113-
114-
# first item in the returned tuple
115-
macro = isolang.macro()
116-
return (lang_data, get_iso_lang_data(macro.name)[0] if macro else None)
117-
118-
119-
def find_language_names(query: str, lang_data: Lang | None = None) -> tuple[str, str]:
120-
"""(native, english) language names for lang with help from lang_data
121-
122-
Falls back to English name if available or query if not"""
123-
if lang_data is None:
124-
lang_data = get_language_details(query, failsafe=True)
125-
if not lang_data:
126-
return query, query
56+
try:
57+
isolang = iso639.Lang(adjusted_query)
58+
except (
59+
iso639.exceptions.InvalidLanguageValue,
60+
iso639.exceptions.DeprecatedLanguageValue,
61+
) as exc:
62+
raise NotFoundError("Not a valid iso language name/code") from exc
63+
64+
parts_keys_map = {
65+
"iso_639_1": "pt1",
66+
"iso_639_2b": "pt2b",
67+
"iso_639_2t": "pt2t",
68+
"iso_639_3": "pt3",
69+
"iso_639_5": "pt5",
70+
"english": "name",
71+
}
12772

128-
try:
129-
query_locale = babel.Locale.parse(query)
130-
if native_display_name := query_locale.get_display_name():
131-
if english_display_name := query_locale.get_display_name("en"):
132-
return native_display_name, english_display_name
133-
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
134-
pass
135-
136-
# ISO code lookup order matters (most qualified first)!
137-
for iso_level in [f"iso-639-{lang_}" for lang_ in reversed(ISO_LEVELS)]:
73+
self.iso_639_1 = isolang.pt1 or None
74+
self.iso_639_2b = isolang.pt2b or None
75+
self.iso_639_2t = isolang.pt2t or None
76+
self.iso_639_3 = isolang.pt3 or None
77+
self.iso_639_5 = isolang.pt5 or None
78+
self.english = isolang.name or None
79+
self.iso_types = [
80+
part_level
81+
for iso_level, part_level in [
82+
(f"pt{level}", f"part{level}") for level in ISO_LEVELS
83+
]
84+
+ [("name", "name")]
85+
if getattr(isolang, iso_level).lower() == adjusted_query.lower()
86+
]
87+
88+
# update if language has a macro
89+
if isolang.macro():
90+
for iso_level in [f"iso_639_{level}" for level in ISO_LEVELS]:
91+
if not getattr(self, iso_level):
92+
setattr(
93+
self,
94+
iso_level,
95+
# we'll get the pt attr for each iso_xxx
96+
getattr(isolang.macro(), parts_keys_map[iso_level], None)
97+
# we want None if value is empty
98+
or None,
99+
)
100+
101+
self.native, self.english = self._get_names_from(self.native_query)
102+
103+
def _get_names_from(self, query: str) -> tuple[str, str]:
104+
"""logic to find language names from babel and fallback"""
138105
try:
139-
query_locale = babel.Locale.parse(lang_data.get(iso_level))
106+
query_locale = babel.Locale.parse(query)
140107
if native_display_name := query_locale.get_display_name():
141108
if english_display_name := query_locale.get_display_name("en"):
142109
return native_display_name, english_display_name
143110
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
144111
pass
145-
default = lang_data.get("english") or query
146-
return default, default
147-
148-
149-
def update_with_macro(lang_data: Lang, macro_data: Lang | None):
150-
"""update empty keys from lang_data with ones of macro_data"""
151-
if macro_data:
152-
for key, value in macro_data.items():
153-
if key in lang_data and not lang_data.get(key):
154-
lang_data[key] = value
155-
return lang_data
156-
157-
158-
def get_language_details(
159-
query: str, failsafe: bool | None = False # noqa: FBT002
160-
) -> Lang | None:
161-
"""language details dict from query.
162-
163-
When query fails, either raises NotFoundError or return None, based on failsafe
164-
165-
"""
166-
167-
if query.isalpha() and (2 <= len(query) <= 3): # noqa: PLR2004
168-
# possibility of iso-639 code
169-
adjusted_query = query
170-
native_query = query
171-
query_type = "purecode"
172-
elif all(x.isalpha() or x in ("-", "_") for x in query) and (
173-
query.count("_") + query.count("-") == 1
174-
):
175-
# possibility of locale
176-
adjusted_query = re.split("-|_", query)[0]
177-
native_query = query.replace("-", "_")
178-
query_type = "locale"
179-
else:
180-
# possibility of iso language name
181-
adjusted_query = query.title().replace("Languages", "languages")
182-
native_query = query
183-
query_type = "languagename"
184112

185-
try:
186-
lang_data, macro_data = get_iso_lang_data(adjusted_query)
187-
except NotFoundError as exc:
188-
if failsafe:
189-
return None
190-
raise exc
191-
192-
iso_data = update_with_macro(lang_data, macro_data)
193-
native_name, english_name = find_language_names(native_query, iso_data)
194-
iso_data.update(
195-
{
196-
"english": english_name,
197-
"native": native_name,
198-
"querytype": query_type,
199-
"query": query,
113+
# ISO code lookup order matters (most qualified first)!
114+
for iso_level in [f"iso_639_{level}" for level in reversed(ISO_LEVELS)]:
115+
try:
116+
query_locale = babel.Locale.parse(getattr(self, iso_level))
117+
if native_display_name := query_locale.get_display_name():
118+
if english_display_name := query_locale.get_display_name("en"):
119+
return native_display_name, english_display_name
120+
except (
121+
babel.UnknownLocaleError,
122+
TypeError,
123+
ValueError,
124+
AttributeError,
125+
):
126+
pass
127+
default = self.english or query
128+
return default, default
129+
130+
def todict(self) -> dict[str, str | None | list[str]]:
131+
return {
132+
key.replace("_", "-") if key.startswith("iso") else key: getattr(
133+
self, key, None
134+
)
135+
for key in [
136+
"iso_639_1",
137+
"iso_639_2b",
138+
"iso_639_2t",
139+
"iso_639_3",
140+
"iso_639_5",
141+
"english",
142+
"iso_types",
143+
"native",
144+
"querytype",
145+
"query",
146+
]
200147
}
201-
)
202-
return iso_data
148+
149+
def __repr__(self) -> str:
150+
data_repr = ", ".join(
151+
f'{key.replace("-", "_")}="{value}"' for key, value in self.todict().items()
152+
)
153+
return f"{type(self).__name__}({data_repr})"
154+
155+
def __str__(self) -> str:
156+
return f"{self.iso_639_3}: {self.english}"
157+
158+
def __eq__(self, value: object) -> bool:
159+
return (
160+
self.iso_639_1 == getattr(value, "iso_639_1", None)
161+
and self.iso_639_2b == getattr(value, "iso_639_2b", None)
162+
and self.iso_639_2t == getattr(value, "iso_639_2t", None)
163+
and self.iso_639_3 == getattr(value, "iso_639_3", None)
164+
and self.iso_639_5 == getattr(value, "iso_639_5", None)
165+
and self.english == getattr(value, "english", None)
166+
and self.native == getattr(value, "native", None)
167+
)
168+
169+
170+
def find_language_names(query: str) -> tuple[str, str]:
171+
"""(native, english) language names for query"""
172+
try:
173+
lang = Language(query)
174+
except NotFoundError:
175+
return query, query
176+
# should be qualified but "None" is as valid as anything if not
177+
return str(lang.native), str(lang.english)
178+
179+
180+
def get_language(lang_code: str) -> Language:
181+
"""Language from lang_code"""
182+
return Language(lang_code)
183+
184+
185+
def get_language_or_none(lang_code: str) -> Language | None:
186+
"""Language from lang_code or None if not found"""
187+
try:
188+
return get_language(lang_code)
189+
except NotFoundError:
190+
return None
203191

204192

205193
def is_valid_iso_639_3(code: str) -> bool:
206194
"""whether code is a valid ISO-639-3 code"""
207-
return (get_language_details(code, failsafe=True) or {}).get("iso-639-3") == code
195+
lang = get_language_or_none(code)
196+
return lang is not None and lang.iso_639_3 == code

0 commit comments

Comments
 (0)