Skip to content

Commit fff2c10

Browse files
authored
Merge pull request #178 from openzim/v4_breaking_changes
Fix typos, remaining code qa and typing issues resulting in breaking changes
2 parents 0feff56 + 2d2c547 commit fff2c10

35 files changed

+537
-321
lines changed

Diff for: CHANGELOG.md

+15
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1414
- New `indexing.IndexData` class to hold title, content and keywords to pass to libzim to index an item
1515
- Automatically index PDF documents content #167
1616
- Automatically set proper title on PDF documents #168
17+
- Expose new `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
18+
- Add `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
19+
- New `creator.Creator.convert_and_check_metadata` to convert metadata to bytes or str for known use cases and check proper type is passed to libzim
20+
21+
## Changed
22+
23+
- **BREAKING** Renamed `zimscraperlib.image.convertion` to `zimscraperlib.image.conversion` to fix typo
24+
- **BREAKING** Many changes in type hints to match the real underlying code
25+
- **BREAKING** Force all boolean arguments (and some other non-obvious parameters) to be keyword-only in function calls for clarity / disambiguation (see ruff rule FBT002)
26+
- Prefer to use `IO[bytes]` to `io.BytesIO` when possible since it is more generic
27+
- **BREAKING** `i18n.NotFound` renamed `i18n.NotFoundError`
28+
- **BREAKING** `types.get_mime_for_name` now returns `str | None`
29+
- **BREAKING** `creator.Creator.add_metadata` and `creator.Creator.validate_metadata` now only accepts `bytes | str` as value (it must have been converted before call)
30+
- **BREAKING** second argument of `creator.Creator.add_metadata` has been renamed to `value` instead of `content` to align with other methods
31+
- When a type issue arises in metadata checks, wrong value type is displayed in exception
1732

1833
### Fixed
1934

Diff for: src/zimscraperlib/__about__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "3.4.1-dev0"
1+
__version__ = "4.0.0-dev0"

Diff for: src/zimscraperlib/download.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33

44
from __future__ import annotations
55

6-
import io
76
import pathlib
87
import subprocess
98
from concurrent.futures import Future, ThreadPoolExecutor
10-
from typing import ClassVar
9+
from typing import IO, ClassVar
1110

1211
import requests
12+
import requests.adapters
13+
import requests.structures
14+
import urllib3.util
1315
import yt_dlp as youtube_dl
1416

1517
from zimscraperlib import logger
@@ -42,7 +44,8 @@ def download(
4244
self,
4345
url: str,
4446
options: dict | None,
45-
wait: bool | None = True, # noqa: FBT002
47+
*,
48+
wait: bool | None = True,
4649
) -> bool | Future:
4750
"""Downloads video using initialized executor.
4851
@@ -52,9 +55,7 @@ def download(
5255
5356
Returns download result of future (wait=False)"""
5457

55-
future = self.executor.submit(
56-
self._run_youtube_dl, url, options # pyright: ignore
57-
)
58+
future = self.executor.submit(self._run_youtube_dl, url, options or {})
5859
if not wait:
5960
return future
6061
if not future.exception():
@@ -143,8 +144,8 @@ def save_large_file(url: str, fpath: pathlib.Path) -> None:
143144

144145
def _get_retry_adapter(
145146
max_retries: int | None = 5,
146-
) -> requests.adapters.BaseAdapter: # pyright: ignore
147-
retries = requests.packages.urllib3.util.retry.Retry( # pyright: ignore
147+
) -> requests.adapters.BaseAdapter:
148+
retries = urllib3.util.retry.Retry(
148149
total=max_retries, # total number of retries
149150
connect=max_retries, # connection errors
150151
read=max_retries, # read errors
@@ -161,7 +162,7 @@ def _get_retry_adapter(
161162
], # force retry on the following codes
162163
)
163164

164-
return requests.adapters.HTTPAdapter(max_retries=retries) # pyright: ignore
165+
return requests.adapters.HTTPAdapter(max_retries=retries)
165166

166167

167168
def get_session(max_retries: int | None = 5) -> requests.Session:
@@ -174,14 +175,15 @@ def get_session(max_retries: int | None = 5) -> requests.Session:
174175
def stream_file(
175176
url: str,
176177
fpath: pathlib.Path | None = None,
177-
byte_stream: io.BytesIO | None = None,
178+
byte_stream: IO[bytes] | None = None,
178179
block_size: int | None = 1024,
179180
proxies: dict | None = None,
180-
only_first_block: bool | None = False, # noqa: FBT002
181181
max_retries: int | None = 5,
182182
headers: dict[str, str] | None = None,
183183
session: requests.Session | None = None,
184-
) -> tuple[int, requests.structures.CaseInsensitiveDict]: # pyright: ignore
184+
*,
185+
only_first_block: bool | None = False,
186+
) -> tuple[int, requests.structures.CaseInsensitiveDict]:
185187
"""Stream data from a URL to either a BytesIO object or a file
186188
Arguments -
187189
fpath - Path of the file where data is sent
@@ -212,12 +214,14 @@ def stream_file(
212214
total_downloaded = 0
213215
if fpath is not None:
214216
fp = open(fpath, "wb")
215-
else:
217+
elif (
218+
byte_stream is not None
219+
): # pragma: no branch (we use a precise condition to help type checker)
216220
fp = byte_stream
217221

218222
for data in resp.iter_content(block_size):
219223
total_downloaded += len(data)
220-
fp.write(data) # pyright: ignore
224+
fp.write(data)
221225

222226
# stop downloading/reading if we're just testing first block
223227
if only_first_block:
@@ -226,7 +230,7 @@ def stream_file(
226230
logger.debug(f"Downloaded {total_downloaded} bytes from {url}")
227231

228232
if fpath:
229-
fp.close() # pyright: ignore
233+
fp.close()
230234
else:
231-
fp.seek(0) # pyright: ignore
235+
fp.seek(0)
232236
return total_downloaded, resp.headers

Diff for: src/zimscraperlib/filesystem.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def get_file_mimetype(fpath: pathlib.Path) -> str:
3030
return get_content_mimetype(fh.read(2048))
3131

3232

33-
def get_content_mimetype(content: bytes) -> str:
33+
def get_content_mimetype(content: bytes | str) -> str:
3434
"""MIME Type of content retrieved from magic headers"""
3535

3636
try:

Diff for: src/zimscraperlib/html.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,22 @@
77
import pathlib
88
from typing import BinaryIO, TextIO
99

10-
from bs4 import BeautifulSoup
10+
from bs4 import BeautifulSoup, element
1111

1212
from zimscraperlib.types import ARTICLE_MIME
1313

1414

15-
def find_title_in(content: str | BinaryIO | TextIO, mime_type: str) -> str:
15+
def find_title_in(content: str | BinaryIO | TextIO, mime_type: str | None) -> str:
1616
"""Extracted title from HTML content
1717
1818
blank on failure to extract and non-HTML files"""
1919
if mime_type != ARTICLE_MIME:
2020
return ""
21-
try:
22-
return BeautifulSoup(content, "lxml").find("title").text # pyright: ignore
23-
except Exception:
24-
return ""
21+
title_tag = BeautifulSoup(content, "lxml").find("title")
22+
return title_tag.text if title_tag else ""
2523

2624

27-
def find_title_in_file(fpath: pathlib.Path, mime_type: str) -> str:
25+
def find_title_in_file(fpath: pathlib.Path, mime_type: str | None) -> str:
2826
"""Extracted title from an HTML file"""
2927
try:
3028
with open(fpath) as fh:
@@ -45,15 +43,17 @@ def find_language_in(content: str | BinaryIO | TextIO, mime_type: str) -> str:
4543
for key in keylist:
4644
node = soup.find(nodename)
4745
if node:
48-
if not node.has_attr(key): # pyright: ignore
46+
if not isinstance(node, element.Tag) or (
47+
isinstance(node, element.Tag) and not node.has_attr(key)
48+
):
4949
continue
5050
if (
5151
nodename == "meta"
52-
and not node.attrs.get("http-equiv", "").lower() # pyright: ignore
52+
and not node.attrs.get("http-equiv", "").lower()
5353
== "content-language"
5454
):
5555
continue
56-
return node.attrs[key] # pyright: ignore
56+
return node.attrs[key]
5757
return ""
5858

5959

Diff for: src/zimscraperlib/i18n.py

+12-16
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
ISO_LEVELS = ["1", "2b", "2t", "3", "5"]
1616

1717

18-
class NotFound(ValueError): # noqa: N818
18+
class NotFoundError(ValueError):
1919
pass
2020

2121

@@ -81,7 +81,7 @@ def get_iso_lang_data(lang: str) -> tuple[dict, dict | None]:
8181
iso639.exceptions.InvalidLanguageValue,
8282
iso639.exceptions.DeprecatedLanguageValue,
8383
) as exc:
84-
raise NotFound("Not a valid iso language name/code") from exc
84+
raise NotFoundError("Not a valid iso language name/code") from exc
8585

8686
def replace_types(new_type: str) -> str:
8787
# convert new iso_types from iso639-lang Pypi package to old iso_types from
@@ -118,34 +118,32 @@ def replace_types(new_type: str) -> str:
118118
return lang_data, None
119119

120120

121-
def find_language_names(query: str, lang_data: dict | None = None) -> tuple[str, str]:
121+
def find_language_names(
122+
query: str, lang_data: dict | None = None
123+
) -> tuple[str | None, str | None]:
122124
"""(native, english) language names for lang with help from language_details dict
123125
124126
Falls back to English name if available or query if not"""
125127
if lang_data is None:
126128
lang_data = get_language_details(query, failsafe=True) or {}
127129
try:
128130
query_locale = babel.Locale.parse(query)
129-
return query_locale.get_display_name(), query_locale.get_display_name(
130-
"en"
131-
) # pyright: ignore
131+
return query_locale.get_display_name(), query_locale.get_display_name("en")
132132
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
133133
pass
134134

135135
# ISO code lookup order matters (most qualified first)!
136136
for iso_level in [f"iso-639-{lang_}" for lang_ in reversed(ISO_LEVELS)]:
137137
try:
138138
query_locale = babel.Locale.parse(lang_data.get(iso_level))
139-
return query_locale.get_display_name(), query_locale.get_display_name(
140-
"en"
141-
) # pyright: ignore
139+
return query_locale.get_display_name(), query_locale.get_display_name("en")
142140
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
143141
pass
144142
default = lang_data.get("english", query)
145143
return default, default
146144

147145

148-
def update_with_macro(lang_data: dict, macro_data: dict):
146+
def update_with_macro(lang_data: dict, macro_data: dict | None):
149147
"""update empty keys from lang_data with ones of macro_data"""
150148
if macro_data:
151149
for key, value in macro_data.items():
@@ -154,9 +152,7 @@ def update_with_macro(lang_data: dict, macro_data: dict):
154152
return lang_data
155153

156154

157-
def get_language_details(
158-
query: str, failsafe: bool | None = False # noqa: FBT002
159-
) -> dict:
155+
def get_language_details(query: str, *, failsafe: bool | None = False) -> dict | None:
160156
"""language details dict from query.
161157
162158
Raises NotFound or return `und` language details if failsafe
@@ -192,12 +188,12 @@ def get_language_details(
192188

193189
try:
194190
lang_data, macro_data = get_iso_lang_data(adjusted_query)
195-
except NotFound as exc:
191+
except NotFoundError as exc:
196192
if failsafe:
197-
return None # pyright: ignore
193+
return None
198194
raise exc
199195

200-
iso_data = update_with_macro(lang_data, macro_data) # pyright: ignore
196+
iso_data = update_with_macro(lang_data, macro_data)
201197
native_name, english_name = find_language_names(native_query, iso_data)
202198
iso_data.update(
203199
{

Diff for: src/zimscraperlib/image/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# vim: ai ts=4 sts=4 et sw=4 nu
44

55
# flake8: noqa
6-
from .convertion import convert_image
6+
from .conversion import convert_image
77
from .optimization import optimize_image
88
from .probing import is_valid_image
99
from .transformation import resize_image

Diff for: src/zimscraperlib/image/convertion.py renamed to src/zimscraperlib/image/conversion.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33

44
from __future__ import annotations
55

6-
import io
76
import pathlib
7+
from typing import IO
88

9-
import PIL
9+
from PIL.Image import open as pilopen
1010

1111
from zimscraperlib.constants import ALPHA_NOT_SUPPORTED
1212
from zimscraperlib.image.probing import format_for
@@ -15,8 +15,8 @@
1515

1616

1717
def convert_image(
18-
src: pathlib.Path | io.BytesIO,
19-
dst: pathlib.Path | io.BytesIO,
18+
src: pathlib.Path | IO[bytes],
19+
dst: pathlib.Path | IO[bytes],
2020
**params: str,
2121
) -> None:
2222
"""convert an image file from one format to another
@@ -29,12 +29,12 @@ def convert_image(
2929
to RGB. ex: RGB, ARGB, CMYK (and other PIL colorspaces)"""
3030

3131
colorspace = params.get("colorspace") # requested colorspace
32-
fmt = (
33-
params.pop("fmt").upper() if "fmt" in params else None # pyright: ignore
34-
) # requested format
32+
fmt = params.pop("fmt").upper() if "fmt" in params else None # requested format
3533
if not fmt:
3634
fmt = format_for(dst)
37-
with PIL.Image.open(src) as image: # pyright: ignore
35+
if not fmt:
36+
raise ValueError("Impossible to guess destination image format")
37+
with pilopen(src) as image:
3838
if image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED or colorspace:
3939
image = image.convert(colorspace or "RGB") # noqa: PLW2901
4040
save_image(image, dst, fmt, **params)
@@ -45,13 +45,13 @@ def create_favicon(src: pathlib.Path, dst: pathlib.Path) -> None:
4545
if dst.suffix != ".ico":
4646
raise ValueError("favicon extension must be ICO")
4747

48-
img = PIL.Image.open(src) # pyright: ignore
48+
img = pilopen(src)
4949
w, h = img.size
5050
# resize image to square first
5151
if w != h:
5252
size = min([w, h])
5353
resized = dst.parent.joinpath(f"{src.stem}.tmp.{src.suffix}")
5454
resize_image(src, size, size, resized, "contain")
55-
img = PIL.Image.open(resized) # pyright: ignore
55+
img = pilopen(resized)
5656
# now convert to ICO
5757
save_image(img, dst, "ICO")

0 commit comments

Comments
 (0)