Merge pull request #202 from openzim/tests-reg

rgaudin · web-flow · commit 8b6cbac88eaf · 2024-10-11T12:05:09.000Z
Fix CI on main
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ requires-python = ">=3.8,<3.13"
 description = "Collection of python tools to re-use common code across scrapers"
 readme = "README.md"
 dependencies = [
-  "iso639-lang>=2.2.3,<3.0",
+  "iso639-lang>=2.4.0,<3.0",
   "requests>=2.25.1,<3.0",
   "colorthief==0.2.1",
   "python-resize-image>=1.1.19,<1.2",
@@ -83,7 +83,7 @@ features = ["dev"]
 features = ["scripts", "test"]
 
 [[tool.hatch.envs.test.matrix]]
-python = ["3.8", "3.9", "3.10", "3.11"]
+python = ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
 [tool.hatch.envs.test.scripts]
 run = "inv test --args '{args}'"
diff --git a/src/zimscraperlib/i18n.py b/src/zimscraperlib/i18n.py
@@ -111,12 +111,9 @@ def replace_types(new_type: str) -> str:
     )
     lang_data.update({"english": isolang.name, "iso_types": iso_types})
 
-    if isolang.macro():
-        return (
-            lang_data,
-            get_iso_lang_data(isolang.macro().name)[0],
-        )  # first item in the returned tuple
-    return lang_data, None
+    # second item is the macro's lang data (first item of its own returned tuple)
+    macro = isolang.macro()
+    return (lang_data, get_iso_lang_data(macro.name)[0] if macro else None)
 
 
 def find_language_names(query: str, lang_data: Lang | None = None) -> tuple[str, str]:
diff --git a/src/zimscraperlib/zim/archive.py b/src/zimscraperlib/zim/archive.py
@@ -18,7 +18,6 @@
 import libzim.suggestion  # SuggestionSearcher  # pyright: ignore
 
 from zimscraperlib.zim._libkiwix import convertTags, parseMimetypeCounter
-from zimscraperlib.zim.items import Item
 
 
 class Archive(libzim.reader.Archive):
@@ -61,7 +60,7 @@ def get_entry_by_id(self, id_: int) -> libzim.reader.Entry:
         """Entry from its Id in ZIM"""
         return self._get_entry_by_id(id_)
 
-    def get_item(self, path: str) -> Item:
+    def get_item(self, path: str) -> libzim.reader.Item:
         """Item from a path"""
         return self.get_entry_by_path(path).get_item()
 
diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py
@@ -30,7 +30,7 @@
 
 import libzim.writer  # pyright: ignore
 import PIL.Image
-import regex
+import regex  # pyright: ignore [reportMissingModuleSource]
 
 from zimscraperlib import logger
 from zimscraperlib.constants import (
@@ -491,7 +491,7 @@ def add_redirect(
 
         try:
             try:
-                super().add_redirection(path, title, target_path, hints)
+                super().add_redirection(path, title or path, target_path, hints)
             except RuntimeError as exc:
                 if not DUPLICATE_EXC_STR.match(str(exc)) or not duplicate_ok:
                     raise exc
diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py
@@ -10,6 +10,7 @@
 import re
 import tempfile
 import urllib.parse
+from collections.abc import Callable
 from typing import Any
 
 import libzim.writer  # pyright: ignore
@@ -65,6 +66,11 @@ def get_hints(self) -> dict:
         return getattr(self, "hints", {})
 
 
+def no_indexing_indexdata() -> IndexData:
+    """IndexData asking libzim not to index this item"""
+    return IndexData("", "")
+
+
 class StaticItem(Item):
     """scraperlib Item with auto contentProvider from `content` or `filepath`
 
@@ -107,19 +113,17 @@ def __init__(
             path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
         )
         if index_data:
-            self.get_indexdata = lambda: index_data
+            self.get_indexdata: Callable[[], IndexData] = lambda: index_data
         elif not auto_index:
-            self.get_indexdata = lambda: IndexData("", "")  # index nothing
+            self.get_indexdata = no_indexing_indexdata  # index nothing
         else:
             self._get_auto_index()  # consider to add auto index
 
         # Populate item title from index data if title is not set by caller
-        if (
-            (not hasattr(self, "title") or not self.title)
-            and hasattr(self, "get_indexdata")
-            and self.get_indexdata().get_title()
-        ):
-            self.title = self.get_indexdata().get_title()
+        if (not getattr(self, "title", None)) and hasattr(self, "get_indexdata"):
+            title = self.get_indexdata().get_title()
+            if title:
+                self.title = title
 
     def get_contentprovider(self) -> libzim.writer.ContentProvider:
         # content was set manually
diff --git a/src/zimscraperlib/zim/providers.py b/src/zimscraperlib/zim/providers.py
@@ -13,6 +13,7 @@
 
 import io
 import pathlib
+from typing import Generator
 
 import libzim.writer  # pyright: ignore
 import requests
@@ -61,7 +62,7 @@ def __init__(
     def get_size(self) -> int:
         return self.size  # pyright: ignore
 
-    def gen_blob(self) -> libzim.writer.Blob:
+    def gen_blob(self) -> Generator[libzim.writer.Blob, None, None]:
         yield libzim.writer.Blob(self.fileobj.getvalue())  # pragma: no cover
 
 
@@ -92,7 +93,7 @@ def get_size_of(url) -> int | None:
     def get_size(self) -> int:
         return self.size  # pyright: ignore
 
-    def gen_blob(self) -> libzim.writer.Blob:  # pragma: no cover
+    def gen_blob(self) -> Generator[libzim.writer.Blob, None, None]:  # pragma: no cover
         for chunk in self.resp.iter_content(10 * 1024):
             if chunk:
                 yield libzim.writer.Blob(chunk)
diff --git a/tests/i18n/test_i18n.py b/tests/i18n/test_i18n.py
@@ -81,11 +81,11 @@
             {
                 "iso-639-1": "",
                 "iso-639-2b": "afa",
-                "iso-639-2t": "",
+                "iso-639-2t": "afa",
                 "iso-639-3": "",
                 "iso-639-5": "afa",
                 "english": "Afro-Asiatic languages",
-                "iso_types": ["part2b", "part5"],
+                "iso_types": ["part2b", "part2t", "part5"],
                 "querytype": "purecode",
                 "query": "afa",
                 "native": "Afro-Asiatic languages",
@@ -96,7 +96,7 @@
             {
                 "iso-639-1": "",
                 "iso-639-2b": "afa",
-                "iso-639-2t": "",
+                "iso-639-2t": "afa",
                 "iso-639-3": "",
                 "iso-639-5": "afa",
                 "english": "Afro-Asiatic languages",
diff --git a/tests/zim/test_fs.py b/tests/zim/test_fs.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # vim: ai ts=4 sts=4 et sw=4 nu
 
+import pathlib
 import shutil
 import subprocess
 import sys
@@ -147,7 +148,17 @@ def test_make_zim_file_no_file_on_error(tmp_path, png_image, build_data):
     print("Program exiting")
 """
 
-    py = subprocess.run([sys.executable, "-c", pycode], check=False)
+    py = subprocess.run(
+        [sys.executable, "-c", pycode],
+        check=False,
+        # on Python 3.9 / macOS 15, the subprocess could not import zimscraperlib
+        # and exited with 1, so point PYTHONPATH at the src folder in that case
+        env=(
+            {"PYTHONPATH": str(pathlib.Path.cwd() / "src")}
+            if sys.version_info[:2] == (3, 9)
+            else None
+        ),
+    )
     # returncode will be either 0 or -11, depending on garbage collection
     # in scrapers, we want to be able to fail on errors and absolutely don't want to
     # create a ZIM file, so SEGFAULT on exit it (somewhat) OK
diff --git a/tests/zim/test_indexing.py b/tests/zim/test_indexing.py
@@ -296,10 +296,12 @@ def test_get_pdf_index_data(
         filepath=encrypted_pdf_file if pdf_no == 1 else big_pdf_file
     )
     assert index_data.get_title() == expected_title
-    assert (
-        index_data.get_content()
-        == (encrypted_pdf_content if pdf_no == 1 else big_pdf_content).read_text()
+    # actual index content depends on the MuPDF version used by PyMuPDF,
+    # so only check that the indexed content is at least 90% of the expected size
+    content_size = len(
+        (encrypted_pdf_content if pdf_no == 1 else big_pdf_content).read_text()
     )
+    assert len(index_data.get_content()) >= content_size * 0.9
     assert index_data.has_indexdata()
     assert index_data.get_wordcount() == expected_word_count
     assert index_data.get_keywords() == ""
diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py
@@ -38,11 +38,15 @@ def get_size(self) -> int:
 
 
 class SpecialURLProviderItem(StaticItem):
+    url: str
+
     def get_contentprovider(self):
         return SpecialURLProvider(self.url)
 
 
 class FileLikeProviderItem(StaticItem):
+    fileobj: io.BytesIO
+
     def get_contentprovider(self):
         if not self.fileobj:
             raise AttributeError("fileobj cannot be None")
@@ -125,7 +129,7 @@ def test_create_without_workaround(tmp_path):
         fpath, "welcome", workaround_nocancel=False
     ).config_dev_metadata() as creator:
         with pytest.raises(RuntimeError, match="AttributeError"):
-            creator.add_item("hello")
+            creator.add_item("hello")  # pyright: ignore [reportArgumentType]
 
 
 def test_noindexlanguage(tmp_path):