Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# CI workflow: run the search-core test suite on every pull request.
# NOTE(review): indentation was lost in the pasted diff; reconstructed to
# standard GitHub Actions nesting — verify against the original file.
name: Tests

on:
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true  # cache uv's package downloads between runs

      - name: Install dependencies
        run: uv sync --package grogbot-search-core --extra test

      - name: Run tests
        run: uv run --package grogbot-search-core --extra test pytest packages/search-core/tests
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,11 @@ Install test dependencies for the search core and run pytest:
uv sync --extra test
uv run pytest packages/search-core/tests
```

Run coverage checks with `pytest-cov`:

```bash
uv run --package grogbot-search-core --extra test \
pytest packages/search-core/tests \
--cov=grogbot_search --cov-report=term-missing
```
1 change: 1 addition & 0 deletions packages/search-core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies = [
[project.optional-dependencies]
test = [
"pytest>=8.0",
"pytest-cov>=5.0",
]

[build-system]
Expand Down
82 changes: 80 additions & 2 deletions packages/search-core/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import annotations

import hashlib
import math
import re
import threading
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any, Dict
Expand All @@ -11,10 +14,24 @@

@pytest.fixture()
def service(tmp_path, monkeypatch):
def fake_embed(texts, *, prompt):
return [[0.0] * 768 for _ in texts]
def fake_embed(texts, *, prompt): # noqa: ARG001 - mirror real signature
embeddings: list[list[float]] = []
for text in texts:
vector = [0.0] * 768
tokens = re.findall(r"[a-z0-9]+", text.lower())
for token in tokens:
digest = hashlib.sha256(token.encode("utf-8")).digest()
index = int.from_bytes(digest[:2], "big") % 768
sign = 1.0 if digest[2] % 2 == 0 else -1.0
vector[index] += sign
norm = math.sqrt(sum(value * value for value in vector))
if norm > 0:
vector = [value / norm for value in vector]
embeddings.append(vector)
return embeddings

monkeypatch.setattr("grogbot_search.service.embed_texts", fake_embed)
monkeypatch.setattr("grogbot_search.service.time.sleep", lambda _seconds: None)
db_path = tmp_path / "search.db"
svc = SearchService(db_path)
yield svc
Expand Down Expand Up @@ -97,6 +114,36 @@ def log_message(self, format, *args): # noqa: A003 - match base signature
</html>
"""

responses["/article-no-canonical"] = """
<html>
<head>
<title>No Canonical Tag</title>
</head>
<body>
<article>
<h1>No Canonical Heading</h1>
<p>Uses requested URL as canonical fallback.</p>
</article>
</body>
</html>
"""

responses["/article-published"] = f"""
<html>
<head>
<title>Published Article</title>
<link rel="canonical" href="{base_url}/canonical-published" />
<meta property="article:published_time" content="2025-01-09T14:30:00Z" />
</head>
<body>
<article>
<h1>Published Heading</h1>
<p>Article with metadata timestamp.</p>
</article>
</body>
</html>
"""

responses["/feed"] = f"""
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
Expand Down Expand Up @@ -229,6 +276,26 @@ def log_message(self, format, *args): # noqa: A003 - match base signature

responses["/invalid-feed"] = "NOT VALID XML"

responses["/feed-summary-and-empty"] = f"""
<rss version="2.0">
<channel>
<title>Summary Feed</title>
<item>
<title>Summary Entry</title>
<link>{base_url}/summary-entry</link>
<guid>{base_url}/summary-entry</guid>
<description><![CDATA[<p>Summary based content.</p>]]></description>
<pubDate>Tue, 09 Jan 2025 12:00:00 GMT</pubDate>
</item>
<item>
<title>Empty Entry</title>
<link>{base_url}/empty-entry</link>
<guid>{base_url}/empty-entry</guid>
</item>
</channel>
</rss>
"""

responses["/opml"] = f"""<?xml version="1.0" encoding="UTF-8"?>
<opml version="2.0">
<head>
Expand Down Expand Up @@ -270,6 +337,8 @@ def log_message(self, format, *args): # noqa: A003 - match base signature
</opml>
"""

responses["/invalid-opml"] = "<opml><body><outline"

responses["/sitemap.xml"] = f"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
Expand All @@ -291,6 +360,15 @@ def log_message(self, format, *args): # noqa: A003 - match base signature
</urlset>
"""

responses["/invalid-sitemap.xml"] = "<urlset><url><loc>"

responses["/sitemap-bootstrap-skip.xml"] = f"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>{base_url}/backoff-403</loc></url>
<url><loc>{base_url}/article-2</loc></url>
</urlset>
"""

responses["/backoff-403"] = {
"status": 403,
"body": "Forbidden",
Expand Down
88 changes: 88 additions & 0 deletions packages/search-core/tests/test_chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from __future__ import annotations

import grogbot_search.chunking as chunking


def test_split_sections_breaks_on_headings():
    """Each markdown heading starts a new section; leading text stands alone."""
    document = "\n".join(
        [
            "intro line",
            "# Heading One",
            "body one",
            "## Heading Two",
            "body two",
            "",  # trailing newline, as in a real file
        ]
    )

    result = chunking._split_sections(document)

    expected = [
        "intro line",
        "# Heading One\nbody one",
        "## Heading Two\nbody two",
    ]
    assert result == expected


def test_split_sentences_strips_whitespace_and_empties():
    """Sentence splitting trims surrounding whitespace and drops empty pieces."""
    raw = " First sentence. Second sentence! Third? "

    result = chunking._split_sentences(raw)

    assert result == ["First sentence.", "Second sentence!", "Third?"]


def test_chunk_markdown_splits_oversized_block_by_sentences(monkeypatch):
    """A block larger than MAX_WORDS is broken apart on sentence boundaries."""
    monkeypatch.setattr(chunking, "TARGET_WORDS", 4)
    monkeypatch.setattr(chunking, "MAX_WORDS", 6)

    document = (
        "# Heading\n"
        "\n"
        "preface words\n"
        "\n"
        "one two three. four five six. seven eight nine.\n"
    )

    result = chunking.chunk_markdown(document)

    assert result == [
        "Heading preface words",
        "one two three. four five six.",
        "seven eight nine.",
    ]


def test_chunk_markdown_flushes_when_next_block_would_exceed_max(monkeypatch):
    """A pending chunk is emitted early when adding a block would pass MAX_WORDS."""
    monkeypatch.setattr(chunking, "TARGET_WORDS", 100)
    monkeypatch.setattr(chunking, "MAX_WORDS", 5)

    document = "one two three\n\nfour five six\n"

    result = chunking.chunk_markdown(document)

    # 3 + 3 words would exceed MAX_WORDS=5, so the blocks stay separate.
    assert result == ["one two three", "four five six"]


def test_chunk_markdown_sentence_group_flushes_on_max_overflow(monkeypatch):
    """Within an oversized block, grouped sentences flush once MAX_WORDS is hit."""
    monkeypatch.setattr(chunking, "TARGET_WORDS", 100)
    monkeypatch.setattr(chunking, "MAX_WORDS", 5)

    document = "# Heading\n\none two three. four five six.\n"

    result = chunking.chunk_markdown(document)

    # The heading flushes alone; each 3-word sentence would overflow the
    # other's group under MAX_WORDS=5, so they end up in separate chunks.
    assert result == ["Heading", "one two three.", "four five six."]


def test_chunk_markdown_flushes_when_target_is_reached(monkeypatch):
    """Reaching TARGET_WORDS flushes the current chunk even below MAX_WORDS."""
    monkeypatch.setattr(chunking, "TARGET_WORDS", 3)
    monkeypatch.setattr(chunking, "MAX_WORDS", 10)

    document = "one two three\n\nfour five\n"

    result = chunking.chunk_markdown(document)

    # First block alone meets TARGET_WORDS=3, so the second starts fresh.
    assert result == ["one two three", "four five"]
50 changes: 50 additions & 0 deletions packages/search-core/tests/test_embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from __future__ import annotations

import grogbot_search.embeddings as embeddings


class _FakeArray:
def __init__(self, values):
self._values = values

def tolist(self):
return list(self._values)


def test_load_model_uses_expected_sentence_transformer(monkeypatch):
    """_load_model constructs the nomic model once and memoizes the instance."""
    recorded: list[tuple[str, bool]] = []
    sentinel = object()

    def spy_constructor(model_name: str, trust_remote_code: bool):
        recorded.append((model_name, trust_remote_code))
        return sentinel

    monkeypatch.setattr(embeddings, "SentenceTransformer", spy_constructor)
    embeddings._load_model.cache_clear()

    result_first = embeddings._load_model()
    result_second = embeddings._load_model()

    assert result_first is sentinel
    assert result_second is sentinel
    # A single recorded construction proves the second call hit the cache.
    assert recorded == [("nomic-ai/nomic-embed-text-v1", True)]

    # Reset so later tests don't see our sentinel model.
    embeddings._load_model.cache_clear()


def test_embed_texts_calls_model_and_returns_lists(monkeypatch):
    """embed_texts forwards arguments to the model and converts arrays to lists."""

    class RecordingModel:
        def __init__(self):
            self.calls = []

        def encode(self, texts, *, normalize_embeddings: bool, prompt: str):
            self.calls.append((texts, normalize_embeddings, prompt))
            return [_FakeArray([1.0, 2.0]), _FakeArray([3.0, 4.0])]

    model = RecordingModel()
    monkeypatch.setattr(embeddings, "_load_model", lambda: model)

    vectors = embeddings.embed_texts(("first", "second"), prompt="search_query")

    assert vectors == [[1.0, 2.0], [3.0, 4.0]]
    # The tuple input is normalized to a list and embeddings are normalized.
    assert model.calls == [(["first", "second"], True, "search_query")]
Loading