Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# CI workflow: run the search-core test suite on every pull request.
# NOTE(review): indentation was lost in the pasted diff; reconstructed to
# standard GitHub Actions nesting — verify against the original file.
name: Tests

on:
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true  # cache uv's package downloads between runs

      - name: Install dependencies
        run: uv sync --package grogbot-search-core --extra test

      - name: Run tests
        run: uv run --package grogbot-search-core --extra test pytest packages/search-core/tests
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,11 @@ Install test dependencies for the search core and run pytest:
uv sync --extra test
uv run pytest packages/search-core/tests
```

Run coverage checks with `pytest-cov`:

```bash
uv run --package grogbot-search-core --extra test \
pytest packages/search-core/tests \
--cov=grogbot_search --cov-report=term-missing
```
1 change: 1 addition & 0 deletions packages/search-core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies = [
[project.optional-dependencies]
test = [
"pytest>=8.0",
"pytest-cov>=5.0",
]

[build-system]
Expand Down
82 changes: 80 additions & 2 deletions packages/search-core/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import annotations

import hashlib
import math
import re
import threading
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any, Dict
Expand All @@ -11,10 +14,24 @@

@pytest.fixture()
def service(tmp_path, monkeypatch):
def fake_embed(texts, *, prompt):
return [[0.0] * 768 for _ in texts]
def fake_embed(texts, *, prompt): # noqa: ARG001 - mirror real signature
embeddings: list[list[float]] = []
for text in texts:
vector = [0.0] * 768
tokens = re.findall(r"[a-z0-9]+", text.lower())
for token in tokens:
digest = hashlib.sha256(token.encode("utf-8")).digest()
index = int.from_bytes(digest[:2], "big") % 768
sign = 1.0 if digest[2] % 2 == 0 else -1.0
vector[index] += sign
norm = math.sqrt(sum(value * value for value in vector))
if norm > 0:
vector = [value / norm for value in vector]
embeddings.append(vector)
return embeddings

monkeypatch.setattr("grogbot_search.service.embed_texts", fake_embed)
monkeypatch.setattr("grogbot_search.service.time.sleep", lambda _seconds: None)
db_path = tmp_path / "search.db"
svc = SearchService(db_path)
yield svc
Expand Down Expand Up @@ -97,6 +114,36 @@ def log_message(self, format, *args): # noqa: A003 - match base signature
</html>
"""

responses["/article-no-canonical"] = """
<html>
<head>
<title>No Canonical Tag</title>
</head>
<body>
<article>
<h1>No Canonical Heading</h1>
<p>Uses requested URL as canonical fallback.</p>
</article>
</body>
</html>
"""

responses["/article-published"] = f"""
<html>
<head>
<title>Published Article</title>
<link rel="canonical" href="{base_url}/canonical-published" />
<meta property="article:published_time" content="2025-01-09T14:30:00Z" />
</head>
<body>
<article>
<h1>Published Heading</h1>
<p>Article with metadata timestamp.</p>
</article>
</body>
</html>
"""

responses["/feed"] = f"""
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
Expand Down Expand Up @@ -229,6 +276,26 @@ def log_message(self, format, *args): # noqa: A003 - match base signature

responses["/invalid-feed"] = "NOT VALID XML"

responses["/feed-summary-and-empty"] = f"""
<rss version="2.0">
<channel>
<title>Summary Feed</title>
<item>
<title>Summary Entry</title>
<link>{base_url}/summary-entry</link>
<guid>{base_url}/summary-entry</guid>
<description><![CDATA[<p>Summary based content.</p>]]></description>
<pubDate>Tue, 09 Jan 2025 12:00:00 GMT</pubDate>
</item>
<item>
<title>Empty Entry</title>
<link>{base_url}/empty-entry</link>
<guid>{base_url}/empty-entry</guid>
</item>
</channel>
</rss>
"""

responses["/opml"] = f"""<?xml version="1.0" encoding="UTF-8"?>
<opml version="2.0">
<head>
Expand Down Expand Up @@ -270,6 +337,8 @@ def log_message(self, format, *args): # noqa: A003 - match base signature
</opml>
"""

responses["/invalid-opml"] = "<opml><body><outline"

responses["/sitemap.xml"] = f"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
Expand All @@ -291,6 +360,15 @@ def log_message(self, format, *args): # noqa: A003 - match base signature
</urlset>
"""

responses["/invalid-sitemap.xml"] = "<urlset><url><loc>"

responses["/sitemap-bootstrap-skip.xml"] = f"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>{base_url}/backoff-403</loc></url>
<url><loc>{base_url}/article-2</loc></url>
</urlset>
"""

responses["/backoff-403"] = {
"status": 403,
"body": "Forbidden",
Expand Down
88 changes: 88 additions & 0 deletions packages/search-core/tests/test_chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from __future__ import annotations

import grogbot_search.chunking as chunking


def test_split_sections_breaks_on_headings():
    """Each markdown heading starts a new section; leading text stands alone."""
    document = "\n".join(
        [
            "intro line",
            "# Heading One",
            "body one",
            "## Heading Two",
            "body two",
            "",  # trailing newline, as in a real file
        ]
    )

    result = chunking._split_sections(document)

    expected = [
        "intro line",
        "# Heading One\nbody one",
        "## Heading Two\nbody two",
    ]
    assert result == expected


def test_split_sentences_strips_whitespace_and_empties():
    """Sentence splitting trims surrounding whitespace and drops empty pieces."""
    raw = " First sentence. Second sentence! Third? "

    result = chunking._split_sentences(raw)

    assert result == ["First sentence.", "Second sentence!", "Third?"]


def test_chunk_markdown_splits_oversized_block_by_sentences(monkeypatch):
    """A block larger than MAX_WORDS is broken apart on sentence boundaries."""
    monkeypatch.setattr(chunking, "TARGET_WORDS", 4)
    monkeypatch.setattr(chunking, "MAX_WORDS", 6)

    document = (
        "# Heading\n"
        "\n"
        "preface words\n"
        "\n"
        "one two three. four five six. seven eight nine.\n"
    )

    result = chunking.chunk_markdown(document)

    assert result == [
        "Heading preface words",
        "one two three. four five six.",
        "seven eight nine.",
    ]


def test_chunk_markdown_flushes_when_next_block_would_exceed_max(monkeypatch):
    """A pending chunk is emitted early when adding a block would pass MAX_WORDS."""
    monkeypatch.setattr(chunking, "TARGET_WORDS", 100)
    monkeypatch.setattr(chunking, "MAX_WORDS", 5)

    document = "one two three\n\nfour five six\n"

    result = chunking.chunk_markdown(document)

    # 3 + 3 words would exceed MAX_WORDS=5, so the blocks stay separate.
    assert result == ["one two three", "four five six"]


def test_chunk_markdown_sentence_group_flushes_on_max_overflow(monkeypatch):
    """Within an oversized block, grouped sentences flush once MAX_WORDS is hit."""
    monkeypatch.setattr(chunking, "TARGET_WORDS", 100)
    monkeypatch.setattr(chunking, "MAX_WORDS", 5)

    document = "# Heading\n\none two three. four five six.\n"

    result = chunking.chunk_markdown(document)

    # The heading flushes alone; each 3-word sentence would overflow the
    # other's group under MAX_WORDS=5, so they end up in separate chunks.
    assert result == ["Heading", "one two three.", "four five six."]


def test_chunk_markdown_flushes_when_target_is_reached(monkeypatch):
    """Reaching TARGET_WORDS flushes the current chunk even below MAX_WORDS."""
    monkeypatch.setattr(chunking, "TARGET_WORDS", 3)
    monkeypatch.setattr(chunking, "MAX_WORDS", 10)

    document = "one two three\n\nfour five\n"

    result = chunking.chunk_markdown(document)

    # First block alone meets TARGET_WORDS=3, so the second starts fresh.
    assert result == ["one two three", "four five"]
50 changes: 50 additions & 0 deletions packages/search-core/tests/test_embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from __future__ import annotations

import grogbot_search.embeddings as embeddings


class _FakeArray:
def __init__(self, values):
self._values = values

def tolist(self):
return list(self._values)


def test_load_model_uses_expected_sentence_transformer(monkeypatch):
    """_load_model constructs the nomic model once and memoizes the instance."""
    recorded: list[tuple[str, bool]] = []
    sentinel = object()

    def spy_constructor(model_name: str, trust_remote_code: bool):
        recorded.append((model_name, trust_remote_code))
        return sentinel

    monkeypatch.setattr(embeddings, "SentenceTransformer", spy_constructor)
    embeddings._load_model.cache_clear()

    result_first = embeddings._load_model()
    result_second = embeddings._load_model()

    assert result_first is sentinel
    assert result_second is sentinel
    # A single recorded construction proves the second call hit the cache.
    assert recorded == [("nomic-ai/nomic-embed-text-v1", True)]

    # Reset so later tests don't see our sentinel model.
    embeddings._load_model.cache_clear()


def test_embed_texts_calls_model_and_returns_lists(monkeypatch):
    """embed_texts forwards arguments to the model and converts arrays to lists."""

    class RecordingModel:
        def __init__(self):
            self.calls = []

        def encode(self, texts, *, normalize_embeddings: bool, prompt: str):
            self.calls.append((texts, normalize_embeddings, prompt))
            return [_FakeArray([1.0, 2.0]), _FakeArray([3.0, 4.0])]

    model = RecordingModel()
    monkeypatch.setattr(embeddings, "_load_model", lambda: model)

    vectors = embeddings.embed_texts(("first", "second"), prompt="search_query")

    assert vectors == [[1.0, 2.0], [3.0, 4.0]]
    # The tuple input is normalized to a list and embeddings are normalized.
    assert model.calls == [(["first", "second"], True, "search_query")]
Loading