diff --git a/packages/search-core/src/grogbot_search/service.py b/packages/search-core/src/grogbot_search/service.py index db25bb3..ef56434 100644 --- a/packages/search-core/src/grogbot_search/service.py +++ b/packages/search-core/src/grogbot_search/service.py @@ -718,6 +718,8 @@ def create_documents_from_sitemap(self, sitemap_url: str, bootstrap: bool = Fals documents: List[Document] = [] for page_url in unique_urls: canonical_url = _canonicalize_url(page_url) + if "/shop/" in urlparse(canonical_url).path: + continue if bootstrap: existing = self.connection.execute( "SELECT 1 FROM documents WHERE canonical_url = ? LIMIT 1", diff --git a/packages/search-core/tests/conftest.py b/packages/search-core/tests/conftest.py index 415ee87..6305e60 100644 --- a/packages/search-core/tests/conftest.py +++ b/packages/search-core/tests/conftest.py @@ -114,6 +114,21 @@ def log_message(self, format, *args): # noqa: A003 - match base signature """ + responses["/shop/product"] = f""" + + + Shop Product + + + +
+

Shop Product Heading

+

Shop product details.

+
+ + + """ + responses["/article-no-canonical"] = """ @@ -360,6 +375,13 @@ def log_message(self, format, *args): # noqa: A003 - match base signature """ + responses["/sitemap-shop-skip.xml"] = f""" + + {base_url}/shop/product + {base_url}/article + + """ + responses["/invalid-sitemap.xml"] = "" responses["/sitemap-bootstrap-skip.xml"] = f""" diff --git a/packages/search-core/tests/test_service.py b/packages/search-core/tests/test_service.py index 71fb9bd..9abff5e 100644 --- a/packages/search-core/tests/test_service.py +++ b/packages/search-core/tests/test_service.py @@ -800,6 +800,13 @@ def test_create_documents_from_sitemap_deduplicates_urls(service: SearchService, assert documents[0].canonical_url == f"{http_server}/canonical" +def test_create_documents_from_sitemap_skips_shop_paths(service: SearchService, http_server): + documents = service.create_documents_from_sitemap(f"{http_server}/sitemap-shop-skip.xml") + + assert len(documents) == 1 + assert documents[0].canonical_url == f"{http_server}/canonical" + + def test_create_documents_from_sitemap_invalid_xml_raises_value_error(service: SearchService, http_server): with pytest.raises(ValueError, match="Invalid sitemap XML"): service.create_documents_from_sitemap(f"{http_server}/invalid-sitemap.xml")