Skip to content

Commit 96b1e8d

Browse files
sarahboyce and pauloxnet committed
Made typos in docs searches return some results.
Co-authored-by: Paolo Melchiorre <[email protected]>
1 parent 418d589 commit 96b1e8d

File tree

3 files changed

+72
-19
lines changed

3 files changed

+72
-19
lines changed

docs/models.py

Lines changed: 21 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -250,8 +250,7 @@ def search(self, query_text, release):
250250
query_text, config=models.F("config"), search_type="websearch"
251251
)
252252
search_rank = SearchRank(models.F("search"), search_query)
253-
similarity = TrigramSimilarity("title", query_text)
254-
return (
253+
base_qs = (
255254
self.prefetch_related(
256255
Prefetch(
257256
"release",
@@ -261,12 +260,8 @@ def search(self, query_text, release):
261260
"release__release", queryset=Release.objects.only("version")
262261
),
263262
)
264-
.filter(
265-
release_id=release.id,
266-
search=search_query,
267-
)
263+
.filter(release_id=release.id)
268264
.annotate(
269-
rank=search_rank + similarity,
270265
headline=SearchHeadline(
271266
"title",
272267
search_query,
@@ -283,12 +278,30 @@ def search(self, query_text, release):
283278
),
284279
breadcrumbs=models.F("metadata__breadcrumbs"),
285280
)
286-
.order_by("-rank")
287281
.only(
288282
"path",
289283
"release",
290284
)
291285
)
286+
vector_qs = (
287+
base_qs.alias(rank=search_rank)
288+
.filter(search=search_query)
289+
.order_by("-rank")
290+
)
291+
if not vector_qs:
292+
return (
293+
base_qs
294+
.alias(
295+
similarity=TrigramSimilarity(
296+
"title",
297+
utils.sanitize_for_trigram(query_text)
298+
)
299+
)
300+
.filter(similarity__gt=0.3)
301+
.order_by("-similarity")
302+
)
303+
else:
304+
return vector_qs
292305
else:
293306
return self.none()
294307

docs/tests.py

Lines changed: 33 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
from .models import DOCUMENT_SEARCH_VECTOR, Document, DocumentRelease
2121
from .sitemaps import DocsSitemap
2222
from .templatetags.docs import get_all_doc_versions
23-
from .utils import get_doc_path
23+
from .utils import get_doc_path, sanitize_for_trigram
2424

2525

2626
class ModelsTests(TestCase):
@@ -262,6 +262,21 @@ def test_get_doc_path(self):
262262
path, filename = __file__.rsplit(os.path.sep, 1)
263263
self.assertEqual(get_doc_path(Path(path), filename), None)
264264

265+
def test_sanitize_for_trigram(self):
    # Table of raw search strings mapped to the sanitized text expected
    # back: excluded ("-term" / "-'quoted phrase'") parts dropped, quotes
    # removed, and accented letters folded to their base letters.
    cases = {
        "simple search": "simple search",
        "Python Django -Flask": "Python Django",
        'Python "Django Framework" -Flask': "Python Django Framework",
        "Développement -'Framework Django' web": "Developpement web",
        "Γλώσσα προγραμματισμού Python -'Flask και Django'": (
            "Γλωσσα προγραμματισμου Python"
        ),
        "Pemrograman Python -'Flask dan Django' backend": (
            "Pemrograman Python backend"
        ),
        "Programmazione 'Python e Django' -Flask": (
            "Programmazione Python e Django"
        ),
        "Linguagem Python -'Django e Flask' web": "Linguagem Python web",
        "Desarrollo Python -'Django y Flask' rápido": (
            "Desarrollo Python rapido"
        ),
    }
    for query, sanitized_query in cases.items():
        # One sub-test per input so a failure names the offending query.
        with self.subTest(query=query):
            actual = sanitize_for_trigram(query)
            self.assertEqual(actual, sanitized_query)
279+
265280

266281
class UpdateDocTests(TestCase):
267282
@classmethod
@@ -548,18 +563,16 @@ def setUp(self):
548563
def test_search(self):
549564
expected_list = [
550565
(
551-
0.96982837,
552566
"releases/1.2.1",
553-
"<mark>Django</mark> 1.2.1 release notes",
567+
"<mark>Django</mark> 1.2.1 release notes", # Ranked: 0.96982837.
554568
(
555569
"<mark>Django</mark> 1.2.1 release notes ¶ \n "
556570
"<mark>Django</mark> 1.2.1 was released almost immediately after 1.2.0 to correct two small"
557571
),
558572
),
559573
(
560-
0.9490876,
561574
"releases/1.9.4",
562-
"<mark>Django</mark> 1.9.4 release notes",
575+
"<mark>Django</mark> 1.9.4 release notes", # Ranked: 0.9490876.
563576
(
564577
"<mark>Django</mark> 1.9.4 release notes ¶ \n "
565578
"March 5, 2016 \n "
@@ -570,24 +583,24 @@ def test_search(self):
570583
self.assertQuerySetEqual(
571584
Document.objects.search("django", self.release),
572585
expected_list,
573-
transform=attrgetter("rank", "path", "headline", "highlight"),
586+
transform=attrgetter("path", "headline", "highlight"),
574587
)
575588

576589
def test_websearch(self):
577590
self.assertQuerySetEqual(
578591
Document.objects.search('django "release notes" -packaging', self.release),
579-
[("Django 1.9.4 release notes", 1.5675676)],
580-
transform=attrgetter("title", "rank"),
592+
["Django 1.9.4 release notes"],
593+
transform=attrgetter("title"),
581594
)
582595

583596
def test_multilingual_search(self):
584597
self.assertQuerySetEqual(
585598
Document.objects.search("publication", self.release_fr),
586599
[
587-
("Notes de publication de Django 1.2.1", 1.0693262),
588-
("Notes de publication de Django 1.9.4", 1.0458658),
600+
"Notes de publication de Django 1.2.1", # Ranked: 1.0693262.
601+
"Notes de publication de Django 1.9.4", # Ranked: 1.0458658.
589602
],
590-
transform=attrgetter("title", "rank"),
603+
transform=attrgetter("title"),
591604
)
592605

593606
def test_empty_search(self):
@@ -642,6 +655,15 @@ def test_search_highlight_stemmed(self):
642655
transform=attrgetter("headline", "highlight"),
643656
)
644657

658+
def test_search_title(self):
    # A misspelled term ("viewss") must still return the closest title.
    # The search call is made before entering the query-count check, so
    # only the work done by the assertion below is counted.
    typo_results = Document.objects.search("viewss", self.release)
    expected_headlines = ["Generic views"]
    read_headline = attrgetter("headline")
    with self.assertNumQueries(2):
        self.assertQuerySetEqual(
            typo_results, expected_headlines, transform=read_headline
        )
666+
645667

646668
class TemplateTestCase(TestCase):
647669
def _assertOGTitleEqual(self, doc, expected):

docs/utils.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,6 @@
1+
import re
2+
import unicodedata
3+
14
from django.conf import settings
25
from django.http import Http404
36

@@ -39,3 +42,18 @@ def get_doc_path_or_404(docroot, subpath):
3942
if doc is None:
4043
raise Http404(doc)
4144
return doc
45+
46+
47+
def sanitize_for_trigram(text):
    """
    Sanitize a websearch-style query for PostgreSQL trigram matching.

    - Removes excluded parts: ``-term``, ``-"quoted phrase"`` and
      ``-'quoted phrase'`` (only when the ``-`` starts a word).
    - Normalizes Unicode (NFKD) and strips combining marks, so accented
      letters are folded to their base letter.
    - Replaces every remaining character that is not a letter, digit,
      underscore or whitespace with a space, so punctuation-joined words
      such as ``class-based`` stay separate words instead of being fused.
    - Collapses runs of whitespace and trims both ends.

    Takes the raw query string and returns the sanitized string, which is
    empty when nothing searchable is left.
    """
    # Drop negated terms first, while the quotes delimiting them still exist.
    text = re.sub(r'(\s|^)-[^\s"\']+|(\s|^)-["\'][^"\']+["\']', "", text)
    # Decompose accented characters into base letter + combining mark(s)...
    text = unicodedata.normalize("NFKD", text)
    # ...and remove the marks (categories Mn/Mc/Me) without leaving a gap,
    # otherwise "é" would split its word in two in the next step.
    text = "".join(
        char for char in text if not unicodedata.category(char).startswith("M")
    )
    # Punctuation becomes a word separator rather than vanishing, so
    # "django.contrib.auth" yields three words, not one fused token.
    text = re.sub(r"[^\w\s]", " ", text)
    return " ".join(text.split())

0 commit comments

Comments
 (0)