Skip to content

Commit 469f52c

Browse files
committed
fix(search): address PR 94 feedback on reranking and tests
1 parent 744727b commit 469f52c

2 files changed

Lines changed: 95 additions & 1 deletion

File tree

backend/ks_search_tool.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ async def general_search_async(
348348
"""Async version of general search with parallel enrichment"""
349349
logger.info("--> Executing async general search...")
350350
base_url = "https://api.knowledge-space.org/datasets/search"
351-
params = {"q": query or "*", "per_page": min(top_k * 2, 50)}
351+
params = {"q": expand_query(query) if query else "*", "per_page": min(top_k * 2, 50)}
352352
try:
353353
async with aiohttp.ClientSession() as session:
354354
async with session.get(base_url, params=params, timeout=15) as resp:
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import pytest
2+
from ks_search_tool import rerank_results_using_metadata
3+
4+
5+
def test_rerank_max_bounds():
    """
    The combined metadata boost must cap out at exactly +30%:
    +10% (recent year) +15% (citations) +5% (trusted source).
    """
    baseline = {
        "_score": 100.0,
        "title_guess": "Old Data",
        "metadata": {"year": 1990, "citations": 0, "source": "Unknown"},
    }
    # This entry hits every boost criterion and should earn the full 1.30x.
    perfect = {
        "_score": 100.0,
        "title_guess": "Perfect Data",
        "metadata": {
            "year": 2024,
            "citations": 10000,
            "source": "Allen Brain Atlas",
        },
    }

    ordered = rerank_results_using_metadata([baseline, perfect])
    top, runner_up = ordered[0], ordered[1]

    # The fully boosted entry overtakes the equal-base-score baseline.
    assert top["title_guess"] == "Perfect Data"

    # The baseline keeps its raw score untouched (neutral multiplier).
    assert runner_up["_score"] == pytest.approx(100.0)

    # 100.0 * 1.30 == 130.0 — the hard ceiling of the boost scheme.
    assert top["_score"] == pytest.approx(130.0)
    assert top["_rerank_multiplier"] == pytest.approx(1.30)
40+
41+
42+
def test_rerank_log_normalization():
    """
    Citation boosts are log-normalized: 10k citations must not
    astronomically outscore 10 citations, and the citation component
    is bounded at +15%.
    """
    fixtures = [
        ("Zero Cits", 0),
        ("Ten Cits", 10),
        ("Ten Thousand Cits", 10000),
    ]
    candidates = [
        {"_score": 100.0, "title_guess": title, "metadata": {"citations": cits}}
        for title, cits in fixtures
    ]

    ordered = rerank_results_using_metadata(candidates)

    # The most-cited dataset still ranks first.
    assert ordered[0]["title_guess"] == "Ten Thousand Cits"

    high, mid, low = (entry["_rerank_multiplier"] for entry in ordered)

    # Bounded maximum respected: citations alone add at most +15%.
    assert high == pytest.approx(1.15)
    assert low == pytest.approx(1.00)

    # 10 citations earns a real logarithmic boost,
    # (log10(11) / log10(10001)) * 0.15: strictly above 1.0 yet
    # strictly below the 1.15 cap.
    assert 1.0 < mid < 1.15
73+
74+
75+
def test_rerank_empty_metadata_handling():
    """
    Entries with absent or malformed metadata must not break reranking:
    they keep their base score and get a neutral 1.0x multiplier.
    """
    awkward_inputs = [
        # No "metadata" key at all.
        {"_score": 10.0, "title_guess": "No Meta1"},
        # An empty metadata dict.
        {"_score": 10.0, "title_guess": "No Meta2", "metadata": {}},
        # Fields present but with unusable values.
        {
            "_score": 10.0,
            "title_guess": "Garbage Meta",
            "metadata": {"year": "unknown", "citations": None},
        },
    ]

    # Every entry survives with its original score and no boost applied.
    for entry in rerank_results_using_metadata(awkward_inputs):
        assert entry["_score"] == pytest.approx(10.0)
        assert entry["_rerank_multiplier"] == pytest.approx(1.0)

0 commit comments

Comments
 (0)