
Commit 3070275

Merge pull request #119 from gomate-community/pipeline
feature@optimizing deepsearch
2 parents 5b0235f + 33eada5 commit 3070275

File tree: 9 files changed, +211 additions, −111 deletions


trustrag/modules/deepsearch/.env

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@ OLLAMA_MODEL="deepseek-r1:1.5b"
 # Uncomment if you want to use a service that mimics OpenAI's API (e.g., OpenRouter or Gemini).
 # OPENAI_ENDPOINT="http://localhost:1234/v1"
 
-# Default scraper to use (options: firecrawl, playwright_ddgs)
+# Default scraper to use (options: firecrawl, playwright_ddgs,playwright_searxng)
 DEFAULT_SCRAPER="playwright_ddgs"
 
 # -----------------------------------------------------------------------------
@@ -49,4 +49,4 @@ FIRECRAWL_API_KEY="your_firecrawl_api_key_here"
 # -----------------------------------------------------------------------------
 # SearXNG
 # -----------------------------------------------------------------------------
-SEARXNG_URL="http://192.168.1.5:11434/v1/"
+SEARXNG_URL="http://localhost:8080/search"
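For orientation, a minimal sketch of how these two settings might be read at startup. The variable names come from the .env file above; loading them via python-dotenv is an assumption, and the project may wire this up differently.

import os

from dotenv import load_dotenv  # assumption: python-dotenv is available

load_dotenv()  # pull the .env values into the process environment

# Scraper backend: firecrawl, playwright_ddgs, or playwright_searxng
default_scraper = os.getenv("DEFAULT_SCRAPER", "playwright_ddgs")

# SearXNG endpoint, used when a searxng-backed searcher/scraper is selected
searxng_url = os.getenv("SEARXNG_URL", "http://localhost:8080/search")

print(default_scraper, searxng_url)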

trustrag/modules/deepsearch/action.py

Lines changed: 8 additions & 28 deletions
@@ -35,12 +35,11 @@ async def generate_serp_queries(
 ) -> List[SerpQuery]:
 """Generate SERP queries based on user input and previous learnings."""
 
-# prompt = f"""Given the following prompt from the user, generate a list of SERP queries to research the topic. Return a JSON object with a 'queries' array field containing {num_queries} queries (or less if the original prompt is clear). Each query object should have 'query' and 'research_goal' fields. Make sure each query is unique and not similar to each other: <prompt>{query}</prompt>"""
-prompt = f"""根据用户提供的以下提示,生成SERP(Search Engine Results Page ,搜索引擎结果页面)查询列表以研究该主题。
-返回一个JSON对象,其中包含'queries'数组字段,该字段包含{num_queries}个查询(如果原始提示已经很明确,则可以少于此数量)。
-每个查询对象应有'query'和'research_goal'字段。
-确保每个查询都是唯一的,层层递进的,彼此之间不要相似。
-请注意JSON对象一定要格式正确,不要输出其他额外内容。"""
+prompt = f"""根据用户的以下提示,生成一系列SERP查询来研究该主题。
+返回一个JSON对象,其中包含一个'queries'数组字段,包含{num_queries}个查询(如果原始提示已经很明确,则可以少于这个数量)。
+每个查询对象应该有'query'和'research_goal'字段。
+请注意JSON对象一定要格式正确,不要输出其他额外内容。
+确保每个查询都是唯一的,且彼此不相似:主题<prompt>{query}</prompt>"""
 if learnings:
 # prompt += f"\n\nHere are some learnings from previous research, use them to generate more specific queries: {' '.join(learnings)}"
 prompt += f"\n\n这里是之前研究步骤的一些发现,请使用它们生成更具体的查询:{' '.join(learnings)}。请确保生成的查询与用户原始提示的语言保持一致。"
@@ -54,8 +53,6 @@ async def generate_serp_queries(
 ],
 response_format={"type": "json_object"},
 )
-loguru.logger.info("generate_serp_queries done:")
-loguru.logger.info(response)
 try:
 queries = response.get("queries", [])
 return [SerpQuery(**q) for q in queries][:num_queries]
@@ -84,15 +81,6 @@ async def process_serp_result(
 # Create the contents string separately
 contents_str = "".join(f"<content>\n{content}\n</content>" for content in contents)
 
-# prompt = (
-# f"Given the following contents from a SERP search for the query <query>{query}</query>, "
-# f"generate a list of learnings from the contents. Return a JSON object with 'learnings' "
-# f"and 'followUpQuestions' keys with array of strings as values. Include up to {num_learnings} learnings and "
-# f"{num_follow_up_questions} follow-up questions. The learnings should be unique, "
-# "concise, and information-dense, including entities, metrics, numbers, and dates.\n\n"
-# f"<contents>{contents_str}</contents>"
-# )
-
 prompt = (
 f"根据以下对查询<query>{query}</query>的SERP搜索内容,"
 f"生成从内容中得到的学习要点列表。返回一个JSON对象,包含'learnings'和'followUpQuestions'键(key),"
@@ -139,19 +127,11 @@ async def write_final_report(
 150_000,
 )
 
-# user_prompt = (
-# f"Given the following prompt from the user, write a final report on the topic using "
-# f"the learnings from research. Return a JSON object with a 'reportMarkdown' field "
-# f"containing a detailed markdown report (aim for 3+ pages). Include ALL the learnings "
-# f"from research:\n\n<prompt>{prompt}</prompt>\n\n"
-# f"Here are all the learnings from research:\n\n<learnings>\n{learnings_string}\n</learnings>"
-# )
 user_prompt = (
 f"根据以下用户提供的提示,使用研究中获得的学习要点撰写关于该主题的最终报告。返回一个JSON对象,"
-f"其中包含'reportMarkdown'字段,该字段包含详细的markdown报告(目标为3+页)。包括研究中的所有学习要点:\n\n"
+f"其中包含'reportMarkdown'字段,该字段包含详细的markdown报告(目标为3页以上),尽量内容丰富饱满。包括研究中的所有学习要点:\n\n"
 f"<prompt>{prompt}</prompt>\n\n"
 f"以下是研究中获得的所有学习要点:\n\n<learnings>\n{learnings_string}\n</learnings>"
-f"请确保生成的报告与用户原始提示({prompt})的语言保持一致。"
 )
 response = await get_client_response(
 client=client,
@@ -167,7 +147,7 @@ async def write_final_report(
 report = response.get("reportMarkdown", "")
 
 # Append sources
-urls_section = "\n\n## Sources\n\n" + "\n".join(
+urls_section = "\n\n## 来源\n\n" + "\n".join(
 [f"- {url}" for url in visited_urls]
 )
 return report + urls_section
@@ -216,7 +196,7 @@ async def process_query(serp_query: SerpQuery) -> ResearchResult:
 async with semaphore:
 try:
 # Search for content
-result = await search_service.search(serp_query.query, limit=5)
+result = await search_service.search(serp_query.query, limit=2)
 loguru.logger.info("process_query:")
 loguru.logger.info(result)
 # Collect new URLs
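The rewritten prompts above all rely on JSON-mode responses: generate_serp_queries expects the model to return a 'queries' array whose items map onto SerpQuery. A small sketch of that contract follows; the dataclass shown here is an assumed stand-in mirroring the 'query'/'research_goal' fields named in the prompt, not the project's actual definition.

from dataclasses import dataclass

@dataclass
class SerpQuery:  # assumed shape; mirrors the fields the prompt asks for
    query: str
    research_goal: str

# Example of the JSON object the prompt instructs the model to emit
response = {
    "queries": [
        {"query": "reinforcement learning for LLM alignment overview", "research_goal": "map the core methods"},
        {"query": "RLHF vs DPO trade-offs", "research_goal": "compare training approaches"},
    ]
}

# Same parsing step as in generate_serp_queries
queries = [SerpQuery(**q) for q in response.get("queries", [])][:3]
print(queries)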

trustrag/modules/deepsearch/agent/providers.py

Lines changed: 4 additions & 0 deletions
@@ -1,4 +1,6 @@
 import os
+
+import loguru
 import typer
 import json
 from openai import AsyncOpenAI
@@ -58,11 +60,13 @@ def get_model(cls, service_provider_name: Optional[str] = None) -> str:
 async def get_client_response(
 client: AsyncOpenAI, model: str, messages: list, response_format: dict
 ):
+# loguru.logger.info(messages)
 response = await client.beta.chat.completions.parse(
 model=model,
 messages=messages,
 response_format=response_format,
 )
+# loguru.logger.info(response)
 
 result = response.choices[0].message.content
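A rough usage sketch for get_client_response as touched above. The endpoint, API key, and model name are placeholders; only the function signature and the json_object response format are taken from the code.

import asyncio

from openai import AsyncOpenAI
from trustrag.modules.deepsearch.agent.providers import get_client_response

async def demo():
    # Placeholder client configuration; point base_url at whatever OpenAI-compatible service is in use
    client = AsyncOpenAI(base_url="http://localhost:1234/v1", api_key="not-a-real-key")
    result = await get_client_response(
        client=client,
        model="deepseek-r1:1.5b",  # placeholder model name
        messages=[{"role": "user", "content": "Return a JSON object with a 'queries' array."}],
        response_format={"type": "json_object"},
    )
    print(result)

# asyncio.run(demo())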

trustrag/modules/deepsearch/finder/manager.py

Lines changed: 5 additions & 5 deletions
@@ -2,15 +2,15 @@
 from typing import List, Dict, Union
 # from trustrag.modules.deepsearch.utils import logger
 import loguru
-from trustrag.modules.deepsearch.finder.searcher import SearchResult, SearchEngine, DeepSearchEngine
+from trustrag.modules.deepsearch.finder.searcher import SearchResult, SearchEngine, UnifiedSearchEngine
 from trustrag.modules.deepsearch.finder.scraper import ScrapedContent, Scraper, PlaywrightScraper
 
 
 class SearchAndScrapeManager:
 """Main class for coordinating search and scrape operations."""
 
 def __init__(self, search_engine: SearchEngine = None, scraper: Scraper = None):
-self.search_engine = search_engine or DeepSearchEngine()
+self.search_engine = search_engine or UnifiedSearchEngine()
 self.scraper = scraper or PlaywrightScraper()
 
 async def setup(self):
@@ -24,7 +24,7 @@ async def teardown(self):
 await self.scraper.teardown()
 
 async def search(
-self, query: str, num_results: int = 10, **kwargs
+self, query: str, num_results: int = 5, **kwargs
 ) -> List[SearchResult]:
 """Perform a search using the configured search engine."""
 return await self.search_engine.search_async(query, num_results, **kwargs)
@@ -36,7 +36,7 @@ async def scrape(self, url: str, **kwargs) -> ScrapedContent:
 async def search_and_scrape(
 self,
 query: str,
-num_results: int = 10,
+num_results: int = 5,
 scrape_all: bool = False,
 max_concurrent_scrapes: int = 5,
 **kwargs,
@@ -65,7 +65,7 @@ async def search_and_scrape(
 semaphore = asyncio.Semaphore(max_concurrent_scrapes)
 
 async def scrape_with_semaphore(url):
-loguru.logger.info("Scraping %s", url)
+loguru.logger.info("Scraping %s"+url)
 async with semaphore:
 return await self.scrape(url, **kwargs)
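Taken together, the defaults now resolve to UnifiedSearchEngine plus PlaywrightScraper with 5 results per query. A rough usage sketch; the query string and result handling are illustrative, while the method names follow the diff.

import asyncio

from trustrag.modules.deepsearch.finder.manager import SearchAndScrapeManager

async def demo():
    manager = SearchAndScrapeManager()  # defaults: UnifiedSearchEngine + PlaywrightScraper
    await manager.setup()
    try:
        results = await manager.search("retrieval augmented generation survey", num_results=5)
        for item in results:
            print(item)
    finally:
        await manager.teardown()

# asyncio.run(demo())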

trustrag/modules/deepsearch/finder/scraper.py

Lines changed: 88 additions & 7 deletions
@@ -1,11 +1,10 @@
 from dataclasses import dataclass
 from typing import Dict, Any
-# from trustrag.modules.deepsearch.utils import logger
 import loguru
 from abc import ABC, abstractmethod
 from playwright.async_api import async_playwright
-
-# ---- Data Models ----
+import asyncio
+import re
 
 
 @dataclass
@@ -19,9 +18,6 @@ class ScrapedContent:
 metadata: Dict[str, Any] = None
 
 
-# ---- Scraper Interfaces ----
-
-
 class Scraper(ABC):
 """Abstract base class for scrapers."""
 
@@ -73,6 +69,7 @@ async def teardown(self):
 
 async def scrape(self, url: str, **kwargs) -> ScrapedContent:
 """Scrape a URL using Playwright and return standardized content."""
+loguru.logger.info(f"{url} scraped")
 if not self.browser:
 await self.setup()
 
@@ -117,4 +114,88 @@ async def scrape(self, url: str, **kwargs) -> ScrapedContent:
 loguru.logger.error(f"Error scraping {url}: {str(e)}")
 return ScrapedContent(
 url=url, html="", text="", status_code=0, metadata={"error": str(e)}
-)
+)
+
+
+def clean_scraped_text(text):
+"""
+清理爬取的文本内容,移除HTML/CSS代码片段
+
+参数:
+text (str): 原始爬取的文本内容
+
+返回:
+str: 清理后的文本内容
+"""
+# 移除CSS样式定义
+text = re.sub(r'[a-zA-Z#\.\-\_\s,:]+ \{[^\}]*\}', '', text)
+
+# 移除HTML标签
+text = re.sub(r'<[^>]*>', '', text)
+
+# 移除URL引用
+text = re.sub(r'url\([^\)]*\)', '', text)
+
+# 移除background-image等样式属性
+text = re.sub(r'background-[a-z\-]+:[^;]*;', '', text)
+
+# 移除width, height等样式属性
+text = re.sub(r'(width|height|display|float|vertical-align):[^;]*;', '', text)
+
+# 移除只包含空白字符的行
+text = re.sub(r'^\s*$', '', text, flags=re.MULTILINE)
+
+# 移除连续的多个空行,保留单个空行
+text = re.sub(r'\n\s*\n', '\n\n', text)
+
+return text.strip()
+
+
+async def main():
+# 配置日志
+loguru.logger.add("scraper.log", rotation="10 MB")
+
+# 创建爬虫实例,headless=True表示无界面模式
+scraper = PlaywrightScraper(headless=True)
+
+try:
+# 要爬取的网址
+target_url = "https://blog.sciencenet.cn/blog-2089193-1469701.html"
+target_url = "https://blog.csdn.net/2401_85375151/article/details/144805338"
+target_url = "https://zhuanlan.zhihu.com/p/19647641182"
+# 爬取内容
+result = await scraper.scrape(target_url)
+
+# 清理文本内容
+cleaned_text = clean_scraped_text(result.text)
+
+# 打印爬取结果
+print(f"URL: {result.url}")
+print(f"状态码: {result.status_code}")
+print(f"标题: {result.metadata.get('title', 'N/A')}")
+print("\n-----清理后的文本内容预览(前500字)-----")
+print(cleaned_text[:500] + "...")
+
+# 保存完整HTML到文件
+with open("sciencenet_blog.html", "w", encoding="utf-8") as f:
+f.write(result.html)
+
+# 保存原始提取的文本到文件
+with open("sciencenet_blog_raw.txt", "w", encoding="utf-8") as f:
+f.write(result.text)
+
+# 保存清理后的文本到文件
+with open("sciencenet_blog_cleaned.txt", "w", encoding="utf-8") as f:
+f.write(cleaned_text)
+
+print("\n爬取内容已保存到文件,包括原始HTML、原始文本和清理后的文本")
+
+except Exception as e:
+loguru.logger.error(f"爬虫运行出错: {str(e)}")
+finally:
+# 清理资源
+await scraper.teardown()
+
+# 运行主函数
+if __name__ == "__main__":
+asyncio.run(main())
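To see what the new clean_scraped_text helper is meant to strip, a small self-contained check; the sample string and the expected output are made up for illustration.

from trustrag.modules.deepsearch.finder.scraper import clean_scraped_text

# Made-up scrape output mixing a CSS rule, HTML tags, and a stray style property
sample = ".header { width: 100%; }\n<p>Hello <b>world</b></p>\n\n\nbackground-image: url(logo.png);"
print(clean_scraped_text(sample))  # expected to print roughly: Hello world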

trustrag/modules/deepsearch/finder/searcher.py

Lines changed: 10 additions & 6 deletions
@@ -3,6 +3,9 @@
 from dataclasses import dataclass
 from typing import List, Dict, Optional, Any
 import os
+
+import loguru
+
 from trustrag.modules.engine.websearch import DuckduckEngine, SearxngEngine
 
 
@@ -28,18 +31,18 @@ async def search(
 pass
 
 
-class DeepSearchEngine:
+class UnifiedSearchEngine:
 """
 A unified search engine that can use either DuckduckEngine or SearxngEngine
 based on the engine_type parameter.
 """
 
 def __init__(
 self,
-engine_type: str = "duckduckgo",
+engine_type: str = "searxng",
 proxy: Optional[str] = None,
 timeout: int = 20,
-searxng_url: str = os.getenv("SEARXNG_URL")
+searxng_url: str = os.getenv("SEARXNG_URL","http://localhost:8080/search")
 ) -> None:
 """
 Initialize the UnifiedSearchEngine class.
@@ -49,6 +52,7 @@ def __init__(
 :param timeout: Request timeout in seconds
 :param searxng_url: URL of the SearxNG instance if using searxng
 """
+loguru.logger.info(searxng_url)
 self.engine_type = engine_type.lower()
 
 if self.engine_type == "duckduckgo":
@@ -135,18 +139,18 @@ def print_results(self, results: List[SearchResult]) -> None:
 # duck_search.print_results(results)
 
 # 使用SearxNG引擎
-searx_search = DeepSearchEngine(
+searx_search = UnifiedSearchEngine(
 engine_type="searxng",
 searxng_url="http://localhost:8080/search"
 )
 results = searx_search.search(
-"机器学习教程",
+"大模型强化学习技术",
 top_k=3,
 language="zh-CN",
 categories="general"
 )
 print(results)
-# searx_search.print_results(results)
+searx_search.print_results(results)
 
 # async def search_example():
 # engine = DeepSearchEngine()