
Commit 3070275

Merge pull request #119 from gomate-community/pipeline
feature@optimizing deepsearch
2 parents 5b0235f + 33eada5 commit 3070275

File tree: 9 files changed, +211 additions, −111 deletions


trustrag/modules/deepsearch/.env

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@ OLLAMA_MODEL="deepseek-r1:1.5b"
 # Uncomment if you want to use a service that mimics OpenAI's API (e.g., OpenRouter or Gemini).
 # OPENAI_ENDPOINT="http://localhost:1234/v1"
 
-# Default scraper to use (options: firecrawl, playwright_ddgs)
+# Default scraper to use (options: firecrawl, playwright_ddgs,playwright_searxng)
 DEFAULT_SCRAPER="playwright_ddgs"
 
 # -----------------------------------------------------------------------------
@@ -49,4 +49,4 @@ FIRECRAWL_API_KEY="your_firecrawl_api_key_here"
 # -----------------------------------------------------------------------------
 # SearXNG
 # -----------------------------------------------------------------------------
-SEARXNG_URL="http://192.168.1.5:11434/v1/"
+SEARXNG_URL="http://localhost:8080/search"
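For orientation, a minimal sketch of how these two settings might be read at startup. The variable names come from the .env file above; loading them via python-dotenv is an assumption, and the project may wire this up differently.

import os

from dotenv import load_dotenv  # assumption: python-dotenv is available

load_dotenv()  # pull the .env values into the process environment

# Scraper backend: firecrawl, playwright_ddgs, or playwright_searxng
default_scraper = os.getenv("DEFAULT_SCRAPER", "playwright_ddgs")

# SearXNG endpoint, used when a searxng-backed searcher/scraper is selected
searxng_url = os.getenv("SEARXNG_URL", "http://localhost:8080/search")

print(default_scraper, searxng_url)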

trustrag/modules/deepsearch/action.py

Lines changed: 8 additions & 28 deletions
@@ -35,12 +35,11 @@ async def generate_serp_queries(
 ) -> List[SerpQuery]:
 """Generate SERP queries based on user input and previous learnings."""
 
-# prompt = f"""Given the following prompt from the user, generate a list of SERP queries to research the topic. Return a JSON object with a 'queries' array field containing {num_queries} queries (or less if the original prompt is clear). Each query object should have 'query' and 'research_goal' fields. Make sure each query is unique and not similar to each other: <prompt>{query}</prompt>"""
-prompt = f"""根据用户提供的以下提示,生成SERP(Search Engine Results Page ,搜索引擎结果页面)查询列表以研究该主题。
-返回一个JSON对象,其中包含'queries'数组字段,该字段包含{num_queries}个查询(如果原始提示已经很明确,则可以少于此数量)。
-每个查询对象应有'query'和'research_goal'字段。
-确保每个查询都是唯一的,层层递进的,彼此之间不要相似。
-请注意JSON对象一定要格式正确,不要输出其他额外内容。"""
+prompt = f"""根据用户的以下提示,生成一系列SERP查询来研究该主题。
+返回一个JSON对象,其中包含一个'queries'数组字段,包含{num_queries}个查询(如果原始提示已经很明确,则可以少于这个数量)。
+每个查询对象应该有'query'和'research_goal'字段。
+请注意JSON对象一定要格式正确,不要输出其他额外内容。
+确保每个查询都是唯一的,且彼此不相似:主题<prompt>{query}</prompt>"""
 if learnings:
 # prompt += f"\n\nHere are some learnings from previous research, use them to generate more specific queries: {' '.join(learnings)}"
 prompt += f"\n\n这里是之前研究步骤的一些发现,请使用它们生成更具体的查询:{' '.join(learnings)}。请确保生成的查询与用户原始提示的语言保持一致。"
@@ -54,8 +53,6 @@ async def generate_serp_queries(
 ],
 response_format={"type": "json_object"},
 )
-loguru.logger.info("generate_serp_queries done:")
-loguru.logger.info(response)
 try:
 queries = response.get("queries", [])
 return [SerpQuery(**q) for q in queries][:num_queries]
@@ -84,15 +81,6 @@ async def process_serp_result(
 # Create the contents string separately
 contents_str = "".join(f"<content>\n{content}\n</content>" for content in contents)
 
-# prompt = (
-# f"Given the following contents from a SERP search for the query <query>{query}</query>, "
-# f"generate a list of learnings from the contents. Return a JSON object with 'learnings' "
-# f"and 'followUpQuestions' keys with array of strings as values. Include up to {num_learnings} learnings and "
-# f"{num_follow_up_questions} follow-up questions. The learnings should be unique, "
-# "concise, and information-dense, including entities, metrics, numbers, and dates.\n\n"
-# f"<contents>{contents_str}</contents>"
-# )
-
 prompt = (
 f"根据以下对查询<query>{query}</query>的SERP搜索内容,"
 f"生成从内容中得到的学习要点列表。返回一个JSON对象,包含'learnings'和'followUpQuestions'键(key),"
@@ -139,19 +127,11 @@ async def write_final_report(
 150_000,
 )
 
-# user_prompt = (
-# f"Given the following prompt from the user, write a final report on the topic using "
-# f"the learnings from research. Return a JSON object with a 'reportMarkdown' field "
-# f"containing a detailed markdown report (aim for 3+ pages). Include ALL the learnings "
-# f"from research:\n\n<prompt>{prompt}</prompt>\n\n"
-# f"Here are all the learnings from research:\n\n<learnings>\n{learnings_string}\n</learnings>"
-# )
 user_prompt = (
 f"根据以下用户提供的提示,使用研究中获得的学习要点撰写关于该主题的最终报告。返回一个JSON对象,"
-f"其中包含'reportMarkdown'字段,该字段包含详细的markdown报告(目标为3+页)。包括研究中的所有学习要点:\n\n"
+f"其中包含'reportMarkdown'字段,该字段包含详细的markdown报告(目标为3页以上),尽量内容丰富饱满。包括研究中的所有学习要点:\n\n"
 f"<prompt>{prompt}</prompt>\n\n"
 f"以下是研究中获得的所有学习要点:\n\n<learnings>\n{learnings_string}\n</learnings>"
-f"请确保生成的报告与用户原始提示({prompt})的语言保持一致。"
 )
 response = await get_client_response(
 client=client,
@@ -167,7 +147,7 @@ async def write_final_report(
 report = response.get("reportMarkdown", "")
 
 # Append sources
-urls_section = "\n\n## Sources\n\n" + "\n".join(
+urls_section = "\n\n## 来源\n\n" + "\n".join(
 [f"- {url}" for url in visited_urls]
 )
 return report + urls_section
@@ -216,7 +196,7 @@ async def process_query(serp_query: SerpQuery) -> ResearchResult:
 async with semaphore:
 try:
 # Search for content
-result = await search_service.search(serp_query.query, limit=5)
+result = await search_service.search(serp_query.query, limit=2)
 loguru.logger.info("process_query:")
 loguru.logger.info(result)
 # Collect new URLs
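The rewritten prompts above all rely on JSON-mode responses: generate_serp_queries expects the model to return a 'queries' array whose items map onto SerpQuery. A small sketch of that contract follows; the dataclass shown here is an assumed stand-in mirroring the 'query'/'research_goal' fields named in the prompt, not the project's actual definition.

from dataclasses import dataclass

@dataclass
class SerpQuery:  # assumed shape; mirrors the fields the prompt asks for
    query: str
    research_goal: str

# Example of the JSON object the prompt instructs the model to emit
response = {
    "queries": [
        {"query": "reinforcement learning for LLM alignment overview", "research_goal": "map the core methods"},
        {"query": "RLHF vs DPO trade-offs", "research_goal": "compare training approaches"},
    ]
}

# Same parsing step as in generate_serp_queries
queries = [SerpQuery(**q) for q in response.get("queries", [])][:3]
print(queries)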

trustrag/modules/deepsearch/agent/providers.py

Lines changed: 4 additions & 0 deletions
@@ -1,4 +1,6 @@
 import os
+
+import loguru
 import typer
 import json
 from openai import AsyncOpenAI
@@ -58,11 +60,13 @@ def get_model(cls, service_provider_name: Optional[str] = None) -> str:
 async def get_client_response(
 client: AsyncOpenAI, model: str, messages: list, response_format: dict
 ):
+# loguru.logger.info(messages)
 response = await client.beta.chat.completions.parse(
 model=model,
 messages=messages,
 response_format=response_format,
 )
+# loguru.logger.info(response)
 
 result = response.choices[0].message.content
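A rough usage sketch for get_client_response as touched above. The endpoint, API key, and model name are placeholders; only the function signature and the json_object response format are taken from the code.

import asyncio

from openai import AsyncOpenAI
from trustrag.modules.deepsearch.agent.providers import get_client_response

async def demo():
    # Placeholder client configuration; point base_url at whatever OpenAI-compatible service is in use
    client = AsyncOpenAI(base_url="http://localhost:1234/v1", api_key="not-a-real-key")
    result = await get_client_response(
        client=client,
        model="deepseek-r1:1.5b",  # placeholder model name
        messages=[{"role": "user", "content": "Return a JSON object with a 'queries' array."}],
        response_format={"type": "json_object"},
    )
    print(result)

# asyncio.run(demo())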

trustrag/modules/deepsearch/finder/manager.py

Lines changed: 5 additions & 5 deletions
@@ -2,15 +2,15 @@
 from typing import List, Dict, Union
 # from trustrag.modules.deepsearch.utils import logger
 import loguru
-from trustrag.modules.deepsearch.finder.searcher import SearchResult, SearchEngine, DeepSearchEngine
+from trustrag.modules.deepsearch.finder.searcher import SearchResult, SearchEngine, UnifiedSearchEngine
 from trustrag.modules.deepsearch.finder.scraper import ScrapedContent, Scraper, PlaywrightScraper
 
 
 class SearchAndScrapeManager:
 """Main class for coordinating search and scrape operations."""
 
 def __init__(self, search_engine: SearchEngine = None, scraper: Scraper = None):
-self.search_engine = search_engine or DeepSearchEngine()
+self.search_engine = search_engine or UnifiedSearchEngine()
 self.scraper = scraper or PlaywrightScraper()
 
 async def setup(self):
@@ -24,7 +24,7 @@ async def teardown(self):
 await self.scraper.teardown()
 
 async def search(
-self, query: str, num_results: int = 10, **kwargs
+self, query: str, num_results: int = 5, **kwargs
 ) -> List[SearchResult]:
 """Perform a search using the configured search engine."""
 return await self.search_engine.search_async(query, num_results, **kwargs)
@@ -36,7 +36,7 @@ async def scrape(self, url: str, **kwargs) -> ScrapedContent:
 async def search_and_scrape(
 self,
 query: str,
-num_results: int = 10,
+num_results: int = 5,
 scrape_all: bool = False,
 max_concurrent_scrapes: int = 5,
 **kwargs,
@@ -65,7 +65,7 @@ async def search_and_scrape(
 semaphore = asyncio.Semaphore(max_concurrent_scrapes)
 
 async def scrape_with_semaphore(url):
-loguru.logger.info("Scraping %s", url)
+loguru.logger.info("Scraping %s"+url)
 async with semaphore:
 return await self.scrape(url, **kwargs)
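Taken together, the defaults now resolve to UnifiedSearchEngine plus PlaywrightScraper with 5 results per query. A rough usage sketch; the query string and result handling are illustrative, while the method names follow the diff.

import asyncio

from trustrag.modules.deepsearch.finder.manager import SearchAndScrapeManager

async def demo():
    manager = SearchAndScrapeManager()  # defaults: UnifiedSearchEngine + PlaywrightScraper
    await manager.setup()
    try:
        results = await manager.search("retrieval augmented generation survey", num_results=5)
        for item in results:
            print(item)
    finally:
        await manager.teardown()

# asyncio.run(demo())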

trustrag/modules/deepsearch/finder/scraper.py

Lines changed: 88 additions & 7 deletions
@@ -1,11 +1,10 @@
 from dataclasses import dataclass
 from typing import Dict, Any
-# from trustrag.modules.deepsearch.utils import logger
 import loguru
 from abc import ABC, abstractmethod
 from playwright.async_api import async_playwright
-
-# ---- Data Models ----
+import asyncio
+import re
 
 
 @dataclass
@@ -19,9 +18,6 @@ class ScrapedContent:
 metadata: Dict[str, Any] = None
 
 
-# ---- Scraper Interfaces ----
-
-
 class Scraper(ABC):
 """Abstract base class for scrapers."""
 
@@ -73,6 +69,7 @@ async def teardown(self):
 
 async def scrape(self, url: str, **kwargs) -> ScrapedContent:
 """Scrape a URL using Playwright and return standardized content."""
+loguru.logger.info(f"{url} scraped")
 if not self.browser:
 await self.setup()
 
@@ -117,4 +114,88 @@ async def scrape(self, url: str, **kwargs) -> ScrapedContent:
 loguru.logger.error(f"Error scraping {url}: {str(e)}")
 return ScrapedContent(
 url=url, html="", text="", status_code=0, metadata={"error": str(e)}
-)
+)
+
+
+def clean_scraped_text(text):
+"""
+清理爬取的文本内容,移除HTML/CSS代码片段
+
+参数:
+text (str): 原始爬取的文本内容
+
+返回:
+str: 清理后的文本内容
+"""
+# 移除CSS样式定义
+text = re.sub(r'[a-zA-Z#\.\-\_\s,:]+ \{[^\}]*\}', '', text)
+
+# 移除HTML标签
+text = re.sub(r'<[^>]*>', '', text)
+
+# 移除URL引用
+text = re.sub(r'url\([^\)]*\)', '', text)
+
+# 移除background-image等样式属性
+text = re.sub(r'background-[a-z\-]+:[^;]*;', '', text)
+
+# 移除width, height等样式属性
+text = re.sub(r'(width|height|display|float|vertical-align):[^;]*;', '', text)
+
+# 移除只包含空白字符的行
+text = re.sub(r'^\s*$', '', text, flags=re.MULTILINE)
+
+# 移除连续的多个空行,保留单个空行
+text = re.sub(r'\n\s*\n', '\n\n', text)
+
+return text.strip()
+
+
+async def main():
+# 配置日志
+loguru.logger.add("scraper.log", rotation="10 MB")
+
+# 创建爬虫实例,headless=True表示无界面模式
+scraper = PlaywrightScraper(headless=True)
+
+try:
+# 要爬取的网址
+target_url = "https://blog.sciencenet.cn/blog-2089193-1469701.html"
+target_url = "https://blog.csdn.net/2401_85375151/article/details/144805338"
+target_url = "https://zhuanlan.zhihu.com/p/19647641182"
+# 爬取内容
+result = await scraper.scrape(target_url)
+
+# 清理文本内容
+cleaned_text = clean_scraped_text(result.text)
+
+# 打印爬取结果
+print(f"URL: {result.url}")
+print(f"状态码: {result.status_code}")
+print(f"标题: {result.metadata.get('title', 'N/A')}")
+print("\n-----清理后的文本内容预览(前500字)-----")
+print(cleaned_text[:500] + "...")
+
+# 保存完整HTML到文件
+with open("sciencenet_blog.html", "w", encoding="utf-8") as f:
+f.write(result.html)
+
+# 保存原始提取的文本到文件
+with open("sciencenet_blog_raw.txt", "w", encoding="utf-8") as f:
+f.write(result.text)
+
+# 保存清理后的文本到文件
+with open("sciencenet_blog_cleaned.txt", "w", encoding="utf-8") as f:
+f.write(cleaned_text)
+
+print("\n爬取内容已保存到文件,包括原始HTML、原始文本和清理后的文本")
+
+except Exception as e:
+loguru.logger.error(f"爬虫运行出错: {str(e)}")
+finally:
+# 清理资源
+await scraper.teardown()
+
+# 运行主函数
+if __name__ == "__main__":
+asyncio.run(main())
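To see what the new clean_scraped_text helper is meant to strip, a small self-contained check; the sample string and the expected output are made up for illustration.

from trustrag.modules.deepsearch.finder.scraper import clean_scraped_text

# Made-up scrape output mixing a CSS rule, HTML tags, and a stray style property
sample = ".header { width: 100%; }\n<p>Hello <b>world</b></p>\n\n\nbackground-image: url(logo.png);"
print(clean_scraped_text(sample))  # expected to print roughly: Hello world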

trustrag/modules/deepsearch/finder/searcher.py

Lines changed: 10 additions & 6 deletions
@@ -3,6 +3,9 @@
 from dataclasses import dataclass
 from typing import List, Dict, Optional, Any
 import os
+
+import loguru
+
 from trustrag.modules.engine.websearch import DuckduckEngine, SearxngEngine
 
 
@@ -28,18 +31,18 @@ async def search(
 pass
 
 
-class DeepSearchEngine:
+class UnifiedSearchEngine:
 """
 A unified search engine that can use either DuckduckEngine or SearxngEngine
 based on the engine_type parameter.
 """
 
 def __init__(
 self,
-engine_type: str = "duckduckgo",
+engine_type: str = "searxng",
 proxy: Optional[str] = None,
 timeout: int = 20,
-searxng_url: str = os.getenv("SEARXNG_URL")
+searxng_url: str = os.getenv("SEARXNG_URL","http://localhost:8080/search")
 ) -> None:
 """
 Initialize the UnifiedSearchEngine class.
@@ -49,6 +52,7 @@ def __init__(
 :param timeout: Request timeout in seconds
 :param searxng_url: URL of the SearxNG instance if using searxng
 """
+loguru.logger.info(searxng_url)
 self.engine_type = engine_type.lower()
 
 if self.engine_type == "duckduckgo":
@@ -135,18 +139,18 @@ def print_results(self, results: List[SearchResult]) -> None:
 # duck_search.print_results(results)
 
 # 使用SearxNG引擎
-searx_search = DeepSearchEngine(
+searx_search = UnifiedSearchEngine(
 engine_type="searxng",
 searxng_url="http://localhost:8080/search"
 )
 results = searx_search.search(
-"机器学习教程",
+"大模型强化学习技术",
 top_k=3,
 language="zh-CN",
 categories="general"
 )
 print(results)
-# searx_search.print_results(results)
+searx_search.print_results(results)
 
 # async def search_example():
 # engine = DeepSearchEngine()