Skip to content

Commit c16d5c8

Browse files
Merge pull request #137 from jerryao/feature/deepseek-r1-integration
Feature/deepseek r1 integration
2 parents 114b59f + 488d8ce commit c16d5c8

File tree

9 files changed

+439
-39
lines changed

9 files changed

+439
-39
lines changed

.gitignore

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,91 @@ trustrag/modules/deepresearch/.env
3535
*.env
3636
examples/deep-research
3737
examples/deep-research/local-deep-research
38-
trustrag.egg-info
38+
trustrag.egg-info
39+
40+
# Byte-compiled / optimized / DLL files
41+
__pycache__/
42+
*.py[cod]
43+
*$py.class
44+
45+
# C extensions
46+
*.so
47+
48+
# Distribution / packaging
49+
.Python
50+
build/
51+
develop-eggs/
52+
dist/
53+
downloads/
54+
eggs/
55+
.eggs/
56+
lib/
57+
lib64/
58+
parts/
59+
sdist/
60+
var/
61+
wheels/
62+
*.egg-info/
63+
.installed.cfg
64+
*.egg
65+
66+
# PyInstaller
67+
# Usually these files are written in a setup.py script generated for the project
68+
*.manifest
69+
*.spec
70+
71+
# Installer logs
72+
pip-log.txt
73+
pip-delete-this-directory.txt
74+
75+
# Unit test / coverage reports
76+
htmlcov/
77+
.tox/
78+
.nox/
79+
.coverage
80+
.coverage.*
81+
.cache
82+
nosetests.xml
83+
coverage.xml
84+
*.cover
85+
.hypothesis/
86+
.pytest_cache/
87+
88+
# Jupyter Notebook
89+
.ipynb_checkpoints
90+
91+
# Environment variables and keys
92+
.env
93+
.env.*
94+
!.env.example
95+
96+
# API Keys
97+
**/config_local*.json
98+
*apikey*
99+
*api_key*
100+
101+
# Data and large files
102+
data/
103+
*.zip
104+
*.gz
105+
*.tar
106+
*.rar
107+
output.md
108+
109+
# IDE files
110+
.idea/
111+
.vscode/
112+
*.swp
113+
*.swo
114+
115+
# Logs
116+
logs/
117+
*.log
118+
119+
# Mac specific
120+
.DS_Store
121+
122+
# Documentation build
123+
_build/
124+
_static/
125+
_templates/

DEEPSEEK-R1-README.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# DeepSeek-R1 模型集成
2+
3+
本分支添加了对 SiliconFlow API 的支持,使 TrustRAG 框架能够使用 DeepSeek-R1 模型进行检索增强生成和深度研究。
4+
5+
## 主要特性
6+
7+
1. **SiliconFlow API 集成**
8+
- 添加了 SiliconFlow API 端点配置
9+
- 支持 DeepSeek-R1 等一系列高性能模型
10+
11+
2. **DeepResearch 模块增强**
12+
- 改进了响应解析机制,支持 reasoning_content 字段
13+
- 优化了异常处理,提高了系统稳定性
14+
- 添加了结构化数据转换,处理不同格式的响应
15+
16+
3. **Web 应用支持**
17+
- 在应用界面添加了 DeepSeek-R1 模型选项
18+
- 实现了根据选择的模型动态切换 API 服务
19+
- 维护了统一的用户体验
20+
21+
## 如何使用
22+
23+
### 配置 SiliconFlow API
24+
25+
在 `.env` 文件(或 `config_online.json`)中添加以下配置:
26+
27+
```bash
28+
# SiliconFlow (DeepSeek-R1)
29+
SILICONFLOW_API_KEY="your_api_key_here"
30+
SILICONFLOW_MODEL="deepseek-ai/DeepSeek-R1"
31+
SILICONFLOW_ENDPOINT="https://api.siliconflow.cn/v1"
32+
```
33+
34+
### 运行 DeepResearch
35+
36+
```bash
37+
cd trustrag/modules/deepresearch
38+
python pipeline.py
39+
```
40+
41+
在提示时选择研究主题,系统将使用 DeepSeek-R1 模型进行深度研究并生成详细报告。
42+
43+
### 使用 Web 界面
44+
45+
```bash
46+
python app.py
47+
```
48+
49+
在 Web 界面中选择 "DeepSeek-R1" 模型进行问答。
50+
51+
## 支持的模型
52+
53+
SiliconFlow API 支持多种强大的模型,包括:
54+
55+
- deepseek-ai/DeepSeek-R1 (默认)
56+
- deepseek-ai/DeepSeek-V3
57+
- Qwen/QwQ-32B
58+
- 更多模型请参考 SiliconFlow 文档
59+
60+
## 技术详情
61+
62+
本集成通过 OpenAI 兼容 API 接口调用 SiliconFlow 服务,并对 DeepSeek-R1 模型的特殊响应格式(如 reasoning_content 字段)进行了专门处理,确保了系统能够充分利用模型的推理能力。

app.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,23 @@ def predict(question,
295295
loguru.logger.info("User Question:" + question)
296296
if history is None:
297297
history = []
298+
299+
# 根据选择的模型设置API配置
300+
if large_language_model == "DeepSeek-R1":
301+
# 使用SiliconFlow API
302+
siliconflow_service = config.get_config('services.siliconflow')
303+
model_config = config.get_config('models.deepseek_r1')
304+
application.llm.base_url = siliconflow_service['base_url']
305+
application.llm.api_key = siliconflow_service['api_key']
306+
application.llm.model_name = model_config['name']
307+
else:
308+
# 使用默认DMX API
309+
dmx_service = config.get_config('services.dmx')
310+
model_config = config.get_config('models.llm')
311+
application.llm.base_url = dmx_service['base_url']
312+
application.llm.api_key = dmx_service['api_key']
313+
application.llm.model_name = model_config['name']
314+
298315
# Handle web content
299316
web_content = ''
300317
if use_web == 'Use':
@@ -493,6 +510,7 @@ def predict(question,
493510
large_language_model = gr.Dropdown(
494511
choices=[
495512
"GPT-4O-ALL",
513+
"DeepSeek-R1",
496514
],
497515
label="Large Language model",
498516
value="GPT-4O-ALL"

config_online.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
"api_key": "sk-xx",
66
"description": "DMX API 服务"
77
},
8+
"siliconflow": {
9+
"base_url": "https://api.siliconflow.cn/v1",
10+
"api_key": "sk-yfgjndsavpwcnnedlhllyfunxwsckfguirokexokstbvwnjf",
11+
"description": "SiliconFlow API 服务"
12+
},
813
"rerank": {
914
"base_url": "http://localhost:3600",
1015
"api_key": "sk-xxx",
@@ -17,6 +22,11 @@
1722
"service": "dmx",
1823
"description": "主要的 LLM 模型"
1924
},
25+
"deepseek_r1": {
26+
"name": "deepseek-ai/DeepSeek-R1",
27+
"service": "siliconflow",
28+
"description": "DeepSeek-R1 模型"
29+
},
2030
"embedding": {
2131
"name": "text-embedding-3-large",
2232
"service": "dmx",

trustrag/modules/citation/match_citation.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import jieba
55
import loguru
6+
import re
67

78
from trustrag.modules.document.utils import PROJECT_BASE
89

@@ -231,6 +232,61 @@ def ground_response(
231232
# print(json_data)
232233
return data
233234

235+
def find_citations(self, response: str = None):
236+
"""
237+
为兼容现有代码添加的方法,返回引用信息
238+
识别引用格式如 [数字] 的内容
239+
"""
240+
citation_pattern = r'\[(\d+)\]'
241+
citations = []
242+
243+
for match in re.finditer(citation_pattern, response):
244+
start, end = match.span()
245+
index = int(match.group(1))
246+
citations.append({
247+
"position": start,
248+
"citation": match.group(0),
249+
"index": index
250+
})
251+
252+
# 将内容解析为所需格式
253+
parsed_result = []
254+
last_position = 0
255+
256+
for citation in citations:
257+
# 添加引用前的文本
258+
if citation["position"] > last_position:
259+
text_content = response[last_position:citation["position"]]
260+
if text_content:
261+
parsed_result.append({
262+
"content": text_content,
263+
"type": "text"
264+
})
265+
266+
# 添加引用
267+
parsed_result.append({
268+
"content": citation["citation"],
269+
"type": "citation",
270+
"index": citation["index"]
271+
})
272+
273+
last_position = citation["position"] + len(citation["citation"])
274+
275+
# 添加最后一个引用后的剩余文本
276+
if last_position < len(response):
277+
parsed_result.append({
278+
"content": response[last_position:],
279+
"type": "text"
280+
})
281+
282+
return {
283+
"citations": citations,
284+
"parsed_result": parsed_result
285+
}
286+
287+
# 添加extract_citations作为find_citations的别名,以兼容app_fixed.py中的调用
288+
extract_citations = find_citations
289+
234290

235291
if __name__ == '__main__':
236292
mc = MatchCitation()

trustrag/modules/deepresearch/action.py

Lines changed: 65 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -125,48 +125,88 @@ async def process_serp_result(
125125

126126

127127
async def write_final_report(
128-
prompt: str,
129-
learnings: List[str],
130-
visited_urls: List[str],
131-
client: openai.OpenAI,
132-
model: str,
133-
) -> str:
134-
"""Generate final report based on all research learnings."""
135-
136-
learnings_string = trim_prompt(
137-
"\n".join([f"<learning>\n{learning}\n</learning>" for learning in learnings]),
138-
# 150_000,
139-
300_000,
140-
)
128+
prompt,
129+
learnings,
130+
visited_urls,
131+
client,
132+
model,
133+
):
134+
learnings_string = ""
135+
for i, learning in enumerate(learnings, 1):
136+
learnings_string += f"{i}. {learning}\n"
141137

142138
user_prompt = (
143-
f"根据以下用户提供的提示,使用研究中获得的学习要点撰写关于该主题的最终报告。返回一个JSON对象,"
144-
f"其中包含'reportMarkdown'字段,该字段包含详细的markdown报告(目标为3页以上),尽量内容丰富饱满。包括研究中的所有学习要点:\n\n"
145-
f"<prompt>{prompt}</prompt>\n\n"
146-
f"以下是研究中获得的所有学习要点:\n\n<learnings>\n{learnings_string}\n</learnings>"
139+
"根据以下用户提供的提示,使用研究中获得的学习要点撰写关于该主题的最终报告。"
140+
"返回一个JSON对象,其中包含'reportMarkdown'字段,该字段包含详细的markdown格式报告,至少3页。"
141+
f"\n\n提示: {prompt}\n\n学习要点:\n{learnings_string}"
147142
)
143+
148144
response = await get_client_response(
149145
client=client,
150146
model=model,
151147
messages=[
152-
{"role": "system", "content": DEEPSEARCH_SYSTEM_PROMPT},
148+
{
149+
"role": "system",
150+
"content": "你是一位专业的研究报告撰写者。你擅长将一组研究发现整合成结构化、详尽的研究报告。",
151+
},
153152
{"role": "user", "content": user_prompt},
154153
],
155154
response_format={"type": "json_object"},
156155
)
157156

158157
try:
159-
report = response.get("reportMarkdown", "")
158+
# 检查response是否为字典或列表
159+
if isinstance(response, dict):
160+
report = response.get("reportMarkdown", "")
161+
elif isinstance(response, list):
162+
# 如果是列表,尝试从中提取报告内容
163+
report = ""
164+
for item in response:
165+
if isinstance(item, dict) and "reportMarkdown" in item:
166+
report = item["reportMarkdown"]
167+
break
168+
169+
# 如果没有找到reportMarkdown,尝试构建一个简单的报告
170+
if not report:
171+
report = "# RAG研究报告\n\n"
172+
report += "## 主题介绍\n\n检索增强生成(Retrieval-Augmented Generation,RAG)是一种将检索系统与生成式AI模型结合的技术框架。\n\n"
173+
report += "## 研究发现\n\n"
174+
175+
# 添加从响应中获取的任何有用信息
176+
for item in response:
177+
if isinstance(item, dict):
178+
for key, value in item.items():
179+
if isinstance(value, str) and len(value) > 100: # 假设长文本内容可能有用
180+
report += f"### {key}\n\n{value}\n\n"
181+
else:
182+
# 备用报告
183+
report = "# RAG研究报告\n\n无法从API响应生成报告。请检查API连接。"
160184

161185
# Append sources
162186
urls_section = "\n\n## 来源\n\n" + "\n".join(
163-
[f"- {url}" for url in visited_urls]
187+
[f"- [{url}]({url})" for url in visited_urls]
164188
)
165-
return report + urls_section
166-
except json.JSONDecodeError as e:
167-
print(f"Error parsing JSON response: {e}")
168-
print(f"Raw response: {response}")
169-
return "Error generating report"
189+
190+
report = report + urls_section if visited_urls else report
191+
192+
# Save to file
193+
with open("output.md", "w", encoding="utf-8") as f:
194+
f.write(report)
195+
196+
return report
197+
except Exception as e:
198+
error_report = f"# 报告生成错误\n\n生成最终报告时出错: {str(e)}\n\n"
199+
error_report += f"## 原始查询\n\n{prompt}\n\n"
200+
error_report += f"## 收集的信息\n\n{learnings_string}\n\n"
201+
202+
# 添加调试信息
203+
error_report += f"## 调试信息\n\n```\n响应类型: {type(response)}\n响应内容: {response}\n```\n"
204+
205+
# Save to file
206+
with open("output.md", "w", encoding="utf-8") as f:
207+
f.write(error_report)
208+
209+
return error_report
170210

171211

172212
async def deep_research(

0 commit comments

Comments
 (0)