
Commit fd60d31

Merge pull request #130 from gomate-community/pipeline
Pipeline
2 parents d1ad740 + b251b17 commit fd60d31

File tree

7 files changed: +476 -322 lines changed


.gitignore

Lines changed: 3 additions & 1 deletion
@@ -30,4 +30,6 @@ examples/datasets/papers
 examples/download/models
 .gradio
 examples/datasets/arxiv/papers
-examples/projects/arxiv/papers
+examples/projects/arxiv/papers
+trustrag/modules/deepsearch/.env
+*.env

docs/mysql.md

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+## MySQL deployment
+
+```bash
+docker stop mysql
+
+docker rm -f mysql
+
+docker run --name mysql \
+-p 3306:3306 \
+--restart always \
+-v G:/Ubuntu_WSL/rag-middlewares/mysql/data:/var/lib/mysql \
+-e MYSQL_ROOT_PASSWORD=123456 \
+-d mysql:latest
+
+```
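Not part of the diff above: a minimal Python sketch for checking that the container is reachable, assuming the pymysql package is installed and the root/123456 credentials from the docker run command above.

```python
# Hypothetical connectivity check for the MySQL container started above.
# Assumes `pip install pymysql` and the root/123456 credentials from the docker run command.
import pymysql

conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="123456")
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT VERSION()")
        print(cursor.fetchone())  # e.g. ('8.x.x',)
finally:
    conn.close()
```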

trustrag/config/config_loader.py

Lines changed: 3 additions & 3 deletions
@@ -10,13 +10,13 @@ class ConfigLoader:
 
     _instance = None
     _config = None
-
-    def __new__(cls):
+
+    def __new__(cls, *args, **kwargs):
         """Singleton pattern: ensure only one configuration instance exists"""
         if cls._instance is None:
             cls._instance = super(ConfigLoader, cls).__new__(cls)
         return cls._instance
-
+
     def __init__(self,config_path):
         """Initialize the configuration loader"""
         self.config_path=config_path
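For context (not from the commit): Python passes the constructor arguments to both __new__ and __init__, so the old zero-argument __new__ would raise a TypeError as soon as ConfigLoader is instantiated with a config_path. A minimal illustrative sketch of the pattern:

```python
# Minimal sketch of the singleton pattern used by ConfigLoader (illustrative, not the project's code).
class Singleton:
    _instance = None

    def __new__(cls, *args, **kwargs):
        # __new__ receives the same arguments as __init__, so it must accept
        # (and ignore) them; otherwise Singleton("config.yaml") raises TypeError.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, config_path):
        self.config_path = config_path


a = Singleton("config.yaml")
b = Singleton("other.yaml")
print(a is b)  # True: the second call reuses (and re-initializes) the same instance
```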

trustrag/modules/citation/citation_match_llm.json

Lines changed: 260 additions & 89 deletions
Large diffs are not rendered by default.

trustrag/modules/citation/citation_match_llm_res.json

Lines changed: 150 additions & 188 deletions
Large diffs are not rendered by default.

trustrag/modules/citation/llm_citation.py

Lines changed: 41 additions & 35 deletions
@@ -1,6 +1,7 @@
 import json
+import re
 from typing import List
-import re
+
 import jieba
 import loguru
 
@@ -15,7 +16,7 @@ def cut(self, para: str):
 
         # Define the list of end symbols
         # end_symbols = ['。', '!', '?', '…', ';', '\n'] # sent
-        end_symbols = ['。', '!', '?', '…', ';', '\n']# para
+        end_symbols = ['。', '!', '?', '…', ';', '\n'] # para
 
         # Define quote pairs
         quote_pairs = {'"': '"', "'": "'", '「': '」', '『': '』'}
@@ -102,8 +103,7 @@ def highlight_common_substrings(self, response_content, select_content, min_leng
             best_match_positions = [[0, len(select_content) - 1]]
         return best_match_positions
 
-
-    def cal_common_ration(self,response,evidence):
+    def cal_common_ration(self, response, evidence):
         """
         Compute the similarity (overlap) between a paragraph in the answer and its matched evidence, based directly on the ratio of co-occurring words
         """
@@ -114,8 +114,7 @@ def cal_common_ration(self,response,evidence):
         ratio = len(overlap) / sentence_seg_cut_length
         return ratio
 
-
-    def extract_citations(self,response:str=None):
+    def extract_citations(self, response: str = None):
         """
         xxx[1]xxx[2],
         find all citation patterns like [number]
@@ -166,6 +165,7 @@ def extract_citations(self,response:str=None):
             "citations": citations,
             "parsed_result": parsed_result
         }
+
    def ground_response(
            self,
            question: str,
@@ -189,37 +189,34 @@ def ground_response(
 
         # Save to JSON file
         try:
-            output_file = "/home/yanqiang/code/citation_match_llm.json"
+            output_file = "/home/yanqiang/code/citation_match_llm_res.json"
             with open(output_file, 'w', encoding='utf-8') as f:
                 json.dump(json_data, f, ensure_ascii=False, indent=4)
         except Exception as e:
-            print(json_data)
             output_file = "citation_match_llm_res.json"
             with open(output_file, 'w', encoding='utf-8') as f:
-                loguru.logger.info(json_data)
+                # loguru.logger.info(json_data)
                 json.dump(json_data, f, ensure_ascii=False, indent=4)
         loguru.logger.info(f"Parameters saved to {output_file}")
-        citation_result=self.extract_citations(response=response)
-        parsed_result=citation_result["parsed_result"]
-        print(citation_result)
+        citation_result = self.extract_citations(response=response)
+        parsed_result = citation_result["parsed_result"]
 
         quote_list = []
-
-        for idx,citation_item in enumerate(parsed_result):
-            #todo: determine whether citation_item is of type text or citation,
+        existed_citations=[]
+        citation_indices_map = {}
+        start_indices = 0
+        for idx, citation_item in enumerate(parsed_result):
+            # todo: determine whether citation_item is of type text or citation,
            # if the current citation_item is text and the next item is a citation, then best_idx is:
            # best_idx=parsed_result[idx+1]["index"]
-            if idx<=len(parsed_result)-2:
+            if idx <= len(parsed_result) - 2:
                 if citation_item["type"] == "text":
-                    if parsed_result[idx+1]["type"] == "citation":
-
-                        best_idx=parsed_result[idx+1]["index"]# this is the real citation number in selected_idx plus 1, e.g. 38
-                        best_idx=selected_idx.index((int(best_idx)-1))#
-
-                        print(best_idx)
-                        response_content=citation_item["content"]
-                        select_content=selected_docs[best_idx]["content"]
-
+                    if parsed_result[idx + 1]["type"] == "citation":
+                        raw_idx = parsed_result[idx + 1]["index"]  # this is the real citation number in selected_idx plus 1, e.g. 38
+                        best_idx = selected_idx.index((int(raw_idx) - 1)) #
+                        # loguru.logger.info(f"raw_idx:{raw_idx},best_idx:{best_idx}")
+                        response_content = citation_item["content"]
+                        select_content = selected_docs[best_idx]["content"]
                         highlighted_start_end = self.highlight_common_substrings(response_content, select_content)
                         group_item = {
                             "doc_id": selected_docs[best_idx]["doc_id"],
@@ -229,30 +226,39 @@ def ground_response(
                             "doc_title": selected_docs[best_idx]["newsinfo"]["title"],
                             # "chk_content": selected_docs[best_idx]['content'],
                             "chk_content": select_content,
-                            "best_ratio": self.cal_common_ration(response_content,select_content),
+                            "best_ratio": self.cal_common_ration(response_content, select_content),
                             "highlight": highlighted_start_end,
                         }
-
                         group_data = {
                             "doc_list": [group_item],
                             "chk_content": group_item["chk_content"],
                             "highlight": group_item["highlight"],
                         }
-                        quote_list.append(group_data)
-
-
-        response_result=''.join([item["content"] for item in citation_result["parsed_result"]])
+                        if start_indices not in citation_result["citations"] and group_data["chk_content"] not in existed_citations:
+                            quote_list.append(group_data)
+                            existed_citations.append(group_data["chk_content"])
+                            citation_indices_map[raw_idx] = start_indices
+                            start_indices += 1
+
+        loguru.logger.info(citation_indices_map)
+        loguru.logger.info(len(quote_list))
+        final_responses=[]
+        for item in citation_result["parsed_result"]:
+            if item["type"] == "text":
+                final_responses.append(item["content"])
+            else:
+                citation_ind=citation_indices_map[item["index"]]+1
+                final_responses.append(f"[{citation_ind}]")
+        response_result = ''.join(final_responses)
         data = {'result': response_result, 'quote_list': quote_list, 'summary': ''}
-
         # Save to JSON file
         json_data['result'] = response_result
         json_data['quote_list'] = quote_list
-        output_file = "citation_match_llm_res.json"
+        # loguru.logger.info(response_result)
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(json_data, f, ensure_ascii=False, indent=4)
         loguru.logger.info(f"Parameters saved to {output_file}")
-
-
         return data
 
 
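For context (not from the commit), a simplified sketch of the renumbering idea this hunk introduces: raw [n] markers in the LLM response are mapped to consecutive display numbers in order of first appearance, and the answer text is rebuilt with the new markers. The commit's version additionally deduplicates by chunk content and checks citation_result["citations"]; the parsed_result sample below is illustrative only.

```python
# Illustrative sketch of the citation renumbering step added in ground_response:
# raw citation indices become consecutive display numbers in order of first appearance.
parsed_result = [
    {"type": "text", "content": "Solar output rose in 2024"},
    {"type": "citation", "index": "38"},
    {"type": "text", "content": " and storage costs fell"},
    {"type": "citation", "index": "12"},
]

citation_indices_map = {}  # raw index -> zero-based display slot
final_responses = []
for item in parsed_result:
    if item["type"] == "text":
        final_responses.append(item["content"])
    else:
        # first occurrence of a raw index gets the next free slot
        slot = citation_indices_map.setdefault(item["index"], len(citation_indices_map))
        final_responses.append(f"[{slot + 1}]")

print("".join(final_responses))
# Solar output rose in 2024[1] and storage costs fell[2]
```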

trustrag/modules/document/utils.py

Lines changed: 4 additions & 6 deletions
@@ -12,11 +12,9 @@
 import os
 import pathlib
 import re
-import chardet
-
-import tiktoken
+from typing import Union
 
-import pathlib
+import chardet
 
 # Get the path of the current file
 current_path = pathlib.Path(__file__).resolve()
@@ -34,7 +32,6 @@
 project_root_str = str(project_root)
 print(f"Project root directory: {project_root_str}")
 
-
 PROJECT_BASE = project_root_str
 all_codecs = [
     'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -144,6 +141,8 @@ def findMaxTm(fnm):
     except Exception as e:
         pass
     return m
+
+
 def get_encoding(file: Union[str, bytes]) -> str:
     """
     Detects the encoding of a given file.
@@ -158,7 +157,6 @@ def get_encoding(file: Union[str, bytes]) -> str:
         tmp = chardet.detect(f.read())
         return tmp['encoding']
 
-
 # # https://stackoverflow.com/questions/76106366/how-to-use-tiktoken-in-offline-mode-computer
 # tiktoken_cache_dir = "/data/users/searchgpt/yq/GoMate/data/docs"
 # os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
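For context (not from the commit), a standalone sketch of the chardet-based detection that get_encoding wraps; the helper name below is illustrative, not the project's API.

```python
# Standalone sketch of chardet-based encoding detection
# (illustrative; the project's version lives in trustrag/modules/document/utils.py).
import chardet


def detect_encoding(path: str) -> str:
    """Return the most likely text encoding of the file at `path`."""
    with open(path, "rb") as f:
        return chardet.detect(f.read())["encoding"]


# e.g. read a file whose encoding is unknown ahead of time
# text = open("some_doc.txt", encoding=detect_encoding("some_doc.txt")).read()
```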
