codefuse-ai · XingYu-Zhong · Oct 29, 2024 · Oct 29, 2024 · Oct 29, 2024 · Oct 31, 2024
diff --git a/examples/muagent_examples/repochat/.env b/examples/muagent_examples/repochat/.env
@@ -0,0 +1,3 @@
+github_token =""
+openai_api_key =""
+openai_base_url = ""
diff --git a/examples/muagent_examples/repochat/.env-tmp b/examples/muagent_examples/repochat/.env-tmp
@@ -0,0 +1,6 @@
+github_token =""
+OPENAI_API_KEY = ""
+API_BASE_URL = 'https://api.openai.com/v1/'
+model_name = 'gpt-4o'
+embed_model = 'text-embedding-ada-002'
+model_engine = 'openai'
diff --git a/examples/muagent_examples/repochat/README.md b/examples/muagent_examples/repochat/README.md
@@ -0,0 +1,8 @@
+muAgent-RepoChat
+
+## 需求
+1. GitHub仓库克隆：用户提供GitHub仓库地址后，系统应自动克隆代码至指定的本地路径，便于后续分析。
+2. 代码结构解析与信息提取：解析仓库结构，统计文件和文件夹数量，分析每个文件夹内容并生成描述。
+3. 初始信息生成：基于文件结构，推测仓库的主要目的或功能，识别可能的启动文件及其路径，并提供如何启动仓库的指导。
+4. 图数据库构建：将仓库内容以某种结构存入图数据库，以便后续的问答和查询。
+5. 问答功能：基于初始化信息提供关于仓库的基本问题解答。
diff --git a/examples/muagent_examples/repochat/README_zh.md b/examples/muagent_examples/repochat/README_zh.md
diff --git a/examples/muagent_examples/repochat/codebase/coderetrieval.py b/examples/muagent_examples/repochat/codebase/coderetrieval.py
@@ -0,0 +1,43 @@
+import os
+from muagent.llm_models.llm_config import EmbedConfig, LLMConfig
+from muagent.codechat.codebase_handler.codebase_handler import CodeBaseHandler
+
+from dotenv import load_dotenv
+
+from utils.tools import check_java_project
+class CodeRetrieval:
+    """Build and query a code knowledge base for a local Java repository.
+
+    The constructor reads model settings from the environment and validates
+    the repository path, ``init_codebase`` creates a ``CodeBaseHandler`` and
+    imports the code, and ``search_code`` forwards queries to that handler.
+    """
+
+    def __init__(self,code_path,use_nh) -> None:
+        """Load configuration from the environment and validate ``code_path``.
+
+        :param code_path: path of the repository to index; it must exist and
+            pass ``check_java_project``.
+        :param use_nh: forwarded to ``CodeBaseHandler``; when truthy the
+            ``nb_*`` connection settings (host, port, username, password,
+            space) are written into ``os.environ`` first.
+        :raises KeyError: if one of the required environment variables
+            (``OPENAI_API_KEY``, ``API_BASE_URL``, ``model_name``,
+            ``embed_model``, ``model_engine``) is not set.
+        :raises Exception: if ``code_path`` does not exist, or if
+            ``check_java_project`` rejects it.
+        """
+        # Pull variables from a local .env file into os.environ before reading them.
+        load_dotenv()
+        api_key = os.environ["OPENAI_API_KEY"]
+        api_base_url= os.environ["API_BASE_URL"]
+        model_name = os.environ["model_name"]
+        embed_model = os.environ["embed_model"]
+        model_engine = os.environ["model_engine"]
+        self.llm_config = LLMConfig(
+            model_name=model_name, model_engine=model_engine, api_key=api_key,  api_base_url=api_base_url, temperature=0.3
+        )
+        self.embed_config = EmbedConfig(
+            embed_engine=model_engine, embed_model=embed_model,  api_key=api_key,  api_base_url=api_base_url)
+        if use_nh:
+            # NOTE(review): presumably read later by the graph-database client
+            # inside muagent; confirm where these variables are consumed.
+            os.environ['nb_host'] = 'graphd'
+            os.environ['nb_port'] = '9669'
+            os.environ['nb_username'] = 'root'
+            os.environ['nb_password'] = 'nebula'
+            os.environ['nb_space'] = "client"
+        # Make sure the repository path exists.
+        if not os.path.exists(code_path):
+            raise Exception(f"code_path {code_path} not exists")
+        # Only Java projects are accepted for now. TODO: support other languages.
+        check_java_project(code_path)
+        self.code_path = code_path
+        self.lang = "java"
+        self.use_nh = use_nh
+        # Graph data is stored in a "repobase" directory next to this module.
+        self.CB_ROOT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "repobase")
+        os.makedirs(self.CB_ROOT_PATH, exist_ok=True)
+
+    def init_codebase(self, codebase_name: str,do_interpret:bool = False):
+        """Create the ``CodeBaseHandler`` for this repository and import its code.
+
+        :param codebase_name: name under which the code base is registered.
+        :param do_interpret: passed through to ``CodeBaseHandler.import_code``.
+        """
+        self.cbh = CodeBaseHandler(codebase_name, self.code_path, crawl_type='dir', use_nh=self.use_nh, local_graph_path=self.CB_ROOT_PATH,
+                      llm_config=self.llm_config, embed_config=self.embed_config,language=self.lang)
+        # Honour the caller's choice; this was previously hard-coded to False.
+        self.cbh.import_code(do_interpret=do_interpret)
+
+    def search_code(self, query,search_type="cypher",limit=10):
+        """Forward ``query`` to the handler created by ``init_codebase``.
+
+        ``init_codebase`` must have been called first, because ``self.cbh`` is
+        only assigned there.
+
+        :param query: query passed to ``CodeBaseHandler.search_code``.
+        :param search_type: search mode passed through; defaults to ``"cypher"``.
+        :param limit: maximum number of results requested; defaults to 10.
+        :return: whatever ``CodeBaseHandler.search_code`` returns.
+        """
+        return self.cbh.search_code(query,search_type,limit=limit)
diff --git a/examples/muagent_examples/repochat/codebase/prompt.py b/examples/muagent_examples/repochat/codebase/prompt.py
@@ -0,0 +1,109 @@
+# Per-request "Input/Output" section of the file-location prompt. It holds two
+# ``{name}`` placeholders: ``dictory_structure`` (the project directory tree) and
+# ``user_issue`` (the user's question). The placeholder spelling is part of the
+# runtime string, so callers must use it as written.
+# NOTE(review): presumably filled with str.format and appended to
+# ``analyze_files_project_tree_prompt`` -- confirm against the caller.
+analyze_project_tree_prompt_add_prompt = """
+Input:
+[项目目录架构]
+{dictory_structure}
+[用户issue]
+{user_issue}
+Output:
+
+"""
+
+# Instruction prompt (written in Chinese) that asks the model to pick at most
+# five files likely to answer the user's issue, and to reply with a JSON object
+# of the form {"files": {"thoughts": ..., "file_path": [...]}} that
+# ``json.loads`` can parse. It ends with one worked example based on a Django
+# directory tree.
+# The JSON samples contain literal ``{``/``}`` characters, so this string cannot
+# be passed through ``str.format`` as is.
+analyze_files_project_tree_prompt = """
+你是一名代码架构专家，根据用户提供的issue，判断项目中哪个文件可能可以回答问题。
+
+请按照以下JSON格式进行响应：
+{
+    "files": {
+        "thoughts": "用中文说明为何选择这些文件，如果没有确定的文件路径则留空。",
+        "file_path": ["如果确定需要修改的文件路径，一定要包含项目目录架构最外层的完整路径，请基于项目目录架构提供，最多5个"]
+    }
+}
+##NOTE：
+要是路径一定要跟着项目目录架构，否则会出现问题。
+django/
+    Gruntfile.js
+    scripts/
+        manage_translations.py
+        rpm-install.sh
+    django/
+        templatetags/
+            l10n.py
+比如想要找l10n.py这个文件一定要按照这样输出：'django/django/templatetags/l10n.py'
+
+规则：
+- file_path 最多五个元素。
+- 不要输出其他信息，避免使用引号（例如`, \", \'等）。
+- 确保输出可以被Python的 `json.loads` 解析。
+- 不要使用markdown格式，例如```json或```，只需以相应的字符串格式输出。
+Input:
+[项目目录架构]
+django/
+    Gruntfile.js
+    .git-blame-ignore-revs
+    INSTALL
+    LICENSE
+    CONTRIBUTING.rst
+    AUTHORS
+    .pre-commit-config.yaml
+    pyproject.toml
+    .eslintrc
+    MANIFEST.in
+    .readthedocs.yml
+    .editorconfig
+    LICENSE.python
+    setup.py
+    .gitignore
+    package.json
+    tox.ini
+    .gitattributes
+    setup.cfg
+    .eslintignore
+    README.rst
+    scripts/
+        manage_translations.py
+        rpm-install.sh
+    django/
+        shortcuts.py
+        __init__.py
+        __main__.py
+        templatetags/
+            l10n.py
+            tz.py
+            cache.py
+            __init__.py
+            static.py
+            i18n.py
+        template/
+            library.py
+            __init__.py
+            response.py
+            smartif.py
+            context_processors.py
+            defaultfilters.py
+            engine.py
+            context.py
+            utils.py
+            loader.py
+            loader_tags.py
+            exceptions.py
+            autoreload.py
+            base.py
+            defaulttags.py
+
+
+[用户issue]
+New template filter `escapeseq`
+Description
+
+Following #34574, and after some conversations within the security team, it seems appropriate to provide a new template filter escapeseq which would be to escape what safeseq is to safe. An example of usage would be:
+{{ some_list|escapeseq|join:"," }}
+where each item of some_list is escaped before applying the join operation. This usage makes sense in a context where autoescape is off.
+
+Output:
+{
+    "files": {
+        "thoughts": "新的模板过滤器escapeseq会涉及到过滤器的具体实现文件。根据Django项目结构，这些过滤器通常定义在defaultfilters.py文件中。",
+        "file_path": ["django/django/template/defaultfilters.py"]
+    }
+}
+"""
diff --git a/examples/muagent_examples/repochat/requirements.txt b/examples/muagent_examples/repochat/requirements.txt
@@ -0,0 +1,4 @@
+python-dotenv
+gitpython
+codefuse-muagent 
+chainlit
diff --git a/examples/muagent_examples/repochat/tmp-java/Person.java b/examples/muagent_examples/repochat/tmp-java/Person.java
@@ -0,0 +1,32 @@
+package com.example.bank;
+
+/**
+ * A named person who holds one {@link BankAccount}, created together with the person.
+ */
+public class Person {
+    private String name;
+    private BankAccount account;
+
+    /**
+     * Creates a person and a new {@link BankAccount} constructed with this person.
+     *
+     * @param name the person's name, used in the printed messages
+     */
+    public Person(String name) {
+        this.name = name;
+        this.account = new BankAccount(this);
+    }
+
+    /** @return the name given at construction */
+    public String getName() {
+        return this.name;
+    }
+
+    /** @return the account created for this person */
+    public BankAccount getAccount() {
+        return this.account;
+    }
+
+    /**
+     * Deposits {@code amount} into the account and prints a confirmation line.
+     *
+     * @param amount the amount handed to {@link BankAccount#deposit}
+     */
+    public void deposit(double amount) {
+        this.account.deposit(amount);
+        String message = this.name + " deposited " + amount + " to their account.";
+        System.out.println(message);
+    }
+
+    /**
+     * Tries to withdraw {@code amount} and prints whether the account accepted it.
+     *
+     * @param amount the amount handed to {@link BankAccount#withdraw}
+     */
+    public void withdraw(double amount) {
+        boolean succeeded = this.account.withdraw(amount);
+        String message = succeeded
+                ? this.name + " withdrew " + amount + " from their account."
+                : this.name + " does not have enough balance to withdraw " + amount + ".";
+        System.out.println(message);
+    }
+}
diff --git a/examples/muagent_examples/repochat/utils/tools.py b/examples/muagent_examples/repochat/utils/tools.py
@@ -0,0 +1,90 @@
+import os
+import git
+from dotenv import load_dotenv
+import urllib.parse
+
+
+
+def get_directory_structure(directory_path:str, notallow:set=None):
+    """
+    Return the file tree under ``directory_path`` as an indented string.
+
+    Every directory is written as ``name/`` and every entry is indented four
+    spaces per nesting level, with the root directory at level 0.
+
+    :param directory_path: str, directory to walk; a trailing path separator is tolerated
+    :param notallow: set, extra file or directory names to leave out in addition
+        to the built-in ones; defaults to None
+    :return: str, one line per directory or file joined by newlines; an empty
+        string when ``directory_path`` is empty or nothing can be walked
+    """
+    excluded = {'.git', '__pycache__', '.idea','.github','.tx'}
+
+    # Merge caller-supplied names into the built-in exclusion set.
+    if notallow:
+        excluded.update(notallow)
+    if not directory_path:
+        return ""
+
+    # Normalise once so "repo/" and "repo" produce the same tree and the root
+    # label is never empty.
+    base = os.path.normpath(directory_path)
+    structure = []
+    for root, dirs, files in os.walk(base):
+        # Prune in place so os.walk does not descend into excluded directories.
+        dirs[:] = [d for d in dirs if d not in excluded]
+
+        # Depth is derived from the path relative to the base. A substring
+        # replace miscounts when the base has a trailing separator or when its
+        # text recurs deeper in the tree.
+        rel = os.path.relpath(root, base)
+        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
+        indent = ' ' * 4 * level
+        structure.append(f"{indent}{os.path.basename(root)}/")
+        sub_indent = ' ' * 4 * (level + 1)
+        # The exclusion set is documented as covering files too, so apply it here.
+        structure.extend(f"{sub_indent}{file}" for file in files if file not in excluded)
+
+    return "\n".join(structure)
+
+# Verify that code_path looks like a Java project.
+def check_java_project(code_path):
+    """Raise unless ``code_path`` has a top-level ``pom.xml`` or contains a ``.java`` file.
+
+    A confirmation line is printed when the check passes.
+
+    :param code_path: directory to inspect.
+    :raises Exception: if neither a ``pom.xml`` nor any ``.java`` file is found.
+    """
+    has_pom = os.path.exists(os.path.join(code_path, "pom.xml"))
+    if not has_pom:
+        # No Maven descriptor: fall back to scanning the tree, stopping at the
+        # first .java file encountered.
+        contains_java = any(
+            name.endswith(".java")
+            for _root, _dirs, names in os.walk(code_path)
+            for name in names
+        )
+        if not contains_java:
+            raise Exception(f"code_path {code_path} is not a Java project")
+    print(f"code_path {code_path} is a Java project")
+
+def clone_repo_with_token(repo_url, clone_to):
+    """
+    Clone a GitHub repository that requires authentication.
+
+    The token is read from the ``github_token`` environment variable (after
+    loading ``.env``) and injected into the http(s) URL used for cloning. If
+    the target directory already exists, it is returned without cloning again.
+
+    Args:
+    repo_url (str): URL of the repository.
+    clone_to (str): local directory under which the repository is cloned.
+
+    Returns:
+    str: path of the cloned repository (``clone_to`` plus the repository
+    name) on success, an empty string on any failure.
+    """
+    token = None
+    try:
+        os.makedirs(clone_to, exist_ok=True)
+        load_dotenv()
+        # Read the token from the environment.
+        token = os.getenv('github_token')
+        if not token:
+            raise ValueError("GitHub token not found in environment variables")
+
+        # Derive the repository name from the URL path. Trailing slashes and a
+        # ".git" suffix are dropped so the directory matches what `git clone`
+        # itself would create, and so an empty name cannot make the existence
+        # check below hit clone_to itself.
+        url_path = urllib.parse.urlparse(repo_url).path.rstrip('/')
+        repo_name = url_path.split('/')[-1]
+        if repo_name.endswith('.git'):
+            repo_name = repo_name[:-len('.git')]
+        if not repo_name:
+            raise ValueError(f"Cannot determine repository name from URL: {repo_url}")
+
+        # Reuse an existing checkout instead of cloning again.
+        cloned_path = os.path.join(clone_to, repo_name)
+        if os.path.exists(cloned_path):
+            return cloned_path
+
+        # Inject the token after the leading scheme only; other URL forms are
+        # passed to git unchanged.
+        auth_url = repo_url
+        for scheme in ("https://", "http://"):
+            if repo_url.startswith(scheme):
+                auth_url = f"{scheme}{token}@{repo_url[len(scheme):]}"
+                break
+
+        # Clone the repository.
+        git.Repo.clone_from(auth_url, cloned_path)
+
+        print(f"Repository cloned to {cloned_path}")
+        return cloned_path
+    except Exception as e:
+        # Best effort by design: report the failure and return ''. The error
+        # text may echo the token-bearing URL, so mask the token first.
+        message = str(e)
+        if token:
+            message = message.replace(token, "***")
+        print(f"Failed to clone repository: {message}")
+        return ''
diff --git a/muagent/codechat/code_analyzer/code_static_analysis.py b/muagent/codechat/code_analyzer/code_static_analysis.py
@@ -7,6 +7,7 @@
 '''
 from muagent.codechat.code_analyzer.language_static_analysis import *
 
+
 class CodeStaticAnalysis:
     def __init__(self, language):
         self.language = language
@@ -19,6 +20,8 @@ def analyze(self, code_dict):
         '''
         if self.language == 'java':
             analyzer = JavaStaticAnalysis()
+        elif self.language == 'python':
+            analyzer = PythonStaticAnalysis()
         else:
             raise ValueError('language should be one of [java]')
 

diff --git a/muagent/codechat/code_analyzer/language_static_analysis/__init__.py b/muagent/codechat/code_analyzer/language_static_analysis/__init__.py
@@ -7,8 +7,8 @@
 '''
 
 from .java_static_analysis import JavaStaticAnalysis
-
+from .python_static_analysis import PythonStaticAnalysis
 
 __all__ = [
-    'JavaStaticAnalysis'
+    'JavaStaticAnalysis','PythonStaticAnalysis'
     ]