Skip to content

muAgent: [{#21}][{difficulty}]{场景用例:构建一个代码仓库阅读的场景} #41

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
3 changes: 3 additions & 0 deletions examples/muagent_examples/repochat/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
github_token =""
openai_api_key =""
openai_base_url = ""
6 changes: 6 additions & 0 deletions examples/muagent_examples/repochat/.env-tmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
github_token =""
OPENAI_API_KEY = ""
API_BASE_URL = 'https://api.openai.com/v1/'
model_name = 'gpt-4o'
embed_model = 'text-embedding-ada-002'
model_engine = 'openai'
8 changes: 8 additions & 0 deletions examples/muagent_examples/repochat/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
muAgent-RepoChat

## 需求
1. GitHub仓库克隆:用户提供GitHub仓库地址后,系统应自动克隆代码至指定的本地路径,便于后续分析。
2. 代码结构解析与信息提取:解析仓库结构,统计文件和文件夹数量,分析每个文件夹内容并生成描述。
3. 初始信息生成:基于文件结构,推测仓库的主要目的或功能,识别可能的启动文件及其路径,并提供如何启动仓库的指导。
4. 图数据库构建:将仓库内容以某种结构存入图数据库,以便后续的问答和查询。
5. 问答功能:基于初始化信息提供关于仓库的基本问题解答。
Empty file.
43 changes: 43 additions & 0 deletions examples/muagent_examples/repochat/codebase/coderetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os
from muagent.llm_models.llm_config import EmbedConfig, LLMConfig
from muagent.codechat.codebase_handler.codebase_handler import CodeBaseHandler

from dotenv import load_dotenv

from utils.tools import check_java_project
class CodeRetrieval:
    """Index a local Java code base and run searches against it.

    Model settings are read from environment variables after loading a
    ``.env`` file: ``OPENAI_API_KEY``, ``API_BASE_URL``, ``model_name``,
    ``embed_model`` and ``model_engine``.
    """

    # Created by ``init_codebase``; stays ``None`` until that has been called.
    cbh = None

    def __init__(self, code_path, use_nh) -> None:
        """Load the model configuration and validate ``code_path``.

        Args:
            code_path: Directory that holds the source code to index.
            use_nh: Forwarded to ``CodeBaseHandler`` as ``use_nh``. When
                truthy, the ``nb_*`` environment variables are set first
                (presumably the graph database connection - TODO confirm).

        Raises:
            KeyError: If one of the required environment variables is unset.
            Exception: If ``code_path`` does not exist, or if
                ``check_java_project`` rejects the directory.
        """
        load_dotenv()
        api_key = os.environ["OPENAI_API_KEY"]
        api_base_url = os.environ["API_BASE_URL"]
        model_name = os.environ["model_name"]
        embed_model = os.environ["embed_model"]
        model_engine = os.environ["model_engine"]
        self.llm_config = LLMConfig(
            model_name=model_name,
            model_engine=model_engine,
            api_key=api_key,
            api_base_url=api_base_url,
            temperature=0.3,
        )
        self.embed_config = EmbedConfig(
            embed_engine=model_engine,
            embed_model=embed_model,
            api_key=api_key,
            api_base_url=api_base_url,
        )
        if use_nh:
            # NOTE(review): these values are hard-coded and overwrite anything
            # already present in the environment.
            os.environ['nb_host'] = 'graphd'
            os.environ['nb_port'] = '9669'
            os.environ['nb_username'] = 'root'
            os.environ['nb_password'] = 'nebula'
            os.environ['nb_space'] = "client"
        # Make sure the code path exists before doing anything with it.
        if not os.path.exists(code_path):
            raise Exception(f"code_path {code_path} not exists")
        # Only Java projects are accepted. TODO: add other languages later.
        check_java_project(code_path)
        self.code_path = code_path
        self.lang = "java"
        self.use_nh = use_nh
        # Local graph data is stored in "repobase" next to this module.
        self.CB_ROOT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "repobase")
        os.makedirs(self.CB_ROOT_PATH, exist_ok=True)

    def init_codebase(self, codebase_name: str, do_interpret: bool = False):
        """Create the code base handler and import the code into it.

        Args:
            codebase_name: Name under which the code base is registered.
            do_interpret: Passed through to ``CodeBaseHandler.import_code``.
                It used to be ignored and ``False`` was always sent.
        """
        self.cbh = CodeBaseHandler(
            codebase_name,
            self.code_path,
            crawl_type='dir',
            use_nh=self.use_nh,
            local_graph_path=self.CB_ROOT_PATH,
            llm_config=self.llm_config,
            embed_config=self.embed_config,
            language=self.lang,
        )
        self.cbh.import_code(do_interpret=do_interpret)

    def search_code(self, query, search_type="cypher", limit=10):
        """Search the imported code base.

        Args:
            query: Query passed to the handler.
            search_type: Search strategy name forwarded to the handler.
            limit: Maximum number of results requested from the handler.

        Returns:
            Whatever ``CodeBaseHandler.search_code`` returns.

        Raises:
            RuntimeError: If ``init_codebase`` has not been called yet.
        """
        if self.cbh is None:
            raise RuntimeError("codebase is not initialized; call init_codebase() first")
        return self.cbh.search_code(query, search_type, limit=limit)
109 changes: 109 additions & 0 deletions examples/muagent_examples/repochat/codebase/prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Input template appended to the file-selection prompt. It carries two
# placeholders: ``{dictory_structure}`` (the project directory tree) and
# ``{user_issue}`` (the user's issue text); presumably filled with
# ``str.format`` - verify against the caller.
# NOTE(review): "dictory" looks like a misspelling of "directory", but the
# placeholder name is part of the runtime contract, so it is left unchanged.
analyze_project_tree_prompt_add_prompt = """
Input:
[项目目录架构]
{dictory_structure}
[用户issue]
{user_issue}
Output:

"""

# Instruction prompt that asks the model to choose, from a project directory
# tree and a user issue, the files (at most five) that may answer the issue.
# The reply must be a JSON object of the form
# {"files": {"thoughts": ..., "file_path": [...]}} that ``json.loads`` can
# parse. The text ends with one worked example based on a Django tree.
# NOTE(review): the text contains literal ``{`` / ``}`` characters, so it
# cannot be passed through ``str.format`` as-is.
analyze_files_project_tree_prompt = """
你是一名代码架构专家,根据用户提供的issue,判断项目中哪个文件可能可以回答问题。

请按照以下JSON格式进行响应:
{
"files": {
"thoughts": "用中文说明为何选择这些文件,如果没有确定的文件路径则留空。",
"file_path": ["如果确定需要修改的文件路径,一定要包含项目目录架构最外层的完整路径,请基于项目目录架构提供,最多5个"]
}
}
##NOTE:
要是路径一定要跟着项目目录架构,否则会出现问题。
django/
Gruntfile.js
scripts/
manage_translations.py
rpm-install.sh
django/
templatetags/
l10n.py
比如想要找l10n.py这个文件一定要按照这样输出:'django/django/templatetags/l10n.py'

规则:
- file_path 最多五个元素。
- 不要输出其他信息,避免使用引号(例如`, \", \'等)。
- 确保输出可以被Python的 `json.loads` 解析。
- 不要使用markdown格式,例如```json或```,只需以相应的字符串格式输出。
Input:
[项目目录架构]
django/
Gruntfile.js
.git-blame-ignore-revs
INSTALL
LICENSE
CONTRIBUTING.rst
AUTHORS
.pre-commit-config.yaml
pyproject.toml
.eslintrc
MANIFEST.in
.readthedocs.yml
.editorconfig
LICENSE.python
setup.py
.gitignore
package.json
tox.ini
.gitattributes
setup.cfg
.eslintignore
README.rst
scripts/
manage_translations.py
rpm-install.sh
django/
shortcuts.py
__init__.py
__main__.py
templatetags/
l10n.py
tz.py
cache.py
__init__.py
static.py
i18n.py
template/
library.py
__init__.py
response.py
smartif.py
context_processors.py
defaultfilters.py
engine.py
context.py
utils.py
loader.py
loader_tags.py
exceptions.py
autoreload.py
base.py
defaulttags.py


[用户issue]
New template filter `escapeseq`
Description

Following #34574, and after some conversations within the security team, it seems appropriate to provide a new template filter escapeseq which would be to escape what safeseq is to safe. An example of usage would be:
{{ some_list|escapeseq|join:"," }}
where each item of some_list is escaped before applying the join operation. This usage makes sense in a context where autoescape is off.

Output:
{
"files": {
"thoughts": "新的模板过滤器escapeseq会涉及到过滤器的具体实现文件。根据Django项目结构,这些过滤器通常定义在defaultfilters.py文件中。",
"file_path": ["django/django/template/defaultfilters.py"]
}
}
"""
4 changes: 4 additions & 0 deletions examples/muagent_examples/repochat/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
python-dotenv
gitpython
codefuse-muagent
chainlit
32 changes: 32 additions & 0 deletions examples/muagent_examples/repochat/tmp-java/Person.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package com.example.bank;

/**
 * A named customer who owns one {@link BankAccount}, created together with
 * the person.
 */
public class Person {
    private String name;
    private BankAccount account;

    /**
     * Creates a person and opens a new account bound to that person.
     *
     * @param name the person's name
     */
    public Person(String name) {
        this.name = name;
        this.account = new BankAccount(this);
    }

    /** @return the person's name */
    public String getName() {
        return this.name;
    }

    /** @return the account created for this person */
    public BankAccount getAccount() {
        return this.account;
    }

    /**
     * Deposits {@code amount} into the account and prints a confirmation line.
     *
     * @param amount the amount to deposit
     */
    public void deposit(double amount) {
        this.account.deposit(amount);
        String message = this.name + " deposited " + amount + " to their account.";
        System.out.println(message);
    }

    /**
     * Tries to withdraw {@code amount} and prints whether it succeeded.
     *
     * @param amount the amount to withdraw
     */
    public void withdraw(double amount) {
        boolean succeeded = this.account.withdraw(amount);
        if (!succeeded) {
            // The account rejected the withdrawal.
            System.out.println(this.name + " does not have enough balance to withdraw " + amount + ".");
            return;
        }
        System.out.println(this.name + " withdrew " + amount + " from their account.");
    }
}
90 changes: 90 additions & 0 deletions examples/muagent_examples/repochat/utils/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
import git
from dotenv import load_dotenv
import urllib.parse



def get_directory_structure(directory_path: str, notallow: set = None):
    """Return the file tree below ``directory_path`` as indented text.

    Every directory is printed as ``name/`` followed by its files, each level
    indented by four more spaces. Entries are listed in sorted order so the
    output is stable between runs.

    :param directory_path: str, root directory to describe
    :param notallow: set, extra file or directory names to leave out, on top
        of the built-in ``.git``, ``__pycache__``, ``.idea``, ``.github`` and
        ``.tx``; defaults to None
    :return: str, the tree text; empty when the path is empty or missing
    """
    if not directory_path:
        return ""

    excluded = {'.git', '__pycache__', '.idea', '.github', '.tx'}
    if notallow:
        excluded.update(notallow)

    # Normalising drops a trailing separator, which would otherwise give an
    # empty root label and shift every depth by one level.
    base = os.path.normpath(directory_path)
    lines = []
    for root, dirs, files in os.walk(base):
        # Pruning in place stops os.walk from descending into excluded dirs.
        dirs[:] = sorted(d for d in dirs if d not in excluded)

        # Depth comes from the path relative to the root; a textual replace
        # miscounts when the root path recurs deeper in the tree.
        relative = os.path.relpath(root, base)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1

        indent = ' ' * 4 * level
        lines.append(f"{indent}{os.path.basename(root)}/")
        sub_indent = ' ' * 4 * (level + 1)
        lines.extend(
            f"{sub_indent}{name}" for name in sorted(files) if name not in excluded
        )

    return "\n".join(lines)

# 开始检查 code_path 是否是 Java 项目
def check_java_project(code_path):
    """Verify that ``code_path`` looks like a Java project.

    A directory passes when it has a ``pom.xml`` at its top level or at least
    one ``.java`` file anywhere below it. A confirmation line is printed on
    success.

    :param code_path: directory to inspect
    :raises Exception: when neither ``pom.xml`` nor any ``.java`` file exists
    """
    pom_file = os.path.join(code_path, "pom.xml")
    if not os.path.exists(pom_file):
        # No Maven descriptor: fall back to looking for any Java source file.
        contains_java = any(
            name.endswith(".java")
            for _root, _dirs, names in os.walk(code_path)
            for name in names
        )
        if not contains_java:
            raise Exception(f"code_path {code_path} is not a Java project")
    print(f"code_path {code_path} is a Java project")

def clone_repo_with_token(repo_url, clone_to):
    """Clone a GitHub repository that requires authentication.

    The token is read from the ``github_token`` environment variable (after
    loading ``.env``) and injected into the clone URL. If the target directory
    already exists it is returned as-is and no token is needed.

    Args:
        repo_url (str): URL of the repository to clone.
        clone_to (str): Local directory under which the repository directory
            is created.

    Returns:
        str: Path of the cloned repository (``clone_to`` plus the repository
        name) on success, or an empty string on any failure.
    """
    token = None
    quoted_token = None
    try:
        # The repository name is the last path segment. Trailing slashes are
        # stripped so ".../repo/" does not yield an empty name, which would
        # make ``clone_to`` itself look like an existing clone.
        repo_name = urllib.parse.urlparse(repo_url).path.rstrip('/').split('/')[-1]
        if not repo_name:
            raise ValueError(f"Cannot determine repository name from URL: {repo_url}")

        os.makedirs(clone_to, exist_ok=True)
        cloned_path = os.path.join(clone_to, repo_name)
        if os.path.exists(cloned_path):
            # Already cloned: reuse it without requiring credentials.
            return cloned_path

        load_dotenv()
        # Read the token from the environment.
        token = os.getenv('github_token')
        if not token:
            raise ValueError("GitHub token not found in environment variables")

        # Insert the token as URL user info, only after the leading scheme.
        quoted_token = urllib.parse.quote(token, safe='')
        auth_url = repo_url
        for scheme in ("https://", "http://"):
            if repo_url.startswith(scheme):
                auth_url = f"{scheme}{quoted_token}@{repo_url[len(scheme):]}"
                break

        git.Repo.clone_from(auth_url, cloned_path)

        print(f"Repository cloned to {cloned_path}")
        return cloned_path
    except Exception as e:
        # Git errors may echo the full clone URL; keep the token out of logs.
        message = str(e)
        for secret in (token, quoted_token):
            if secret:
                message = message.replace(secret, "***")
        print(f"Failed to clone repository: {message}")
        return ''
3 changes: 3 additions & 0 deletions muagent/codechat/code_analyzer/code_static_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
'''
from muagent.codechat.code_analyzer.language_static_analysis import *


class CodeStaticAnalysis:
def __init__(self, language):
self.language = language
Expand All @@ -19,6 +20,8 @@ def analyze(self, code_dict):
'''
if self.language == 'java':
analyzer = JavaStaticAnalysis()
elif self.language == 'python':
analyzer = PythonStaticAnalysis()
else:
raise ValueError('language should be one of [java]')

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
'''

from .java_static_analysis import JavaStaticAnalysis

from .python_static_analysis import PythonStaticAnalysis

__all__ = [
'JavaStaticAnalysis'
'JavaStaticAnalysis','PythonStaticAnalysis'
]
Loading