Skip to content

Commit 086aba0

Browse files
Refactor and enhance gitingest module for improved clarity, maintainability, and functionality.
- **Introduced the `CloneConfig` dataclass** to encapsulate cloning parameters, including `url`, `local_path`, `commit`, and `branch`.
- **Enhanced documentation** by adding detailed docstrings to the functions `check_repo_exists`, `run_git_command`, and `clone_repo`.
- **Improved error handling** by refining exception management processes.
- **Streamlined repository existence checks** for increased reliability.
- **Added the `run_git_command` function** to centralize and simplify the execution of Git commands.
- **Refactored code structure** to enhance readability and maintainability.

---

- **Replaced manual hexadecimal comparison (`"0123456789abcdefABCDEF"`)** with the `string` module by defining `HEX_DIGITS = set(string.hexdigits)`.
- **Revised the construction of the `parsed` dictionary** in the `parse_url` function for clarity.
- **Refactored the `parse_patterns` function** to store patterns in a list (`patterns`) instead of repeatedly joining and splitting them.
- **Enhanced documentation** by adding docstrings to the `override_ignore_patterns` and `parse_query` functions.
- **Removed redundant `pattern.strip()` call** in `normalize_pattern`, as this is now handled within `parse_patterns`.
- **Optimized the `override_ignore_patterns` function** by implementing set difference for unordered comparisons.
- **Improved the `parse_query` function's structure** for better readability and maintainability.

---

- **Refined `print_query`, `print_error`, and `print_success` functions** to accept only the `url` parameter, removing the dependency on the entire `query` object.
- **Eliminated the unused `request` argument** from the above functions.
- **Integrated the `CloneConfig` dataclass** for improved parameter handling.

---

- **Adopted the `CloneConfig` dataclass** for consistent parameter management.

---

- **Removed the unused `files` argument** from the `create_summary_string` function to reduce unnecessary complexity.

---

- **Simplified the `AsyncTimeoutError` class** by removing a redundant `pass` statement.

---

- **Updated tests** to utilize the `CloneConfig` dataclass and align with the newly introduced `run_git_command` function for encapsulated Git command execution.

---

- **Aligned comparison with `DEFAULT_IGNORE_PATTERNS`** to use a set difference, ensuring unordered existence comparison.
1 parent 39f30a9 commit 086aba0

File tree

9 files changed

+261
-183
lines changed

9 files changed

+261
-183
lines changed

src/gitingest/cli.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
import click
55

6-
from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
76
from gitingest.ingest import ingest
87
from gitingest.ingest_from_query import MAX_FILE_SIZE
98

@@ -37,7 +36,7 @@ def main(
3736

3837
if not output:
3938
output = "digest.txt"
40-
summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
39+
summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
4140

4241
click.echo(f"Analysis complete! Output written to: {output}")
4342
click.echo("\nSummary:")

src/gitingest/clone.py

Lines changed: 121 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,147 @@
11
import asyncio
2-
from typing import Any, Dict, Tuple
2+
from dataclasses import dataclass
3+
from typing import Optional, Tuple
34

4-
from gitingest.utils import async_timeout
5+
from gitingest.utils import AsyncTimeoutError, async_timeout
56

67
CLONE_TIMEOUT = 20
78

89

10+
@dataclass
11+
class CloneConfig:
12+
url: str
13+
local_path: str
14+
commit: Optional[str] = None
15+
branch: Optional[str] = None
16+
17+
918
async def check_repo_exists(url: str) -> bool:
19+
"""
20+
Check if a repository exists at the given URL using an HTTP HEAD request.
21+
22+
Parameters
23+
----------
24+
url : str
25+
The URL of the repository.
26+
27+
Returns
28+
-------
29+
bool
30+
True if the repository exists, False otherwise.
31+
"""
1032
proc = await asyncio.create_subprocess_exec(
1133
"curl",
1234
"-I",
1335
url,
1436
stdout=asyncio.subprocess.PIPE,
1537
stderr=asyncio.subprocess.PIPE,
1638
)
17-
stdout, stderr = await proc.communicate()
39+
stdout, _ = await proc.communicate()
1840
if proc.returncode != 0:
1941
return False
2042
# Check if stdout contains "404" status code
2143
stdout_str = stdout.decode()
2244
return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str
2345

2446

25-
@async_timeout(CLONE_TIMEOUT)
26-
async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]:
27-
if not await check_repo_exists(query['url']):
28-
raise ValueError("Repository not found, make sure it is public")
47+
async def run_git_command(*args: str) -> Tuple[bytes, bytes]:
48+
"""
49+
Executes a git command asynchronously and captures its output.
50+
51+
Parameters
52+
----------
53+
*args : str
54+
The git command and its arguments to execute.
2955
30-
if query['commit']:
31-
proc = await asyncio.create_subprocess_exec(
32-
"git",
33-
"clone",
34-
"--single-branch",
35-
query['url'],
36-
query['local_path'],
37-
stdout=asyncio.subprocess.PIPE,
38-
stderr=asyncio.subprocess.PIPE,
39-
)
40-
stdout, stderr = await proc.communicate()
41-
42-
proc = await asyncio.create_subprocess_exec(
43-
"git",
44-
"-C",
45-
query['local_path'],
46-
"checkout",
47-
query['branch'],
48-
stdout=asyncio.subprocess.PIPE,
49-
stderr=asyncio.subprocess.PIPE,
50-
)
51-
stdout, stderr = await proc.communicate()
52-
elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']:
53-
proc = await asyncio.create_subprocess_exec(
54-
"git",
55-
"clone",
56-
"--depth=1",
57-
"--single-branch",
58-
"--branch",
59-
query['branch'],
60-
query['url'],
61-
query['local_path'],
62-
stdout=asyncio.subprocess.PIPE,
63-
stderr=asyncio.subprocess.PIPE,
64-
)
65-
else:
66-
proc = await asyncio.create_subprocess_exec(
67-
"git",
68-
"clone",
69-
"--depth=1",
70-
"--single-branch",
71-
query['url'],
72-
query['local_path'],
73-
stdout=asyncio.subprocess.PIPE,
74-
stderr=asyncio.subprocess.PIPE,
75-
)
56+
Returns
57+
-------
58+
Tuple[bytes, bytes]
59+
A tuple containing the stdout and stderr of the git command.
7660
61+
Raises
62+
------
63+
RuntimeError
64+
If the git command exits with a non-zero status.
65+
"""
66+
proc = await asyncio.create_subprocess_exec(
67+
*args,
68+
stdout=asyncio.subprocess.PIPE,
69+
stderr=asyncio.subprocess.PIPE,
70+
)
7771
stdout, stderr = await proc.communicate()
72+
if proc.returncode != 0:
73+
error_message = stderr.decode().strip()
74+
raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}")
7875

7976
return stdout, stderr
77+
78+
79+
@async_timeout(CLONE_TIMEOUT)
80+
async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]:
81+
"""
82+
Clones a repository to a local path based on the provided query parameters.
83+
84+
Parameters
85+
----------
86+
config : CloneConfig
87+
A configuration object with the following attributes:
88+
- url (str): The URL of the repository.
89+
- local_path (str): The local path to clone the repository to.
90+
- commit (Optional[str]): The specific commit hash to checkout.
91+
- branch (Optional[str]): The branch to clone. If omitted, or if it is 'main' or 'master', the repository's default branch is cloned.
92+
93+
Returns
94+
-------
95+
Tuple[bytes, bytes]
96+
A tuple containing the stdout and stderr of the git commands executed.
97+
98+
Raises
99+
------
100+
ValueError
101+
If the repository does not exist or if required query parameters are missing.
102+
RuntimeError
103+
If any git command fails during execution.
104+
AsyncTimeoutError
105+
If the cloning process exceeds the specified timeout.
106+
"""
107+
# Extract and validate query parameters
108+
url: str = config.url
109+
local_path: str = config.local_path
110+
commit: Optional[str] = config.commit
111+
branch: Optional[str] = config.branch
112+
113+
if not url:
114+
raise ValueError("The 'url' parameter is required.")
115+
116+
if not local_path:
117+
raise ValueError("The 'local_path' parameter is required.")
118+
119+
# if commit and branch:
120+
# raise ValueError("Provide either 'commit' or 'branch', not both.")
121+
122+
# Check if the repository exists
123+
if not await check_repo_exists(url):
124+
raise ValueError("Repository not found, make sure it is public")
125+
126+
try:
127+
if commit:
128+
# Scenario 1: Clone and checkout a specific commit
129+
# Clone the repository without depth to ensure full history for checkout
130+
clone_cmd = ["git", "clone", "--single-branch", url, local_path]
131+
await run_git_command(*clone_cmd)
132+
133+
# Checkout the specific commit
134+
checkout_cmd = ["git", "-C", local_path, "checkout", commit]
135+
return await run_git_command(*checkout_cmd)
136+
137+
if branch and branch.lower() not in ('main', 'master'):
138+
# Scenario 2: Clone a specific branch with shallow depth
139+
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path]
140+
return await run_git_command(*clone_cmd)
141+
142+
# Scenario 3: Clone the default branch with shallow depth
143+
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path]
144+
return await run_git_command(*clone_cmd)
145+
146+
except (RuntimeError, asyncio.TimeoutError, AsyncTimeoutError):
147+
raise # Re-raise the exception

src/gitingest/ingest.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
from pathlib import Path
55
from typing import List, Optional, Tuple, Union
66

7-
from gitingest.clone import clone_repo
7+
from gitingest.clone import CloneConfig, clone_repo
88
from gitingest.ingest_from_query import ingest_from_query
99
from gitingest.parse_query import parse_query
1010

1111

1212
def ingest(
1313
source: str,
14-
max_file_size: int = 10 * 1024 * 1024,
14+
max_file_size: int = 10 * 1024 * 1024, # 10 MB
1515
include_patterns: Union[List[str], str, None] = None,
1616
exclude_patterns: Union[List[str], str, None] = None,
1717
output: Optional[str] = None,
@@ -25,7 +25,14 @@ def ingest(
2525
ignore_patterns=exclude_patterns,
2626
)
2727
if query['url']:
28-
clone_result = clone_repo(query)
28+
# Extract relevant fields for CloneConfig
29+
clone_config = CloneConfig(
30+
url=f"https://github.com/{query['slug']}.git",
31+
local_path=query['local_path'],
32+
commit=query.get('commit'),
33+
branch=query.get('branch'),
34+
)
35+
clone_result = clone_repo(clone_config)
2936
if inspect.iscoroutine(clone_result):
3037
asyncio.run(clone_result)
3138
else:

src/gitingest/ingest_from_query.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str:
278278
return output
279279

280280

281-
def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str:
281+
def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str:
282282
"""Creates a summary string with file counts and content size."""
283283
if "user_name" in query:
284284
summary = f"Repository: {query['user_name']}/{query['repo_name']}\n"
@@ -297,12 +297,7 @@ def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: L
297297
return summary
298298

299299

300-
def create_tree_structure(
301-
query: Dict[str, Any],
302-
node: Dict[str, Any],
303-
prefix: str = "",
304-
is_last: bool = True,
305-
) -> str:
300+
def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str:
306301
"""Creates a tree-like string representation of the file structure."""
307302
tree = ""
308303

@@ -386,7 +381,7 @@ def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]:
386381
if not nodes:
387382
raise ValueError(f"No files found in {path}")
388383
files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size'])
389-
summary = create_summary_string(query, nodes, files)
384+
summary = create_summary_string(query, nodes)
390385
tree = "Directory structure:\n" + create_tree_structure(query, nodes)
391386
files_content = create_file_content_string(files)
392387

0 commit comments

Comments
 (0)