1
1
import os
2
2
import uuid
3
- from typing import Any , Dict , List , Optional , Union
4
3
4
+ from urllib .parse import unquote
5
+ from typing import Any , Dict , List , Optional , Union
5
6
from gitingest .ignore_patterns import DEFAULT_IGNORE_PATTERNS
6
7
7
8
TMP_BASE_PATH = "../tmp"
@@ -22,6 +23,8 @@ def parse_url(url: str) -> Dict[str, Any]:
22
23
}
23
24
24
25
url = url .split (" " )[0 ]
26
+ url = unquote (url ) # Decode URL-encoded characters
27
+
25
28
if not url .startswith ('https://' ):
26
29
url = 'https://' + url
27
30
@@ -36,19 +39,34 @@ def parse_url(url: str) -> Dict[str, Any]:
36
39
parsed ["user_name" ] = path_parts [0 ]
37
40
parsed ["repo_name" ] = path_parts [1 ]
38
41
39
- # Keep original URL format
42
+ # Keep original URL format but with decoded components
40
43
parsed ["url" ] = f"https://{ domain } /{ parsed ['user_name' ]} /{ parsed ['repo_name' ]} "
41
44
parsed ['slug' ] = f"{ parsed ['user_name' ]} -{ parsed ['repo_name' ]} "
42
45
parsed ["id" ] = str (uuid .uuid4 ())
43
46
parsed ["local_path" ] = f"{ TMP_BASE_PATH } /{ parsed ['id' ]} /{ parsed ['slug' ]} "
44
47
45
48
if len (path_parts ) > 3 :
46
- parsed ["type" ] = path_parts [2 ]
47
- parsed ["branch" ] = path_parts [3 ]
48
- if len (parsed ['branch' ]) == 40 and all (c in '0123456789abcdefABCDEF' for c in parsed ['branch' ]):
49
- parsed ["commit" ] = parsed ['branch' ]
50
49
51
- parsed ["subpath" ] = "/" + "/" .join (path_parts [4 :])
50
+ parsed ["type" ] = path_parts [2 ] # Usually 'tree' or 'blob'
51
+
52
+ # Find the commit hash or reconstruct the branch name
53
+ remaining_parts = path_parts [3 :]
54
+ if remaining_parts [0 ] and len (remaining_parts [0 ]) == 40 and all (c in '0123456789abcdefABCDEF' for c in remaining_parts [0 ]):
55
+ parsed ["commit" ] = remaining_parts [0 ]
56
+ parsed ["subpath" ] = "/" + "/" .join (remaining_parts [1 :]) if len (remaining_parts ) > 1 else "/"
57
+ else :
58
+ # Handle branch names with slashes and special characters
59
+ for i , part in enumerate (remaining_parts ):
60
+ if part in ('tree' , 'blob' ):
61
+ # Found another type indicator: the parts before it form the branch name; the indicator and the part right after it are skipped, and whatever follows becomes the subpath
62
+ parsed ["branch" ] = "/" .join (remaining_parts [:i ])
63
+ parsed ["subpath" ] = "/" + "/" .join (remaining_parts [i + 2 :]) if len (remaining_parts ) > i + 2 else "/"
64
+ break
65
+ else :
66
+ # No additional type indicator found, assume everything is part of the branch name
67
+ parsed ["branch" ] = "/" .join (remaining_parts )
68
+ parsed ["subpath" ] = "/"
69
+
52
70
53
71
return parsed
54
72
0 commit comments