Skip to content

Commit 4bfe6d1

Browse files
committed
update: remove useless comments
1 parent 27e191f commit 4bfe6d1

File tree

3 files changed

+16
-78
lines changed

3 files changed

+16
-78
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
*.bak
22
*.pyc
33
*.log
4-
Config.py
4+
config/Config.py

GitPrey.py

+1-63
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,7 @@ class GitPrey(object):
5353
\______/ \______| \__| \__| \__| \__|\________| \__|
5454
5555
Author: repoog
56-
Version: 2.6
57-
Create Date: 2016-03-15
58-
Update Date: 2019-05-20
56+
Version: 2.6.2
5957
Python Version: v3.6.4
6058
"""
6159

@@ -66,10 +64,6 @@ def __init__(self, keyword):
6664
self.cookies = ""
6765

6866
def search_project(self):
69-
"""
70-
Search related projects with recently indexed sort according to keyword
71-
:returns: Related projects list
72-
"""
7367
unique_project_list = []
7468
self.__auto_login(USER_NAME, PASSWORD)
7569
info_print('[*] Searching hard for projects...')
@@ -99,23 +93,12 @@ def search_project(self):
9993

10094
@staticmethod
10195
def __page_project_list(page_html):
102-
"""
103-
Get project list of one searching result page
104-
:param page_html: Html page content
105-
:returns: Project list of per page
106-
"""
10796
cur_par_html = BeautifulSoup(page_html, "lxml")
10897
project_info = cur_par_html.select("a.link-gray")
10998
page_project = [project.text.strip() for project in project_info]
11099
return page_project
111100

112101
def sensitive_info_query(self, project_string, mode):
113-
"""
114-
Search sensitive information and sensitive file from projects
115-
:param project_string: Key words string for querying
116-
:param mode: Searching mode within "content" or "filename"
117-
:returns: Code segments or file lists
118-
"""
119102
if mode == "content":
120103
# Output code line with sensitive key words like username.
121104
info_sig_list = self.__pattern_db_list(INFO_DB)
@@ -137,13 +120,6 @@ def sensitive_info_query(self, project_string, mode):
137120
return repo_file_dic
138121

139122
def __file_content_inspect(self, project_string, file_pattern, project_pattern):
140-
"""
141-
Check sensitive code in particular project
142-
:param project_string: Projects for searching
143-
:param file_pattern: File string for searching
144-
:param project_pattern: Content signature match regular
145-
:returns: Code segments
146-
"""
147123
query_string = " OR ".join(project_pattern)
148124
repo_file_dic = self.__file_name_inspect(query_string + project_string + file_pattern)
149125
repo_code_dic = {}
@@ -167,12 +143,6 @@ def __file_content_inspect(self, project_string, file_pattern, project_pattern):
167143
return repo_code_dic
168144

169145
def __file_name_inspect(self, file_query_string, print_mode=0):
170-
"""
171-
Inspect sensitive file in particular project
172-
:param file_query_string: File string for searching
173-
:param print_mode: 1 means print file, 0 means print code
174-
:returns: Files lists
175-
"""
176146
page_num = 1
177147
repo_file_dic = {}
178148
while page_num <= SCAN_DEEP[SEARCH_LEVEL - 1]:
@@ -201,11 +171,6 @@ def __file_name_inspect(self, file_query_string, print_mode=0):
201171

202172
@staticmethod
203173
def __pattern_db_list(file_path):
204-
"""
205-
Read file name pattern item from signature file
206-
:param file_path: Pattern file path
207-
:returns: Signature item list
208-
"""
209174
item_list = []
210175
with open(os.path.join(os.path.dirname(__file__), file_path), 'r') as pattern_file:
211176
item_line = pattern_file.readline()
@@ -216,10 +181,6 @@ def __pattern_db_list(file_path):
216181

217182
@staticmethod
218183
def __output_project_info(project):
219-
"""
220-
Output user information and project information of particular project
221-
:returns: None
222-
"""
223184
user_name, project_name = project.split(r"/")
224185
user_info = "[+] User Nickname: {nickname}"
225186
project_print(user_info.format(nickname=user_name))
@@ -229,10 +190,6 @@ def __output_project_info(project):
229190
project_print(project_info.format(link=HOST_NAME + project))
230191

231192
def __auto_login(self, username, password):
232-
"""
233-
Get cookie for logining GitHub
234-
:returns: None
235-
"""
236193
login_request = requests.Session()
237194
login_html = login_request.get("https://github.com/login", headers=self.headers)
238195
post_data = {}
@@ -248,11 +205,6 @@ def __auto_login(self, username, password):
248205
exit()
249206

250207
def __get_page_html(self, url):
251-
"""
252-
Get parse html page from requesting url
253-
:param url: Requesting url
254-
:returns: Parsed html page
255-
"""
256208
try:
257209
page_html = requests.get(url, headers=self.headers, cookies=self.cookies, timeout=SCAN_DEEP[SEARCH_LEVEL - 1])
258210
if page_html.status_code == 429:
@@ -267,11 +219,6 @@ def __get_page_html(self, url):
267219

268220

269221
def is_keyword_valid(keyword):
270-
"""
271-
Verify input/config keywords are valid
272-
:param keyword: Keyword for searching
273-
:returns: False if invalid, True if valid
274-
"""
275222
keyword_valid = re.match(r'^[a-zA-Z0-9].*$', keyword, re.I)
276223
if keyword_valid:
277224
return True
@@ -280,10 +227,6 @@ def is_keyword_valid(keyword):
280227

281228

282229
def init():
283-
"""
284-
Initialize GitPrey with module inspection and input inspection
285-
:return: Key words
286-
"""
287230
if not importlib.util.find_spec('lxml'):
288231
error_print('[!] Error: You have to install lxml module.')
289232
exit()
@@ -310,11 +253,6 @@ def init():
310253
return key_words
311254

312255
def project_miner(key_words):
313-
"""
314-
Search projects for content and path inspection later.
315-
:param key_words: key words for searching
316-
:return:
317-
"""
318256
# Search projects according to key words and searching level
319257
_gitprey = GitPrey(key_words)
320258
total_project_list = _gitprey.search_project()

README.md

+14-14
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ GitPrey是根据企业关键词进行项目检索以及相应敏感文件和敏
66
* 根据关键词在GitHub中进行全局代码内容和路径的搜索(in:file,path),将项目结果做项目信息去重整理得到所有关键词相关的项目,即疑似项目结果;
77
* 基于PATTERN_DB中的敏感文件名或敏感代码对所有疑似项目做文件名搜索(filename:)和代码搜索(in:file);
88
* 将匹配搜索到的结果按照项目整理输出;
9-
由于无法做到精确匹配和精确识别,因此扫描结果或存在一定的漏报(比如项目中未出现关键词路径或内容)或误报(比如第三方项目引用关键词内容)情况,其中漏报的原因还包括Github的搜索限制
9+
由于无法做到精确匹配和精确识别,因此扫描结果或存在一定的漏报(比如项目中未出现关键词路径或内容)或误报(比如第三方项目引用关键词内容)情况,其中漏报的原因还包括GitHub的搜索限制
1010
* 默认只搜索主分支代码,多数情况下是master分支;
11-
* Github最大只允许搜索1000条代码项,即100页代码;
11+
* GitHub最大只允许搜索1000条代码项,即100页代码;
1212
* 代码搜索仅搜索不大于384Kb的文件;
1313

1414
此外,不同关键词搜索的疑似项目数量不同,少则数个,多则数十个甚至数百个,并会对搜索和扫描时间造成直接影响(另一影响因素是匹配的文件名关键词数量和内容关键词数量),项目和关键词越多,扫描时间越长。因此可以根据需要进行扫描深度的选择,这一维度由GitHub最近索引(Recently Indexed)排序的代码页决定,深度越深,检索的项目数量越多,反之亦然。深度选项和说明如下:
@@ -21,8 +21,8 @@ GitPrey是根据企业关键词进行项目检索以及相应敏感文件和敏
2121
深度选择与企业扫描周期性应该成正相关,深度选择小,则相应扫描的周期性也应当较小,如深度选择为Level 1,则相应的扫描周期基于企业情况可定为每天或每周,深度选择为Level 5,则相应的扫描周期可适当延长。例如,关键词“Google”最大(Level 5)可搜索两天前上传的项目代码,而关键词“repoog”搜索结果则不足1页。
2222

2323
### 技术实现
24-
项目配置文件Config.py中需要配置使用者的Github用户名、密码:
25-
* 未登录Github进行代码搜索会因为请求速度过快(约10页代码结果页)而返回HTTP STATUE 429,即Too Many Requests的错误,因此需要登录后进行搜索;
24+
项目配置文件Config.py中需要配置使用者的GitHub用户名、密码:
25+
* 未登录GitHub进行代码搜索会因为请求速度过快(约10页代码结果页)而返回HTTP STATUS 429,即Too Many Requests的错误,因此需要登录后进行搜索;
2626
* 在项目内关键词文件名和关键词内容扫描时未采用API,原因有两点:一是搜索代码的API频率限制很大(认证后30次/分钟)无法满足快速搜索;二是某些项目关键词的搜索结果项超过100条,而API在设置per_page参数后至多支持展现100条结果项;
2727
项目配置文件Config.py中需要配置FILE_DB/INFO_DB/PASS_DB/PATH_DB项,用途如下:_
2828
* 敏感文件搜索是基于配置项中的PATH_DB内容检索特定文件的泄漏;
@@ -44,7 +44,7 @@ USAGE:
4444
pattern为搜索项文件配置目录,相关文件说明如下:
4545
* path.db:敏感文件名或文件后缀,用于搜索文件名,如:htpasswd
4646
* file.db:敏感内容关键词搜索的文件名称范围,内容搜索在该文件名称范围内进行,如:.env
47-
* info.db:敏感内容关键词(由于AND/OR/NOT操作符在Github单次搜索中最多支持五个,故关键词会进行轮询),如:password
47+
* info.db:敏感内容关键词(由于AND/OR/NOT操作符在GitHub单次搜索中最多支持五个,故关键词会进行轮询),如:password
4848

4949
### 程序更新
5050
* v1.0 初始版本
@@ -53,20 +53,20 @@ pattern为搜索项文件配置目录,相关文件说明如下:
5353
* v2.2 优化部分代码,增加项目搜索进度条,解决代码输出BUG
5454
* v2.4 优化程序目录设计,优化源码实现,增加默认文件输出
5555
* v2.5 优化代码搜索为整页代码搜索,优化颜色输出及文件输出,优化代码实现
56-
* v2.6 更新Python版本到Python3.6,修正Github页面爬虫
56+
* v2.6 更新Python版本到Python3.6,修正GitHub页面爬虫
5757

5858
***
59-
## Sensitive info scan tool of Github
59+
## Sensitive info scan tool of GitHub
6060
### Function introduction and design
61-
GitPrey is a tool for searching sensitive information or data according to company name or key word something.The design mind is from searching sensitive data leakling in Github:
61+
GitPrey is a tool for searching for sensitive information or data according to a company name or keyword. The design idea comes from searching for sensitive data leaks on GitHub:
6262
* Search code in file and path according to key word to get all related projects;
6363
* Search code in every related project to find matching file or content in PATTERN_DB;
6464
* Output all matching file information,project information and user information;
6565

6666
Note that some files may be missed or wrongly matched when using GitPrey, for the following reasons:
67-
* Only the default branch is considered by Github. In most cases, this will be the master branch.
68-
* Only files smaller than 384 KB are searchable by Github.
69-
* Github only make up to 1,000 results for each search.
67+
* Only the default branch is considered by GitHub. In most cases, this will be the master branch.
68+
* Only files smaller than 384 KB are searchable by GitHub.
69+
* GitHub only provides up to 1,000 results for each search.
7070

7171
GitPrey also provides a search level to adjust the scanning depth, ranging from Level 1 to Level 5:
7272
* Level 1: Only search 10 pages in recently indexed code results.
@@ -75,12 +75,12 @@ Gitprey also provides the search level to adjust scanning deep, it's between Lev
7575
* Level 4: Only search 70 pages in recently indexed code results.
7676
* Level 5: Only search 100 pages in recently indexed code results.
7777

78-
You can modify the Level in Config.py.To search as quick as you can,you must configure your own Github account username and password to avoid 429 ERROR which is too many requests.
78+
You can modify the Level in Config.py. To search as quickly as possible, you must configure your own GitHub account username and password to avoid the 429 (Too Many Requests) error.
7979

8080
### Tech detail introduction
8181
There are some hints to declare about technological details:
82-
* Github API is not used in searching code,because its rate limit up to 30 times per minute,even if you authenticate by access token.
83-
* Only user information crawler used Github API,it's enough for scanning speed.
82+
* The GitHub API is not used for searching code, because its rate limit is only 30 requests per minute, even if you authenticate with an access token.
83+
* Only the user-information crawler uses the GitHub API, which is sufficient for scanning speed.
8484
You have to config FILE_DB/INFO_DB/PASS_DB/PATH_DB in config.py:
8585
* PATH_DB is used to search specific file in related projects when searching file leaking.
8686
* FILE_DB and PASS_DB are used to searching sensitive content in related projects when searching content leaking, while INFO_DB and PASS_DB is used to output code line._

0 commit comments

Comments
 (0)