Skip to content

Commit 743c415

Browse files
committed
update: update crawling and fix #19 and #24
1 parent b8249fd commit 743c415

File tree

6 files changed

+37
-32
lines changed

6 files changed

+37
-32
lines changed

.gitignore

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
*.bak
2-
32
*.pyc
4-
*.log
3+
*.log
4+
Config.py

GitPrey.py

+23-21
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
print("[!]Error: You have to install BeautifulSoup module.")
1414
exit()
1515

16+
import os
1617
import re
1718
import math
1819
import sys
@@ -61,8 +62,7 @@ class GitPrey(object):
6162
def __init__(self, keyword):
6263
self.keyword = keyword
6364
self.search_url = "https://github.com/search?o=desc&p={page}&q={keyword}&ref=searchresults&s=indexed&type=Code&utf8=%E2%9C%93"
64-
self.headers = {
65-
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"}
65+
self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"}
6666
self.cookies = ""
6767

6868
def search_project(self):
@@ -72,7 +72,7 @@ def search_project(self):
7272
"""
7373
unique_project_list = []
7474
self.__auto_login(USER_NAME, PASSWORD)
75-
info_print('[*] Searching projects hard...')
75+
info_print('[*] Searching hard for projects...')
7676

7777
# Get unique project list of first page searched results
7878
total_progress = SCAN_DEEP[SEARCH_LEVEL - 1]
@@ -105,16 +105,16 @@ def __page_project_list(page_html):
105105
:returns: Project list of per page
106106
"""
107107
cur_par_html = BeautifulSoup(page_html, "lxml")
108-
project_info = cur_par_html.select("a.text-bold")
109-
page_project = [project.text for project in project_info]
108+
project_info = cur_par_html.select("a.link-gray")
109+
page_project = [project.text.strip() for project in project_info]
110110
return page_project
111111

112112
def sensitive_info_query(self, project_string, mode):
113113
"""
114114
Search sensitive information and sensitive file from projects
115115
:param project_string: Key words string for querying
116116
:param mode: Searching mode within "content" or "filename"
117-
:returns: None
117+
:returns: Code segments or file lists
118118
"""
119119
if mode == "content":
120120
# Output code line with sensitive key words like username.
@@ -132,16 +132,17 @@ def sensitive_info_query(self, project_string, mode):
132132
if mode == "filename":
133133
# Search project according to file path.
134134
path_sig_list = self.__pattern_db_list(PATH_DB)
135-
path_string = " filename:" + " filename:".join(path_sig_list) + project_string
135+
path_string = "filename:" + " filename:".join(path_sig_list) + project_string
136136
repo_file_dic = self.__file_name_inspect(path_string, print_mode=1)
137137
return repo_file_dic
138138

139139
def __file_content_inspect(self, project_string, file_pattern, project_pattern):
140140
"""
141141
Check sensitive code in particular project
142-
:param content_query_string: Content string for searching
143-
:param info_sig_match: information signature match regular
144-
:returns: None
142+
:param project_string: Projects for searching
143+
:param file_pattern: File string for searching
144+
:param project_pattern: Content signature match regular
145+
:returns: Code segments
145146
"""
146147
query_string = " OR ".join(project_pattern)
147148
repo_file_dic = self.__file_name_inspect(query_string + project_string + file_pattern)
@@ -169,15 +170,16 @@ def __file_name_inspect(self, file_query_string, print_mode=0):
169170
"""
170171
Inspect sensitive file in particular project
171172
:param file_query_string: File string for searching
172-
:returns: None
173+
:param print_mode: 1 means print file, 0 means print code
174+
:returns: Files lists
173175
"""
174176
page_num = 1
175177
repo_file_dic = {}
176178
while page_num <= SCAN_DEEP[SEARCH_LEVEL - 1]:
177179
check_url = self.search_url.format(page=page_num, keyword=file_query_string)
178180
page_html = self.__get_page_html(check_url)
179181
project_html = BeautifulSoup(page_html, 'lxml')
180-
repo_list = project_html.select('div .min-width-0 > a:nth-of-type(2)')
182+
repo_list = project_html.select('a[data-hydro-click-hmac]')
181183
if not repo_list:
182184
break
183185
# Handle file links for each project
@@ -205,7 +207,7 @@ def __pattern_db_list(file_path):
205207
:returns: Signature item list
206208
"""
207209
item_list = []
208-
with open(file_path, 'r') as pattern_file:
210+
with open(os.path.join(os.path.dirname(__file__), file_path), 'r') as pattern_file:
209211
item_line = pattern_file.readline()
210212
while item_line:
211213
item_list.append(item_line.strip())
@@ -239,7 +241,7 @@ def __auto_login(self, username, password):
239241
for item in input_items:
240242
post_data[item.get('name')] = item.get('value')
241243
post_data['login'], post_data['password'] = username, password
242-
login_request.post("https://github.com/session", data=post_data, headers=self.headers)
244+
login_request.post("https://github.com/session", data=post_data, cookies=login_html.cookies, headers=self.headers)
243245
self.cookies = login_request.cookies
244246
if self.cookies['logged_in'] == 'no':
245247
error_print('[!] Error: Login Github failed, please check account in config file.')
@@ -280,18 +282,16 @@ def is_keyword_valid(keyword):
280282
def init():
281283
"""
282284
Initialize GitPrey with module inspection and input inspection
283-
:return: None
285+
:return: Key words
284286
"""
285287
if not importlib.util.find_spec('lxml'):
286-
error_print('[!]Error: You have to install lxml module.')
288+
error_print('[!] Error: You have to install lxml module.')
287289
exit()
288290

289291
# Get command parameters for searching level and key words
290292
parser = argparse.ArgumentParser(description="Searching sensitive file and content in GitHub.")
291-
parser.add_argument("-l", "--level", type=int, choices=range(1, 6), default=1, metavar="level",
292-
help="Set search level within 1~5, default is 1.")
293-
parser.add_argument("-k", "--keywords", metavar="keywords", required=True,
294-
help="Set key words to search projects.")
293+
parser.add_argument("-l", "--level", type=int, choices=range(1, 6), default=1, metavar="level", help="Set search level within 1~5, default is 1.")
294+
parser.add_argument("-k", "--keywords", metavar="keywords", required=True, help="Set key words to search projects.")
295295
args = parser.parse_args()
296296

297297
SEARCH_LEVEL = args.level if args.level else 1
@@ -309,7 +309,6 @@ def init():
309309

310310
return key_words
311311

312-
313312
def project_miner(key_words):
314313
"""
315314
Search projects for content and path inspection later.
@@ -323,6 +322,9 @@ def project_miner(key_words):
323322
project_info_output = "\n[*] Found {num} public projects related to the key words.\n"
324323
info_print(project_info_output.format(num=len(total_project_list)))
325324

325+
if (len(total_project_list) == 0):
326+
exit(0)
327+
326328
# Join all projects to together to search
327329
repo_string = " repo:" + " repo:".join(total_project_list)
328330

README.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
## GitHub敏感信息扫描工具
22
![](https://img.shields.io/github/license/repoog/GitPrey.svg)
33
[![Twitter](https://img.shields.io/twitter/url/https/github.com/repoog/GitPrey.svg?style=social)](https://twitter.com/intent/tweet?text=Wow:&url=https%3A%2F%2Fgithub.com%2Frepoog%2FGitPrey)
4-
### 功能设计说明
4+
### 功能设计
55
GitPrey是根据企业关键词进行项目检索以及相应敏感文件和敏感文件内容扫描的工具,其设计思路如下:
66
* 根据关键词在GitHub中进行全局代码内容和路径的搜索(in:file,path),将项目结果做项目信息去重整理得到所有关键词相关的项目,即疑似项目结果;
77
* 基于PATTERN_DB中的敏感文件名或敏感代码对所有疑似项目做文件名搜索(filename:)和代码搜索(in:file);
@@ -20,15 +20,15 @@ GitPrey是根据企业关键词进行项目检索以及相应敏感文件和敏
2020

2121
深度选择与企业扫描周期性应该成正相关,深度选择小,则相应扫描的周期性也应当较小,如深度选择为Level 1,则相应的扫描周期基于企业情况可定为每天或每周,深度选择为Level 5,则相应的扫描周期可适当延长。例如,关键词“Google”最大(Level 5)可搜索两天前上传的项目代码,而关键词“repoog”搜索结果则不足1页。
2222

23-
### 技术实现说明
23+
### 技术实现
2424
项目配置文件Config.py中需要配置使用者的Github用户名、密码:
2525
* 未登录Github进行代码搜索会因为请求速度过快(约10页代码结果页)而返回HTTP STATUE 429,即Too Many Requests的错误,因此需要登录后进行搜索;
2626
* 在项目内关键词文件名和关键词内容扫描时未采用API,原因有两点:一是搜索代码的API频率限制很大(认证后30次/分钟)无法满足快速搜索;二是某些项目关键词的搜索结果项超过100条,而API在设置per_page参数后至多支持展现100条结果项;
2727
项目配置文件Config.py中需要配置FILE_DB/INFO_DB/PASS_DB/PATH_DB项,用途如下:
2828
* 敏感文件搜索是基于配置项中的PATH_DB内容检索特定文件的泄漏;
2929
* 敏感内容搜索是基于PASS_DB和FILE_DB进行检索,再根据INFO_DB和PASS_DB输出相关代码行;
3030

31-
### 程序使用帮助
31+
### 程序使用
3232
GitPrey v2.2版本后去除了ACCESS_TOKEN的配置以及配置文件中的SEARCH_LEVEL和KEYWORDS配置项,改用命令行参数方式执行:
3333
```
3434
USAGE:
@@ -40,13 +40,13 @@ USAGE:
4040
* -k:必填参数,用于设置搜索关键词,若关键词中包含空白字符,需用双引号将关键词括起来;
4141
* -h:帮助信息。
4242

43-
### 文件配置说明
43+
### 文件配置
4444
pattern为搜索项文件配置目录,相关文件说明如下:
4545
* path.db:敏感文件名或文件后缀,用于搜索文件名,如:htpasswd
4646
* file.db:敏感内容关键词搜索的文件名称范围,内容搜索在该文件名称范围内进行,如:.env
4747
* info.db:敏感内容关键词(由于AND/OR/NOT操作符在Github单次搜索中最多支持五个,故关键词会进行轮询),如:password
4848

49-
### 程序更新列表
49+
### 程序更新
5050
* v1.0 初始版本
5151
* v2.0 更新搜索设计和算法
5252
* v2.1 更新搜索结果输出展现

config/Config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@
55

66
# GitHub account config for searching
77
USER_NAME = ""
8-
PASSWORD = ""
8+
PASSWORD = ""

include/ColorPrint.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@
77
exit()
88

99
import logging
10+
import os
1011

1112
init(autoreset=True)
1213

1314
logger = logging.getLogger('')
1415
logger.setLevel(logging.INFO)
15-
file_handle = logging.FileHandler('GitPrey.log')
16+
file_handle = logging.FileHandler(os.path.join(os.path.dirname(__file__), '../GitPrey.log'))
1617
file_handle.setLevel(logging.INFO)
1718
formatter = logging.Formatter('%(message)s')
1819
file_handle.setFormatter(formatter)
@@ -28,7 +29,6 @@ def error_print(string):
2829
def info_print(string):
2930
# Print information with green color
3031
print(Fore.GREEN + string)
31-
logger.info(string)
3232

3333

3434
def project_print(string):

pattern/info.db

+4-1
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,7 @@ jdbc
1212
password
1313
passwd
1414
pass
15-
pwd
15+
pwd
16+
public_key
17+
publickey
18+
private_key

0 commit comments

Comments
 (0)