diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt index 61440ca01..1e46b0b38 100644 --- a/.ci/benchmark.txt +++ b/.ci/benchmark.txt @@ -225,7 +225,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .zsh 6 872 12 .zsh-theme 1 97 1 TOTAL: 10345 16358553 11997 46620 5021 -credsweeper result_cnt : 11759, lost_cnt : 0, true_cnt : 11544, false_cnt : 215 +credsweeper result_cnt : 11761, lost_cnt : 0, true_cnt : 11546, false_cnt : 215 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- -------- API 125 3172 187 122 122 0 3359 3 0.000000 0.024000 0.999139 1.000000 0.976000 0.987854 @@ -263,7 +263,7 @@ Key 3911 15717 483 392 Nonce 93 49 0 93 92 1 48 1 0.020408 0.010753 0.985915 0.989247 0.989247 0.989247 Other 9 7447 5 0 0 7452 9 0.000000 1.000000 0.998794 0.000000 PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1938 7530 2635 1861 1855 6 10159 83 0.000590 0.042828 0.992646 0.996776 0.957172 0.976573 +Password 1938 7530 2635 1863 1857 6 10159 81 0.000590 0.041796 0.992812 0.996779 0.958204 0.977111 SQL Password 44 13 0 41 41 0 13 3 0.000000 0.068182 0.947368 1.000000 0.931818 0.964706 Salesforce Credentials 2 0 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Salt 48 75 1 45 45 0 76 3 0.000000 0.062500 0.975806 1.000000 0.937500 0.967742 @@ -276,4 +276,4 @@ Token 645 4170 453 62 Twilio Credentials 30 39 0 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 URL Credentials 218 156 208 217 217 0 364 1 0.000000 0.004587 0.998282 1.000000 0.995413 0.997701 UUID 1075 265 0 1074 1073 1 264 2 0.003774 0.001860 0.997761 0.999069 0.998140 0.998604 - 11997 46620 5021 11771 11544 215 46405 453 0.004612 0.037759 0.988604 0.981716 0.962241 0.971881 + 11997 46620 5021 11773 11546 215 46405 451 0.004612 0.037593 0.988638 0.981719 0.962407 0.971967 diff --git a/credsweeper/common/keyword_pattern.py b/credsweeper/common/keyword_pattern.py index 3927606c0..219e99325 100644 --- a/credsweeper/common/keyword_pattern.py +++ b/credsweeper/common/keyword_pattern.py @@ -3,27 +3,30 @@ class KeywordPattern: """Pattern set of keyword types""" - key_left = r"(\\[nrt]|%[0-9a-f]{2})?"\ - r"(?P(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \ + key_left = r"(\\[nrt]|%[0-9a-f]{2})?" \ + r"(?P(([`'\"]{1,8}[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \ r"(?P" # there will be inserted a keyword key_right = r")" \ - r"(&(quot|apos);|[^%:='\"`<>{?!&]*)[`'\"]*))" # - separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \ - r"(?P:( [a-z]{3,9}[?]? )?=|:|=(>|>|\\u0026gt;)|!=|===|==|=|%3d)" \ - r"(\s|\\+[tnr])*" + r"[^%:='\"`<>{?!&]*" \ + r")" \ + r"(&(quot|apos);|%[0-9a-f]{2}|[`'\"])*" \ + r")" # + separator = r"(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*" \ + r"(?P:(\s[a-z]{3,9}[?]?\s)?=|:|=(>|>|\\u0026gt;)|!=|===|==|=|%3d)" \ + r"(\s|\\{1,8}[tnr])*" # might be curly, square or parenthesis with words before wrap = r"(?P(" \ - r"(new(\s|\\+[tnr])+)?" \ + r"(new(\s|\\{1,8}[tnr]){1,8})?" \ r"([0-9a-z_.]|-(>|(&|\\\\*u0026)gt;))*" \ - r"[\[\(\{]"\ - r"(\s|\\+[tnr])*" \ + r"[\[\(\{]" \ + r"(\s|\\{1,8}[tnr])*" \ r"([0-9a-z_]{1,32}=)?" \ - r")+)?" + r"){1,8})?" string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?" left_quote = r"(?P((?P\\{1,8})?([`'\"]|&(quot|apos);)){1,4}))?" # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential - auth_keywords = r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?" + auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey)\s)?" value = r"(?P" \ r"(?(value_leftquote)" \ r"(" \ @@ -31,11 +34,15 @@ class KeywordPattern: r"(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))" \ r"|" \ r"(?!&(quot|apos);)" \ - r"(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])" \ - r"){3,8000}" \ + r"(\\{1,8}([ tnr]|[^\s`'\"])" \ + r"|" \ + r"(?P%[0-9a-f]{2})" \ + r"|" \ + r"(?(url_esc)[^\s`'\",;\\&]|[^\s`'\",;\\])" \ + r")){3,8000}" \ r"|(\{[^}]{3,8000}\})" \ r"|(<[^>]{3,8000}>)" \ - r")" + r")" # right_quote = r"(?(value_leftquote)" \ r"(?P(? re.Pattern: """Returns compiled regex pattern""" - expression = "".join([ # + expression = ''.join([ # cls.key_left, # keyword, # cls.key_right, # diff --git a/credsweeper/filters/value_allowlist_check.py b/credsweeper/filters/value_allowlist_check.py index f702491e6..1759d7c8a 100644 --- a/credsweeper/filters/value_allowlist_check.py +++ b/credsweeper/filters/value_allowlist_check.py @@ -8,22 +8,35 @@ class ValueAllowlistCheck(Filter): - """Check that patterns from the list is not present in the candidate value.""" + """Check that the patterns do not MATCH the candidate value.""" ALLOWED = [ r"ENC\(.*\)", # r"ENC\[.*\]", # r"\$\{(\*|[0-9]+|[a-z_].*)\}", # - r"\$([0-9]+\b|[a-z_]+[0-9a-z_]*)", # + r"\$[0-9]+(\s|$)", # r"\$\$[a-z_]+(\^%[0-9a-z_]+)?", # r"#\{.*\}", # r"\{\{.+\}\}", # - r"\S{0,5}\*{5,}", # r".*@@@hl@@@(암호|비번|PW|PASS)@@@endhl@@@", # ] ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED), flags=re.IGNORECASE) - ALLOWED_UNQUOTED_PATTERN = re.compile(r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$", flags=re.IGNORECASE) + + ALLOWED_QUOTED = [ + r"\$[a-z_]+[0-9a-z_]*([$\s]|$)", # + r".*\*\*\*", # + ] + + ALLOWED_QUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_QUOTED), flags=re.IGNORECASE) + + ALLOWED_UNQUOTED = [ + r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$", # + r"\$[a-z_]+[0-9a-z_]*\b", # + r".*\*\*\*\*\*", # + ] + + ALLOWED_UNQUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_UNQUOTED), flags=re.IGNORECASE) def __init__(self, config: Config = None) -> None: pass @@ -42,8 +55,11 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: if self.ALLOWED_PATTERN.match(line_data.value): return True - - if not line_data.is_well_quoted_value and self.ALLOWED_UNQUOTED_PATTERN.match(line_data.value): - return True + elif line_data.is_well_quoted_value: + if self.ALLOWED_QUOTED_PATTERN.match(line_data.value): + return True + else: + if self.ALLOWED_UNQUOTED_PATTERN.match(line_data.value): + return True return False diff --git a/experiment/main.sh b/experiment/main.sh index 0b09d8f7e..d59a523c7 100755 --- a/experiment/main.sh +++ b/experiment/main.sh @@ -51,7 +51,7 @@ if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi cd "${CREDSWEEPER_DIR}" report_file=${RESULT_DIR}/${NOW}.json -${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file} +${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file} --no-stdout cd "${CREDDATA_DIR}" .venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt diff --git a/experiment/src/prepare_data.py b/experiment/src/prepare_data.py index 9e3bb670b..4b06f4c25 100644 --- a/experiment/src/prepare_data.py +++ b/experiment/src/prepare_data.py @@ -12,7 +12,7 @@ def execute_scanner(dataset_location: str, result_location_str: str, jobs: int, """Execute CredSweeper as a separate process to make sure no global states is shared with training script""" dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.." command = f"{sys.executable} -m credsweeper --path {dataset_location}/data" \ - f" --save-json {result_location_str} --log info" \ + f" --save-json {result_location_str} --log info --no-stdout" \ f" {'--doc' if doc_target else ''}" \ f" --jobs {jobs} --sort --rules results/train_config.yaml --ml_threshold 0 --subtext" error_code = subprocess.check_call(command, shell=True, cwd=dir_path) diff --git a/tests/common/test_keyword_pattern.py b/tests/common/test_keyword_pattern.py index 82a79a3e0..b4f7a7329 100644 --- a/tests/common/test_keyword_pattern.py +++ b/tests/common/test_keyword_pattern.py @@ -92,7 +92,7 @@ def test_separator_p(self, config: Config, file_path: pytest.fixture, line: str) ['''db.setCred("{ \"password\" : \"" + SECRET + "\" }");''', ''' + SECRET + '''], ['''\\"password\\": \\"{\\\\"secret\\\\": \\\\"test\\\\"}\\"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'], # ['''"password": "{\\\\"secret\\\\": \\\\"test\\\\"}"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'], # - #normal_str = "First line.\nSecond line.\nEnd of message.\n"; + # normal_str = "First line.\nSecond line.\nEnd of message.\n"; ['''std::string password = R"multiline\\npassword";''', '''multiline\\npassword'''], # ['''const wchar_t* password = L"wchar_t*secret";''', '''wchar_t*secret'''], # ['''const char16_t* password = U"char16_t*secret";''', '''char16_t*secret'''], # @@ -114,6 +114,17 @@ def test_separator_p(self, config: Config, file_path: pytest.fixture, line: str) ['''password24=secret42\\n''', 'secret42'], # ['password = 3VNdhWT3oFo5I7faffKO\\\neEagnK7tYBcGxhla\n;', '''3VNdhWT3oFo5I7faffKO'''], ['password = "3VNdhWT3oFo5I7faffKO\n gnK7tYBcGxhla\n";', '''3VNdhWT3oFo5I7faffKO\n gnK7tYBcGxhla\n'''], + [ + "//&user%5Bemail%5D=credsweeper%40example.com&user%5Bpassword%5D=Dmdkesfdsq452%23%40!&user%5Bpassword_", + "Dmdkesfdsq452%23%40!" + ], + ["password%3dDmsfdsq452!&user%5Bpassword_", "Dmsfdsq452!"], + ["MY_TEST_PASSWORD={MY_TEST_PASSWORD}", "MY_TEST_PASSWORD"], + ["MY_TEST_PASSWORD=(MY_TEST_PASSWORD)", "MY_TEST_PASSWORD"], + ["MY_TEST_PASSWORD=", ""], # <> are used in future to detect a template + ["MY_TEST_PASSWORD=[MY_TEST_PASSWORD]", "MY_TEST_PASSWORD"], + ["MY_TEST_PASSWORD=MY_TEST&PASSWORD!", "MY_TEST&PASSWORD!"], + ["MY_TEST_PASSWORD='MY_TEST&PASSWORD!'", "MY_TEST&PASSWORD!"], ]) def test_keyword_pattern_p(self, config: Config, file_path: pytest.fixture, line: str, value: str) -> None: pattern = KeywordPattern.get_keyword_pattern("password") diff --git a/tests/filters/test_value_allowlist_check.py b/tests/filters/test_value_allowlist_check.py index d3af0d666..989c16fc7 100644 --- a/tests/filters/test_value_allowlist_check.py +++ b/tests/filters/test_value_allowlist_check.py @@ -11,6 +11,7 @@ class TestValueAllowlistCheck: "line", [ # "password = $4eCr3t", # + "pass=$((0394584039))", # "password = 'F(b7)]DAS^iCv0vqIJOvGg<5pass('''ARG", # + "password = '$34%4reGE_'", # + "password = '$D34%4reGE_'", # ]) def test_value_allowlist_check_p(self, file_path: pytest.fixture, line: str) -> None: - line_data = get_line_data(file_path, line=line, pattern=SUCCESS_LINE_PATTERN) + line_data = get_line_data(file_path=file_path, line=line, pattern=SUCCESS_LINE_PATTERN) assert ValueAllowlistCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False @pytest.mark.parametrize( "line", [ # + "pass := \"$pass2id$v=1$m=65536,t=3,p=2$2tNBg5k/rOCN2n3/kFYJ3789X\"", # "pass=get->pass(arg", # "PASS:@@@hl@@@PASS@@@endhl@@@", # "pass:='ENC(Crackle123)'", # @@ -42,7 +46,8 @@ def test_value_allowlist_check_p(self, file_path: pytest.fixture, line: str) -> "pass:test*****", # 'PASS="${*}"', # 'PASS="$123"', # + 'PASS="$A1B2C3D"', # ]) def test_value_allowlist_check_n(self, file_path: pytest.fixture, line: str) -> None: - line_data = get_line_data(file_path, line=line, pattern=SUCCESS_LINE_PATTERN) + line_data = get_line_data(file_path=file_path, line=line, pattern=SUCCESS_LINE_PATTERN) assert ValueAllowlistCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True diff --git a/tests/filters/test_value_similarity_check.py b/tests/filters/test_value_similarity_check.py index 8be5cb126..0286dab31 100644 --- a/tests/filters/test_value_similarity_check.py +++ b/tests/filters/test_value_similarity_check.py @@ -27,7 +27,7 @@ def test_value_similarity_check_p(self, password_rule: Rule, file_path: str, suc line_data = get_line_data(file_path=file_path, line=success_line, pattern=password_rule.patterns[0]) assert ValueSimilarityCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False - @pytest.mark.parametrize("line", ["password = 'password1'", "password = 'password123'"]) + @pytest.mark.parametrize("line", ["password = 'password1'", "password = 'password123'", "PWD=`pwd`"]) def test_value_similarity_check_n(self, password_rule: Rule, file_path: str, line: str) -> None: line_data = get_line_data(file_path=file_path, line=line, pattern=password_rule.patterns[0]) assert ValueSimilarityCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True