Skip to content

Commit

Permalink
Keyword pattern, filters improvement
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Feb 20, 2025
1 parent 423532f commit ab99f97
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 31 deletions.
6 changes: 3 additions & 3 deletions .ci/benchmark.txt
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.zsh 6 872 12
.zsh-theme 1 97 1
TOTAL: 10345 16358553 11997 46620 5021
credsweeper result_cnt : 11759, lost_cnt : 0, true_cnt : 11544, false_cnt : 215
credsweeper result_cnt : 11761, lost_cnt : 0, true_cnt : 11546, false_cnt : 215
Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- --------
API 125 3172 187 122 122 0 3359 3 0.000000 0.024000 0.999139 1.000000 0.976000 0.987854
Expand Down Expand Up @@ -263,7 +263,7 @@ Key 3911 15717 483 392
Nonce 93 49 0 93 92 1 48 1 0.020408 0.010753 0.985915 0.989247 0.989247 0.989247
Other 9 7447 5 0 0 7452 9 0.000000 1.000000 0.998794 0.000000
PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041
Password 1938 7530 2635 1861 1855 6 10159 83 0.000590 0.042828 0.992646 0.996776 0.957172 0.976573
Password 1938 7530 2635 1863 1857 6 10159 81 0.000590 0.041796 0.992812 0.996779 0.958204 0.977111
SQL Password 44 13 0 41 41 0 13 3 0.000000 0.068182 0.947368 1.000000 0.931818 0.964706
Salesforce Credentials 2 0 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Salt 48 75 1 45 45 0 76 3 0.000000 0.062500 0.975806 1.000000 0.937500 0.967742
Expand All @@ -276,4 +276,4 @@ Token 645 4170 453 62
Twilio Credentials 30 39 0 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
URL Credentials 218 156 208 217 217 0 364 1 0.000000 0.004587 0.998282 1.000000 0.995413 0.997701
UUID 1075 265 0 1074 1073 1 264 2 0.003774 0.001860 0.997761 0.999069 0.998140 0.998604
11997 46620 5021 11771 11544 215 46405 453 0.004612 0.037759 0.988604 0.981716 0.962241 0.971881
11997 46620 5021 11773 11546 215 46405 451 0.004612 0.037593 0.988638 0.981719 0.962407 0.971967
37 changes: 22 additions & 15 deletions credsweeper/common/keyword_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,46 @@

class KeywordPattern:
"""Pattern set of keyword types"""
key_left = r"(\\[nrt]|%[0-9a-f]{2})?"\
r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
key_left = r"(\\[nrt]|%[0-9a-f]{2})?" \
r"(?P<variable>(([`'\"]{1,8}[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
r"(?P<keyword>"
# the keyword is inserted here, between key_left and key_right
key_right = r")" \
r"(&(quot|apos);|[^%:='\"`<>{?!&]*)[`'\"]*))" # <variable>
separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \
r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
r"(\s|\\+[tnr])*"
r"[^%:='\"`<>{?!&]*" \
r")" \
r"(&(quot|apos);|%[0-9a-f]{2}|[`'\"])*" \
r")" # <variable>
separator = r"(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*" \
r"(?P<separator>:(\s[a-z]{3,9}[?]?\s)?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
r"(\s|\\{1,8}[tnr])*"
# the value may be wrapped in curly, square or round brackets, optionally preceded by words
wrap = r"(?P<wrap>(" \
r"(new(\s|\\+[tnr])+)?" \
r"(new(\s|\\{1,8}[tnr]){1,8})?" \
r"([0-9a-z_.]|-(>|(&|\\\\*u0026)gt;))*" \
r"[\[\(\{]"\
r"(\s|\\+[tnr])*" \
r"[\[\(\{]" \
r"(\s|\\{1,8}[tnr])*" \
r"([0-9a-z_]{1,32}=)?" \
r")+)?"
r"){1,8})?"
string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4}))?"
# An optional authentication scheme (oauth | bot | basic | bearer | apikey | accesskey) may precede the credential
auth_keywords = r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey)\s)?"
value = r"(?P<value>" \
r"(?(value_leftquote)" \
r"(" \
r"(?!(?P=value_leftquote))" \
r"(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))" \
r"|" \
r"(?!&(quot|apos);)" \
r"(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])" \
r"){3,8000}" \
r"(\\{1,8}([ tnr]|[^\s`'\"])" \
r"|" \
r"(?P<url_esc>%[0-9a-f]{2})" \
r"|" \
r"(?(url_esc)[^\s`'\",;\\&]|[^\s`'\",;\\])" \
r")){3,8000}" \
r"|(\{[^}]{3,8000}\})" \
r"|(<[^>]{3,8000}>)" \
r")"
r")" # <value>
right_quote = r"(?(value_leftquote)" \
r"(?P<value_rightquote>(?<!\\)(?P=value_leftquote)|\\$|(?<=[0-9a-z+_/-])$)" \
r"|" \
Expand All @@ -44,7 +51,7 @@ class KeywordPattern:
@classmethod
def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
"""Returns compiled regex pattern"""
expression = "".join([ #
expression = ''.join([ #
cls.key_left, #
keyword, #
cls.key_right, #
Expand Down
30 changes: 23 additions & 7 deletions credsweeper/filters/value_allowlist_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,35 @@


class ValueAllowlistCheck(Filter):
"""Check that patterns from the list is not present in the candidate value."""
"""Check that the patterns do not MATCH the candidate value."""

ALLOWED = [
r"ENC\(.*\)", #
r"ENC\[.*\]", #
r"\$\{(\*|[0-9]+|[a-z_].*)\}", #
r"\$([0-9]+\b|[a-z_]+[0-9a-z_]*)", #
r"\$[0-9]+(\s|$)", #
r"\$\$[a-z_]+(\^%[0-9a-z_]+)?", #
r"#\{.*\}", #
r"\{\{.+\}\}", #
r"\S{0,5}\*{5,}", #
r".*@@@hl@@@(암호|비번|PW|PASS)@@@endhl@@@", #
]

ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED), flags=re.IGNORECASE)
ALLOWED_UNQUOTED_PATTERN = re.compile(r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$", flags=re.IGNORECASE)

ALLOWED_QUOTED = [
r"\$[a-z_]+[0-9a-z_]*([$\s]|$)", #
r".*\*\*\*", #
]

ALLOWED_QUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_QUOTED), flags=re.IGNORECASE)

ALLOWED_UNQUOTED = [
r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$", #
r"\$[a-z_]+[0-9a-z_]*\b", #
r".*\*\*\*\*\*", #
]

ALLOWED_UNQUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_UNQUOTED), flags=re.IGNORECASE)

def __init__(self, config: Config = None) -> None:
pass
Expand All @@ -42,8 +55,11 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:

if self.ALLOWED_PATTERN.match(line_data.value):
return True

if not line_data.is_well_quoted_value and self.ALLOWED_UNQUOTED_PATTERN.match(line_data.value):
return True
elif line_data.is_well_quoted_value:
if self.ALLOWED_QUOTED_PATTERN.match(line_data.value):
return True
else:
if self.ALLOWED_UNQUOTED_PATTERN.match(line_data.value):
return True

return False
2 changes: 1 addition & 1 deletion experiment/main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

cd "${CREDSWEEPER_DIR}"
report_file=${RESULT_DIR}/${NOW}.json
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file}
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file} --no-stdout

cd "${CREDDATA_DIR}"
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt
Expand Down
2 changes: 1 addition & 1 deletion experiment/src/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def execute_scanner(dataset_location: str, result_location_str: str, jobs: int,
"""Execute CredSweeper as a separate process to make sure no global states is shared with training script"""
dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.."
command = f"{sys.executable} -m credsweeper --path {dataset_location}/data" \
f" --save-json {result_location_str} --log info" \
f" --save-json {result_location_str} --log info --no-stdout" \
f" {'--doc' if doc_target else ''}" \
f" --jobs {jobs} --sort --rules results/train_config.yaml --ml_threshold 0 --subtext"
error_code = subprocess.check_call(command, shell=True, cwd=dir_path)
Expand Down
13 changes: 12 additions & 1 deletion tests/common/test_keyword_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_separator_p(self, config: Config, file_path: pytest.fixture, line: str)
['''db.setCred("{ \"password\" : \"" + SECRET + "\" }");''', ''' + SECRET + '''],
['''\\"password\\": \\"{\\\\"secret\\\\": \\\\"test\\\\"}\\"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'], #
['''"password": "{\\\\"secret\\\\": \\\\"test\\\\"}"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'], #
#normal_str = "First line.\nSecond line.\nEnd of message.\n";
# normal_str = "First line.\nSecond line.\nEnd of message.\n";
['''std::string password = R"multiline\\npassword";''', '''multiline\\npassword'''], #
['''const wchar_t* password = L"wchar_t*secret";''', '''wchar_t*secret'''], #
['''const char16_t* password = U"char16_t*secret";''', '''char16_t*secret'''], #
Expand All @@ -114,6 +114,17 @@ def test_separator_p(self, config: Config, file_path: pytest.fixture, line: str)
['''password24=secret42\\n''', 'secret42'], #
['password = 3VNdhWT3oFo5I7faffKO\\\neEagnK7tYBcGxhla\n;', '''3VNdhWT3oFo5I7faffKO'''],
['password = "3VNdhWT3oFo5I7faffKO\n gnK7tYBcGxhla\n";', '''3VNdhWT3oFo5I7faffKO\n gnK7tYBcGxhla\n'''],
[
"//&user%5Bemail%5D=credsweeper%40example.com&user%5Bpassword%5D=Dmdkesfdsq452%23%40!&user%5Bpassword_",
"Dmdkesfdsq452%23%40!"
],
["password%3dDmsfdsq452!&user%5Bpassword_", "Dmsfdsq452!"],
["MY_TEST_PASSWORD={MY_TEST_PASSWORD}", "MY_TEST_PASSWORD"],
["MY_TEST_PASSWORD=(MY_TEST_PASSWORD)", "MY_TEST_PASSWORD"],
["MY_TEST_PASSWORD=<MY_TEST_PASSWORD>", "<MY_TEST_PASSWORD>"], # <> are used in future to detect a template
["MY_TEST_PASSWORD=[MY_TEST_PASSWORD]", "MY_TEST_PASSWORD"],
["MY_TEST_PASSWORD=MY_TEST&PASSWORD!", "MY_TEST&PASSWORD!"],
["MY_TEST_PASSWORD='MY_TEST&PASSWORD!'", "MY_TEST&PASSWORD!"],
])
def test_keyword_pattern_p(self, config: Config, file_path: pytest.fixture, line: str, value: str) -> None:
pattern = KeywordPattern.get_keyword_pattern("password")
Expand Down
9 changes: 7 additions & 2 deletions tests/filters/test_value_allowlist_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class TestValueAllowlistCheck:
"line",
[ #
"password = $4eCr3t", #
"pass=$((0394584039))", #
"password = 'F(b7)]DAS^iCv0vqIJOvGg<5<F(lwQ'", #
"password = P@s$w0Rd", #
"password = ENCrackle123)", #
Expand All @@ -23,14 +24,17 @@ class TestValueAllowlistCheck:
"password = ***test***", #
"password = .*@@@@@@", #
"pass=get->pass('''ARG", #
"password = '$34%4reGE_'", #
"password = '$D34%4reGE_'", #
])
def test_value_allowlist_check_p(self, file_path: pytest.fixture, line: str) -> None:
line_data = get_line_data(file_path, line=line, pattern=SUCCESS_LINE_PATTERN)
line_data = get_line_data(file_path=file_path, line=line, pattern=SUCCESS_LINE_PATTERN)
assert ValueAllowlistCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False

@pytest.mark.parametrize(
"line",
[ #
"pass := \"$pass2id$v=1$m=65536,t=3,p=2$2tNBg5k/rOCN2n3/kFYJ3789X\"", #
"pass=get->pass(arg", #
"PASS:@@@hl@@@PASS@@@endhl@@@", #
"pass:='ENC(Crackle123)'", #
Expand All @@ -42,7 +46,8 @@ def test_value_allowlist_check_p(self, file_path: pytest.fixture, line: str) ->
"pass:test*****", #
'PASS="${*}"', #
'PASS="$123"', #
'PASS="$A1B2C3D"', #
])
def test_value_allowlist_check_n(self, file_path: pytest.fixture, line: str) -> None:
line_data = get_line_data(file_path, line=line, pattern=SUCCESS_LINE_PATTERN)
line_data = get_line_data(file_path=file_path, line=line, pattern=SUCCESS_LINE_PATTERN)
assert ValueAllowlistCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True
2 changes: 1 addition & 1 deletion tests/filters/test_value_similarity_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_value_similarity_check_p(self, password_rule: Rule, file_path: str, suc
line_data = get_line_data(file_path=file_path, line=success_line, pattern=password_rule.patterns[0])
assert ValueSimilarityCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False

@pytest.mark.parametrize("line", ["password = 'password1'", "password = 'password123'"])
@pytest.mark.parametrize("line", ["password = 'password1'", "password = 'password123'", "PWD=`pwd`"])
def test_value_similarity_check_n(self, password_rule: Rule, file_path: str, line: str) -> None:
line_data = get_line_data(file_path=file_path, line=line, pattern=password_rule.patterns[0])
assert ValueSimilarityCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True

0 comments on commit ab99f97

Please sign in to comment.