Keyword pattern, filters improvement

Samsung · Feb 20, 2025 · ab99f97 · ab99f97
1 parent 423532f
commit ab99f97
Show file tree

Hide file tree

Showing 8 changed files with 70 additions and 31 deletions.
diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt
@@ -225,7 +225,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
 TOTAL:                  10345      16358553        11997        46620         5021
-credsweeper result_cnt : 11759, lost_cnt : 0, true_cnt : 11544, false_cnt : 215
+credsweeper result_cnt : 11761, lost_cnt : 0, true_cnt : 11546, false_cnt : 215
 Rules                             Positives    Negatives    Templates    Reported     TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  -----  ----  -----  ----  --------  --------  --------  --------  --------  --------
 API                                     125         3172          187         122    122     0   3359     3  0.000000  0.024000  0.999139  1.000000  0.976000  0.987854
@@ -263,7 +263,7 @@ Key                                    3911        15717          483        392
 Nonce                                    93           49            0          93     92     1     48     1  0.020408  0.010753  0.985915  0.989247  0.989247  0.989247
 Other                                     9         7447            5                  0     0   7452     9  0.000000  1.000000  0.998794            0.000000
 PEM Private Key                        1019         1483            0        1023   1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1938         7530         2635        1861   1855     6  10159    83  0.000590  0.042828  0.992646  0.996776  0.957172  0.976573
+Password                               1938         7530         2635        1863   1857     6  10159    81  0.000590  0.041796  0.992812  0.996779  0.958204  0.977111
 SQL Password                             44           13            0          41     41     0     13     3  0.000000  0.068182  0.947368  1.000000  0.931818  0.964706
 Salesforce Credentials                    2            0            0           2      2     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Salt                                     48           75            1          45     45     0     76     3  0.000000  0.062500  0.975806  1.000000  0.937500  0.967742
@@ -276,4 +276,4 @@ Token                                   645         4170          453         62
 Twilio Credentials                       30           39            0          30     30     0     39     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 URL Credentials                         218          156          208         217    217     0    364     1  0.000000  0.004587  0.998282  1.000000  0.995413  0.997701
 UUID                                   1075          265            0        1074   1073     1    264     2  0.003774  0.001860  0.997761  0.999069  0.998140  0.998604
-                                      11997        46620         5021       11771  11544   215  46405   453  0.004612  0.037759  0.988604  0.981716  0.962241  0.971881
+                                      11997        46620         5021       11773  11546   215  46405   451  0.004612  0.037593  0.988638  0.981719  0.962407  0.971967
diff --git a/credsweeper/common/keyword_pattern.py b/credsweeper/common/keyword_pattern.py
@@ -3,39 +3,46 @@
 
 class KeywordPattern:
     """Pattern set of keyword types"""
-    key_left = r"(\\[nrt]|%[0-9a-f]{2})?"\
-               r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
+    key_left = r"(\\[nrt]|%[0-9a-f]{2})?" \
+               r"(?P<variable>(([`'\"]{1,8}[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
                r"(?P<keyword>"
     # there will be inserted a keyword
     key_right = r")" \
-                r"(&(quot|apos);|[^%:='\"`<>{?!&]*)[`'\"]*))"  # <variable>
-    separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \
-                r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
-                r"(\s|\\+[tnr])*"
+                r"[^%:='\"`<>{?!&]*" \
+                r")" \
+                r"(&(quot|apos);|%[0-9a-f]{2}|[`'\"])*" \
+                r")"  # <variable>
+    separator = r"(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*" \
+                r"(?P<separator>:(\s[a-z]{3,9}[?]?\s)?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
+                r"(\s|\\{1,8}[tnr])*"
     # might be curly, square or parenthesis with words before
     wrap = r"(?P<wrap>(" \
-           r"(new(\s|\\+[tnr])+)?" \
+           r"(new(\s|\\{1,8}[tnr]){1,8})?" \
            r"([0-9a-z_.]|-(>|(&|\\\\*u0026)gt;))*" \
-           r"[\[\(\{]"\
-           r"(\s|\\+[tnr])*" \
+           r"[\[\(\{]" \
+           r"(\s|\\{1,8}[tnr])*" \
            r"([0-9a-z_]{1,32}=)?" \
-           r")+)?"
+           r"){1,8})?"
     string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
     left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4}))?"
     # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
-    auth_keywords = r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
+    auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey)\s)?"
     value = r"(?P<value>" \
             r"(?(value_leftquote)" \
             r"(" \
             r"(?!(?P=value_leftquote))" \
             r"(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))" \
             r"|" \
             r"(?!&(quot|apos);)" \
-            r"(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])" \
-            r"){3,8000}" \
+            r"(\\{1,8}([ tnr]|[^\s`'\"])" \
+            r"|" \
+            r"(?P<url_esc>%[0-9a-f]{2})" \
+            r"|" \
+            r"(?(url_esc)[^\s`'\",;\\&]|[^\s`'\",;\\])" \
+            r")){3,8000}" \
             r"|(\{[^}]{3,8000}\})" \
             r"|(<[^>]{3,8000}>)" \
-            r")"
+            r")"  # <value>
     right_quote = r"(?(value_leftquote)" \
                   r"(?P<value_rightquote>(?<!\\)(?P=value_leftquote)|\\$|(?<=[0-9a-z+_/-])$)" \
                   r"|" \
@@ -44,7 +51,7 @@ class KeywordPattern:
     @classmethod
     def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
         """Returns compiled regex pattern"""
-        expression = "".join([  #
+        expression = ''.join([  #
             cls.key_left,  #
             keyword,  #
             cls.key_right,  #

diff --git a/credsweeper/filters/value_allowlist_check.py b/credsweeper/filters/value_allowlist_check.py
@@ -8,22 +8,35 @@
 
 
 class ValueAllowlistCheck(Filter):
-    """Check that patterns from the list is not present in the candidate value."""
+    """Check that the patterns do not MATCH the candidate value."""
 
     ALLOWED = [
         r"ENC\(.*\)",  #
         r"ENC\[.*\]",  #
         r"\$\{(\*|[0-9]+|[a-z_].*)\}",  #
-        r"\$([0-9]+\b|[a-z_]+[0-9a-z_]*)",  #
+        r"\$[0-9]+(\s|$)",  #
         r"\$\$[a-z_]+(\^%[0-9a-z_]+)?",  #
         r"#\{.*\}",  #
         r"\{\{.+\}\}",  #
-        r"\S{0,5}\*{5,}",  #
         r".*@@@hl@@@(암호|비번|PW|PASS)@@@endhl@@@",  #
     ]
 
     ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED), flags=re.IGNORECASE)
-    ALLOWED_UNQUOTED_PATTERN = re.compile(r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$", flags=re.IGNORECASE)
+
+    ALLOWED_QUOTED = [
+        r"\$[a-z_]+[0-9a-z_]*([$\s]|$)",  #
+        r".*\*\*\*",  #
+    ]
+
+    ALLOWED_QUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_QUOTED), flags=re.IGNORECASE)
+
+    ALLOWED_UNQUOTED = [
+        r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$",  #
+        r"\$[a-z_]+[0-9a-z_]*\b",  #
+        r".*\*\*\*\*\*",  #
+    ]
+
+    ALLOWED_UNQUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_UNQUOTED), flags=re.IGNORECASE)
 
     def __init__(self, config: Config = None) -> None:
         pass
@@ -42,8 +55,11 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
 
         if self.ALLOWED_PATTERN.match(line_data.value):
             return True
-
-        if not line_data.is_well_quoted_value and self.ALLOWED_UNQUOTED_PATTERN.match(line_data.value):
-            return True
+        elif line_data.is_well_quoted_value:
+            if self.ALLOWED_QUOTED_PATTERN.match(line_data.value):
+                return True
+        else:
+            if self.ALLOWED_UNQUOTED_PATTERN.match(line_data.value):
+                return True
 
         return False
diff --git a/experiment/main.sh b/experiment/main.sh
@@ -51,7 +51,7 @@ if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi
 
 cd "${CREDSWEEPER_DIR}"
 report_file=${RESULT_DIR}/${NOW}.json
-${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort  --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS}  --subtext --save-json ${report_file}
+${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort  --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS}  --subtext --save-json ${report_file} --no-stdout
 
 cd "${CREDDATA_DIR}"
 .venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt

diff --git a/experiment/src/prepare_data.py b/experiment/src/prepare_data.py
@@ -12,7 +12,7 @@ def execute_scanner(dataset_location: str, result_location_str: str, jobs: int,
     """Execute CredSweeper as a separate process to make sure no global states is shared with training script"""
     dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.."
     command = f"{sys.executable} -m credsweeper --path {dataset_location}/data" \
-              f" --save-json {result_location_str} --log info" \
+              f" --save-json {result_location_str} --log info --no-stdout" \
               f" {'--doc' if doc_target else ''}" \
               f" --jobs {jobs} --sort --rules results/train_config.yaml --ml_threshold 0 --subtext"
     error_code = subprocess.check_call(command, shell=True, cwd=dir_path)

diff --git a/tests/common/test_keyword_pattern.py b/tests/common/test_keyword_pattern.py
@@ -92,7 +92,7 @@ def test_separator_p(self, config: Config, file_path: pytest.fixture, line: str)
             ['''db.setCred("{ \"password\" : \"" + SECRET + "\" }");''', ''' + SECRET + '''],
             ['''\\"password\\": \\"{\\\\"secret\\\\": \\\\"test\\\\"}\\"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'],  #
             ['''"password": "{\\\\"secret\\\\": \\\\"test\\\\"}"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'],  #
-            #normal_str = "First line.\nSecond line.\nEnd of message.\n";
+            # normal_str = "First line.\nSecond line.\nEnd of message.\n";
             ['''std::string password = R"multiline\\npassword";''', '''multiline\\npassword'''],  #
             ['''const wchar_t* password = L"wchar_t*secret";''', '''wchar_t*secret'''],  #
             ['''const char16_t* password = U"char16_t*secret";''', '''char16_t*secret'''],  #
@@ -114,6 +114,17 @@ def test_separator_p(self, config: Config, file_path: pytest.fixture, line: str)
             ['''password24=secret42\\n''', 'secret42'],  #
             ['password = 3VNdhWT3oFo5I7faffKO\\\neEagnK7tYBcGxhla\n;', '''3VNdhWT3oFo5I7faffKO'''],
             ['password = "3VNdhWT3oFo5I7faffKO\n   gnK7tYBcGxhla\n";', '''3VNdhWT3oFo5I7faffKO\n   gnK7tYBcGxhla\n'''],
+            [
+                "//&user%5Bemail%5D=credsweeper%40example.com&user%5Bpassword%5D=Dmdkesfdsq452%23%40!&user%5Bpassword_",
+                "Dmdkesfdsq452%23%40!"
+            ],
+            ["password%3dDmsfdsq452!&user%5Bpassword_", "Dmsfdsq452!"],
+            ["MY_TEST_PASSWORD={MY_TEST_PASSWORD}", "MY_TEST_PASSWORD"],
+            ["MY_TEST_PASSWORD=(MY_TEST_PASSWORD)", "MY_TEST_PASSWORD"],
+            ["MY_TEST_PASSWORD=<MY_TEST_PASSWORD>", "<MY_TEST_PASSWORD>"],  # <> are kept for future template detection
+            ["MY_TEST_PASSWORD=[MY_TEST_PASSWORD]", "MY_TEST_PASSWORD"],
+            ["MY_TEST_PASSWORD=MY_TEST&PASSWORD!", "MY_TEST&PASSWORD!"],
+            ["MY_TEST_PASSWORD='MY_TEST&PASSWORD!'", "MY_TEST&PASSWORD!"],
         ])
     def test_keyword_pattern_p(self, config: Config, file_path: pytest.fixture, line: str, value: str) -> None:
         pattern = KeywordPattern.get_keyword_pattern("password")

diff --git a/tests/filters/test_value_allowlist_check.py b/tests/filters/test_value_allowlist_check.py
@@ -11,6 +11,7 @@ class TestValueAllowlistCheck:
         "line",
         [  #
             "password = $4eCr3t",  #
+            "pass=$((0394584039))",  #
             "password = 'F(b7)]DAS^iCv0vqIJOvGg<5<F(lwQ'",  #
             "password = P@s$w0Rd",  #
             "password = ENCrackle123)",  #
@@ -23,14 +24,17 @@ class TestValueAllowlistCheck:
             "password = ***test***",  #
             "password = .*@@@@@@",  #
             "pass=get->pass('''ARG",  #
+            "password = '$34%4reGE_'",  #
+            "password = '$D34%4reGE_'",  #
         ])
     def test_value_allowlist_check_p(self, file_path: pytest.fixture, line: str) -> None:
-        line_data = get_line_data(file_path, line=line, pattern=SUCCESS_LINE_PATTERN)
+        line_data = get_line_data(file_path=file_path, line=line, pattern=SUCCESS_LINE_PATTERN)
         assert ValueAllowlistCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False
 
     @pytest.mark.parametrize(
         "line",
         [  #
+            "pass := \"$pass2id$v=1$m=65536,t=3,p=2$2tNBg5k/rOCN2n3/kFYJ3789X\"",  #
             "pass=get->pass(arg",  #
             "PASS:@@@hl@@@PASS@@@endhl@@@",  #
             "pass:='ENC(Crackle123)'",  #
@@ -42,7 +46,8 @@ def test_value_allowlist_check_p(self, file_path: pytest.fixture, line: str) ->
             "pass:test*****",  #
             'PASS="${*}"',  #
             'PASS="$123"',  #
+            'PASS="$A1B2C3D"',  #
         ])
     def test_value_allowlist_check_n(self, file_path: pytest.fixture, line: str) -> None:
-        line_data = get_line_data(file_path, line=line, pattern=SUCCESS_LINE_PATTERN)
+        line_data = get_line_data(file_path=file_path, line=line, pattern=SUCCESS_LINE_PATTERN)
         assert ValueAllowlistCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True
diff --git a/tests/filters/test_value_similarity_check.py b/tests/filters/test_value_similarity_check.py
@@ -27,7 +27,7 @@ def test_value_similarity_check_p(self, password_rule: Rule, file_path: str, suc
         line_data = get_line_data(file_path=file_path, line=success_line, pattern=password_rule.patterns[0])
         assert ValueSimilarityCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False
 
-    @pytest.mark.parametrize("line", ["password = 'password1'", "password = 'password123'"])
+    @pytest.mark.parametrize("line", ["password = 'password1'", "password = 'password123'", "PWD=`pwd`"])
     def test_value_similarity_check_n(self, password_rule: Rule, file_path: str, line: str) -> None:
         line_data = get_line_data(file_path=file_path, line=line, pattern=password_rule.patterns[0])
         assert ValueSimilarityCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True