Skip to content

Commit

Permalink
KeywordPattern regex improvement (#606)
Browse files Browse the repository at this point in the history
* part of changes

* upd BM scores as is

* samples and AWS multi rule fix

* fix
  • Loading branch information
babenek authored Sep 16, 2024
1 parent 2534b92 commit a73faaa
Show file tree
Hide file tree
Showing 32 changed files with 4,192 additions and 3,212 deletions.
107 changes: 54 additions & 53 deletions .ci/benchmark.txt

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: awsmulti

- name: Markup hashing
run: |
Expand Down Expand Up @@ -72,7 +73,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: awsmulti

- name: Markup hashing
run: |
Expand Down Expand Up @@ -169,7 +171,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: awsmulti

- name: Markup hashing
run: |
Expand Down Expand Up @@ -351,7 +354,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: awsmulti

- name: Markup hashing
run: |
Expand Down
28 changes: 0 additions & 28 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,8 @@
import re
import typing
from enum import Enum
from typing import Optional, Union


class KeywordPattern:
    """Regex fragments that are joined around a keyword to match ``<variable> <separator> <value>``.

    The named groups defined by the fragments are ``variable``, ``keyword``, ``separator``,
    ``wrap``, ``value_leftquote``, ``value`` and ``value_rightquote``.
    """

    # optional literal \n, \r or \t, then the head of the variable name up to the keyword
    key_left = (r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)"
                r"(?P<keyword>")
    # --- the keyword is inserted between key_left and key_right ---
    # closes <keyword>; the tail of the variable name and optional quotes close <variable>
    key_right = (r")"
                 r"[^:='\"`<>{?!&]*)[`'\"]*)")
    # assignment-like separator, then an optional <wrap>: words and an opening bracket before the value
    separator = (r"\s*\]?\s*"
                 r"(?P<separator>:( [a-z]{3,9}[?]? )?="
                 r"|:|=>|!=|===|==|=)"
                 r"\s*(?P<wrap>((new\s*)?\w|\.|->|\(|\[)*[\[\(\{](\w{1,32}=)?\s*)?")
    # optional opening quote(s); an authentication scheme (oauth | basic | bearer | apikey ...)
    # may precede the credential; the closing part depends on whether a quote or a wrap was opened
    value = (r"(?P<value_leftquote>((b|r|br|rb|u|f|rf|fr|\\{0,8})?[`'\"]){1,4})?"
             r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
             r"(?P<value>"
             r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){1,8000}"
             r"|(?:\{[^}]{3,8000}\})|(?:<[^>]{3,8000}>)"
             r")"
             r"(?(value_leftquote)(?P<value_rightquote>(\\{0,8}[`'\"]){1,4})?|(?(wrap)[\]\)\},;]))")

    @classmethod
    def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
        """Compile the whole expression for ``keyword``.

        Args:
            keyword: regex source placed verbatim inside the ``keyword`` group

        Return:
            case-insensitive compiled pattern

        """
        fragments = (cls.key_left, keyword, cls.key_right, cls.separator, cls.value)
        return re.compile("".join(fragments), flags=re.IGNORECASE)


class Severity(Enum):
"""Severity of candidate"""
CRITICAL = "critical"
Expand Down
58 changes: 58 additions & 0 deletions credsweeper/common/keyword_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import re


class KeywordPattern:
    """Building blocks of the regular expression that finds a value assigned to a keyword.

    Each attribute is a fragment of regex source; ``get_keyword_pattern`` concatenates them
    around a keyword in the order they are declared here.  ``string_prefix`` opens a group
    that is closed only inside ``left_quote``, so those two fragments are not complete
    expressions on their own.
    """

    # optional literal \n, \r or \t, then the head of the variable name up to the keyword
    key_left = (r"(\\[nrt])?"
                r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)"
                r"(?P<keyword>")
    # --- the keyword is inserted between key_left and key_right ---
    # closes <keyword>; the tail of the variable name and optional quotes close <variable>
    key_right = (r")"
                 r"[^:='\"`<>{?!&]*)[`'\"]*)")
    # assignment-like separator surrounded by whitespace or literal \t, \n, \r sequences
    separator = (r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*"
                 r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=)"
                 r"(\s|\\+[tnr])*")
    # might be a curly, square or round bracket with words before it
    wrap = (r"(?P<wrap>("
            r"(new(\s|\\+[tnr])+)?"
            r"([0-9a-z_.]|-(>|(&|\\\\*u0026)gt;))*"
            r"[\[\(\{]"
            r"(\s|\\+[tnr])*"
            r"([0-9a-z_]{1,32}=)?"
            r")+)?")
    # string literal prefix that is accepted only right before an (optionally escaped) quote
    string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
    # one to four opening quotes; <esq> keeps the backslashes that escape the quote
    left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?[`'\"]){1,4}))?"
    # an authentication scheme (oauth | basic | bearer | apikey ...) may precede the credential
    auth_keywords = r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
    # quoted values run until the opening quote sequence appears again,
    # unquoted values stop at whitespace, a quote, a comma or a semicolon
    value = (r"(?P<value>"
             r"(?(value_leftquote)"
             r"("
             r"(?!(?P=value_leftquote))"
             r"(?(esq)((?!(?P=esq)['`\"]).)|((?!(?P=value_leftquote)).)))"
             r"|"
             r"(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])"
             r"){3,8000}"
             r"|(\{[^}]{3,8000}\})"
             r"|(<[^>]{3,8000}>)"
             r")")
    # a quoted value ends with the same quote sequence or at the end of the text,
    # an unquoted value inside a wrap must be followed by a closing bracket, comma or semicolon
    right_quote = (r"(?(value_leftquote)"
                   r"(?P<value_rightquote>(?<!\\)(?P=value_leftquote)|\\$|(?<=[0-9a-z+_/-])$)"
                   r"|"
                   r"(?(wrap)[\]\)\},;]))")

    @classmethod
    def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
        """Compile the whole expression for ``keyword``.

        Args:
            keyword: regex source placed verbatim inside the ``keyword`` group

        Return:
            pattern compiled with the IGNORECASE and DOTALL flags

        """
        fragments = (
            cls.key_left,
            keyword,
            cls.key_right,
            cls.separator,
            cls.wrap,
            cls.string_prefix,
            cls.left_quote,
            cls.auth_keywords,
            cls.value,
            cls.right_quote,
        )
        return re.compile("".join(fragments), flags=re.IGNORECASE | re.DOTALL)
2 changes: 1 addition & 1 deletion credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@
type: multi
values:
- (?<![0-9A-Za-z_+-])(?P<value>(ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})(?![=0-9A-Za-z_+-])
- (?<![0-9A-Za-z_/+-])(?P<value>[0-9A-Za-z/+]{40,80})(?![=0-9A-Za-z_/+-])
- (?<![0-9A-Za-z_/+-])(?P<value>[0-9A-Za-z/+]{35,80})(?![=0-9A-Za-z_/+-])
filter_type: GeneralPattern
required_substrings:
- A
Expand Down
3 changes: 2 additions & 1 deletion credsweeper/rules/rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from typing import Dict, List, Optional, Union, Set

from credsweeper import validations, filters
from credsweeper.common.constants import RuleType, Severity, MAX_LINE_LENGTH, KeywordPattern, Confidence
from credsweeper.common.constants import RuleType, Severity, MAX_LINE_LENGTH, Confidence
from credsweeper.common.keyword_pattern import KeywordPattern
from credsweeper.config import Config
from credsweeper.filters import Filter, group
from credsweeper.filters.group import Group
Expand Down
8 changes: 8 additions & 0 deletions docs/source/credsweeper.common.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ credsweeper.common.keyword\_checklist module
:undoc-members:
:show-inheritance:

credsweeper.common.keyword\_pattern module
------------------------------------------

.. automodule:: credsweeper.common.keyword_pattern
:members:
:undoc-members:
:show-inheritance:

Module contents
---------------

Expand Down
10 changes: 5 additions & 5 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 132
SAMPLES_FILES_COUNT: int = 133

# the lowest value of ML threshold is used to display possible lowest values
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan
SAMPLES_CRED_COUNT: int = 378
SAMPLES_CRED_LINE_COUNT: int = 395
SAMPLES_CRED_COUNT: int = 386
SAMPLES_CRED_LINE_COUNT: int = 404

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 347
SAMPLES_POST_CRED_COUNT: int = 355

# with option --doc
SAMPLES_IN_DOC = 419
SAMPLES_IN_DOC = 423

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 23
Expand Down
48 changes: 0 additions & 48 deletions tests/common/test_constants.py

This file was deleted.

137 changes: 137 additions & 0 deletions tests/common/test_keyword_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import pytest

from credsweeper.common.keyword_pattern import KeywordPattern
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.utils import Util


class TestKeywordPattern:
    """Checks the value that LineData extracts from a line with a KeywordPattern regex."""

    @staticmethod
    def _extracted_value(config: Config, file_path, line: str, keyword: str):
        """Scan ``line`` with the pattern built for ``keyword`` and return ``LineData.value``."""
        # the pattern is compiled before LineData arguments are evaluated
        compiled = KeywordPattern.get_keyword_pattern(keyword)
        scanned = LineData(config,
                           line,
                           0,
                           1,
                           file_path,
                           Util.get_extension(file_path),
                           info="dummy",
                           pattern=compiled)
        return scanned.value

    @pytest.mark.parametrize("line", ["melon is 'banana'"])
    def test_separator_n(self, config: Config, file_path: pytest.fixture, line: str) -> None:
        """A word between the keyword and the value is not a separator - nothing is extracted."""
        assert self._extracted_value(config, file_path, line, "melon") is None

    @pytest.mark.parametrize("line", ["melon = 'banAna'", "melon : 'banAna'", "melon := 'banAna'"])
    def test_separator_p(self, config: Config, file_path: pytest.fixture, line: str) -> None:
        """Each listed separator leads to the same extracted value."""
        assert self._extracted_value(config, file_path, line, "melon") == "banAna"

    @pytest.mark.parametrize(
        "line, value",
        [
            # ['''...log=1;User ID=X3;password=Quantum42!\\""''', '''Quantum42!'''],  # todo
            # ["""password='\\\\'secret-1\\\\''""", """\\'secret-1\\'"""],  # todo
            # ['''password="\\"secret-2\\""''', '''\\"secret-2\\"'''],  # todo
            # ["""password=rb'\\'secret=1\\''""", """\\'secret=1\\'"""],  # todo
            # ['''password=f"\\"secret=2\\""''', '''\\"secret=2\\"'''],  # todo
            # ['''password=r"\\\\"secret=3\\\\""''', '''\\"secret=3\\"'''],  # todo
            # ['''"password = 'sec;$2`\\'[\\/*;ret';";''', '''sec;$2`\\'[\\/*;ret'''],  # todo
            ['''"$password = "10qoakxncnfh47t_''', '''10qoakxncnfh47t_'''],  #
            [
                '''copes\":[\"user\"],\"note\":\"Note\",\"password\":\"cc6323cb2223f82f01\",\"upd_at\":\"1765....\",''',
                '''cc6323cb2223f82f01'''
            ],  #
            ['''"password = pas:sword # comment''', '''pas:sword'''],
            ['''x.password=pK5C4tlA/w1cO\\=\\=''', '''pK5C4tlA/w1cO\\=\\='''],  #
            ['''final String body = \"{ \\"passwords\\":\\"i0sEcReT\\\\/MwX3X\\","''', '''i0sEcReT\\\\/MwX3X'''],
            [
                '''\\\"password\\\"=\\u0026gt;\t\\n\\t\\\"lfFTfDT1roc4YbG9hy5cnvX\\n oZ+Sc/wb+CvdF4s==\\\",\\n",''',
                '''lfFTfDT1roc4YbG9hy5cnvX\\n oZ+Sc/wb+CvdF4s=='''
            ],
            [
                '''var request = {"password": "{\\"wks\\": \\"8x9s3ga7\\", \\"uzr\": \\"wbm\\"}","Any-Tail":"x\r"};''',
                '''{\\"wks\\": \\"8x9s3ga7\\", \\"uzr": \\"wbm\\"}'''
            ],
            ['''passwords: ["1029384756",''', '''1029384756'''],  #
            ['''passwords:[ "1029384756", "9801726354" ]''', '''1029384756'''],  #
            ['''password="\\"secret-line-wrap\\''', '''secret-line-wrap'''],  #
            ['''password=r"""secret4"""''', '''secret4'''],  #
            ['''password=r\\"\\"\\"secret5\\"\\"\\"''', '''secret5'''],  #
            ['''password="""secret6"""''', '''secret6'''],  #
            ['''password=\\\\"\\\\"\\\\"secret7\\\\"\\\\"\\\\"''', '''secret7'''],  #
            ['''password=\\\\"\\\\"\\\\"secret"7\\\\"\\\\"\\\\"''', '''secret"7'''],  #
            ['''password="""{\\"secret8\\"}"""''', '''{\\"secret8\\"}'''],  #
            ['''password="""secret'9"""''', '''secret'9'''],  #
            ["""password='''secret'6'''""", '''secret'6'''],  #
            ["""password='''secret`8'''""", '''secret`8'''],  #
            ["""password=``secret`7``""", '''secret`7'''],  #
            ["""password=``secret 5``""", '''secret 5'''],  #
            ["""password='secret\\ 5''""", '''secret\\ 5'''],  #
            ["""password=secret\\ 5""", '''secret\\ 5'''],  #
            ["""password=secret 0""", '''secret'''],  #
            ["""password=secret0\\""", '''secret0'''],  #
            ["""password=r'\\"secret\\"'""", '''\\"secret\\"'''],  #
            ['''password=r\\"{\\\\"secret\\\\"}\\"''', '{\\\\"secret\\\\"}'],  #
            ['''password=r"{\\"secret\\"}"''', '{\\"secret\\"}'],  #
            ["""password=b'"secret4"'""", '"secret4"'],  #
            ["""password=rb'\\\\"secret\\\\"'""", '\\\\"secret\\\\"'],  #
            ["""password=r\\'"sec'"'"'"ret"\\'""", '''"sec'"'"'"ret"'''],  #
            ["""\\'\\\\\\\\'password\\\\\\\\': b\\\\\\\\'secret\\\\\\\\'\\'""", "secret"],  #
            ["""'password': b'secret'""", """secret"""],  #
            ["""'password': r'secret'""", """secret"""],  #
            ["""'password': fr'secret'""", """secret"""],  #
            ["""\\'password\\': \\'secret\\'""", """secret"""],  #
            ['''db.setCred("{ \"password\" : \"" + SECRET + "\" }");''', ''' + SECRET + '''],
            ['''\\"password\\": \\"{\\\\"secret\\\\": \\\\"test\\\\"}\\"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'],  #
            ['''"password": "{\\\\"secret\\\\": \\\\"test\\\\"}"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'],  #
            #normal_str = "First line.\nSecond line.\nEnd of message.\n";
            ['''std::string password = R"multiline\\npassword";''', '''multiline\\npassword'''],  #
            ['''const wchar_t* password = L"wchar_t*secret";''', '''wchar_t*secret'''],  #
            ['''const char16_t* password = U"char16_t*secret";''', '''char16_t*secret'''],  #
            [
                '''char password[] = {'S', 'E', 'C', 'R', 'E', 'T', '\\0'};''',
                '''{'S', 'E', 'C', 'R', 'E', 'T', '\\0'}'''
            ],  #
            ['''"password": "{8vi6wL+10fI/eibC7wFwc}"''', '{8vi6wL+10fI/eibC7wFwc}'],  #
            ['''final String password = new String("SECRET") {''', '''SECRET'''],  #
            ['''final OAuth2AccessToken password = new OAuth2AccessToken(\"SEC.RET\");''', '''SEC.RET'''],  #
            ['''password = obfuscate(arg="SECRET") {''', '''SECRET'''],  #
            ['''final String password = new String(Super(Encrypted("SECRET"))) {''', '''SECRET'''],  #
            ['''final String password = new String(Super( Encrypted("SECRET", "dummy"))) {''', '''SECRET'''],  #
            ["""'password': 'ENC(lqjdoxlandicpfpqk)'""", """ENC(lqjdoxlandicpfpqk)"""],  #
            ["""'password': 'ENC[lqjdoxlandicpfpqk]'""", """ENC[lqjdoxlandicpfpqk]"""],  #
            ['''password24=secret42''', 'secret42'],  #
            ['''password24=secret42\\ ''', 'secret42\\ '],  #
            ['''password24=secret42\\''', 'secret42'],  #
            ['''password24=secret42\\n''', 'secret42'],  #
            ['password = 3VNdhWT3oFo5I7faffKO\\\neEagnK7tYBcGxhla\n;', '''3VNdhWT3oFo5I7faffKO'''],
            ['password = "3VNdhWT3oFo5I7faffKO\n gnK7tYBcGxhla\n";', '''3VNdhWT3oFo5I7faffKO\n gnK7tYBcGxhla\n'''],
        ])
    def test_keyword_pattern_p(self, config: Config, file_path: pytest.fixture, line: str, value: str) -> None:
        """Each sample line yields exactly the expected value for the "password" keyword."""
        assert self._extracted_value(config, file_path, line, "password") == value

    @pytest.mark.parametrize("line", [
        "https://fonts.googleapis.com/css2?family=Montserrat:wght@500;700;900&family=Roboto:wght@300;400;500;700;900"
        "&family=Roboto+Mono:wght@300;400;600;900&display=swap"
    ])
    def test_keyword_pattern_n(self, config: Config, file_path: pytest.fixture, line: str) -> None:
        """A font URL with many ':' and ';' must not produce a value for the "api" keyword."""
        api_pattern = KeywordPattern.get_keyword_pattern("api")
        assert LineData(config, line, 0, 1, file_path, "file_type", "info", api_pattern).value is None
Loading

0 comments on commit a73faaa

Please sign in to comment.