diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index cc8e2dddb..0a3b3e8ee 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -1,15 +1,15 @@ -DATA: 19071512 valid lines. MARKUP: 64429 items +DATA: 19071512 valid lines. MARKUP: 64495 items Category Positives Negatives Template -------------------------- ----------- ----------- ---------- Authentication Credentials 91 2653 32 Cryptographic Primitives 54 171 1 Generic Secret 1204 29587 618 Generic Token 336 3719 556 -Other 684 3709 37 -Password 1430 7147 4224 -Predefined Pattern 378 5292 11 +Other 728 3704 37 +Password 1445 7150 4224 +Predefined Pattern 387 5292 11 Private Key 1018 1477 -TOTAL: 5195 53755 5479 +TOTAL: 5263 53753 5479 FileType FileNumber ValidLines Positives Negatives Template --------------- ------------ ------------ ----------- ----------- ---------- 189 36262 44 407 80 @@ -42,7 +42,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .creds 1 10 1 1 .crlf 1 27 1 .crt 2 5124 206 -.cs 198 86213 10 779 98 +.cs 198 86213 9 780 98 .cshtml 5 207 12 .csp 3 447 9 .csproj 1 14 1 @@ -70,7 +70,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .gd 1 38 1 .gml 3 4011 16 .gni 3 6340 17 -.go 1090 718367 497 4029 742 +.go 1090 718367 501 4029 742 .golden 5 1246 1 12 31 .gradle 41 3647 2 79 59 .graphql 8 575 1 13 @@ -89,11 +89,11 @@ FileType FileNumber ValidLines Positives Negatives Templat .ipynb 1 210 4 .j 1 329 2 .j2 32 6327 8 175 11 -.java 589 169939 173 1265 176 +.java 589 169939 174 1264 176 .jenkinsfile 1 78 1 6 .jinja2 1 64 2 .js 665 705090 321 2443 363 -.json 856 15025976 279 10633 185 +.json 856 15025976 331 10633 185 .jsp 13 4101 1 38 1 .jsx 7 1162 19 .jwt 6 8 6 @@ -120,7 +120,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .markdown 3 146 2 2 .markerb 3 12 2 1 .marko 1 32 2 -.md 659 172418 393 2405 719 +.md 659 172418 394 2405 719 .mdx 3 723 7 .mjml 2 183 3 .mjs 22 5853 85 309 @@ -161,13 +161,13 @@ FileType FileNumber ValidLines Positives Negatives Templat .pug 3 379 3 .purs 1 73 4 .pxd 1 153 5 1 -.py 874 322368 311 3366 861 +.py 874 322368 317 3364 861 .pyi 4 1418 9 .pyp 1 193 1 .pyx 2 1175 21 .r 5 83 5 4 2 .rake 2 66 2 -.rb 868 173874 170 3283 583 +.rb 868 173874 171 3283 583 .re 1 40 1 .red 1 232 1 .release 1 13 4 @@ -187,7 +187,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .scala 40 6603 12 96 6 .scss 16 10191 32 1 .secrets 1 12 1 -.sh 138 26742 36 455 35 +.sh 138 26742 38 455 35 .slim 1 174 1 2 .smali 1 814 12 .snap 3 2390 1 32 2 @@ -227,21 +227,21 @@ FileType FileNumber ValidLines Positives Negatives Templat .xcscheme 1 109 6 .xib 11 504 164 .xsl 1 315 1 -.yaml 151 23500 91 379 52 -.yml 447 41773 284 960 359 +.yaml 151 23500 92 379 52 +.yml 447 41773 285 960 359 .zsh 7 1109 13 .zsh-theme 1 121 1 -TOTAL: 10188 19071512 5195 53755 5479 -Detected Credentials: 5799 -credsweeper result_cnt : 5045, lost_cnt : 0, true_cnt : 4613, false_cnt : 432 +TOTAL: 10188 19071512 5263 53753 5479 +Detected Credentials: 6038 +credsweeper result_cnt : 5113, lost_cnt : 0, true_cnt : 4680, false_cnt : 433 Category TP FP TN FN FPR FNR ACC PRC RCL F1 -------------------------- ---- ---- -------- ---- -------- -------- -------- -------- -------- -------- Authentication Credentials 74 27 2658 17 0.010056 0.186813 0.984150 0.732673 0.813187 0.770833 Cryptographic Primitives 43 3 169 11 0.017442 0.203704 0.938053 0.934783 0.796296 0.860000 Generic Secret 1112 25 30180 92 0.000828 0.076412 0.996275 0.978012 0.923588 0.950021 Generic Token 304 7 4268 32 0.001637 0.095238 0.991542 0.977492 0.904762 0.939722 -Other 503 330 3416 181 0.088094 0.264620 0.884650 0.603842 0.735380 0.663151 -Password 1200 34 11337 230 0.002990 0.160839 0.979377 0.972447 0.839161 0.900901 -Predefined Pattern 359 6 5297 19 0.001131 0.050265 0.995599 0.983562 0.949735 0.966353 +Other 549 329 3412 179 0.087944 0.245879 0.886328 0.625285 0.754121 0.683686 +Password 1212 36 11338 233 0.003165 0.161246 0.979016 0.971154 0.838754 0.900111 +Predefined Pattern 368 6 5297 19 0.001131 0.049096 0.995606 0.983957 0.950904 0.967148 Private Key 1018 0 1477 0 1.000000 1.000000 1.000000 1.000000 - 4613 432 19065885 582 0.000023 0.112031 0.999947 0.914371 0.887969 0.900977 + 4680 433 19065816 583 0.000023 0.110773 0.999947 0.915314 0.889227 0.902082 diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml index 546a18071..c420df400 100644 --- a/credsweeper/rules/config.yaml +++ b/credsweeper/rules/config.yaml @@ -122,7 +122,7 @@ confidence: weak type: pattern values: - - (^|[^.0-9a-zA-Z])(?P[0-2]?[0-9]{1,2}\.[0-2]?[0-9]{1,2}\.[0-2]?[0-9]{1,2}\.[0-2]?[0-9]{1,2})(?!/([123]?[0-9])([^0-9]|$))([^.0-9a-zA-Z$]|$) + - (?[0-2]?[0-9]{1,2}\.[0-2]?[0-9]{1,2}\.[0-2]?[0-9]{1,2}\.[0-2]?[0-9]{1,2})(?![.0-9a-zA-Z$]) filter_type: - ValueIPCheck min_line_len: 10 @@ -135,7 +135,7 @@ confidence: strong type: pattern values: - - (^|[^:0-9a-zA-Z])(?P[0-9A-Fa-f]{0,4}:(:?[0-9A-Fa-f]{1,4}:?){0,6}:[0-9A-Fa-f]{1,4})([^:0-9a-zA-Z]|$) + - (?[0-9A-Fa-f]{0,4}:(:?[0-9A-Fa-f]{1,4}:?){0,6}:[0-9A-Fa-f]{1,4})(?![:0-9a-zA-Z]) filter_type: - ValueIPCheck min_line_len: 10 @@ -148,7 +148,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?P(ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})([^=0-9A-Za-z_+-]|$) + - (?(ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - A @@ -160,8 +160,8 @@ confidence: moderate type: multi values: - - (^|[^0-9A-Za-z_+-])(?P(ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})([^=0-9A-Za-z_+-]|$) - - (^|[^0-9A-Za-z_/+-])(?P[0-9A-Za-z/+]{40,80})([^=0-9A-Za-z_/+-]|$) + - (?(ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})(?![=0-9A-Za-z_+-]) + - (?[0-9A-Za-z/+]{40,80})(?![=0-9A-Za-z_/+-]) filter_type: GeneralPattern required_substrings: - A @@ -173,7 +173,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pamzn\.mws\.[0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})([^=0-9A-Za-z_+-]|$) + - (?amzn\.mws\.[0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - amzn @@ -197,7 +197,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pdt0[a-zA-Z]{1}[0-9]{2}\.[A-Z0-9]{24}\.[A-Z0-9]{64})([^=0-9A-Za-z_+-]|$) + - (?dt0[a-zA-Z]{1}[0-9]{2}\.[A-Z0-9]{24}\.[A-Z0-9]{64})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - dt0 @@ -208,7 +208,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PEAAC[0-9A-Za-z]{27,80}) + - (?EAAC[0-9A-Za-z]{27,80}) filter_type: GeneralPattern required_substrings: - EAAC @@ -233,7 +233,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PAIza[0-9A-Za-z_-]{35})([^=0-9A-Za-z_+-]|$) + - (?AIza[0-9A-Za-z_-]{35})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern validations: - GoogleApiKeyValidation @@ -260,7 +260,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_-])(?PGOCSPX-[0-9A-Za-z_-]{28})([^=0-9A-Za-z_+-]|$) + - (?GOCSPX-[0-9A-Za-z_-]{28})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - GOCSPX- @@ -271,7 +271,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pya29\.[0-9A-Za-z_-]{22,8000}) + - (?ya29\.[0-9A-Za-z_-]{22,8000}) filter_type: GeneralPattern required_substrings: - ya29. @@ -282,7 +282,7 @@ confidence: moderate type: pattern values: - - (?i)(?Pheroku(.{0,20})?[0-9a-f]{8}(-[0-9a-f]{4})+-[0-9a-f]{12})([^=0-9A-Za-z_+-]|$) + - (?i)(?Pheroku(.{0,20})?[0-9a-f]{8}(-[0-9a-f]{4})+-[0-9a-f]{12})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - heroku @@ -293,7 +293,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PIGQVJ[\w]{100,8000}) + - (?IGQVJ[\w]{100,8000}) filter_type: GeneralPattern required_substrings: - IGQVJ @@ -304,7 +304,7 @@ confidence: moderate type: pattern values: - - (^|[^.0-9A-Za-z_+-])(?PeyJ[0-9A-Za-z_=-]{15,8000}([.0-9A-Za-z_=-]{1,8000})?) + - (?eyJ[0-9A-Za-z_=-]{15,8000}([.0-9A-Za-z_=-]{1,8000})?) filter_type: GeneralPattern use_ml: true required_substrings: @@ -317,7 +317,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?P[0-9a-zA-Z]{32}-us[0-9]{1,2})([^=0-9A-Za-z_+-]|$) + - (?[0-9a-zA-Z]{32}-us[0-9]{1,2})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern validations: - MailChimpKeyValidation @@ -330,7 +330,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pkey-[0-9a-zA-Z]{32})([^=0-9A-Za-z_+-]|$) + - (?key-[0-9a-zA-Z]{32})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - key- @@ -355,7 +355,7 @@ confidence: strong type: pattern values: - - (?Paccess_token\$production\$[0-9a-z]{16}\$[0-9a-z]{32})([^=0-9A-Za-z_+-]|$) + - (?Paccess_token\$production\$[0-9a-z]{16}\$[0-9a-z]{32})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - access_token$production$ @@ -400,7 +400,7 @@ confidence: strong type: pattern values: - - (?Psk_live_[0-9a-z]{32})([^=0-9A-Za-z_+-]|$) + - (?Psk_live_[0-9a-z]{32})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - sk_live_ @@ -435,7 +435,7 @@ confidence: strong type: pattern values: - - (?Pshp(at|ca|pa|ss)_[a-fA-F0-9]{32})([^=0-9A-Za-z_+-]|$) + - (?Pshp(at|ca|pa|ss)_[a-fA-F0-9]{32})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern required_substrings: - shp @@ -446,7 +446,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pxox[a|b|p|r|o|s]\-[-a-zA-Z0-9]{10,250}) + - (?xox[a|b|p|r|o|s]\-[-a-zA-Z0-9]{10,250}) filter_type: GeneralPattern validations: - SlackTokenValidation @@ -470,7 +470,7 @@ confidence: strong type: pattern values: - - (?Psk_live_[0-9a-zA-Z]{24})([^=0-9A-Za-z_+-]|$) + - (?Psk_live_[0-9a-zA-Z]{24})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern validations: - StripeApiKeyValidation @@ -483,7 +483,7 @@ confidence: strong type: pattern values: - - (?Prk_live_[0-9a-zA-Z]{24})([^=0-9A-Za-z_+-]|$) + - (?Prk_live_[0-9a-zA-Z]{24})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - rk_live_ @@ -494,7 +494,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PEAAA[0-9A-Za-z_-]{60})([^=0-9A-Za-z_+-]|$) + - (?EAAA[0-9A-Za-z_-]{60})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern validations: - SquareAccessTokenValidation @@ -507,7 +507,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Psq0[a-z]{3}-[0-9A-Za-z_-]{22})([^=0-9A-Za-z_+-]|$) + - (?sq0[a-z]{3}-[0-9A-Za-z_-]{22})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern validations: - SquareClientIdValidation @@ -520,7 +520,7 @@ confidence: strong type: pattern values: - - (?Psq0csp-[0-9A-Za-z_-]{43})([^=0-9A-Za-z_+-]|$) + - (?Psq0csp-[0-9A-Za-z_-]{43})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - sq0csp @@ -544,7 +544,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PSK[0-9a-fA-F]{32})([^=0-9A-Za-z_+-]|$) + - (?SK[0-9a-fA-F]{32})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - SK @@ -594,7 +594,7 @@ confidence: moderate type: pattern values: - - (?P[0-9]{8,10}:[0-9A-Za-z_-]{35})([^=0-9A-Za-z_+-]|$) + - (?P[0-9]{8,10}:[0-9A-Za-z_-]{35})(?![=0-9A-Za-z_+-]) filter_type: GeneralPattern required_substrings: - :AA @@ -616,7 +616,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pgh[pousr]_[0-9A-Za-z_]{36,255}) + - (?gh[pousr]_[0-9A-Za-z_]{36,255}) filter_type: - ValueGitHubCheck validations: @@ -634,7 +634,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pgithub_pat_[0-9A-Za-z_]{80,255}) + - (?github_pat_[0-9A-Za-z_]{80,255}) filter_type: GeneralPattern validations: - GithubTokenValidation @@ -647,7 +647,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_])(?P[a-z0-9.-]+\.firebaseio\.com|[a-z0-9.-]+\.firebaseapp\.com) + - (?[a-z0-9.-]+\.firebaseio\.com|[a-z0-9.-]+\.firebaseapp\.com) filter_type: GeneralPattern required_substrings: - .firebase @@ -658,7 +658,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_])(?P[a-z0-9.-]{3,63}\.s3\.amazonaws\.com|[a-z0-9.-]{3,63}\.s3-website[.-](eu|ap|us|ca|sa|cn)) + - (?[a-z0-9.-]{3,63}\.s3\.amazonaws\.com|[a-z0-9.-]{3,63}\.s3-website[.-](eu|ap|us|ca|sa|cn)) filter_type: GeneralPattern required_substrings: - .s3-website @@ -709,7 +709,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?P(cmVmdGtuO[0-9A-Za-z_-]{55}|AKCp[0-9A-Za-z_-]{69}))([^=0-9A-Za-z_+-]|$) + - (?(cmVmdGtuO[0-9A-Za-z_-]{55}|AKCp[0-9A-Za-z_-]{69}))(?![=0-9A-Za-z_+-]) filter_type: - ValueJfrogTokenCheck required_substrings: @@ -722,7 +722,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PeyJ[A-Za-z0-9_=-]{50,500}\.eyJ[A-Za-z0-9_=-]+\.[A-Za-z0-9_=-]+) + - (?eyJ[A-Za-z0-9_=-]{50,500}\.eyJ[A-Za-z0-9_=-]+\.[A-Za-z0-9_=-]+) filter_type: - ValueJsonWebTokenCheck required_substrings: @@ -734,7 +734,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?P[a-zA-Z0-9_~.-]{3}8Q~[a-zA-Z0-9_~.-]{34})([^=0-9A-Za-z_+-]|$) + - (?[a-zA-Z0-9_~.-]{3}8Q~[a-zA-Z0-9_~.-]{34})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 40 required_substrings: @@ -745,7 +745,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PATBB[A-Za-z0-9]{24}[A-F0-9]{8})([^=0-9A-Za-z_+-]|$) + - (?ATBB[A-Za-z0-9]{24}[A-F0-9]{8})(?![=0-9A-Za-z_+-]) filter_type: - ValueAtlassianTokenCheck min_line_len: 28 @@ -757,7 +757,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PATCTT3xFfGN0[a-zA-Z0-9-_]{171}=[A-F0-9]{8})([^=0-9A-Za-z_+-]|$) + - (?ATCTT3xFfGN0[a-zA-Z0-9-_]{171}=[A-F0-9]{8})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 183 required_substrings: @@ -768,7 +768,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PBBDC-[NMO][ADgjQTwz][A-Za-z0-9+/]{42})([^=0-9A-Za-z_+-]|$) + - (?BBDC-[NMO][ADgjQTwz][A-Za-z0-9+/]{42})(?![=0-9A-Za-z_+-]) filter_type: - ValueAtlassianTokenCheck min_line_len: 49 @@ -780,7 +780,7 @@ confidence: weak type: pattern values: - - (^|[^.0-9A-Za-z_/+-])(?P[a-zA-Z0-9]{18}([a-zA-Z0-9]{14})?)([^0-9A-Za-z.$_/+-]|$) + - (?[a-zA-Z0-9]{18}([a-zA-Z0-9]{14})?)(?![0-9A-Za-z.$_/+-]) filter_type: WeirdBase64Token min_line_len: 18 required_regex: "[a-zA-Z0-9_/+-]{15,80}" @@ -790,7 +790,7 @@ confidence: weak type: pattern values: - - (^|[^.0-9A-Za-z_/+-])(?P([a-zA-Z0-9_-]{32}){1,2})([^0-9A-Za-z.$_/+-]|$) + - (?([a-zA-Z0-9_-]{32}){1,2})(?![0-9A-Za-z.$_/+-]) filter_type: WeirdBase64Token min_line_len: 32 required_regex: "[a-zA-Z0-9_/+-]{15,80}" @@ -800,7 +800,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_/+-])(?P[NMO][ADgjQTwz][a-zA-Z0-9+/]{42})([^=0-9A-Za-z_+-]|$) + - (?[NMO][ADgjQTwz][a-zA-Z0-9+/]{42})(?![=0-9A-Za-z_+-]) filter_type: - ValueAtlassianTokenCheck min_line_len: 44 @@ -815,7 +815,7 @@ confidence: weak type: pattern values: - - (^|[^.0-9A-Za-z_/+-])(?P[a-zA-Z0-9]{24})([^=0-9A-Za-z.$_/+-]|$) + - (?[a-zA-Z0-9]{24})(?![=0-9A-Za-z.$_/+-]) filter_type: WeirdBase64Token min_line_len: 24 required_regex: "[a-zA-Z0-9_/+-]{15,80}" @@ -825,7 +825,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PATATT3xFfGF0[a-zA-Z0-9-_]{171}=[A-F0-9]{8})([^=0-9A-Za-z_+-]|$) + - (?ATATT3xFfGF0[a-zA-Z0-9-_]{171}=[A-F0-9]{8})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 191 required_substrings: @@ -836,7 +836,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pdo[op]_v1_[a-f0-9]{64})([^=0-9A-Za-z_+-]|$) + - (?do[op]_v1_[a-f0-9]{64})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 71 required_substrings: @@ -848,7 +848,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Psl.[A-Za-z0-9_-]{135})([^=0-9A-Za-z_+-]|$) + - (?sl.[A-Za-z0-9_-]{135})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 138 required_substrings: @@ -859,7 +859,7 @@ confidence: moderate type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Poy2[a-z0-9]{43})([^=0-9A-Za-z_+-]|$) + - (?oy2[a-z0-9]{43})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 46 required_substrings: @@ -870,7 +870,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pglpat-[a-zA-Z0-9_-]{20})([^=0-9A-Za-z_+-]|$) + - (?glpat-[a-zA-Z0-9_-]{20})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 26 required_substrings: @@ -881,7 +881,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pglptt-[a-f0-9]{40})([^=0-9A-Za-z_+-]|$) + - (?glptt-[a-f0-9]{40})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 46 required_substrings: @@ -892,7 +892,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PGR1348941[a-zA-Z0-9_-]{20})([^=0-9A-Za-z_+-]|$) + - (?GR1348941[a-zA-Z0-9_-]{20})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 29 required_substrings: @@ -903,7 +903,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pglrt-[a-zA-Z0-9_-]{20})([^=0-9A-Za-z_+-]|$) + - (?glrt-[a-zA-Z0-9_-]{20})(?![=0-9A-Za-z_+-]) filter_type: TokenPattern min_line_len: 25 required_substrings: @@ -914,7 +914,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?PeyJ[a-zA-Z0-9=/-]{64,360})([^=0-9A-Za-z_+-]|$) + - (?eyJ[a-zA-Z0-9=/-]{64,360})(?![=0-9A-Za-z_+-]) filter_type: - ValueGrafanaCheck min_line_len: 67 @@ -926,7 +926,7 @@ confidence: strong type: pattern values: - - (^|[^0-9A-Za-z_+-])(?Pglc_eyJ[a-zA-Z0-9=/-]{80,360})([^=0-9A-Za-z_+-]|$) + - (?glc_eyJ[a-zA-Z0-9=/-]{80,360})(?![=0-9A-Za-z_+-]) filter_type: - ValueGrafanaCheck min_line_len: 87 @@ -938,7 +938,7 @@ confidence: weak type: pattern values: - - (^|[^0-9A-Za-z_+-])(?=[A-Za-z0-9]{64})(?P[A-Za-z0-9]{10,12}[B-Za-z0-9]A{10,12}[B-Za-z0-9][A-Za-z0-9]{40,44})([^=0-9A-Za-z_+-]|$) + - (?[A-Za-z0-9]{10,12}[B-Za-z0-9]A{10,12}[B-Za-z0-9][A-Za-z0-9]{40,44})(?![=0-9A-Za-z_+-]) filter_type: [] min_line_len: 43 required_substrings: @@ -949,7 +949,7 @@ confidence: weak type: pattern values: - - (^|[^.0-9A-Za-z_/+-])(?P[a-z0-9]{15})([^=0-9A-Za-z_/+-]|$) + - (?[a-z0-9]{15})(?![=0-9A-Za-z_/+-]) filter_type: WeirdBase36Token min_line_len: 15 required_regex: "[a-zA-Z0-9_/+-]{15,80}" @@ -959,7 +959,7 @@ confidence: weak type: pattern values: - - (^|[^.0-9A-Za-z_/+-])(?P[a-z0-9]{24,25})([^=0-9A-Za-z_/+-]|$) + - (?[a-z0-9]{24,25})(?![=0-9A-Za-z_/+-]) filter_type: WeirdBase36Token min_line_len: 24 required_regex: "[a-zA-Z0-9_/+-]{15,80}" @@ -969,7 +969,7 @@ confidence: weak type: pattern values: - - (^|[^.0-9A-Za-z_/+-])(?P[a-zA-Z0-9_-]{20})([^=0-9A-Za-z_/+-]|$) + - (?[a-zA-Z0-9_-]{20})(?![=0-9A-Za-z_/+-]) filter_type: WeirdBase64Token min_line_len: 20 required_regex: "[a-zA-Z0-9_/+-]{15,80}" @@ -979,7 +979,7 @@ confidence: weak type: pattern values: - - (^|[^.0-9A-Za-z_/+-])(?P[A-Z2-7]{16})([^=0-9A-Za-z_/+-]|$) + - (?[A-Z2-7]{16})(?![=0-9A-Za-z_/+-]) filter_type: - ValueCoupleKeywordCheck - ValuePatternCheck diff --git a/experiment/README.md b/experiment/README.md index dc2444281..6d6662aee 100644 --- a/experiment/README.md +++ b/experiment/README.md @@ -48,7 +48,10 @@ Example: python main.py --data /home/user/datasets/CredData -j 16 ``` -- Resulting model will be saved to `results/ml_model_at-.h5`. You -now can copy this model to the `credsweeper/ml_model/ml_model.h5` +- Resulting model will be saved to `results/ml_model_at-`. +You now can convert the model to onnx: +```bash +python -m tf2onnx.convert --saved-model results/ml_model_at-20240225_111951 --output ../credsweeper/ml_model/ml_model.onnx --verbose +``` diff --git a/experiment/tf2onnx/requirements.txt b/experiment/tf2onnx/requirements.txt deleted file mode 100644 index e83f5bc8f..000000000 --- a/experiment/tf2onnx/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -h5py==3.10.0 -keras==2.13.1 -numpy==1.23.5 -onnx==1.15.0 -protobuf==3.20.3 -tensorflow==2.13.1 -tf2onnx==1.16.0 -wrapt==1.14.1 diff --git a/tests/__init__.py b/tests/__init__.py index 847c9b395..dff01b27f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -4,14 +4,14 @@ SAMPLES_FILES_COUNT: int = 124 # credentials count after scan -SAMPLES_CRED_COUNT: int = 395 -SAMPLES_CRED_LINE_COUNT: int = 412 +SAMPLES_CRED_COUNT: int = 401 +SAMPLES_CRED_LINE_COUNT: int = 418 # credentials count after post-processing -SAMPLES_POST_CRED_COUNT: int = 356 +SAMPLES_POST_CRED_COUNT: int = 362 # with option --doc -SAMPLES_IN_DOC = 397 +SAMPLES_IN_DOC = 398 # archived credentials that are not found without --depth SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 18 diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 66a863b4b..ae7cb4c38 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -268,13 +268,38 @@ "confidence": "moderate", "line_data_list": [ { - "line": "\"AwsAccessKey\": \"AKIAGIREOGIAWSKEY123\",", - "line_num": 2, - "path": "tests/samples/aws_key.groovy", - "info": "tests/samples/aws_key.groovy|RAW", + "line": "The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X", + "line_num": 1, + "path": "tests/samples/aws_client_id", + "info": "tests/samples/aws_client_id|RAW", "value": "AKIAGIREOGIAWSKEY123", - "value_start": 17, - "value_end": 37, + "value_start": 14, + "value_end": 34, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.5464393446710156, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "AWS Client ID", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X", + "line_num": 1, + "path": "tests/samples/aws_client_id", + "info": "tests/samples/aws_client_id|RAW", + "value": "AKIAGIREOGIAWSKEY45X", + "value_start": 35, + "value_end": 55, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", @@ -7494,6 +7519,56 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "100.64.0.0/10", + "line_num": 13, + "path": "tests/samples/ipv4", + "info": "tests/samples/ipv4|RAW", + "value": "100.64.0.0", + "value_start": 0, + "value_end": 10, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.5253496664211537, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "100.64.0.0\u2013100.127.255.255", + "line_num": 14, + "path": "tests/samples/ipv4", + "info": "tests/samples/ipv4|RAW", + "value": "100.127.255.255", + "value_start": 11, + "value_end": 26, + "variable": null, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 2.008519976342584, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -7519,6 +7594,56 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.0.0.0\u2013192.0.0.255", + "line_num": 22, + "path": "tests/samples/ipv4", + "info": "tests/samples/ipv4|RAW", + "value": "192.0.0.255", + "value_start": 10, + "value_end": 21, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.9704957226453073, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.88.99.0/24", + "line_num": 25, + "path": "tests/samples/ipv4", + "info": "tests/samples/ipv4|RAW", + "value": "192.88.99.0", + "value_start": 0, + "value_end": 11, + "variable": null, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 1.9018695860849921, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -7544,6 +7669,31 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.88.99.0\u2013192.88.99.255", + "line_num": 26, + "path": "tests/samples/ipv4", + "info": "tests/samples/ipv4|RAW", + "value": "192.88.99.255", + "value_start": 12, + "value_end": 25, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.019193052249804, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -10637,19 +10787,19 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.91551, + "ml_probability": 0.63953, "rule": "URL Credentials", "severity": "high", "confidence": "moderate", "line_data_list": [ { - "line": "connection_url: 'dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local',", + "line": "const connection_url = require('dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local');", "line_num": 1, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "5WdF4f2jE76a", - "value_start": 40, - "value_end": 52, + "value_start": 55, + "value_end": 67, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", @@ -10684,4 +10834,4 @@ } ] } -] +] \ No newline at end of file diff --git a/tests/data/doc.json b/tests/data/doc.json index a7c4b35d2..17b07e1af 100644 --- a/tests/data/doc.json +++ b/tests/data/doc.json @@ -83,13 +83,38 @@ "confidence": "moderate", "line_data_list": [ { - "line": "\"AwsAccessKey\": \"AKIAGIREOGIAWSKEY123\",", - "line_num": 2, - "path": "tests/samples/aws_key.groovy", - "info": "tests/samples/aws_key.groovy|RAW", + "line": "The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X", + "line_num": 1, + "path": "tests/samples/aws_client_id", + "info": "tests/samples/aws_client_id|RAW", "value": "AKIAGIREOGIAWSKEY123", - "value_start": 17, - "value_end": 37, + "value_start": 14, + "value_end": 34, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.5464393446710156, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "AWS Client ID", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X", + "line_num": 1, + "path": "tests/samples/aws_client_id", + "info": "tests/samples/aws_client_id|RAW", + "value": "AKIAGIREOGIAWSKEY45X", + "value_start": 35, + "value_end": 55, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", @@ -11589,4 +11614,4 @@ } ] } -] +] \ No newline at end of file diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index e15fdf115..5c3ed52bd 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -183,13 +183,38 @@ "confidence": "moderate", "line_data_list": [ { - "line": "\"AwsAccessKey\": \"AKIAGIREOGIAWSKEY123\",", - "line_num": 2, - "path": "tests/samples/aws_key.groovy", + "line": "The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X", + "line_num": 1, + "path": "tests/samples/aws_client_id", "info": "", "value": "AKIAGIREOGIAWSKEY123", - "value_start": 17, - "value_end": 37, + "value_start": 14, + "value_end": 34, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.5464393446710156, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "AWS Client ID", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X", + "line_num": 1, + "path": "tests/samples/aws_client_id", + "info": "", + "value": "AKIAGIREOGIAWSKEY45X", + "value_start": 35, + "value_end": 55, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", @@ -8084,6 +8109,56 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "100.64.0.0/10", + "line_num": 13, + "path": "tests/samples/ipv4", + "info": "", + "value": "100.64.0.0", + "value_start": 0, + "value_end": 10, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.5253496664211537, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "100.64.0.0\u2013100.127.255.255", + "line_num": 14, + "path": "tests/samples/ipv4", + "info": "", + "value": "100.127.255.255", + "value_start": 11, + "value_end": 26, + "variable": null, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 2.008519976342584, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -8109,6 +8184,56 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.0.0.0\u2013192.0.0.255", + "line_num": 22, + "path": "tests/samples/ipv4", + "info": "", + "value": "192.0.0.255", + "value_start": 10, + "value_end": 21, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.9704957226453073, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.88.99.0/24", + "line_num": 25, + "path": "tests/samples/ipv4", + "info": "", + "value": "192.88.99.0", + "value_start": 0, + "value_end": 11, + "variable": null, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 1.9018695860849921, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -8134,6 +8259,31 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.88.99.0\u2013192.88.99.255", + "line_num": 26, + "path": "tests/samples/ipv4", + "info": "", + "value": "192.88.99.255", + "value_start": 12, + "value_end": 25, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.019193052249804, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -10032,19 +10182,19 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.91551, + "ml_probability": 0.63953, "rule": "URL Credentials", "severity": "high", "confidence": "moderate", "line_data_list": [ { - "line": "connection_url: 'dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local',", + "line": "const connection_url = require('dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local');", "line_num": 1, "path": "tests/samples/url_cred.js", "info": "", "value": "5WdF4f2jE76a", - "value_start": 40, - "value_end": 52, + "value_start": 55, + "value_end": 67, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", @@ -10129,4 +10279,4 @@ } ] } -] +] \ No newline at end of file diff --git a/tests/data/output.json b/tests/data/output.json index 39188b486..d2f5d8093 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -183,13 +183,38 @@ "confidence": "moderate", "line_data_list": [ { - "line": "\"AwsAccessKey\": \"AKIAGIREOGIAWSKEY123\",", - "line_num": 2, - "path": "tests/samples/aws_key.groovy", + "line": "The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X", + "line_num": 1, + "path": "tests/samples/aws_client_id", "info": "", "value": "AKIAGIREOGIAWSKEY123", - "value_start": 17, - "value_end": 37, + "value_start": 14, + "value_end": 34, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.5464393446710156, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "AWS Client ID", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X", + "line_num": 1, + "path": "tests/samples/aws_client_id", + "info": "", + "value": "AKIAGIREOGIAWSKEY45X", + "value_start": 35, + "value_end": 55, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", @@ -7309,6 +7334,56 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "100.64.0.0/10", + "line_num": 13, + "path": "tests/samples/ipv4", + "info": "", + "value": "100.64.0.0", + "value_start": 0, + "value_end": 10, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.5253496664211537, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "100.64.0.0\u2013100.127.255.255", + "line_num": 14, + "path": "tests/samples/ipv4", + "info": "", + "value": "100.127.255.255", + "value_start": 11, + "value_end": 26, + "variable": null, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 2.008519976342584, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -7334,6 +7409,56 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.0.0.0\u2013192.0.0.255", + "line_num": 22, + "path": "tests/samples/ipv4", + "info": "", + "value": "192.0.0.255", + "value_start": 10, + "value_end": 21, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.9704957226453073, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.88.99.0/24", + "line_num": 25, + "path": "tests/samples/ipv4", + "info": "", + "value": "192.88.99.0", + "value_start": 0, + "value_end": 11, + "variable": null, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 1.9018695860849921, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -7359,6 +7484,31 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "IPv4", + "severity": "info", + "confidence": "weak", + "line_data_list": [ + { + "line": "192.88.99.0\u2013192.88.99.255", + "line_num": 26, + "path": "tests/samples/ipv4", + "info": "", + "value": "192.88.99.255", + "value_start": 12, + "value_end": 25, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.019193052249804, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -9107,19 +9257,19 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.91551, + "ml_probability": 0.63953, "rule": "URL Credentials", "severity": "high", "confidence": "moderate", "line_data_list": [ { - "line": "connection_url: 'dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local',", + "line": "const connection_url = require('dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local');", "line_num": 1, "path": "tests/samples/url_cred.js", "info": "", "value": "5WdF4f2jE76a", - "value_start": 40, - "value_end": 52, + "value_start": 55, + "value_end": 67, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", @@ -9154,4 +9304,4 @@ } ] } -] +] \ No newline at end of file diff --git a/tests/samples/aws_client_id b/tests/samples/aws_client_id new file mode 100644 index 000000000..3685378f5 --- /dev/null +++ b/tests/samples/aws_client_id @@ -0,0 +1,2 @@ +The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X +the coma is necessary there ^ bariers thesting !!! diff --git a/tests/samples/aws_key.groovy b/tests/samples/aws_key.groovy deleted file mode 100644 index 2c6fbb235..000000000 --- a/tests/samples/aws_key.groovy +++ /dev/null @@ -1,2 +0,0 @@ - -"AwsAccessKey": "AKIAGIREOGIAWSKEY123", diff --git a/tests/samples/url_cred.js b/tests/samples/url_cred.js index b965de236..cdd81f59f 100644 --- a/tests/samples/url_cred.js +++ b/tests/samples/url_cred.js @@ -1,4 +1,5 @@ -connection_url: 'dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local', +const connection_url = require('dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local'); + // note:dummyuser@example.com // "fp://no.host.real/any/path/to/nowhere/","key":"f45VgF8jX79o@anydata.com" diff --git a/tests/test_main.py b/tests/test_main.py index 0bab77b2e..42f2776ad 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -697,6 +697,8 @@ def test_doc_n(self) -> None: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # def test_data_p(self) -> None: + # the test modifies data/xxx.json with actual result - it discloses impact of changes obviously + # use git diff to review the changes def prepare(report: List[Dict[str, Any]]): for x in report: @@ -734,8 +736,8 @@ def prepare(report: List[Dict[str, Any]]): # instead the config file is used with tempfile.TemporaryDirectory() as tmp_dir: for cfg in DATA_TEST_CFG: - with open(TESTS_PATH / "data" / cfg["json_filename"], "r") as f: - expected_result = json.load(f) + expected_report = TESTS_PATH / "data" / cfg["json_filename"] + expected_result = Util.json_load(expected_report) # informative parameter, relative with other tests counters. CredSweeper does not know it and fails cred_count = cfg.pop("__cred_count") prepare(expected_result) @@ -747,82 +749,23 @@ def prepare(report: List[Dict[str, Any]]): cfg["json_filename"] = str(tmp_file) cred_sweeper = CredSweeper(**cfg) cred_sweeper.run(content_provider=content_provider) - with open(tmp_file, "r") as f: - test_result = json.load(f) + test_result = Util.json_load(tmp_file) prepare(test_result) + # use the same dump as in output + Util.json_dump(test_result, tmp_file) diff = deepdiff.DeepDiff(test_result, expected_result) if diff: # prints produced report to compare with present data in tests/data - print(f"\nThe produced report for {cfg['json_filename']}:\n{json.dumps(test_result)}", flush=True) + print(f"Review updated {cfg['json_filename']} with git.", flush=True) + shutil.copy(tmp_file, expected_report) + # first run fails with the diff but next run will pass self.assertDictEqual(diff, {}, cfg) + # only count of items must be corrected manually self.assertEqual(cred_count, len(expected_result), cfg["json_filename"]) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # - @pytest.mark.skipif(not os.getenv("BRUTEFORCEMAXEXTENSION4ML"), - reason="run the test only for renaming samples with maximal ml_probability") - def test_samples_ml_p(self) -> None: - extensions = [ - "", ".admx", ".adoc", ".api", ".asciidoc", ".backup", ".bash", ".bat", ".bats", ".bazel", ".build", - ".bundle", ".bzl", ".c", ".cc", ".cf", ".cfg", ".clj", ".cljc", ".cls", ".cmd", ".cnf", ".coffee", ".conf", - ".config", ".Config", ".cpp", ".creds", ".crlf", ".crt", ".cs", ".cshtml", ".csp", ".csproj", ".css", - ".csv", ".dart", ".deprecated", ".development", ".diff", ".dist", ".doc", ".dockerfile", ".dot", ".dwl", - ".eex", ".ejs", ".env", ".erb", ".erl", ".ex", ".example", ".exs", ".ext", ".fsproj", ".g4", ".gd", ".gml", - ".gni", ".go", ".golden", ".gradle", ".graphql", ".graphqls", ".groovy", ".h", ".haml", ".hbs", ".hs", - ".idl", ".iml", ".in", ".inc", ".ini", ".init", ".ipynb", ".j", ".j2", ".java", ".Jenkinsfile", ".jinja2", - ".js", ".jsp", ".jsx", ".jwt", ".key", ".kt", ".l", ".las", ".lasso", ".lasso9", ".ldif", ".ldiff", ".ldml", - ".leex", ".less", ".LESSER", ".libsonnet", ".list", ".lkml", ".lock", ".log", ".lua", ".m", ".manifest", - ".map", ".markdown", ".markerb", ".marko", ".md", ".mdx", ".MF", ".mjml", ".mjs", ".mk", ".ml", ".mlir", - ".mod", ".moo", ".mqh", ".msg", ".mst", ".mysql", ".nb", ".ndjson", ".nix", ".nolint", ".odd", ".oracle", - ".p8", ".pan", ".patch", ".pbxproj", ".pem", ".php", ".pl", ".PL", ".plugin", ".pm", ".po", ".pod", ".pony", - ".postinst", ".pp", ".ppk", ".private", ".proj", ".properties", ".proto", ".ps1", ".ps1xml", ".psm1", - ".pug", ".purs", ".pxd", ".pyi", ".pyp", ".python", ".pyx", ".R", ".rake", ".rb", ".re", ".red", ".release", - ".response", ".resx", ".rexx", ".rnh", ".rno", ".rrc", ".rs", ".rsc", ".rsp", ".rst", ".rules", ".sample", - ".sbt", ".scala", ".scss", ".secrets", ".service", ".sh", ".slim", ".smali", ".snap", ".spec", ".spin", - ".sql", ".sqlite3", ".srt", ".storyboard", ".strings", ".stub", ".sublime - keymap", ".sum", ".svg", - ".swift", ".t", ".td", ".test", ".testsettings", ".tf", ".tfstate", ".tfvars", ".tl", ".tmpl", ".token", - ".toml", ".tpl", ".travis", ".ts", ".tsx", ".ttar", ".txt", ".user", ".utf8", ".vsixmanifest", ".vsmdi", - ".vue", ".xaml", ".xcscheme", ".xib", ".xsl", ".yara", ".yml", ".zsh", ".zsh - theme", ".1" - # , ".template" - ] - cred_sweeper = CredSweeper() - for __, _, filenames in os.walk(SAMPLES_PATH): - self.assertEqual(SAMPLES_FILES_COUNT, len(filenames)) - for filename in filenames: - file_path = SAMPLES_PATH / filename - if file_path.suffix in [ - ".patch", ".xml", ".bz2", ".docx", ".apk", ".zip", ".gz", ".pdf", ".py", ".json", ".html", - ".yaml", ".jks", ".template" - ]: - continue - data = file_path.read_bytes() - stat: Dict[str, List[Candidate]] = {} - for extension in extensions: - cred_sweeper.credential_manager.candidates.clear() - provider = TextContentProvider(file_path=(f"dummy{extension}", io.BytesIO(data))) - candidates = cred_sweeper.file_scan(provider) - cred_sweeper.credential_manager.set_credentials(candidates) - cred_sweeper.post_processing() - post_credentials = cred_sweeper.credential_manager.get_credentials() - if post_credentials: - stat[extension] = copy.deepcopy(post_credentials) - max_ml = 0 - max_ext = "" - for ext_key, creds in stat.items(): - for cred in creds: - if cred.ml_probability and max_ml < cred.ml_probability: - max_ml = cred.ml_probability - max_ext = ext_key - if max_ml: - print(max_ext, max_ml) - shutil.move(file_path, SAMPLES_PATH / f"{file_path.stem}{max_ext}") - else: - shutil.move(file_path, SAMPLES_PATH / f"{file_path.stem}") - del stat - - # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # - def test_param_p(self) -> None: # internal parametrized tests to keep items = [(" STP_PASSWORD=qbgomdtpqch \\", "qbgomdtpqch")]