diff --git a/benchmark.txt b/benchmark.txt deleted file mode 100644 index 8f3d489e9..000000000 --- a/benchmark.txt +++ /dev/null @@ -1,269 +0,0 @@ -META MD5 877d5780a3115b42628c7fe43c869801 -DATA MD5 b3a2698b63448efee7ab94ffd4d11814 -DATA: 16345157 interested lines. MARKUP: 62634 items -FileType FileNumber ValidLines Positives Negatives Templates ---------------- ------------ ------------ ----------- ----------- ----------- - 194 28318 66 414 85 -.1 2 641 2 5 -.admx 1 26 1 -.adoc 1 158 13 6 1 -.api 2 118 4 -.asciidoc 96 14471 50 347 27 -.axaml 5 286 5 -.backup 1 62 2 1 -.bash 2 2158 2 1 -.bat 4 233 14 2 -.bats 15 2804 14 49 9 -.bazel 3 424 8 -.build 2 40 3 -.bundle 4 1512 580 -.bzl 3 2503 11 -.c 179 284009 8 942 5 -.cc 29 30562 617 1 -.cf 3 126 2 1 -.cfg 1 385 1 1 -.cjs 1 725 3 6 -.clj 2 133 3 -.cljc 5 2421 11 -.cls 1 657 1 -.cmd 4 401 2 3 -.cnf 8 858 15 34 18 -.coffee 1 585 2 -.conf 60 4945 53 67 53 -.config 20 492 16 38 1 -.cpp 15 5688 2 61 -.creds 1 10 1 1 -.crlf 1 27 1 -.crt 2 4979 211 -.cs 268 79532 158 894 94 -.cshtml 5 180 12 -.csp 3 379 9 -.csproj 1 14 1 -.css 6 13564 10 -.csv 1 109 78 -.dart 2 22 2 -.deprecated 1 126 1 -.development 1 5 1 -.diff 2 2460 8 2 -.dist 5 257 7 13 -.doc 1 2489 3 -.dockerfile 1 19 1 -.dot 1 160 6 -.eex 4 74 8 -.ejs 1 13 1 -.env 10 136 11 3 17 -.erb 13 323 27 -.erl 4 96 7 -.ex 25 4968 5 98 5 -.example 17 1838 74 38 54 -.exs 24 4842 8 187 4 -.ext 5 211 1 4 2 -.fsproj 1 75 1 2 -.g4 2 201 2 -.gd 1 37 1 -.gml 3 3075 16 -.gni 3 5017 19 -.go 1080 566476 693 4114 739 -.golden 5 1168 1 13 29 -.gradle 45 3265 4 90 100 -.graphql 7 420 13 -.graphqls 1 30 1 -.groovy 22 4986 24 215 1 -.h 9 1958 36 -.haml 9 191 17 -.hbs 2 54 3 -.hs 14 4140 30 61 5 -.html 53 15327 22 110 18 -.idl 2 777 1 4 -.iml 6 699 30 -.in 6 2130 6 43 10 -.inc 2 56 2 1 -.ini 11 1437 25 12 18 -.ipynb 1 134 5 -.j 1 241 2 2 -.j2 30 5530 6 186 10 -.java 621 134132 360 1366 171 -.jenkinsfile 1 58 2 6 -.jinja2 1 64 2 -.js 659 536413 535 2489 330 -.json 850 13046270 1070 10897 140 -.jsp 13 3202 1 40 -.jsx 7 857 19 -.jwt 1 1 2 -.key 83 2737 70 14 -.kt 123 20774 67 379 3 -.l 1 982 1 -.las 1 6656 35 -.lasso 1 230 7 -.lasso9 1 164 5 -.ldif 2 286 20 -.ldiff 1 20 1 -.ldml 1 6656 35 -.leex 1 9 2 -.less 4 3023 12 -.libsonnet 2 210 1 11 -.list 2 15 2 -.lkml 1 43 1 -.lock 24 160912 142 -.log 2 199 38 52 -.lua 10 1924 37 3 -.m 16 13358 11 158 3 -.manifest 3 102 9 6 -.markdown 3 139 3 1 -.markerb 3 12 3 -.marko 1 21 2 -.md 674 149399 710 2336 624 -.mdx 3 549 7 -.mjml 1 18 1 -.mjs 22 4424 76 340 -.mk 1 5878 13 -.ml 1 1856 16 -.mlir 2 1596 19 -.mod 2 96 4 -.moo 1 1404 26 -.mqh 1 1023 2 -.msg 1 26644 1 1 -.mysql 1 36 2 -.ndjson 2 5006 69 237 2 -.nix 4 211 12 -.nolint 1 2 1 -.odd 1 1281 43 -.oracle 1 9 1 -.p8 4 64 4 -.pan 2 48 4 -.patch 4 109405 4 27 -.pbxproj 1 941 2 -.pem 48 1169 47 8 -.php 371 75710 128 1619 79 -.pl 16 14727 6 34 -.pm 3 744 7 -.po 3 2994 15 -.pod 9 1859 1 23 -.pony 1 83 4 -.postinst 2 354 4 15 -.pp 10 563 16 -.ppk 1 45 36 -.private 1 15 1 -.proj 1 85 5 -.properties 48 1621 52 27 33 -.proto 5 5768 2 49 -.ps1 16 8509 15 64 2 -.ps1xml 1 5022 1 -.pug 2 193 2 -.purs 1 69 4 -.pxd 1 150 5 2 -.py 890 291553 674 3290 728 -.pyi 4 1361 9 -.pyp 1 167 1 -.pyx 2 1094 23 -.r 4 62 6 3 1 -.rake 2 51 2 -.rb 860 131838 258 3311 613 -.re 1 31 1 -.red 1 159 1 -.release 1 13 4 -.response 1 26 2 -.resx 11 3519 310 -.rexx 1 92 3 -.rnh 1 1354 3 2 -.rno 1 7229 2 -.rrc 39 1404 281 -.rs 31 9855 2 233 11 -.rsc 1 691 1 -.rsp 16 7101 19 10 28 -.rst 86 33980 70 321 68 -.rules 1 6 2 -.sample 2 25 3 4 4 -.sbt 3 570 5 2 -.scala 40 5071 22 101 -.scss 16 8553 32 1 -.secrets 1 11 1 -.sh 143 21525 51 466 30 -.slim 1 153 1 2 -.smali 1 775 18 -.snap 3 1708 9 30 2 -.spec 2 332 2 -.spin 1 565 1 -.sql 27 6606 126 56 4 -.storyboard 20 1802 341 -.strings 20 1240 137 -.stub 3 84 6 -.sublime-keymap 1 3 1 -.sum 37 22854 283 -.svg 1 638 12 -.t 9 1767 24 43 14 -.td 2 14002 6 -.template 19 1633 4 35 11 -.test 2 24 25 4 -.testsettings 1 21 1 10 -.tf 21 1377 3 29 2 -.tfstate 4 307 22 11 4 -.tfvars 1 31 3 2 -.tl 2 2161 161 2 -.tmpl 5 336 3 9 -.token 1 1 3 -.toml 83 2379 53 105 156 -.tpl 1 43 1 -.travis 1 34 4 3 1 -.ts 583 106730 159 1800 201 -.tsx 54 7914 1 114 5 -.ttar 1 452 1 -.txt 440 78102 5301 6341 49 -.utf8 1 77 2 -.vsixmanifest 1 36 1 -.vsmdi 1 6 2 -.vue 50 8736 1 154 1 -.xaml 21 8103 162 -.xcscheme 1 109 6 -.xib 11 503 169 -.xml 9 689 9 -.xsl 1 311 1 -.yaml 137 19004 123 345 44 -.yml 418 36162 545 892 380 -.zsh 6 872 12 -.zsh-theme 1 97 1 -TOTAL: 10259 16345157 12147 50315 5114 -credsweeper result_cnt : 7753, lost_cnt : 0, true_cnt : 7531, false_cnt : 222 -Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------- ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- -API 128 3161 189 113 111 2 3348 17 0.000597 0.132812 0.994537 0.982301 0.867188 0.921162 -AWS Client ID 167 21 0 160 160 0 21 7 0.000000 0.041916 0.962766 1.000000 0.958084 0.978593 -AWS Multi 75 16 0 87 75 11 5 0 0.687500 0.000000 0.879121 0.872093 1.000000 0.931677 -AWS S3 Bucket 66 24 0 91 65 24 0 1 1.000000 0.015152 0.722222 0.730337 0.984848 0.838710 -Atlassian Old PAT token 27 308 3 12 3 8 303 24 0.025723 0.888889 0.905325 0.272727 0.111111 0.157895 -Auth 412 2726 76 378 359 19 2783 53 0.006781 0.128641 0.977598 0.949735 0.871359 0.908861 -Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194 -BASE64 Private Key 7 4 0 7 7 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333 -Bitbucket Client ID 143 2097 9 48 28 19 2087 115 0.009022 0.804196 0.940418 0.595745 0.195804 0.294737 -Bitbucket Client Secret 301 809 10 40 29 11 808 272 0.013431 0.903654 0.747321 0.725000 0.096346 0.170088 -Certificate 23 471 1 26 18 8 464 5 0.016949 0.217391 0.973737 0.692308 0.782609 0.734694 -Credential 95 420 74 92 92 0 494 3 0.000000 0.031579 0.994907 1.000000 0.968421 0.983957 -Docker Swarm Token 2 0 0 1 1 0 0 1 0.500000 0.500000 1.000000 0.500000 0.666667 -Dropbox App secret 64 139 1 46 35 10 130 29 0.071429 0.453125 0.808824 0.777778 0.546875 0.642202 -Facebook Access Token 0 1 0 0 0 1 0 0.000000 1.000000 -Firebase Domain 6 1 0 7 6 1 0 0 1.000000 0.000000 0.857143 0.857143 1.000000 0.923077 -Github Old Token 1 0 0 1 1 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Gitlab Feed Token 189 751 87 56 44 11 827 145 0.013126 0.767196 0.848101 0.800000 0.232804 0.360656 -Gitlab Incoming Email Token 37 8 0 21 19 2 6 18 0.250000 0.486486 0.555556 0.904762 0.513514 0.655172 -Google API Key 12 0 0 12 12 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Google Multi 10 2 0 11 10 1 1 0 0.500000 0.000000 0.916667 0.909091 1.000000 0.952381 -Google OAuth Access Token 3 0 0 3 3 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Grafana Provisioned API Key 22 1 0 5 5 0 1 17 0.000000 0.772727 0.260870 1.000000 0.227273 0.370370 -JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432 -Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 -Jira 2FA 15 6 1 12 12 0 7 3 0.000000 0.200000 0.863636 1.000000 0.800000 0.888889 -Key 3920 15689 482 472 466 6 16165 3454 0.000371 0.881122 0.827784 0.987288 0.118878 0.212204 -Nonce 91 49 0 83 81 2 47 10 0.040816 0.109890 0.914286 0.975904 0.890110 0.931034 -Other 0 8291 1 0 0 8292 0 0.000000 1.000000 -PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1844 7524 2713 1726 1657 69 10168 187 0.006740 0.101410 0.978810 0.960023 0.898590 0.928291 -Salt 45 76 2 42 41 1 77 4 0.012821 0.088889 0.959350 0.976190 0.911111 0.942529 -Secret 1296 1574 800 1236 1230 6 2368 66 0.002527 0.050926 0.980381 0.995146 0.949074 0.971564 -Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 -Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 648 4177 438 540 534 6 4609 114 0.001300 0.175926 0.977199 0.988889 0.824074 0.898990 -Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 -URL Credentials 209 144 225 196 196 0 369 13 0.000000 0.062201 0.977509 1.000000 0.937799 0.967901 -UUID 1069 265 0 1061 1060 1 264 9 0.003774 0.008419 0.992504 0.999057 0.991581 0.995305 - 12147 50315 5114 7760 7531 222 50093 4616 0.004412 0.380012 0.922545 0.971366 0.619988 0.756884 diff --git a/build.sh b/build.sh index 70acf02ea..d57c35575 100755 --- a/build.sh +++ b/build.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -e set -x @@ -10,17 +10,12 @@ if [ -z "${VIRTUAL_ENV}" ]; then echo "Virtual environment has been not activated" if ! [ -d "${THISDIR}/${VENVDIR}" ]; then echo "Create new virtual environment" - python3.8 -m virtualenv -v --copies "${THISDIR}/${VENVDIR}" + python3.10 -m virtualenv -v --copies "${THISDIR}/${VENVDIR}" fi fi - + if [ -z "${VIRTUAL_ENV}" ]; then . "${THISDIR}/${VENVDIR}/bin/activate" fi -if ! pip list | grep PyYAML; then - pip install PyYAML -fi - -python download_data.py --data_dir data - +python download_data.py --clean_data --jobs $(nproc) diff --git a/download_data.py b/download_data.py index b9ae8ec0e..fae7462e9 100644 --- a/download_data.py +++ b/download_data.py @@ -9,7 +9,7 @@ import string import subprocess import sys -from argparse import ArgumentParser, Namespace +from argparse import Namespace, ArgumentParser from multiprocessing import Pool from typing import List @@ -347,7 +347,42 @@ def get_obfuscated_value(value, meta_row: MetaRow): return obfuscated_value +def check_asc_or_desc(line_data_value: str) -> bool: + """ValuePatternCheck as example""" + count_asc = 1 + count_desc = 1 + for i in range(len(line_data_value) - 1): + if line_data_value[i] in string.ascii_letters + string.digits \ + and ord(line_data_value[i + 1]) - ord(line_data_value[i]) == 1: + count_asc += 1 + if 4 == count_asc: + return True + else: + count_asc = 1 + if line_data_value[i] in string.ascii_letters + string.digits \ + and ord(line_data_value[i]) - ord(line_data_value[i + 1]) == 1: + count_desc += 1 + if 4 == count_desc: + return True + else: + count_desc = 1 + continue + return False + def generate_value(value): + """Wrapper to skip obfuscation with false positive or negatives""" + pattern_keyword = re.compile(r"(api|pass|pw[d\b])", flags=re.IGNORECASE) + pattern_similar = re.compile(r"(\w)\1{3,}") + new_value = None + while new_value is None \ + or pattern_keyword.findall(new_value) \ + or pattern_similar.findall(new_value) \ + or check_asc_or_desc(new_value): + new_value = gen_random_value(value) + return new_value + + +def gen_random_value(value): obfuscated_value = "" digits_set = string.digits diff --git a/meta/2ba83c6a.csv b/meta/2ba83c6a.csv index c08a19c0f..db3eb2bcc 100644 --- a/meta/2ba83c6a.csv +++ b/meta/2ba83c6a.csv @@ -595,10 +595,10 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 24213,911dde03,GitHub,2ba83c6a,data/2ba83c6a/test/911dde03.txt,22033,22033,T,F,6,70,F,F,,,,,0.0,0,F,F,F,Key 24214,8a68cd28,GitHub,2ba83c6a,data/2ba83c6a/test/8a68cd28.txt,119,119,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key 24215,5e763eae,GitHub,2ba83c6a,data/2ba83c6a/test/5e763eae.txt,195,195,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key -24216,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,250,250,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key +24216,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,250,250,F,F,6,38,F,F,,,,,0.0,0,F,F,F,Key 24217,4acf8d32,GitHub,2ba83c6a,data/2ba83c6a/test/4acf8d32.txt,37,37,F,F,-1,-1,F,F,,,,,0.0,0,F,F,F,Key:Bitbucket Client Secret:Bitbucket Client ID 24218,556bad09,GitHub,2ba83c6a,data/2ba83c6a/test/556bad09.txt,525,525,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key -24219,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,160,160,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key +24219,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,160,160,F,F,6,38,F,F,,,,,0.0,0,F,F,F,Key 24220,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,166,166,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key 24221,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,172,172,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key 24222,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,178,178,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key @@ -923,8 +923,8 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 27342,eb0705d8,GitHub,2ba83c6a,data/2ba83c6a/other/eb0705d8.pod,460,460,T,F,23,39,F,F,Any,,,Secret,3.88,16,F,F,F,Key 138189,eb0705d8,GitHub,2ba83c6a,data/2ba83c6a/other/eb0705d8.pod,460,460,F,F,54,-1,F,F,,,,,0.0,0,F,F,F,Other 27386,dc676919,GitHub,2ba83c6a,data/2ba83c6a/test/dc676919.txt,1,1,F,F,32,37,F,F,,,,,0,0,F,F,F,Secret -27407,6cfa362d,GitHub,2ba83c6a,data/2ba83c6a/src/6cfa362d.cnf,344,344,Template,T,9,19,F,F,Any,,,Secret,2.31,10,F,F,F,Password:Secret -27425,23d50951,GitHub,2ba83c6a,data/2ba83c6a/src/23d50951.cnf,358,358,T,T,9,23,F,F,Any,,,Secret,2.52,14,F,F,F,Password:Secret +27407,6cfa362d,GitHub,2ba83c6a,data/2ba83c6a/src/6cfa362d.cnf,344,344,T,T,9,19,F,F,Any,,,Secret,2.31,10,F,F,F,Secret +27425,23d50951,GitHub,2ba83c6a,data/2ba83c6a/src/23d50951.cnf,358,358,F,T,9,23,F,F,Any,,,Secret,2.52,14,F,F,F,Password:Secret 27489,a3cf7bc8,GitHub,2ba83c6a,data/2ba83c6a/test/a3cf7bc8.txt,298,298,F,F,6,24,F,F,,,,,0.0,0,F,F,F,Key 27490,a3cf7bc8,GitHub,2ba83c6a,data/2ba83c6a/test/a3cf7bc8.txt,282,282,F,F,6,14,F,F,,,,,0.0,0,F,F,F,Key 27491,b40503ed,GitHub,2ba83c6a,data/2ba83c6a/test/b40503ed.txt,154,154,F,F,6,11,F,F,,,,,0.0,0,F,F,F,Key diff --git a/meta/60f9915d.csv b/meta/60f9915d.csv index 28645ae95..f0f7d2a97 100644 --- a/meta/60f9915d.csv +++ b/meta/60f9915d.csv @@ -62,7 +62,7 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 32158,a414e92e,GitHub,60f9915d,data/60f9915d/test/a414e92e.go,300,300,T,T,67,74,T,F,CharsOnly,,,Secret,2.81,7,F,F,F,URL Credentials 32159,7e47b56e,GitHub,60f9915d,data/60f9915d/test/7e47b56e.go,14,14,T,T,79,86,T,F,CharsOnly,,,Secret,2.81,7,F,F,F,URL Credentials 32162,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,206,206,T,T,49,56,T,F,CharsOnly,,,Secret,2.81,7,F,F,F,URL Credentials -32163,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,219,219,T,T,54,70,T,F,Any,,,Secret,3.25,16,F,F,F,URL Credentials +32163,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,219,219,F,T,,,T,F,Any,,,Secret,3.25,16,F,F,F,URL Credentials:Password 35875,e45e45ba,GitHub,60f9915d,data/60f9915d/src/e45e45ba.yml,48,48,T,T,25,32,F,F,CharsOnly,,,Secret,2.81,7,F,F,F,Password 46090,35a99f76,GitHub,60f9915d,data/60f9915d/test/35a99f76.go,79,79,Template,T,18,24,F,F,CharsOnly,,,Secret,2.25,6,F,F,F,Password 52007,3d9a9f38,GitHub,60f9915d,data/60f9915d/test/3d9a9f38.go,639,639,T,F,53,60,F,F,,,,,0.0,0,F,F,F,URL Credentials @@ -122,4 +122,3 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 137948,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,161,161,T,F,86,93,F,F,,,,,0.0,0,F,F,F,Password:URL Credentials 138239,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,161,161,F,F,65,-1,F,F,,,,,0.0,0,F,F,F,Other 138240,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,161,161,F,F,104,-1,F,F,,,,,0.0,0,F,F,F,Other -138241,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,219,219,F,F,81,85,F,F,,,,,0.0,0,F,F,F,Password diff --git a/meta/69d49010.csv b/meta/69d49010.csv index fc79a94e2..fdab5f4c4 100644 --- a/meta/69d49010.csv +++ b/meta/69d49010.csv @@ -163,3 +163,9 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 133411,24cbbb32,GitHub,69d49010,data/69d49010/test/24cbbb32.py,680,680,F,F,,,F,F,,,,,0,0,F,F,F,Secret 135283,85d52436,GitHub,69d49010,data/69d49010/test/85d52436.py,279,279,T,F,20,33,F,F,,,,,0.0,0,F,F,F,Password:URL Credentials 1341461,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,438,438,T,F,17,53,F,F,,,,,0.0,0,F,F,F,UUID:Token +1479347,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,258,258,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth +1479349,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,277,277,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth +1479352,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,303,303,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth +1479354,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,329,329,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth +1479356,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,356,356,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth +1479361,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,433,433,T,F,40,51,F,F,,,,,0.0,0,F,F,F,Auth diff --git a/meta/75e7c64d.csv b/meta/75e7c64d.csv index 81eed77ee..08be620db 100644 --- a/meta/75e7c64d.csv +++ b/meta/75e7c64d.csv @@ -47,3 +47,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 114451,cb047af3,GitHub,75e7c64d,data/75e7c64d/src/cb047af3.yml,39,39,F,F,-1,-1,F,F,,,,,0.0,-1,F,F,F,Key 114452,2a39f30f,GitHub,75e7c64d,data/75e7c64d/src/2a39f30f.yml,37,37,F,F,-1,-1,F,F,,,,,0.0,-1,F,F,F,Key 133463,841e3aef,GitHub,75e7c64d,data/75e7c64d/src/841e3aef.py,329,329,F,F,-1,-1,F,F,,,,,0.0,0,F,F,F,Key +1479362,0f6303ab,GitHub,75e7c64d,data/75e7c64d/src/0f6303ab.py,101,101,F,F,-1,-1,F,F,,,,,0.0,0,F,F,F,Key diff --git a/meta/8cda00f3.csv b/meta/8cda00f3.csv index 4addf083d..c5474f1a9 100644 --- a/meta/8cda00f3.csv +++ b/meta/8cda00f3.csv @@ -913,3 +913,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 135313,eaf18c55,GitHub,8cda00f3,data/8cda00f3/other/eaf18c55.md,209,209,T,F,21,37,F,F,,,,,0,0,F,F,F,Auth 135314,eaf18c55,GitHub,8cda00f3,data/8cda00f3/other/eaf18c55.md,300,300,T,F,21,37,F,F,,,,,0,0,F,F,F,Auth 1340770,4cf2897e,GitHub,8cda00f3,data/8cda00f3/src/4cf2897e.go,16,16,T,F,22,58,F,F,,,,,0.0,0,F,F,F,UUID +1479363,0a7921b3,GitHub,8cda00f3,data/8cda00f3/test/0a7921b3.go,155,155,F,F,,,F,F,,,,,0.0,0,F,F,F,Auth diff --git a/meta_cred.py b/meta_cred.py index 3abf1fe9e..d1175101f 100644 --- a/meta_cred.py +++ b/meta_cred.py @@ -16,7 +16,8 @@ def __init__(self, cs_cred: dict): if not self.path.startswith('data/'): # license files ... self.path = '/'.join([str(x) for x in path.parts[-3:]]) - assert self.path.startswith('data/'), cs_cred # path for benchmark must start from data/ + # path for benchmark must start from "data/" + assert self.path.startswith('data/'), cs_cred self.valid_path = bool(self.valid_path_regex.match(self.path)) # to skip license files self.line_start = line_data_list[0]["line_num"] diff --git a/update_meta.py b/update_meta.py deleted file mode 100644 index e2cf7acd5..000000000 --- a/update_meta.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env python3 - -""" -The script is developed to update meta with absolute positions of value instead from stripped line -""" -import json -import os -import subprocess -import sys -from argparse import ArgumentParser -from functools import cache -from typing import Dict, Tuple, List - -from meta_cred import MetaCred -from meta_row import read_meta - -EXIT_SUCCESS = 0 -EXIT_FAILURE = 1 - - -@cache -def read_cache(path) -> list[str]: - with open(path, "r", encoding="utf8") as f: - return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n') - - -def main(meta_dir: str, data_dir: str, report_file: str) -> int: - errors = 0 - updated_rows = 0 - - if not os.path.exists(meta_dir): - raise FileExistsError(f"{meta_dir} directory does not exist.") - if not os.path.exists(data_dir): - raise FileExistsError(f"{data_dir} directory does not exist.") - creds: Dict[Tuple[str, int, int], List[MetaCred]] = {} - with open(report_file, 'r') as f: - for i in json.load(f): - cred = MetaCred(i) - multi_cred_key = (cred.path, cred.line_start, cred.line_end) - if multi_cred_key in creds: - creds[multi_cred_key].append(cred) - else: - creds[multi_cred_key] = [cred] - - meta = read_meta(meta_dir) - meta.sort(key=lambda x: (x.FilePath, x.LineStart, x.LineEnd, x.ValueStart, x.ValueEnd)) - for row in meta: - if "2ba83c6a" != row.RepoName: - continue # later - categories = set(row.Category.split(':')) - if "Secret" in categories : - meta_key = (row.FilePath, row.LineStart, row.LineEnd) - possible_creds = creds.get(meta_key) - if not possible_creds: - lines = read_cache(i.FilePath) - line = lines[i.LineStart - 1] - if 'secret' in line.lower: - continue - row.Category = "Other" - errors += subprocess.call( - ["sed", "-i", - f"s|^{row.Id},{row.FileID},.*$|" + str(row) + "|", - f"{meta_dir}/{row.RepoName}.csv"]) - updated_rows += 1 - continue - - if 0 > row.ValueStart: - # has markup for whole line - if any("Secret" == x.rule for x in possible_creds): - # ok - continue - categories.remove("Secret") - if 1 == len(categories): - # should be changed - categories = set(x.rule for x in possible_creds) - - cred = possible_creds[0] - if "Key" == cred.rule: - if ((16 <= len(cred.value) or 'hexkey' in cred.variable) - and not any(x in cred.line.lower() for x in - ['0011223344', '0001020304', '0b0b0b0b0b0b0b0', 'alice_', 'bob_', 'alice-', - 'bob-', 'fffefdfcfbfaf9f', '7f7e7d7c7b7a797877', '010203040506070809', - 'fefefefefefe','808182838485868788','000000000','111111111','eeeeeeeeee','fffffffffff','0123456789' - ]) - and 'OBJ_' not in cred.line - - ): - # may look like norm key - row.ValueStart = cred.value_start - row.ValueEnd = cred.value_end - row.GroundTruth = 'T' - else: - categories = set(x.rule for x in possible_creds) - for cred in possible_creds: - if "Key" == cred.rule: - if ((16 <= len(cred.value) or 'hexkey' in cred.variable) - and not any(x in cred.line.lower() for x in - ['0011223344', '0001020304', '0b0b0b0b0b0b0b0', 'alice_', 'bob_', 'alice-', - 'bob-', 'fffefdfcfbfaf9f', '7f7e7d7c7b7a797877', '010203040506070809', - 'fefefefefefe', '808182838485868788', '000000000', '111111111', - 'eeeeeeeeee', 'fffffffffff', '0123456789' - ]) - and 'OBJ_' not in cred.line - - ): - # may look like norm key - row.ValueStart = cred.value_start - row.ValueEnd = cred.value_end - row.GroundTruth = 'T' - break - - else: - if any("Secret" == x.rule for x in possible_creds if x.value_start == row.ValueStart): - # ok - continue - else: - # wrong position in markup - must be skipped - if 1 == len(categories): - # should be changed - categories = set(x.rule for x in possible_creds if x.value_start == row.ValueStart and ( - x.value_end == row.ValueEnd or 0 > row.ValueEnd)) - if not categories: - # wrong end position - categories = set(x.rule for x in possible_creds if x.value_start == row.ValueStart) - row.ValueEnd = -1 - assert row.GroundTruth == 'F' or row.GroundTruth == 'Template', row - row.GroundTruth = 'F' - else: - categories.remove("Secret") - - if not categories: - lines = read_cache(row.FilePath) - line = lines[row.LineStart - 1] - if 'secret' in line.lower(): - continue - categories.add("Other") - row.Category = ':'.join(categories) - errors += subprocess.call( - ["sed", "-i", - f"s|^{row.Id},{row.FileID},.*$|" + str(row) + "|", - f"{meta_dir}/{row.RepoName}.csv"]) - updated_rows += 1 - - result = EXIT_SUCCESS if 0 == errors else EXIT_FAILURE - print(f"Updated {updated_rows} of {len(meta)}, errors: {errors}, {result}", flush=True) - return result - - -if __name__ == "__main__": - parser = ArgumentParser(prog=f"python {os.path.basename(__file__)}", - description="Temporally console script for update meta with Secret category to Other") - - parser.add_argument("report_file", help="Credentials report from CredSweeper") - parser.add_argument("meta_dir", help="Markup location", nargs='?', default="meta") - parser.add_argument("data_dir", help="Dataset location", nargs='?', default="data") - _args = parser.parse_args() - - exit_code = main(_args.meta_dir, _args.data_dir, _args.report_file) - sys.exit(exit_code)