Skip to content

Commit

Permalink
[skip actions] [experiment] 2024-01-09T13:42:58+02:00
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jan 9, 2024
1 parent 0513749 commit 5306e87
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 29 deletions.
2 changes: 1 addition & 1 deletion credsweeper/ml_model/model_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,6 @@
".asciidoc", ".yaml", ".sh", ".c", ".cs", ".php", ".txt", ".yml", ".java", ".ts", ".md", ".js", ".json",
".rb", ".py", ".go"
]}},
{"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "AWS Client ID", "API", "Credential", "Password", "Key", "Auth"]}}
{"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "Github Old Token", "API", "Credential", "Password", "Key", "Auth", "JSON Web Token", "URL Credentials", "Nonce", "Salt", "Certificate"]}}
]
}
26 changes: 13 additions & 13 deletions credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
doc_only: true

- name: API
severity: medium
severity: critical
type: keyword
values:
- api
Expand Down Expand Up @@ -169,7 +169,7 @@
min_line_len: 30

- name: Credential
severity: medium
severity: critical
type: keyword
values:
- credential
Expand Down Expand Up @@ -201,7 +201,7 @@
min_line_len: 31

- name: Github Old Token
severity: high
severity: critical
type: pattern
values:
- (?i)((git)[\w\-]*(token|key|api)[\w\-]*(\s)*(=|:|:=)(\s)*(["']?)(?P<value>[a-z|\d]{40})(["']?))
Expand Down Expand Up @@ -279,7 +279,7 @@
min_line_len: 105

- name: JSON Web Token
severity: medium
severity: critical
type: pattern
values:
- (^|[^.0-9A-Za-z_/+-])(?P<value>eyJ[0-9A-Za-z_=-]{15,8000}([.0-9A-Za-z_=-]{1,8000})?)
Expand Down Expand Up @@ -313,7 +313,7 @@
min_line_len: 36

- name: Password
severity: medium
severity: critical
type: keyword
values:
- (?<!by)pass(?!ed|ing|es|\s+[a-z]{3,80})|pw(d|\b)
Expand Down Expand Up @@ -366,7 +366,7 @@
min_line_len: 40

- name: Secret
severity: medium
severity: critical
type: keyword
values:
- secret
Expand Down Expand Up @@ -476,7 +476,7 @@
min_line_len: 50

- name: Token
severity: medium
severity: critical
type: keyword
values:
- token
Expand All @@ -498,7 +498,7 @@
min_line_len: 34

- name: URL Credentials
severity: high
severity: critical
type: pattern
values:
- ://[^:\s]+(?P<separator>:)(?P<value>[^@\s]+)@
Expand All @@ -510,7 +510,7 @@
doc_available: false

- name: Auth
severity: medium
severity: critical
type: keyword
values:
- auth(?!or)
Expand All @@ -522,7 +522,7 @@
doc_available: false

- name: Key
severity: medium
severity: critical
type: keyword
values:
- key(?!word)
Expand Down Expand Up @@ -604,7 +604,7 @@
min_line_len: 14

- name: Nonce
severity: medium
severity: critical
type: keyword
values:
- nonce
Expand All @@ -616,7 +616,7 @@
doc_available: false

- name: Salt
severity: medium
severity: critical
type: keyword
values:
- salt
Expand All @@ -628,7 +628,7 @@
doc_available: false

- name: Certificate
severity: medium
severity: critical
type: keyword
values:
- cert
Expand Down
Empty file added experiment/__init__.py
Empty file.
34 changes: 21 additions & 13 deletions experiment/augmentation/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import os
from shutil import rmtree
from multiprocessing import Pool
Expand All @@ -9,6 +10,11 @@

from obfuscation import get_obfuscated_value, generate_value, SecretCreds

logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
level="DEBUG")
logger = logging.getLogger(__name__)

BASE_PATH = ["test", "src", "other"]
COLUMN_TYPES = {
"Id": str,
Expand Down Expand Up @@ -67,9 +73,9 @@ def obfuscate_row(row, meta, secret_creds):
obfuscated_value = get_obfuscated_value(value, pattern)
else:
if meta.WithWords == "1" and meta.Category not in [
"Authentication Key & Token", #
"Generic Secret", #
"Generic Token" #
"Authentication Key & Token", #
"Generic Secret", #
"Generic Token" #
]:
obfuscated_value = secret_creds.get_word_secret()
elif meta.Category == "Password":
Expand Down Expand Up @@ -291,19 +297,21 @@ def aug_dir(arg):
write_meta(new_meta, aug_meta)


def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, scale: float):
def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, scale: float):
""" Build the corpus for this repo.
Parameters
----------
repo_path: str
repo_local_path: Path
Path to the CredPosDataset repository
meta_path: Path
Path to the metadata
repos_paths: List[str]
List of repos directory names
true_stake:
Part of the rows with "True" cases in the aggregated data
scale:
scale
Returns
-------
Expand Down Expand Up @@ -332,8 +340,8 @@ def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, sca

if __name__ == "__main__":
CredDataDirectory = sys.argv[1]
true_stake = sys.argv[2]
scale = sys.argv[3]
_true_stake = sys.argv[2]
_scale = sys.argv[3]

try:
CredDataDirectory = os.path.abspath(CredDataDirectory)
Expand All @@ -343,20 +351,20 @@ def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, sca
raise ValueError("Please set a valid CredData. It should be a valid path")

try:
true_stake = float(true_stake)
_true_stake = float(_true_stake)
except:
raise ValueError("Please set a valid true_stake. It cannot contain commas, spaces, or characters.")
if true_stake < 0 or true_stake > 0.5:
if _true_stake < 0 or _true_stake > 0.5:
raise ValueError("Please set a valid true_stake. It should be between 0 and 0.5")

try:
scale = float(scale)
_scale = float(_scale)
except:
raise ValueError("Please set a valid scale. It cannot contain commas, spaces, or characters.")

repo_path = Path(CredDataDirectory)
data_path = repo_path / "data"
meta_path = repo_path / "meta"
repos_paths = os.listdir(data_path)
_meta_path = repo_path / "meta"
_repos_paths = os.listdir(data_path)

build_corpus(repo_path, meta_path, repos_paths, true_stake, scale)
build_corpus(repo_path, _meta_path, _repos_paths, _true_stake, _scale)
4 changes: 2 additions & 2 deletions experiment/src/prepare_data.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import os
import sys
import subprocess
import sys


def execute_scanner(dataset_location: str, result_location_str, j, use_ml=False):
"""Execute CredSweeper as a separate process to make sure no global state is shared with the training script"""
dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.."
command = f"{sys.executable} -m credsweeper --path {dataset_location}/data --save-json {result_location_str} -j {j}"
command = f"{sys.executable} -m credsweeper --path {dataset_location}/data --save-json {result_location_str} -j {j} --severity critical"
if not use_ml:
command += " --ml_threshold 0"
subprocess.call(command, shell=True, cwd=dir_path, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
Expand Down

0 comments on commit 5306e87

Please sign in to comment.