From 1fa1a3013efaee3adc81531702e12197b12a57fe Mon Sep 17 00:00:00 2001
From: Roman Babenko
Date: Sat, 20 Jan 2024 10:38:18 +0200
Subject: [PATCH] [skip actions] [experiment] 2024-01-20T10:38:18+02:00

---
 .github/workflows/benchmark.yml        | 87 +++++++++++++++++++++++++-
 credsweeper/ml_model/model_config.json |  2 +-
 credsweeper/rules/config.yaml          | 26 ++++----
 experiment/__init__.py                 |  0
 experiment/augmentation/main.py        | 81 ++++++++++++------------
 experiment/main.py                     |  2 +-
 experiment/requirements.txt            | 10 +++
 experiment/src/data_loader.py          | 21 ++++---
 experiment/src/features.py             | 18 +++---
 experiment/src/prepare_data.py         |  8 +--
 tests/data/ml_threshold.json           |  4 +-
 11 files changed, 179 insertions(+), 80 deletions(-)
 create mode 100644 experiment/__init__.py
 create mode 100644 experiment/requirements.txt

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index fb7f89e0d..d4d4b4336 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -23,7 +23,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           repository: babenek/CredData
-          ref: xml
+          ref: experiment

       - name: Cache data
         id: cache-data
@@ -64,7 +64,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           repository: babenek/CredData
-          ref: xml
+          ref: experiment

       - name: Cache data
         id: cache-data
@@ -169,7 +169,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           repository: babenek/CredData
-          ref: xml
+          ref: experiment

       - name: Cache data
         id: cache-data
@@ -333,4 +333,85 @@ jobs:
             exit ${exit_code}



+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+  experiment:
+    # the ml train/test job is placed here to reuse the cached data set
+    needs: [ download_data ]
+
+    runs-on: ubuntu-latest
+
+    steps:
+
+      - name: Checkout CredData
+        uses: actions/checkout@v3
+        with:
+          repository: babenek/CredData
+          ref: experiment
+
+      - name: Cache data
+        id: cache-data
+        uses: actions/cache@v3
+        with:
+          path: data
+          key: cred-data-${{ hashFiles('snapshot.yaml') }}
+
+      - name: Failure in case when cache missed
+        if: steps.cache-data.outputs.cache-hit != 'true'
+        run: exit 1
+
+      - name: Exclude some sets and place the rest into the CredData dir
+        # keep only b* & c* so experiment/src/split.json is easy to correct
+        if: steps.cache-data.outputs.cache-hit == 'true'
+        run: |
+          rm -rf data/0* data/1* data/2* data/3* data/4* data/5* data/6* data/7* data/8* data/9* data/a* data/d* data/e* data/f*
+          rm -rf meta/0* meta/1* meta/2* meta/3* meta/4* meta/5* meta/6* meta/7* meta/8* meta/9* meta/a* meta/d* meta/e* meta/f*
+          mkdir -vp ${{ github.workspace }}/CredData
+          mv data ${{ github.workspace }}/CredData/
+          mv meta ${{ github.workspace }}/CredData/
+
+      - name: Set up Python 3.8
+        if: steps.cache-data.outputs.cache-hit == 'true'
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.8"
+
+      - name: Update PIP
+        run: python -m pip install --upgrade pip
+
+      - name: Checkout current CredSweeper
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          path: CredSweeper.head
+
+      - name: Install development packages
+        run: python -m pip install --requirement CredSweeper.head/requirements.txt
+
+      - name: Install experimental packages
+        # some versions will be changed for compatibility
+        run: python -m pip install --requirement CredSweeper.head/experiment/requirements.txt
+
+      - name: dbg
+        run: echo ${{ github.workspace }} && ls -al ${{ github.workspace }} && tree ${{ github.workspace }}
+
+      - name: Lighten split.json
+        run: |
+          mv -vf ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
+          cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
+          grep -v '"[0-9ad-f][0-9a-f]\+' ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak >${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
+          cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
+
+      - name: Do the experiment
+        run: |
+          cd CredSweeper.head
+          ls -al #dbg
+          pwd #dbg
+          export PYTHONPATH=$(pwd):${PYTHONPATH}
+          cd experiment
+          python -m credsweeper --banner #dbg
+          python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
+          ls -al results
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
diff --git a/credsweeper/ml_model/model_config.json b/credsweeper/ml_model/model_config.json
index 1715b45d6..2153d7ed9 100644
--- a/credsweeper/ml_model/model_config.json
+++ b/credsweeper/ml_model/model_config.json
@@ -48,6 +48,6 @@
         ".asciidoc", ".yaml", ".sh", ".c", ".cs", ".php",
         ".txt", ".yml", ".java", ".ts", ".md", ".js", ".json", ".rb", ".py", ".go"
     ]}},
-    {"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "AWS Client ID", "API", "Credential", "Password", "Key", "Auth"]}}
+    {"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "Github Old Token", "API", "Credential", "Password", "Key", "Auth", "JSON Web Token", "URL Credentials", "Nonce", "Salt", "Certificate"]}}
   ]
 }
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
index 24f27f88d..3f3d6a219 100644
--- a/credsweeper/rules/config.yaml
+++ b/credsweeper/rules/config.yaml
@@ -100,7 +100,7 @@
   doc_only: true

 - name: API
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - api
@@ -169,7 +169,7 @@
   min_line_len: 30

 - name: Credential
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - credential
@@ -201,7 +201,7 @@
   min_line_len: 31

 - name: Github Old Token
-  severity: high
+  severity: critical
   type: pattern
   values:
     - (?i)((git)[\w\-]*(token|key|api)[\w\-]*(\s)*(=|:|:=)(\s)*(["']?)(?P<value>[a-z|\d]{40})(["']?))
@@ -279,7 +279,7 @@
   min_line_len: 105

 - name: JSON Web Token
-  severity: medium
+  severity: critical
   type: pattern
   values:
     - (^|[^.0-9A-Za-z_+-])(?P<value>eyJ[0-9A-Za-z_=-]{15,8000}([.0-9A-Za-z_=-]{1,8000})?)
@@ -313,7 +313,7 @@
   min_line_len: 36

 - name: Password
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - (?:)(?P<value>[^@\s]+)@
@@ -510,7 +510,7 @@
   doc_available: false

 - name: Auth
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - auth(?!(or|ors)(?!i[tz]))
@@ -522,7 +522,7 @@
   doc_available: false

 - name: Key
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - key(?!word)
@@ -604,7 +604,7 @@
   min_line_len: 14

 - name: Nonce
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - nonce
@@ -616,7 +616,7 @@
   doc_available: false

 - name: Salt
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - salt
@@ -628,7 +628,7 @@
   doc_available: false

 - name: Certificate
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - cert
diff --git a/experiment/__init__.py b/experiment/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/experiment/augmentation/main.py b/experiment/augmentation/main.py
index e1caf39ac..0b0c26819 100644
--- a/experiment/augmentation/main.py
+++ b/experiment/augmentation/main.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from shutil import rmtree
 from multiprocessing import Pool
@@ -9,6 +10,11 @@
 from obfuscation import get_obfuscated_value, generate_value, SecretCreds


+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
+    level="DEBUG")
+logger = logging.getLogger(__name__)
+
 BASE_PATH = ["test", "src", "other"]
 COLUMN_TYPES = {
     "Id": str,
@@ -67,9 +73,9 @@ def obfuscate_row(row, meta, secret_creds):
         obfuscated_value = get_obfuscated_value(value, pattern)
     else:
         if meta.WithWords == "1" and meta.Category not in [
-            "Authentication Key & Token",  #
-            "Generic Secret",  #
-            "Generic Token"  #
+                "Authentication Key & Token",  #
+                "Generic Secret",  #
+                "Generic Token"  #
         ]:
             obfuscated_value = secret_creds.get_word_secret()
         elif meta.Category == "Password":
@@ -184,37 +190,30 @@ def get_true_row(df, idx, aug_file):
         "RawLine": ""  #
     })
     idx += line_diff
-    t_df = t_df.append(add_series)
+    t_df = pd.concat([t_df, add_series])
     return t_df, idx


-def get_false_row(row_numb, aug_filename, files_length, fl_true_lines):
-    fl_path = list(files_length.keys())
+def get_false_row(df, idx, aug_file):
+    temp_df = df[df["GroundTruth"] == "F"]
+    fl_path = list(temp_df["FilePath"])
+    if len(fl_path) == 0:
+        return None, idx
+    lines = list(temp_df["LineStart:LineEnd"])
     rand = random.randint(0, len(fl_path) - 1)

-    fl_name = fl_path[rand]
-    fl_length = files_length[fl_name]
-    # Filter true lines
-    true_lines = fl_true_lines[fl_name]
-    if fl_length == len(true_lines):
-        return None
-
-    t_df = None
-    while t_df is None:
-        rand_row = random.randint(1, fl_length)
-        if rand_row in true_lines:
-            continue
-        orig_linenumb = str(rand_row) + ":" + str(rand_row)
-        new_linenumb = str(row_numb) + ":" + str(row_numb)
-        t_df = pd.Series({
-            "FilePath": fl_name,
-            "LineStart:LineEnd": orig_linenumb,
-            "GroundTruth": "F",
-            "New_LineNumb": new_linenumb,
-            "New_FilePath": aug_filename,
-            "RawLine": ""
-        })
-    return t_df
+    line_numb = lines[rand].split(":")
+    t_df = temp_df.iloc[rand].copy()
+    line_diff = int(line_numb[1]) - int(line_numb[0])
+    new_linenumb = str(idx) + ":" + str(idx + line_diff)
+    add_series = pd.Series({
+        "New_LineNumb": new_linenumb,  #
+        "New_FilePath": aug_file,  #
+        "RawLine": ""  #
+    })
+    idx += line_diff
+    t_df = pd.concat([t_df, add_series])
+    return t_df, idx


 def get_true_lines(df):
@@ -249,7 +248,7 @@ def generate_rows(repo_local_path, aug_filename, df, true_stake, scale):
             ground_trues, idx = get_true_row(df, row_numb, aug_filename)
             row_numb = idx
         else:
-            ground_trues = get_false_row(row_numb, aug_filename, files_length, fl_true_lines)
+            ground_trues, idx = get_false_row(df, row_numb, aug_filename)
         if ground_trues is None:
             row_numb -= 1
             continue
@@ -291,12 +290,12 @@ def aug_dir(arg):
     write_meta(new_meta, aug_meta)


-def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, scale: float):
+def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, scale: float):
     """ Build the corpus for this repo.

     Parameters
     ----------
-    repo_path: str
+    repo_local_path: Path
         Path to the CredPosDataset repository
     meta_path: str
         Path to the metadata
@@ -304,6 +303,8 @@ def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, sca
         List of repos directory names
     true_stake:
         Part of the rows with "True" cases in the aggregated data
+    scale:
+        Scale factor for the amount of generated rows

     Returns
     -------
@@ -332,8 +333,8 @@ def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, sca

 if __name__ == "__main__":
     CredDataDirectory = sys.argv[1]
-    true_stake = sys.argv[2]
-    scale = sys.argv[3]
+    _true_stake = sys.argv[2]
+    _scale = sys.argv[3]

     try:
         CredDataDirectory = os.path.abspath(CredDataDirectory)
@@ -343,20 +344,20 @@ def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, sca
         raise ValueError("Please set a valid CredData. It should be a valid path")

     try:
-        true_stake = float(true_stake)
+        _true_stake = float(_true_stake)
     except:
         raise ValueError("Please set a valid true_stake. It cannot contain commas, spaces, or characters.")
-    if true_stake < 0 or true_stake > 0.5:
+    if _true_stake < 0 or _true_stake > 0.5:
         raise ValueError("Please set a valid true_stake. It should be between 0 and 0.5")

     try:
-        scale = float(scale)
+        _scale = float(_scale)
     except:
         raise ValueError("Please set a valid scale. It cannot contain commas, spaces, or characters.")

     repo_path = Path(CredDataDirectory)
     data_path = repo_path / "data"
-    meta_path = repo_path / "meta"
-    repos_paths = os.listdir(data_path)
+    _meta_path = repo_path / "meta"
+    _repos_paths = os.listdir(data_path)

-    build_corpus(repo_path, meta_path, repos_paths, true_stake, scale)
+    build_corpus(repo_path, _meta_path, _repos_paths, _true_stake, _scale)
diff --git a/experiment/main.py b/experiment/main.py
index 52caf76f2..52c6cfd6b 100644
--- a/experiment/main.py
+++ b/experiment/main.py
@@ -78,7 +78,7 @@ def main(cred_data_location: str) -> str:

     os.makedirs("results/", exist_ok=True)
     current_time = int(time())
-    model_file_name = f"results/ml_model_at-{current_time}.h5"
+    model_file_name = f"results/ml_model_at-{current_time}"
     keras_model.save(model_file_name, include_optimizer=False)

     print('-' * 40)
diff --git a/experiment/requirements.txt b/experiment/requirements.txt
new file mode 100644
index 000000000..d7910374b
--- /dev/null
+++ b/experiment/requirements.txt
@@ -0,0 +1,10 @@
+h5py==3.10.0
+keras==2.13.1
+numpy==1.23.5
+onnx==1.15.0
+protobuf==3.20.3
+tensorflow==2.13.1
+tf2onnx==1.16.0
+wrapt==1.14.1
+
+tqdm==4.66.1
diff --git a/experiment/src/data_loader.py b/experiment/src/data_loader.py
index 134fb3418..960582286 100644
--- a/experiment/src/data_loader.py
+++ b/experiment/src/data_loader.py
@@ -1,9 +1,10 @@
 import json
 import os
-from typing import Tuple, Dict
 from copy import deepcopy
-import pandas as pd
+from typing import Tuple, Dict
+
 import numpy as np
+import pandas as pd


 identifier = Tuple[str, int]
@@ -21,6 +22,8 @@ def read_detected_data(file_path: str, split="CredData/") -> Dict[identifier, Di

     detected_lines = {}
     for detection in detections:
+        if 1 != len(detection["line_data_list"]):
+            continue
         for line_data in detection["line_data_list"]:
             relative_path = strip_data_path(line_data["path"], split)
             index = relative_path, line_data["line_num"]
@@ -114,9 +117,9 @@ def eval_no_model(df: pd.DataFrame, df_missing: pd.DataFrame):

     f1: float = (2 * precision * recall) / (precision + recall)
     report = f"TP : {true_positive}, FP : {false_positive}, TN : {true_negative}, " \
-        f"FN : {false_negative}, FPR : {false_positive_rate:.10f}, " \
-        f"FNR : {false_negative_rate:.10f}, PRC : {precision:.10f}, " \
-        f"RCL : {recall:.10f}, F1 : {f1:.10f}"
+             f"FN : {false_negative}, FPR : {false_positive_rate:.10f}, " \
+             f"FNR : {false_negative_rate:.10f}, PRC : {precision:.10f}, " \
+             f"RCL : {recall:.10f}, F1 : {f1:.10f}"

     print(report)

@@ -146,12 +149,12 @@

     f1: float = (2 * precision * recall) / (precision + recall)
     report = f"TP : {true_positive}, FP : {false_positive}, TN : {true_negative}, " \
-        f"FN : {false_negative}, FPR : {false_positive_rate:.10f}, " \
-        f"FNR : {false_negative_rate:.10f}, PRC : {precision:.10f}, " \
-        f"RCL : {recall:.10f}, F1 : {f1:.10f}"
+             f"FN : {false_negative}, FPR : {false_positive_rate:.10f}, " \
+             f"FNR : {false_negative_rate:.10f}, PRC : {precision:.10f}, " \
+             f"RCL : {recall:.10f}, F1 : {f1:.10f}"

     print(report)

 def get_y_labels(df: pd.DataFrame) -> np.ndarray:
-    true_cases = np.array(df["GroundTruth"])
+    true_cases = np.array(df["GroundTruth"], dtype=np.int8)
     return true_cases
diff --git a/experiment/src/features.py b/experiment/src/features.py
index fb6da58e6..38ad816d4 100644
--- a/experiment/src/features.py
+++ b/experiment/src/features.py
@@ -1,13 +1,15 @@
 from typing import Tuple, Union

+import numpy as np
+import pandas as pd
+
 from credsweeper.common.constants import Severity
 from credsweeper.credentials import Candidate
 from credsweeper.credentials import LineData
 from credsweeper.ml_model import MlValidator
-import numpy as np
-import pandas as pd
+from credsweeper.utils import Util

-MlValidator()  # Initialize global MLValidator object
+ml_validator = MlValidator(0.5)  # Initialize global MLValidator object


 class CustomLineData(LineData):
@@ -18,6 +20,7 @@ def __init__(self, line: str, value: str, line_num: int, path: str) -> None:
         self.line_num: int = line_num
         self.path: str = path
         self.value = value
+        self.file_type = Util.get_extension(path)


 def get_candidates(line_data: dict):
@@ -25,7 +28,7 @@ def get_candidates(line_data: dict):
     ld = CustomLineData(line_data["line"], line_data["value"], line_data["line_num"], line_data["path"])
     candidates = []
     for rule in line_data["RuleNames"]:
-        candidates.append(Candidate([ld], [], rule, Severity.MEDIUM, [], True))
+        candidates.append(Candidate([ld], [], rule, Severity.MEDIUM, None, None, True))
     return candidates


@@ -35,10 +38,10 @@ def get_features(line_data: Union[dict, pd.Series]):

     value = line_data["value"]
     candidates = get_candidates(line_data)
-    line_input = MlValidator.encode(value, MlValidator.char_to_index)
+    line_input = ml_validator.encode(value, ml_validator.char_to_index)

-    common_features = MlValidator.extract_common_features(candidates)
-    unique_features = MlValidator.extract_unique_features(candidates)
+    common_features = ml_validator.extract_common_features(candidates)
+    unique_features = ml_validator.extract_unique_features(candidates)

     extracted_features = np.hstack([common_features, unique_features])

@@ -51,6 +54,7 @@ def prepare_data(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
     X_features = []

     for i, row in df.iterrows():
+        assert row["line"] is not None, row
         line_input, extracted_features = get_features(row)
         X_values.append(line_input)
         X_features.append(extracted_features)
diff --git a/experiment/src/prepare_data.py b/experiment/src/prepare_data.py
index d748b60f1..409c52079 100644
--- a/experiment/src/prepare_data.py
+++ b/experiment/src/prepare_data.py
@@ -1,19 +1,19 @@
 import os
-import sys
 import subprocess
+import sys


 def execute_scanner(dataset_location: str, result_location_str, j, use_ml=False):
     """Execute CredSweeper as a separate process to make sure no global states is shared with training script"""
     dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.."
-    command = f"{sys.executable} -m credsweeper --path {dataset_location}/data --save-json {result_location_str} -j {j}"
+    command = f"{sys.executable} -m credsweeper --path {dataset_location}/data --save-json {result_location_str} -j {j} --severity critical"
     if not use_ml:
-        command += " --ml_threshold 0"
+        command += " --no-filters --ml_threshold 0"
     subprocess.call(command, shell=True, cwd=dir_path, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)


 def get_aug_data(dataset_location: str):
-    """Execute CredSweeper as a separate process to make sure no global states is shared with training script"""
+    """TODO: use normal import for the workflow"""
     dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.."
     command = f"{sys.executable} main.py {dataset_location} 0.1 5"
     subprocess.call(command, shell=True, cwd=dir_path + "/augmentation")
diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json
index f661d9985..7d5c56c1d 100644
--- a/tests/data/ml_threshold.json
+++ b/tests/data/ml_threshold.json
@@ -97,8 +97,8 @@
     },
     {
         "api_validation": "NOT_AVAILABLE",
-        "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.99679,
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
         "rule": "Auth",
         "severity": "medium",
         "line_data_list": [