From 1fa1a3013efaee3adc81531702e12197b12a57fe Mon Sep 17 00:00:00 2001
From: Roman Babenko
Date: Sat, 20 Jan 2024 10:38:18 +0200
Subject: [PATCH] [skip actions] [experiment] 2024-01-20T10:38:18+02:00

---
 .github/workflows/benchmark.yml        | 87 +++++++++++++++++++++++++-
 credsweeper/ml_model/model_config.json |  2 +-
 credsweeper/rules/config.yaml          | 26 ++++----
 experiment/__init__.py                 |  0
 experiment/augmentation/main.py        | 81 ++++++++++++------------
 experiment/main.py                     |  2 +-
 experiment/requirements.txt            | 10 +++
 experiment/src/data_loader.py          | 21 ++++---
 experiment/src/features.py             | 18 +++---
 experiment/src/prepare_data.py         |  8 +--
 tests/data/ml_threshold.json           |  4 +-
 11 files changed, 179 insertions(+), 80 deletions(-)
 create mode 100644 experiment/__init__.py
 create mode 100644 experiment/requirements.txt

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index fb7f89e0d..d4d4b4336 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -23,7 +23,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           repository: babenek/CredData
-          ref: xml
+          ref: experiment

       - name: Cache data
         id: cache-data
@@ -64,7 +64,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           repository: babenek/CredData
-          ref: xml
+          ref: experiment

       - name: Cache data
         id: cache-data
@@ -169,7 +169,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           repository: babenek/CredData
-          ref: xml
+          ref: experiment

       - name: Cache data
         id: cache-data
@@ -333,4 +333,85 @@ jobs:
             exit ${exit_code}



+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+  experiment:
+    # the ml train/test job is placed here to reuse the cached data set
+    needs: [ download_data ]
+
+    runs-on: ubuntu-latest
+
+    steps:
+
+      - name: Checkout CredData
+        uses: actions/checkout@v3
+        with:
+          repository: babenek/CredData
+          ref: experiment
+
+      - name: Cache data
+        id: cache-data
+        uses: actions/cache@v3
+        with:
+          path: data
+          key: cred-data-${{ hashFiles('snapshot.yaml') }}
+
+      - name: Failure in case when cache missed
+        if: steps.cache-data.outputs.cache-hit != 'true'
+        run: exit 1
+
+      - name: Exclude some sets and place the rest into the CredData dir
+        # keep only b* & c* so experiment/src/split.json is easy to correct
+        if: steps.cache-data.outputs.cache-hit == 'true'
+        run: |
+          rm -rf data/0* data/1* data/2* data/3* data/4* data/5* data/6* data/7* data/8* data/9* data/a* data/d* data/e* data/f*
+          rm -rf meta/0* meta/1* meta/2* meta/3* meta/4* meta/5* meta/6* meta/7* meta/8* meta/9* meta/a* meta/d* meta/e* meta/f*
+          mkdir -vp ${{ github.workspace }}/CredData
+          mv data ${{ github.workspace }}/CredData/
+          mv meta ${{ github.workspace }}/CredData/
+
+      - name: Set up Python 3.8
+        if: steps.cache-data.outputs.cache-hit == 'true'
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.8"
+
+      - name: Update PIP
+        run: python -m pip install --upgrade pip
+
+      - name: Checkout current CredSweeper
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          path: CredSweeper.head
+
+      - name: Install development packages
+        run: python -m pip install --requirement CredSweeper.head/requirements.txt
+
+      - name: Install experimental packages
+        # some versions will be changed for compatibility
+        run: python -m pip install --requirement CredSweeper.head/experiment/requirements.txt
+
+      - name: dbg
+        run: echo ${{ github.workspace }} && ls -al ${{ github.workspace }} && tree ${{ github.workspace }}
+
+      - name: Lighten split.json
+        run: |
+          mv -vf ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
+          cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
+          grep -v '"[0-9ad-f][0-9a-f]\+' ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak >${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
+          cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
+
+      - name: Do the experiment
+        run: |
+          cd CredSweeper.head
+          ls -al #dbg
+          pwd #dbg
+          export PYTHONPATH=$(pwd):${PYTHONPATH}
+          cd experiment
+          python -m credsweeper --banner #dbg
+          python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
+          ls -al results
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
diff --git a/credsweeper/ml_model/model_config.json b/credsweeper/ml_model/model_config.json
index 1715b45d6..2153d7ed9 100644
--- a/credsweeper/ml_model/model_config.json
+++ b/credsweeper/ml_model/model_config.json
@@ -48,6 +48,6 @@
         ".asciidoc", ".yaml", ".sh", ".c", ".cs", ".php",
         ".txt", ".yml", ".java", ".ts", ".md", ".js", ".json", ".rb", ".py", ".go"
     ]}},
-    {"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "AWS Client ID", "API", "Credential", "Password", "Key", "Auth"]}}
+    {"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "Github Old Token", "API", "Credential", "Password", "Key", "Auth", "JSON Web Token", "URL Credentials", "Nonce", "Salt", "Certificate"]}}
   ]
 }
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
index 24f27f88d..3f3d6a219 100644
--- a/credsweeper/rules/config.yaml
+++ b/credsweeper/rules/config.yaml
@@ -100,7 +100,7 @@
   doc_only: true

 - name: API
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - api
@@ -169,7 +169,7 @@
   min_line_len: 30

 - name: Credential
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - credential
@@ -201,7 +201,7 @@
   min_line_len: 31

 - name: Github Old Token
-  severity: high
+  severity: critical
   type: pattern
   values:
     - (?i)((git)[\w\-]*(token|key|api)[\w\-]*(\s)*(=|:|:=)(\s)*(["']?)(?P<value>[a-z|\d]{40})(["']?))
@@ -279,7 +279,7 @@
   min_line_len: 105

 - name: JSON Web Token
-  severity: medium
+  severity: critical
   type: pattern
   values:
     - (^|[^.0-9A-Za-z_+-])(?P<value>eyJ[0-9A-Za-z_=-]{15,8000}([.0-9A-Za-z_=-]{1,8000})?)
@@ -313,7 +313,7 @@
   min_line_len: 36

 - name: Password
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - (?:)(?P<value>[^@\s]+)@
@@ -510,7 +510,7 @@
   doc_available: false

 - name: Auth
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - auth(?!(or|ors)(?!i[tz]))
@@ -522,7 +522,7 @@
   doc_available: false

 - name: Key
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - key(?!word)
@@ -604,7 +604,7 @@
   min_line_len: 14

 - name: Nonce
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - nonce
@@ -616,7 +616,7 @@
   doc_available: false

 - name: Salt
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - salt
@@ -628,7 +628,7 @@
   doc_available: false

 - name: Certificate
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - cert
diff --git a/experiment/__init__.py b/experiment/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/experiment/augmentation/main.py b/experiment/augmentation/main.py
index e1caf39ac..0b0c26819 100644
--- a/experiment/augmentation/main.py
+++ b/experiment/augmentation/main.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from shutil import rmtree
 from multiprocessing import Pool
@@ -9,6 +10,11 @@
 from obfuscation import get_obfuscated_value, generate_value, SecretCreds


+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
+    level="DEBUG")
+logger = logging.getLogger(__name__)
+
 BASE_PATH = ["test", "src", "other"]
 COLUMN_TYPES = {
     "Id": str,
@@ -67,9 +73,9 @@ def obfuscate_row(row, meta, secret_creds):
         obfuscated_value = get_obfuscated_value(value, pattern)
     else:
         if meta.WithWords == "1" and meta.Category not in [
-            "Authentication Key & Token",  #
-            "Generic Secret",  #
-            "Generic Token"  #
+                "Authentication Key & Token",  #
+                "Generic Secret",  #
+                "Generic Token"  #
         ]:
             obfuscated_value = secret_creds.get_word_secret()
         elif meta.Category == "Password":
@@ -184,37 +190,30 @@ def get_true_row(df, idx, aug_file):
         "RawLine": ""  #
     })
     idx += line_diff
-    t_df = t_df.append(add_series)
+    t_df = pd.concat([t_df, add_series])
     return t_df, idx


-def get_false_row(row_numb, aug_filename, files_length, fl_true_lines):
-    fl_path = list(files_length.keys())
+def get_false_row(df, idx, aug_file):
+    temp_df = df[df["GroundTruth"] == "F"]
+    fl_path = list(temp_df["FilePath"])
+    if len(fl_path) == 0:
+        return None, idx
+    lines = list(temp_df["LineStart:LineEnd"])
     rand = random.randint(0, len(fl_path) - 1)

-    fl_name = fl_path[rand]
-    fl_length = files_length[fl_name]
-    # Filter true lines
-    true_lines = fl_true_lines[fl_name]
-    if fl_length == len(true_lines):
-        return None
-
-    t_df = None
-    while t_df is None:
-        rand_row = random.randint(1, fl_length)
-        if rand_row in true_lines:
-            continue
-        orig_linenumb = str(rand_row) + ":" + str(rand_row)
-        new_linenumb = str(row_numb) + ":" + str(row_numb)
-        t_df = pd.Series({
-            "FilePath": fl_name,
-            "LineStart:LineEnd": orig_linenumb,
-            "GroundTruth": "F",
-            "New_LineNumb": new_linenumb,
-            "New_FilePath": aug_filename,
-            "RawLine": ""
-        })
-    return t_df
+    line_numb = lines[rand].split(":")
+    t_df = temp_df.iloc[rand].copy()
+    line_diff = int(line_numb[1]) - int(line_numb[0])
+    new_linenumb = str(idx) + ":" + str(idx + line_diff)
+    add_series = pd.Series({
+        "New_LineNumb": new_linenumb,  #
+        "New_FilePath": aug_file,  #
+        "RawLine": ""  #
+    })
+    idx += line_diff
+    t_df = pd.concat([t_df, add_series])
+    return t_df, idx


 def get_true_lines(df):
@@ -249,7 +248,7 @@ def generate_rows(repo_local_path, aug_filename, df, true_stake, scale):
             ground_trues, idx = get_true_row(df, row_numb, aug_filename)
             row_numb = idx
         else:
-            ground_trues = get_false_row(row_numb, aug_filename, files_length, fl_true_lines)
+            ground_trues, idx = get_false_row(df, row_numb, aug_filename)
         if ground_trues is None:
             row_numb -= 1
             continue
@@ -291,12 +290,12 @@ def aug_dir(arg):
     write_meta(new_meta, aug_meta)


-def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, scale: float):
+def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, scale: float):
     """ Build the corpus for this repo.

     Parameters
     ----------
-    repo_path: str
+    repo_local_path: Path
         Path to the CredPosDataset repository
     meta_path: str
         Path to the metadata
@@ -304,6 +303,8 @@ def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, sca
         List of repos directory names
     true_stake:
         Part of the rows with "True" cases in the aggregated data
+    scale:
+        Scale factor for the amount of generated rows

     Returns
     -------
@@ -332,8 +333,8 @@ def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, sca

 if __name__ == "__main__":
     CredDataDirectory = sys.argv[1]
-    true_stake = sys.argv[2]
-    scale = sys.argv[3]
+    _true_stake = sys.argv[2]
+    _scale = sys.argv[3]

     try:
         CredDataDirectory = os.path.abspath(CredDataDirectory)
@@ -343,20 +344,20 @@ def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, sca
         raise ValueError("Please set a valid CredData. It should be a valid path")

     try:
-        true_stake = float(true_stake)
+        _true_stake = float(_true_stake)
     except:
         raise ValueError("Please set a valid true_stake. It cannot contain commas, spaces, or characters.")
-    if true_stake < 0 or true_stake > 0.5:
+    if _true_stake < 0 or _true_stake > 0.5:
         raise ValueError("Please set a valid true_stake. It should be between 0 and 0.5")

     try:
-        scale = float(scale)
+        _scale = float(_scale)
     except:
         raise ValueError("Please set a valid scale. It cannot contain commas, spaces, or characters.")

     repo_path = Path(CredDataDirectory)
     data_path = repo_path / "data"
-    meta_path = repo_path / "meta"
-    repos_paths = os.listdir(data_path)
+    _meta_path = repo_path / "meta"
+    _repos_paths = os.listdir(data_path)

-    build_corpus(repo_path, meta_path, repos_paths, true_stake, scale)
+    build_corpus(repo_path, _meta_path, _repos_paths, _true_stake, _scale)
diff --git a/experiment/main.py b/experiment/main.py
index 52caf76f2..52c6cfd6b 100644
--- a/experiment/main.py
+++ b/experiment/main.py
@@ -78,7 +78,7 @@ def main(cred_data_location: str) -> str:

     os.makedirs("results/", exist_ok=True)
     current_time = int(time())
-    model_file_name = f"results/ml_model_at-{current_time}.h5"
+    model_file_name = f"results/ml_model_at-{current_time}"
     keras_model.save(model_file_name, include_optimizer=False)

     print('-' * 40)
diff --git a/experiment/requirements.txt b/experiment/requirements.txt
new file mode 100644
index 000000000..d7910374b
--- /dev/null
+++ b/experiment/requirements.txt
@@ -0,0 +1,10 @@
+h5py==3.10.0
+keras==2.13.1
+numpy==1.23.5
+onnx==1.15.0
+protobuf==3.20.3
+tensorflow==2.13.1
+tf2onnx==1.16.0
+wrapt==1.14.1
+
+tqdm==4.66.1
diff --git a/experiment/src/data_loader.py b/experiment/src/data_loader.py
index 134fb3418..960582286 100644
--- a/experiment/src/data_loader.py
+++ b/experiment/src/data_loader.py
@@ -1,9 +1,10 @@
 import json
 import os
-from typing import Tuple, Dict
 from copy import deepcopy
-import pandas as pd
+from typing import Tuple, Dict
+
 import numpy as np
+import pandas as pd


 identifier = Tuple[str, int]
@@ -21,6 +22,8 @@ def read_detected_data(file_path: str, split="CredData/") -> Dict[identifier, Di

     detected_lines = {}
     for detection in detections:
+        if 1 != len(detection["line_data_list"]):
+            continue
         for line_data in detection["line_data_list"]:
             relative_path = strip_data_path(line_data["path"], split)
             index = relative_path, line_data["line_num"]
@@ -114,9 +117,9 @@ def eval_no_model(df: pd.DataFrame, df_missing: pd.DataFrame):

     f1: float = (2 * precision * recall) / (precision + recall)
     report = f"TP : {true_positive}, FP : {false_positive}, TN : {true_negative}, " \
-        f"FN : {false_negative}, FPR : {false_positive_rate:.10f}, " \
-        f"FNR : {false_negative_rate:.10f}, PRC : {precision:.10f}, " \
-        f"RCL : {recall:.10f}, F1 : {f1:.10f}"
+             f"FN : {false_negative}, FPR : {false_positive_rate:.10f}, " \
+             f"FNR : {false_negative_rate:.10f}, PRC : {precision:.10f}, " \
+             f"RCL : {recall:.10f}, F1 : {f1:.10f}"

     print(report)

@@ -146,12 +149,12 @@

     f1: float = (2 * precision * recall) / (precision + recall)
     report = f"TP : {true_positive}, FP : {false_positive}, TN : {true_negative}, " \
-        f"FN : {false_negative}, FPR : {false_positive_rate:.10f}, " \
-        f"FNR : {false_negative_rate:.10f}, PRC : {precision:.10f}, " \
-        f"RCL : {recall:.10f}, F1 : {f1:.10f}"
+             f"FN : {false_negative}, FPR : {false_positive_rate:.10f}, " \
+             f"FNR : {false_negative_rate:.10f}, PRC : {precision:.10f}, " \
+             f"RCL : {recall:.10f}, F1 : {f1:.10f}"

     print(report)

 def get_y_labels(df: pd.DataFrame) -> np.ndarray:
-    true_cases = np.array(df["GroundTruth"])
+    true_cases = np.array(df["GroundTruth"], dtype=np.int8)
     return true_cases
diff --git a/experiment/src/features.py b/experiment/src/features.py
index fb6da58e6..38ad816d4 100644
--- a/experiment/src/features.py
+++ b/experiment/src/features.py
@@ -1,13 +1,15 @@
 from typing import Tuple, Union

+import numpy as np
+import pandas as pd
+
 from credsweeper.common.constants import Severity
 from credsweeper.credentials import Candidate
 from credsweeper.credentials import LineData
 from credsweeper.ml_model import MlValidator
-import numpy as np
-import pandas as pd
+from credsweeper.utils import Util

-MlValidator()  # Initialize global MLValidator object
+ml_validator = MlValidator(0.5)  # Initialize global MLValidator object


 class CustomLineData(LineData):
@@ -18,6 +20,7 @@ def __init__(self, line: str, value: str, line_num: int, path: str) -> None:
         self.line_num: int = line_num
         self.path: str = path
         self.value = value
+        self.file_type = Util.get_extension(path)


 def get_candidates(line_data: dict):
@@ -25,7 +28,7 @@ def get_candidates(line_data: dict):
     ld = CustomLineData(line_data["line"], line_data["value"], line_data["line_num"], line_data["path"])
     candidates = []
     for rule in line_data["RuleNames"]:
-        candidates.append(Candidate([ld], [], rule, Severity.MEDIUM, [], True))
+        candidates.append(Candidate([ld], [], rule, Severity.MEDIUM, None, None, True))
     return candidates


@@ -35,10 +38,10 @@ def get_features(line_data: Union[dict, pd.Series]):

     value = line_data["value"]
     candidates = get_candidates(line_data)
-    line_input = MlValidator.encode(value, MlValidator.char_to_index)
+    line_input = ml_validator.encode(value, ml_validator.char_to_index)

-    common_features = MlValidator.extract_common_features(candidates)
-    unique_features = MlValidator.extract_unique_features(candidates)
+    common_features = ml_validator.extract_common_features(candidates)
+    unique_features = ml_validator.extract_unique_features(candidates)

     extracted_features = np.hstack([common_features, unique_features])

@@ -51,6 +54,7 @@ def prepare_data(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
     X_features = []

     for i, row in df.iterrows():
+        assert row["line"] is not None, row
         line_input, extracted_features = get_features(row)
         X_values.append(line_input)
         X_features.append(extracted_features)
diff --git a/experiment/src/prepare_data.py b/experiment/src/prepare_data.py
index d748b60f1..409c52079 100644
--- a/experiment/src/prepare_data.py
+++ b/experiment/src/prepare_data.py
@@ -1,19 +1,19 @@
 import os
-import sys
 import subprocess
+import sys


 def execute_scanner(dataset_location: str, result_location_str, j, use_ml=False):
     """Execute CredSweeper as a separate process to make sure no global states is shared with training script"""
     dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.."
-    command = f"{sys.executable} -m credsweeper --path {dataset_location}/data --save-json {result_location_str} -j {j}"
+    command = f"{sys.executable} -m credsweeper --path {dataset_location}/data --save-json {result_location_str} -j {j} --severity critical"
     if not use_ml:
-        command += " --ml_threshold 0"
+        command += " --no-filters --ml_threshold 0"
     subprocess.call(command, shell=True, cwd=dir_path, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)


 def get_aug_data(dataset_location: str):
-    """Execute CredSweeper as a separate process to make sure no global states is shared with training script"""
+    """TODO: use normal import for the workflow"""
     dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.."
     command = f"{sys.executable} main.py {dataset_location} 0.1 5"
     subprocess.call(command, shell=True, cwd=dir_path + "/augmentation")
diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json
index f661d9985..7d5c56c1d 100644
--- a/tests/data/ml_threshold.json
+++ b/tests/data/ml_threshold.json
@@ -97,8 +97,8 @@
     },
     {
         "api_validation": "NOT_AVAILABLE",
-        "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.99679,
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
         "rule": "Auth",
         "severity": "medium",
         "line_data_list": [