Tmp #487

Closed
wants to merge 2 commits into from

86 changes: 86 additions & 0 deletions .github/workflows/benchmark.yml
@@ -1,4 +1,4 @@
# This workflow runs benchmark

# Separation of jobs helps to cache data even if the benchmark fails

name: Benchmark
@@ -332,3 +332,89 @@

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

experiment:
# the ML train/test job is placed here to reuse the cached data set
needs: [ download_data ]

runs-on: ubuntu-latest

steps:

- name: Checkout CredData
uses: actions/checkout@v3
with:
repository: Samsung/CredData

- name: Cache data
id: cache-data
uses: actions/cache@v3
with:
path: data
key: cred-data-${{ hashFiles('snapshot.yaml') }}

- name: Fail when the cache is missed
if: steps.cache-data.outputs.cache-hit != 'true'
run: exit 1

- name: Exclude some sets and place to CredData dir
# keep b* & c* only to easy experiment/src/split.json
run: |
rm -rf data/0* data/1* data/2* data/3* data/4* data/5* data/6* data/7* data/8* data/9* data/a* data/d* data/e* data/f*
rm -rf meta/0* meta/1* meta/2* meta/3* meta/4* meta/5* meta/6* meta/7* meta/8* meta/9* meta/a* meta/d* meta/e* meta/f*
mkdir -vp ${{ github.workspace }}/CredData
mv data ${{ github.workspace }}/CredData/
mv meta ${{ github.workspace }}/CredData/

- name: Set up Python 3.8
if: steps.cache-data.outputs.cache-hit != 'true'
uses: actions/setup-python@v3
with:
python-version: "3.8"

- name: Update PIP
run: python -m pip install --upgrade pip

- name: Checkout current CredSweeper
uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
path: CredSweeper.head

- name: Install development packages
run: python -m pip install --requirement CredSweeper.head/requirements.txt

- name: Install experimental packages
# some versions will be changed for compatibility
run: python -m pip install --requirement CredSweeper.head/experiment/requirements.txt

- name: dbg
run: echo ${{ github.workspace }} && ls -al ${{ github.workspace }} && tree ${{ github.workspace }}

- name: Lighten split.json
run: |
mv -vf ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
grep -v '"[0-9ad-f][0-9a-f]\+' ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak >${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json

- name: Run the experiment
run: |
cd CredSweeper.head
ls -al #dbg
pwd #dbg
export PYTHONPATH=$(pwd):${PYTHONPATH}
cd experiment
python -m credsweeper --banner #dbg - check whether credsweeper is available as module
python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
ls -al results #dbg
python -m tf2onnx.convert --saved-model $(find results -mindepth 1 -maxdepth 1 -type d) --output ../credsweeper/ml_model/ml_model.onnx --verbose --rename-inputs feature_input,line_input
git diff #dbg
python -m credsweeper --banner #dbg - crc32 should be changed
python -m credsweeper --log debug --path ../tests/samples --save-json
NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json)
if [ 1000 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then
echo "Failure"
exit 1
fi

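The last step of the experiment job above is a sanity check: after the retrained model is converted to ONNX, CredSweeper is re-run over ../tests/samples and the job fails when the JSON report contains fewer than 1000 findings. A minimal Python sketch of that length check, assuming output.json is the report written by --save-json:

import json

# equivalent of NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json)
with open("output.json") as report:
    findings = json.load(report)

# mirrors the shell check: if [ 1000 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then exit 1; fi
if len(findings) < 1000:
    raise SystemExit("Failure: the retrained model reports too few findings")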
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
8 changes: 4 additions & 4 deletions experiment/augmentation/main.py
@@ -118,10 +118,10 @@ def add_raw_lines(meta_df, filepath, content):
def write2aug_file(repo_local_path, meta_df, aug_file):
fls_path = list(set(meta_df.FilePath))
for filepath in fls_path:
with open(repo_local_path / filepath, "r") as reader:
with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
content = reader.readlines()
add_raw_lines(meta_df, filepath, content)
with open(repo_local_path / aug_file, "w") as writer:
with open(repo_local_path / aug_file, "w", encoding="utf8") as writer:
Rows = meta_df.RawLine
writer.writelines(Rows)

@@ -150,7 +150,7 @@ def get_linage(repo_local_path, df):
files_length = {}
overall_linage = 0
for filepath in fls_path:
with open(repo_local_path / filepath, "r") as reader:
with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
content = reader.readlines()
overall_linage += len(content)
files_length[filepath] = len(content)
@@ -184,7 +184,7 @@ def get_true_row(df, idx, aug_file):
"RawLine": "" #
})
idx += line_diff
t_df = t_df.append(add_series)
t_df = pd.concat([t_df, add_series])
return t_df, idx


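The changes in this file are of two kinds: files are now opened with an explicit encoding="utf8", and the removed pandas DataFrame.append is replaced with pd.concat. A minimal, self-contained sketch of that migration, using a hypothetical frame and row:

import pandas as pd

t_df = pd.DataFrame({"RawLine": ["first line\n"]})  # hypothetical frame
add_row = pd.DataFrame([{"RawLine": ""}])           # hypothetical row to append

# old: t_df = t_df.append(add_row)  (deprecated in pandas 1.4, removed in 2.0)
t_df = pd.concat([t_df, add_row], ignore_index=True)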
4 changes: 3 additions & 1 deletion experiment/main.py
@@ -51,6 +51,7 @@ def main(cred_data_location: str) -> str:
train_repo_list, test_repo_list = load_fixed_split()

df_train = df[df["repo"].isin(train_repo_list)]
df_train = df_train[df_train["value"].notna()]

print('-' * 40)
print(f"Train size: {len(df_train)}")
@@ -78,14 +79,15 @@

os.makedirs("results/", exist_ok=True)
current_time = int(time())
model_file_name = f"results/ml_model_at-{current_time}.h5"
model_file_name = f"results/ml_model_at-{current_time}"
keras_model.save(model_file_name, include_optimizer=False)

print('-' * 40)
print("Validate results on the test subset")
df = join_label(detected_data_copy, meta_data_copy)
df_missing = get_missing(detected_data_copy, meta_data_copy)
df_test = df[df["repo"].isin(test_repo_list)]
df_test = df_test[df_test["value"].notna()]
df_missing_test = df_missing[df_missing["repo"].isin(test_repo_list)]
X_test_value, X_test_features = prepare_data(df_test)
y_test = get_y_labels(df_test)
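Two adjustments here: rows whose "value" is NaN are dropped from both the train and the test frames, and the Keras model is saved without the .h5 suffix, i.e. as a SavedModel directory, which is what the tf2onnx step in the workflow above expects. A minimal sketch of the added NaN filtering, on a hypothetical frame:

import pandas as pd

df = pd.DataFrame({"repo": ["b0", "c1"], "value": ["token123", None]})  # hypothetical data
df_train = df[df["value"].notna()]  # rows without a detected value are dropped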
5 changes: 3 additions & 2 deletions experiment/src/data_loader.py
@@ -1,5 +1,6 @@
import json
import os
import pathlib
from typing import Tuple, Dict
from copy import deepcopy
import pandas as pd
@@ -9,7 +10,7 @@


def strip_data_path(file_path, split="CredData/"):
file_path = file_path.replace("//", "/")
file_path = pathlib.Path(file_path).as_posix()
return file_path.split(split, 1)[-1]


@@ -153,5 +154,5 @@ def eval_with_model(df: pd.DataFrame, df_missing: pd.DataFrame, predictions: np.


def get_y_labels(df: pd.DataFrame) -> np.ndarray:
true_cases = np.array(df["GroundTruth"])
true_cases = np.array(df["GroundTruth"], dtype=np.int32)
return true_cases
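strip_data_path now normalizes the whole path with pathlib instead of only collapsing "//", and get_y_labels casts the labels to int32 explicitly. A short illustration of the normalization, on a hypothetical path:

import pathlib

raw = "/home/runner/CredData//data/file.java"     # hypothetical input
normalized = pathlib.Path(raw).as_posix()         # '/home/runner/CredData/data/file.java'
relative = normalized.split("CredData/", 1)[-1]   # 'data/file.java'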
12 changes: 8 additions & 4 deletions experiment/src/features.py
@@ -1,13 +1,16 @@
import os
import re
from typing import Tuple, Union

from credsweeper.common.constants import Severity
from credsweeper.config import Config
from credsweeper.credentials import Candidate
from credsweeper.credentials import LineData
from credsweeper.ml_model import MlValidator
import numpy as np
import pandas as pd

MlValidator() # Initialize global MLValidator object
ml_validator = MlValidator(0.5) # Initialize global MLValidator object


class CustomLineData(LineData):
@@ -18,6 +21,7 @@ def __init__(self, line: str, value: str, line_num: int, path: str) -> None:
self.line_num: int = line_num
self.path: str = path
self.value = value
self.file_type = os.path.splitext(path)[-1]


def get_candidates(line_data: dict):
@@ -35,10 +39,10 @@ def get_features(line_data: Union[dict, pd.Series]):
value = line_data["value"]
candidates = get_candidates(line_data)

line_input = MlValidator.encode(value, MlValidator.char_to_index)
line_input = ml_validator.encode(value, ml_validator.char_to_index)

common_features = MlValidator.extract_common_features(candidates)
unique_features = MlValidator.extract_unique_features(candidates)
common_features = ml_validator.extract_common_features(candidates)
unique_features = ml_validator.extract_unique_features(candidates)

extracted_features = np.hstack([common_features, unique_features])

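Besides switching from class-level MlValidator calls to an instance constructed with a 0.5 threshold, CustomLineData now derives file_type from the path extension. A minimal illustration of that derivation, on a hypothetical path:

import os

path = "config/production/settings.yaml"  # hypothetical path
file_type = os.path.splitext(path)[-1]    # '.yaml'; empty string when there is no extension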