
Commit bb52cd2

Update training model experiment
yuliia.t committed Jan 8, 2024
1 parent 253b6ae commit bb52cd2
Showing 4 changed files with 18 additions and 11 deletions.
experiment/augmentation/main.py (8 changes: 4 additions & 4 deletions)
@@ -118,10 +118,10 @@ def add_raw_lines(meta_df, filepath, content):
 def write2aug_file(repo_local_path, meta_df, aug_file):
     fls_path = list(set(meta_df.FilePath))
     for filepath in fls_path:
-        with open(repo_local_path / filepath, "r") as reader:
+        with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
             content = reader.readlines()
         add_raw_lines(meta_df, filepath, content)
-    with open(repo_local_path / aug_file, "w") as writer:
+    with open(repo_local_path / aug_file, "w", encoding="utf8") as writer:
         Rows = meta_df.RawLine
         writer.writelines(Rows)

@@ -150,7 +150,7 @@ def get_linage(repo_local_path, df):
     files_length = {}
     overall_linage = 0
     for filepath in fls_path:
-        with open(repo_local_path / filepath, "r") as reader:
+        with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
             content = reader.readlines()
             overall_linage += len(content)
             files_length[filepath] = len(content)
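
Note: without an explicit encoding, open() falls back to the platform default (locale-dependent, often a cp125x code page on Windows), so the same repository file could decode differently across machines. A minimal sketch of the difference; the file path is a stand-in:

    import locale

    # The implicit codec open() would use without encoding=...:
    print(locale.getpreferredencoding(False))  # e.g. "UTF-8" on Linux, "cp1252" on Windows

    # Pinning encoding="utf8" makes the read reproducible everywhere:
    with open("sample.py", "r", encoding="utf8") as reader:
        content = reader.readlines()
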
@@ -184,7 +184,7 @@ def get_true_row(df, idx, aug_file):
             "RawLine": ""  #
         })
         idx += line_diff
-        t_df = t_df.append(add_series)
+        t_df = pd.concat([t_df, add_series])
     return t_df, idx


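Note: DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0, which is why the loop now builds the frame with pd.concat(). A minimal sketch of one replacement pattern with toy rows (the column name mirrors the script's RawLine):

    import pandas as pd

    t_df = pd.DataFrame({"RawLine": ["line one\n"]})
    add_series = pd.Series({"RawLine": "line two\n"})

    # df.append(series) used to attach the Series as one row; concatenating
    # its one-row frame is the supported equivalent:
    t_df = pd.concat([t_df, add_series.to_frame().T], ignore_index=True)
    print(t_df)
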
experiment/main.py (4 changes: 3 additions & 1 deletion)
@@ -51,6 +51,7 @@ def main(cred_data_location: str) -> str:
     train_repo_list, test_repo_list = load_fixed_split()
 
     df_train = df[df["repo"].isin(train_repo_list)]
+    df_train = df_train[df_train["value"].notna()]
 
     print('-' * 40)
     print(f"Train size: {len(df_train)}")
@@ -78,14 +79,15 @@ def main(cred_data_location: str) -> str:
 
     os.makedirs("results/", exist_ok=True)
     current_time = int(time())
-    model_file_name = f"results/ml_model_at-{current_time}.h5"
+    model_file_name = f"results/ml_model_at-{current_time}"
     keras_model.save(model_file_name, include_optimizer=False)
 
     print('-' * 40)
     print("Validate results on the test subset")
     df = join_label(detected_data_copy, meta_data_copy)
     df_missing = get_missing(detected_data_copy, meta_data_copy)
     df_test = df[df["repo"].isin(test_repo_list)]
+    df_test = df_test[df_test["value"].notna()]
     df_missing_test = df_missing[df_missing["repo"].isin(test_repo_list)]
     X_test_value, X_test_features = prepare_data(df_test)
     y_test = get_y_labels(df_test)
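
Note: dropping the ".h5" suffix changes the on-disk format: in TensorFlow 2.x Keras, a path ending in .h5 selects the single-file HDF5 format, while an extensionless path makes save() write a SavedModel directory. A minimal sketch assuming a TF 2.x environment; the path is a stand-in for the timestamped name:

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

    # Extensionless path -> SavedModel directory rather than a single .h5 file:
    model.save("results/ml_model_at-0", include_optimizer=False)

    # Reloading works the same way for either format:
    restored = tf.keras.models.load_model("results/ml_model_at-0")
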
experiment/src/data_loader.py (5 changes: 3 additions & 2 deletions)
@@ -1,5 +1,6 @@
 import json
 import os
+import pathlib
 from typing import Tuple, Dict
 from copy import deepcopy
 import pandas as pd
@@ -9,7 +10,7 @@
 
 
 def strip_data_path(file_path, split="CredData/"):
-    file_path = file_path.replace("//", "/")
+    file_path = pathlib.Path(file_path).as_posix()
     return file_path.split(split, 1)[-1]
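
Note: the old replace("//", "/") only collapsed doubled forward slashes; pathlib also collapses duplicates, and as_posix() turns Windows backslashes into "/", so the later split on "CredData/" works for paths produced on either OS. A small demonstration using the Pure* classes so it runs anywhere:

    import pathlib

    # Duplicate separators are collapsed, matching the old replace("//", "/"):
    print(pathlib.PurePosixPath("data//CredData/repo/file.py").as_posix())
    # -> data/CredData/repo/file.py

    # Windows-style separators become forward slashes:
    print(pathlib.PureWindowsPath(r"data\CredData\repo\file.py").as_posix())
    # -> data/CredData/repo/file.py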


@@ -153,5 +154,5 @@ def eval_with_model(df: pd.DataFrame, df_missing: pd.DataFrame, predictions: np.
 
 
 def get_y_labels(df: pd.DataFrame) -> np.ndarray:
-    true_cases = np.array(df["GroundTruth"])
+    true_cases = np.array(df["GroundTruth"], dtype=np.int32)
     return true_cases
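
Note: dtype=np.int32 pins the labels to an integer array; without it, a GroundTruth column that mixes bools and ints comes back as dtype=object. Toy example:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"GroundTruth": [True, False, 1, 0]})

    print(np.array(df["GroundTruth"]).dtype)            # object (mixed column)
    print(np.array(df["GroundTruth"], dtype=np.int32))  # [1 0 1 0]
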
experiment/src/features.py (12 changes: 8 additions & 4 deletions)
@@ -1,13 +1,16 @@
+import os
 import re
 from typing import Tuple, Union
 
 from credsweeper.common.constants import Severity
 from credsweeper.config import Config
 from credsweeper.credentials import Candidate
 from credsweeper.credentials import LineData
 from credsweeper.ml_model import MlValidator
 import numpy as np
 import pandas as pd
 
-MlValidator()  # Initialize global MLValidator object
+ml_validator = MlValidator(0.5)  # Initialize global MLValidator object
 
 
 class CustomLineData(LineData):
@@ -18,6 +21,7 @@ def __init__(self, line: str, value: str, line_num: int, path: str) -> None:
         self.line_num: int = line_num
         self.path: str = path
         self.value = value
+        self.file_type = os.path.splitext(path)[-1]
 
 
 def get_candidates(line_data: dict):
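
Note: the new file_type attribute stores whatever os.path.splitext() returns for the extension, i.e. including the leading dot, and an empty string for extensionless paths:

    import os

    for path in ("src/config.yaml", "creds/id_rsa", "Dockerfile"):
        print(repr(os.path.splitext(path)[-1]))  # '.yaml', '', ''
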
@@ -35,10 +39,10 @@ def get_features(line_data: Union[dict, pd.Series]):
     value = line_data["value"]
     candidates = get_candidates(line_data)
 
-    line_input = MlValidator.encode(value, MlValidator.char_to_index)
+    line_input = ml_validator.encode(value, ml_validator.char_to_index)
 
-    common_features = MlValidator.extract_common_features(candidates)
-    unique_features = MlValidator.extract_unique_features(candidates)
+    common_features = ml_validator.extract_common_features(candidates)
+    unique_features = ml_validator.extract_unique_features(candidates)
 
     extracted_features = np.hstack([common_features, unique_features])
 
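Note: the module-level MlValidator() side effect becomes a named instance built with a threshold (0.5), and the former class-level calls become instance calls. A usage sketch limited to the methods this diff touches; the constructor argument comes from the commit itself and the example value is made up:

    from credsweeper.ml_model import MlValidator

    ml_validator = MlValidator(0.5)  # one shared instance, threshold per the commit

    # encode() and char_to_index are used exactly as in get_features():
    line_input = ml_validator.encode("hypothetical_value", ml_validator.char_to_index)

    # Feature extraction follows the same pattern, given CredSweeper Candidate objects:
    # common_features = ml_validator.extract_common_features(candidates)
    # unique_features = ml_validator.extract_unique_features(candidates)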
