
Commit bb52cd2

Update training model experiment
yuliia.t committed Jan 8, 2024
1 parent 253b6ae commit bb52cd2
Showing 4 changed files with 18 additions and 11 deletions.
experiment/augmentation/main.py (8 changes: 4 additions & 4 deletions)
@@ -118,10 +118,10 @@ def add_raw_lines(meta_df, filepath, content):
 def write2aug_file(repo_local_path, meta_df, aug_file):
     fls_path = list(set(meta_df.FilePath))
     for filepath in fls_path:
-        with open(repo_local_path / filepath, "r") as reader:
+        with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
             content = reader.readlines()
         add_raw_lines(meta_df, filepath, content)
-    with open(repo_local_path / aug_file, "w") as writer:
+    with open(repo_local_path / aug_file, "w", encoding="utf8") as writer:
         Rows = meta_df.RawLine
         writer.writelines(Rows)

@@ -150,7 +150,7 @@ def get_linage(repo_local_path, df):
     files_length = {}
     overall_linage = 0
     for filepath in fls_path:
-        with open(repo_local_path / filepath, "r") as reader:
+        with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
             content = reader.readlines()
             overall_linage += len(content)
             files_length[filepath] = len(content)
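
Note: without an explicit encoding, open() falls back to the platform default (locale-dependent, often a cp125x code page on Windows), so the same repository file could decode differently across machines. A minimal sketch of the difference; the file path is a stand-in:

    import locale

    # The implicit codec open() would use without encoding=...:
    print(locale.getpreferredencoding(False))  # e.g. "UTF-8" on Linux, "cp1252" on Windows

    # Pinning encoding="utf8" makes the read reproducible everywhere:
    with open("sample.py", "r", encoding="utf8") as reader:
        content = reader.readlines()
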
@@ -184,7 +184,7 @@ def get_true_row(df, idx, aug_file):
             "RawLine": ""  #
         })
         idx += line_diff
-        t_df = t_df.append(add_series)
+        t_df = pd.concat([t_df, add_series])
     return t_df, idx


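Note: DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0, which is why the loop now builds the frame with pd.concat(). A minimal sketch of one replacement pattern with toy rows (the column name mirrors the script's RawLine):

    import pandas as pd

    t_df = pd.DataFrame({"RawLine": ["line one\n"]})
    add_series = pd.Series({"RawLine": "line two\n"})

    # df.append(series) used to attach the Series as one row; concatenating
    # its one-row frame is the supported equivalent:
    t_df = pd.concat([t_df, add_series.to_frame().T], ignore_index=True)
    print(t_df)
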
experiment/main.py (4 changes: 3 additions & 1 deletion)
@@ -51,6 +51,7 @@ def main(cred_data_location: str) -> str:
     train_repo_list, test_repo_list = load_fixed_split()
 
     df_train = df[df["repo"].isin(train_repo_list)]
+    df_train = df_train[df_train["value"].notna()]
 
     print('-' * 40)
     print(f"Train size: {len(df_train)}")
@@ -78,14 +79,15 @@ def main(cred_data_location: str) -> str:
 
     os.makedirs("results/", exist_ok=True)
     current_time = int(time())
-    model_file_name = f"results/ml_model_at-{current_time}.h5"
+    model_file_name = f"results/ml_model_at-{current_time}"
     keras_model.save(model_file_name, include_optimizer=False)
 
     print('-' * 40)
     print("Validate results on the test subset")
     df = join_label(detected_data_copy, meta_data_copy)
     df_missing = get_missing(detected_data_copy, meta_data_copy)
     df_test = df[df["repo"].isin(test_repo_list)]
+    df_test = df_test[df_test["value"].notna()]
     df_missing_test = df_missing[df_missing["repo"].isin(test_repo_list)]
     X_test_value, X_test_features = prepare_data(df_test)
     y_test = get_y_labels(df_test)
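
Note: dropping the ".h5" suffix changes the on-disk format: in TensorFlow 2.x Keras, a path ending in .h5 selects the single-file HDF5 format, while an extensionless path makes save() write a SavedModel directory. A minimal sketch assuming a TF 2.x environment; the path is a stand-in for the timestamped name:

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

    # Extensionless path -> SavedModel directory rather than a single .h5 file:
    model.save("results/ml_model_at-0", include_optimizer=False)

    # Reloading works the same way for either format:
    restored = tf.keras.models.load_model("results/ml_model_at-0")
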
experiment/src/data_loader.py (5 changes: 3 additions & 2 deletions)
@@ -1,5 +1,6 @@
 import json
 import os
+import pathlib
 from typing import Tuple, Dict
 from copy import deepcopy
 import pandas as pd
@@ -9,7 +10,7 @@
 
 
 def strip_data_path(file_path, split="CredData/"):
-    file_path = file_path.replace("//", "/")
+    file_path = pathlib.Path(file_path).as_posix()
     return file_path.split(split, 1)[-1]
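
Note: the old replace("//", "/") only collapsed doubled forward slashes; pathlib also collapses duplicates, and as_posix() turns Windows backslashes into "/", so the later split on "CredData/" works for paths produced on either OS. A small demonstration using the Pure* classes so it runs anywhere:

    import pathlib

    # Duplicate separators are collapsed, matching the old replace("//", "/"):
    print(pathlib.PurePosixPath("data//CredData/repo/file.py").as_posix())
    # -> data/CredData/repo/file.py

    # Windows-style separators become forward slashes:
    print(pathlib.PureWindowsPath(r"data\CredData\repo\file.py").as_posix())
    # -> data/CredData/repo/file.py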


@@ -153,5 +154,5 @@ def eval_with_model(df: pd.DataFrame, df_missing: pd.DataFrame, predictions: np.
 
 
 def get_y_labels(df: pd.DataFrame) -> np.ndarray:
-    true_cases = np.array(df["GroundTruth"])
+    true_cases = np.array(df["GroundTruth"], dtype=np.int32)
     return true_cases
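
Note: dtype=np.int32 pins the labels to an integer array; without it, a GroundTruth column that mixes bools and ints comes back as dtype=object. Toy example:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"GroundTruth": [True, False, 1, 0]})

    print(np.array(df["GroundTruth"]).dtype)            # object (mixed column)
    print(np.array(df["GroundTruth"], dtype=np.int32))  # [1 0 1 0]
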
experiment/src/features.py (12 changes: 8 additions & 4 deletions)
@@ -1,13 +1,16 @@
+import os
 import re
 from typing import Tuple, Union
 
 from credsweeper.common.constants import Severity
 from credsweeper.config import Config
 from credsweeper.credentials import Candidate
 from credsweeper.credentials import LineData
 from credsweeper.ml_model import MlValidator
 import numpy as np
 import pandas as pd
 
-MlValidator()  # Initialize global MLValidator object
+ml_validator = MlValidator(0.5)  # Initialize global MLValidator object
 
 
 class CustomLineData(LineData):
@@ -18,6 +21,7 @@ def __init__(self, line: str, value: str, line_num: int, path: str) -> None:
         self.line_num: int = line_num
         self.path: str = path
         self.value = value
+        self.file_type = os.path.splitext(path)[-1]
 
 
 def get_candidates(line_data: dict):
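
Note: the new file_type attribute stores whatever os.path.splitext() returns for the extension, i.e. including the leading dot, and an empty string for extensionless paths:

    import os

    for path in ("src/config.yaml", "creds/id_rsa", "Dockerfile"):
        print(repr(os.path.splitext(path)[-1]))  # '.yaml', '', ''
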
@@ -35,10 +39,10 @@ def get_features(line_data: Union[dict, pd.Series]):
     value = line_data["value"]
     candidates = get_candidates(line_data)
 
-    line_input = MlValidator.encode(value, MlValidator.char_to_index)
+    line_input = ml_validator.encode(value, ml_validator.char_to_index)
 
-    common_features = MlValidator.extract_common_features(candidates)
-    unique_features = MlValidator.extract_unique_features(candidates)
+    common_features = ml_validator.extract_common_features(candidates)
+    unique_features = ml_validator.extract_unique_features(candidates)
 
     extracted_features = np.hstack([common_features, unique_features])
 
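Note: the module-level MlValidator() side effect becomes a named instance built with a threshold (0.5), and the former class-level calls become instance calls. A usage sketch limited to the methods this diff touches; the constructor argument comes from the commit itself and the example value is made up:

    from credsweeper.ml_model import MlValidator

    ml_validator = MlValidator(0.5)  # one shared instance, threshold per the commit

    # encode() and char_to_index are used exactly as in get_features():
    line_input = ml_validator.encode("hypothetical_value", ml_validator.char_to_index)

    # Feature extraction follows the same pattern, given CredSweeper Candidate objects:
    # common_features = ml_validator.extract_common_features(candidates)
    # unique_features = ml_validator.extract_unique_features(candidates)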
