Tmp #487

Closed
wants to merge 2 commits into from

86 changes: 86 additions & 0 deletions .github/workflows/benchmark.yml
@@ -1,4 +1,4 @@
# This workflow runs benchmark

# Separation of jobs helps to cache data even if the benchmark fails

name: Benchmark
@@ -332,3 +332,89 @@

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

experiment:
# the ML train/test job is placed here to reuse the cached data set
needs: [ download_data ]

runs-on: ubuntu-latest

steps:

- name: Checkout CredData
uses: actions/checkout@v3
with:
repository: Samsung/CredData

- name: Cache data
id: cache-data
uses: actions/cache@v3
with:
path: data
key: cred-data-${{ hashFiles('snapshot.yaml') }}

- name: Fail when the cache is missed
if: steps.cache-data.outputs.cache-hit != 'true'
run: exit 1

- name: Exclude some sets and place to CredData dir
# keep b* & c* only to easy experiment/src/split.json
run: |
rm -rf data/0* data/1* data/2* data/3* data/4* data/5* data/6* data/7* data/8* data/9* data/a* data/d* data/e* data/f*
rm -rf meta/0* meta/1* meta/2* meta/3* meta/4* meta/5* meta/6* meta/7* meta/8* meta/9* meta/a* meta/d* meta/e* meta/f*
mkdir -vp ${{ github.workspace }}/CredData
mv data ${{ github.workspace }}/CredData/
mv meta ${{ github.workspace }}/CredData/

- name: Set up Python 3.8
if: steps.cache-data.outputs.cache-hit != 'true'
uses: actions/setup-python@v3
with:
python-version: "3.8"

- name: Update PIP
run: python -m pip install --upgrade pip

- name: Checkout current CredSweeper
uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
path: CredSweeper.head

- name: Install development packages
run: python -m pip install --requirement CredSweeper.head/requirements.txt

- name: Install experimental packages
# some versions will be changed for compatibility
run: python -m pip install --requirement CredSweeper.head/experiment/requirements.txt

- name: dbg
run: echo ${{ github.workspace }} && ls -al ${{ github.workspace }} && tree ${{ github.workspace }}

- name: Lighten split.json
run: |
mv -vf ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
grep -v '"[0-9ad-f][0-9a-f]\+' ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak >${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json

- name: Run the experiment
run: |
cd CredSweeper.head
ls -al #dbg
pwd #dbg
export PYTHONPATH=$(pwd):${PYTHONPATH}
cd experiment
python -m credsweeper --banner #dbg - check whether credsweeper is available as module
python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
ls -al results #dbg
python -m tf2onnx.convert --saved-model $(find results -mindepth 1 -maxdepth 1 -type d) --output ../credsweeper/ml_model/ml_model.onnx --verbose --rename-inputs feature_input,line_input
git diff #dbg
python -m credsweeper --banner #dbg - crc32 should be changed
python -m credsweeper --log debug --path ../tests/samples --save-json
NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json)
if [ 1000 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then
echo "Failure"
exit 1
fi

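The last step of the experiment job above is a sanity check: after the retrained model is converted to ONNX, CredSweeper is re-run over ../tests/samples and the job fails when the JSON report contains fewer than 1000 findings. A minimal Python sketch of that length check, assuming output.json is the report written by --save-json:

import json

# equivalent of NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json)
with open("output.json") as report:
    findings = json.load(report)

# mirrors the shell check: if [ 1000 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then exit 1; fi
if len(findings) < 1000:
    raise SystemExit("Failure: the retrained model reports too few findings")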
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
8 changes: 4 additions & 4 deletions experiment/augmentation/main.py
@@ -118,10 +118,10 @@ def add_raw_lines(meta_df, filepath, content):
def write2aug_file(repo_local_path, meta_df, aug_file):
fls_path = list(set(meta_df.FilePath))
for filepath in fls_path:
with open(repo_local_path / filepath, "r") as reader:
with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
content = reader.readlines()
add_raw_lines(meta_df, filepath, content)
with open(repo_local_path / aug_file, "w") as writer:
with open(repo_local_path / aug_file, "w", encoding="utf8") as writer:
Rows = meta_df.RawLine
writer.writelines(Rows)

@@ -150,7 +150,7 @@ def get_linage(repo_local_path, df):
files_length = {}
overall_linage = 0
for filepath in fls_path:
with open(repo_local_path / filepath, "r") as reader:
with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
content = reader.readlines()
overall_linage += len(content)
files_length[filepath] = len(content)
@@ -184,7 +184,7 @@ def get_true_row(df, idx, aug_file):
"RawLine": "" #
})
idx += line_diff
t_df = t_df.append(add_series)
t_df = pd.concat([t_df, add_series])
return t_df, idx


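The changes in this file are of two kinds: files are now opened with an explicit encoding="utf8", and the removed pandas DataFrame.append is replaced with pd.concat. A minimal, self-contained sketch of that migration, using a hypothetical frame and row:

import pandas as pd

t_df = pd.DataFrame({"RawLine": ["first line\n"]})  # hypothetical frame
add_row = pd.DataFrame([{"RawLine": ""}])           # hypothetical row to append

# old: t_df = t_df.append(add_row)  (deprecated in pandas 1.4, removed in 2.0)
t_df = pd.concat([t_df, add_row], ignore_index=True)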
4 changes: 3 additions & 1 deletion experiment/main.py
@@ -51,6 +51,7 @@ def main(cred_data_location: str) -> str:
train_repo_list, test_repo_list = load_fixed_split()

df_train = df[df["repo"].isin(train_repo_list)]
df_train = df_train[df_train["value"].notna()]

print('-' * 40)
print(f"Train size: {len(df_train)}")
@@ -78,14 +79,15 @@

os.makedirs("results/", exist_ok=True)
current_time = int(time())
model_file_name = f"results/ml_model_at-{current_time}.h5"
model_file_name = f"results/ml_model_at-{current_time}"
keras_model.save(model_file_name, include_optimizer=False)

print('-' * 40)
print("Validate results on the test subset")
df = join_label(detected_data_copy, meta_data_copy)
df_missing = get_missing(detected_data_copy, meta_data_copy)
df_test = df[df["repo"].isin(test_repo_list)]
df_test = df_test[df_test["value"].notna()]
df_missing_test = df_missing[df_missing["repo"].isin(test_repo_list)]
X_test_value, X_test_features = prepare_data(df_test)
y_test = get_y_labels(df_test)
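Two adjustments here: rows whose "value" is NaN are dropped from both the train and the test frames, and the Keras model is saved without the .h5 suffix, i.e. as a SavedModel directory, which is what the tf2onnx step in the workflow above expects. A minimal sketch of the added NaN filtering, on a hypothetical frame:

import pandas as pd

df = pd.DataFrame({"repo": ["b0", "c1"], "value": ["token123", None]})  # hypothetical data
df_train = df[df["value"].notna()]  # rows without a detected value are dropped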
5 changes: 3 additions & 2 deletions experiment/src/data_loader.py
@@ -1,5 +1,6 @@
import json
import os
import pathlib
from typing import Tuple, Dict
from copy import deepcopy
import pandas as pd
@@ -9,7 +10,7 @@


def strip_data_path(file_path, split="CredData/"):
file_path = file_path.replace("//", "/")
file_path = pathlib.Path(file_path).as_posix()
return file_path.split(split, 1)[-1]


@@ -153,5 +154,5 @@ def eval_with_model(df: pd.DataFrame, df_missing: pd.DataFrame, predictions: np.


def get_y_labels(df: pd.DataFrame) -> np.ndarray:
true_cases = np.array(df["GroundTruth"])
true_cases = np.array(df["GroundTruth"], dtype=np.int32)
return true_cases
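strip_data_path now normalizes the whole path with pathlib instead of only collapsing "//", and get_y_labels casts the labels to int32 explicitly. A short illustration of the normalization, on a hypothetical path:

import pathlib

raw = "/home/runner/CredData//data/file.java"     # hypothetical input
normalized = pathlib.Path(raw).as_posix()         # '/home/runner/CredData/data/file.java'
relative = normalized.split("CredData/", 1)[-1]   # 'data/file.java'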
12 changes: 8 additions & 4 deletions experiment/src/features.py
@@ -1,13 +1,16 @@
import os
import re
from typing import Tuple, Union

from credsweeper.common.constants import Severity
from credsweeper.config import Config
from credsweeper.credentials import Candidate
from credsweeper.credentials import LineData
from credsweeper.ml_model import MlValidator
import numpy as np
import pandas as pd

MlValidator() # Initialize global MLValidator object
ml_validator = MlValidator(0.5) # Initialize global MLValidator object


class CustomLineData(LineData):
@@ -18,6 +21,7 @@ def __init__(self, line: str, value: str, line_num: int, path: str) -> None:
self.line_num: int = line_num
self.path: str = path
self.value = value
self.file_type = os.path.splitext(path)[-1]


def get_candidates(line_data: dict):
@@ -35,10 +39,10 @@ def get_features(line_data: Union[dict, pd.Series]):
value = line_data["value"]
candidates = get_candidates(line_data)

line_input = MlValidator.encode(value, MlValidator.char_to_index)
line_input = ml_validator.encode(value, ml_validator.char_to_index)

common_features = MlValidator.extract_common_features(candidates)
unique_features = MlValidator.extract_unique_features(candidates)
common_features = ml_validator.extract_common_features(candidates)
unique_features = ml_validator.extract_unique_features(candidates)

extracted_features = np.hstack([common_features, unique_features])

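Besides switching from class-level MlValidator calls to an instance constructed with a 0.5 threshold, CustomLineData now derives file_type from the path extension. A minimal illustration of that derivation, on a hypothetical path:

import os

path = "config/production/settings.yaml"  # hypothetical path
file_type = os.path.splitext(path)[-1]    # '.yaml'; empty string when there is no extension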