Skip to content

Commit

Permalink
[skip actions] [experiment] 2024-01-17T14:21:49+02:00
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jan 17, 2024
1 parent 667f2b7 commit e865870
Show file tree
Hide file tree
Showing 11 changed files with 175 additions and 77 deletions.
80 changes: 80 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -333,4 +333,84 @@ jobs:
exit ${exit_code}
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

experiment:
  # The ML train test lives in this workflow so it can reuse the data set
  # cached by the download_data job.
  needs: [download_data]

  runs-on: ubuntu-latest

  steps:

    - name: Checkout CredData
      uses: actions/checkout@v3
      with:
        repository: Samsung/CredData

    - name: Cache data
      id: cache-data
      uses: actions/cache@v3
      with:
        path: data
        key: cred-data-${{ hashFiles('snapshot.yaml') }}

    # The experiment cannot run without the cached data set, so abort early.
    - name: Failure in case when cache missed
      if: steps.cache-data.outputs.cache-hit != 'true'
      run: exit 1

    - name: Exclude some sets and place to CredData dir
      # Keep only the b* and c* entries so that experiment/src/split.json
      # is easy to adjust to the reduced data set.
      if: steps.cache-data.outputs.cache-hit == 'true'
      run: |
        rm -rf data/[0-9adef]*
        rm -rf meta/[0-9adef]*
        mkdir -vp "${GITHUB_WORKSPACE}/CredData"
        mv data "${GITHUB_WORKSPACE}/CredData/"
        mv meta "${GITHUB_WORKSPACE}/CredData/"

    # No `if:` here: the job already stops on a cache miss, so a
    # `cache-hit != 'true'` guard would keep this step from ever running.
    - name: Set up Python 3.8
      uses: actions/setup-python@v3
      with:
        python-version: "3.8"

    - name: Update PIP
      run: python -m pip install --upgrade pip

    - name: Checkout current CredSweeper
      uses: actions/checkout@v3
      with:
        ref: ${{ github.event.pull_request.head.sha }}
        path: CredSweeper.head

    - name: Install development packages
      run: python -m pip install --requirement CredSweeper.head/requirements.txt

    - name: Install experimental packages
      # some versions will be changed for compatibility
      run: python -m pip install --requirement CredSweeper.head/experiment/requirements.txt

    # Debug aid: show the workspace layout before the experiment starts.
    - name: dbg
      run: echo "${GITHUB_WORKSPACE}" && ls -al "${GITHUB_WORKSPACE}" && tree "${GITHUB_WORKSPACE}"

    - name: Lighten split.json
      run: |
        split_json="${GITHUB_WORKSPACE}/CredSweeper.head/experiment/src/split.json"
        mv -vf "${split_json}" "${split_json}.bak"
        cat "${split_json}.bak"
        # Drop every line with a quoted hex id that does not start with b or c,
        # mirroring the data/meta pruning done above.
        grep -v '"[0-9ad-f][0-9a-f]\+' "${split_json}.bak" >"${split_json}"
        cat "${split_json}"

    - name: Do the experiment
      run: |
        cd CredSweeper.head
        ls -al  # dbg
        pwd  # dbg
        # Make the checked-out credsweeper package importable from experiment/.
        export PYTHONPATH=$(pwd):${PYTHONPATH}
        cd experiment
        python -m credsweeper --banner  # dbg
        python main.py --data "${GITHUB_WORKSPACE}/CredData" -j $(( 2 * $(nproc) ))
        ls -al results
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
2 changes: 1 addition & 1 deletion credsweeper/ml_model/model_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,6 @@
".asciidoc", ".yaml", ".sh", ".c", ".cs", ".php", ".txt", ".yml", ".java", ".ts", ".md", ".js", ".json",
".rb", ".py", ".go"
]}},
{"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "AWS Client ID", "API", "Credential", "Password", "Key", "Auth"]}}
{"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "Github Old Token", "API", "Credential", "Password", "Key", "Auth", "JSON Web Token", "URL Credentials", "Nonce", "Salt", "Certificate"]}}
]
}
26 changes: 13 additions & 13 deletions credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
doc_only: true

- name: API
severity: medium
severity: critical
type: keyword
values:
- api
Expand Down Expand Up @@ -169,7 +169,7 @@
min_line_len: 30

- name: Credential
severity: medium
severity: critical
type: keyword
values:
- credential
Expand Down Expand Up @@ -201,7 +201,7 @@
min_line_len: 31

- name: Github Old Token
severity: high
severity: critical
type: pattern
values:
- (?i)((git)[\w\-]*(token|key|api)[\w\-]*(\s)*(=|:|:=)(\s)*(["']?)(?P<value>[a-z|\d]{40})(["']?))
Expand Down Expand Up @@ -279,7 +279,7 @@
min_line_len: 105

- name: JSON Web Token
severity: medium
severity: critical
type: pattern
values:
- (^|[^.0-9A-Za-z_+-])(?P<value>eyJ[0-9A-Za-z_=-]{15,8000}([.0-9A-Za-z_=-]{1,8000})?)
Expand Down Expand Up @@ -313,7 +313,7 @@
min_line_len: 36

- name: Password
severity: medium
severity: critical
type: keyword
values:
- (?<!by)pass(?!ed|ing|es|\s+[a-z]{3,80})|pw(d|\b)
Expand Down Expand Up @@ -366,7 +366,7 @@
min_line_len: 40

- name: Secret
severity: medium
severity: critical
type: keyword
values:
- secret
Expand Down Expand Up @@ -476,7 +476,7 @@
min_line_len: 50

- name: Token
severity: medium
severity: critical
type: keyword
values:
- token
Expand All @@ -498,7 +498,7 @@
min_line_len: 34

- name: URL Credentials
severity: high
severity: critical
type: pattern
values:
- ://[^:\s]*(?P<separator>:)(?P<value>[^@\s]+)@
Expand All @@ -510,7 +510,7 @@
doc_available: false

- name: Auth
severity: medium
severity: critical
type: keyword
values:
- auth(?!(or|ors)(?!i[tz]))
Expand All @@ -522,7 +522,7 @@
doc_available: false

- name: Key
severity: medium
severity: critical
type: keyword
values:
- key(?!word)
Expand Down Expand Up @@ -604,7 +604,7 @@
min_line_len: 14

- name: Nonce
severity: medium
severity: critical
type: keyword
values:
- nonce
Expand All @@ -616,7 +616,7 @@
doc_available: false

- name: Salt
severity: medium
severity: critical
type: keyword
values:
- salt
Expand All @@ -628,7 +628,7 @@
doc_available: false

- name: Certificate
severity: medium
severity: critical
type: keyword
values:
- cert
Expand Down
Empty file added experiment/__init__.py
Empty file.
81 changes: 41 additions & 40 deletions experiment/augmentation/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import os
from shutil import rmtree
from multiprocessing import Pool
Expand All @@ -9,6 +10,11 @@

from obfuscation import get_obfuscated_value, generate_value, SecretCreds

logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
level="DEBUG")
logger = logging.getLogger(__name__)

BASE_PATH = ["test", "src", "other"]
COLUMN_TYPES = {
"Id": str,
Expand Down Expand Up @@ -67,9 +73,9 @@ def obfuscate_row(row, meta, secret_creds):
obfuscated_value = get_obfuscated_value(value, pattern)
else:
if meta.WithWords == "1" and meta.Category not in [
"Authentication Key & Token", #
"Generic Secret", #
"Generic Token" #
"Authentication Key & Token", #
"Generic Secret", #
"Generic Token" #
]:
obfuscated_value = secret_creds.get_word_secret()
elif meta.Category == "Password":
Expand Down Expand Up @@ -184,37 +190,30 @@ def get_true_row(df, idx, aug_file):
"RawLine": "" #
})
idx += line_diff
t_df = t_df.append(add_series)
t_df = pd.concat([t_df, add_series])
return t_df, idx


def get_false_row(row_numb, aug_filename, files_length, fl_true_lines):
fl_path = list(files_length.keys())
def get_false_row(df, idx, aug_file):
temp_df = df[df["GroundTruth"] == "F"]
fl_path = list(temp_df["FilePath"])
if len(fl_path) == 0:
return None, idx

lines = list(temp_df["LineStart:LineEnd"])
rand = random.randint(0, len(fl_path) - 1)
fl_name = fl_path[rand]
fl_length = files_length[fl_name]
# Filter true lines
true_lines = fl_true_lines[fl_name]
if fl_length == len(true_lines):
return None

t_df = None
while t_df is None:
rand_row = random.randint(1, fl_length)
if rand_row in true_lines:
continue
orig_linenumb = str(rand_row) + ":" + str(rand_row)
new_linenumb = str(row_numb) + ":" + str(row_numb)
t_df = pd.Series({
"FilePath": fl_name,
"LineStart:LineEnd": orig_linenumb,
"GroundTruth": "F",
"New_LineNumb": new_linenumb,
"New_FilePath": aug_filename,
"RawLine": ""
})
return t_df
line_numb = lines[rand].split(":")
t_df = temp_df.iloc[rand].copy()
line_diff = int(line_numb[1]) - int(line_numb[0])
new_linenumb = str(idx) + ":" + str(idx + line_diff)
add_series = pd.Series({
"New_LineNumb": new_linenumb, #
"New_FilePath": aug_file, #
"RawLine": "" #
})
idx += line_diff
t_df = pd.concat([t_df, add_series])
return t_df, idx


def get_true_lines(df):
Expand Down Expand Up @@ -249,7 +248,7 @@ def generate_rows(repo_local_path, aug_filename, df, true_stake, scale):
ground_trues, idx = get_true_row(df, row_numb, aug_filename)
row_numb = idx
else:
ground_trues = get_false_row(row_numb, aug_filename, files_length, fl_true_lines)
ground_trues, idx = get_false_row(df, row_numb, aug_filename)
if ground_trues is None:
row_numb -= 1
continue
Expand Down Expand Up @@ -291,19 +290,21 @@ def aug_dir(arg):
write_meta(new_meta, aug_meta)


def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, scale: float):
def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, scale: float):
""" Build the corpus for this repo.
Parameters
----------
repo_path: str
repo_local_path: str
Path to the CredPosDataset repository
meta_path: str
Path to the metadata
repos_paths: List[str]
List of repos directory names
true_stake:
Part of the rows with "True" cases in the aggregated data
scale:
scale
Returns
-------
Expand Down Expand Up @@ -332,8 +333,8 @@ def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, sca

if __name__ == "__main__":
CredDataDirectory = sys.argv[1]
true_stake = sys.argv[2]
scale = sys.argv[3]
_true_stake = sys.argv[2]
_scale = sys.argv[3]

try:
CredDataDirectory = os.path.abspath(CredDataDirectory)
Expand All @@ -343,20 +344,20 @@ def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, sca
raise ValueError("Please set a valid CredData. It should be a valid path")

try:
true_stake = float(true_stake)
_true_stake = float(_true_stake)
except:
raise ValueError("Please set a valid true_stake. It cannot contain commas, spaces, or characters.")
if true_stake < 0 or true_stake > 0.5:
if _true_stake < 0 or _true_stake > 0.5:
raise ValueError("Please set a valid true_stake. It should be between 0 and 0.5")

try:
scale = float(scale)
_scale = float(_scale)
except:
raise ValueError("Please set a valid scale. It cannot contain commas, spaces, or characters.")

repo_path = Path(CredDataDirectory)
data_path = repo_path / "data"
meta_path = repo_path / "meta"
repos_paths = os.listdir(data_path)
_meta_path = repo_path / "meta"
_repos_paths = os.listdir(data_path)

build_corpus(repo_path, meta_path, repos_paths, true_stake, scale)
build_corpus(repo_path, _meta_path, _repos_paths, _true_stake, _scale)
2 changes: 1 addition & 1 deletion experiment/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def main(cred_data_location: str) -> str:

os.makedirs("results/", exist_ok=True)
current_time = int(time())
model_file_name = f"results/ml_model_at-{current_time}.h5"
model_file_name = f"results/ml_model_at-{current_time}"
keras_model.save(model_file_name, include_optimizer=False)

print('-' * 40)
Expand Down
10 changes: 10 additions & 0 deletions experiment/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
h5py==3.10.0
keras==2.13.1
numpy==1.23.5
onnx==1.15.0
protobuf==3.20.3
tensorflow==2.13.1
tf2onnx==1.16.0
wrapt==1.14.1

tqdm==4.66.1
Loading

0 comments on commit e865870

Please sign in to comment.