[skip actions] [experiment] 2024-01-17T14:21:49+02:00

Samsung · Jan 17, 2024 · e865870 · e865870
1 parent 667f2b7
commit e865870
Show file tree

Hide file tree

Showing 11 changed files with 175 additions and 77 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -333,4 +333,84 @@ jobs:
 
         exit ${exit_code}
 
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+  experiment:
+    # the ML train/test run lives in this workflow so it can reuse the cached data set (see needs: download_data)
+    needs: [ download_data ]
+
+    runs-on: ubuntu-latest
+
+    steps:
+
+      - name: Checkout CredData
+        uses: actions/checkout@v3
+        with:
+          repository: Samsung/CredData
+
+      - name: Cache data
+        id: cache-data
+        uses: actions/cache@v3
+        with:
+          path: data
+          key: cred-data-${{ hashFiles('snapshot.yaml') }}
+
+      - name: Failure in case when cache missed
+        if: steps.cache-data.outputs.cache-hit != 'true'
+        run: exit 1
+
+      - name: Exclude some sets and place to CredData dir
+        # keep only the b* and c* sets so that experiment/src/split.json is easy to trim to match
+        if: steps.cache-data.outputs.cache-hit == 'true'
+        run: |
+          rm -rf data/0* data/1* data/2* data/3* data/4* data/5*  data/6* data/7* data/8* data/9* data/a* data/d* data/e* data/f*
+          rm -rf meta/0* meta/1* meta/2* meta/3* meta/4* meta/5*  meta/6* meta/7* meta/8* meta/9* meta/a* meta/d* meta/e* meta/f*
+          mkdir -vp ${{ github.workspace }}/CredData
+          mv data ${{ github.workspace }}/CredData/
+          mv meta ${{ github.workspace }}/CredData/
+
+      - name: Set up Python 3.8
+        if: steps.cache-data.outputs.cache-hit == 'true'  # run on the cache-hit path; a miss has already failed the job
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.8"
+
+      - name: Update PIP
+        run: python -m pip install --upgrade pip
+
+      - name: Checkout current CredSweeper
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          path: CredSweeper.head
+
+      - name: Install development packages
+        run: python -m pip install --requirement CredSweeper.head/requirements.txt
+
+      - name: Install experimental packages
+        # installed after the development packages: some versions will be changed for compatibility
+        run: python -m pip install --requirement CredSweeper.head/experiment/requirements.txt
+
+      - name: dbg
+        run: echo ${{ github.workspace }} && ls -al ${{ github.workspace }} && tree ${{ github.workspace }}
+
+      - name: Lighten split.json
+        run: |
+          mv -vf ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
+          cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
+          grep -v '"[0-9ad-f][0-9a-f]\+' ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak >${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
+          cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
+
+      - name: Do the experiment
+        run: |
+          cd CredSweeper.head
+          ls -al #dbg
+          pwd #dbg
+          export PYTHONPATH=$(pwd):${PYTHONPATH}
+          cd experiment
+          python -m credsweeper --banner #dbg
+          python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
+          ls -al results
+
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
diff --git a/credsweeper/ml_model/model_config.json b/credsweeper/ml_model/model_config.json
@@ -48,6 +48,6 @@
             ".asciidoc", ".yaml", ".sh", ".c", ".cs", ".php", ".txt", ".yml", ".java", ".ts", ".md", ".js", ".json",
             ".rb", ".py", ".go"
         ]}},
-    {"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "AWS Client ID", "API", "Credential", "Password", "Key", "Auth"]}}
+    {"type": "RuleName", "kwargs": {"rule_names": ["Token", "Secret", "Github Old Token", "API", "Credential", "Password", "Key", "Auth", "JSON Web Token", "URL Credentials", "Nonce", "Salt", "Certificate"]}}
   ]
 }
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
@@ -100,7 +100,7 @@
   doc_only: true
 
 - name: API
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - api
@@ -169,7 +169,7 @@
   min_line_len: 30
 
 - name: Credential
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - credential
@@ -201,7 +201,7 @@
   min_line_len: 31
 
 - name: Github Old Token
-  severity: high
+  severity: critical
   type: pattern
   values:
     - (?i)((git)[\w\-]*(token|key|api)[\w\-]*(\s)*(=|:|:=)(\s)*(["']?)(?P<value>[a-z|\d]{40})(["']?))
@@ -279,7 +279,7 @@
   min_line_len: 105
 
 - name: JSON Web Token
-  severity: medium
+  severity: critical
   type: pattern
   values:
     - (^|[^.0-9A-Za-z_+-])(?P<value>eyJ[0-9A-Za-z_=-]{15,8000}([.0-9A-Za-z_=-]{1,8000})?)
@@ -313,7 +313,7 @@
   min_line_len: 36
 
 - name: Password
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - (?<!by)pass(?!ed|ing|es|\s+[a-z]{3,80})|pw(d|\b)
@@ -366,7 +366,7 @@
   min_line_len: 40
 
 - name: Secret
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - secret
@@ -476,7 +476,7 @@
   min_line_len: 50
 
 - name: Token
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - token
@@ -498,7 +498,7 @@
   min_line_len: 34
 
 - name: URL Credentials
-  severity: high
+  severity: critical
   type: pattern
   values:
     - ://[^:\s]*(?P<separator>:)(?P<value>[^@\s]+)@
@@ -510,7 +510,7 @@
   doc_available: false
 
 - name: Auth
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - auth(?!(or|ors)(?!i[tz]))
@@ -522,7 +522,7 @@
   doc_available: false
 
 - name: Key
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - key(?!word)
@@ -604,7 +604,7 @@
   min_line_len: 14
 
 - name: Nonce
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - nonce
@@ -616,7 +616,7 @@
   doc_available: false
 
 - name: Salt
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - salt
@@ -628,7 +628,7 @@
   doc_available: false
 
 - name: Certificate
-  severity: medium
+  severity: critical
   type: keyword
   values:
     - cert

diff --git a/experiment/__init__.py b/experiment/__init__.py
diff --git a/experiment/augmentation/main.py b/experiment/augmentation/main.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from shutil import rmtree
 from multiprocessing import Pool
@@ -9,6 +10,11 @@
 
 from obfuscation import get_obfuscated_value, generate_value, SecretCreds
 
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
+    level="DEBUG")
+logger = logging.getLogger(__name__)
+
 BASE_PATH = ["test", "src", "other"]
 COLUMN_TYPES = {
     "Id": str,
@@ -67,9 +73,9 @@ def obfuscate_row(row, meta, secret_creds):
         obfuscated_value = get_obfuscated_value(value, pattern)
     else:
         if meta.WithWords == "1" and meta.Category not in [
-                "Authentication Key & Token",  #
-                "Generic Secret",  #
-                "Generic Token"  #
+            "Authentication Key & Token",  #
+            "Generic Secret",  #
+            "Generic Token"  #
         ]:
             obfuscated_value = secret_creds.get_word_secret()
         elif meta.Category == "Password":
@@ -184,37 +190,30 @@ def get_true_row(df, idx, aug_file):
         "RawLine": ""  #
     })
     idx += line_diff
-    t_df = t_df.append(add_series)
+    t_df = pd.concat([t_df, add_series])
     return t_df, idx
 
 
-def get_false_row(row_numb, aug_filename, files_length, fl_true_lines):
-    fl_path = list(files_length.keys())
+def get_false_row(df, idx, aug_file):
+    temp_df = df[df["GroundTruth"] == "F"]
+    fl_path = list(temp_df["FilePath"])
+    if len(fl_path) == 0:
+        return None, idx
 
+    lines = list(temp_df["LineStart:LineEnd"])
     rand = random.randint(0, len(fl_path) - 1)
-    fl_name = fl_path[rand]
-    fl_length = files_length[fl_name]
-    # Filter true lines
-    true_lines = fl_true_lines[fl_name]
-    if fl_length == len(true_lines):
-        return None
-
-    t_df = None
-    while t_df is None:
-        rand_row = random.randint(1, fl_length)
-        if rand_row in true_lines:
-            continue
-        orig_linenumb = str(rand_row) + ":" + str(rand_row)
-        new_linenumb = str(row_numb) + ":" + str(row_numb)
-        t_df = pd.Series({
-            "FilePath": fl_name,
-            "LineStart:LineEnd": orig_linenumb,
-            "GroundTruth": "F",
-            "New_LineNumb": new_linenumb,
-            "New_FilePath": aug_filename,
-            "RawLine": ""
-        })
-    return t_df
+    line_numb = lines[rand].split(":")
+    t_df = temp_df.iloc[rand].copy()
+    line_diff = int(line_numb[1]) - int(line_numb[0])
+    new_linenumb = str(idx) + ":" + str(idx + line_diff)
+    add_series = pd.Series({
+        "New_LineNumb": new_linenumb,  #
+        "New_FilePath": aug_file,  #
+        "RawLine": ""  #
+    })
+    idx += line_diff
+    t_df = pd.concat([t_df, add_series])
+    return t_df, idx
 
 
 def get_true_lines(df):
@@ -249,7 +248,7 @@ def generate_rows(repo_local_path, aug_filename, df, true_stake, scale):
             ground_trues, idx = get_true_row(df, row_numb, aug_filename)
             row_numb = idx
         else:
-            ground_trues = get_false_row(row_numb, aug_filename, files_length, fl_true_lines)
+            ground_trues, idx = get_false_row(df, row_numb, aug_filename)
         if ground_trues is None:
             row_numb -= 1
             continue
@@ -291,19 +290,21 @@ def aug_dir(arg):
         write_meta(new_meta, aug_meta)
 
 
-def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, scale: float):
+def build_corpus(repo_local_path: Path, meta_path: Path, repos_paths, true_stake: float, scale: float):
     """ Build the corpus for this repo.
 
         Parameters
         ----------
-        repo_path: str
+        repo_local_path: Path
             Path to the CredPosDataset repository
         meta_path: str
             Path to the metadata
         repos_paths: List[str]
             List of repos directory names
         true_stake:
             Part of the rows with "True" cases in the aggregated data
+        scale:
+            scale
 
         Returns
         -------
@@ -332,8 +333,8 @@ def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, sca
 
 if __name__ == "__main__":
     CredDataDirectory = sys.argv[1]
-    true_stake = sys.argv[2]
-    scale = sys.argv[3]
+    _true_stake = sys.argv[2]
+    _scale = sys.argv[3]
 
     try:
         CredDataDirectory = os.path.abspath(CredDataDirectory)
@@ -343,20 +344,20 @@ def build_corpus(repo_local_path, meta_path, repos_paths, true_stake: float, sca
         raise ValueError("Please set a valid CredData. It should be a valid path")
 
     try:
-        true_stake = float(true_stake)
+        _true_stake = float(_true_stake)
     except:
         raise ValueError("Please set a valid true_stake. It cannot contain commas, spaces, or characters.")
-    if true_stake < 0 or true_stake > 0.5:
+    if _true_stake < 0 or _true_stake > 0.5:
         raise ValueError("Please set a valid true_stake. It should be between 0 and 0.5")
 
     try:
-        scale = float(scale)
+        _scale = float(_scale)
     except:
         raise ValueError("Please set a valid scale. It cannot contain commas, spaces, or characters.")
 
     repo_path = Path(CredDataDirectory)
     data_path = repo_path / "data"
-    meta_path = repo_path / "meta"
-    repos_paths = os.listdir(data_path)
+    _meta_path = repo_path / "meta"
+    _repos_paths = os.listdir(data_path)
 
-    build_corpus(repo_path, meta_path, repos_paths, true_stake, scale)
+    build_corpus(repo_path, _meta_path, _repos_paths, _true_stake, _scale)
diff --git a/experiment/main.py b/experiment/main.py
@@ -78,7 +78,7 @@ def main(cred_data_location: str) -> str:
 
     os.makedirs("results/", exist_ok=True)
     current_time = int(time())
-    model_file_name = f"results/ml_model_at-{current_time}.h5"
+    model_file_name = f"results/ml_model_at-{current_time}"
     keras_model.save(model_file_name, include_optimizer=False)
 
     print('-' * 40)

diff --git a/experiment/requirements.txt b/experiment/requirements.txt
@@ -0,0 +1,10 @@
+h5py==3.10.0
+keras==2.13.1
+numpy==1.23.5
+onnx==1.15.0
+protobuf==3.20.3
+tensorflow==2.13.1
+tf2onnx==1.16.0
+wrapt==1.14.1
+
+tqdm==4.66.1