ML expanded and retrained (#562)

softreset
Samsung · Jun 17, 2024 · bcfae36 · bcfae36
1 parent 606a28d
commit bcfae36
Show file tree

Hide file tree

Showing 23 changed files with 1,648 additions and 1,203 deletions.
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -58,7 +58,7 @@ jobs:
     - name: Check ml_model.onnx integrity
       if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
       run: |
-        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 8f277b2f4a67a9911a9a860f1b5c0489
+        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 1cbfbd7fb1e657d137c9eeec26a07ad4
 
     # # # Python setup
 

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
@@ -224,27 +224,27 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
 TOTAL:                  10336      16998280         8325        60802         5147
-credsweeper result_cnt : 7216, lost_cnt : 0, true_cnt : 6622, false_cnt : 594
+credsweeper result_cnt : 7615, lost_cnt : 0, true_cnt : 7081, false_cnt : 534
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
-API                                     121         3101          184          78    76     2   3283    45  0.000609  0.371901  0.986201  0.974359  0.628099  0.763819
+API                                     121         3101          184         116   113     3   3282     8  0.000913  0.066116  0.996770  0.974138  0.933884  0.953586
 AWS Client ID                           166           13            0         157   157     0     13     9  0.000000  0.054217  0.949721  1.000000  0.945783  0.972136
 AWS Multi                                72           12            0          84    72    11      1     0  0.916667  0.000000  0.869048  0.867470  1.000000  0.929032
 AWS S3 Bucket                            61           25            0          87    61    24      1     0  0.960000  0.000000  0.720930  0.717647  1.000000  0.835616
 Atlassian Old PAT token                  27          212            3          12     3     8    207    24  0.037209  0.888889  0.867769  0.272727  0.111111  0.157895
-Auth                                    319         2749           86         287   263    24   2811    56  0.008466  0.175549  0.974635  0.916376  0.824451  0.867987
+Auth                                    319         2749           86         294   275    19   2816    44  0.006702  0.137931  0.980025  0.935374  0.862069  0.897227
 Azure Access Token                       19            0            0                 0     0      0    19            1.000000  0.000000            0.000000
 BASE64 Private Key                        7            2            0           7     7     0      2     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5     5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
 Bitbucket Client ID                     147         1836            3          45    27    17   1822   120  0.009244  0.816327  0.931017  0.613636  0.183673  0.282723
 Bitbucket Client Secret                 239          535            0          44    33    11    524   206  0.020561  0.861925  0.719638  0.750000  0.138075  0.233216
-Certificate                              22          456            1          16    15     1    456     7  0.002188  0.318182  0.983299  0.937500  0.681818  0.789474
-Credential                               88          130           74          85    82     3    201     6  0.014706  0.068182  0.969178  0.964706  0.931818  0.947977
+Certificate                              22          456            1          17    15     2    455     7  0.004376  0.318182  0.981211  0.882353  0.681818  0.769231
+Credential                               88          130           74          87    86     1    203     2  0.004902  0.022727  0.989726  0.988506  0.977273  0.982857
 Docker Swarm Token                        2            0            0           2     2     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Dropbox App secret                       62          114            0          46    36     9    105    26  0.078947  0.419355  0.801136  0.800000  0.580645  0.672897
 Facebook Access Token                     0            1            0                 0     0      1     0  0.000000            1.000000
 Firebase Domain                           6            1            0           7     6     1      0     0  1.000000  0.000000  0.857143  0.857143  1.000000  0.923077
-Github Old Token                          1            0            0                 0     0      0     1            1.000000  0.000000            0.000000
+Github Old Token                          1            0            0           1     1     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Gitlab Feed Token                       189          465           89          61    47    13    541   142  0.023466  0.751323  0.791386  0.783333  0.248677  0.377510
 Gitlab Incoming Email Token              37            3            0          21    19     2      1    18  0.666667  0.486486  0.500000  0.904762  0.513514  0.655172
 Google API Key                           12            0            0          12    12     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
@@ -253,18 +253,18 @@ Google OAuth Access Token                 3            0            0
 Grafana Provisioned API Key              22            1            0           1     1     0      1    21  0.000000  0.954545  0.086957  1.000000  0.045455  0.086957
 IPv4                                    729          406            0        1205   728   342     64     1  0.842365  0.001372  0.697797  0.680374  0.998628  0.809339
 IPv6                                     33          134            0          33    33     0    134     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-JSON Web Token                          284           11            2         281   272     9      4    12  0.692308  0.042254  0.929293  0.967972  0.957746  0.962832
+JSON Web Token                          284           11            2         274   272     2     11    12  0.153846  0.042254  0.952862  0.992701  0.957746  0.974910
 Jira / Confluence PAT token               0            4            0                 0     0      4     0  0.000000            1.000000
 Jira 2FA                                 14            6            0          10    10     0      6     4  0.000000  0.285714  0.800000  1.000000  0.714286  0.833333
-Key                                     462         7841          462         377   368     9   8294    94  0.001084  0.203463  0.988249  0.976127  0.796537  0.877235
-Nonce                                    79           53            0          36    36     0     53    43  0.000000  0.544304  0.674242  1.000000  0.455696  0.626087
+Key                                     462         7841          462         439   431     8   8295    31  0.000964  0.067100  0.995550  0.981777  0.932900  0.956715
+Nonce                                    79           53            0          84    76     8     45     3  0.150943  0.037975  0.916667  0.904762  0.962025  0.932515
 PEM Private Key                        1019         1483            0        1023  1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1915         7417         2669        1639  1556    83  10003   359  0.008229  0.187467  0.963170  0.949359  0.812533  0.875633
-Salt                                     42           72            2          26    26     0     74    16  0.000000  0.380952  0.862069  1.000000  0.619048  0.764706
-Secret                                 1359        29629          870        1019  1014     5  30494   345  0.000164  0.253863  0.989014  0.995093  0.746137  0.852817
+Password                               1915         7417         2669        1603  1581    22  10064   334  0.002181  0.174413  0.970336  0.986276  0.825587  0.898806
+Salt                                     42           72            2          38    38     0     74     4  0.000000  0.095238  0.965517  1.000000  0.904762  0.950000
+Secret                                 1359        29629          870        1236  1231     5  30494   128  0.000164  0.094187  0.995825  0.995955  0.905813  0.948748
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   572         3959          448         469   458    11   4396   114  0.002496  0.199301  0.974895  0.976546  0.800699  0.879923
+Token                                   572         3959          448         523   504    19   4388    68  0.004311  0.118881  0.982527  0.963671  0.881119  0.920548
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
-URL Credentials                         173          117          252         165   161     4    365    12  0.010840  0.069364  0.970480  0.975758  0.930636  0.952663
-                                       8325        60802         5147        7358  6622   594  60208  1703  0.009769  0.204565  0.966771  0.917683  0.795435  0.852197
+URL Credentials                         173          117          252         165   163     2    367    10  0.005420  0.057803  0.977860  0.987879  0.942197  0.964497
+                                       8325        60802         5147        7757  7081   534  60268  1244  0.008783  0.149429  0.974279  0.929875  0.850571  0.888457
diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py
@@ -86,8 +86,13 @@ def get(confidence: Union[str, "Confidence"]) -> Optional["Confidence"]:
 
 class Base(Enum):
     """Stores types of character sets in lower case"""
+    base16upper = "base16upper"
+    base16lower = "base16lower"
+    base32 = "base32"
     base36 = "base36"
     base64 = "base64"
+    base64std = "base64std"
+    base64url = "base64url"
     hex = "hex"
 
 
@@ -97,6 +102,10 @@ class Chars(Enum):
 
     # set of characters, hexadecimal numeral system (Base16). Upper- and lowercase
     HEX_CHARS = "0123456789ABCDEFabcdef"
+    # set of characters, hexadecimal numeral system (Base16). Uppercase
+    BASE16UPPER = "0123456789ABCDEF"
+    # set of characters, hexadecimal numeral system (Base16). Lowercase
+    BASE16LOWER = "0123456789abcdef"
     # set of 32 characters, used in Base32 encoding
     BASE32_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"
     # set of 36 characters, used in Base36 encoding
@@ -165,6 +174,8 @@ class DiffRowType(Enum):
 CHUNK_SIZE = 4000
 OVERLAP_SIZE = 1000
 CHUNK_STEP_SIZE = CHUNK_SIZE - OVERLAP_SIZE
+# ML hunk size: limits the variable or value length and the substring taken around the value
+ML_HUNK = 80
 """ values according https://docs.python.org/3/library/codecs.html """
 UTF_8 = "utf_8"
 UTF_16 = "utf_16"

diff --git a/credsweeper/filters/value_ip_check.py b/credsweeper/filters/value_ip_check.py
@@ -2,6 +2,7 @@
 import ipaddress
 import re
 
+from credsweeper.common.constants import ML_HUNK
 from credsweeper.config import Config
 from credsweeper.credentials import LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -41,7 +42,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
                 byte_sum = sum(x for x in ip.packed)
                 if 100 > (byte_sum >> 2):
                     # versions usually have low average of sum the bytes
-                    search_text = Util.subtext(line_data.line, line_data.value_start, 80)
+                    search_text = Util.subtext(line_data.line, line_data.value_start, ML_HUNK)
                     if self.FALSE_POSITIVE_PATTERN.search(search_text) \
                             and not self.TRUE_POSITIVE_PATTERN.search(search_text):
                         return True

diff --git a/credsweeper/ml_model/features.py b/credsweeper/ml_model/features.py
@@ -180,6 +180,7 @@ class RenyiEntropy(Feature):
 
     # Constant dictionary to get characters set via name
     CHARS: Dict[Base, Chars] = {  #
+        Base.base32: Chars.BASE32_CHARS,  #
         Base.base36: Chars.BASE36_CHARS,  #
         Base.base64: Chars.BASE64_CHARS,  #
         Base.hex: Chars.HEX_CHARS  #
@@ -253,6 +254,40 @@ def __init__(self, base: str, norm: bool = False) -> None:
         super().__init__(base, 0.0, norm)
 
 
+class CharSet(Feature):
+    """Feature is true when all characters of the value are from a set."""
+
+    # Constant dictionary to get characters set via name
+    # ('=' is appended to both base64 sets, presumably to accept padding)
+    CHARS: Dict[Base, str] = {  #
+        Base.base16upper: Chars.BASE16UPPER.value,  #
+        Base.base16lower: Chars.BASE16LOWER.value,  #
+        Base.base32: Chars.BASE32_CHARS.value,  #
+        Base.base36: Chars.BASE36_CHARS.value,  #
+        Base.base64std: Chars.BASE64STD_CHARS.value + '=',  #
+        Base.base64url: Chars.BASE64URL_CHARS.value + '=',  #
+    }
+
+    def __init__(self, base: str) -> None:
+        """CharSet class initializer.
+
+        Args:
+            base: base set ID
+
+        """
+        super().__init__()
+        self.base: Base = getattr(Base, base)
+
+    def extract(self, candidate: Candidate) -> bool:
+        """Return True only if every character of the first line data value is in the set."""
+        try:
+            # walk the value, not the alphabet: one foreign symbol disqualifies the value
+            allowed = self.CHARS[self.base]
+            return all(ch in allowed for ch in candidate.line_data_list[0].value)
+        except ValueError:
+            return False
+
+
 class FileExtension(Feature):
     """Categorical feature of file type.
 

diff --git a/credsweeper/ml_model/ml_model.onnx b/credsweeper/ml_model/ml_model.onnx
diff --git a/credsweeper/ml_model/ml_validator.py b/credsweeper/ml_model/ml_validator.py
@@ -6,7 +6,7 @@
 import numpy as np
 import onnxruntime as ort
 
-from credsweeper.common.constants import ThresholdPreset
+from credsweeper.common.constants import ThresholdPreset, ML_HUNK
 from credsweeper.credentials import Candidate, CandidateKey
 from credsweeper.ml_model import features
 from credsweeper.utils import Util
@@ -16,8 +16,7 @@
 
 class MlValidator:
     """ML validation class"""
-    HALF_LEN = 80  # limit of variable or value size
-    MAX_LEN = 2 * HALF_LEN  # for whole line limit
+    MAX_LEN = 2 * ML_HUNK  # for whole line limit
     NON_ASCII = '\xFF'
     CHAR_INDEX = {char: index for index, char in enumerate('\0' + string.printable + NON_ASCII)}
     NUM_CLASSES = len(CHAR_INDEX)
@@ -90,14 +89,14 @@ def encode_line(text: str, position: int):
         pos = position - offset
         stripped = text.strip()
         if MlValidator.MAX_LEN < len(stripped):
-            stripped = Util.subtext(stripped, pos, MlValidator.HALF_LEN)
+            stripped = Util.subtext(stripped, pos, ML_HUNK)
         return MlValidator.encode(stripped, MlValidator.MAX_LEN)
 
     @staticmethod
     def encode_value(text: str) -> np.ndarray:
         """Encodes line with balancing for position"""
         stripped = text.strip()
-        return MlValidator.encode(stripped[:MlValidator.HALF_LEN], MlValidator.HALF_LEN)
+        return MlValidator.encode(stripped[:ML_HUNK], ML_HUNK)
 
     def _call_model(self, line_input: np.ndarray, variable_input: np.ndarray, value_input: np.ndarray,
                     feature_input: np.ndarray) -> np.ndarray:

diff --git a/credsweeper/ml_model/model_config.json b/credsweeper/ml_model/model_config.json
@@ -215,6 +215,12 @@
                 "base": "hex"
             }
         },
+        {
+            "type": "ShannonEntropy",
+            "kwargs": {
+                "base": "base32"
+            }
+        },
         {
             "type": "ShannonEntropy",
             "kwargs": {
@@ -252,6 +258,13 @@
                 "alpha": 0.5
             }
         },
+        {
+            "type": "RenyiEntropy",
+            "kwargs": {
+                "base": "base32",
+                "alpha": 0.5
+            }
+        },
         {
             "type": "RenyiEntropy",
             "kwargs": {
@@ -273,6 +286,13 @@
                 "alpha": 2.0
             }
         },
+        {
+            "type": "RenyiEntropy",
+            "kwargs": {
+                "base": "base32",
+                "alpha": 2.0
+            }
+        },
         {
             "type": "RenyiEntropy",
             "kwargs": {
@@ -287,6 +307,42 @@
                 "alpha": 2.0
             }
         },
+        {
+            "type": "CharSet",
+            "kwargs": {
+                "base": "base16upper"
+            }
+        },
+        {
+            "type": "CharSet",
+            "kwargs": {
+                "base": "base16lower"
+            }
+        },
+        {
+            "type": "CharSet",
+            "kwargs": {
+                "base": "base32"
+            }
+        },
+        {
+            "type": "CharSet",
+            "kwargs": {
+                "base": "base36"
+            }
+        },
+        {
+            "type": "CharSet",
+            "kwargs": {
+                "base": "base64std"
+            }
+        },
+        {
+            "type": "CharSet",
+            "kwargs": {
+                "base": "base64url"
+            }
+        },
         {
             "type": "HasHtmlTag"
         },
@@ -368,6 +424,7 @@
                     ".log",
                     ".lua",
                     ".m",
+                    ".map",
                     ".markerb",
                     ".md",
                     ".mjs",
@@ -377,6 +434,7 @@
                     ".moo",
                     ".ndjson",
                     ".nix",
+                    ".nolint",
                     ".odd",
                     ".patch",
                     ".php",

diff --git a/experiment/main.py b/experiment/main.py
@@ -1,3 +1,4 @@
+import math
 import os
 import pathlib
 import random
@@ -9,10 +10,11 @@
 
 import numpy as np
 import tensorflow as tf
-from keras import Model
+from keras import Model  # type: ignore
 from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score
 from sklearn.model_selection import train_test_split
 from sklearn.utils import compute_class_weight
+from tensorflow.keras.callbacks import LearningRateScheduler
 
 from experiment.plot import save_plot
 from experiment.src.data_loader import read_detected_data, read_metadata, join_label, get_y_labels
@@ -51,11 +53,14 @@ def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray
 def main(cred_data_location: str, jobs: int) -> str:
     current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
 
+    dir_path = pathlib.Path("results")
+    os.makedirs(dir_path, exist_ok=True)
+
     prepare_train_data(_cred_data_location, jobs)
     print(f"Train model on data from {cred_data_location}")
 
     # detected data means which data is passed to ML validator of credsweeper after filters with RuleName
-    detected_data = read_detected_data("detected_data.json")
+    detected_data = read_detected_data("results/detected_data.json")
     print(f"CredSweeper detected {len(detected_data)} credentials without ML")
     # all markup data
     meta_data = read_metadata(f"{cred_data_location}/meta")
@@ -122,10 +127,10 @@ def main(cred_data_location: str, jobs: int) -> str:
     y_test = get_y_labels(df_test)
     print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
 
-    keras_model = get_model(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape)
     batch_size = 2048
-    epochs = 16
+    epochs = 35
 
+    keras_model = get_model(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape)
     fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
                                   y=y_train,
                                   batch_size=batch_size,
@@ -136,8 +141,6 @@ def main(cred_data_location: str, jobs: int) -> str:
                                   class_weight=class_weight,
                                   use_multiprocessing=True)
 
-    dir_path = pathlib.Path("results")
-    os.makedirs(dir_path, exist_ok=True)
     model_file_name = dir_path / f"ml_model_at-{current_time}"
     keras_model.save(model_file_name, include_optimizer=False)