remove ML_VALIDATION

Samsung · Jan 15, 2025 · 5a6a8e0 · 5a6a8e0
1 parent 05468db
commit 5a6a8e0
Show file tree

Hide file tree

Showing 11 changed files with 989 additions and 3,023 deletions.
diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -10,7 +10,7 @@
 # Directory of credsweeper sources MUST be placed before imports to avoid circular import error
 APP_PATH = Path(__file__).resolve().parent
 
-from credsweeper.common.constants import KeyValidationOption, Severity, ThresholdPreset, DiffRowType
+from credsweeper.common.constants import Severity, ThresholdPreset, DiffRowType
 from credsweeper.config import Config
 from credsweeper.credentials import Candidate, CredentialManager, CandidateKey
 from credsweeper.deep_scanner.deep_scanner import DeepScanner
@@ -368,11 +368,9 @@ def post_processing(self) -> None:
                     for candidate in group_candidates:
                         if candidate.use_ml:
                             if is_cred[i]:
-                                candidate.ml_validation = KeyValidationOption.VALIDATED_KEY
                                 candidate.ml_probability = probability[i]
                                 new_cred_list.append(candidate)
                         else:
-                            candidate.ml_validation = KeyValidationOption.NOT_AVAILABLE
                             new_cred_list.append(candidate)
             else:
                 logger.info("Skipping ML validation due not applicable")
@@ -435,7 +433,8 @@ def export_results(self, change_type: Optional[DiffRowType] = None) -> None:
                 for line_data in credential.line_data_list:
                     # bright rule name and path or info
                     print(Style.BRIGHT + credential.rule_name +
-                          f" {line_data.info or line_data.path}:{line_data.line_num}" + Style.RESET_ALL)
+                          f" {line_data.info or line_data.path}:{line_data.line_num} {credential.ml_probability}" +
+                          Style.RESET_ALL)
                     print(line_data.get_colored_line(hashed=self.hashed, subtext=self.subtext))
 
         if is_exported is False:

diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py
@@ -97,14 +97,6 @@ class Chars(Enum):
 ENTROPY_LIMIT_BASE3x = 3
 
 
-class KeyValidationOption(Enum):
-    """API validation state"""
-    INVALID_KEY = 0
-    VALIDATED_KEY = 1
-    UNDECIDED = 2
-    NOT_AVAILABLE = 3
-
-
 class GroupType(Enum):
     """Group type - used in Group constructor for load predefined set of filters"""
     KEYWORD = "keyword"

diff --git a/credsweeper/credentials/candidate.py b/credsweeper/credentials/candidate.py
@@ -1,9 +1,9 @@
 import copy
 import re
 from json.encoder import py_encode_basestring_ascii
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
-from credsweeper.common.constants import KeyValidationOption, Severity, Confidence
+from credsweeper.common.constants import Severity, Confidence
 from credsweeper.config import Config
 from credsweeper.credentials.line_data import LineData
 
@@ -39,16 +39,14 @@ def __init__(self,
         self.config = config
         self.use_ml = use_ml
         self.confidence = confidence
-        self.ml_validation = KeyValidationOption.NOT_AVAILABLE
-        self.ml_probability: Optional[float] = None
+        self.ml_probability: Union[None, int, float] = None if use_ml else -1
 
     def compare(self, other: 'Candidate') -> bool:
         """Comparison method - checks only result of final cred"""
         if self.rule_name == other.rule_name \
                 and self.severity == other.severity \
                 and self.confidence == other.confidence \
                 and self.use_ml == other.use_ml \
-                and self.ml_validation == other.ml_validation \
                 and self.ml_probability == other.ml_probability \
                 and len(self.line_data_list) == len(other.line_data_list):
             for i, j in zip(self.line_data_list, other.line_data_list):
@@ -79,7 +77,7 @@ def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
                f" | severity: {self.severity.value}" \
                f" | confidence: {self.confidence.value}" \
                f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
-               f" | ml_validation: {self.ml_validation.name}"
+               f" | ml_probability: {self.ml_probability}"
 
     def __str__(self):
         return self.to_str()
@@ -95,7 +93,6 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
 
         """
         full_output = {
-            "ml_validation": self.ml_validation.name,
             "patterns": [pattern.pattern for pattern in self.patterns],
             "ml_probability": self.ml_probability,
             "rule": self.rule_name,

diff --git a/credsweeper/secret/config.json b/credsweeper/secret/config.json
@@ -164,7 +164,6 @@
         "rule",
         "severity",
         "confidence",
-        "ml_validation",
         "ml_probability",
         "line_data_list"
     ]

diff --git a/docs/source/credsweeper.deep_scanner.rst b/docs/source/credsweeper.deep_scanner.rst
@@ -92,6 +92,14 @@ credsweeper.deep\_scanner.lang\_scanner module
    :undoc-members:
    :show-inheritance:
 
+credsweeper.deep\_scanner.mxfile\_scanner module
+------------------------------------------------
+
+.. automodule:: credsweeper.deep_scanner.mxfile_scanner
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 credsweeper.deep\_scanner.pdf\_scanner module
 ---------------------------------------------
 

diff --git a/docs/source/guide.rst b/docs/source/guide.rst
@@ -84,7 +84,7 @@ Get output as JSON file:
 
 .. code-block:: bash
 
-    python -m credsweeper --ml_validation --path tests/samples/password --save-json output.json
+    python -m credsweeper --path tests/samples/password.gradle --save-json output.json
 
 To check JSON file run:
 
@@ -97,10 +97,10 @@ To check JSON file run:
 
     [
         {
-            "ml_validation": "VALIDATED_KEY",
-            "ml_probability": 0.99755,
+            "ml_probability": 0.9857242107391357,
             "rule": "Password",
             "severity": "medium",
+            "confidence": "moderate",
             "line_data_list": [
                 {
                     "line": "password = \"cackle!\"",
@@ -111,9 +111,10 @@ To check JSON file run:
                     "value_start": 12,
                     "value_end": 19,
                     "variable": "password",
-                    "entropy_validation":
-                    {
-                        "iterator": "BASE64_CHARS",
+                    "variable_start": 0,
+                    "variable_end": 8,
+                    "entropy_validation": {
+                        "iterator": "BASE64STDPAD_CHARS",
                         "entropy": 2.120589933192232,
                         "valid": false
                     }
@@ -126,12 +127,12 @@ Get CLI output only:
 
 .. code-block:: bash
 
-    python -m credsweeper --path tests/samples/password
+    python -m credsweeper --path tests/samples/password.gradle
 
 
-.. code-block:: ruby
+.. code-block:: bash
 
-    rule: Password / severity: medium / line_data_list: [line : 'password = "cackle!"' / line_num : 1 / path : tests/samples/password / entropy_validation: False] / ml_validation: VALIDATED_KEY
+    rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
 
 
 Exclude outputs using CLI:
@@ -143,7 +144,7 @@ Space-like characters at left and right will be ignored.
 
 .. code-block:: bash
 
-    $ python -m credsweeper --path tests/samples/password --denylist list.txt
+    $ python -m credsweeper --path tests/samples/password.gradle --denylist list.txt
     Detected Credentials: 0
     Time Elapsed: 0.07523202896118164s
     $ cat list.txt
@@ -169,7 +170,7 @@ Then specify your config in CLI:
 
 .. code-block:: bash
 
-    $ python -m credsweeper --path tests/samples/password --config my_cfg.json
+    $ python -m credsweeper --path tests/samples/password.gradle --config my_cfg.json
     Detected Credentials: 0
     Time Elapsed: 0.07152628898620605s
 
@@ -192,7 +193,7 @@ Minimal example for scanning line list:
 
 .. code-block:: bash
 
-    rule: Password / severity: medium / line_data_list: [line: 'password='in_line_2'' / line_num: 2 / path:  / value: 'in_line_2' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
+    rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
 
 Minimal example for scanning bytes:
 
@@ -201,7 +202,7 @@ Minimal example for scanning bytes:
     from credsweeper import CredSweeper, ByteContentProvider
 
 
-    to_scan = b"line one\npassword='in_line_2'"
+    to_scan = b"line one\npassword='cackle!'"
     cred_sweeper = CredSweeper()
     provider = ByteContentProvider(to_scan)
     results = cred_sweeper.file_scan(provider)
@@ -210,7 +211,7 @@ Minimal example for scanning bytes:
 
 .. code-block:: bash
 
-    rule: Password / severity: medium / line_data_list: [line: 'password='in_line_2'' / line_num: 2 / path:  / value: 'in_line_2' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
+    rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password='cackle!'' | line_num: 2 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
 
 
 Minimal example for the ML validation:
@@ -220,7 +221,7 @@ Minimal example for the ML validation:
     from credsweeper import CredSweeper, StringContentProvider, MlValidator, ThresholdPreset
 
 
-    to_scan = ["line one", "secret='fgELsRdFA'", "secret='template'"]
+    to_scan = ["line one", "password='cackle!'", "secret='template'"]
     cred_sweeper = CredSweeper()
     provider = StringContentProvider(to_scan)
 
@@ -239,7 +240,7 @@ Note that `"secret='template'"` is not reported due to failing check by the `MlV
 
 .. code-block:: bash
 
-    rule: Secret / severity: medium / line_data_list: [line: 'secret='fgELsRdFA'' / line_num: 2 / path:  / value: 'fgELsRdFA' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
+    rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password='cackle!'' | line_num: 2 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
 
 Configurations
 --------------