Thrifty memory consumption

Samsung · Feb 5, 2025 · 3b855cd
1 parent 03b7640
commit 3b855cd
Show file tree

Hide file tree

Showing 17 changed files with 170 additions and 125 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -136,7 +136,7 @@ jobs:
 
       - name: Run CredSweeper tool
         run: |
-          credsweeper --banner --log info --jobs $(nproc) --sort --subtext --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
+          credsweeper --banner --log info --jobs $(nproc) --thrifty --sort --subtext --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
 
       - name: Run Benchmark
         run: |

diff --git a/credsweeper/__main__.py b/credsweeper/__main__.py
@@ -203,6 +203,10 @@ def get_arguments() -> Namespace:
                         dest="jobs",
                         default=1,
                         metavar="POSITIVE_INT")
+    parser.add_argument("--thrifty",
+                        help="clear objects after scan to reduce memory consumption",
+                        action="store_const",
+                        const=True)
     parser.add_argument("--skip_ignored",
                         help="parse .gitignore files and skip credentials from ignored objects",
                         dest="skip_ignored",
@@ -295,6 +299,7 @@ def scan(args: Namespace, content_provider: AbstractProvider) -> int:
                                   size_limit=args.size_limit,
                                   exclude_lines=denylist,
                                   exclude_values=denylist,
+                                  thrifty=args.thrifty,
                                   log_level=args.log)
         return credsweeper.run(content_provider=content_provider)
     except Exception as exc:

diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -59,6 +59,7 @@ def __init__(self,
                  size_limit: Optional[str] = None,
                  exclude_lines: Optional[List[str]] = None,
                  exclude_values: Optional[List[str]] = None,
+                 thrifty: bool = False,
                  log_level: Optional[str] = None) -> None:
         """Initialize Advanced credential scanner.
 
@@ -86,6 +87,7 @@ def __init__(self,
             size_limit: optional string integer or human-readable format to skip oversize files
             exclude_lines: lines to omit in scan. Will be added to the lines already in config
             exclude_values: values to omit in scan. Will be added to the values already in config
+            thrifty: free provider resources after scan to reduce memory consumption
             log_level: str - level for pool initializer according logging levels (UPPERCASE)
 
         """
@@ -118,6 +120,7 @@ def __init__(self,
         self.ml_model = ml_model
         self.ml_providers = ml_providers
         self.ml_validator = None
+        self.__thrifty = thrifty
         self.__log_level = log_level
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@@ -300,8 +303,10 @@ def files_scan(
             content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> List[Candidate]:
         """Auxiliary method for scan one sequence"""
         all_cred: List[Candidate] = []
-        for i in content_providers:
-            candidates = self.file_scan(i)
+        for provider in content_providers:
+            candidates = self.file_scan(provider)
+            if self.__thrifty:
+                provider.free()
             all_cred.extend(candidates)
         logger.info(f"Completed: processed {len(content_providers)} providers with {len(all_cred)} candidates")
         return all_cred

diff --git a/credsweeper/file_handler/byte_content_provider.py b/credsweeper/file_handler/byte_content_provider.py
@@ -1,3 +1,4 @@
+from functools import cached_property
 from typing import List, Optional, Generator
 
 from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -20,31 +21,30 @@ def __init__(
 
         """
         super().__init__(file_path=file_path, file_type=file_type, info=info)
-        self.data = content
+        self.__data = content
         self.__lines: Optional[List[str]] = None
 
-    @property
+    @cached_property
     def data(self) -> Optional[bytes]:
-        """data getter for ByteContentProvider"""
+        """data RO getter for ByteContentProvider"""
         return self.__data
 
-    @data.setter
-    def data(self, data: Optional[bytes]) -> None:
-        """data setter for ByteContentProvider"""
-        self.__data = data
+    def free(self) -> None:
+        """free data after scan to reduce memory usage"""
+        self.__data = None
+        if hasattr(self, "data"):
+            delattr(self, "data")
+        self.__lines = None
+        if hasattr(self, "lines"):
+            delattr(self, "lines")
 
-    @property
+    @cached_property
     def lines(self) -> List[str]:
-        """lines getter for ByteContentProvider"""
+        """lines RO getter for ByteContentProvider"""
         if self.__lines is None:
             self.__lines = Util.decode_bytes(self.__data)
         return self.__lines if self.__lines is not None else []
 
-    @lines.setter
-    def lines(self, lines: List[str]) -> None:
-        """lines setter for ByteContentProvider"""
-        self.__lines = lines
-
     def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
         """Return lines to scan.
 

diff --git a/credsweeper/file_handler/content_provider.py b/credsweeper/file_handler/content_provider.py
@@ -64,16 +64,15 @@ def info(self) -> str:
         """info getter"""
         return self.__descriptor.info
 
-    @property
+    @cached_property
     @abstractmethod
     def data(self) -> Optional[bytes]:
         """abstract data getter"""
         raise NotImplementedError(__name__)
 
-    @data.setter
     @abstractmethod
-    def data(self, data: Optional[bytes]) -> None:
-        """abstract data setter"""
+    def free(self) -> None:
+        """free data after scan to reduce memory usage"""
         raise NotImplementedError(__name__)
 
     def lines_to_targets(

diff --git a/credsweeper/file_handler/data_content_provider.py b/credsweeper/file_handler/data_content_provider.py
@@ -2,6 +2,7 @@
 import logging
 import string
 import warnings
+from functools import cached_property
 from typing import List, Optional, Any, Generator, Callable, Tuple
 
 import yaml
@@ -37,30 +38,41 @@ def __init__(
 
         """
         super().__init__(file_path=file_path, file_type=file_type, info=info)
-        self.__inited_text: str = ""
-        self.data = data
+        self.__data = data
+        self.__text: Optional[str] = None
         self.structure: Optional[List[Any]] = None
         self.decoded: Optional[bytes] = None
         self.lines: List[str] = []
         self.line_numbers: List[int] = []
         self.__html_lines_size = len(data)  # the size is used to limit extra memory consumption during html combination
 
-    @property
+    @cached_property
     def data(self) -> Optional[bytes]:
-        """data getter for DataContentProvider"""
+        """data RO getter for DataContentProvider and the property is used in deep scan"""
         return self.__data
 
-    @data.setter
-    def data(self, data: Optional[bytes]) -> None:
-        """data setter for DataContentProvider"""
-        self.__data = data
-
-    @property
-    def __text(self) -> str:
-        """Getter which throws exception in case of bad decoding"""
-        if not self.__inited_text:
-            self.__inited_text = self.data.decode(encoding=DEFAULT_ENCODING, errors="strict")
-        return self.__inited_text
+    def free(self) -> None:
+        """free data after scan to reduce memory usage"""
+        self.__data = None
+        if hasattr(self, "data"):
+            delattr(self, "data")
+        self.__text = None
+        if hasattr(self, "text"):
+            delattr(self, "text")
+        self.structure = None
+        self.decoded = None
+        self.lines = []
+        self.line_numbers = []
+
+    @cached_property
+    def text(self) -> str:
+        """Getter to produce a text from DEFAULT_ENCODING. Empty str for unrecognized data"""
+        if self.__text is None:
+            try:
+                self.__text = self.__data.decode(encoding=DEFAULT_ENCODING, errors="strict")
+            except Exception:
+                self.__text = ''
+        return self.__text
 
     def __is_structure(self) -> bool:
         """Check whether a structure was recognized"""
@@ -71,15 +83,12 @@ def represent_as_structure(self) -> bool:
         """Tries to convert data with many parsers. Stores result to internal structure
         Return True if some structure found
         """
-        try:
-            if MIN_DATA_LEN > len(self.__text):
-                return False
-        except Exception:
+        if MIN_DATA_LEN > len(self.text):
             return False
         # JSON & NDJSON
-        if "{" in self.__text and "}" in self.__text and "\"" in self.__text and ":" in self.__text:
+        if "{" in self.text and "}" in self.text and "\"" in self.text and ":" in self.text:
             try:
-                self.structure = json.loads(self.__text)
+                self.structure = json.loads(self.text)
                 logger.debug("CONVERTED from json")
             except Exception as exc:
                 logger.debug("Cannot parse as json:%s %s", exc, self.data)
@@ -88,7 +97,7 @@ def represent_as_structure(self) -> bool:
                     return True
             try:
                 self.structure = []
-                for line in self.__text.splitlines():
+                for line in self.text.splitlines():
                     # each line must be in json format, otherwise - exception rises
                     self.structure.append(json.loads(line))
                 logger.debug("CONVERTED from ndjson")
@@ -104,8 +113,8 @@ def represent_as_structure(self) -> bool:
         # # # Python
         try:
             # search only in sources with strings
-            if (";" in self.__text or 2 < self.__text.count("\n")) and ("\"" in self.__text or "'" in self.__text):
-                self.structure = Util.parse_python(self.__text)
+            if (";" in self.text or 2 < self.text.count("\n")) and ("\"" in self.text or "'" in self.text):
+                self.structure = Util.parse_python(self.text)
                 logger.debug("CONVERTED from Python")
             else:
                 logger.debug("Data do not contain line feed - weak PYTHON")
@@ -116,8 +125,8 @@ def represent_as_structure(self) -> bool:
                 return True
         # # # YAML - almost always recognized
         try:
-            if ":" in self.__text and 2 < self.__text.count("\n"):
-                self.structure = yaml.load(self.__text, Loader=yaml.FullLoader)
+            if ":" in self.text and 2 < self.text.count("\n"):
+                self.structure = yaml.load(self.text, Loader=yaml.FullLoader)
                 logger.debug("CONVERTED from yaml")
             else:
                 logger.debug("Data do not contain colon mark - weak YAML")
@@ -136,11 +145,11 @@ def represent_as_xml(self) -> bool:
              True if reading was successful
 
         """
-        if MIN_XML_LEN > len(self.data):
+        if MIN_XML_LEN > len(self.text):
             return False
         try:
-            if "<" in self.__text and ">" in self.__text and "</" in self.__text:
-                xml_text = self.__text.splitlines()
+            if "<" in self.text and ">" in self.text and "</" in self.text:
+                xml_text = self.text.splitlines()
                 self.lines, self.line_numbers = Util.get_xml_from_lines(xml_text)
                 logger.debug("CONVERTED from xml")
             else:

diff --git a/credsweeper/file_handler/diff_content_provider.py b/credsweeper/file_handler/diff_content_provider.py
@@ -1,4 +1,5 @@
 import logging
+from functools import cached_property
 from typing import List, Tuple, Generator
 
 from credsweeper.common.constants import DiffRowType
@@ -32,26 +33,34 @@ def __init__(
             change_type: DiffRowType,  #
             diff: List[DiffDict]) -> None:
         super().__init__(file_path=file_path, info=f"{file_path}:{change_type.value}")
-        self.change_type = change_type
-        self.diff = diff
+        self.__change_type = change_type
+        self.__diff = diff
 
-    @property
+    @cached_property
     def data(self) -> bytes:
         """data getter for DiffContentProvider"""
         raise NotImplementedError(__name__)
 
-    @data.setter
-    def data(self, data: bytes) -> None:
-        """data setter for DiffContentProvider"""
-        raise NotImplementedError(__name__)
+    @cached_property
+    def diff(self) -> List[DiffDict]:
+        """diff getter for DiffContentProvider"""
+        return self.__diff
+
+    def free(self) -> None:
+        """free data after scan to reduce memory usage"""
+        self.__diff = None
+        if hasattr(self, "diff"):
+            delattr(self, "diff")
 
-    def parse_lines_data(self, lines_data: List[DiffRowData]) -> Tuple[List[int], List[str]]:
+    @staticmethod
+    def parse_lines_data(change_type: DiffRowType, lines_data: List[DiffRowData]) -> Tuple[List[int], List[str]]:
         """Parse diff lines data.
 
         Return list of line numbers with change type "self.change_type" and list of all lines in file
             in original order(replaced all lines not mentioned in diff file with blank line)
 
         Args:
+            change_type: diff row type (added or deleted) whose rows are selected for scan
             lines_data: data of all rows mentioned in diff file
 
         Return:
@@ -62,7 +71,7 @@ def parse_lines_data(self, lines_data: List[DiffRowData]) -> Tuple[List[int], Li
         change_numbs = []
         all_lines = []
         for line_data in lines_data:
-            if line_data.line_type == self.change_type:
+            if line_data.line_type == change_type:
                 change_numbs.append(line_data.line_numb)
                 all_lines.append(line_data.line)
         return change_numbs, all_lines
@@ -77,6 +86,6 @@ def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None,
             list of analysis targets of every row of file diff corresponding to change type "self.change_type"
 
         """
-        lines_data = Util.preprocess_file_diff(self.diff)
-        change_numbs, all_lines = self.parse_lines_data(lines_data)
+        lines_data = Util.preprocess_file_diff(self.__diff)
+        change_numbs, all_lines = self.parse_lines_data(self.__change_type, lines_data)
         return self.lines_to_targets(min_len, all_lines, change_numbs)
diff --git a/credsweeper/file_handler/string_content_provider.py b/credsweeper/file_handler/string_content_provider.py
@@ -1,3 +1,4 @@
+from functools import cached_property
 from typing import List, Optional, Generator
 
 from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -22,20 +23,38 @@ def __init__(
 
         """
         super().__init__(file_path=file_path, file_type=file_type, info=info)
-        self.lines = lines
+        self.__lines = lines
         # fill line numbers only when amounts are equal
-        self.line_numbers = line_numbers if line_numbers and len(self.lines) == len(line_numbers) \
-            else (list(range(1, 1 + len(self.lines))) if self.lines else [])
+        if line_numbers is None or len(lines) != len(line_numbers):
+            self.__line_numbers = None
+        else:
+            self.__line_numbers = line_numbers
 
-    @property
+    @cached_property
     def data(self) -> bytes:
         """data getter for StringContentProvider"""
         raise NotImplementedError(__name__)
 
-    @data.setter
-    def data(self, data: bytes) -> None:
-        """data setter for StringContentProvider"""
-        raise NotImplementedError(__name__)
+    def free(self) -> None:
+        """free data after scan to reduce memory usage"""
+        self.__lines = []
+        if hasattr(self, "lines"):
+            delattr(self, "lines")
+        self.__line_numbers = []
+        if hasattr(self, "line_numbers"):
+            delattr(self, "line_numbers")
+
+    @cached_property
+    def lines(self) -> List[str]:
+        """lines RO getter for StringContentProvider"""
+        return self.__lines
+
+    @cached_property
+    def line_numbers(self) -> List[int]:
+        """line_numbers RO getter for StringContentProvider"""
+        if self.__line_numbers is None or len(self.__lines) != len(self.__line_numbers):
+            self.__line_numbers = list(range(1, 1 + len(self.__lines))) if self.__lines else []
+        return self.__line_numbers
 
     def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
         """Return lines to scan.