Skip to content

Commit

Permalink
Thrifty memory consumption
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Feb 5, 2025
1 parent 03b7640 commit 3b855cd
Show file tree
Hide file tree
Showing 17 changed files with 170 additions and 125 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ jobs:
- name: Run CredSweeper tool
run: |
credsweeper --banner --log info --jobs $(nproc) --sort --subtext --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
credsweeper --banner --log info --jobs $(nproc) --thrifty --sort --subtext --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
- name: Run Benchmark
run: |
Expand Down
5 changes: 5 additions & 0 deletions credsweeper/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ def get_arguments() -> Namespace:
dest="jobs",
default=1,
metavar="POSITIVE_INT")
parser.add_argument("--thrifty",
help="clear objects after scan to reduce memory consumption",
action="store_const",
const=True)
parser.add_argument("--skip_ignored",
help="parse .gitignore files and skip credentials from ignored objects",
dest="skip_ignored",
Expand Down Expand Up @@ -295,6 +299,7 @@ def scan(args: Namespace, content_provider: AbstractProvider) -> int:
size_limit=args.size_limit,
exclude_lines=denylist,
exclude_values=denylist,
thrifty=args.thrifty,
log_level=args.log)
return credsweeper.run(content_provider=content_provider)
except Exception as exc:
Expand Down
9 changes: 7 additions & 2 deletions credsweeper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(self,
size_limit: Optional[str] = None,
exclude_lines: Optional[List[str]] = None,
exclude_values: Optional[List[str]] = None,
thrifty: bool = False,
log_level: Optional[str] = None) -> None:
"""Initialize Advanced credential scanner.
Expand Down Expand Up @@ -86,6 +87,7 @@ def __init__(self,
size_limit: optional string integer or human-readable format to skip oversize files
exclude_lines: lines to omit in scan. Will be added to the lines already in config
exclude_values: values to omit in scan. Will be added to the values already in config
thrifty: free provider resources after scan to reduce memory consumption
log_level: str - level for pool initializer according to logging levels (UPPERCASE)
"""
Expand Down Expand Up @@ -118,6 +120,7 @@ def __init__(self,
self.ml_model = ml_model
self.ml_providers = ml_providers
self.ml_validator = None
self.__thrifty = thrifty
self.__log_level = log_level

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
Expand Down Expand Up @@ -300,8 +303,10 @@ def files_scan(
content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> List[Candidate]:
"""Auxiliary method for scan one sequence"""
all_cred: List[Candidate] = []
for i in content_providers:
candidates = self.file_scan(i)
for provider in content_providers:
candidates = self.file_scan(provider)
if self.__thrifty:
provider.free()
all_cred.extend(candidates)
logger.info(f"Completed: processed {len(content_providers)} providers with {len(all_cred)} candidates")
return all_cred
Expand Down
28 changes: 14 additions & 14 deletions credsweeper/file_handler/byte_content_provider.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import cached_property
from typing import List, Optional, Generator

from credsweeper.file_handler.analysis_target import AnalysisTarget
Expand All @@ -20,31 +21,30 @@ def __init__(
"""
super().__init__(file_path=file_path, file_type=file_type, info=info)
self.data = content
self.__data = content
self.__lines: Optional[List[str]] = None

@property
@cached_property
def data(self) -> Optional[bytes]:
"""data getter for ByteContentProvider"""
"""data RO getter for ByteContentProvider"""
return self.__data

@data.setter
def data(self, data: Optional[bytes]) -> None:
"""data setter for ByteContentProvider"""
self.__data = data
def free(self) -> None:
    """Release the raw bytes and the decoded lines after scan to reduce memory usage.

    The values memoized by the ``data`` and ``lines`` cached properties are stored
    in the instance ``__dict__``. They are dropped with ``dict.pop`` instead of a
    ``hasattr``/``delattr`` pair: ``hasattr`` on a cached property that was not
    evaluated yet runs its getter, so ``lines`` would try to decode the data that
    was cleared a moment ago only to have the result deleted right away.
    """
    self.__data = None
    self.__lines = None
    # pop with a default is a no-op when the property was never evaluated
    self.__dict__.pop("data", None)
    self.__dict__.pop("lines", None)

@property
@cached_property
def lines(self) -> List[str]:
"""lines getter for ByteContentProvider"""
"""lines RO getter for ByteContentProvider"""
if self.__lines is None:
self.__lines = Util.decode_bytes(self.__data)
return self.__lines if self.__lines is not None else []

@lines.setter
def lines(self, lines: List[str]) -> None:
"""lines setter for ByteContentProvider"""
self.__lines = lines

def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
"""Return lines to scan.
Expand Down
7 changes: 3 additions & 4 deletions credsweeper/file_handler/content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,15 @@ def info(self) -> str:
"""info getter"""
return self.__descriptor.info

@property
@cached_property
@abstractmethod
def data(self) -> Optional[bytes]:
"""abstract data getter"""
raise NotImplementedError(__name__)

@data.setter
@abstractmethod
def data(self, data: Optional[bytes]) -> None:
"""abstract data setter"""
def free(self) -> None:
"""free data after scan to reduce memory usage"""
raise NotImplementedError(__name__)

def lines_to_targets(
Expand Down
67 changes: 38 additions & 29 deletions credsweeper/file_handler/data_content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import string
import warnings
from functools import cached_property
from typing import List, Optional, Any, Generator, Callable, Tuple

import yaml
Expand Down Expand Up @@ -37,30 +38,41 @@ def __init__(
"""
super().__init__(file_path=file_path, file_type=file_type, info=info)
self.__inited_text: str = ""
self.data = data
self.__data = data
self.__text: Optional[str] = None
self.structure: Optional[List[Any]] = None
self.decoded: Optional[bytes] = None
self.lines: List[str] = []
self.line_numbers: List[int] = []
self.__html_lines_size = len(data) # the size is used to limit extra memory consumption during html combination

@property
@cached_property
def data(self) -> Optional[bytes]:
"""data getter for DataContentProvider"""
"""data RO getter for DataContentProvider and the property is used in deep scan"""
return self.__data

@data.setter
def data(self, data: Optional[bytes]) -> None:
"""data setter for DataContentProvider"""
self.__data = data

@property
def __text(self) -> str:
"""Getter which throws exception in case of bad decoding"""
if not self.__inited_text:
self.__inited_text = self.data.decode(encoding=DEFAULT_ENCODING, errors="strict")
return self.__inited_text
def free(self) -> None:
    """Release the raw data and everything derived from it after scan to reduce memory usage.

    Clears the private data and text, the recognized structure, the decoded bytes,
    the lines and their numbers. The values memoized by the ``data`` and ``text``
    cached properties are removed from the instance ``__dict__`` with ``dict.pop``
    instead of a ``hasattr``/``delattr`` pair: ``hasattr`` on a cached property that
    was not evaluated yet runs its getter, and the ``text`` getter would then try to
    decode the cleared data and write an empty string back to the private text.
    """
    self.__data = None
    self.__text = None
    self.structure = None
    self.decoded = None
    self.lines = []
    self.line_numbers = []
    # pop with a default is a no-op when the property was never evaluated
    self.__dict__.pop("data", None)
    self.__dict__.pop("text", None)

@cached_property
def text(self) -> str:
"""Getter to produce a text from DEFAULT_ENCODING. Empty str for unrecognized data"""
if self.__text is None:
try:
self.__text = self.__data.decode(encoding=DEFAULT_ENCODING, errors="strict")
except Exception:
self.__text = ''
return self.__text

def __is_structure(self) -> bool:
"""Check whether a structure was recognized"""
Expand All @@ -71,15 +83,12 @@ def represent_as_structure(self) -> bool:
"""Tries to convert data with many parsers. Stores result to internal structure
Return True if some structure found
"""
try:
if MIN_DATA_LEN > len(self.__text):
return False
except Exception:
if MIN_DATA_LEN > len(self.text):
return False
# JSON & NDJSON
if "{" in self.__text and "}" in self.__text and "\"" in self.__text and ":" in self.__text:
if "{" in self.text and "}" in self.text and "\"" in self.text and ":" in self.text:
try:
self.structure = json.loads(self.__text)
self.structure = json.loads(self.text)
logger.debug("CONVERTED from json")
except Exception as exc:
logger.debug("Cannot parse as json:%s %s", exc, self.data)
Expand All @@ -88,7 +97,7 @@ def represent_as_structure(self) -> bool:
return True
try:
self.structure = []
for line in self.__text.splitlines():
for line in self.text.splitlines():
# each line must be in json format, otherwise an exception is raised
self.structure.append(json.loads(line))
logger.debug("CONVERTED from ndjson")
Expand All @@ -104,8 +113,8 @@ def represent_as_structure(self) -> bool:
# # # Python
try:
# search only in sources with strings
if (";" in self.__text or 2 < self.__text.count("\n")) and ("\"" in self.__text or "'" in self.__text):
self.structure = Util.parse_python(self.__text)
if (";" in self.text or 2 < self.text.count("\n")) and ("\"" in self.text or "'" in self.text):
self.structure = Util.parse_python(self.text)
logger.debug("CONVERTED from Python")
else:
logger.debug("Data do not contain line feed - weak PYTHON")
Expand All @@ -116,8 +125,8 @@ def represent_as_structure(self) -> bool:
return True
# # # YAML - almost always recognized
try:
if ":" in self.__text and 2 < self.__text.count("\n"):
self.structure = yaml.load(self.__text, Loader=yaml.FullLoader)
if ":" in self.text and 2 < self.text.count("\n"):
self.structure = yaml.load(self.text, Loader=yaml.FullLoader)
logger.debug("CONVERTED from yaml")
else:
logger.debug("Data do not contain colon mark - weak YAML")
Expand All @@ -136,11 +145,11 @@ def represent_as_xml(self) -> bool:
True if reading was successful
"""
if MIN_XML_LEN > len(self.data):
if MIN_XML_LEN > len(self.text):
return False
try:
if "<" in self.__text and ">" in self.__text and "</" in self.__text:
xml_text = self.__text.splitlines()
if "<" in self.text and ">" in self.text and "</" in self.text:
xml_text = self.text.splitlines()
self.lines, self.line_numbers = Util.get_xml_from_lines(xml_text)
logger.debug("CONVERTED from xml")
else:
Expand Down
31 changes: 20 additions & 11 deletions credsweeper/file_handler/diff_content_provider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from functools import cached_property
from typing import List, Tuple, Generator

from credsweeper.common.constants import DiffRowType
Expand Down Expand Up @@ -32,26 +33,34 @@ def __init__(
change_type: DiffRowType, #
diff: List[DiffDict]) -> None:
super().__init__(file_path=file_path, info=f"{file_path}:{change_type.value}")
self.change_type = change_type
self.diff = diff
self.__change_type = change_type
self.__diff = diff

@property
@cached_property
def data(self) -> bytes:
"""data getter for DiffContentProvider"""
raise NotImplementedError(__name__)

@data.setter
def data(self, data: bytes) -> None:
"""data setter for DiffContentProvider"""
raise NotImplementedError(__name__)
@cached_property
def diff(self) -> List[DiffDict]:
"""diff getter for DiffContentProvider"""
return self.__diff

def free(self) -> None:
    """Release the stored diff after scan to reduce memory usage.

    The value memoized by the ``diff`` cached property is stored in the instance
    ``__dict__``. It is removed with ``dict.pop`` instead of a ``hasattr``/``delattr``
    pair, because ``hasattr`` on a cached property that was not evaluated yet runs
    the getter and memoizes its result just to have it deleted.
    """
    self.__diff = None
    # pop with a default is a no-op when the property was never evaluated
    self.__dict__.pop("diff", None)

def parse_lines_data(self, lines_data: List[DiffRowData]) -> Tuple[List[int], List[str]]:
@staticmethod
def parse_lines_data(change_type: DiffRowType, lines_data: List[DiffRowData]) -> Tuple[List[int], List[str]]:
"""Parse diff lines data.
Return list of line numbers with change type "change_type" and list of all lines in file
in original order (all lines not mentioned in diff file are replaced with blank lines)
Args:
change_type: set added or deleted file data to scan
lines_data: data of all rows mentioned in diff file
Return:
Expand All @@ -62,7 +71,7 @@ def parse_lines_data(self, lines_data: List[DiffRowData]) -> Tuple[List[int], Li
change_numbs = []
all_lines = []
for line_data in lines_data:
if line_data.line_type == self.change_type:
if line_data.line_type == change_type:
change_numbs.append(line_data.line_numb)
all_lines.append(line_data.line)
return change_numbs, all_lines
Expand All @@ -77,6 +86,6 @@ def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None,
list of analysis targets of every row of file diff corresponding to change type "self.change_type"
"""
lines_data = Util.preprocess_file_diff(self.diff)
change_numbs, all_lines = self.parse_lines_data(lines_data)
lines_data = Util.preprocess_file_diff(self.__diff)
change_numbs, all_lines = self.parse_lines_data(self.__change_type, lines_data)
return self.lines_to_targets(min_len, all_lines, change_numbs)
35 changes: 27 additions & 8 deletions credsweeper/file_handler/string_content_provider.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import cached_property
from typing import List, Optional, Generator

from credsweeper.file_handler.analysis_target import AnalysisTarget
Expand All @@ -22,20 +23,38 @@ def __init__(
"""
super().__init__(file_path=file_path, file_type=file_type, info=info)
self.lines = lines
self.__lines = lines
# fill line numbers only when amounts are equal
self.line_numbers = line_numbers if line_numbers and len(self.lines) == len(line_numbers) \
else (list(range(1, 1 + len(self.lines))) if self.lines else [])
if line_numbers is None or len(lines) != len(line_numbers):
self.__line_numbers = None
else:
self.__line_numbers = line_numbers

@property
@cached_property
def data(self) -> bytes:
"""data getter for StringContentProvider"""
raise NotImplementedError(__name__)

@data.setter
def data(self, data: bytes) -> None:
"""data setter for StringContentProvider"""
raise NotImplementedError(__name__)
def free(self) -> None:
    """Release the stored lines and line numbers after scan to reduce memory usage.

    The values memoized by the ``lines`` and ``line_numbers`` cached properties are
    stored in the instance ``__dict__``. They are removed with ``dict.pop`` instead
    of a ``hasattr``/``delattr`` pair, because ``hasattr`` on a cached property that
    was not evaluated yet runs the getter and memoizes its result just to have it
    deleted.
    """
    self.__lines = []
    self.__line_numbers = []
    # pop with a default is a no-op when the property was never evaluated
    self.__dict__.pop("lines", None)
    self.__dict__.pop("line_numbers", None)

@cached_property
def lines(self) -> List[str]:
"""lines RO getter for StringContentProvider"""
return self.__lines

@cached_property
def line_numbers(self) -> List[int]:
"""line_numbers RO getter for StringContentProvider"""
if self.__line_numbers is None or len(self.__lines) != len(self.__line_numbers):
self.__line_numbers = list(range(1, 1 + len(self.__lines))) if self.__lines else []
return self.__line_numbers

def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
"""Return lines to scan.
Expand Down
Loading

0 comments on commit 3b855cd

Please sign in to comment.