
Prevent HTML extra LF adding
babenek committed Feb 12, 2025
1 parent 2a91be1 commit ab1b581
Showing 12 changed files with 916 additions and 130 deletions.
3 changes: 1 addition & 2 deletions credsweeper/deep_scanner/mxfile_scanner.py
@@ -5,7 +5,6 @@
 from bs4 import BeautifulSoup
 from lxml import etree

-from credsweeper.common.constants import UTF_8
 from credsweeper.credentials import Candidate
 from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
 from credsweeper.file_handler.data_content_provider import DataContentProvider
@@ -26,7 +25,7 @@ def data_scan(
         try:
             lines = []
             line_numbers = []
-            tree = etree.fromstring(data_provider.data.decode(UTF_8))
+            tree = etree.fromstring(data_provider.text)
             for element in tree.iter():
                 if "mxCell" == getattr(element, "tag"):
                     line_number = element.sourceline
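Switching from data_provider.data.decode(UTF_8) to data_provider.text delegates encoding detection to the content provider, so the mxfile scanner no longer raises on non-UTF-8 input. A minimal standalone sketch of the lxml pattern used here (the mxfile fragment is invented for illustration):

from lxml import etree

# hypothetical drawio-style fragment; real mxfile documents carry more attributes
sample = ('<mxfile>\n'
          '  <diagram>\n'
          '    <mxGraphModel><root>\n'
          '      <mxCell value="password=P@ssw0rd"/>\n'
          '    </root></mxGraphModel>\n'
          '  </diagram>\n'
          '</mxfile>')

tree = etree.fromstring(sample)  # accepts str when no encoding declaration is present
for element in tree.iter():
    if "mxCell" == getattr(element, "tag"):
        # lxml records the source line where each element starts
        print(element.sourceline, element.get("value"))  # prints: 4 password=P@ssw0rd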
34 changes: 15 additions & 19 deletions credsweeper/file_handler/data_content_provider.py
@@ -8,7 +8,7 @@
 import yaml
 from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning

-from credsweeper.common.constants import DEFAULT_ENCODING, ASCII, MIN_DATA_LEN
+from credsweeper.common.constants import MIN_DATA_LEN
 from credsweeper.file_handler.analysis_target import AnalysisTarget
 from credsweeper.file_handler.content_provider import ContentProvider
 from credsweeper.utils import Util
@@ -68,10 +68,7 @@ def free(self) -> None:
     def text(self) -> str:
         """Getter to produce a text from DEFAULT_ENCODING. Empty str for unrecognized data"""
         if self.__text is None:
-            try:
-                self.__text = self.__data.decode(encoding=DEFAULT_ENCODING, errors="strict")
-            except Exception:
-                self.__text = ''
+            self.__text = Util.decode_text(self.__data) or ''
         return self.__text

     def __is_structure(self) -> bool:
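The text property now funnels all decoding through Util.decode_text and caches the result, so every represent_as_* method reuses a single decode pass instead of decoding self.data itself. A minimal sketch of the caching pattern with a stand-in decoder (not the project's implementation):

from typing import Optional

def decode_text(data: bytes) -> Optional[str]:
    # stand-in for Util.decode_text: str on success, None for binary data
    try:
        return data.decode("utf-8", errors="strict")
    except UnicodeError:
        return None

class Provider:
    def __init__(self, data: bytes) -> None:
        self.__data = data
        self.__text: Optional[str] = None

    @property
    def text(self) -> str:
        # decode once and cache; unrecognized (binary) data becomes ''
        if self.__text is None:
            self.__text = decode_text(self.__data) or ''
        return self.__text

print(repr(Provider(b"secret = 'x'").text))  # "secret = 'x'"
print(repr(Provider(b'\xff\xfe\x00').text))  # '' for undecodable bytes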
@@ -86,7 +83,7 @@ def represent_as_structure(self) -> bool:
         if MIN_DATA_LEN > len(self.text):
             return False
         # JSON & NDJSON
-        if "{" in self.text and "}" in self.text and "\"" in self.text and ":" in self.text:
+        if '{' in self.text and '}' in self.text and '"' in self.text and ':' in self.text:
             try:
                 self.structure = json.loads(self.text)
                 logger.debug("CONVERTED from json")
@@ -113,7 +110,8 @@ def represent_as_structure(self) -> bool:
         # # # Python
         try:
             # search only in sources with strings
-            if (";" in self.text or 2 < self.text.count("\n")) and ("\"" in self.text or "'" in self.text):
+            if (';' in self.text or 2 < self.text.count('\n') or 2 < self.text.count('\r')) \
+                    and ('"' in self.text or "'" in self.text):
                 self.structure = Util.parse_python(self.text)
                 logger.debug("CONVERTED from Python")
             else:
@@ -125,7 +123,7 @@ def represent_as_structure(self) -> bool:
             return True
         # # # YAML - almost always recognized
         try:
-            if ":" in self.text and 2 < self.text.count("\n"):
+            if ':' in self.text and (2 < self.text.count('\n') or 2 < self.text.count('\r')):
                 self.structure = yaml.load(self.text, Loader=yaml.FullLoader)
                 logger.debug("CONVERTED from yaml")
             else:
@@ -148,7 +146,7 @@ def represent_as_xml(self) -> bool:
         if MIN_XML_LEN > len(self.text):
             return False
         try:
-            if "<" in self.text and ">" in self.text and "</" in self.text:
+            if '<' in self.text and '>' in self.text and "</" in self.text:
                 xml_text = self.text.splitlines()
                 self.lines, self.line_numbers = Util.get_xml_from_lines(xml_text)
                 logger.debug("CONVERTED from xml")
@@ -192,10 +190,10 @@ def simple_html_representation(html: BeautifulSoup) -> Tuple[List[int], List[str], int]:
         lines: List[str] = []
         lines_size = 0
         # use dedicated variable to deal with yapf and flake
-        new_line_tags = ["p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div"]
-        for p in html.find_all(new_line_tags):
-            p.append('\n')
-        for p in html.find_all(["th", "td"]):
+        tags_to_split = [
+            "p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div", "th", "td"
+        ]
+        for p in html.find_all(tags_to_split):
             p.append('\t')
         html_lines = html.get_text().splitlines()
         for line_number, doc_line in enumerate(html_lines):
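This hunk is the fix the commit title names: appending '\n' to block-level tags inflated the line count of text extracted from HTML, which shifted every subsequent line number, while a '\t' separator keeps cell and block contents apart without adding lines. A standalone sketch of the difference (the sample HTML is invented for illustration):

from bs4 import BeautifulSoup

sample = ("<table><tr><th>user</th><td>admin</td></tr>"
          "<tr><th>password</th><td>P@ssw0rd</td></tr></table>")

for separator in ('\n', '\t'):
    html = BeautifulSoup(sample, features="html.parser")
    for tag in html.find_all(["tr", "th", "td"]):
        tag.append(separator)
    print(len(html.get_text().splitlines()), "line(s)")
# '\n' separator: 6 line(s), extra LFs appear
# '\t' separator: 1 line(s), cells stay tab-separated on one line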
@@ -346,9 +344,8 @@ def represent_as_html(
         """
         try:
-            text = self.data.decode(encoding=DEFAULT_ENCODING)
-            if "</" in text and ">" in text:
-                if html := BeautifulSoup(text, features="html.parser"):
+            if "</" in self.text and ">" in self.text:
+                if html := BeautifulSoup(self.text, features="html.parser"):
                     line_numbers, lines, lines_size = self.simple_html_representation(html)
                     self.line_numbers.extend(line_numbers)
                     self.lines.extend(lines)
@@ -367,7 +364,7 @@ def represent_as_html(
         return False

     def represent_as_encoded(self) -> bool:
-        """Encodes data from base64. Stores result in decoded
+        """Decodes data from base64. Stores result in decoded

         Return:
             True if the data correctly parsed and verified
@@ -379,8 +376,7 @@ def represent_as_encoded(self) -> bool:
             return False
         try:
             self.decoded = Util.decode_base64(  #
-                self.data.decode(encoding=ASCII, errors="strict").  #
-                translate(str.maketrans("", "", string.whitespace)),  #
+                self.text.translate(str.maketrans('', '', string.whitespace)),  #
                 padding_safe=True,  #
                 urlsafe_detect=True)  #
         except Exception as exc:
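With self.text feeding Util.decode_base64, the strict ASCII-only decode disappears, and stripping every whitespace character first lets wrapped or indented base64 payloads decode cleanly. A rough standalone equivalent using the stdlib base64 module (not the project's Util.decode_base64 with its padding_safe/urlsafe_detect options):

import base64
import string

wrapped = "  c2VjcmV0\n  X3Rva2Vu\r\n"  # hypothetical payload split across lines
compact = wrapped.translate(str.maketrans('', '', string.whitespace))
print(base64.b64decode(compact + "=" * (-len(compact) % 4)))  # b'secret_token'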
35 changes: 28 additions & 7 deletions credsweeper/utils/util.py
@@ -203,7 +203,7 @@ def read_file(path: Union[str, Path], encodings: Optional[List[str]] = None) -> List[str]:
         return Util.decode_bytes(data, encodings)

     @staticmethod
-    def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
+    def decode_text(content: bytes, encodings: Optional[List[str]] = None) -> Optional[str]:
         """Decode content using different encodings.

         Try to decode bytes according to the list of encodings "encodings"
@@ -214,12 +214,11 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
             encodings: supported encodings

         Return:
-            list of file rows in a suitable encoding from "encodings",
-            if none of the encodings match, an empty list will be returned
-            Also empty list will be returned after last encoding and 0 symbol is present in lines not at end
+            Decoded text in str for any suitable encoding
+            or None when binary data detected

         """
-        lines = []
+        text = None
         binary_suggest = False
         if encodings is None:
             encodings = AVAILABLE_ENCODINGS
@@ -232,15 +231,37 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
                     break
                 text = content.decode(encoding, errors="strict")
                 if content != text.encode(encoding, errors="strict"):
+                    # the check helps to detect a real encoding
                     raise UnicodeError
-                # windows & macos styles workaround
-                lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
                 break
             except UnicodeError:
                 binary_suggest = True
                 logger.info(f"UnicodeError: Can't decode content as {encoding}.")
             except Exception as exc:
                 logger.error(f"Unexpected Error: Can't read content as {encoding}. Error message: {exc}")
-        return lines
+        return text
+
+    @staticmethod
+    def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
+        """Decode content using different encodings.
+
+        Try to decode bytes according to the list of encodings "encodings"
+        till decoding occurs without any exceptions. UTF-16 requires BOM
+
+        Args:
+            content: raw data that might be text
+            encodings: supported encodings
+
+        Return:
+            list of file rows in a suitable encoding from "encodings",
+            if none of the encodings match, an empty list will be returned
+            Also empty list will be returned after last encoding and 0 symbol is present in lines not at end
+
+        """
+        if text := Util.decode_text(content, encodings):
+            lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
+        else:
+            lines = []
+        return lines

     @staticmethod
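decode_bytes is now a thin wrapper over the new decode_text: encoding detection lives in one place and reports binary data as None, while line-ending normalization and splitting stay in the wrapper. A hedged usage sketch of the expected behavior, assuming UTF-8 is among the default encodings:

from credsweeper.utils import Util

text = Util.decode_text(b"line1\r\nline2")   # 'line1\r\nline2' (endings preserved)
rows = Util.decode_bytes(b"line1\r\nline2")  # ['line1', 'line2'] (normalized, split)
none = Util.decode_text(b'\x81\x00\x82')     # expected None: no listed encoding fits
empty = Util.decode_bytes(b'\x81\x00\x82')   # expected []: None maps to an empty list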
11 changes: 6 additions & 5 deletions tests/__init__.py
@@ -1,13 +1,13 @@
 from pathlib import Path

 # total number of files in test samples
-SAMPLES_FILES_COUNT = 146
+SAMPLES_FILES_COUNT = 147

 # the lowest value of ML threshold is used to display possible lowest values
 NEGLIGIBLE_ML_THRESHOLD = 0.0001

 # credentials count after scan with negligible ML threshold
-SAMPLES_CRED_COUNT = 465
+SAMPLES_CRED_COUNT = 470
 SAMPLES_CRED_LINE_COUNT = SAMPLES_CRED_COUNT + 19

 # Number of filtered credentials with ML
@@ -17,16 +17,17 @@
 SAMPLES_POST_CRED_COUNT = SAMPLES_CRED_COUNT - ML_FILTERED

 # with option --doc
-SAMPLES_IN_DOC = 650
+SAMPLES_IN_DOC = 656

 # archived credentials that are not found without --depth
-SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 84
+SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 87
 SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 8
 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

 # well known string with all latin letters
 AZ_DATA = b"The quick brown fox jumps over the lazy dog"
-AZ_STRING = AZ_DATA.decode(encoding="ascii")
+# Assume, there should be only ASCII symbols
+AZ_STRING = AZ_DATA.decode(encoding="ascii", errors="strict")

 # tests directory - use ONLY this file relevance for "release_test" workflow
 TESTS_PATH = Path(__file__).resolve().parent