
Prevent HTML extra LF adding
babenek committed Feb 12, 2025
1 parent 2a91be1 commit ab1b581
Showing 12 changed files with 916 additions and 130 deletions.
3 changes: 1 addition & 2 deletions credsweeper/deep_scanner/mxfile_scanner.py
@@ -5,7 +5,6 @@
 from bs4 import BeautifulSoup
 from lxml import etree

-from credsweeper.common.constants import UTF_8
 from credsweeper.credentials import Candidate
 from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
 from credsweeper.file_handler.data_content_provider import DataContentProvider
@@ -26,7 +25,7 @@ def data_scan(
         try:
             lines = []
             line_numbers = []
-            tree = etree.fromstring(data_provider.data.decode(UTF_8))
+            tree = etree.fromstring(data_provider.text)
             for element in tree.iter():
                 if "mxCell" == getattr(element, "tag"):
                     line_number = element.sourceline
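Switching from data_provider.data.decode(UTF_8) to data_provider.text delegates encoding detection to the content provider, so the mxfile scanner no longer raises on non-UTF-8 input. A minimal standalone sketch of the lxml pattern used here (the mxfile fragment is invented for illustration):

from lxml import etree

# hypothetical drawio-style fragment; real mxfile documents carry more attributes
sample = ('<mxfile>\n'
          '  <diagram>\n'
          '    <mxGraphModel><root>\n'
          '      <mxCell value="password=P@ssw0rd"/>\n'
          '    </root></mxGraphModel>\n'
          '  </diagram>\n'
          '</mxfile>')

tree = etree.fromstring(sample)  # accepts str when no encoding declaration is present
for element in tree.iter():
    if "mxCell" == getattr(element, "tag"):
        # lxml records the source line where each element starts
        print(element.sourceline, element.get("value"))  # prints: 4 password=P@ssw0rd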
34 changes: 15 additions & 19 deletions credsweeper/file_handler/data_content_provider.py
@@ -8,7 +8,7 @@
 import yaml
 from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning

-from credsweeper.common.constants import DEFAULT_ENCODING, ASCII, MIN_DATA_LEN
+from credsweeper.common.constants import MIN_DATA_LEN
 from credsweeper.file_handler.analysis_target import AnalysisTarget
 from credsweeper.file_handler.content_provider import ContentProvider
 from credsweeper.utils import Util
@@ -68,10 +68,7 @@ def free(self) -> None:
     def text(self) -> str:
         """Getter to produce a text from DEFAULT_ENCODING. Empty str for unrecognized data"""
         if self.__text is None:
-            try:
-                self.__text = self.__data.decode(encoding=DEFAULT_ENCODING, errors="strict")
-            except Exception:
-                self.__text = ''
+            self.__text = Util.decode_text(self.__data) or ''
         return self.__text

     def __is_structure(self) -> bool:
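The text property now funnels all decoding through Util.decode_text and caches the result, so every represent_as_* method reuses a single decode pass instead of decoding self.data itself. A minimal sketch of the caching pattern with a stand-in decoder (not the project's implementation):

from typing import Optional

def decode_text(data: bytes) -> Optional[str]:
    # stand-in for Util.decode_text: str on success, None for binary data
    try:
        return data.decode("utf-8", errors="strict")
    except UnicodeError:
        return None

class Provider:
    def __init__(self, data: bytes) -> None:
        self.__data = data
        self.__text: Optional[str] = None

    @property
    def text(self) -> str:
        # decode once and cache; unrecognized (binary) data becomes ''
        if self.__text is None:
            self.__text = decode_text(self.__data) or ''
        return self.__text

print(repr(Provider(b"secret = 'x'").text))  # "secret = 'x'"
print(repr(Provider(b'\xff\xfe\x00').text))  # '' for undecodable bytes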
@@ -86,7 +83,7 @@ def represent_as_structure(self) -> bool:
         if MIN_DATA_LEN > len(self.text):
             return False
         # JSON & NDJSON
-        if "{" in self.text and "}" in self.text and "\"" in self.text and ":" in self.text:
+        if '{' in self.text and '}' in self.text and '"' in self.text and ':' in self.text:
             try:
                 self.structure = json.loads(self.text)
                 logger.debug("CONVERTED from json")
@@ -113,7 +110,8 @@ def represent_as_structure(self) -> bool:
         # # # Python
         try:
             # search only in sources with strings
-            if (";" in self.text or 2 < self.text.count("\n")) and ("\"" in self.text or "'" in self.text):
+            if (';' in self.text or 2 < self.text.count('\n') or 2 < self.text.count('\r')) \
+                    and ('"' in self.text or "'" in self.text):
                 self.structure = Util.parse_python(self.text)
                 logger.debug("CONVERTED from Python")
             else:
@@ -125,7 +123,7 @@ def represent_as_structure(self) -> bool:
             return True
         # # # YAML - almost always recognized
         try:
-            if ":" in self.text and 2 < self.text.count("\n"):
+            if ':' in self.text and (2 < self.text.count('\n') or 2 < self.text.count('\r')):
                 self.structure = yaml.load(self.text, Loader=yaml.FullLoader)
                 logger.debug("CONVERTED from yaml")
             else:
@@ -148,7 +146,7 @@ def represent_as_xml(self) -> bool:
         if MIN_XML_LEN > len(self.text):
             return False
         try:
-            if "<" in self.text and ">" in self.text and "</" in self.text:
+            if '<' in self.text and '>' in self.text and "</" in self.text:
                 xml_text = self.text.splitlines()
                 self.lines, self.line_numbers = Util.get_xml_from_lines(xml_text)
                 logger.debug("CONVERTED from xml")
@@ -192,10 +190,10 @@ def simple_html_representation(html: BeautifulSoup) -> Tuple[List[int], List[str], int]:
         lines: List[str] = []
         lines_size = 0
         # use dedicated variable to deal with yapf and flake
-        new_line_tags = ["p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div"]
-        for p in html.find_all(new_line_tags):
-            p.append('\n')
-        for p in html.find_all(["th", "td"]):
+        tags_to_split = [
+            "p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div", "th", "td"
+        ]
+        for p in html.find_all(tags_to_split):
             p.append('\t')
         html_lines = html.get_text().splitlines()
         for line_number, doc_line in enumerate(html_lines):
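This hunk is the fix the commit title names: appending '\n' to block-level tags inflated the line count of text extracted from HTML, which shifted every subsequent line number, while a '\t' separator keeps cell and block contents apart without adding lines. A standalone sketch of the difference (the sample HTML is invented for illustration):

from bs4 import BeautifulSoup

sample = ("<table><tr><th>user</th><td>admin</td></tr>"
          "<tr><th>password</th><td>P@ssw0rd</td></tr></table>")

for separator in ('\n', '\t'):
    html = BeautifulSoup(sample, features="html.parser")
    for tag in html.find_all(["tr", "th", "td"]):
        tag.append(separator)
    print(len(html.get_text().splitlines()), "line(s)")
# '\n' separator: 6 line(s), extra LFs appear
# '\t' separator: 1 line(s), cells stay tab-separated on one line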
@@ -346,9 +344,8 @@ def represent_as_html(
         """
         try:
-            text = self.data.decode(encoding=DEFAULT_ENCODING)
-            if "</" in text and ">" in text:
-                if html := BeautifulSoup(text, features="html.parser"):
+            if "</" in self.text and ">" in self.text:
+                if html := BeautifulSoup(self.text, features="html.parser"):
                     line_numbers, lines, lines_size = self.simple_html_representation(html)
                     self.line_numbers.extend(line_numbers)
                     self.lines.extend(lines)
@@ -367,7 +364,7 @@ def represent_as_html(
         return False

     def represent_as_encoded(self) -> bool:
-        """Encodes data from base64. Stores result in decoded
+        """Decodes data from base64. Stores result in decoded

         Return:
             True if the data correctly parsed and verified
@@ -379,8 +376,7 @@ def represent_as_encoded(self) -> bool:
             return False
         try:
             self.decoded = Util.decode_base64(  #
-                self.data.decode(encoding=ASCII, errors="strict").  #
-                translate(str.maketrans("", "", string.whitespace)),  #
+                self.text.translate(str.maketrans('', '', string.whitespace)),  #
                 padding_safe=True,  #
                 urlsafe_detect=True)  #
         except Exception as exc:
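With self.text feeding Util.decode_base64, the strict ASCII-only decode disappears, and stripping every whitespace character first lets wrapped or indented base64 payloads decode cleanly. A rough standalone equivalent using the stdlib base64 module (not the project's Util.decode_base64 with its padding_safe/urlsafe_detect options):

import base64
import string

wrapped = "  c2VjcmV0\n  X3Rva2Vu\r\n"  # hypothetical payload split across lines
compact = wrapped.translate(str.maketrans('', '', string.whitespace))
print(base64.b64decode(compact + "=" * (-len(compact) % 4)))  # b'secret_token'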
35 changes: 28 additions & 7 deletions credsweeper/utils/util.py
@@ -203,7 +203,7 @@ def read_file(path: Union[str, Path], encodings: Optional[List[str]] = None) -> List[str]:
         return Util.decode_bytes(data, encodings)

     @staticmethod
-    def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
+    def decode_text(content: bytes, encodings: Optional[List[str]] = None) -> Optional[str]:
         """Decode content using different encodings.

         Try to decode bytes according to the list of encodings "encodings"
@@ -214,12 +214,11 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
             encodings: supported encodings

         Return:
-            list of file rows in a suitable encoding from "encodings",
-            if none of the encodings match, an empty list will be returned
-            Also empty list will be returned after last encoding and 0 symbol is present in lines not at end
+            Decoded text in str for any suitable encoding
+            or None when binary data detected

         """
-        lines = []
+        text = None
         binary_suggest = False
         if encodings is None:
             encodings = AVAILABLE_ENCODINGS
@@ -232,15 +231,37 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
                     break
                 text = content.decode(encoding, errors="strict")
                 if content != text.encode(encoding, errors="strict"):
+                    # the check helps to detect a real encoding
                     raise UnicodeError
-                # windows & macos styles workaround
-                lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
                 break
             except UnicodeError:
                 binary_suggest = True
                 logger.info(f"UnicodeError: Can't decode content as {encoding}.")
             except Exception as exc:
                 logger.error(f"Unexpected Error: Can't read content as {encoding}. Error message: {exc}")
-        return lines
+        return text
+
+    @staticmethod
+    def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
+        """Decode content using different encodings.
+
+        Try to decode bytes according to the list of encodings "encodings"
+        till decoding occurs without any exceptions. UTF-16 requires BOM
+
+        Args:
+            content: raw data that might be text
+            encodings: supported encodings
+
+        Return:
+            list of file rows in a suitable encoding from "encodings",
+            if none of the encodings match, an empty list will be returned
+            Also empty list will be returned after last encoding and 0 symbol is present in lines not at end
+
+        """
+        if text := Util.decode_text(content, encodings):
+            lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
+        else:
+            lines = []
+        return lines

     @staticmethod
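decode_bytes is now a thin wrapper over the new decode_text: encoding detection lives in one place and reports binary data as None, while line-ending normalization and splitting stay in the wrapper. A hedged usage sketch of the expected behavior, assuming UTF-8 is among the default encodings:

from credsweeper.utils import Util

text = Util.decode_text(b"line1\r\nline2")   # 'line1\r\nline2' (endings preserved)
rows = Util.decode_bytes(b"line1\r\nline2")  # ['line1', 'line2'] (normalized, split)
none = Util.decode_text(b'\x81\x00\x82')     # expected None: no listed encoding fits
empty = Util.decode_bytes(b'\x81\x00\x82')   # expected []: None maps to an empty list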
11 changes: 6 additions & 5 deletions tests/__init__.py
@@ -1,13 +1,13 @@
 from pathlib import Path

 # total number of files in test samples
-SAMPLES_FILES_COUNT = 146
+SAMPLES_FILES_COUNT = 147

 # the lowest value of ML threshold is used to display possible lowest values
 NEGLIGIBLE_ML_THRESHOLD = 0.0001

 # credentials count after scan with negligible ML threshold
-SAMPLES_CRED_COUNT = 465
+SAMPLES_CRED_COUNT = 470
 SAMPLES_CRED_LINE_COUNT = SAMPLES_CRED_COUNT + 19

 # Number of filtered credentials with ML
@@ -17,16 +17,17 @@
 SAMPLES_POST_CRED_COUNT = SAMPLES_CRED_COUNT - ML_FILTERED

 # with option --doc
-SAMPLES_IN_DOC = 650
+SAMPLES_IN_DOC = 656

 # archived credentials that are not found without --depth
-SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 84
+SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 87
 SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 8
 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

 # well known string with all latin letters
 AZ_DATA = b"The quick brown fox jumps over the lazy dog"
-AZ_STRING = AZ_DATA.decode(encoding="ascii")
+# Assume, there should be only ASCII symbols
+AZ_STRING = AZ_DATA.decode(encoding="ascii", errors="strict")

 # tests directory - use ONLY this file relevance for "release_test" workflow
 TESTS_PATH = Path(__file__).resolve().parent