Prevent extra line split for HTML scan #673

Merged: 1 commit, merged on Feb 13, 2025
3 changes: 1 addition & 2 deletions credsweeper/deep_scanner/mxfile_scanner.py
@@ -5,7 +5,6 @@
from bs4 import BeautifulSoup
from lxml import etree

from credsweeper.common.constants import UTF_8
from credsweeper.credentials import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
@@ -26,7 +25,7 @@ def data_scan(
try:
lines = []
line_numbers = []
tree = etree.fromstring(data_provider.data.decode(UTF_8))
tree = etree.fromstring(data_provider.text)
for element in tree.iter():
if "mxCell" == getattr(element, "tag"):
line_number = element.sourceline
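The scanner now parses the provider's already decoded text instead of calling data_provider.data.decode(UTF_8) itself, which is why the UTF_8 import disappears. A minimal sketch of the lxml pattern the hunk relies on (the sample XML and the printed output are illustrative, not taken from the repository):

# Sketch only: lxml keeps the source line of each element parsed
# from an already decoded drawio/mxfile XML string.
from lxml import etree

xml_text = (
    "<mxGraphModel>\n"
    "  <root>\n"
    "    <mxCell id=\"0\" value=\"password = 'example'\"/>\n"
    "  </root>\n"
    "</mxGraphModel>"
)

tree = etree.fromstring(xml_text)
for element in tree.iter():
    if "mxCell" == getattr(element, "tag"):
        # sourceline is the 1-based line number within the parsed text
        print(element.sourceline, element.get("value"))

Reusing the text property also removes the hard dependency on UTF-8: the provider already picks a working encoding, so the scanner no longer has to guess one.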
34 changes: 15 additions & 19 deletions credsweeper/file_handler/data_content_provider.py
@@ -8,7 +8,7 @@
import yaml
from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning

from credsweeper.common.constants import DEFAULT_ENCODING, ASCII, MIN_DATA_LEN
from credsweeper.common.constants import MIN_DATA_LEN
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.utils import Util
@@ -68,10 +68,7 @@ def free(self) -> None:
def text(self) -> str:
"""Getter to produce a text from DEFAULT_ENCODING. Empty str for unrecognized data"""
if self.__text is None:
try:
self.__text = self.__data.decode(encoding=DEFAULT_ENCODING, errors="strict")
except Exception:
self.__text = ''
self.__text = Util.decode_text(self.__data) or ''
return self.__text

def __is_structure(self) -> bool:
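The text getter no longer hard-codes DEFAULT_ENCODING behind a bare try/except; it delegates to the shared decoding helper and falls back to an empty string. A rough equivalent of the pattern (the class and helper below are simplified placeholders, not the project classes):

from typing import Optional


def decode_text(data: bytes) -> Optional[str]:
    """Simplified stand-in for Util.decode_text: None means the data looks binary."""
    for encoding in ("utf_8", "utf_16"):
        try:
            text = data.decode(encoding, errors="strict")
            # a round-trip check helps to reject a wrongly guessed encoding
            if data == text.encode(encoding, errors="strict"):
                return text
        except UnicodeError:
            continue
    return None


class Provider:
    def __init__(self, data: bytes) -> None:
        self.__data = data
        self.__text: Optional[str] = None

    @property
    def text(self) -> str:
        # decode lazily, once; fall back to '' when nothing decodes cleanly
        if self.__text is None:
            self.__text = decode_text(self.__data) or ''
        return self.__text

Because the result is cached, every represent_as_* method that switches to the property reuses a single decode pass.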
@@ -86,7 +83,7 @@ def represent_as_structure(self) -> bool:
if MIN_DATA_LEN > len(self.text):
return False
# JSON & NDJSON
if "{" in self.text and "}" in self.text and "\"" in self.text and ":" in self.text:
if '{' in self.text and '}' in self.text and '"' in self.text and ':' in self.text:
try:
self.structure = json.loads(self.text)
logger.debug("CONVERTED from json")
@@ -113,7 +110,8 @@
# # # Python
try:
# search only in sources with strings
if (";" in self.text or 2 < self.text.count("\n")) and ("\"" in self.text or "'" in self.text):
if (';' in self.text or 2 < self.text.count('\n') or 2 < self.text.count('\r')) \
and ('"' in self.text or "'" in self.text):
self.structure = Util.parse_python(self.text)
logger.debug("CONVERTED from Python")
else:
@@ -125,7 +123,7 @@ def represent_as_structure(self) -> bool:
return True
# # # YAML - almost always recognized
try:
if ":" in self.text and 2 < self.text.count("\n"):
if ':' in self.text and (2 < self.text.count('\n') or 2 < self.text.count('\r')):
self.structure = yaml.load(self.text, Loader=yaml.FullLoader)
logger.debug("CONVERTED from yaml")
else:
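The Python and YAML heuristics above now count '\r' as well as '\n', so text that uses bare carriage returns as line separators still qualifies as multi-line. A small illustration (the sample strings are made up):

# CR-only text contains no '\n' at all, so the old `2 < text.count('\n')`
# check skipped it even though it clearly spans several lines.
cr_text = "user: admin\rpassword: example\rtoken: abc\r"
assert 0 == cr_text.count('\n') and 2 < cr_text.count('\r')

lf_text = "user: admin\npassword: example\ntoken: abc\n"
assert 2 < lf_text.count('\n')

# the updated condition lets both variants reach the YAML attempt
for text in (cr_text, lf_text):
    assert ':' in text and (2 < text.count('\n') or 2 < text.count('\r'))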
@@ -148,7 +146,7 @@ def represent_as_xml(self) -> bool:
if MIN_XML_LEN > len(self.text):
return False
try:
if "<" in self.text and ">" in self.text and "</" in self.text:
if '<' in self.text and '>' in self.text and "</" in self.text:
xml_text = self.text.splitlines()
self.lines, self.line_numbers = Util.get_xml_from_lines(xml_text)
logger.debug("CONVERTED from xml")
@@ -192,10 +190,10 @@ def simple_html_representation(html: BeautifulSoup) -> Tuple[List[int], List[str
lines: List[str] = []
lines_size = 0
# use dedicated variable to deal with yapf and flake
new_line_tags = ["p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div"]
for p in html.find_all(new_line_tags):
p.append('\n')
for p in html.find_all(["th", "td"]):
tags_to_split = [
"p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div", "th", "td"
]
for p in html.find_all(tags_to_split):
p.append('\t')
html_lines = html.get_text().splitlines()
for line_number, doc_line in enumerate(html_lines):
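This hunk is what the PR title refers to: block-level tags used to receive an extra '\n', so a key in one <div> or <p> and its value in the next ended up on separate extracted lines. Appending only '\t' to every tag in the merged list keeps such neighbours on one tab-separated line. A minimal sketch of the effect (the sample HTML and value are illustrative):

# Sketch: appending '\t' instead of '\n' keeps key and value on one line.
from bs4 import BeautifulSoup

html_doc = "<div>password</div><div>Xdj23HwerD</div>"
soup = BeautifulSoup(html_doc, features="html.parser")

for tag in soup.find_all(["p", "br", "tr", "li", "div", "th", "td"]):
    tag.append('\t')

# prints ['password\tXdj23HwerD\t'] : one extracted line instead of two
print(soup.get_text().splitlines())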
@@ -346,9 +344,8 @@ def represent_as_html(

"""
try:
text = self.data.decode(encoding=DEFAULT_ENCODING)
if "</" in text and ">" in text:
if html := BeautifulSoup(text, features="html.parser"):
if "</" in self.text and ">" in self.text:
if html := BeautifulSoup(self.text, features="html.parser"):
line_numbers, lines, lines_size = self.simple_html_representation(html)
self.line_numbers.extend(line_numbers)
self.lines.extend(lines)
@@ -367,7 +364,7 @@ def represent_as_encoded(self) -> bool:
return False

def represent_as_encoded(self) -> bool:
"""Encodes data from base64. Stores result in decoded
"""Decodes data from base64. Stores result in decoded

Return:
True if the data correctly parsed and verified
@@ -379,8 +376,7 @@ def represent_as_encoded(self) -> bool:
return False
try:
self.decoded = Util.decode_base64( #
self.data.decode(encoding=ASCII, errors="strict"). #
translate(str.maketrans("", "", string.whitespace)), #
self.text.translate(str.maketrans('', '', string.whitespace)), #
padding_safe=True, #
urlsafe_detect=True) #
except Exception as exc:
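Both represent_as_html and represent_as_encoded now reuse the cached text property instead of decoding the raw bytes again with DEFAULT_ENCODING or ASCII. For the base64 branch, the essential steps look roughly like this; the sketch uses the standard library rather than the project's Util.decode_base64, whose padding_safe and urlsafe_detect flags, judging by their names, additionally tolerate missing padding and the url-safe alphabet:

# Sketch of the base64 branch: drop all whitespace from the decoded text,
# then try to interpret the remainder as (url-safe) base64.
import base64
import string

text = "cGFzc3dvcmQ9ZXhhbXBsZQ==\n"
stripped = text.translate(str.maketrans('', '', string.whitespace))

decoded = base64.urlsafe_b64decode(stripped)
print(decoded)  # b'password=example'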
35 changes: 28 additions & 7 deletions credsweeper/utils/util.py
@@ -203,7 +203,7 @@ def read_file(path: Union[str, Path], encodings: Optional[List[str]] = None) ->
return Util.decode_bytes(data, encodings)

@staticmethod
def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
def decode_text(content: bytes, encodings: Optional[List[str]] = None) -> Optional[str]:
"""Decode content using different encodings.

Try to decode bytes according to the list of encodings "encodings"
@@ -214,12 +211,11 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[
encodings: supported encodings

Return:
list of file rows in a suitable encoding from "encodings",
if none of the encodings match, an empty list will be returned
Also empty list will be returned after last encoding and 0 symbol is present in lines not at end
Decoded text in str for any suitable encoding
or None when binary data detected

"""
lines = []
text = None
binary_suggest = False
if encodings is None:
encodings = AVAILABLE_ENCODINGS
@@ -232,15 +231,37 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[
break
text = content.decode(encoding, errors="strict")
if content != text.encode(encoding, errors="strict"):
# the check helps to detect a real encoding
raise UnicodeError
# windows & macos styles workaround
lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
break
except UnicodeError:
binary_suggest = True
logger.info(f"UnicodeError: Can't decode content as {encoding}.")
except Exception as exc:
logger.error(f"Unexpected Error: Can't read content as {encoding}. Error message: {exc}")
return text

@staticmethod
def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
"""Decode content using different encodings.

Try to decode bytes according to the list of encodings "encodings"
occurs without any exceptions. UTF-16 requires BOM

Args:
content: raw data that might be text
encodings: supported encodings

Return:
list of file rows in a suitable encoding from "encodings",
if none of the encodings match, an empty list will be returned
Also empty list will be returned after last encoding and 0 symbol is present in lines not at end

"""
if text := Util.decode_text(content, encodings):
lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
else:
lines = []
return lines

@staticmethod
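After this split, decode_text is the single place that guesses an encoding and detects binary data, while decode_bytes keeps its previous list-of-lines contract on top of it. A usage sketch, assuming UTF-8 is among the tried encodings and that the results follow the docstrings shown above:

from credsweeper.utils import Util

data = "password: example\r\ntoken: abc".encode("utf_8")

# decode_text returns the decoded string as-is (or None for binary data)
text = Util.decode_text(data)
assert text == "password: example\r\ntoken: abc"

# decode_bytes normalises \r\n and bare \r to \n and splits into rows
lines = Util.decode_bytes(data)
assert lines == ["password: example", "token: abc"]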
11 changes: 6 additions & 5 deletions tests/__init__.py
@@ -1,13 +1,13 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT = 146
SAMPLES_FILES_COUNT = 147

# the lowest value of ML threshold is used to display possible lowest values
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan with negligible ML threshold
SAMPLES_CRED_COUNT = 465
SAMPLES_CRED_COUNT = 470
SAMPLES_CRED_LINE_COUNT = SAMPLES_CRED_COUNT + 19

# Number of filtered credentials with ML
@@ -17,16 +17,17 @@
SAMPLES_POST_CRED_COUNT = SAMPLES_CRED_COUNT - ML_FILTERED

# with option --doc
SAMPLES_IN_DOC = 650
SAMPLES_IN_DOC = 656

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 84
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 87
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 8
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

# well known string with all latin letters
AZ_DATA = b"The quick brown fox jumps over the lazy dog"
AZ_STRING = AZ_DATA.decode(encoding="ascii")
# Assume, there should be only ASCII symbols
AZ_STRING = AZ_DATA.decode(encoding="ascii", errors="strict")

# tests directory - use ONLY this file relevance for "release_test" workflow
TESTS_PATH = Path(__file__).resolve().parent