From 43758ca85b1120e28df73d56242051babadb01cd Mon Sep 17 00:00:00 2001 From: Karan Sampath <176953591+karanataryn@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:45:30 -0800 Subject: [PATCH] Bump Beautiful Soup (#1167) * bump * check lint * fix action * change action * update * assert Tag * fix table and partition * revert workflow changes --- lib/sycamore/poetry.lock | 13 +++++++------ lib/sycamore/pyproject.toml | 2 +- lib/sycamore/sycamore/data/table.py | 14 +++++++++----- lib/sycamore/sycamore/transforms/partition.py | 3 ++- poetry.lock | 13 +++++++------ 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/lib/sycamore/poetry.lock b/lib/sycamore/poetry.lock index 11b4f1b70..f8856ebca 100644 --- a/lib/sycamore/poetry.lock +++ b/lib/sycamore/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -525,17 +525,18 @@ files = [ [[package]] name = "beautifulsoup4" -version = "4.12.3" +version = "4.13.3" description = "Screen-scraping library" optional = false -python-versions = ">=3.6.0" +python-versions = ">=3.7.0" files = [ - {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, - {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, + {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"}, + {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"}, ] [package.dependencies] soupsieve = ">1.2" +typing-extensions = ">=4.0.0" [package.extras] cchardet = ["cchardet"] @@ -9976,4 +9977,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "ec9791de75120499fa5627b69728ed2bda2d84adc76e4d92e216ae381b9356ec" +content-hash = "9b5e9ff0312b48fa4261aaeee39a557e20496228a07d519049ea46c5af827180" diff --git a/lib/sycamore/pyproject.toml b/lib/sycamore/pyproject.toml index 55601c5ee..9088c5407 100644 --- a/lib/sycamore/pyproject.toml +++ b/lib/sycamore/pyproject.toml @@ -22,7 +22,7 @@ ray = { extras = ["default"], version = "^2.36.0" } pyarrow = "^14.0.2" numpy = "<2.0.0" openai = "^1.60.2" -beautifulsoup4 = "^4.12.2" +beautifulsoup4 = "^4.13.1" amazon-textract-textractor = "^1.3.2" boto3 = "^1.28.70" boto3-stubs = {extras = ["essential"], version = "^1.35.12"} diff --git a/lib/sycamore/sycamore/data/table.py b/lib/sycamore/sycamore/data/table.py index 2ec195dd9..4ed0b3d7a 100644 --- a/lib/sycamore/sycamore/data/table.py +++ b/lib/sycamore/sycamore/data/table.py @@ -202,7 +202,7 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non if (html_str is not None and html_tag is not None) or (html_str is None and html_tag is None): raise ValueError("Exactly one of html_str and html_tag must be specified.") - + root: Union[Tag, BeautifulSoup] if html_str is not None: html_str = html_str.strip() if not html_str.startswith(""): @@ -222,9 +222,10 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non cells = [] caption = None - + assert isinstance(root, Tag), "Expected root to be a Tag" # Traverse the tree of elements in a pre-order traversal. for tag in root.find_all(recursive=True): + assert isinstance(tag, Tag), "Expected root to be a Tag" if tag.name == "tr": cur_row += 1 # TODO: Should this be based on rowspan? cur_col = 0 @@ -234,9 +235,12 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non # they have a thead. if cur_row < 0: cur_row += 1 - - rowspan = int(tag.attrs.get("rowspan", "1")) - colspan = int(tag.attrs.get("colspan", "1")) + if rowspan_str := tag.attrs.get("rowspan", "1"): + assert isinstance(rowspan_str, str) # For mypy + rowspan = int(rowspan_str) + if colspan_str := tag.attrs.get("colspan", "1"): + assert isinstance(colspan_str, str) # For mypy + colspan = int(colspan_str) content = tag.get_text() diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py index 0487cb881..c39a5c1b2 100644 --- a/lib/sycamore/sycamore/transforms/partition.py +++ b/lib/sycamore/sycamore/transforms/partition.py @@ -2,7 +2,7 @@ import io from typing import Any, Literal, Optional, Union -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from sycamore.functions import TextOverlapChunker, Chunker from sycamore.functions import CharacterTokenizer, Tokenizer @@ -312,6 +312,7 @@ def partition(self, document: Document) -> Document: if self._extract_tables: for table in soup.find_all("table"): # ignore nested tables + assert isinstance(table, Tag) if len(table.find_all("table")) > 0: continue diff --git a/poetry.lock b/poetry.lock index b88aa8c29..4d96a2578 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiobotocore" @@ -597,17 +597,18 @@ files = [ [[package]] name = "beautifulsoup4" -version = "4.12.3" +version = "4.13.3" description = "Screen-scraping library" optional = false -python-versions = ">=3.6.0" +python-versions = ">=3.7.0" files = [ - {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, - {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, + {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"}, + {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"}, ] [package.dependencies] soupsieve = ">1.2" +typing-extensions = ">=4.0.0" [package.extras] cchardet = ["cchardet"] @@ -8738,7 +8739,7 @@ amazon-textract-textractor = "^1.3.2" anthropic = {version = "^0.42.0", optional = true} apted = {version = "^1.0.3", optional = true} async-timeout = ">4.0.0" -beautifulsoup4 = "^4.12.2" +beautifulsoup4 = "^4.13.1" boto3 = "^1.28.70" boto3-stubs = {version = "^1.35.12", extras = ["essential"]} datasets = {version = "^2.16.1", optional = true}