From 43758ca85b1120e28df73d56242051babadb01cd Mon Sep 17 00:00:00 2001
From: Karan Sampath <176953591+karanataryn@users.noreply.github.com>
Date: Mon, 10 Feb 2025 17:45:30 -0800
Subject: [PATCH] Bump Beautiful Soup (#1167)
* bump
* check lint
* fix action
* change action
* update
* assert Tag
* fix table and partition
* revert workflow changes
---
lib/sycamore/poetry.lock | 13 +++++++------
lib/sycamore/pyproject.toml | 2 +-
lib/sycamore/sycamore/data/table.py | 14 +++++++++-----
lib/sycamore/sycamore/transforms/partition.py | 3 ++-
poetry.lock | 13 +++++++------
5 files changed, 26 insertions(+), 19 deletions(-)
diff --git a/lib/sycamore/poetry.lock b/lib/sycamore/poetry.lock
index 11b4f1b70..f8856ebca 100644
--- a/lib/sycamore/poetry.lock
+++ b/lib/sycamore/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -525,17 +525,18 @@ files = [
[[package]]
name = "beautifulsoup4"
-version = "4.12.3"
+version = "4.13.3"
description = "Screen-scraping library"
optional = false
-python-versions = ">=3.6.0"
+python-versions = ">=3.7.0"
files = [
- {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
- {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+ {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
+ {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
]
[package.dependencies]
soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
[package.extras]
cchardet = ["cchardet"]
@@ -9976,4 +9977,4 @@ weaviate = ["weaviate-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
-content-hash = "ec9791de75120499fa5627b69728ed2bda2d84adc76e4d92e216ae381b9356ec"
+content-hash = "9b5e9ff0312b48fa4261aaeee39a557e20496228a07d519049ea46c5af827180"
diff --git a/lib/sycamore/pyproject.toml b/lib/sycamore/pyproject.toml
index 55601c5ee..9088c5407 100644
--- a/lib/sycamore/pyproject.toml
+++ b/lib/sycamore/pyproject.toml
@@ -22,7 +22,7 @@ ray = { extras = ["default"], version = "^2.36.0" }
pyarrow = "^14.0.2"
numpy = "<2.0.0"
openai = "^1.60.2"
-beautifulsoup4 = "^4.12.2"
+beautifulsoup4 = "^4.13.1"
amazon-textract-textractor = "^1.3.2"
boto3 = "^1.28.70"
boto3-stubs = {extras = ["essential"], version = "^1.35.12"}
diff --git a/lib/sycamore/sycamore/data/table.py b/lib/sycamore/sycamore/data/table.py
index 2ec195dd9..4ed0b3d7a 100644
--- a/lib/sycamore/sycamore/data/table.py
+++ b/lib/sycamore/sycamore/data/table.py
@@ -202,7 +202,7 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non
if (html_str is not None and html_tag is not None) or (html_str is None and html_tag is None):
raise ValueError("Exactly one of html_str and html_tag must be specified.")
-
+ root: Union[Tag, BeautifulSoup]
if html_str is not None:
html_str = html_str.strip()
if not html_str.startswith("<table") or not html_str.endswith("</table>"):
@@ -222,9 +222,10 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non
cells = []
caption = None
-
+ assert isinstance(root, Tag), "Expected root to be a Tag"
# Traverse the tree of elements in a pre-order traversal.
for tag in root.find_all(recursive=True):
+ assert isinstance(tag, Tag), "Expected root to be a Tag"
if tag.name == "tr":
cur_row += 1 # TODO: Should this be based on rowspan?
cur_col = 0
@@ -234,9 +235,12 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non
# they have a thead.
if cur_row < 0:
cur_row += 1
-
- rowspan = int(tag.attrs.get("rowspan", "1"))
- colspan = int(tag.attrs.get("colspan", "1"))
+ if rowspan_str := tag.attrs.get("rowspan", "1"):
+ assert isinstance(rowspan_str, str) # For mypy
+ rowspan = int(rowspan_str)
+ if colspan_str := tag.attrs.get("colspan", "1"):
+ assert isinstance(colspan_str, str) # For mypy
+ colspan = int(colspan_str)
content = tag.get_text()
diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py
index 0487cb881..c39a5c1b2 100644
--- a/lib/sycamore/sycamore/transforms/partition.py
+++ b/lib/sycamore/sycamore/transforms/partition.py
@@ -2,7 +2,7 @@
import io
from typing import Any, Literal, Optional, Union
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
from sycamore.functions import TextOverlapChunker, Chunker
from sycamore.functions import CharacterTokenizer, Tokenizer
@@ -312,6 +312,7 @@ def partition(self, document: Document) -> Document:
if self._extract_tables:
for table in soup.find_all("table"):
# ignore nested tables
+ assert isinstance(table, Tag)
if len(table.find_all("table")) > 0:
continue
diff --git a/poetry.lock b/poetry.lock
index b88aa8c29..4d96a2578 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "aiobotocore"
@@ -597,17 +597,18 @@ files = [
[[package]]
name = "beautifulsoup4"
-version = "4.12.3"
+version = "4.13.3"
description = "Screen-scraping library"
optional = false
-python-versions = ">=3.6.0"
+python-versions = ">=3.7.0"
files = [
- {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
- {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+ {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
+ {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
]
[package.dependencies]
soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
[package.extras]
cchardet = ["cchardet"]
@@ -8738,7 +8739,7 @@ amazon-textract-textractor = "^1.3.2"
anthropic = {version = "^0.42.0", optional = true}
apted = {version = "^1.0.3", optional = true}
async-timeout = ">4.0.0"
-beautifulsoup4 = "^4.12.2"
+beautifulsoup4 = "^4.13.1"
boto3 = "^1.28.70"
boto3-stubs = {version = "^1.35.12", extras = ["essential"]}
datasets = {version = "^2.16.1", optional = true}