Skip to content

Commit

Permalink
Bump Beautiful Soup (#1167)
Browse files Browse the repository at this point in the history
* bump

* check lint

* fix action

* change action

* update

* assert Tag

* fix table and partition

* revert workflow changes
  • Loading branch information
karanataryn authored Feb 11, 2025
1 parent aa0d7a3 commit 43758ca
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 19 deletions.
13 changes: 7 additions & 6 deletions lib/sycamore/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion lib/sycamore/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ ray = { extras = ["default"], version = "^2.36.0" }
pyarrow = "^14.0.2"
numpy = "<2.0.0"
openai = "^1.60.2"
-beautifulsoup4 = "^4.12.2"
+beautifulsoup4 = "^4.13.1"
amazon-textract-textractor = "^1.3.2"
boto3 = "^1.28.70"
boto3-stubs = {extras = ["essential"], version = "^1.35.12"}
Expand Down
14 changes: 9 additions & 5 deletions lib/sycamore/sycamore/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non

if (html_str is not None and html_tag is not None) or (html_str is None and html_tag is None):
raise ValueError("Exactly one of html_str and html_tag must be specified.")

root: Union[Tag, BeautifulSoup]
if html_str is not None:
html_str = html_str.strip()
if not html_str.startswith("<table") or not html_str.endswith("</table>"):
Expand All @@ -222,9 +222,10 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non

cells = []
caption = None

assert isinstance(root, Tag), "Expected root to be a Tag"
# Traverse the tree of elements in a pre-order traversal.
for tag in root.find_all(recursive=True):
assert isinstance(tag, Tag), "Expected root to be a Tag"
if tag.name == "tr":
cur_row += 1 # TODO: Should this be based on rowspan?
cur_col = 0
Expand All @@ -234,9 +235,12 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non
# they have a thead.
if cur_row < 0:
cur_row += 1

-rowspan = int(tag.attrs.get("rowspan", "1"))
-colspan = int(tag.attrs.get("colspan", "1"))
+if rowspan_str := tag.attrs.get("rowspan", "1"):
+assert isinstance(rowspan_str, str) # For mypy
+rowspan = int(rowspan_str)
+if colspan_str := tag.attrs.get("colspan", "1"):
+assert isinstance(colspan_str, str) # For mypy
+colspan = int(colspan_str)

content = tag.get_text()

Expand Down
3 changes: 2 additions & 1 deletion lib/sycamore/sycamore/transforms/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import io
from typing import Any, Literal, Optional, Union

-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag

from sycamore.functions import TextOverlapChunker, Chunker
from sycamore.functions import CharacterTokenizer, Tokenizer
Expand Down Expand Up @@ -312,6 +312,7 @@ def partition(self, document: Document) -> Document:
if self._extract_tables:
for table in soup.find_all("table"):
# ignore nested tables
assert isinstance(table, Tag)
if len(table.find_all("table")) > 0:
continue

Expand Down
13 changes: 7 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 43758ca

Please sign in to comment.