Skip to content

Commit

Permalink
Merge elements in ntsb test ingest
Browse files (browse the repository at this point in the history)
  • Loading branch information
baitsguy committed Jul 30, 2024
1 parent 91dd00c commit f20d493
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions lib/sycamore/sycamore/tests/integration/query/conftest.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,8 +2,10 @@
from opensearchpy import OpenSearch

import sycamore
from sycamore.functions import HuggingFaceTokenizer
from sycamore.tests.config import TEST_DIR
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import UnstructuredPdfPartitioner

QUERY_INTEGRATION_TEST_INDEX_NAME = "sycamore_query_ntsb_integration_tests"
Expand Down Expand Up @@ -46,12 +48,15 @@ def query_integration_test_index():
}
}
paths = str(TEST_DIR / "resources/data/pdfs/ntsb-report.pdf")
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = HuggingFaceTokenizer(model_name)

context = sycamore.init()
ds = (
context.read.binary(paths, binary_format="pdf")
.limit(1)
.partition(partitioner=UnstructuredPdfPartitioner())
.merge(GreedyTextElementMerger(tokenizer=tokenizer, max_tokens=1000))
.explode()
.embed(
embedder=SentenceTransformerEmbedder(batch_size=100, model_name="sentence-transformers/all-MiniLM-L6-v2")
Expand Down

0 comments on commit f20d493

Please sign in to comment.