Skip to content

Commit

Permalink
Change docs to use ArynPartitioner instead of SycamorePartitioner
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkLindblad authored Jul 29, 2024
1 parent 5757d06 commit c80d83f
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 28 deletions.
2 changes: 1 addition & 1 deletion docs/source/APIs/transforms/detr_partitioner.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ Detr Partitioner
.. autoclass:: sycamore.transforms.detr_partitioner.SycamoreObjectDetection
:members:
:show-inheritance:
.. autoclass:: sycamore.transforms.detr_partitioner.SycamorePDFPartitioner
.. autoclass:: sycamore.transforms.detr_partitioner.ArynPDFPartitioner
:members:
:show-inheritance:
3 changes: 3 additions & 0 deletions docs/source/APIs/transforms/partition.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ Partition
.. autoclass:: sycamore.transforms.partition.Partition
:members:
:show-inheritance:
.. autoclass:: sycamore.transforms.partition.ArynPartitioner
:members:
:show-inheritance:
.. autoclass:: sycamore.transforms.partition.SycamorePartitioner
:members:
:show-inheritance:
Expand Down
1 change: 1 addition & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

templates_path = ["_templates"]
exclude_patterns = []
smartquotes = False

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
Expand Down
2 changes: 1 addition & 1 deletion docs/source/welcome_to_sycamore/get_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ For more info:
* [Querying your data](/querying_data/demo_query_ui.md)
* [Using Jupyter notebook to customize data preparation code](/data_ingestion_and_preparation/using_jupyter.md)

>By default, the Docker compose uses the stable version of the containers. You can choose a specific version to run, e.g. latest (last build pushed), latest_rc (last release candidate), or 0.YYYY.MM.DD (date-stamped release). To specify a version, set the VERSION environment variable, e.g. VERSION=latest_rc docker compose up --pull=always. See the .env file if you want to specify different versions for the separate containers.*
>By default, `docker compose` uses the stable version of the containers. You can choose a specific version to run, e.g. `latest` (last build pushed), `latest_rc` (last release candidate), or `0.YYYY.MM.DD` (date-stamped release). To specify a version, set the `VERSION` environment variable, e.g. `VERSION=latest_rc docker compose up --pull=always`. See the `.env` file if you want to specify different versions for the separate containers.
### Optional: Configure AWS Credentials for Amazon Textract usage

Expand Down
50 changes: 25 additions & 25 deletions lib/sycamore/sycamore/docset.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def show(
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.show()
"""
documents = self.take(limit)
Expand Down Expand Up @@ -127,7 +127,7 @@ def count(self, include_metadata=False, **kwargs) -> int:
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.count()
"""
from sycamore import Execution
Expand Down Expand Up @@ -159,7 +159,7 @@ def take(self, limit: int = 20, include_metadata: bool = False, **kwargs) -> lis
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.take()
"""
Expand Down Expand Up @@ -210,7 +210,7 @@ def limit(self, limit: int = 20, **kwargs) -> "DocSet":
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.explode()
.limit()
Expand All @@ -232,7 +232,7 @@ def partition(
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
"""
from sycamore.transforms import Partition

Expand Down Expand Up @@ -285,7 +285,7 @@ def spread_properties(self, props: list[str], **resource_args) -> "DocSet":
.. code-block:: python
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.spread_properties(["title"])
.explode()
"""
Expand Down Expand Up @@ -313,7 +313,7 @@ def augment_text(self, augmentor: TextAugmentor, **resource_args) -> "DocSet":
prompt_template=part_name_template)
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.extract_entity(entity_extractor)
.explode()
.augment_text(augmentor)
Expand All @@ -331,7 +331,7 @@ def split_elements(self, tokenizer: Tokenizer, max_tokens: int = 512, **kwargs)
.. code-block:: python
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.split_elements(tokenizer=tokenizer, max_tokens=512)
.explode()
"""
Expand All @@ -348,7 +348,7 @@ def explode(self, **resource_args) -> "DocSet":
.. code-block:: python
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.explode()
"""
from sycamore.transforms.explode import Explode
Expand All @@ -372,7 +372,7 @@ def embed(self, embedder: Embedder, **kwargs) -> "DocSet":
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.explode()
.embed(embedder=embedder)
"""
Expand Down Expand Up @@ -401,7 +401,7 @@ def extract_entity(self, entity_extractor: EntityExtractor, **kwargs) -> "DocSet
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.extract_entity(entity_extractor=entity_extractor)
"""
Expand Down Expand Up @@ -444,7 +444,7 @@ def extract_schema(self, schema_extractor: SchemaExtractor, **kwargs) -> "DocSet
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.extract_schema(schema_extractor=schema_extractor)
"""

Expand Down Expand Up @@ -474,7 +474,7 @@ def extract_batch_schema(self, schema_extractor: SchemaExtractor, **kwargs) -> "
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.extract_batch_schema(schema_extractor=schema_extractor)
"""

Expand All @@ -501,7 +501,7 @@ def extract_graph_structure(self, extractors: list[GraphExtractor], **kwargs) ->
ds = (
ctx.read.manifest(metadata_provider=JsonManifestMetadataProvider(manifest),...)
.partition(partitioner=SycamorePartitioner(...), num_gpus=0.1)
.partition(partitioner=ArynPartitioner(...), num_gpus=0.1)
.extract_graph_structure(extractors=[MetadataExtractor(metadata=metadata)])
.explode()
)
Expand Down Expand Up @@ -529,7 +529,7 @@ def extract_properties(self, property_extractor: PropertyExtractor, **kwargs) ->
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partition=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.extract_properties(property_extractor)
"""
from sycamore.transforms import ExtractProperties
Expand All @@ -550,7 +550,7 @@ def summarize(self, summarizer: Summarizer, **kwargs) -> "DocSet":
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.summarize(summarizer=summarizer)
"""
from sycamore.transforms import Summarize
Expand Down Expand Up @@ -599,7 +599,7 @@ def merge(self, merger: ElementMerger, **kwargs) -> "DocSet":
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.merge(merger=merger)
"""
from sycamore.transforms import Merge
Expand All @@ -617,7 +617,7 @@ def regex_replace(self, spec: list[tuple[str, str]], **kwargs) -> "DocSet":
from sycamore.transforms import COALESCE_WHITESPACE
ds = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.regex_replace(COALESCE_WHITESPACE)
.regex_replace([(r"\d+", "1313"), (r"old", "new")])
.explode()
Expand All @@ -642,7 +642,7 @@ def sketch(self, window: int = 17, number: int = 16, **kwargs) -> "DocSet":
.. code-block:: python
ds = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.explode()
.sketch(window=17)
"""
Expand All @@ -662,7 +662,7 @@ def term_frequency(self, tokenizer: Tokenizer, with_token_ids: bool = False, **k
tk = OpenAITokenizer("gpt-3.5-turbo")
context = sycamore.init()
context.read.binary(paths, binary_format="pdf")
.partition(SycamorePartitioner())
.partition(ArynPartitioner())
.explode()
.term_frequency(tokenizer=tk)
.show()
Expand All @@ -687,7 +687,7 @@ def transform(self, cls: Type[Transform], **kwargs) -> "DocSet":
from sycamore.transforms import FooBar
ds = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.transform(cls=FooBar, arg=123)
"""
plan = cls(self.plan, **kwargs) # type: ignore
Expand Down Expand Up @@ -722,7 +722,7 @@ def custom_flat_mapping_function(document: Document) -> list[Document]:
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.flat_map(custom_flat_mapping_function)
"""
Expand All @@ -748,7 +748,7 @@ def custom_filter(doc: Document) -> bool:
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
.filter(custom_filter)
"""
Expand Down Expand Up @@ -913,7 +913,7 @@ def write(self) -> DocSetWriter:
Example:
The following example shows reading a DocSet from a collection of PDFs, partitioning
it using the ``SycamorePartitioner``, and then writing it to a new OpenSearch index.
it using the ``ArynPartitioner``, and then writing it to a new OpenSearch index.
.. code-block:: python
Expand Down Expand Up @@ -941,7 +941,7 @@ def write(self) -> DocSetWriter:
context = sycamore.init()
pdf_docset = context.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner())
.partition(partitioner=ArynPartitioner())
pdf_docset.write.opensearch(
os_client_args=os_client_args,
Expand Down
2 changes: 1 addition & 1 deletion lib/sycamore/sycamore/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def pinecone(
ctx = sycamore.init()
ds = (
ctx.read.binary(paths, binary_format="pdf")
.partition(partitioner=SycamorePartitioner(extract_table_structure=True, extract_images=True))
.partition(partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True))
.explode()
.embed(embedder=SentenceTransformerEmbedder(model_name=model_name, batch_size=100))
.term_frequency(tokenizer=tokenizer, with_token_ids=True)
Expand Down

0 comments on commit c80d83f

Please sign in to comment.