Change docs to use ArynPartitioner instead of SycamorePartitioner (#605)
aryn-ai · Jul 29, 2024 · c80d83f · c80d83f
1 parent 5757d06
commit c80d83f
Show file tree

Hide file tree

Showing 6 changed files with 32 additions and 28 deletions.
diff --git a/docs/source/APIs/transforms/detr_partitioner.rst b/docs/source/APIs/transforms/detr_partitioner.rst
@@ -10,6 +10,6 @@ Detr Partitioner
 .. autoclass:: sycamore.transforms.detr_partitioner.SycamoreObjectDetection
    :members:
    :show-inheritance:
-.. autoclass:: sycamore.transforms.detr_partitioner.SycamorePDFPartitioner
+.. autoclass:: sycamore.transforms.detr_partitioner.ArynPDFPartitioner
    :members:
    :show-inheritance:
diff --git a/docs/source/APIs/transforms/partition.rst b/docs/source/APIs/transforms/partition.rst
@@ -7,6 +7,9 @@ Partition
 .. autoclass:: sycamore.transforms.partition.Partition
    :members:
    :show-inheritance:
+.. autoclass:: sycamore.transforms.partition.ArynPartitioner
+   :members:
+   :show-inheritance:
 .. autoclass:: sycamore.transforms.partition.SycamorePartitioner
    :members:
    :show-inheritance:

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -30,6 +30,7 @@
 
 templates_path = ["_templates"]
 exclude_patterns = []
+smartquotes = False
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

diff --git a/docs/source/welcome_to_sycamore/get_started.md b/docs/source/welcome_to_sycamore/get_started.md
@@ -32,7 +32,7 @@ For more info:
 * [Querying your data](/querying_data/demo_query_ui.md)
 * [Using Jupyter notebook to customize data preparation code](/data_ingestion_and_preparation/using_jupyter.md)
 
->By default, the Docker compose uses the stable version of the containers. You can choose a specific version to run, e.g. latest (last build pushed), latest_rc (last release candidate), or 0.YYYY.MM.DD (date-stamped release). To specify a version, set the VERSION environment variable, e.g. VERSION=latest_rc docker compose up --pull=always. See the .env file if you want to specify different versions for the separate containers.*
+>By default, `docker compose` uses the stable version of the containers. You can choose a specific version to run, e.g. `latest` (last build pushed), `latest_rc` (last release candidate), or `0.YYYY.MM.DD` (date-stamped release). To specify a version, set the `VERSION` environment variable, e.g. `VERSION=latest_rc docker compose up --pull=always`. See the `.env` file if you want to specify different versions for the separate containers.
 
 ### Optional: Configure AWS Credentials for Amazon Textract usage
 

diff --git a/lib/sycamore/sycamore/docset.py b/lib/sycamore/sycamore/docset.py
@@ -71,7 +71,7 @@ def show(
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .show()
         """
         documents = self.take(limit)
@@ -127,7 +127,7 @@ def count(self, include_metadata=False, **kwargs) -> int:
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .count()
         """
         from sycamore import Execution
@@ -159,7 +159,7 @@ def take(self, limit: int = 20, include_metadata: bool = False, **kwargs) -> lis
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .take()
 
         """
@@ -210,7 +210,7 @@ def limit(self, limit: int = 20, **kwargs) -> "DocSet":
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .explode()
                     .limit()
 
@@ -232,7 +232,7 @@ def partition(
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
         """
         from sycamore.transforms import Partition
 
@@ -285,7 +285,7 @@ def spread_properties(self, props: list[str], **resource_args) -> "DocSet":
             .. code-block:: python
 
                pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .spread_properties(["title"])
                     .explode()
         """
@@ -313,7 +313,7 @@ def augment_text(self, augmentor: TextAugmentor, **resource_args) -> "DocSet":
                                         prompt_template=part_name_template)
             context = sycamore.init()
             pdf_docset = context.read.binary(paths, binary_format="pdf")
-                .partition(partitioner=SycamorePartitioner())
+                .partition(partitioner=ArynPartitioner())
                 .extract_entity(entity_extractor)
                 .explode()
                 .augment_text(augmentor)
@@ -331,7 +331,7 @@ def split_elements(self, tokenizer: Tokenizer, max_tokens: int = 512, **kwargs)
             .. code-block:: python
 
                pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .split_elements(tokenizer=tokenizer, max_tokens=512)
                     .explode()
         """
@@ -348,7 +348,7 @@ def explode(self, **resource_args) -> "DocSet":
             .. code-block:: python
 
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .explode()
         """
         from sycamore.transforms.explode import Explode
@@ -372,7 +372,7 @@ def embed(self, embedder: Embedder, **kwargs) -> "DocSet":
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .explode()
                     .embed(embedder=embedder)
         """
@@ -401,7 +401,7 @@ def extract_entity(self, entity_extractor: EntityExtractor, **kwargs) -> "DocSet
 
                  context = sycamore.init()
                  pdf_docset = context.read.binary(paths, binary_format="pdf")
-                     .partition(partitioner=SycamorePartitioner())
+                     .partition(partitioner=ArynPartitioner())
                      .extract_entity(entity_extractor=entity_extractor)
 
         """
@@ -444,7 +444,7 @@ def extract_schema(self, schema_extractor: SchemaExtractor, **kwargs) -> "DocSet
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .extract_schema(schema_extractor=schema_extractor)
         """
 
@@ -474,7 +474,7 @@ def extract_batch_schema(self, schema_extractor: SchemaExtractor, **kwargs) -> "
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .extract_batch_schema(schema_extractor=schema_extractor)
         """
 
@@ -501,7 +501,7 @@ def extract_graph_structure(self, extractors: list[GraphExtractor], **kwargs) ->
 
                 ds = (
                     ctx.read.manifest(metadata_provider=JsonManifestMetadataProvider(manifest),...)
-                    .partition(partitioner=SycamorePartitioner(...), num_gpus=0.1)
+                    .partition(partitioner=ArynPartitioner(...), num_gpus=0.1)
                     .extract_graph_structure(extractors=[MetadataExtractor(metadata=metadata)])
                     .explode()
                 )
@@ -529,7 +529,7 @@ def extract_properties(self, property_extractor: PropertyExtractor, **kwargs) ->
                 context = sycamore.init()
 
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partition=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .extract_properties(property_extractor)
         """
         from sycamore.transforms import ExtractProperties
@@ -550,7 +550,7 @@ def summarize(self, summarizer: Summarizer, **kwargs) -> "DocSet":
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .summarize(summarizer=summarizer)
         """
         from sycamore.transforms import Summarize
@@ -599,7 +599,7 @@ def merge(self, merger: ElementMerger, **kwargs) -> "DocSet":
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .merge(merger=merger)
         """
         from sycamore.transforms import Merge
@@ -617,7 +617,7 @@ def regex_replace(self, spec: list[tuple[str, str]], **kwargs) -> "DocSet":
 
                from sycamore.transforms import COALESCE_WHITESPACE
                ds = context.read.binary(paths, binary_format="pdf")
-                   .partition(partitioner=SycamorePartitioner())
+                   .partition(partitioner=ArynPartitioner())
                    .regex_replace(COALESCE_WHITESPACE)
                    .regex_replace([(r"\d+", "1313"), (r"old", "new")])
                    .explode()
@@ -642,7 +642,7 @@ def sketch(self, window: int = 17, number: int = 16, **kwargs) -> "DocSet":
             .. code-block:: python
 
                ds = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .explode()
                     .sketch(window=17)
         """
@@ -662,7 +662,7 @@ def term_frequency(self, tokenizer: Tokenizer, with_token_ids: bool = False, **k
                 tk = OpenAITokenizer("gpt-3.5-turbo")
                 context = sycamore.init()
                 context.read.binary(paths, binary_format="pdf")
-                    .partition(SycamorePartitioner())
+                    .partition(ArynPartitioner())
                     .explode()
                     .term_frequency(tokenizer=tk)
                     .show()
@@ -687,7 +687,7 @@ def transform(self, cls: Type[Transform], **kwargs) -> "DocSet":
 
                from sycamore.transforms import FooBar
                ds = context.read.binary(paths, binary_format="pdf")
-                   .partition(partitioner=SycamorePartitioner())
+                   .partition(partitioner=ArynPartitioner())
                    .transform(cls=FooBar, arg=123)
         """
         plan = cls(self.plan, **kwargs)  # type: ignore
@@ -722,7 +722,7 @@ def custom_flat_mapping_function(document: Document) -> list[Document]:
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                .partition(partitioner=SycamorePartitioner())
+                .partition(partitioner=ArynPartitioner())
                 .flat_map(custom_flat_mapping_function)
 
         """
@@ -748,7 +748,7 @@ def custom_filter(doc: Document) -> bool:
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .filter(custom_filter)
 
         """
@@ -913,7 +913,7 @@ def write(self) -> DocSetWriter:
 
         Example:
              The following example shows reading a DocSet from a collection of PDFs, partitioning
-             it using the ``SycamorePartitioner``, and then writing it to a new OpenSearch index.
+             it using the ``ArynPartitioner``, and then writing it to a new OpenSearch index.
 
              .. code-block:: python
 
@@ -941,7 +941,7 @@ def write(self) -> DocSetWriter:
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
 
                 pdf_docset.write.opensearch(
                      os_client_args=os_client_args,

diff --git a/lib/sycamore/sycamore/writer.py b/lib/sycamore/sycamore/writer.py
@@ -329,7 +329,7 @@ def pinecone(
                 ctx = sycamore.init()
                 ds = (
                     ctx.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner(extract_table_structure=True, extract_images=True))
+                    .partition(partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True))
                     .explode()
                     .embed(embedder=SentenceTransformerEmbedder(model_name=model_name, batch_size=100))
                     .term_frequency(tokenizer=tokenizer, with_token_ids=True)