From c80d83ff2b96c5e277b99d7ef73caba5d1965817 Mon Sep 17 00:00:00 2001
From: Mark Lindblad
Date: Mon, 29 Jul 2024 14:43:02 -0700
Subject: [PATCH] Change docs to use `ArynPartitioner` instead of
 `SycamorePartitioner` (#605)

---
 .../APIs/transforms/detr_partitioner.rst      |  2 +-
 docs/source/APIs/transforms/partition.rst     |  3 ++
 docs/source/conf.py                           |  1 +
 .../source/welcome_to_sycamore/get_started.md |  2 +-
 lib/sycamore/sycamore/docset.py               | 50 +++++++++----------
 lib/sycamore/sycamore/writer.py               |  2 +-
 6 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/docs/source/APIs/transforms/detr_partitioner.rst b/docs/source/APIs/transforms/detr_partitioner.rst
index 5f4f205d3..f2e5f4e8f 100644
--- a/docs/source/APIs/transforms/detr_partitioner.rst
+++ b/docs/source/APIs/transforms/detr_partitioner.rst
@@ -10,6 +10,6 @@ Detr Partitioner
 .. autoclass:: sycamore.transforms.detr_partitioner.SycamoreObjectDetection
    :members:
    :show-inheritance:
-.. autoclass:: sycamore.transforms.detr_partitioner.SycamorePDFPartitioner
+.. autoclass:: sycamore.transforms.detr_partitioner.ArynPDFPartitioner
    :members:
    :show-inheritance:
diff --git a/docs/source/APIs/transforms/partition.rst b/docs/source/APIs/transforms/partition.rst
index 90ce8764b..204619445 100644
--- a/docs/source/APIs/transforms/partition.rst
+++ b/docs/source/APIs/transforms/partition.rst
@@ -7,6 +7,9 @@ Partition
 .. autoclass:: sycamore.transforms.partition.Partition
    :members:
    :show-inheritance:
+.. autoclass:: sycamore.transforms.partition.ArynPartitioner
+   :members:
+   :show-inheritance:
 .. autoclass:: sycamore.transforms.partition.SycamorePartitioner
    :members:
    :show-inheritance:
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 205aef7e7..c70d8d798 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -30,6 +30,7 @@
 templates_path = ["_templates"]
 exclude_patterns = []
 
+smartquotes = False
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
diff --git a/docs/source/welcome_to_sycamore/get_started.md b/docs/source/welcome_to_sycamore/get_started.md
index 12fcf287a..44f0eb7ba 100644
--- a/docs/source/welcome_to_sycamore/get_started.md
+++ b/docs/source/welcome_to_sycamore/get_started.md
@@ -32,7 +32,7 @@ For more info:
 * [Querying your data](/querying_data/demo_query_ui.md)
 * [Using Jupyter notebook to customize data preparation code](/data_ingestion_and_preparation/using_jupyter.md)
 
->By default, the Docker compose uses the stable version of the containers. You can choose a specific version to run, e.g. latest (last build pushed), latest_rc (last release candidate), or 0.YYYY.MM.DD (date-stamped release). To specify a version, set the VERSION environment variable, e.g. VERSION=latest_rc docker compose up --pull=always. See the .env file if you want to specify different versions for the separate containers.*
+>By default, `docker compose` uses the stable version of the containers. You can choose a specific version to run, e.g. `latest` (last build pushed), `latest_rc` (last release candidate), or `0.YYYY.MM.DD` (date-stamped release). To specify a version, set the `VERSION` environment variable, e.g. `VERSION=latest_rc docker compose up --pull=always`. See the `.env` file if you want to specify different versions for the separate containers.
 
 ### Optional: Configure AWS Credentials for Amazon Textract usage
 
diff --git a/lib/sycamore/sycamore/docset.py b/lib/sycamore/sycamore/docset.py
index 5616b9608..ce3c04ae0 100644
--- a/lib/sycamore/sycamore/docset.py
+++ b/lib/sycamore/sycamore/docset.py
@@ -71,7 +71,7 @@ def show(
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .show()
         """
         documents = self.take(limit)
@@ -127,7 +127,7 @@ def count(self, include_metadata=False, **kwargs) -> int:
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .count()
         """
         from sycamore import Execution
@@ -159,7 +159,7 @@ def take(self, limit: int = 20, include_metadata: bool = False, **kwargs) -> lis
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .take()
         """
 
@@ -210,7 +210,7 @@ def limit(self, limit: int = 20, **kwargs) -> "DocSet":
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .explode()
                     .limit()
 
@@ -232,7 +232,7 @@ def partition(
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
         """
         from sycamore.transforms import Partition
 
@@ -285,7 +285,7 @@ def spread_properties(self, props: list[str], **resource_args) -> "DocSet":
             .. code-block:: python
 
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .spread_properties(["title"])
                     .explode()
         """
@@ -313,7 +313,7 @@ def augment_text(self, augmentor: TextAugmentor, **resource_args) -> "DocSet":
                     prompt_template=part_name_template)
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .extract_entity(entity_extractor)
                     .explode()
                     .augment_text(augmentor)
@@ -331,7 +331,7 @@ def split_elements(self, tokenizer: Tokenizer, max_tokens: int = 512, **kwargs)
             .. code-block:: python
 
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .split_elements(tokenizer=tokenizer, max_tokens=512)
                     .explode()
         """
@@ -348,7 +348,7 @@ def explode(self, **resource_args) -> "DocSet":
             .. code-block:: python
 
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .explode()
         """
         from sycamore.transforms.explode import Explode
@@ -372,7 +372,7 @@ def embed(self, embedder: Embedder, **kwargs) -> "DocSet":
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .explode()
                     .embed(embedder=embedder)
         """
@@ -401,7 +401,7 @@ def extract_entity(self, entity_extractor: EntityExtractor, **kwargs) -> "DocSet
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .extract_entity(entity_extractor=entity_extractor)
 
         """
@@ -444,7 +444,7 @@ def extract_schema(self, schema_extractor: SchemaExtractor, **kwargs) -> "DocSet
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .extract_schema(schema_extractor=schema_extractor)
 
         """
@@ -474,7 +474,7 @@ def extract_batch_schema(self, schema_extractor: SchemaExtractor, **kwargs) -> "
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .extract_batch_schema(schema_extractor=schema_extractor)
 
         """
@@ -501,7 +501,7 @@ def extract_graph_structure(self, extractors: list[GraphExtractor], **kwargs) ->
 
                 ds = (
                     ctx.read.manifest(metadata_provider=JsonManifestMetadataProvider(manifest),...)
-                    .partition(partitioner=SycamorePartitioner(...), num_gpus=0.1)
+                    .partition(partitioner=ArynPartitioner(...), num_gpus=0.1)
                     .extract_graph_structure(extractors=[MetadataExtractor(metadata=metadata)])
                     .explode()
                 )
@@ -529,7 +529,7 @@ def extract_properties(self, property_extractor: PropertyExtractor, **kwargs) ->
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partition=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .extract_properties(property_extractor)
         """
         from sycamore.transforms import ExtractProperties
@@ -550,7 +550,7 @@ def summarize(self, summarizer: Summarizer, **kwargs) -> "DocSet":
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .summarize(summarizer=summarizer)
         """
         from sycamore.transforms import Summarize
@@ -599,7 +599,7 @@ def merge(self, merger: ElementMerger, **kwargs) -> "DocSet":
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .merge(merger=merger)
         """
         from sycamore.transforms import Merge
@@ -617,7 +617,7 @@ def regex_replace(self, spec: list[tuple[str, str]], **kwargs) -> "DocSet":
                 from sycamore.transforms import COALESCE_WHITESPACE
 
                 ds = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .regex_replace(COALESCE_WHITESPACE)
                     .regex_replace([(r"\d+", "1313"), (r"old", "new")])
                     .explode()
@@ -642,7 +642,7 @@ def sketch(self, window: int = 17, number: int = 16, **kwargs) -> "DocSet":
             .. code-block:: python
 
                 ds = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .explode()
                     .sketch(window=17)
         """
@@ -662,7 +662,7 @@ def term_frequency(self, tokenizer: Tokenizer, with_token_ids: bool = False, **k
                 tk = OpenAITokenizer("gpt-3.5-turbo")
                 context = sycamore.init()
                 context.read.binary(paths, binary_format="pdf")
-                    .partition(SycamorePartitioner())
+                    .partition(ArynPartitioner())
                     .explode()
                     .term_frequency(tokenizer=tk)
                     .show()
@@ -687,7 +687,7 @@ def transform(self, cls: Type[Transform], **kwargs) -> "DocSet":
                 from sycamore.transforms import FooBar
 
                 ds = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .transform(cls=FooBar, arg=123)
         """
         plan = cls(self.plan, **kwargs)  # type: ignore
@@ -722,7 +722,7 @@ def custom_flat_mapping_function(document: Document) -> list[Document]:
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .flat_map(custom_flat_mapping_function)
 
         """
@@ -748,7 +748,7 @@ def custom_filter(doc: Document) -> bool:
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
                     .filter(custom_filter)
 
         """
@@ -913,7 +913,7 @@ def write(self) -> DocSetWriter:
 
         Example:
             The following example shows reading a DocSet from a collection of PDFs, partitioning
-            it using the ``SycamorePartitioner``, and then writing it to a new OpenSearch index.
+            it using the ``ArynPartitioner``, and then writing it to a new OpenSearch index.
 
             .. code-block:: python
 
@@ -941,7 +941,7 @@ def write(self) -> DocSetWriter:
 
                 context = sycamore.init()
                 pdf_docset = context.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner())
+                    .partition(partitioner=ArynPartitioner())
 
                 pdf.write.opensearch(
                     os_client_args=os_client_args,
diff --git a/lib/sycamore/sycamore/writer.py b/lib/sycamore/sycamore/writer.py
index dc78f61ee..22b9a4836 100644
--- a/lib/sycamore/sycamore/writer.py
+++ b/lib/sycamore/sycamore/writer.py
@@ -329,7 +329,7 @@ def pinecone(
                 ctx = sycamore.init()
                 ds = (
                     ctx.read.binary(paths, binary_format="pdf")
-                    .partition(partitioner=SycamorePartitioner(extract_table_structure=True, extract_images=True))
+                    .partition(partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True))
                     .explode()
                     .embed(embedder=SentenceTransformerEmbedder(model_name=model_name, batch_size=100))
                     .term_frequency(tokenizer=tokenizer, with_token_ids=True)