up (#1664)

* up * add ingestion settings too * up * up * up
SciPhi-AI · Dec 6, 2024 · 63a7917 · 63a7917
1 parent e9e156d
commit 63a7917
Show file tree

Hide file tree

Showing 16 changed files with 481 additions and 47 deletions.
diff --git a/js/sdk/src/v3/clients/documents.ts b/js/sdk/src/v3/clients/documents.ts
@@ -46,6 +46,7 @@ export class DocumentsClient {
     ingestionConfig?: Record<string, any>;
     collectionIds?: string[];
     runWithOrchestration?: boolean;
+    ingestionMode?: "hi-res" | "fast" | "custom";
   }): Promise<WrappedIngestionResponse> {
     const inputCount = [options.file, options.raw_text, options.chunks].filter(
       (x) => x !== undefined,
@@ -128,6 +129,9 @@ export class DocumentsClient {
         String(options.runWithOrchestration),
       );
     }
+    if (options.ingestionMode) {
+      formData.append("ingestion_mode", options.ingestionMode);
+    }
 
     formData.append("file_names", JSON.stringify(processedFiles));
 

diff --git a/js/sdk/src/v3/clients/retrieval.ts b/js/sdk/src/v3/clients/retrieval.ts
@@ -28,13 +28,17 @@ export class RetrievalClient {
   @feature("retrieval.search")
   async search(options: {
     query: string;
+    searchMode?: "advanced" | "basic" | "custom";
     searchSettings?: SearchSettings | Record<string, any>;
   }): Promise<WrappedSearchResponse> {
     const data = {
       query: options.query,
       ...(options.searchSettings && {
         search_settings: options.searchSettings,
       }),
+      ...(options.searchMode && {
+        search_mode: options.searchMode,
+      }),
     };
 
     return await this.client.makeRequest("POST", "retrieval/search", {
@@ -60,13 +64,17 @@ export class RetrievalClient {
   @feature("retrieval.rag")
   async rag(options: {
     query: string;
+    searchMode?: "advanced" | "basic" | "custom";
     searchSettings?: SearchSettings | Record<string, any>;
     ragGenerationConfig?: GenerationConfig | Record<string, any>;
     taskPromptOverride?: string;
     includeTitleIfAvailable?: boolean;
   }): Promise<any | AsyncGenerator<string, void, unknown>> {
     const data = {
       query: options.query,
+      ...(options.searchMode && {
+        search_mode: options.searchMode,
+      }),
       ...(options.searchSettings && {
         search_settings: options.searchSettings,
       }),
@@ -155,6 +163,7 @@ export class RetrievalClient {
   @feature("retrieval.agent")
   async agent(options: {
     message: Message;
+    searchMode?: "advanced" | "basic" | "custom";
     searchSettings?: SearchSettings | Record<string, any>;
     ragGenerationConfig?: GenerationConfig | Record<string, any>;
     taskPromptOverride?: string;
@@ -164,6 +173,9 @@ export class RetrievalClient {
   }): Promise<any | AsyncGenerator<string, void, unknown>> {
     const data: Record<string, any> = {
       message: options.message,
+      ...(options.searchMode && {
+        search_mode: options.searchMode,
+      }),
       ...(options.searchSettings && {
         search_settings: options.searchSettings,
       }),

diff --git a/py/core/__init__.py b/py/core/__init__.py
@@ -79,6 +79,7 @@
     "GraphSearchSettings",
     "ChunkSearchResult",
     "SearchSettings",
+    "SearchMode",
     "HybridSearchSettings",
     # User abstractions
     "Token",

diff --git a/py/core/base/__init__.py b/py/core/base/__init__.py
@@ -52,6 +52,7 @@
     "ChunkSearchSettings",
     "ChunkSearchResult",
     "SearchSettings",
+    "SearchMode",
     "HybridSearchSettings",
     # User abstractions
     "Token",
@@ -117,6 +118,7 @@
     "EmbeddingConfig",
     "EmbeddingProvider",
     # Ingestion provider
+    "IngestionMode",
     "IngestionConfig",
     "IngestionProvider",
     "ChunkingStrategy",

diff --git a/py/core/base/abstractions/__init__.py b/py/core/base/abstractions/__init__.py
@@ -62,6 +62,7 @@
     KGGlobalResult,
     KGRelationshipResult,
     KGSearchResultType,
+    SearchMode,
     SearchSettings,
     WebSearchResponse,
 )
@@ -133,6 +134,7 @@
     "ChunkSearchSettings",
     "ChunkSearchResult",
     "SearchSettings",
+    "SearchMode",
     "HybridSearchSettings",
     # KG abstractions
     "KGCreationSettings",

diff --git a/py/core/base/providers/__init__.py b/py/core/base/providers/__init__.py
@@ -18,7 +18,12 @@
 )
 from .email import EmailConfig, EmailProvider
 from .embedding import EmbeddingConfig, EmbeddingProvider
-from .ingestion import ChunkingStrategy, IngestionConfig, IngestionProvider
+from .ingestion import (
+    ChunkingStrategy,
+    IngestionConfig,
+    IngestionMode,
+    IngestionProvider,
+)
 from .llm import CompletionConfig, CompletionProvider
 from .orchestration import OrchestrationConfig, OrchestrationProvider, Workflow
 
@@ -31,6 +36,7 @@
     "Provider",
     "ProviderConfig",
     # Ingestion provider
+    "IngestionMode",
     "IngestionConfig",
     "IngestionProvider",
     "ChunkingStrategy",

diff --git a/py/core/base/providers/ingestion.py b/py/core/base/providers/ingestion.py
@@ -1,7 +1,6 @@
 import logging
 from abc import ABC
 from enum import Enum
-from typing import Optional
 
 from core.base.abstractions import ChunkEnrichmentSettings
 
@@ -34,6 +33,8 @@ class IngestionConfig(ProviderConfig):
     chunks_for_document_summary: int = 128
     document_summary_model: str = "openai/gpt-4o-mini"
 
+    parser_overrides: dict[str, str] = {}
+
     @property
     def supported_providers(self) -> list[str]:
         return ["r2r", "unstructured_local", "unstructured_api"]
@@ -42,6 +43,21 @@ def validate_config(self) -> None:
         if self.provider not in self.supported_providers:
             raise ValueError(f"Provider {self.provider} is not supported.")
 
+    @classmethod
+    def get_default(cls, mode: str, app) -> "IngestionConfig":
+        """Build the baseline ingestion configuration for `mode`.
+
+        Only `"hi-res"` is special-cased: it sets `parser_overrides` to
+        `{"pdf": "zerox"}`. Every other value (`"fast"`, `"custom"`, or any
+        unrecognized string) yields a config constructed from `app` alone.
+
+        Args:
+            mode: The ingestion mode name as a plain string (e.g. `"hi-res"`).
+            app: Application config forwarded unchanged to the constructor.
+
+        Returns:
+            A new `IngestionConfig` instance for the requested mode.
+        """
+        # NOTE: no dedicated `fast` preset exists yet (e.g. one that skips
+        # summaries or other enrichment); `fast` receives the base config.
+        mode_specific = (
+            {"parser_overrides": {"pdf": "zerox"}} if mode == "hi-res" else {}
+        )
+        return cls(app=app, **mode_specific)
+
 
 class IngestionProvider(Provider, ABC):
 
@@ -66,3 +82,9 @@ class ChunkingStrategy(str, Enum):
     CHARACTER = "character"
     BASIC = "basic"
     BY_TITLE = "by_title"
+
+
+class IngestionMode(str, Enum):
+    """String-valued enumeration of the named ingestion modes."""
+
+    # The value is hyphenated while the member name uses an underscore,
+    # since a hyphen is not valid in a Python identifier.
+    hi_res = "hi-res"
+    fast = "fast"
+    custom = "custom"
diff --git a/py/core/main/api/v3/documents_router.py b/py/core/main/api/v3/documents_router.py
@@ -12,6 +12,8 @@
 from pydantic import Json
 
 from core.base import (
+    IngestionConfig,
+    IngestionMode,
     R2RException,
     RunType,
     UnprocessedChunk,
@@ -44,6 +46,18 @@
 MAX_CHUNKS_PER_REQUEST = 1024 * 100
 
 
+def merge_ingestion_config(
+    base: IngestionConfig, overrides: IngestionConfig
+) -> IngestionConfig:
+    """Overlay the explicitly-set fields of `overrides` onto `base`.
+
+    The merge is shallow: each top-level field present in
+    `overrides.model_dump(exclude_unset=True)` replaces the corresponding
+    entry from `base.model_dump()` wholesale. Neither input is mutated; a
+    new `IngestionConfig` is constructed from the combined mapping.
+    """
+    combined = {
+        **base.model_dump(),
+        # Only fields set on `overrides` win over the base values.
+        **overrides.model_dump(exclude_unset=True),
+    }
+    return IngestionConfig(**combined)
+
+
 class DocumentsRouter(BaseRouterV3):
     def __init__(
         self,
@@ -106,6 +120,29 @@ def _register_workflows(self):
             },
         )
 
+    def _prepare_ingestion_config(
+        self,
+        ingestion_mode: IngestionMode,
+        ingestion_config: Optional[IngestionConfig],
+    ) -> IngestionConfig:
+        """Resolve the ingestion config to use for a request.
+
+        In `custom` mode the supplied `ingestion_config` is used as-is, or a
+        config built from the app settings when none was supplied. In any
+        other mode the mode's default config is the starting point, and a
+        supplied `ingestion_config` is merged on top of it.
+
+        The result is passed through `validate_config()` before being
+        returned.
+        """
+        if ingestion_mode == IngestionMode.custom:
+            # Caller takes full control; fall back to an app-only config.
+            resolved = ingestion_config or IngestionConfig(
+                app=self.providers.auth.config.app
+            )
+        else:
+            # Begin with the preset for this mode, then layer overrides.
+            resolved = IngestionConfig.get_default(
+                ingestion_mode.value, app=self.providers.auth.config.app
+            )
+            if ingestion_config:
+                resolved = merge_ingestion_config(resolved, ingestion_config)
+
+        resolved.validate_config()
+        return resolved
+
     def _setup_routes(self):
         @self.router.post(
             "/documents",
@@ -199,7 +236,18 @@ async def create_document(
                 None,
                 description="Metadata to associate with the document, such as title, description, or custom fields.",
             ),
-            ingestion_config: Optional[Json[dict]] = Form(
+            ingestion_mode: IngestionMode = Form(
+                default=IngestionMode.custom,
+                description=(
+                    "Ingestion modes:\n"
+                    "- `hi-res`: Thorough ingestion with full summaries and enrichment.\n"
+                    "- `fast`: Quick ingestion with minimal enrichment and no summaries.\n"
+                    "- `custom`: Full control via `ingestion_config`.\n\n"
+                    "If `filters` or `limit` (in `ingestion_config`) are provided alongside `hi-res` or `fast`, "
+                    "they will override the default settings for that mode."
+                ),
+            ),
+            ingestion_config: Optional[Json[IngestionConfig]] = Form(
                 None,
                 description="An optional dictionary to override the default chunking configuration for the ingestion process. If not provided, the system will use the default server-side chunking configuration.",
             ),
@@ -210,14 +258,23 @@ async def create_document(
             auth_user=Depends(self.providers.auth.auth_wrapper),
         ) -> WrappedIngestionResponse:
             """
-            Creates a new Document object from an input file or text content. The document will be processed
-            to create chunks for vector indexing and search.
+            Creates a new Document object from an input file, text content, or chunks. The chosen `ingestion_mode` determines
+            how the ingestion process is configured:
+
+            **Ingestion Modes:**
+            - `hi-res`: Comprehensive parsing and enrichment, including summaries and possibly more thorough parsing.
+            - `fast`: Speed-focused ingestion that skips certain enrichment steps like summaries.
+            - `custom`: Provide a full `ingestion_config` to customize the entire ingestion process.
 
             Either a file or text content must be provided, but not both. Documents are shared through `Collections` which allow for tightly specified cross-user interactions.
 
             The ingestion process runs asynchronously and its progress can be tracked using the returned
             task_id.
             """
+            effective_ingestion_config = self._prepare_ingestion_config(
+                ingestion_mode=ingestion_mode,
+                ingestion_config=ingestion_config,
+            )
             if not file and not raw_text and not chunks:
                 raise R2RException(
                     status_code=422,
@@ -275,6 +332,7 @@ async def create_document(
                     ],
                     "metadata": metadata,  # Base metadata for the document
                     "user": auth_user.model_dump_json(),
+                    "ingestion_config": effective_ingestion_config.model_dump(),
                 }
 
                 # TODO - Modify create_chunks so that we can add chunks to existing document
@@ -347,7 +405,7 @@ async def create_document(
                 "document_id": str(document_id),
                 "collection_ids": collection_ids,
                 "metadata": metadata,
-                "ingestion_config": ingestion_config,
+                "ingestion_config": effective_ingestion_config.model_dump(),
                 "user": auth_user.model_dump_json(),
                 "size_in_bytes": content_length,
                 "is_update": False,