Skip to content

Commit

Permalink
up (#1664)
Browse files Browse the repository at this point in the history
* up

* add ingestion settings too

* up

* up

* up
  • Loading branch information
emrgnt-cmplxty authored Dec 6, 2024
1 parent e9e156d commit 63a7917
Show file tree
Hide file tree
Showing 16 changed files with 481 additions and 47 deletions.
4 changes: 4 additions & 0 deletions js/sdk/src/v3/clients/documents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ export class DocumentsClient {
ingestionConfig?: Record<string, any>;
collectionIds?: string[];
runWithOrchestration?: boolean;
ingestionMode?: "hi-res" | "fast" | "custom";
}): Promise<WrappedIngestionResponse> {
const inputCount = [options.file, options.raw_text, options.chunks].filter(
(x) => x !== undefined,
Expand Down Expand Up @@ -128,6 +129,9 @@ export class DocumentsClient {
String(options.runWithOrchestration),
);
}
if (options.ingestionMode) {
formData.append("ingestion_mode", options.ingestionMode);
}

formData.append("file_names", JSON.stringify(processedFiles));

Expand Down
12 changes: 12 additions & 0 deletions js/sdk/src/v3/clients/retrieval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,17 @@ export class RetrievalClient {
@feature("retrieval.search")
async search(options: {
query: string;
searchMode?: "advanced" | "basic" | "custom";
searchSettings?: SearchSettings | Record<string, any>;
}): Promise<WrappedSearchResponse> {
const data = {
query: options.query,
...(options.searchSettings && {
search_settings: options.searchSettings,
}),
...(options.searchMode && {
search_mode: options.searchMode,
}),
};

return await this.client.makeRequest("POST", "retrieval/search", {
Expand All @@ -60,13 +64,17 @@ export class RetrievalClient {
@feature("retrieval.rag")
async rag(options: {
query: string;
searchMode?: "advanced" | "basic" | "custom";
searchSettings?: SearchSettings | Record<string, any>;
ragGenerationConfig?: GenerationConfig | Record<string, any>;
taskPromptOverride?: string;
includeTitleIfAvailable?: boolean;
}): Promise<any | AsyncGenerator<string, void, unknown>> {
const data = {
query: options.query,
...(options.searchMode && {
search_mode: options.searchMode,
}),
...(options.searchSettings && {
search_settings: options.searchSettings,
}),
Expand Down Expand Up @@ -155,6 +163,7 @@ export class RetrievalClient {
@feature("retrieval.agent")
async agent(options: {
message: Message;
searchMode?: "advanced" | "basic" | "custom";
searchSettings?: SearchSettings | Record<string, any>;
ragGenerationConfig?: GenerationConfig | Record<string, any>;
taskPromptOverride?: string;
Expand All @@ -164,6 +173,9 @@ export class RetrievalClient {
}): Promise<any | AsyncGenerator<string, void, unknown>> {
const data: Record<string, any> = {
message: options.message,
...(options.searchMode && {
search_mode: options.searchMode,
}),
...(options.searchSettings && {
search_settings: options.searchSettings,
}),
Expand Down
1 change: 1 addition & 0 deletions py/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
"GraphSearchSettings",
"ChunkSearchResult",
"SearchSettings",
"SearchMode",
"HybridSearchSettings",
# User abstractions
"Token",
Expand Down
2 changes: 2 additions & 0 deletions py/core/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
"ChunkSearchSettings",
"ChunkSearchResult",
"SearchSettings",
"SearchMode",
"HybridSearchSettings",
# User abstractions
"Token",
Expand Down Expand Up @@ -117,6 +118,7 @@
"EmbeddingConfig",
"EmbeddingProvider",
# Ingestion provider
"IngestionMode",
"IngestionConfig",
"IngestionProvider",
"ChunkingStrategy",
Expand Down
2 changes: 2 additions & 0 deletions py/core/base/abstractions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
KGGlobalResult,
KGRelationshipResult,
KGSearchResultType,
SearchMode,
SearchSettings,
WebSearchResponse,
)
Expand Down Expand Up @@ -133,6 +134,7 @@
"ChunkSearchSettings",
"ChunkSearchResult",
"SearchSettings",
"SearchMode",
"HybridSearchSettings",
# KG abstractions
"KGCreationSettings",
Expand Down
8 changes: 7 additions & 1 deletion py/core/base/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@
)
from .email import EmailConfig, EmailProvider
from .embedding import EmbeddingConfig, EmbeddingProvider
from .ingestion import ChunkingStrategy, IngestionConfig, IngestionProvider
from .ingestion import (
ChunkingStrategy,
IngestionConfig,
IngestionMode,
IngestionProvider,
)
from .llm import CompletionConfig, CompletionProvider
from .orchestration import OrchestrationConfig, OrchestrationProvider, Workflow

Expand All @@ -31,6 +36,7 @@
"Provider",
"ProviderConfig",
# Ingestion provider
"IngestionMode",
"IngestionConfig",
"IngestionProvider",
"ChunkingStrategy",
Expand Down
24 changes: 23 additions & 1 deletion py/core/base/providers/ingestion.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
from abc import ABC
from enum import Enum
from typing import Optional

from core.base.abstractions import ChunkEnrichmentSettings

Expand Down Expand Up @@ -34,6 +33,8 @@ class IngestionConfig(ProviderConfig):
chunks_for_document_summary: int = 128
document_summary_model: str = "openai/gpt-4o-mini"

parser_overrides: dict[str, str] = {}

@property
def supported_providers(self) -> list[str]:
    """Return the provider names this ingestion configuration accepts.

    A fresh list is built on every access, so callers may mutate the
    result without affecting later lookups.
    """
    provider_names = ("r2r", "unstructured_local", "unstructured_api")
    return list(provider_names)
Expand All @@ -42,6 +43,21 @@ def validate_config(self) -> None:
if self.provider not in self.supported_providers:
raise ValueError(f"Provider {self.provider} is not supported.")

@classmethod
def get_default(cls, mode: str, app) -> "IngestionConfig":
    """Build the baseline ingestion configuration for an ingestion mode.

    Args:
        mode: Ingestion mode value. Only ``"hi-res"`` is special-cased.
        app: Application configuration, forwarded unchanged as ``app``.

    Returns:
        A new instance of ``cls`` carrying the defaults for ``mode``.
    """
    init_kwargs = {"app": app}

    if mode == "hi-res":
        # hi-res sets the `pdf` parser override to `zerox`.
        init_kwargs["parser_overrides"] = {"pdf": "zerox"}

    # `fast`, `custom`, and any unrecognized mode currently share the base
    # configuration; no `fast`-specific defaults are defined here yet.
    return cls(**init_kwargs)


class IngestionProvider(Provider, ABC):

Expand All @@ -66,3 +82,9 @@ class ChunkingStrategy(str, Enum):
CHARACTER = "character"
BASIC = "basic"
BY_TITLE = "by_title"


class IngestionMode(str, Enum):
    """String-valued enumeration of the selectable ingestion modes."""

    # The value is hyphenated while the Python identifier uses an underscore.
    hi_res = "hi-res"
    fast = "fast"
    custom = "custom"
66 changes: 62 additions & 4 deletions py/core/main/api/v3/documents_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from pydantic import Json

from core.base import (
IngestionConfig,
IngestionMode,
R2RException,
RunType,
UnprocessedChunk,
Expand Down Expand Up @@ -44,6 +46,18 @@
MAX_CHUNKS_PER_REQUEST = 1024 * 100


def merge_ingestion_config(
    base: IngestionConfig, overrides: IngestionConfig
) -> IngestionConfig:
    """Layer explicitly-set override fields on top of a base configuration.

    Args:
        base: Configuration that supplies the starting value of every field.
        overrides: Configuration whose explicitly set fields replace the
            matching top-level entries taken from ``base``.

    Returns:
        A new ``IngestionConfig`` built from the merged field dictionary.
    """
    merged_fields = base.model_dump()
    # `exclude_unset=True` drops fields the caller never set, so they do
    # not overwrite the values taken from `base`.
    merged_fields.update(overrides.model_dump(exclude_unset=True))
    return IngestionConfig(**merged_fields)


class DocumentsRouter(BaseRouterV3):
def __init__(
self,
Expand Down Expand Up @@ -106,6 +120,29 @@ def _register_workflows(self):
},
)

def _prepare_ingestion_config(
    self,
    ingestion_mode: IngestionMode,
    ingestion_config: Optional[IngestionConfig],
) -> IngestionConfig:
    """Resolve the ingestion configuration to use for a request.

    Args:
        ingestion_mode: Mode selected by the caller.
        ingestion_config: Optional caller-supplied configuration.

    Returns:
        The resolved configuration, after ``validate_config()`` has been
        called on it.
    """
    if ingestion_mode == IngestionMode.custom:
        # Custom mode: use the caller's configuration as-is, falling back
        # to a plain configuration bound to the application settings.
        resolved = (
            ingestion_config
            if ingestion_config
            else IngestionConfig(app=self.providers.auth.config.app)
        )
    else:
        # Preset mode: start from the mode's defaults, then apply any
        # caller-supplied configuration on top of them.
        resolved = IngestionConfig.get_default(
            ingestion_mode.value, app=self.providers.auth.config.app
        )
        if ingestion_config:
            resolved = merge_ingestion_config(resolved, ingestion_config)

    resolved.validate_config()
    return resolved

def _setup_routes(self):
@self.router.post(
"/documents",
Expand Down Expand Up @@ -199,7 +236,18 @@ async def create_document(
None,
description="Metadata to associate with the document, such as title, description, or custom fields.",
),
ingestion_config: Optional[Json[dict]] = Form(
ingestion_mode: IngestionMode = Form(
default=IngestionMode.custom,
description=(
"Ingestion modes:\n"
"- `hi-res`: Thorough ingestion with full summaries and enrichment.\n"
"- `fast`: Quick ingestion with minimal enrichment and no summaries.\n"
"- `custom`: Full control via `ingestion_config`.\n\n"
"If `filters` or `limit` (in `ingestion_config`) are provided alongside `hi-res` or `fast`, "
"they will override the default settings for that mode."
),
),
ingestion_config: Optional[Json[IngestionConfig]] = Form(
None,
description="An optional dictionary to override the default chunking configuration for the ingestion process. If not provided, the system will use the default server-side chunking configuration.",
),
Expand All @@ -210,14 +258,23 @@ async def create_document(
auth_user=Depends(self.providers.auth.auth_wrapper),
) -> WrappedIngestionResponse:
"""
Creates a new Document object from an input file or text content. The document will be processed
to create chunks for vector indexing and search.
Creates a new Document object from an input file, text content, or chunks. The chosen `ingestion_mode` determines
how the ingestion process is configured:
**Ingestion Modes:**
- `hi-res`: Comprehensive parsing and enrichment, including summaries and possibly more thorough parsing.
- `fast`: Speed-focused ingestion that skips certain enrichment steps like summaries.
- `custom`: Provide a full `ingestion_config` to customize the entire ingestion process.
Either a file or text content must be provided, but not both. Documents are shared through `Collections` which allow for tightly specified cross-user interactions.
The ingestion process runs asynchronously and its progress can be tracked using the returned
task_id.
"""
effective_ingestion_config = self._prepare_ingestion_config(
ingestion_mode=ingestion_mode,
ingestion_config=ingestion_config,
)
if not file and not raw_text and not chunks:
raise R2RException(
status_code=422,
Expand Down Expand Up @@ -275,6 +332,7 @@ async def create_document(
],
"metadata": metadata, # Base metadata for the document
"user": auth_user.model_dump_json(),
"ingestion_config": effective_ingestion_config.model_dump(),
}

# TODO - Modify create_chunks so that we can add chunks to existing document
Expand Down Expand Up @@ -347,7 +405,7 @@ async def create_document(
"document_id": str(document_id),
"collection_ids": collection_ids,
"metadata": metadata,
"ingestion_config": ingestion_config,
"ingestion_config": effective_ingestion_config.model_dump(),
"user": auth_user.model_dump_json(),
"size_in_bytes": content_length,
"is_update": False,
Expand Down
Loading

0 comments on commit 63a7917

Please sign in to comment.