Feature/fix zerox ingestion (#1659)
* fix zerox ingestion

* up

* rm cruft

* fix bugs

* fix llm formatting for rag reply
emrgnt-cmplxty authored Dec 5, 2024
1 parent e407e87 commit 01659ea
Showing 13 changed files with 257 additions and 92 deletions.
4 changes: 2 additions & 2 deletions py/core/base/providers/orchestration.py
@@ -14,8 +14,8 @@ class OrchestrationConfig(ProviderConfig):
     provider: str
     max_runs: int = 2_048
     kg_creation_concurrency_limit: int = 32
-    ingestion_concurrency_limit: int = 64
-    kg_enrichment_concurrency_limit: int = 8
+    ingestion_concurrency_limit: int = 16
+    kg_concurrency_limit: int = 4

     def validate_config(self) -> None:
         if self.provider not in self.supported_providers:
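
For context, a minimal, self-contained sketch (not from this commit) of what a limit like ingestion_concurrency_limit typically gates: a semaphore capping how many ingestion tasks run at once. The task body and counts here are illustrative assumptions.

import asyncio

INGESTION_CONCURRENCY_LIMIT = 16  # new default from the diff above (was 64)

async def ingest(doc_id: int, sem: asyncio.Semaphore) -> None:
    async with sem:  # at most 16 ingestions may hold the semaphore at once
        await asyncio.sleep(0.01)  # stand-in for real parse/embed work

async def main() -> None:
    sem = asyncio.Semaphore(INGESTION_CONCURRENCY_LIMIT)
    await asyncio.gather(*(ingest(i, sem) for i in range(100)))

asyncio.run(main())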
18 changes: 11 additions & 7 deletions py/core/configs/full.toml
@@ -1,17 +1,21 @@
 [completion]
 provider = "litellm"
+concurrent_request_limit = 128

 [ingestion]
 provider = "unstructured_local"
 strategy = "auto"
 chunking_strategy = "by_title"
-new_after_n_chars = 512
-max_characters = 1_024
-combine_under_n_chars = 128
-overlap = 256
+new_after_n_chars = 2_048
+max_characters = 4_096
+combine_under_n_chars = 1_024
+overlap = 1_024
+
+[ingestion.extra_parsers]
+pdf = "zerox"

 [orchestration]
 provider = "hatchet"
-kg_creation_concurrency_lipmit = 32
-ingestion_concurrency_limit = 128
-kg_enrichment_concurrency_limit = 8
+kg_creation_concurrency_limit = 32
+ingestion_concurrency_limit = 16
+kg_concurrency_limit = 8
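
A quick way to sanity-check the new keys, as a sketch (assumes you run it from the repo root; tomllib requires Python 3.11+):

import tomllib  # stdlib TOML parser, Python 3.11+

with open("py/core/configs/full.toml", "rb") as f:
    cfg = tomllib.load(f)

# After this commit, PDFs can be routed through the `zerox` parser:
assert cfg["ingestion"]["extra_parsers"]["pdf"] == "zerox"
print(cfg["orchestration"]["ingestion_concurrency_limit"])  # 16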
18 changes: 11 additions & 7 deletions py/core/configs/full_azure.toml
@@ -1,5 +1,8 @@
 # A config which overrides all instances of `openai` with `azure` in the `r2r.toml` config
+[completion]
+provider = "litellm"
+concurrent_request_limit = 128

 [completion.generation_config]
 model = "azure/gpt-4o"
@@ -25,6 +28,7 @@ batch_size = 256
 [embedding]
 provider = "litellm"
 base_model = "azure/text-embedding-3-small"
+base_dimension = 512

 [file]
 provider = "postgres"
@@ -33,10 +37,10 @@ provider = "postgres"
 provider = "unstructured_local"
 strategy = "auto"
 chunking_strategy = "by_title"
-new_after_n_chars = 512
-max_characters = 1_024
-combine_under_n_chars = 128
-overlap = 256
+new_after_n_chars = 2_048
+max_characters = 4_096
+combine_under_n_chars = 1_024
+overlap = 1_024
 document_summary_model = "azure/gpt-4o-mini"
 vision_img_model = "azure/gpt-4o"
 vision_pdf_model = "azure/gpt-4o"
@@ -49,6 +53,6 @@ vision_pdf_model = "azure/gpt-4o"

 [orchestration]
 provider = "hatchet"
-kg_creation_concurrency_lipmit = 32
-ingestion_concurrency_limit = 128
-kg_enrichment_concurrency_limit = 8
+kg_creation_concurrency_limit = 32
+ingestion_concurrency_limit = 4
+kg_concurrency_limit = 8
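
The new base_dimension = 512 pairs with text-embedding-3-small, which supports shortened embeddings. A hedged sketch of requesting 512-dimensional vectors through litellm (the dimensions argument and env-based Azure credentials are assumptions, not taken from this commit):

from litellm import embedding  # assumes `pip install litellm` and Azure env vars set

resp = embedding(
    model="azure/text-embedding-3-small",
    input=["hello world"],
    dimensions=512,  # matches base_dimension = 512 in the config above
)
print(len(resp.data[0]["embedding"]))  # expect 512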
3 changes: 2 additions & 1 deletion py/core/configs/r2r_azure.toml
@@ -24,7 +24,8 @@ batch_size = 256

 [embedding]
 provider = "litellm"
-base_model = "openai/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
+base_model = "azure/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
+base_dimension = 512

 [file]
 provider = "postgres"
10 changes: 5 additions & 5 deletions py/core/main/api/v3/retrieval_router.py
@@ -104,7 +104,7 @@ def _setup_routes(self):
     filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
     use_semantic_search: true,
     chunk_settings: {
-        limit: 20, // separate limit for chunk vs. graph
+        limit: 20, # separate limit for chunk vs. graph
         enabled: true
     },
     graph_settings: {
@@ -130,7 +130,7 @@ def _setup_routes(self):
     filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
     use_semantic_search: true,
     chunk_settings: {
-        limit: 20, // separate limit for chunk vs. graph
+        limit: 20, # separate limit for chunk vs. graph
         enabled: true
     },
     graph_settings: {
@@ -165,7 +165,7 @@ def _setup_routes(self):
     filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
     use_semantic_search: true,
     chunk_settings: {
-        limit: 20, // separate limit for chunk vs. graph
+        limit: 20, # separate limit for chunk vs. graph
         enabled: true
     },
     graph_settings: {
@@ -261,7 +261,7 @@ async def search_app(
     filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
     use_semantic_search: true,
     chunk_settings: {
-        limit: 20, // separate limit for chunk vs. graph
+        limit: 20, # separate limit for chunk vs. graph
         enabled: true
     },
     graph_settings: {
@@ -435,7 +435,7 @@ async def stream_generator():
     filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
     use_semantic_search: true,
     chunk_settings: {
-        limit: 20, // separate limit for chunk vs. graph
+        limit: 20, # separate limit for chunk vs. graph
         enabled: true
     },
     graph_settings: {
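
The corrected docstring snippets describe the search payload. A hypothetical client-side call using those settings (the endpoint URL, port, and query text are assumptions, not taken from this commit):

import requests

payload = {
    "query": "What is the main topic?",
    "search_settings": {
        "filters": {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
        "use_semantic_search": True,
        "chunk_settings": {"limit": 20, "enabled": True},  # separate limit for chunk vs. graph
        "graph_settings": {"enabled": True},
    },
}
resp = requests.post("http://localhost:7272/v3/retrieval/search", json=payload)
print(resp.json())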
2 changes: 1 addition & 1 deletion py/core/main/orchestration/hatchet/ingestion_workflow.py
@@ -56,7 +56,7 @@ def concurrency(self, context: Context) -> str:
         except Exception as e:
             return str(uuid.uuid4())

-    @orchestration_provider.step(timeout="60m")
+    @orchestration_provider.step(retries=0, timeout="60m")
     async def parse(self, context: Context) -> dict:
         try:
             logger.info("Initiating ingestion workflow, step: parse")
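
Setting retries=0 makes a failed parse surface immediately instead of being re-queued. A minimal sketch of the decorator pattern, assuming the Hatchet Python SDK; the workflow and step names here are illustrative, not from this commit:

from hatchet_sdk import Context, Hatchet

hatchet = Hatchet()

@hatchet.workflow(name="ingest-files")
class IngestWorkflow:
    # retries=0: fail fast; timeout="60m": allow long-running parses
    @hatchet.step(retries=0, timeout="60m")
    async def parse(self, context: Context) -> dict:
        return {"status": "parsed"}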