Skip to content

Commit 01659ea

Browse files
Feature/fix zerox ingestion (#1659)
* fix zerox ing * up * rm cruft * fix bugs * fix llm formatting for rag reply
1 parent e407e87 commit 01659ea

File tree

13 files changed

+257
-92
lines changed

13 files changed

+257
-92
lines changed

py/core/base/providers/orchestration.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,8 @@ class OrchestrationConfig(ProviderConfig):
1414
provider: str
1515
max_runs: int = 2_048
1616
kg_creation_concurrency_limit: int = 32
17-
ingestion_concurrency_limit: int = 64
18-
kg_enrichment_concurrency_limit: int = 8
17+
ingestion_concurrency_limit: int = 16
18+
kg_concurrency_limit: int = 4
1919

2020
def validate_config(self) -> None:
2121
if self.provider not in self.supported_providers:

py/core/configs/full.toml

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,21 @@
1+
[completion]
2+
provider = "litellm"
3+
concurrent_request_limit = 128
4+
15
[ingestion]
26
provider = "unstructured_local"
37
strategy = "auto"
48
chunking_strategy = "by_title"
5-
new_after_n_chars = 512
6-
max_characters = 1_024
7-
combine_under_n_chars = 128
8-
overlap = 256
9+
new_after_n_chars = 2_048
10+
max_characters = 4_096
11+
combine_under_n_chars = 1_024
12+
overlap = 1_024
913

1014
[ingestion.extra_parsers]
1115
pdf = "zerox"
1216

1317
[orchestration]
1418
provider = "hatchet"
15-
kg_creation_concurrency_lipmit = 32
16-
ingestion_concurrency_limit = 128
17-
kg_enrichment_concurrency_limit = 8
19+
kg_creation_concurrency_limit = 32
20+
ingestion_concurrency_limit = 16
21+
kg_concurrency_limit = 8

py/core/configs/full_azure.toml

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,8 @@
11
# A config which overrides all instances of `openai` with `azure` in the `r2r.toml` config
22
[completion]
3+
provider = "litellm"
4+
concurrent_request_limit = 128
5+
36
[completion.generation_config]
47
model = "azure/gpt-4o"
58

@@ -25,6 +28,7 @@ batch_size = 256
2528
[embedding]
2629
provider = "litellm"
2730
base_model = "azure/text-embedding-3-small"
31+
base_dimension = 512
2832

2933
[file]
3034
provider = "postgres"
@@ -33,10 +37,10 @@ provider = "postgres"
3337
provider = "unstructured_local"
3438
strategy = "auto"
3539
chunking_strategy = "by_title"
36-
new_after_n_chars = 512
37-
max_characters = 1_024
38-
combine_under_n_chars = 128
39-
overlap = 256
40+
new_after_n_chars = 2_048
41+
max_characters = 4_096
42+
combine_under_n_chars = 1_024
43+
overlap = 1_024
4044
document_summary_model = "azure/gpt-4o-mini"
4145
vision_img_model = "azure/gpt-4o"
4246
vision_pdf_model = "azure/gpt-4o"
@@ -49,6 +53,6 @@ vision_pdf_model = "azure/gpt-4o"
4953

5054
[orchestration]
5155
provider = "hatchet"
52-
kg_creation_concurrency_lipmit = 32
53-
ingestion_concurrency_limit = 128
54-
kg_enrichment_concurrency_limit = 8
56+
kg_creation_concurrency_limit = 32
57+
ingestion_concurrency_limit = 4
58+
kg_concurrency_limit = 8

py/core/configs/r2r_azure.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,8 @@ batch_size = 256
2424

2525
[embedding]
2626
provider = "litellm"
27-
base_model = "openai/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
27+
base_model = "azure/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
28+
base_dimension = 512
2829

2930
[file]
3031
provider = "postgres"

py/core/main/api/v3/retrieval_router.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ def _setup_routes(self):
104104
filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
105105
use_semantic_search: true,
106106
chunk_settings: {
107-
limit: 20, // separate limit for chunk vs. graph
107+
limit: 20, # separate limit for chunk vs. graph
108108
enabled: true
109109
},
110110
graph_settings: {
@@ -130,7 +130,7 @@ def _setup_routes(self):
130130
filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
131131
use_semantic_search: true,
132132
chunk_settings: {
133-
limit: 20, // separate limit for chunk vs. graph
133+
limit: 20, # separate limit for chunk vs. graph
134134
enabled: true
135135
},
136136
graph_settings: {
@@ -165,7 +165,7 @@ def _setup_routes(self):
165165
filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
166166
use_semantic_search: true,
167167
chunk_settings: {
168-
limit: 20, // separate limit for chunk vs. graph
168+
limit: 20, # separate limit for chunk vs. graph
169169
enabled: true
170170
},
171171
graph_settings: {
@@ -261,7 +261,7 @@ async def search_app(
261261
filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
262262
use_semantic_search: true,
263263
chunk_settings: {
264-
limit: 20, // separate limit for chunk vs. graph
264+
limit: 20, # separate limit for chunk vs. graph
265265
enabled: true
266266
},
267267
graph_settings: {
@@ -435,7 +435,7 @@ async def stream_generator():
435435
filters: {"document_id": {"$eq": "3e157b3a-8469-51db-90d9-52e7d896b49b"}},
436436
use_semantic_search: true,
437437
chunk_settings: {
438-
limit: 20, // separate limit for chunk vs. graph
438+
limit: 20, # separate limit for chunk vs. graph
439439
enabled: true
440440
},
441441
graph_settings: {

py/core/main/orchestration/hatchet/ingestion_workflow.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ def concurrency(self, context: Context) -> str:
5656
except Exception as e:
5757
return str(uuid.uuid4())
5858

59-
@orchestration_provider.step(timeout="60m")
59+
@orchestration_provider.step(retries=0, timeout="60m")
6060
async def parse(self, context: Context) -> dict:
6161
try:
6262
logger.info("Initiating ingestion workflow, step: parse")

0 commit comments

Comments (0)