fix integration suite

emrgnt-cmplxty · emrgnt-cmplxty · commit 453623d64237 · 2024-10-04T15:23:52.000-07:00
diff --git a/.github/workflows/integration-test-workflow-debian.yml b/.github/workflows/integration-test-workflow-debian.yml
@@ -78,8 +78,9 @@ jobs:
     - name: Run integration tests - Ingestion
       working-directory: ./py
       run: |
-        poetry run python tests/integration/runner.py test_ingest_sample_files_cli
-        poetry run python tests/integration/runner.py test_document_ingestion_cli
+        poetry run python tests/integration/runner.py test_ingest_sample_file_cli
+        poetry run python tests/integration/runner.py test_document_overview_sample_file_cli
+        poetry run python tests/integration/runner.py test_vector_search_sample_file_filter_cli
 
     - name: Stop R2R server
       if: always()
diff --git a/py/core/base/api/models/__init__.py b/py/core/base/api/models/__init__.py
@@ -53,10 +53,10 @@
     RAGAgentResponse,
     RAGResponse,
     SearchResponse,
+    WrappedCompletionResponse,
     WrappedRAGAgentResponse,
     WrappedRAGResponse,
     WrappedSearchResponse,
-    WrappedCompletionResponse,
 )
 
 __all__ = [
diff --git a/py/core/base/providers/kg.py b/py/core/base/providers/kg.py
@@ -89,7 +89,7 @@ async def get_entities(
         limit: int,
         entity_ids: list[str] | None = None,
         with_description: bool = False,
-    ) -> list[Entity]:
+    ) -> dict:
         """Abstract method to get entities."""
         pass
 
@@ -100,7 +100,7 @@ async def get_triples(
         offset: int,
         limit: int,
         triple_ids: list[str] | None = None,
-    ) -> list[Triple]:
+    ) -> dict:
         """Abstract method to get triples."""
         pass
 
diff --git a/py/core/main/api/ingestion_router.py b/py/core/main/api/ingestion_router.py
@@ -2,7 +2,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 from uuid import UUID
 
 import yaml
@@ -118,7 +118,7 @@ async def ingest_files_app(
 
             file_datas = await self._process_files(files)
 
-            messages = []
+            messages: list[dict[str, Union[str, None]]] = []
             for it, file_data in enumerate(file_datas):
                 content_length = len(file_data["content"])
                 file_content = BytesIO(base64.b64decode(file_data["content"]))
@@ -149,7 +149,7 @@ async def ingest_files_app(
                     file_content,
                     file_data["content_type"],
                 )
-                raw_message = await self.orchestration_provider.run_workflow(
+                raw_message: dict[str, Union[str, None]] = await self.orchestration_provider.run_workflow( # type: ignore
                     "ingest-files",
                     {"request": workflow_input},
                     options={
@@ -159,9 +159,10 @@ async def ingest_files_app(
                     },
                 )
                 raw_message["document_id"] = str(document_id)
+                if "task_id" not in raw_message:
+                    raw_message["task_id"] = None
                 messages.append(raw_message)
-
-            return messages
+            return messages  # type: ignore
 
         update_files_extras = self.openapi_extras.get("update_files", {})
         update_files_descriptions = update_files_extras.get(
@@ -188,7 +189,7 @@ async def update_files_app(
                 description=ingest_files_descriptions.get("ingestion_config"),
             ),
             auth_user=Depends(self.service.providers.auth.auth_wrapper),
-        ) -> WrappedUpdateResponse:  # type: ignore
+        ) -> WrappedUpdateResponse:
             """
             Update existing files in the system.
 
@@ -255,7 +256,7 @@ async def update_files_app(
             )
             raw_message["message"] = "Update task queued successfully."
             raw_message["document_ids"] = workflow_input["document_ids"]
-            return raw_message
+            return raw_message  # type: ignore
 
         ingest_chunks_extras = self.openapi_extras.get("ingest_chunks", {})
         ingest_chunks_descriptions = ingest_chunks_extras.get(
@@ -278,7 +279,7 @@ async def ingest_chunks_app(
                 None, description=ingest_files_descriptions.get("metadata")
             ),
             auth_user=Depends(self.service.providers.auth.auth_wrapper),
-        ) -> WrappedIngestionResponse:  # type: ignore
+        ) -> WrappedIngestionResponse:
             """
             Ingest text chunks into the system.
 
@@ -308,7 +309,7 @@ async def ingest_chunks_app(
                 },
             )
             raw_message["document_id"] = str(document_id)
-            return raw_message
+            return raw_message  # type: ignore
 
     @staticmethod
     def _validate_ingestion_config(ingestion_config):
diff --git a/py/core/providers/kg/postgres.py b/py/core/providers/kg/postgres.py
@@ -16,11 +16,7 @@
     KGProvider,
     Triple,
 )
-from shared.abstractions import (
-    KGCreationSettings,
-    KGEnrichmentSettings,
-    KGRunType,
-)
+from shared.abstractions import KGCreationSettings, KGEnrichmentSettings
 from shared.api.models.kg.responses import (
     KGCreationEstimationResponse,
     KGEnrichmentEstimationResponse,
@@ -923,7 +919,7 @@ async def get_entities(
         with_description: bool = False,
     ) -> dict:
         conditions = []
-        params = [collection_id]
+        params: list = [collection_id]
 
         if entity_ids:
             conditions.append(f"id = ANY(${len(params) + 1})")
diff --git a/py/shared/api/models/ingestion/responses.py b/py/shared/api/models/ingestion/responses.py
@@ -1,4 +1,4 @@
-from typing import TypeVar
+from typing import Optional, TypeVar
 from uuid import UUID
 
 from pydantic import BaseModel, Field
@@ -13,7 +13,7 @@ class IngestionResponse(BaseModel):
         ...,
         description="A message describing the result of the ingestion request.",
     )
-    task_id: UUID = Field(
+    task_id: Optional[UUID] = Field(
         ...,
         description="The task ID of the ingestion request.",
     )
diff --git a/py/tests/integration/runner.py b/py/tests/integration/runner.py
@@ -16,46 +16,87 @@ def run_command(command):
     return result.stdout
 
 
-def test_ingest_sample_files_cli():
-    print("Testing: Ingest sample files")
-    run_command("poetry run r2r ingest-sample-files")
+def test_ingest_sample_file_cli():
+    print("Testing: Ingest sample file CLI")
+    run_command("poetry run r2r ingest-sample-file")
     print("Ingestion successful")
 
 
-def test_document_ingestion_cli():
-    print("Testing: Document ingestion")
+def test_document_overview_sample_file_cli():
+    print("Testing: Document overview contains 'aristotle.txt'")
     output = run_command("poetry run r2r documents-overview")
-    documents = json.loads(output)
+    output = output.replace("'", '"')
+    output_lines = output.strip().split('\n')[1:]
+    documents = [json.loads(ele) for ele in output_lines]
 
-    expected_document = {
-        "id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
+    aristotle_document = {
         "title": "aristotle.txt",
         "type": "txt",
         "ingestion_status": "success",
-        "kg_extraction_status": "success",
+        "kg_extraction_status": "pending",
         "version": "v0",
         "metadata": {"title": "aristotle.txt", "version": "v0"},
     }
 
+    # Check if any document in the overview matches the Aristotle document
     if not any(
-        all(doc.get(k) == v for k, v in expected_document.items())
+        all(doc.get(k) == v for k, v in aristotle_document.items())
         for doc in documents
     ):
-        print("Document ingestion test failed")
-        print(f"Expected document not found in output: {output}")
+        print("Document overview test failed")
+        print("Aristotle document not found in the overview")
         sys.exit(1)
-    print("Document ingestion test passed")
+    print("Document overview test passed")
 
-
-def test_vector_search_cli():
+def test_vector_search_sample_file_filter_cli():
     print("Testing: Vector search")
     output = run_command(
-        "poetry run r2r search --query='What was Uber's profit in 2020?'"
+        """poetry run r2r search --query="Who was aristotle?" """
     )
-    results = json.loads(output)
-    if not results.get("results"):
+    # Split the output into lines and remove the first and last lines
+    output_lines = output.strip().split('\n')[1:-1]
+    # Replace single quotes with double quotes in each line
+    cleaned_output_lines = [line.replace("'", '"') for line in output_lines]
+    results = []
+    for line in cleaned_output_lines:
+        try:
+            result = json.loads(line)
+            results.append(result)
+        # Skip lines that are not valid JSON b/c of the single quote replacement
+        except json.JSONDecodeError:
+            continue
+
+    if not results:
         print("Vector search test failed: No results returned")
         sys.exit(1)
+
+    expected_lead_search_result = {
+        "extraction_id": "ff8accdb-791e-5b6d-a83a-5adc32c4222c",
+        "document_id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1", 
+        "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
+        "score": 0.7820796370506287,
+        "text": """Aristotle[A] (Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, politics, psychology, and the arts. As the founder of the Peripatetic school of philosophy in the Lyceum in Athens, he began the wider Aristotelian tradition that followed, which set the groundwork for the development of modern science."""
+    }
+    lead_result = results[0]
+
+    if lead_result['text'] != expected_lead_search_result['text']:
+        print('Vector search test failed: Incorrect search result text')
+        print('Expected lead search text:', expected_lead_search_result['text'])
+        print('Actual lead search text:', lead_result['text'])
+        sys.exit(1)
+
+    if lead_result['extraction_id'] != expected_lead_search_result['extraction_id']:
+        print("Vector search test failed: Incorrect extraction_id")
+        print('Expected extraction_id:', expected_lead_search_result['extraction_id'])
+        print('Actual extraction_id:', lead_result['extraction_id'])
+        sys.exit(1)
+
+    if lead_result['document_id'] != expected_lead_search_result['document_id']:
+        print("Vector search test failed: Incorrect document_id")
+        print('Expected document_id:', expected_lead_search_result['document_id'])
+        print('Actual document_id:', lead_result['document_id'])
+        sys.exit(1)
+
     print("Vector search test passed")
 
 

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-from typing import TypeVar`
	`1`	`+from typing import Optional, TypeVar`
`2`	`2`	`from uuid import UUID`
`3`	`3`
`4`	`4`	`from pydantic import BaseModel, Field`
`@@ -13,7 +13,7 @@ class IngestionResponse(BaseModel):`
`13`	`13`	`...,`
`14`	`14`	`description="A message describing the result of the ingestion request.",`
`15`	`15`	`)`
`16`		`- task_id: UUID = Field(`
	`16`	`+ task_id: Optional[UUID] = Field(`
`17`	`17`	`...,`
`18`	`18`	`description="The task ID of the ingestion request.",`
`19`	`19`	`)`