From 94341a66dec940eee06669313a6a0eb4cede949a Mon Sep 17 00:00:00 2001 From: emrgnt-cmplxty <68796651+emrgnt-cmplxty@users.noreply.github.com> Date: Fri, 4 Oct 2024 22:46:09 -0700 Subject: [PATCH] Feature/revive integration tests (#1343) (#1346) * Feature/revive integration tests (#1343) * add postgres to integration * add postgres to integration * up * rename * hardcode * add back postgres * add back postgres * add pgvector * add pgvector * add pgvector * add pgvector * add pgvector * add pgvector * add pgvector * tweak config docs * fix integration suite * fix integration suite * fix integration suite * up * up * up * up * up * up * up * up * up * update integration test * final user tests * final user tests * Fix validation error on collection creation responses, remove unnecessary error on deletion (#1344) * Don't throw an error when deleting a collection with no documents, fix return on create collection, more js tests * Revert "Don't throw an error when deleting a collection with no documents, fix return on create collection, more js tests" This reverts commit f9f6eadd7307a76b4826fcc7fed9288d412294db. * Don't throw an error when deleting a collection with no documents, fix return on create collection, more js tests * more * Improve kg throughput (#1342) * try * up * up * space * add it in ingestion * rm ingestion * init * add semaphore * test * rm duplicates * kg_creation_settings * rm semaphores * increase conns * change it back * clean * up * up * up * up --------- Co-authored-by: --global=Shreyas Pimpalgaonkar <--global=shreyas.gp.7@gmail.com> Co-authored-by: emrgnt-cmplxty <68796651+emrgnt-cmplxty@users.noreply.github.com> --------- Co-authored-by: Nolan Tremelling <34580718+NolanTrem@users.noreply.github.com> Co-authored-by: Shreyas Pimpalgaonkar Co-authored-by: --global=Shreyas Pimpalgaonkar <--global=shreyas.gp.7@gmail.com> --- .../r2rClientIntegrationSuperUser.test.ts | 67 +++++++++++++++++-- py/core/base/providers/embedding.py | 9 +-- py/core/base/providers/llm.py | 4 +- py/core/configs/r2r_aws_bedrock.toml | 2 +- py/core/main/api/management_router.py | 3 +- py/core/pipes/kg/entity_description.py | 2 +- py/core/providers/database/relational.py | 11 ++- py/core/providers/database/vector.py | 2 +- py/core/providers/kg/postgres.py | 4 +- py/core/providers/orchestration/hatchet.py | 1 + py/poetry.lock | 8 ++- py/pyproject.toml | 2 +- py/r2r.toml | 2 +- .../test_document_management.json | 2 +- .../test_group_management.json | 2 +- .../observed_outputs/test_observability.json | 2 +- .../observed_outputs/test_retrieval.json | 2 +- .../test_user_management.json | 2 +- 18 files changed, 96 insertions(+), 31 deletions(-) diff --git a/js/sdk/__tests__/r2rClientIntegrationSuperUser.test.ts b/js/sdk/__tests__/r2rClientIntegrationSuperUser.test.ts index a0d6ec431..79025810c 100644 --- a/js/sdk/__tests__/r2rClientIntegrationSuperUser.test.ts +++ b/js/sdk/__tests__/r2rClientIntegrationSuperUser.test.ts @@ -3,11 +3,13 @@ const fs = require("fs"); import { describe, test, beforeAll, expect } from "@jest/globals"; const baseUrl = "http://localhost:7272"; +let newCollectionId: string; /** * raskolnikov.txt should have an id of `f9f61fc8-079c-52d0-910a-c657958e385b` * karamozov.txt should have an id of `73749580-1ade-50c6-8fbe-a5e9e87783c8` * myshkin.txt should have an id of `2e05b285-2746-5778-9e4a-e293db92f3be` + * The default collection should have an id of `122fdf6a-e116-546b-a8f6-e4cb2e2c0a09` */ /** @@ -18,7 +20,7 @@ const baseUrl = "http://localhost:7272"; * X verifyEmail * - login * - logout - * X user + * - user * X updateUser * - refreshAccessToken * X changePassword @@ -42,14 +44,14 @@ const baseUrl = "http://localhost:7272"; * - documentChunks * X inspectKnowledgeGraph * X collectionsOverview - * X createCollection - * X getCollection - * X updateCollection - * X deleteCollection - * X listCollections + * - createCollection + * - getCollection + * - updateCollection + * - deleteCollection + * - listCollections * X addUserToCollection * X removeUserFromCollection - * X getUsersInCollection + * - getUsersInCollection * X getCollectionsForUser * X assignDocumentToCollection * X removeDocumentFromCollection @@ -82,6 +84,10 @@ describe("r2rClient Integration Tests", () => { ).resolves.not.toThrow(); }); + test("User", async () => { + await expect(client.user()).resolves.not.toThrow(); + }); + test("Server stats", async () => { await expect(client.serverStats()).resolves.not.toThrow(); }); @@ -222,6 +228,53 @@ describe("r2rClient Integration Tests", () => { ).resolves.not.toThrow(); }); + test("Collections overview", async () => { + await expect(client.collectionsOverview()).resolves.not.toThrow(); + }); + + test("Create collection", async () => { + const response = await client.createCollection("test_collection", "test_description"); + newCollectionId = response.results.collection_id; + + expect(newCollectionId).toBeDefined(); + }); + + test("Get default collection", async () => { + await expect(client.getCollection("122fdf6a-e116-546b-a8f6-e4cb2e2c0a09")).resolves.not.toThrow(); + }); + + test("Get newly created collection", async () => { + await expect(client.getCollection(newCollectionId)).resolves.not.toThrow(); + }); + + test("Update collection", async () => { + await expect( + client.updateCollection( + newCollectionId, + "updated_test_collection", + "updated_test_description" + ), + ).resolves.not.toThrow(); + }); + + test("List collections", async () => { + await expect(client.listCollections()).resolves.not.toThrow(); + }); + + test("Delete collection", async () => { + await expect( + client.deleteCollection(newCollectionId), + ).resolves.not.toThrow(); + }); + + test("Get users in collection", async () => { + await expect(client.getUsersInCollection("122fdf6a-e116-546b-a8f6-e4cb2e2c0a09")).resolves.not.toThrow(); + }); + + test("Get users in collection with pagination", async () => { + await expect(client.getUsersInCollection("122fdf6a-e116-546b-a8f6-e4cb2e2c0a09", 10, 10)).resolves.not.toThrow(); + }); + test("Clean up remaining documents", async () => { // Deletes karamozov.txt await expect( diff --git a/py/core/base/providers/embedding.py b/py/core/base/providers/embedding.py index c08bd4720..1d7b5557a 100644 --- a/py/core/base/providers/embedding.py +++ b/py/core/base/providers/embedding.py @@ -25,10 +25,10 @@ class EmbeddingConfig(ProviderConfig): batch_size: int = 1 prefixes: Optional[dict[str, str]] = None add_title_as_prefix: bool = True - concurrent_request_limit: int = 16 - max_retries: int = 2 - initial_backoff: float = 1.0 - max_backoff: float = 60.0 + concurrent_request_limit: int = 256 + max_retries: int = 8 + initial_backoff: float = 1 + max_backoff: float = 64.0 def validate_config(self) -> None: if self.provider not in self.supported_providers: @@ -63,6 +63,7 @@ async def _execute_with_backoff_async(self, task: dict[str, Any]): try: async with self.semaphore: return await self._execute_task(task) + # TODO: Capture different error types and handle them accordingly except Exception as e: logger.warning( f"Request failed (attempt {retries + 1}): {str(e)}" diff --git a/py/core/base/providers/llm.py b/py/core/base/providers/llm.py index 65445bc53..9a35fe77f 100644 --- a/py/core/base/providers/llm.py +++ b/py/core/base/providers/llm.py @@ -20,9 +20,9 @@ class CompletionConfig(ProviderConfig): provider: Optional[str] = None generation_config: GenerationConfig = GenerationConfig() concurrent_request_limit: int = 256 - max_retries: int = 2 + max_retries: int = 8 initial_backoff: float = 1.0 - max_backoff: float = 60.0 + max_backoff: float = 64.0 def validate_config(self) -> None: if not self.provider: diff --git a/py/core/configs/r2r_aws_bedrock.toml b/py/core/configs/r2r_aws_bedrock.toml index d83fadbc2..6cb08693f 100644 --- a/py/core/configs/r2r_aws_bedrock.toml +++ b/py/core/configs/r2r_aws_bedrock.toml @@ -18,7 +18,7 @@ overlap = 20 [completion] provider = "litellm" -concurrent_request_limit = 16 +concurrent_request_limit = 256 [completion.generation_config] model = "bedrock/anthropic.claude-v2" diff --git a/py/core/main/api/management_router.py b/py/core/main/api/management_router.py index fe8e5482a..f68246cca 100644 --- a/py/core/main/api/management_router.py +++ b/py/core/main/api/management_router.py @@ -436,9 +436,10 @@ async def create_collection_app( collection_id = await self.service.create_collection( name, description ) - return await self.service.add_user_to_collection( # type: ignore + await self.service.add_user_to_collection( # type: ignore auth_user.id, collection_id.collection_id ) + return collection_id @self.router.get("/get_collection/{collection_id}") @self.base_endpoint diff --git a/py/core/pipes/kg/entity_description.py b/py/core/pipes/kg/entity_description.py index adc2b48e3..12de049b3 100644 --- a/py/core/pipes/kg/entity_description.py +++ b/py/core/pipes/kg/entity_description.py @@ -134,7 +134,7 @@ async def process_entity( ), } ], - generation_config=self.kg_provider.config.kg_enrichment_settings.generation_config, + generation_config=self.kg_provider.config.kg_creation_settings.generation_config, ) ) .choices[0] diff --git a/py/core/providers/database/relational.py b/py/core/providers/database/relational.py index 3d097b48e..aa26e995c 100644 --- a/py/core/providers/database/relational.py +++ b/py/core/providers/database/relational.py @@ -1,6 +1,6 @@ import logging from contextlib import asynccontextmanager - +import asyncio import asyncpg from core.base import RelationalDBProvider @@ -35,6 +35,9 @@ def __init__( self.project_name = project_name self.pool = None self.postgres_configuration_settings = postgres_configuration_settings + self.semaphore = asyncio.Semaphore( + int(self.postgres_configuration_settings.max_connections * 0.9) + ) async def initialize(self): try: @@ -42,6 +45,7 @@ async def initialize(self): self.connection_string, max_size=self.postgres_configuration_settings.max_connections, ) + logger.info( "Successfully connected to Postgres database and created connection pool." ) @@ -57,8 +61,9 @@ def _get_table_name(self, base_name: str) -> str: @asynccontextmanager async def get_connection(self): - async with self.pool.acquire() as conn: - yield conn + async with self.semaphore: + async with self.pool.acquire() as conn: + yield conn async def execute_query(self, query, params=None): async with self.get_connection() as conn: diff --git a/py/core/providers/database/vector.py b/py/core/providers/database/vector.py index b54cddab5..d07f97160 100644 --- a/py/core/providers/database/vector.py +++ b/py/core/providers/database/vector.py @@ -459,7 +459,7 @@ def delete_collection(self, collection_id: str) -> None: sess.commit() if affected_rows == 0: - raise ValueError( + logger.warning( f"Collection {collection_id} not found in any documents." ) except NoResultFound: diff --git a/py/core/providers/kg/postgres.py b/py/core/providers/kg/postgres.py index 9cefe4452..9d5607c91 100644 --- a/py/core/providers/kg/postgres.py +++ b/py/core/providers/kg/postgres.py @@ -40,6 +40,7 @@ def __init__( self.db_provider = db_provider.relational self.embedding_provider = embedding_provider + try: import networkx as nx @@ -160,9 +161,6 @@ async def create_tables(self, project_name: str): await self.execute_query(query) - # TODO: Create another table for entity_embedding_collection - # entity embeddings at a collection level - # communities table, result of the Leiden algorithm query = f""" CREATE TABLE IF NOT EXISTS {self._get_table_name("community")} ( diff --git a/py/core/providers/orchestration/hatchet.py b/py/core/providers/orchestration/hatchet.py index d641d4705..01f6670fd 100644 --- a/py/core/providers/orchestration/hatchet.py +++ b/py/core/providers/orchestration/hatchet.py @@ -35,6 +35,7 @@ def get_worker(self, name: str, max_threads: Optional[int] = None) -> Any: self.worker = self.orchestrator.worker(name, max_threads) return self.worker + def concurrency(self, *args, **kwargs) -> Callable: return self.orchestrator.concurrency(*args, **kwargs) diff --git a/py/poetry.lock b/py/poetry.lock index 4f52a1f2f..07906bb8e 100644 --- a/py/poetry.lock +++ b/py/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -4426,6 +4426,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -4866,6 +4871,7 @@ files = [ {file = "tiktoken-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d8c2d0e5ba6453a290b86cd65fc51fedf247e1ba170191715b049dac1f628005"}, {file = "tiktoken-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d622d8011e6d6f239297efa42a2657043aaed06c4f68833550cac9e9bc723ef1"}, {file = "tiktoken-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2efaf6199717b4485031b4d6edb94075e4d79177a172f38dd934d911b588d54a"}, + {file = "tiktoken-0.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5637e425ce1fc49cf716d88df3092048359a4b3bbb7da762840426e937ada06d"}, {file = "tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fb0e352d1dbe15aba082883058b3cce9e48d33101bdaac1eccf66424feb5b47"}, {file = "tiktoken-0.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:56edfefe896c8f10aba372ab5706b9e3558e78db39dd497c940b47bf228bc419"}, {file = "tiktoken-0.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:326624128590def898775b722ccc327e90b073714227175ea8febbc920ac0a99"}, diff --git a/py/pyproject.toml b/py/pyproject.toml index e0f8ff592..9b38722d3 100644 --- a/py/pyproject.toml +++ b/py/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "r2r" readme = "README.md" -version = "3.2.5" +version = "3.2.6" description = "SciPhi R2R" authors = ["Owen Colegrove "] diff --git a/py/r2r.toml b/py/r2r.toml index cadb07bff..7fd4d3de6 100644 --- a/py/r2r.toml +++ b/py/r2r.toml @@ -18,7 +18,7 @@ default_admin_password = "Sk7!PhnVrUC2zQgAvqeDBN" [completion] provider = "litellm" -concurrent_request_limit = 16 +concurrent_request_limit = 256 [completion.generation_config] model = "openai/gpt-4o" diff --git a/py/tests/regression/observed_outputs/test_document_management.json b/py/tests/regression/observed_outputs/test_document_management.json index f541aac95..eb99b338b 100644 --- a/py/tests/regression/observed_outputs/test_document_management.json +++ b/py/tests/regression/observed_outputs/test_document_management.json @@ -1111,4 +1111,4 @@ "rerun_document_chunks_test": { "results": "{\"detail\":{\"message\":\"No chunks found for the given document ID.\",\"error_type\":\"R2RException\"}}" } -} \ No newline at end of file +} diff --git a/py/tests/regression/observed_outputs/test_group_management.json b/py/tests/regression/observed_outputs/test_group_management.json index d8ca64f67..195daa6fc 100644 --- a/py/tests/regression/observed_outputs/test_group_management.json +++ b/py/tests/regression/observed_outputs/test_group_management.json @@ -17,4 +17,4 @@ "cleanup": { "error": "'R2RClient' object has no attribute 'delete_group'" } -} \ No newline at end of file +} diff --git a/py/tests/regression/observed_outputs/test_observability.json b/py/tests/regression/observed_outputs/test_observability.json index 7ee1be089..692ca2a0e 100644 --- a/py/tests/regression/observed_outputs/test_observability.json +++ b/py/tests/regression/observed_outputs/test_observability.json @@ -177,4 +177,4 @@ } } } -} \ No newline at end of file +} diff --git a/py/tests/regression/observed_outputs/test_retrieval.json b/py/tests/regression/observed_outputs/test_retrieval.json index 9e213395f..5cf5ed428 100644 --- a/py/tests/regression/observed_outputs/test_retrieval.json +++ b/py/tests/regression/observed_outputs/test_retrieval.json @@ -764,4 +764,4 @@ } } } -} \ No newline at end of file +} diff --git a/py/tests/regression/observed_outputs/test_user_management.json b/py/tests/regression/observed_outputs/test_user_management.json index 38180fe2d..bdccd1240 100644 --- a/py/tests/regression/observed_outputs/test_user_management.json +++ b/py/tests/regression/observed_outputs/test_user_management.json @@ -128,4 +128,4 @@ "message": "User account f7495bfb-58e1-539e-8b8d-6cb61f821c2b deleted successfully." } } -} \ No newline at end of file +}