From 1c35ccf277fda4fde7fce5664c95dcf06a0185fc Mon Sep 17 00:00:00 2001 From: Grant Linville Date: Mon, 4 Nov 2024 22:25:09 -0500 Subject: [PATCH 1/3] chore: update for dataset rewrite Signed-off-by: Grant Linville --- gptscript/datasets.py | 22 +++------ gptscript/gptscript.py | 98 ++++++++++------------------------------- gptscript/opts.py | 6 +-- tests/test_gptscript.py | 51 +++++++++------------ 4 files changed, 53 insertions(+), 124 deletions(-) diff --git a/gptscript/datasets.py b/gptscript/datasets.py index 9bd97fd..360e765 100644 --- a/gptscript/datasets.py +++ b/gptscript/datasets.py @@ -1,5 +1,4 @@ import base64 -from typing import Dict from pydantic import BaseModel, field_serializer, field_validator, BeforeValidator @@ -10,28 +9,17 @@ class DatasetElementMeta(BaseModel): class DatasetElement(BaseModel): name: str - description: str - contents: bytes + description: str = "" + contents: str = "" + binaryContents: bytes = b"" - @field_serializer("contents") + @field_serializer("binaryContents") def serialize_contents(self, value: bytes) -> str: return base64.b64encode(value).decode("utf-8") - @field_validator("contents", mode="before") + @field_validator("binaryContents", mode="before") def deserialize_contents(cls, value) -> bytes: if isinstance(value, str): return base64.b64decode(value) return value - -class DatasetMeta(BaseModel): - id: str - name: str - description: str - - -class Dataset(BaseModel): - id: str - name: str - description: str - elements: Dict[str, DatasetElementMeta] diff --git a/gptscript/gptscript.py b/gptscript/gptscript.py index 745e2ca..2853a53 100644 --- a/gptscript/gptscript.py +++ b/gptscript/gptscript.py @@ -8,7 +8,7 @@ from gptscript.confirm import AuthResponse from gptscript.credentials import Credential, to_credential -from gptscript.datasets import DatasetMeta, Dataset, DatasetElementMeta, DatasetElement +from gptscript.datasets import DatasetElementMeta, DatasetElement from gptscript.fileinfo import FileInfo from gptscript.frame import RunFrame, CallFrame, PromptFrame, Program from gptscript.opts import GlobalOptions @@ -213,109 +213,54 @@ async def delete_credential(self, context: str = "default", name: str = "") -> s {"context": [context], "name": name} ) - async def list_datasets(self, workspace_id: str) -> List[DatasetMeta]: - if workspace_id == "": - workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"] - + # list_datasets returns an array of dataset IDs + async def list_datasets(self) -> List[str]: res = await self._run_basic_command( "datasets", - {"input": "{}", "workspaceID": workspace_id, "datasetToolRepo": self.opts.DatasetToolRepo, - "env": self.opts.Env} - ) - return [DatasetMeta.model_validate(d) for d in json.loads(res)] - - async def create_dataset(self, workspace_id: str, name: str, description: str = "") -> Dataset: - if workspace_id == "": - workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"] - - if name == "": - raise ValueError("name cannot be empty") - - res = await self._run_basic_command( - "datasets/create", { - "input": json.dumps({"datasetName": name, "datasetDescription": description}), - "workspaceID": workspace_id, - "datasetToolRepo": self.opts.DatasetToolRepo, - "env": self.opts.Env, - } - ) - return Dataset.model_validate_json(res) - - async def add_dataset_element(self, workspace_id: str, datasetID: str, elementName: str, elementContent: bytes, - elementDescription: str = "") -> DatasetElementMeta: - if workspace_id == "": - workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"] - - if datasetID == "": - raise 
ValueError("datasetID cannot be empty") - elif elementName == "": - raise ValueError("elementName cannot be empty") - elif not elementContent: - raise ValueError("elementContent cannot be empty") - - res = await self._run_basic_command( - "datasets/add-element", - { - "input": json.dumps({ - "datasetID": datasetID, - "elementName": elementName, - "elementContent": base64.b64encode(elementContent).decode("utf-8"), - "elementDescription": elementDescription, - }), - "workspaceID": workspace_id, - "datasetToolRepo": self.opts.DatasetToolRepo, + "input": json.dumps({"workspaceID": os.getenv("GPTSCRIPT_WORKSPACE_ID")}), + "datasetTool": self.opts.DatasetTool, "env": self.opts.Env } ) - return DatasetElementMeta.model_validate_json(res) + return json.loads(res) - async def add_dataset_elements(self, workspace_id: str, datasetID: str, elements: List[DatasetElement]) -> str: - if workspace_id == "": - workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"] - - if datasetID == "": - raise ValueError("datasetID cannot be empty") - elif not elements: + async def add_dataset_elements(self, elements: List[DatasetElement], datasetID: str = "") -> str: + if not elements: raise ValueError("elements cannot be empty") res = await self._run_basic_command( "datasets/add-elements", { "input": json.dumps({ + "workspaceID": os.getenv("GPTSCRIPT_WORKSPACE_ID"), "datasetID": datasetID, "elements": [element.model_dump() for element in elements], }), - "workspaceID": workspace_id, - "datasetToolRepo": self.opts.DatasetToolRepo, + "datasetTool": self.opts.DatasetTool, "env": self.opts.Env } ) return res - - async def list_dataset_elements(self, workspace_id: str, datasetID: str) -> List[DatasetElementMeta]: - if workspace_id == "": - workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"] - + async def list_dataset_elements(self, datasetID: str) -> List[DatasetElementMeta]: if datasetID == "": raise ValueError("datasetID cannot be empty") res = await self._run_basic_command( "datasets/list-elements", { - "input": json.dumps({"datasetID": datasetID}), - "workspaceID": workspace_id, - "datasetToolRepo": self.opts.DatasetToolRepo, + "input": json.dumps({ + "workspaceID": os.getenv("GPTSCRIPT_WORKSPACE_ID"), + "datasetID": datasetID, + }), + "datasetTool": self.opts.DatasetTool, "env": self.opts.Env } ) return [DatasetElementMeta.model_validate(d) for d in json.loads(res)] - async def get_dataset_element(self, workspace_id: str, datasetID: str, elementName: str) -> DatasetElement: - if workspace_id == "": - workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"] - + async def get_dataset_element(self, datasetID: str, elementName: str) -> DatasetElement: if datasetID == "": raise ValueError("datasetID cannot be empty") elif elementName == "": @@ -324,9 +269,12 @@ async def get_dataset_element(self, workspace_id: str, datasetID: str, elementNa res = await self._run_basic_command( "datasets/get-element", { - "input": json.dumps({"datasetID": datasetID, "element": elementName}), - "workspaceID": workspace_id, - "datasetToolRepo": self.opts.DatasetToolRepo, + "input": json.dumps({ + "workspaceID": os.getenv("GPTSCRIPT_WORKSPACE_ID"), + "datasetID": datasetID, + "name": elementName, + }), + "datasetTool": self.opts.DatasetTool, "env": self.opts.Env, } ) diff --git a/gptscript/opts.py b/gptscript/opts.py index 48f36c1..a5bac39 100644 --- a/gptscript/opts.py +++ b/gptscript/opts.py @@ -12,7 +12,7 @@ def __init__( defaultModelProvider: str = "", defaultModel: str = "", cacheDir: str = "", - datasetToolRepo: str = "", + datasetTool: str = "", 
workspaceTool: str = "", env: list[str] = None, ): @@ -23,7 +23,7 @@ def __init__( self.DefaultModel = defaultModel self.DefaultModelProvider = defaultModelProvider self.CacheDir = cacheDir - self.DatasetToolRepo = datasetToolRepo + self.DatasetTool = datasetTool self.WorkspaceTool = workspaceTool if env is None: env = [f"{k}={v}" for k, v in os.environ.items()] @@ -42,7 +42,7 @@ def merge(self, other: Self) -> Self: cp.DefaultModel = other.DefaultModel if other.DefaultModel != "" else self.DefaultModel cp.DefaultModelProvider = other.DefaultModelProvider if other.DefaultModelProvider != "" else self.DefaultModelProvider cp.CacheDir = other.CacheDir if other.CacheDir != "" else self.CacheDir - cp.DatasetToolRepo = other.DatasetToolRepo if other.DatasetToolRepo != "" else self.DatasetToolRepo + cp.DatasetTool = other.DatasetTool if other.DatasetTool != "" else self.DatasetTool cp.WorkspaceTool = other.WorkspaceTool if other.WorkspaceTool != "" else self.WorkspaceTool cp.Env = (other.Env or []) cp.Env.extend(self.Env or []) diff --git a/tests/test_gptscript.py b/tests/test_gptscript.py index 9da5792..a5b16cd 100644 --- a/tests/test_gptscript.py +++ b/tests/test_gptscript.py @@ -761,57 +761,50 @@ async def test_credentials(gptscript): @pytest.mark.asyncio async def test_datasets(gptscript): workspace_id = await gptscript.create_workspace("directory") - dataset_name = str(os.urandom(8).hex()) + os.environ["GPTSCRIPT_WORKSPACE_ID"] = workspace_id # Create dataset - dataset = await gptscript.create_dataset(workspace_id, dataset_name, "this is a test dataset") - assert dataset.id != "", "Expected dataset id to be set" - assert dataset.name == dataset_name, "Expected dataset name to match" - assert dataset.description == "this is a test dataset", "Expected dataset description to match" - assert len(dataset.elements) == 0, "Expected dataset elements to be empty" - - # Add an element - element_meta = await gptscript.add_dataset_element(workspace_id, dataset.id, "element1", b"element1 contents", - "element1 description") - assert element_meta.name == "element1", "Expected element name to match" - assert element_meta.description == "element1 description", "Expected element description to match" + dataset_id = await gptscript.add_dataset_elements([ + DatasetElement(name="element1", contents="element1 contents", description="element1 description"), + DatasetElement(name="element2", binaryContents=b"element2 contents", description="element2 description"), + ]) # Add two more elements - await gptscript.add_dataset_elements(workspace_id, dataset.id, [ - DatasetElement(name="element2", contents=b"element2 contents", description="element2 description"), - DatasetElement(name="element3", contents=b"element3 contents", description="element3 description"), - ]) + await gptscript.add_dataset_elements([ + DatasetElement(name="element3", contents="element3 contents", description="element3 description"), + DatasetElement(name="element4", contents="element3 contents", description="element4 description"), + ], dataset_id) # Get the elements - e1 = await gptscript.get_dataset_element(workspace_id, dataset.id, "element1") + e1 = await gptscript.get_dataset_element(dataset_id, "element1") assert e1.name == "element1", "Expected element name to match" - assert e1.contents == b"element1 contents", "Expected element contents to match" + assert e1.contents == "element1 contents", "Expected element contents to match" assert e1.description == "element1 description", "Expected element description to match" - e2 = await 
gptscript.get_dataset_element(workspace_id, dataset.id, "element2") + e2 = await gptscript.get_dataset_element(dataset_id, "element2") assert e2.name == "element2", "Expected element name to match" - assert e2.contents == b"element2 contents", "Expected element contents to match" + assert e2.binaryContents == b"element2 contents", "Expected element contents to match" assert e2.description == "element2 description", "Expected element description to match" - e3 = await gptscript.get_dataset_element(workspace_id, dataset.id, "element3") + e3 = await gptscript.get_dataset_element(dataset_id, "element3") assert e3.name == "element3", "Expected element name to match" - assert e3.contents == b"element3 contents", "Expected element contents to match" + assert e3.contents == "element3 contents", "Expected element contents to match" assert e3.description == "element3 description", "Expected element description to match" # List elements in the dataset - elements = await gptscript.list_dataset_elements(workspace_id, dataset.id) - assert len(elements) == 3, "Expected one element in the dataset" + elements = await gptscript.list_dataset_elements(dataset_id) + assert len(elements) == 4, "Expected four elements in the dataset" assert elements[0].name == "element1", "Expected element name to match" assert elements[0].description == "element1 description", "Expected element description to match" assert elements[1].name == "element2", "Expected element name to match" assert elements[1].description == "element2 description", "Expected element description to match" assert elements[2].name == "element3", "Expected element name to match" assert elements[2].description == "element3 description", "Expected element description to match" + assert elements[3].name == "element4", "Expected element name to match" + assert elements[3].description == "element4 description", "Expected element description to match" # List datasets - datasets = await gptscript.list_datasets(workspace_id) - assert len(datasets) > 0, "Expected at least one dataset" - assert datasets[0].id == dataset.id, "Expected dataset id to match" - assert datasets[0].name == dataset_name, "Expected dataset name to match" - assert datasets[0].description == "this is a test dataset", "Expected dataset description to match" + dataset_ids = await gptscript.list_datasets() + assert len(dataset_ids) > 0, "Expected at least one dataset" + assert dataset_ids[0] == dataset_id, "Expected dataset id to match" await gptscript.delete_workspace(workspace_id) From 2289075bd5e2fc9ab245a2f8757dfe331de57879 Mon Sep 17 00:00:00 2001 From: Grant Linville Date: Tue, 5 Nov 2024 16:36:30 -0500 Subject: [PATCH 2/3] workspaceID updates Signed-off-by: Grant Linville --- gptscript/datasets.py | 6 ++++++ gptscript/gptscript.py | 25 ++++++++++++++----------- tests/test_gptscript.py | 30 ++++++++++++++++++------------ 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/gptscript/datasets.py b/gptscript/datasets.py index 360e765..63b8fd1 100644 --- a/gptscript/datasets.py +++ b/gptscript/datasets.py @@ -2,6 +2,12 @@ from pydantic import BaseModel, field_serializer, field_validator, BeforeValidator +class DatasetMeta(BaseModel): + id: str + name: str + description: str + + class DatasetElementMeta(BaseModel): name: str description: str diff --git a/gptscript/gptscript.py b/gptscript/gptscript.py index 2853a53..392f70b 100644 --- a/gptscript/gptscript.py +++ b/gptscript/gptscript.py @@ -8,7 +8,7 @@ from gptscript.confirm import AuthResponse from gptscript.credentials import 
Credential, to_credential -from gptscript.datasets import DatasetElementMeta, DatasetElement +from gptscript.datasets import DatasetElementMeta, DatasetElement, DatasetMeta from gptscript.fileinfo import FileInfo from gptscript.frame import RunFrame, CallFrame, PromptFrame, Program from gptscript.opts import GlobalOptions @@ -214,18 +214,24 @@ async def delete_credential(self, context: str = "default", name: str = "") -> s ) # list_datasets returns an array of dataset IDs - async def list_datasets(self) -> List[str]: + async def list_datasets(self) -> List[DatasetMeta]: res = await self._run_basic_command( "datasets", { - "input": json.dumps({"workspaceID": os.getenv("GPTSCRIPT_WORKSPACE_ID")}), + "input": "{}", "datasetTool": self.opts.DatasetTool, "env": self.opts.Env } ) - return json.loads(res) + return [DatasetMeta.model_validate(d) for d in json.loads(res)] - async def add_dataset_elements(self, elements: List[DatasetElement], datasetID: str = "") -> str: + async def add_dataset_elements( + self, + elements: List[DatasetElement], + datasetID: str = "", + name: str = "", + description: str = "" + ) -> str: if not elements: raise ValueError("elements cannot be empty") @@ -233,8 +239,9 @@ async def add_dataset_elements(self, elements: List[DatasetElement], datasetID: "datasets/add-elements", { "input": json.dumps({ - "workspaceID": os.getenv("GPTSCRIPT_WORKSPACE_ID"), "datasetID": datasetID, + "name": name, + "description": description, "elements": [element.model_dump() for element in elements], }), "datasetTool": self.opts.DatasetTool, @@ -250,10 +257,7 @@ async def list_dataset_elements(self, datasetID: str) -> List[DatasetElementMeta res = await self._run_basic_command( "datasets/list-elements", { - "input": json.dumps({ - "workspaceID": os.getenv("GPTSCRIPT_WORKSPACE_ID"), - "datasetID": datasetID, - }), + "input": json.dumps({"datasetID": datasetID}), "datasetTool": self.opts.DatasetTool, "env": self.opts.Env } @@ -270,7 +274,6 @@ async def get_dataset_element(self, datasetID: str, elementName: str) -> Dataset "datasets/get-element", { "input": json.dumps({ - "workspaceID": os.getenv("GPTSCRIPT_WORKSPACE_ID"), "datasetID": datasetID, "name": elementName, }), diff --git a/tests/test_gptscript.py b/tests/test_gptscript.py index a5b16cd..6c2aff9 100644 --- a/tests/test_gptscript.py +++ b/tests/test_gptscript.py @@ -761,36 +761,40 @@ async def test_credentials(gptscript): @pytest.mark.asyncio async def test_datasets(gptscript): workspace_id = await gptscript.create_workspace("directory") - os.environ["GPTSCRIPT_WORKSPACE_ID"] = workspace_id + + new_client = GPTScript(GlobalOptions( + apiKey=os.getenv("OPENAI_API_KEY"), + env=[f"GPTSCRIPT_WORKSPACE_ID={workspace_id}"], + )) # Create dataset - dataset_id = await gptscript.add_dataset_elements([ + dataset_id = await new_client.add_dataset_elements([ DatasetElement(name="element1", contents="element1 contents", description="element1 description"), DatasetElement(name="element2", binaryContents=b"element2 contents", description="element2 description"), - ]) + ], name="test-dataset", description="test dataset description") # Add two more elements - await gptscript.add_dataset_elements([ + await new_client.add_dataset_elements([ DatasetElement(name="element3", contents="element3 contents", description="element3 description"), DatasetElement(name="element4", contents="element3 contents", description="element4 description"), - ], dataset_id) + ], datasetID=dataset_id) # Get the elements - e1 = await gptscript.get_dataset_element(dataset_id, 
"element1") + e1 = await new_client.get_dataset_element(dataset_id, "element1") assert e1.name == "element1", "Expected element name to match" assert e1.contents == "element1 contents", "Expected element contents to match" assert e1.description == "element1 description", "Expected element description to match" - e2 = await gptscript.get_dataset_element(dataset_id, "element2") + e2 = await new_client.get_dataset_element(dataset_id, "element2") assert e2.name == "element2", "Expected element name to match" assert e2.binaryContents == b"element2 contents", "Expected element contents to match" assert e2.description == "element2 description", "Expected element description to match" - e3 = await gptscript.get_dataset_element(dataset_id, "element3") + e3 = await new_client.get_dataset_element(dataset_id, "element3") assert e3.name == "element3", "Expected element name to match" assert e3.contents == "element3 contents", "Expected element contents to match" assert e3.description == "element3 description", "Expected element description to match" # List elements in the dataset - elements = await gptscript.list_dataset_elements(dataset_id) + elements = await new_client.list_dataset_elements(dataset_id) assert len(elements) == 4, "Expected four elements in the dataset" assert elements[0].name == "element1", "Expected element name to match" assert elements[0].description == "element1 description", "Expected element description to match" @@ -802,9 +806,11 @@ async def test_datasets(gptscript): assert elements[3].description == "element4 description", "Expected element description to match" # List datasets - dataset_ids = await gptscript.list_datasets() - assert len(dataset_ids) > 0, "Expected at least one dataset" - assert dataset_ids[0] == dataset_id, "Expected dataset id to match" + datasets = await new_client.list_datasets() + assert len(datasets) > 0, "Expected at least one dataset" + assert datasets[0].id == dataset_id, "Expected dataset id to match" + assert datasets[0].name == "test-dataset", "Expected dataset name to match" + assert datasets[0].description == "test dataset description", "Expected dataset description to match" await gptscript.delete_workspace(workspace_id) From accc63dc45dd2ced0f5b169e3ed0d5388ee391c1 Mon Sep 17 00:00:00 2001 From: Grant Linville Date: Wed, 6 Nov 2024 15:15:12 -0500 Subject: [PATCH 3/3] fix dataset test Signed-off-by: Grant Linville --- tests/test_gptscript.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_gptscript.py b/tests/test_gptscript.py index 6c2aff9..f83607f 100644 --- a/tests/test_gptscript.py +++ b/tests/test_gptscript.py @@ -760,11 +760,11 @@ async def test_credentials(gptscript): @pytest.mark.asyncio async def test_datasets(gptscript): - workspace_id = await gptscript.create_workspace("directory") + os.environ["GPTSCRIPT_WORKSPACE_ID"] = await gptscript.create_workspace("directory") new_client = GPTScript(GlobalOptions( apiKey=os.getenv("OPENAI_API_KEY"), - env=[f"GPTSCRIPT_WORKSPACE_ID={workspace_id}"], + env=[f"{k}={v}" for k, v in os.environ.items()], )) # Create dataset @@ -812,7 +812,7 @@ async def test_datasets(gptscript): assert datasets[0].name == "test-dataset", "Expected dataset name to match" assert datasets[0].description == "test dataset description", "Expected dataset description to match" - await gptscript.delete_workspace(workspace_id) + await gptscript.delete_workspace(os.environ["GPTSCRIPT_WORKSPACE_ID"]) @pytest.mark.asyncio