Merge pull request #9 from pymc-labs/check-for-fileref

andrewheusser · web-flow · commit ee9a264cb4a2 · 2025-02-20T10:29:42.000-05:00
enforce the schema
diff --git a/app.py b/app.py
@@ -1,8 +1,10 @@
 import json
+from jsonschema import validate, ValidationError
 from flask import Flask, request, jsonify
 from celery import Celery, Task
 from kombu import serialization
 
+
 import pandas as pd
 import arviz as az
 from pymc_marketing.mmm import (
@@ -102,6 +104,16 @@ def __call__(self, *args: object, **kwargs: object) -> object:
 # Ensure proper permissions (readable/writable by all users)
 os.chmod(DATA_DIR, 0o777)
 
+# Load the JSON Schema of the POST /run_mmm_async request body from the OpenAPI spec file
+def get_mmm_request_schema():
+    try:
+        with open('gpt-agent/api_spec.json', 'r') as f:  # relative path: resolved against the process working directory
+            api_spec = json.load(f)
+            return api_spec['paths']['/run_mmm_async']['post']['requestBody']['content']['application/json']['schema']
+    except Exception as e:  # covers a missing/unreadable file, invalid JSON, and missing spec keys
+        logging.error("Failed to load API spec: %s", str(e))
+        raise e  # re-raised after logging so the caller still sees the failure
+
 
 @celery.task(bind=True)
 def run_mmm_task(self, data):
@@ -233,6 +245,20 @@ def run_mmm_async():
         data = request.get_json()
         logging.debug("run_mmm_async request data: %s", data)
 
+        try:
+            schema = get_mmm_request_schema()
+            validate(instance=data, schema=schema)
+        except ValidationError as e:
+            logging.error("Schema validation failed: %s", str(e))
+            return jsonify({
+                "error": "Invalid request format",
+                "details": {
+                    "message": str(e),
+                    "path": " -> ".join(str(p) for p in e.path),
+                    "schema_path": " -> ".join(str(p) for p in e.schema_path)
+                }
+            }), 400
+
         task = run_mmm_task.apply_async(args=[data])
         logging.info("Task submitted with ID: %s", task.id)
 
diff --git a/environment.yml b/environment.yml
@@ -14,3 +14,4 @@ dependencies:
   - procps-ng
   - yq=2.12.0
   - dill=0.3.9
+  - jsonschema=4.23.0
diff --git a/gpt-agent/api_spec.json b/gpt-agent/api_spec.json
@@ -3,7 +3,7 @@
   "info": {
     "title": "PyMC-Marketing MMM API",
     "description": "Asynchronous API for running Marketing Mix Modeling.",
-    "version": "v0.0.3"
+    "version": "v0.0.4"
   },
   "servers": [
     {
@@ -22,17 +22,43 @@
             "application/json": {
               "schema": {
                 "type": "object",
+                "required": ["openaiFileIdRefs", "date_column", "channel_columns", "y_column"],
                 "properties": {
                   "openaiFileIdRefs": {
                     "type": "array",
                     "items": {
-                      "type": "string"
+                      "type": "object",
+                      "required": [
+                        "name",
+                        "id",
+                        "mime_type",
+                        "download_link"
+                      ],
+                      "properties": {
+                        "name": {
+                          "type": "string",
+                          "description": "Name of the file"
+                        },
+                        "id": {
+                          "type": "string",
+                          "description": "OpenAI file ID"
+                        },
+                        "mime_type": {
+                          "type": "string",
+                          "description": "MIME type of the file"
+                        },
+                        "download_link": {
+                          "type": "string",
+                          "format": "uri",
+                          "description": "URL to download the file"
+                        }
+                      }
                     },
-                    "description": "List of OpenAI file IDs to be used as references."
+                    "minItems": 1,
+                    "description": "List of OpenAI file references"
                   },
                   "date_column": {
                     "type": "string",
-                    "default": "date",
                     "description": "Name of the date column in data."
                   },
                   "channel_columns": {
@@ -42,6 +68,10 @@
                     },
                     "description": "List of channel column names."
                   },
+                  "y_column": {
+                    "type": "string",
+                    "description": "Name of the y column in data."
+                  },
                   "adstock_max_lag": {
                     "type": "integer",
                     "default": 8,
diff --git a/gpt-agent/gpt_prompt.md b/gpt-agent/gpt_prompt.md
@@ -11,7 +11,7 @@ It leverages the `dev-nextgen-mmm.pymc-labs.com` API to run MMM models and retri
 
 As BayesMMM, your main role is to:
 
-1. Assist users in preparing and validating their data for MMM and ensure that is correctly formatted for the API operations. 
+1. Assist users in validating their data for MMM and ensure that it is correctly formatted for the API operations.
 2. Run the model asynchronously using `runMMMAsync`.
 3. Provide actionable insights and visualizations, such as saturation curves and relative channel contributions.
 4. Leverage the PyMC-Marketing codebase for analysis and visualization examples, replicating them to deliver meaningful insights.
@@ -20,29 +20,26 @@ Throughout your interactions provide concise responses using bullet points and f
 
 ## Running an MMM Analysis
 
-### 1. Data Preparation
+### 1. Data Validation
 
 Before starting, ensure the data includes:
 
 - Date: Column with dates in `%Y-%m-%d` format.
 - Sales: Column with the target variable (renamed to `sales` if necessary).
 - Marketing Spend: Columns representing marketing channel spends (e.g., TV, online).
 
-Handle missing values appropriately and convert the date column to the required format:
-
-```python
-# Code example to convert date column to %Y-%m-%d format
-data['date_column_name'] = pd.to_datetime(data['date_column_name']).dt.strftime('%Y-%m-%d')
-```
-
 **Very Important:**
-- Always confirm with the user that the data is correctly formatted before proceeding to initiate the model run. 
+Validate the data, but do not attempt to fix it. Provide the user with code that they can run to fix the data. Instruct them to reupload the file to the GPT when the data is correctly formatted.
 
 ### 2. Initiating the Model Run
 
 When asked to run the Bayesian MMM model you must use the `runMMMAsync` API operation with the correctly formatted data. **Do not import MMM libraries directly or attempt to run the model locally in your code interpreter**. The payload to the API should include the reference to the data file and the following parameters:
 
-- **df**: The data as a CSV string.
+- **openaiFileIdRefs**: An array of objects with the following fields:
+  - **name**: Name of the file.
+  - **id**: OpenAI file ID.
+  - **mime_type**: MIME type of the file.
+  - **download_link**: URL to download the file.
 - **date_column**: Name of the date column.
 - **channel_columns**: List of channel spend columns.
 - **y_column**: Name of the y column.
@@ -96,15 +93,6 @@ The most important parameters are:
 * intercept: Intercept parameter
 * (optional) gamma_control: Control parameters that multiply the control variables
 
-You can retrieve the return on ad spend from the `return_on_ad_spend` field in the payload returned by `getReturnOnAdSpend`. This is a JSON object with the following fields:
-
-- `channel_columns`: List of channel columns.
-- `roas_mean`: Mean of the return on ad spend.
-- `roas_hdi_lower`: Lower bound of the 94% confidence interval of the return on ad spend.
-- `roas_hdi_upper`: Upper bound of the 94% confidence interval of the return on ad spend.
-
-Plot the return on ad spend using the `roas_mean` and the `roas_hdi_lower` and `roas_hdi_upper` to plot the confidence interval.
-
 ### 6. Analysis Workflow
 
 While waiting for results, you can suggest to the user to perform exploratory data analysis. Here some ideas:
@@ -120,6 +108,6 @@ After retrieving results here are some ideas:
 
 - Spend with Saturation: Overlay total spend as a dashed line on the saturation plot.
 
-** Important Reminder **
+**Very Important Reminders**
 
 - Throughout your interactions provide **concise responses** using bullet points and formulas when appropriate.
diff --git a/test_mmm_async.py b/test_mmm_async.py
@@ -11,22 +11,25 @@
 
 API_KEY = os.environ.get('API_KEY', None)
 
-def create_payload():
+def create_payload(include_file_refs=True):
+    openaiFileIdRefs = []
+    if include_file_refs:
+        openaiFileIdRefs = [
+            {
+                "name": "mmm_example.csv",
+                "id": "file-1234567890",
+                "mime_type": "text/csv",
+                "download_link": "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/refs/heads/main/data/mmm_example.csv"
+            }
+        ]
     payload = {
         "domain": "dev-nextgen-mmm.pymc-labs.com",
         "method": "post",
         "path": "/run_mmm_async",
         "operation": "runMMMAsync",
         "operation_hash": "0c869884cb92378e2dfe2ae377cac236cbc2b9d0",
         "is_consequential": True,
-        "openaiFileIdRefs": [
-            {
-                "name": "mmm_example.csv",
-                "id": "file-1234567890",
-                "mime_type": "text/csv",
-                "download_link": "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/refs/heads/main/data/mmm_example.csv"
-            }
-        ],
+        "openaiFileIdRefs": openaiFileIdRefs,
         "date_column": "date_week",
         "channel_columns": [
             "x1",
@@ -38,6 +41,16 @@ def create_payload():
     }
     return payload
 
+def test_missing_file_refs(base_url):  # checks that /run_mmm_async rejects a payload without file references
+    payload = create_payload(include_file_refs=False)  # leaves "openaiFileIdRefs" as an empty list
+    run_url = f"{base_url}/run_mmm_async"
+    headers = {
+        'Content-Type': 'application/json',
+        'X-API-Key': API_KEY
+    }
+    response = requests.post(run_url, data=json.dumps(payload), headers=headers)
+    assert response.status_code == 400  # the request schema sets "minItems": 1 on openaiFileIdRefs
+    assert response.json()["error"] == "Invalid request format"  # error text returned on schema validation failure
 
 def test_async_mmm_run(base_url):
     # Payload that includes data
@@ -104,4 +117,5 @@ def test_async_mmm_run(base_url):
         print("Invalid argument. Use 'local' or 'deployed-production' or 'deployed-development'.")
         sys.exit(1)
 
+    test_missing_file_refs(base_url)
     test_async_mmm_run(base_url)