Commit 2f7e9a6

feat: add none deployment target, GEMINI.md per-language, and eval evalsets (#763)
* feat: add "none" deployment target, GEMINI.md improvements, and eval evalsets
  - Add "none" deployment target for local-only projects
  - Move GEMINI.md from _shared to python template directory
  - Add minimal GEMINI.md for Go and Java (ADK docs link)
  - Remove Python language guards (file is now Python-only)
  - Add eval evalsets and README for adk, adk_a2a, agentic_rag agents
  - Update adk-cheatsheet.md with ADK best practices
  - Update .gitignore, llm.txt, Makefile eval targets
  - Remove --cicd-runner from setup_cicd.py examples

* feat: improve "none" deployment target handling
  - Add description for "none" in deployment target selection
  - Show enhance hint when "none" is selected
  - Skip lock file copy for "none" deployment target
  - Skip lock file generation for "none" in generate_locks
  - Add test for creating project with "none" deployment target
  - Update invalid deployment target test to include "none"

* feat: add LLM-as-judge eval config and eval extra dependency
  - Add eval_config.json with rubric-based criteria for adk, adk_a2a, agentic_rag
  - Add google-adk[eval] as optional dependency in pyproject.toml
  - Fix evalsets: correct app_name and remove intermediate_data
  - Skip eval/eval-all targets in makefile usability test

* fix: address PR review issues
  - Remove conflicting fastapi pin from BQ analytics dependencies
  - Add explicit session_type=in_memory for "none" deployment target
  - Fix escaped newline in BQ analytics console message
  - Remove stale comment about import assumption
  - Remove dead llm_txt loading code (no template references it)
  - Add "none" to adk_java deployment_targets for consistency
  - Remove duplicate adk entry in test fixture
1 parent 3bfcf25 commit 2f7e9a6

54 files changed

Lines changed: 5717 additions & 2864 deletions

Note: this is a large commit, and some file content (including several file names below) is hidden by default in the web view.

.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -197,6 +197,7 @@ Thumbs.db
 .saved_chats
 .aider*
 target
+target*/
 set_projects.sh
 delete_genai_repos.sh
 cleanup_e2e_projects.sh
```

agent_starter_pack/agents/adk/.template/templateconfig.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -17,7 +17,7 @@ example_question: "What's the weather in San Francisco?"
 settings:
   requires_data_ingestion: false
   requires_session: true
-  deployment_targets: ["agent_engine", "cloud_run"]
+  deployment_targets: ["agent_engine", "cloud_run", "none"]
   extra_dependencies: ["google-adk>=1.15.0,<2.0.0"]
   tags: ["adk"]
   frontend_type: "None"
```
Lines changed: 17 additions & 0 deletions
```diff
@@ -0,0 +1,17 @@
+{
+  "criteria": {
+    "rubric_based_final_response_quality_v1": {
+      "threshold": 0.8,
+      "rubrics": [
+        {
+          "rubricId": "relevance",
+          "rubricContent": { "textProperty": "The response directly addresses the user's query." }
+        },
+        {
+          "rubricId": "helpfulness",
+          "rubricContent": { "textProperty": "The response is helpful and provides useful information." }
+        }
+      ]
+    }
+  }
+}
```
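The rubric config added above is plain JSON, so its structure is easy to inspect. A minimal sketch (assuming only the structure shown in the diff, not any ADK loader API) that parses it and pulls out the grading threshold and rubric ids:

```python
import json

# The eval_config.json content from the diff above, inlined so the
# example is self-contained; in a project this would be read from disk.
config_text = """
{
  "criteria": {
    "rubric_based_final_response_quality_v1": {
      "threshold": 0.8,
      "rubrics": [
        {"rubricId": "relevance",
         "rubricContent": {"textProperty": "The response directly addresses the user's query."}},
        {"rubricId": "helpfulness",
         "rubricContent": {"textProperty": "The response is helpful and provides useful information."}}
      ]
    }
  }
}
"""

config = json.loads(config_text)
criterion = config["criteria"]["rubric_based_final_response_quality_v1"]

# A judged response must score at or above this threshold to pass.
print(criterion["threshold"])                          # → 0.8
print([r["rubricId"] for r in criterion["rubrics"]])   # → ['relevance', 'helpfulness']
```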
Lines changed: 80 additions & 0 deletions
```diff
@@ -0,0 +1,80 @@
+# Evaluation Sets
+
+This directory contains evaluation sets for testing agent behavior using `adk eval`.
+
+## Running Evaluations
+
+```bash
+# Run default evalset
+make eval
+
+# Run specific evalset
+make eval EVALSET=tests/eval/evalsets/custom.evalset.json
+
+# Run all evalsets
+make eval-all
+```
+
+## Evalset Format
+
+Each `.evalset.json` follows the ADK evaluation format:
+
+```json
+{
+  "eval_set_id": "unique_id",
+  "name": "Human-readable name",
+  "description": "What this evalset tests",
+  "eval_cases": [
+    {
+      "eval_id": "case_id",
+      "conversation": [
+        {
+          "user_content": {
+            "parts": [{"text": "User message"}]
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {"name": "tool_name", "args": {"param": "value"}}
+            ]
+          }
+        }
+      ],
+      "session_input": {
+        "app_name": "app_name",
+        "user_id": "test_user",
+        "state": {}
+      }
+    }
+  ]
+}
+```
+
+## Key Fields
+
+- `eval_cases`: Array of test scenarios
+- `conversation`: Sequence of user messages
+- `intermediate_data.tool_uses`: Expected tool calls (for trajectory matching)
+- `session_input`: Initial session state
+
+## Evaluation Metrics
+
+ADK eval measures:
+
+- **tool_trajectory_avg_score**: Are the correct tools called in the right order?
+- **response_match_score**: How similar is the response to expected output?
+
+## Creating Custom Evalsets
+
+1. Copy `basic.evalset.json` as a template
+2. Add cases based on your `DESIGN_SPEC.md` scenarios
+3. Include expected tool calls for capability tests
+4. Run `make eval EVALSET=your_evalset.json`
+
+## Tips
+
+- Start with 3-5 representative cases
+- Include both happy path and edge cases
+- Test each core capability from DESIGN_SPEC.md
+- Add cases when you find bugs in production
+
+See [ADK documentation](https://google.github.io/adk-docs/) for advanced evaluation options.
```
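The evalset format documented in the README above can be sanity-checked before handing files to `adk eval`. A minimal sketch: the helper `validate_evalset` below is hypothetical (not part of the starter pack or ADK) and checks only the fields the README documents; the real loader may enforce more.

```python
import json


def validate_evalset(evalset: dict) -> list[str]:
    """Return a list of problems found in an evalset dict.

    Only the fields documented in the README are checked:
    eval_set_id, eval_cases, per-case eval_id, conversation turns
    with a text part, and session_input.app_name.
    """
    problems = []
    for key in ("eval_set_id", "eval_cases"):
        if key not in evalset:
            problems.append(f"missing top-level key: {key}")
    for case in evalset.get("eval_cases", []):
        case_id = case.get("eval_id", "<no eval_id>")
        if "eval_id" not in case:
            problems.append("eval case missing eval_id")
        for turn in case.get("conversation", []):
            parts = turn.get("user_content", {}).get("parts", [])
            if not any("text" in part for part in parts):
                problems.append(f"{case_id}: turn has no text part")
        if "app_name" not in case.get("session_input", {}):
            problems.append(f"{case_id}: session_input missing app_name")
    return problems


# A well-formed case modeled on basic.evalset.json from this commit.
sample = {
    "eval_set_id": "basic_eval",
    "eval_cases": [{
        "eval_id": "greeting",
        "conversation": [{"user_content": {"parts": [{"text": "Hello"}]}}],
        "session_input": {"app_name": "app", "user_id": "eval_user", "state": {}},
    }],
}
print(validate_evalset(sample))  # → []
```

In practice this could run as a cheap pre-flight in CI, loading each `tests/eval/evalsets/*.evalset.json` with `json.load` and failing fast on schema drift before the slower LLM-judged run.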
Lines changed: 37 additions & 0 deletions
```diff
@@ -0,0 +1,37 @@
+{
+  "eval_set_id": "basic_eval",
+  "name": "Basic Agent Evaluation",
+  "description": "Sample evaluation set for testing core agent functionality. Customize these cases based on your DESIGN_SPEC.md.",
+  "eval_cases": [
+    {
+      "eval_id": "greeting",
+      "conversation": [
+        {
+          "user_content": {
+            "parts": [{"text": "Hello, what can you help me with?"}]
+          }
+        }
+      ],
+      "session_input": {
+        "app_name": "app",
+        "user_id": "eval_user",
+        "state": {}
+      }
+    },
+    {
+      "eval_id": "weather_query",
+      "conversation": [
+        {
+          "user_content": {
+            "parts": [{"text": "What's the weather like in San Francisco?"}]
+          }
+        }
+      ],
+      "session_input": {
+        "app_name": "app",
+        "user_id": "eval_user",
+        "state": {}
+      }
+    }
+  ]
+}
```

agent_starter_pack/agents/adk_a2a/.template/templateconfig.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@ description: "ReAct agent with A2A protocol [experimental]"
 example_question: "What's the weather in San Francisco?"
 settings:
   requires_data_ingestion: false
-  deployment_targets: ["agent_engine", "cloud_run"]
+  deployment_targets: ["agent_engine", "cloud_run", "none"]
   extra_dependencies: ["google-adk>=1.16.0,<2.0.0", "a2a-sdk~=0.3.22", "nest-asyncio>=1.6.0,<2.0.0"]
   tags: ["adk", "a2a"]
   frontend_type: "None"
```
Lines changed: 17 additions & 0 deletions
```diff
@@ -0,0 +1,17 @@
+{
+  "criteria": {
+    "rubric_based_final_response_quality_v1": {
+      "threshold": 0.8,
+      "rubrics": [
+        {
+          "rubricId": "relevance",
+          "rubricContent": { "textProperty": "The response directly addresses the user's query." }
+        },
+        {
+          "rubricId": "helpfulness",
+          "rubricContent": { "textProperty": "The response is helpful and provides useful information." }
+        }
+      ]
+    }
+  }
+}
```
Lines changed: 80 additions & 0 deletions
```diff
@@ -0,0 +1,80 @@
+# Evaluation Sets
+
+This directory contains evaluation sets for testing agent behavior using `adk eval`.
+
+## Running Evaluations
+
+```bash
+# Run default evalset
+make eval
+
+# Run specific evalset
+make eval EVALSET=tests/eval/evalsets/custom.evalset.json
+
+# Run all evalsets
+make eval-all
+```
+
+## Evalset Format
+
+Each `.evalset.json` follows the ADK evaluation format:
+
+```json
+{
+  "eval_set_id": "unique_id",
+  "name": "Human-readable name",
+  "description": "What this evalset tests",
+  "eval_cases": [
+    {
+      "eval_id": "case_id",
+      "conversation": [
+        {
+          "user_content": {
+            "parts": [{"text": "User message"}]
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {"name": "tool_name", "args": {"param": "value"}}
+            ]
+          }
+        }
+      ],
+      "session_input": {
+        "app_name": "app_name",
+        "user_id": "test_user",
+        "state": {}
+      }
+    }
+  ]
+}
+```
+
+## Key Fields
+
+- `eval_cases`: Array of test scenarios
+- `conversation`: Sequence of user messages
+- `intermediate_data.tool_uses`: Expected tool calls (for trajectory matching)
+- `session_input`: Initial session state
+
+## Evaluation Metrics
+
+ADK eval measures:
+
+- **tool_trajectory_avg_score**: Are the correct tools called in the right order?
+- **response_match_score**: How similar is the response to expected output?
+
+## Creating Custom Evalsets
+
+1. Copy `basic.evalset.json` as a template
+2. Add cases based on your `DESIGN_SPEC.md` scenarios
+3. Include expected tool calls for capability tests
+4. Run `make eval EVALSET=your_evalset.json`
+
+## Tips
+
+- Start with 3-5 representative cases
+- Include both happy path and edge cases
+- Test each core capability from DESIGN_SPEC.md
+- Add cases when you find bugs in production
+
+See [ADK documentation](https://google.github.io/adk-docs/) for advanced evaluation options.
```
Lines changed: 37 additions & 0 deletions
```diff
@@ -0,0 +1,37 @@
+{
+  "eval_set_id": "basic_eval",
+  "name": "Basic Agent Evaluation",
+  "description": "Sample evaluation set for testing core agent functionality. Customize these cases based on your DESIGN_SPEC.md.",
+  "eval_cases": [
+    {
+      "eval_id": "greeting",
+      "conversation": [
+        {
+          "user_content": {
+            "parts": [{"text": "Hello, what can you help me with?"}]
+          }
+        }
+      ],
+      "session_input": {
+        "app_name": "app",
+        "user_id": "eval_user",
+        "state": {}
+      }
+    },
+    {
+      "eval_id": "capability_query",
+      "conversation": [
+        {
+          "user_content": {
+            "parts": [{"text": "What tools do you have available?"}]
+          }
+        }
+      ],
+      "session_input": {
+        "app_name": "app",
+        "user_id": "eval_user",
+        "state": {}
+      }
+    }
+  ]
+}
```

agent_starter_pack/agents/adk_go/.template/templateconfig.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@ settings:
   language: "go"
   requires_data_ingestion: false
   requires_session: false
-  deployment_targets: ["cloud_run"]
+  deployment_targets: ["cloud_run", "none"]
   extra_dependencies: []
   tags: ["adk", "go", "a2a"]
   frontend_type: "None"
```
