ToolEvals lets you write and run unit tests for your CXAS tools without needing a full agent session. Tests are described in simple YAML files, which makes them easy to version-control alongside your tool code and review in pull requests.

Each test case specifies a tool name, the input args, optional variables (session state), and expectations that assert conditions on the response (and on any updated variables) using the Operator enum:
| Operator | Meaning |
| --- | --- |
| `equals` | exact match |
| `contains` | substring or element check |
| `greater_than` / `less_than` | numeric comparison |
| `length_equals` / `length_greater_than` / `length_less_than` | collection size |
| `is_null` / `is_not_null` | presence check |
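For instance, a hypothetical expectations block combining a size check with a numeric comparison might look like this (the paths are illustrative, not part of any real tool):

```yaml
# Illustrative expectations; adapt the paths to your tool's response shape.
expectations:
  response:
    - path: "$.results"
      operator: length_greater_than
      value: 0
    - path: "$.confidence"
      operator: greater_than
      value: 0.8
```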
`run_tool_tests()` returns a pandas DataFrame with columns for test name, tool, status, latency, and errors, which is easy to save as a CSV or display in a notebook.
## Quick Example
```python
from cxas_scrapi import ToolEvals

app_name = "projects/my-project/locations/us/apps/my-app-id"
te = ToolEvals(app_name=app_name)

# Load tests from a YAML file
test_cases = te.load_tool_test_cases_from_file("tool_tests/lookup_account.yaml")

# Run them and get a results DataFrame
results_df = te.run_tool_tests(test_cases)
print(results_df[["test_name", "tool", "status", "latency (ms)"]])

# Generate a summary report
report_df = ToolEvals.generate_report(results_df)
print(report_df)
```
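Because the results are a plain DataFrame, persisting them is a standard pandas call:

```python
# Save the full results, including per-test error details, for later review.
results_df.to_csv("tool_test_results.csv", index=False)
```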
A minimal YAML test file looks like this:
```yaml
tests:
  - name: lookup_known_customer
    tool: lookup_account
    args:
      customer_id: "C-1234"
    expectations:
      response:
        - path: "$.account_status"
          operator: equals
          value: "active"
        - path: "$.balance"
          operator: is_not_null
```
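Tests can also seed session variables. Per the merge logic in `run_tool_tests`, a variable mapped to `null` pulls the app's existing variable of that name, while a non-null value supplies your own mock data. A sketch with illustrative variable names:

```yaml
tests:
  - name: lookup_with_session_state
    tool: lookup_account
    args:
      customer_id: "C-1234"
    variables:
      user_profile: null    # use the app's own 'user_profile' variable
      session_flags:        # or provide inline mock data
        is_returning: true
```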
## Reference
### ToolEvals(app_name, creds=None, user_agent_extension=None)
Utility class for testing CXAS Tools.
Initializes the ToolEvals class.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `app_name` | `str` | CXAS App name (`projects/{project}/locations/{location}/apps/{app}`). | *required* |
| `creds` | `Any` | Optional Google Cloud credentials. | `None` |
| `user_agent_extension` | `str` | Optional string used to extend the client's user agent. | `None` |
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def __init__(
    self,
    app_name: str,
    creds: Any = None,
    user_agent_extension: str = None,
):
    """Initializes the ToolEvals class.

    Args:
        app_name: CXAS App name
            (projects/{project}/locations/{location}/apps/{app}).
        creds: Optional Google Cloud credentials.
    """
    self.app_name = app_name
    parts = self.app_name.split("/")
    self.project_id = parts[1] if len(parts) > 1 else ""
    self.location = parts[3] if len(parts) > 3 else "us"
    self.creds = creds
    self.user_agent_extension = user_agent_extension
    self.tools_client = Tools(
        app_name=self.app_name,
        creds=self.creds,
        user_agent_extension=user_agent_extension,
    )
    self.var_client = Variables(
        app_name=self.app_name,
        creds=self.creds,
        user_agent_extension=user_agent_extension,
    )
    try:
        self.tool_map = self.tools_client.get_tools_map(reverse=True)
    except (AttributeError, KeyError, RuntimeError, ValueError) as e:
        logger.warning(
            "Failed to fetch tool map for %s: %s", self.app_name, e
        )
        self.tool_map = {}
```
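Since `creds` accepts standard Google Cloud credentials, you can pass one explicitly instead of relying on ambient auth; a minimal sketch, assuming a service-account key file (the path is illustrative):

```python
from google.oauth2 import service_account

# Load an explicit service-account credential (illustrative key path).
creds = service_account.Credentials.from_service_account_file(
    "path/to/key.json",
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
te = ToolEvals(app_name=app_name, creds=creds)
```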
### load_tool_test_cases_from_file(test_file_path)
Loads tool tests from a YAML file.
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def load_tool_test_cases_from_file(
    self, test_file_path: str
) -> List["ToolTestCase"]:
    """Loads tool tests from a YAML file."""
    with open(test_file_path, "r", encoding="utf-8") as f:
        return self.load_tool_test_cases_from_yaml(f.read())
```
### load_tool_tests_from_dir(directory_path='tool_tests')
Recursively loads all YAML tool tests from a directory.
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def load_tool_tests_from_dir(
    self, directory_path: str = "tool_tests"
) -> List["ToolTestCase"]:
    """Recursively loads all YAML tool tests from a directory."""
    all_tests = []
    if not os.path.exists(directory_path):
        print(f"Directory {directory_path} does not exist.")
        return all_tests
    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".yaml") or file.endswith(".yml"):
                file_path = os.path.join(root, file)
                try:
                    tests = self.load_tool_test_cases_from_file(file_path)
                    all_tests.extend(tests)
                except Exception as e:
                    logger.error(f"Error loading {file_path}: {e}")
    return all_tests
```
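This pairs naturally with `run_tool_tests` when a suite spans several files:

```python
# Load every .yaml/.yml file under tool_tests/ and run the whole suite.
all_tests = te.load_tool_tests_from_dir("tool_tests")
results_df = te.run_tool_tests(all_tests)
```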
### load_tool_test_cases_from_yaml(yaml_data)
Loads tool tests from a YAML string.
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def load_tool_test_cases_from_yaml(
    self, yaml_data: str
) -> List["ToolTestCase"]:
    """Loads tool tests from a YAML string."""
    raw_data = yaml.safe_load(yaml_data)
    if not raw_data or "tests" not in raw_data:
        return []
    return self.load_tool_test_cases_from_data(raw_data["tests"])
```
### load_tool_test_cases_from_data(test_data)
Loads tool tests from a list of dictionaries.
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def load_tool_test_cases_from_data(
    self, test_data: List[Dict[str, Any]]
) -> List["ToolTestCase"]:
    """Loads tool tests from a list of dictionaries."""
    # Pre-process data to handle VariableDeclaration objects
    cleaned_data = []
    for case in test_data:
        case_copy = case.copy()
        if "variables" in case_copy and isinstance(
            case_copy["variables"], dict
        ):
            cleaned_vars = {}
            for k, v in case_copy["variables"].items():
                cleaned_vars[k] = Variables.variable_to_dict(v)
            case_copy["variables"] = cleaned_vars
        cleaned_data.append(case_copy)
    adapter = TypeAdapter(List[ToolTestCase])
    return adapter.validate_python(cleaned_data)
```
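Because this accepts plain dictionaries mirroring the YAML shape, test cases can also be built programmatically; a sketch with illustrative values:

```python
# Define a test case inline rather than in a YAML file.
cases = te.load_tool_test_cases_from_data([
    {
        "name": "lookup_unknown_customer",
        "tool": "lookup_account",
        "args": {"customer_id": "C-0000"},
        "expectations": {
            "response": [
                {"path": "$.account_status", "operator": "is_null"},
            ]
        },
    }
])
```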
### validate_tool_test(test_case, tool_response)

Validates the tool response and variables against expectations.

Returns:

| Type | Description |
| --- | --- |
| `List[str]` | List of error messages. Empty list if all expectations pass. |
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def validate_tool_test(
    self,
    test_case: "ToolTestCase",
    tool_response: Any,
) -> List[str]:
    """Validates the tool response and variables against expectations.

    Returns:
        List of error messages. Empty list if all expectations pass.
    """
    updated_variables = {}
    if isinstance(tool_response, dict) and "variables" in tool_response:
        updated_variables = tool_response["variables"]
    errors = []
    # Validate response
    for exp in test_case.response_expectations:
        resp_data = (
            tool_response.get("response")
            if isinstance(tool_response, dict)
            else tool_response
        )
        actual_value = self._get_value_at_path(resp_data, exp.path)
        if not self._check_expectation(actual_value, exp):
            errors.append(
                f"Response expectation failed: path='{exp.path}',"
                f" actual='{actual_value}', expected='{exp.value}',"
                f" operator='{exp.operator}'"
            )
    # Validate variables
    for exp in test_case.variable_expectations:
        actual_value = self._get_value_at_path(updated_variables, exp.path)
        if not self._check_expectation(actual_value, exp):
            errors.append(
                f"Variable expectation failed: path='{exp.path}',"
                f" actual='{actual_value}', expected='{exp.value}',"
                f" operator='{exp.operator}'"
            )
    return errors
```
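This can also be called directly against a canned payload, which is handy for checking expectation logic without executing the live tool; a sketch reusing the inline case from above (the response is a mock):

```python
# Validate the inline test case against a mocked tool response.
mock_response = {"response": {"account_status": None}, "variables": {}}
errors = te.validate_tool_test(cases[0], mock_response)
assert not errors, errors
```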
### run_tool_tests(test_cases, debug=False)

Runs a list of tool tests.

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | A pandas DataFrame of results with status and errors. |
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def run_tool_tests(
    self, test_cases: List["ToolTestCase"], debug: bool = False
) -> pd.DataFrame:
    """Runs a list of tool tests.

    Returns:
        A pandas DataFrame of results with status and errors.
    """
    # Fetch and unwrap app variables once
    raw_app_vars = self.var_client.list_variables()
    app_vars_cache = {}
    for var in raw_app_vars:
        try:
            var_dict = MessageToDict(var._pb)
        except AttributeError:
            var_dict = MessageToDict(var)
        schema = var_dict.get("schema", {})
        actual_data = schema.get("default") or var_dict.get("value") or {}
        app_vars_cache[var.name] = actual_data
    # Fetch app metadata and user info once per run
    app_client = Apps(
        project_id=self.project_id,
        location=self.location,
        creds=self.creds,
        user_agent_extension=self.user_agent_extension
    )
    app = app_client.get_app(self.app_name)
    app_display_name = app.display_name if app else "Unknown App"
    tester_email = getattr(self.creds, "service_account_email", "Unknown")
    results = []
    for test_case in test_cases:
        print(f"Running test: {test_case.name} ({test_case.tool})")
        tool_id = self.tool_map.get(test_case.tool)
        if not tool_id:
            error = f"Tool '{test_case.tool}' not found in app."
            print(f"FAILED: {error}")
            results.append(
                {
                    "test": test_case.name,
                    "tool": test_case.tool,
                    "status": "FAILED",
                    "latency (ms)": 0.0,
                    "app_display_name": app_display_name,
                    "tester": tester_email,
                    "errors": [error],
                }
            )
            continue
        if "toolsets/" in tool_id and test_case.context:
            error = "Context can only be specified for python tools."
            print(f"FAILED: {error}")
            results.append(
                {
                    "test": test_case.name,
                    "tool": test_case.tool,
                    "status": "FAILED",
                    "latency (ms)": 0.0,
                    "app_display_name": app_display_name,
                    "tester": tester_email,
                    "errors": [error],
                }
            )
            continue
        # Filter and merge variables for this specific test case
        final_variables = {}
        for var_name, custom_val in test_case.variables.items():
            if custom_val is None:
                # User requested an existing app variable by name
                if var_name in app_vars_cache:
                    final_variables[var_name] = app_vars_cache[var_name]
                else:
                    print(
                        f"[WARNING] App variable '{var_name}' requested "
                        f"but not found in app."
                    )
            else:
                # User provided their own custom mock data
                final_variables[var_name] = custom_val
        latency_ms = 0.0
        tool_response = None
        try:
            if debug:
                print(f"[DEBUG] Executing tool: {test_case.tool}")
                print(f"[DEBUG] Tool ID: {tool_id}")
                print(f"[DEBUG] Args: {test_case.args}")
                print(f"[DEBUG] Variables: {final_variables}")
            start_time = time.perf_counter()
            tool_response = self.tools_client.execute_tool(
                tool_display_name=test_case.tool,
                args=test_case.args,
                variables=final_variables,
                context=test_case.context,
            )
            end_time = time.perf_counter()
            latency_ms = (end_time - start_time) * 1000
            if debug:
                print(f"[DEBUG] Tool Response: {tool_response}")
            errors = self.validate_tool_test(test_case, tool_response)
            status = "PASSED"
            if errors:
                status = "FAILED"
            print(f"{status}: {test_case.tool} --> {test_case.name}")
            if errors:
                print(errors)
            results.append(
                {
                    "test": test_case.name,
                    "tool": test_case.tool,
                    "status": status,
                    "latency (ms)": latency_ms,
                    "app_display_name": app_display_name,
                    "tester": tester_email,
                    "errors": errors,
                    "response": tool_response,
                }
            )
        except Exception as e:
            # Catch *all* exceptions so the entire test loop doesn't fail
            print(f"ERROR: Exception occurred during test execution: {e}")
            results.append(
                {
                    "test": test_case.name,
                    "tool": test_case.tool,
                    "status": "ERROR",
                    "latency (ms)": latency_ms,
                    "app_display_name": app_display_name,
                    "tester": tester_email,
                    "errors": [str(e)],
                    "response": tool_response,
                }
            )
        print("-" * 30)
    return self.tool_tests_to_dataframe(results)
```
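Once a run completes, surfacing failures is a one-liner on the returned DataFrame:

```python
# Re-run with verbose tracing, then inspect anything that didn't pass.
results_df = te.run_tool_tests(test_cases, debug=True)
failures = results_df[results_df["status"] != "PASSED"]
print(failures[["test_name", "tool", "errors"]])
```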
### generate_tool_tests(target_dir='tool_tests', include_tools=None, exclude_tools=None, overwrite=False, mine_tool_data=False, mine_conversations_limit=50)

Generates configurable YAML test templates for tools defined in the app.

Parses the application's OpenAPI tool schemas or underlying Python functions to intelligently scaffold the request arguments and expected responses.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `target_dir` | `str` | The directory path where the generated YAML files will be saved. | `'tool_tests'` |
| `include_tools` | `Optional[List[str]]` | An optional list of tool display names to restrict generation to. If None, all tools in the app are evaluated. | `None` |
| `exclude_tools` | `Optional[List[str]]` | An optional list of tool display names (or prefixes) to exclude from generation. Matches if a tool's display name starts with any string in this list. | `None` |
| `overwrite` | `bool` | If True, existing YAML test templates in the target directory are overwritten; if False, existing files are skipped. | `False` |
| `mine_tool_data` | `bool` | If True, queries recent conversations to populate generated tests with real tool payload arguments. | `False` |
| `mine_conversations_limit` | `int` | The maximum number of conversations to scan when mining real tool arguments. | `50` |
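A typical scaffolding run might exclude internal helpers and seed args from recent traffic (the `internal_` prefix is illustrative):

```python
# Scaffold YAML templates for all tools except those prefixed "internal_",
# mining real payload arguments from up to 100 recent conversations.
te.generate_tool_tests(
    target_dir="tool_tests",
    exclude_tools=["internal_"],
    mine_tool_data=True,
    mine_conversations_limit=100,
)
```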
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def generate_tool_tests(
    self,
    target_dir: str = "tool_tests",
    include_tools: Optional[List[str]] = None,
    exclude_tools: Optional[List[str]] = None,
    overwrite: bool = False,
    mine_tool_data: bool = False,
    mine_conversations_limit: int = 50,
) -> None:
    """Generates configurable YAML test templates for tools defined in
    the app.

    Parses the application's OpenAPI tool schemas or Python underlying
    functions to try and intelligently scaffold the request arguments and
    expected responses.

    Args:
        target_dir: The directory path where the generated YAML files will
            be saved. Defaults to 'tool_tests'.
        include_tools: An optional list of tool display names to restrict
            the generation. If None, all tools in the app are evaluated.
        exclude_tools: An optional list of tool display names (or prefixes)
            to exclude from generation. Matches if a tool's display name
            starts with any string in this list.
        overwrite: If True, existing YAML test templates in the target
            directory will be overwritten. If False, existing files are
            skipped.
        mine_tool_data: If True, queries recent conversations to populate
            generated tests with real tool payload arguments.
        mine_conversations_limit: The maximum number of conversations to
            scan when mining real tool arguments.
    """
    os.makedirs(target_dir, exist_ok=True)
    mined_data = {}
    if mine_tool_data:
        logger.info("Mining tool data from recent conversations...")
        mined_data = self._mine_tool_data(mine_conversations_limit)
    for display_name, tool_id in self.tool_map.items():
        if include_tools and display_name not in include_tools:
            continue
        if exclude_tools and any(
            display_name.startswith(ex) for ex in exclude_tools
        ):
            continue
        template_args = {}
        expected_returns = []
        # Try to build template args based on schema
        try:
            actual_tool_id = tool_id
            if "toolsets/" in tool_id and "/tools/" in tool_id:
                # For tools inside a toolset, we need the toolset object
                # to get the schema
                actual_tool_id, _ = tool_id.split("/tools/")
            tool_obj = self.tools_client.get_tool(actual_tool_id)
            tool_dict = (
                type(tool_obj).to_dict(tool_obj)
                if not isinstance(tool_obj, dict)
                else tool_obj
            )
            # Handle Python Tools
            if "toolsets/" not in tool_id:
                if "python_function" in tool_dict:
                    template_args, expected_returns = (
                        self._parse_python_function(tool_dict)
                    )
                elif not any(
                    key in tool_dict
                    for key in (
                        "data_store_spec",
                        "data_store_tool",
                        "google_search_tool",
                    )
                ):
                    logger.info(
                        f"Skipping test generation for '{display_name}' "
                        f"as it lacks a supported server-side execution "
                        f"implementation."
                    )
                    continue
            # Handle OpenAPI Toolsets
            else:
                template_args, _ = self._parse_openapi_toolset(
                    tool_dict, display_name
                )
        except Exception as e:
            logger.warning(
                f"Could not fetch tool schema for {display_name}: {e}"
            )
        self._write_tool_test_template(
            display_name,
            template_args,
            expected_returns,
            target_dir,
            overwrite,
            mined_data.get(display_name),
        )
```
### tool_tests_to_dataframe(results)
Converts tool test results to a pandas DataFrame for reporting.
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
def tool_tests_to_dataframe(
    self, results: List[Dict[str, Any]]
) -> pd.DataFrame:
    """Converts tool test results to a pandas DataFrame for reporting."""
    rows = []
    for res in results:
        errors = res.get("errors", [])
        error_str = "; ".join(errors) if errors else ""
        rows.append(
            {
                "test_name": res.get("test"),
                "tool": res.get("tool"),
                "status": res.get("status"),
                "latency (ms)": res.get("latency (ms)", 0.0),
                "app_display_name": res.get(
                    "app_display_name", "Unknown App"
                ),
                "tester": res.get("tester", "Unknown"),
                "errors": error_str,
            }
        )
    return pd.DataFrame(rows)
```
### generate_report(results_df)
Generates a summary report DataFrame capturing key metrics from tool evaluation results.
Source code in src/cxas_scrapi/evals/tool_evals.py
```python
@staticmethod
def generate_report(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates a summary report DataFrame capturing key metrics from tool
    evaluation results.
    """
    stats = ToolEvals._calculate_stats(results_df)
    report_data = {
        col: getattr(stats, col) for col in SUMMARY_SCHEMA_COLUMNS
    }
    return pd.DataFrame([report_data])
```
### Operator

Bases: `str`, `Enum`

Operators for testing expectations.

### ToolTestCase

Bases: `BaseModel`

Data model for a tool test case.

Each entry under a test case's `expectations` is itself a Pydantic model:

Bases: `BaseModel`

Data model for a single test expectation.