ToolEvals¶

ToolEvals lets you write and run unit tests for your CXAS tools — without needing a full agent session. Tests are described in simple YAML files, which makes them easy to version-control alongside your tool code and review in pull requests.

Each test case specifies a tool name, the input args, optional variables (session state), and expectations that assert on the tool's response — and, optionally, on the variables it returns — using the Operator enum:

Operator	Meaning
`equals`	exact match
`contains`	substring or element check
`greater_than` / `less_than`	numeric comparison
`length_equals` / `length_greater_than` / `length_less_than`	collection size
`is_null` / `is_not_null`	presence check

run_tool_tests() returns a pandas DataFrame with columns for test name, tool, status, latency, app display name, tester, and errors — easy to save as a CSV or display in a notebook.

Quick Example¶

from cxas_scrapi import ToolEvals

# Fully qualified CXAS app resource name:
# projects/{project}/locations/{location}/apps/{app}
app_name = "projects/my-project/locations/us/apps/my-app-id"
te = ToolEvals(app_name=app_name)

# Load the test cases declared under the top-level `tests:` key of a YAML file
test_cases = te.load_tool_test_cases_from_file("tool_tests/lookup_account.yaml")

# Run them and get a results DataFrame (one row per test case)
results_df = te.run_tool_tests(test_cases)
print(results_df[["test_name", "tool", "status", "latency (ms)"]])

# Generate a one-row summary report from the per-test results
report_df = ToolEvals.generate_report(results_df)
print(report_df)

A minimal YAML test file looks like this:

tests:
  - name: lookup_known_customer
    tool: lookup_account
    args:
      customer_id: "C-1234"
    expectations:
      response:
        - path: "$.account_status"
          operator: equals
          value: "active"
        - path: "$.balance"
          operator: is_not_null

Reference¶

ToolEvals ¶

ToolEvals(app_name, creds=None, user_agent_extension=None)

Utility class for testing CXAS Tools.

Initializes the ToolEvals class.

Parameters:

Name	Type	Description	Default
`app_name`	`str`	CXAS App name (projects/{project}/locations/{location}/apps/{app}).	required
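`creds`	`Any`	Optional Google Cloud credentials.	`None`
`user_agent_extension`	`str \| None`	Optional user-agent extension forwarded to the underlying Tools and Variables clients.	`None`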

Source code in src/cxas_scrapi/evals/tool_evals.py

def __init__(
    self,
    app_name: str,
    creds: Any = None,
    user_agent_extension: str | None = None,
):
    """Set up the clients and tool lookup used to evaluate CXAS tools.

    Args:
        app_name: CXAS App name
            (projects/{project}/locations/{location}/apps/{app}).
        creds: Optional Google Cloud credentials.
        user_agent_extension: Optional value stored on the instance and
            forwarded to the Tools and Variables clients.
    """
    self.app_name = app_name
    self.creds = creds
    self.user_agent_extension = user_agent_extension

    # The resource name alternates labels and values, so the project id
    # sits at index 1 and the location at index 3. Names that are too
    # short fall back to "" and "us" respectively.
    segments = app_name.split("/")
    self.project_id = segments[1] if len(segments) > 1 else ""
    self.location = segments[3] if len(segments) > 3 else "us"

    # Both clients are configured identically.
    client_kwargs = {
        "app_name": app_name,
        "creds": creds,
        "user_agent_extension": user_agent_extension,
    }
    self.tools_client = Tools(**client_kwargs)
    self.var_client = Variables(**client_kwargs)

    # A failed lookup is not fatal: log it and continue with an empty map.
    try:
        tool_map = self.tools_client.get_tools_map(reverse=True)
    except (AttributeError, KeyError, RuntimeError, ValueError) as e:
        logger.warning(
            "Failed to fetch tool map for %s: %s", self.app_name, e
        )
        tool_map = {}
    self.tool_map = tool_map

load_tool_test_cases_from_file ¶

load_tool_test_cases_from_file(test_file_path)

Loads tool tests from a YAML file.

Source code in src/cxas_scrapi/evals/tool_evals.py

def load_tool_test_cases_from_file(
    self, test_file_path: str
) -> list["ToolTestCase"]:
    """Read a UTF-8 YAML file and parse the tool tests it contains."""
    with open(test_file_path, encoding="utf-8") as yaml_file:
        yaml_text = yaml_file.read()
    # Parsing is delegated to the string-based loader.
    return self.load_tool_test_cases_from_yaml(yaml_text)

load_tool_tests_from_dir ¶

load_tool_tests_from_dir(directory_path='tool_tests')

Recursively loads all YAML tool tests from a directory.

Source code in src/cxas_scrapi/evals/tool_evals.py

def load_tool_tests_from_dir(
    self, directory_path: str = "tool_tests"
) -> list["ToolTestCase"]:
    """Collect tool tests from every YAML file beneath a directory.

    The tree is walked recursively and every file ending in ``.yaml`` or
    ``.yml`` is parsed. A file that fails to load is logged and skipped so
    the remaining files are still processed. If the directory does not
    exist, a notice is printed and an empty list is returned.
    """
    collected = []
    if not os.path.exists(directory_path):
        print(f"Directory {directory_path} does not exist.")
        return collected

    for root, _, files in os.walk(directory_path):
        yaml_paths = (
            os.path.join(root, name)
            for name in files
            if name.endswith((".yaml", ".yml"))
        )
        for file_path in yaml_paths:
            try:
                collected.extend(
                    self.load_tool_test_cases_from_file(file_path)
                )
            except Exception as e:
                logger.error(f"Error loading {file_path}: {e}")

    return collected

load_tool_test_cases_from_yaml ¶

load_tool_test_cases_from_yaml(yaml_data)

Loads tool tests from a YAML string.

Source code in src/cxas_scrapi/evals/tool_evals.py

def load_tool_test_cases_from_yaml(
    self, yaml_data: str
) -> list["ToolTestCase"]:
    """Loads tool tests from a YAML string.

    Args:
        yaml_data: YAML text whose top-level mapping holds the list of
            test cases under the ``tests`` key.

    Returns:
        The parsed test cases, or an empty list when the document is
        empty, is not a mapping, or has a missing or null ``tests`` entry.
    """
    raw_data = yaml.safe_load(yaml_data)

    # Only a mapping can carry a "tests" key. Checking the type first
    # avoids a TypeError on scalar documents (e.g. a bare number) and
    # avoids a substring match when the document is a plain string.
    if not isinstance(raw_data, dict):
        return []

    # `tests:` with no value parses to None; treat it like a missing key
    # instead of passing None on to be iterated.
    tests = raw_data.get("tests")
    if tests is None:
        return []

    return self.load_tool_test_cases_from_data(tests)

load_tool_test_cases_from_data ¶

load_tool_test_cases_from_data(test_data)

Loads tool tests from a list of dictionaries.

Source code in src/cxas_scrapi/evals/tool_evals.py

def load_tool_test_cases_from_data(
    self, test_data: list[dict[str, Any]]
) -> list["ToolTestCase"]:
    """Build validated ``ToolTestCase`` objects from raw dictionaries.

    Works on a shallow copy of each case. When a case's ``variables``
    entry is a dict, every value in it is passed through
    ``Variables.variable_to_dict`` before the whole list is validated.
    """
    normalized = []
    for raw_case in test_data:
        case = raw_case.copy()
        variables = case.get("variables")
        if isinstance(variables, dict):
            # Replace the mapping on the copy only.
            case["variables"] = {
                name: Variables.variable_to_dict(value)
                for name, value in variables.items()
            }
        normalized.append(case)

    return TypeAdapter(list[ToolTestCase]).validate_python(normalized)

validate_tool_test ¶

validate_tool_test(test_case, tool_response)

Validates the tool response and variables against expectations.

Returns:

Type	Description
`list[str]`	List of error messages. Empty list if all expectations pass.

Source code in src/cxas_scrapi/evals/tool_evals.py

def validate_tool_test(
    self,
    test_case: "ToolTestCase",
    tool_response: Any,
) -> list[str]:
    """Check a tool response against a test case's expectations.

    Response expectations are evaluated against the ``response`` entry of
    a dict ``tool_response``, or against ``tool_response`` itself when it
    is not a dict. Variable expectations are evaluated against the
    ``variables`` entry of a dict response, or an empty dict otherwise.

    Returns:
        Failure messages, response failures first. Empty when every
        expectation passes.
    """
    # Resolve both payloads once up front; neither changes per expectation.
    if isinstance(tool_response, dict):
        response_payload = tool_response.get("response")
        variable_payload = tool_response.get("variables", {})
    else:
        response_payload = tool_response
        variable_payload = {}

    failures = []

    for exp in test_case.response_expectations:
        actual_value = self._get_value_at_path(response_payload, exp.path)
        if self._check_expectation(actual_value, exp):
            continue
        failures.append(
            f"Response expectation failed: path='{exp.path}',"
            f" actual='{actual_value}', expected='{exp.value}',"
            f" operator='{exp.operator}'"
        )

    for exp in test_case.variable_expectations:
        actual_value = self._get_value_at_path(variable_payload, exp.path)
        if self._check_expectation(actual_value, exp):
            continue
        failures.append(
            f"Variable expectation failed: path='{exp.path}',"
            f" actual='{actual_value}', expected='{exp.value}',"
            f" operator='{exp.operator}'"
        )

    return failures

run_tool_tests ¶

run_tool_tests(test_cases, debug=False)

Runs a list of tool tests.

Returns:

Type	Description
`DataFrame`	A pandas DataFrame of results with status and errors.

Source code in src/cxas_scrapi/evals/tool_evals.py

def run_tool_tests(
    self, test_cases: list["ToolTestCase"], debug: bool = False
) -> pd.DataFrame:
    """Runs a list of tool tests.

    App variables, app metadata and the tester identity are fetched once
    and reused for every test case. Each test case produces exactly one
    result row whose status is ``PASSED``, ``FAILED`` (unknown tool,
    context supplied for a toolset tool, or unmet expectations) or
    ``ERROR`` (an exception was raised while executing or validating).

    Args:
        test_cases: Test cases to execute.
        debug: If True, prints the tool id, args, variables and raw
            response for each executed test.

    Returns:
        A pandas DataFrame of results with status and errors.
    """
    # Fetch and unwrap app variables once
    app_vars_cache = {}
    for var in self.var_client.list_variables():
        # Presumably `var` is a wrapper exposing the raw message as `_pb`;
        # fall back to the object itself when that attribute is missing.
        try:
            var_dict = MessageToDict(var._pb)
        except AttributeError:
            var_dict = MessageToDict(var)

        # Prefer the schema default, then the explicit value, then {}.
        # Compare against None rather than truthiness so legitimate falsy
        # values (0, False, "", []) are not silently replaced by {}.
        actual_data = var_dict.get("schema", {}).get("default")
        if actual_data is None:
            actual_data = var_dict.get("value")
        if actual_data is None:
            actual_data = {}
        app_vars_cache[var.name] = actual_data

    # Fetch app metadata and user info once per run
    app_client = Apps(
        project_id=self.project_id,
        location=self.location,
        creds=self.creds,
        user_agent_extension=self.user_agent_extension,
    )
    app = app_client.get_app(self.app_name)
    app_display_name = app.display_name if app else "Unknown App"
    tester_email = getattr(self.creds, "service_account_email", "Unknown")

    def make_result(test_case, status, latency_ms, errors, **extra):
        """Build one result row; `extra` carries optional keys."""
        return {
            "test": test_case.name,
            "tool": test_case.tool,
            "status": status,
            "latency (ms)": latency_ms,
            "app_display_name": app_display_name,
            "tester": tester_email,
            "errors": errors,
            **extra,
        }

    results = []
    for test_case in test_cases:
        print(f"Running test: {test_case.name} ({test_case.tool})")

        # Pre-flight checks: fail fast without calling the tool.
        tool_id = self.tool_map.get(test_case.tool)
        if not tool_id:
            error = f"Tool '{test_case.tool}' not found in app."
        elif "toolsets/" in tool_id and test_case.context:
            error = "Context can only be specified for python tools."
        else:
            error = None

        if error:
            print(f"FAILED: {error}")
            results.append(make_result(test_case, "FAILED", 0.0, [error]))
            continue

        # Filter and merge variables for this specific test case
        final_variables = {}
        for var_name, custom_val in test_case.variables.items():
            if custom_val is None:
                # User requested an existing app variable by name
                if var_name in app_vars_cache:
                    final_variables[var_name] = app_vars_cache[var_name]
                else:
                    print(
                        f"[WARNING] App variable '{var_name}' requested "
                        f"but not found in app."
                    )
            else:
                # User provided their own custom mock data
                final_variables[var_name] = custom_val

        latency_ms = 0.0
        tool_response = None
        try:
            if debug:
                print(f"[DEBUG] Executing tool: {test_case.tool}")
                print(f"[DEBUG] Tool ID: {tool_id}")
                print(f"[DEBUG] Args: {test_case.args}")
                print(f"[DEBUG] Variables: {final_variables}")

            # Latency covers only the execute_tool call itself.
            start_time = time.perf_counter()
            tool_response = self.tools_client.execute_tool(
                tool_display_name=test_case.tool,
                args=test_case.args,
                variables=final_variables,
                context=test_case.context,
            )
            latency_ms = (time.perf_counter() - start_time) * 1000

            if debug:
                print(f"[DEBUG] Tool Response: {tool_response}")

            errors = self.validate_tool_test(test_case, tool_response)
            status = "FAILED" if errors else "PASSED"

            print(f"{status}: {test_case.tool} --> {test_case.name}")
            if errors:
                print(errors)

            results.append(
                make_result(
                    test_case,
                    status,
                    latency_ms,
                    errors,
                    response=tool_response,
                )
            )

        except Exception as e:
            # Catch *all* exceptions so the entire test loop doesn't fail
            print(f"ERROR: Exception occurred during test execution: {e}")
            results.append(
                make_result(
                    test_case,
                    "ERROR",
                    latency_ms,
                    [str(e)],
                    response=tool_response,
                )
            )

        print("-" * 30)

    return self.tool_tests_to_dataframe(results)

generate_tool_tests ¶

generate_tool_tests(target_dir='tool_tests', include_tools=None, exclude_tools=None, overwrite=False, mine_tool_data=False, mine_conversations_limit=50)

Generates configurable YAML test templates for tools defined in the app.

Parses the application's OpenAPI tool schemas or Python underlying functions to try and intelligently scaffold the request arguments and expected responses.

Parameters:

Name	Type	Description	Default
`target_dir`	`str`	The directory path where the generated YAML files will be saved. Defaults to 'tool_tests'.	`'tool_tests'`
`include_tools`	`list[str] \| None`	An optional list of tool display names to restrict the generation. If None, all tools in the app are evaluated.	`None`
`exclude_tools`	`list[str] \| None`	An optional list of tool display names (or prefixes) to exclude from generation. Matches if a tool's display name starts with any string in this list.	`None`
`overwrite`	`bool`	If True, existing YAML test templates in the target directory will be overwritten. If False, existing files are skipped.	`False`
`mine_tool_data`	`bool`	If True, queries recent conversations to populate generated tests with real tool payload arguments.	`False`
`mine_conversations_limit`	`int`	The maximum number of conversations to scan when mining real tool arguments.	`50`

Source code in src/cxas_scrapi/evals/tool_evals.py

def generate_tool_tests(
    self,
    target_dir: str = "tool_tests",
    include_tools: list[str] | None = None,
    exclude_tools: list[str] | None = None,
    overwrite: bool = False,
    mine_tool_data: bool = False,
    mine_conversations_limit: int = 50,
) -> None:
    """Generates configurable YAML test templates for tools defined in
    the app.

    Parses the application's OpenAPI tool schemas or Python underlying
    functions to try and intelligently scaffold the request arguments and
    expected responses.

    Args:
        target_dir: The directory path where the generated YAML files will
            be saved. Defaults to 'tool_tests'.
        include_tools: An optional list of tool display names to restrict
            the generation. If None, all tools in the app are evaluated.
        exclude_tools: An optional list of tool display names (or prefixes)
            to exclude from generation. Matches if a tool's display name
            starts with any string in this list.
        overwrite: If True, existing YAML test templates in the target
            directory will be overwritten. If False, existing files are
            skipped.
        mine_tool_data: If True, queries recent conversations to populate
            generated tests with real tool payload arguments.
        mine_conversations_limit: The maximum number of conversations to
            scan when mining real tool arguments.
    """
    os.makedirs(target_dir, exist_ok=True)

    mined_data = {}
    if mine_tool_data:
        logger.info("Mining tool data from recent conversations...")
        mined_data = self._mine_tool_data(mine_conversations_limit)

    for display_name, tool_id in self.tool_map.items():
        if include_tools and display_name not in include_tools:
            continue

        if exclude_tools and any(
            display_name.startswith(ex) for ex in exclude_tools
        ):
            continue

        template_args = {}
        expected_returns = []
        # Try to build template args based on schema
        try:
            actual_tool_id = tool_id
            if "toolsets/" in tool_id and "/tools/" in tool_id:
                # For tools inside a toolset, we need the toolset object
                # to get the schema
                actual_tool_id, _ = tool_id.split("/tools/")

            tool_obj = self.tools_client.get_tool(actual_tool_id)
            tool_dict = (
                type(tool_obj).to_dict(tool_obj)
                if not isinstance(tool_obj, dict)
                else tool_obj
            )

            # Handle Python Tools
            if "toolsets/" not in tool_id:
                if "python_function" in tool_dict:
                    template_args, expected_returns = (
                        self._parse_python_function(tool_dict)
                    )
                elif not any(
                    key in tool_dict
                    for key in (
                        "data_store_spec",
                        "data_store_tool",
                        "google_search_tool",
                    )
                ):
                    logger.info(
                        f"Skipping test generation for '{display_name}' "
                        f"as it lacks a supported server-side execution "
                        f"implementation."
                    )
                    continue

            # Handle OpenAPI Toolsets
            else:
                template_args, _ = self._parse_openapi_toolset(
                    tool_dict, display_name
                )

        except Exception as e:
            logger.warning(
                f"Could not fetch tool schema for {display_name}: {e}"
            )

        self._write_tool_test_template(
            display_name,
            template_args,
            expected_returns,
            target_dir,
            overwrite,
            mined_data.get(display_name),
        )

tool_tests_to_dataframe ¶

tool_tests_to_dataframe(results)

Converts tool test results to a pandas DataFrame for reporting.

Source code in src/cxas_scrapi/evals/tool_evals.py

def tool_tests_to_dataframe(
    self, results: list[dict[str, Any]]
) -> pd.DataFrame:
    """Converts tool test results to a pandas DataFrame for reporting.

    Args:
        results: Result dictionaries with the keys ``test``, ``tool``,
            ``status``, ``latency (ms)``, ``app_display_name``, ``tester``
            and ``errors``. Missing keys fall back to defaults; any other
            keys (such as ``response``) are ignored.

    Returns:
        A DataFrame with one row per result. The columns are always
        present, even when ``results`` is empty, so callers can select
        them without first checking for an empty run.
    """
    columns = [
        "test_name",
        "tool",
        "status",
        "latency (ms)",
        "app_display_name",
        "tester",
        "errors",
    ]
    rows = []
    for res in results:
        errors = res.get("errors") or []
        rows.append(
            {
                "test_name": res.get("test"),
                "tool": res.get("tool"),
                "status": res.get("status"),
                "latency (ms)": res.get("latency (ms)", 0.0),
                "app_display_name": res.get(
                    "app_display_name", "Unknown App"
                ),
                "tester": res.get("tester", "Unknown"),
                # str() keeps the join safe if an entry is not a string.
                "errors": "; ".join(str(err) for err in errors),
            }
        )
    # Passing `columns` pins the schema so an empty run still yields the
    # expected columns instead of a column-less frame.
    return pd.DataFrame(rows, columns=columns)

generate_report `staticmethod` ¶

generate_report(results_df)

Generates a summary report DataFrame capturing key metrics from tool evaluation results.

Source code in src/cxas_scrapi/evals/tool_evals.py

@staticmethod
def generate_report(results_df: pd.DataFrame) -> pd.DataFrame:
    """Summarize tool evaluation results as a one-row DataFrame.

    The row holds one value per name in ``SUMMARY_SCHEMA_COLUMNS``, each
    read as an attribute of the stats object returned by
    ``ToolEvals._calculate_stats``.
    """
    summary = ToolEvals._calculate_stats(results_df)

    row = {}
    for column in SUMMARY_SCHEMA_COLUMNS:
        row[column] = getattr(summary, column)
    return pd.DataFrame([row])

Operator ¶

Bases: str, Enum

Operators for testing expectations.

ToolTestCase ¶

Bases: BaseModel

Data model for a tool test case.

Expectation ¶

Bases: BaseModel

Data model for a single test expectation.

ToolEvals¶

Quick Example¶

Reference¶

ToolEvals ¶

load_tool_test_cases_from_file ¶

load_tool_tests_from_dir ¶

load_tool_test_cases_from_yaml ¶

load_tool_test_cases_from_data ¶

validate_tool_test ¶

run_tool_tests ¶

generate_tool_tests ¶

tool_tests_to_dataframe ¶

generate_report staticmethod ¶

Operator ¶

ToolTestCase ¶

Expectation ¶

generate_report `staticmethod` ¶