ToolEvals

ToolEvals lets you write and run unit tests for your CXAS tools — without needing a full agent session. Tests are described in simple YAML files, which makes them easy to version-control alongside your tool code and review in pull requests.

Each test case specifies a tool name, the input args, optional variables (session state), and expectations that assert things about the response using the Operator enum:

equals: exact match
contains: substring or element check
greater_than / less_than: numeric comparison
length_equals / length_greater_than / length_less_than: collection size
is_null / is_not_null: presence check

run_tool_tests() returns a pandas DataFrame with columns for test name, tool, status, latency, and errors — easy to save as a CSV or display in a notebook.

Quick Example

from cxas_scrapi import ToolEvals

app_name = "projects/my-project/locations/us/apps/my-app-id"
te = ToolEvals(app_name=app_name)

# Load tests from a YAML file
test_cases = te.load_tool_test_cases_from_file("tool_tests/lookup_account.yaml")

# Run them and get a results DataFrame
results_df = te.run_tool_tests(test_cases)
print(results_df[["test_name", "tool", "status", "latency (ms)"]])

# Generate a summary report
report_df = ToolEvals.generate_report(results_df)
print(report_df)

A minimal YAML test file looks like this:

tests:
  - name: lookup_known_customer
    tool: lookup_account
    args:
      customer_id: "C-1234"
    expectations:
      response:
        - path: "$.account_status"
          operator: equals
          value: "active"
        - path: "$.balance"
          operator: is_not_null
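
Test cases can also seed session variables and assert on the variables the tool returns. The top-level variables block matches what run_tool_tests expects: a null value pulls the app-defined variable of the same name, while an inline value is used as mock session data. The nested expectations.variables key below is an assumption, mirroring expectations.response above, and the field names are illustrative:

tests:
  - name: lookup_with_session_state
    tool: lookup_account
    args:
      customer_id: "C-1234"
    variables:
      customer_profile: null   # null pulls the app-defined variable of the same name
      order_history:           # inline value is used as mock session data
        orders: ["O-1", "O-2"]
    expectations:
      response:
        - path: "$.account_status"
          operator: equals
          value: "active"
      variables:               # key name assumed by analogy with 'response' above
        - path: "$.customer_profile.last_lookup_status"
          operator: is_not_null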

Reference

ToolEvals

ToolEvals(app_name, creds=None, user_agent_extension=None)

Utility class for testing CXAS Tools.

Initializes the ToolEvals class.

Parameters:

app_name (str, required): CXAS App name (projects/{project}/locations/{location}/apps/{app}).
creds (Any, default None): Optional Google Cloud credentials.
user_agent_extension (str, default None): Optional user agent extension passed through to the underlying Tools and Variables clients.
Source code in src/cxas_scrapi/evals/tool_evals.py
def __init__(
    self,
    app_name: str,
    creds: Any = None,
    user_agent_extension: str = None,
):
    """Initializes the ToolEvals class.

    Args:
        app_name: CXAS App name
            (projects/{project}/locations/{location}/apps/{app}).
        creds: Optional Google Cloud credentials.
    """
    self.app_name = app_name

    parts = self.app_name.split("/")
    self.project_id = parts[1] if len(parts) > 1 else ""
    self.location = parts[3] if len(parts) > 3 else "us"

    self.creds = creds
    self.user_agent_extension = user_agent_extension
    self.tools_client = Tools(
        app_name=self.app_name,
        creds=self.creds,
        user_agent_extension=user_agent_extension,
    )
    self.var_client = Variables(
        app_name=self.app_name,
        creds=self.creds,
        user_agent_extension=user_agent_extension,
    )
    try:
        self.tool_map = self.tools_client.get_tools_map(reverse=True)
    except (AttributeError, KeyError, RuntimeError, ValueError) as e:
        logger.warning(
            "Failed to fetch tool map for %s: %s", self.app_name, e
        )
        self.tool_map = {}
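
creds is typed as Any and described only as optional Google Cloud credentials. A minimal sketch using an explicit service account key via the standard google-auth library follows; the key file path is hypothetical, and whether the downstream clients accept this credentials object directly is an assumption:

from google.oauth2 import service_account

from cxas_scrapi import ToolEvals

# Hypothetical key file; any google-auth credentials object is assumed to work here.
creds = service_account.Credentials.from_service_account_file("sa-key.json")

te = ToolEvals(
    app_name="projects/my-project/locations/us/apps/my-app-id",
    creds=creds,
    user_agent_extension="my-team-evals",  # forwarded to the underlying API clients
)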

load_tool_test_cases_from_file

load_tool_test_cases_from_file(test_file_path)

Loads tool tests from a YAML file.

Source code in src/cxas_scrapi/evals/tool_evals.py
def load_tool_test_cases_from_file(
    self, test_file_path: str
) -> List["ToolTestCase"]:
    """Loads tool tests from a YAML file."""
    with open(test_file_path, "r", encoding="utf-8") as f:
        return self.load_tool_test_cases_from_yaml(f.read())

load_tool_tests_from_dir

load_tool_tests_from_dir(directory_path='tool_tests')

Recursively loads all YAML tool tests from a directory.

Source code in src/cxas_scrapi/evals/tool_evals.py
def load_tool_tests_from_dir(
    self, directory_path: str = "tool_tests"
) -> List["ToolTestCase"]:
    """Recursively loads all YAML tool tests from a directory."""
    all_tests = []
    if not os.path.exists(directory_path):
        print(f"Directory {directory_path} does not exist.")
        return all_tests

    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".yaml") or file.endswith(".yml"):
                file_path = os.path.join(root, file)
                try:
                    tests = self.load_tool_test_cases_from_file(file_path)
                    all_tests.extend(tests)
                except Exception as e:
                    logger.error(f"Error loading {file_path}: {e}")

    return all_tests
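
A typical pattern is to keep one YAML file per tool under tool_tests/ and load everything in a single call before running:

all_tests = te.load_tool_tests_from_dir("tool_tests")
print(f"Loaded {len(all_tests)} test cases")
results_df = te.run_tool_tests(all_tests)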

load_tool_test_cases_from_yaml

load_tool_test_cases_from_yaml(yaml_data)

Loads tool tests from a YAML string.

Source code in src/cxas_scrapi/evals/tool_evals.py
def load_tool_test_cases_from_yaml(
    self, yaml_data: str
) -> List["ToolTestCase"]:
    """Loads tool tests from a YAML string."""
    raw_data = yaml.safe_load(yaml_data)
    if not raw_data or "tests" not in raw_data:
        return []

    return self.load_tool_test_cases_from_data(raw_data["tests"])
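
This is handy in notebooks, where the test definition can live in an inline string:

yaml_tests = """
tests:
  - name: inline_case
    tool: lookup_account
    args:
      customer_id: "C-1234"
    expectations:
      response:
        - path: "$.account_status"
          operator: equals
          value: "active"
"""
test_cases = te.load_tool_test_cases_from_yaml(yaml_tests)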

load_tool_test_cases_from_data

load_tool_test_cases_from_data(test_data)

Loads tool tests from a list of dictionaries.

Source code in src/cxas_scrapi/evals/tool_evals.py
def load_tool_test_cases_from_data(
    self, test_data: List[Dict[str, Any]]
) -> List["ToolTestCase"]:
    """Loads tool tests from a list of dictionaries."""
    # Pre-process data to handle VariableDeclaration objects
    cleaned_data = []
    for case in test_data:
        case_copy = case.copy()
        if "variables" in case_copy and isinstance(
            case_copy["variables"], dict
        ):
            cleaned_vars = {}
            for k, v in case_copy["variables"].items():
                cleaned_vars[k] = Variables.variable_to_dict(v)
            case_copy["variables"] = cleaned_vars
        cleaned_data.append(case_copy)

    adapter = TypeAdapter(List[ToolTestCase])
    return adapter.validate_python(cleaned_data)
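
Because the YAML loader simply forwards the tests list to this method, the dictionaries use the same shape as the YAML test files:

test_cases = te.load_tool_test_cases_from_data([
    {
        "name": "lookup_known_customer",
        "tool": "lookup_account",
        "args": {"customer_id": "C-1234"},
        "expectations": {
            "response": [
                {"path": "$.account_status", "operator": "equals", "value": "active"},
            ],
        },
    },
])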

validate_tool_test

validate_tool_test(test_case, tool_response)

Validates the tool response and variables against expectations.

Returns:

List[str]: List of error messages. Empty list if all expectations pass.

Source code in src/cxas_scrapi/evals/tool_evals.py
def validate_tool_test(
    self,
    test_case: "ToolTestCase",
    tool_response: Any,
) -> List[str]:
    """Validates the tool response and variables against expectations.

    Returns:
        List of error messages. Empty list if all expectations pass.
    """
    updated_variables = {}
    if isinstance(tool_response, dict) and "variables" in tool_response:
        updated_variables = tool_response["variables"]

    errors = []
    # Validate response
    for exp in test_case.response_expectations:
        resp_data = (
            tool_response.get("response")
            if isinstance(tool_response, dict)
            else tool_response
        )
        actual_value = self._get_value_at_path(resp_data, exp.path)
        if not self._check_expectation(actual_value, exp):
            errors.append(
                f"Response expectation failed: path='{exp.path}',"
                f" actual='{actual_value}', expected='{exp.value}',"
                f" operator='{exp.operator}'"
            )

    # Validate variables
    for exp in test_case.variable_expectations:
        actual_value = self._get_value_at_path(updated_variables, exp.path)
        if not self._check_expectation(actual_value, exp):
            errors.append(
                f"Variable expectation failed: path='{exp.path}',"
                f" actual='{actual_value}', expected='{exp.value}',"
                f" operator='{exp.operator}'"
            )

    return errors
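
Because validation is separate from execution, expectations can be checked against a canned payload without calling the live tool. The shape below (a dict with response and variables keys) mirrors what this method looks for; the values themselves are illustrative:

case = te.load_tool_test_cases_from_file("tool_tests/lookup_account.yaml")[0]

mock_response = {
    "response": {"account_status": "active", "balance": 120.50},
    "variables": {},
}
errors = te.validate_tool_test(case, mock_response)
print(errors)  # [] when every expectation passes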

run_tool_tests

run_tool_tests(test_cases, debug=False)

Runs a list of tool tests.

Returns:

DataFrame: A pandas DataFrame of results with status and errors.

Source code in src/cxas_scrapi/evals/tool_evals.py
def run_tool_tests(
    self, test_cases: List["ToolTestCase"], debug: bool = False
) -> pd.DataFrame:
    """Runs a list of tool tests.

    Returns:
        A pandas DataFrame of results with status and errors.
    """
    # Fetch and unwrap app variables once
    raw_app_vars = self.var_client.list_variables()
    app_vars_cache = {}
    for var in raw_app_vars:
        try:
            var_dict = MessageToDict(var._pb)
        except AttributeError:
            var_dict = MessageToDict(var)

        schema = var_dict.get("schema", {})
        actual_data = schema.get("default") or var_dict.get("value") or {}
        app_vars_cache[var.name] = actual_data

    # Fetch app metadata and user info once per run
    app_client = Apps(
        project_id=self.project_id,
        location=self.location,
        creds=self.creds,
        user_agent_extension=self.user_agent_extension
    )
    app = app_client.get_app(self.app_name)
    app_display_name = app.display_name if app else "Unknown App"
    tester_email = getattr(self.creds, "service_account_email", "Unknown")

    results = []
    for test_case in test_cases:
        print(f"Running test: {test_case.name} ({test_case.tool})")

        tool_id = self.tool_map.get(test_case.tool)
        if not tool_id:
            error = f"Tool '{test_case.tool}' not found in app."
            print(f"FAILED: {error}")
            results.append(
                {
                    "test": test_case.name,
                    "tool": test_case.tool,
                    "status": "FAILED",
                    "latency (ms)": 0.0,
                    "app_display_name": app_display_name,
                    "tester": tester_email,
                    "errors": [error],
                }
            )
            continue

        if "toolsets/" in tool_id and test_case.context:
            error = "Context can only be specified for python tools."
            print(f"FAILED: {error}")
            results.append(
                {
                    "test": test_case.name,
                    "tool": test_case.tool,
                    "status": "FAILED",
                    "latency (ms)": 0.0,
                    "app_display_name": app_display_name,
                    "tester": tester_email,
                    "errors": [error],
                }
            )
            continue

        # Filter and merge variables for this specific test case
        final_variables = {}
        for var_name, custom_val in test_case.variables.items():
            if custom_val is None:
                # User requested an existing app variable by name
                if var_name in app_vars_cache:
                    final_variables[var_name] = app_vars_cache[var_name]
                else:
                    print(
                        f"[WARNING] App variable '{var_name}' requested "
                        f"but not found in app."
                    )
            else:
                # User provided their own custom mock data
                final_variables[var_name] = custom_val

        latency_ms = 0.0
        tool_response = None
        try:
            if debug:
                print(f"[DEBUG] Executing tool: {test_case.tool}")
                print(f"[DEBUG] Tool ID: {tool_id}")
                print(f"[DEBUG] Args: {test_case.args}")
                print(f"[DEBUG] Variables: {final_variables}")

            start_time = time.perf_counter()
            tool_response = self.tools_client.execute_tool(
                tool_display_name=test_case.tool,
                args=test_case.args,
                variables=final_variables,
                context=test_case.context,
            )
            end_time = time.perf_counter()
            latency_ms = (end_time - start_time) * 1000

            if debug:
                print(f"[DEBUG] Tool Response: {tool_response}")

            errors = self.validate_tool_test(test_case, tool_response)
            status = "PASSED"
            if errors:
                status = "FAILED"

            print(f"{status}: {test_case.tool} --> {test_case.name}")
            if errors:
                print(errors)

            results.append(
                {
                    "test": test_case.name,
                    "tool": test_case.tool,
                    "status": status,
                    "latency (ms)": latency_ms,
                    "app_display_name": app_display_name,
                    "tester": tester_email,
                    "errors": errors,
                    "response": tool_response,
                }
            )

        except Exception as e:
            # Catch *all* exceptions so the entire test loop doesn't fail
            print(f"ERROR: Exception occurred during test execution: {e}")
            results.append(
                {
                    "test": test_case.name,
                    "tool": test_case.tool,
                    "status": "ERROR",
                    "latency (ms)": latency_ms,
                    "app_display_name": app_display_name,
                    "tester": tester_email,
                    "errors": [str(e)],
                    "response": tool_response,
                }
            )

        print("-" * 30)

    return self.tool_tests_to_dataframe(results)
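
The returned DataFrame can be filtered and exported like any other pandas frame, for example to surface only failing cases and archive the full run:

failed = results_df[results_df["status"] != "PASSED"]
print(failed[["test_name", "tool", "errors"]])

results_df.to_csv("tool_test_results.csv", index=False)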

generate_tool_tests

generate_tool_tests(target_dir='tool_tests', include_tools=None, exclude_tools=None, overwrite=False, mine_tool_data=False, mine_conversations_limit=50)

Generates configurable YAML test templates for tools defined in the app.

Parses the application's OpenAPI tool schemas or underlying Python functions to intelligently scaffold the request arguments and expected responses.

Parameters:

target_dir (str, default 'tool_tests'): The directory path where the generated YAML files will be saved.
include_tools (Optional[List[str]], default None): An optional list of tool display names to restrict the generation. If None, all tools in the app are evaluated.
exclude_tools (Optional[List[str]], default None): An optional list of tool display names (or prefixes) to exclude from generation. Matches if a tool's display name starts with any string in this list.
overwrite (bool, default False): If True, existing YAML test templates in the target directory will be overwritten. If False, existing files are skipped.
mine_tool_data (bool, default False): If True, queries recent conversations to populate generated tests with real tool payload arguments.
mine_conversations_limit (int, default 50): The maximum number of conversations to scan when mining real tool arguments.
Source code in src/cxas_scrapi/evals/tool_evals.py
def generate_tool_tests(
    self,
    target_dir: str = "tool_tests",
    include_tools: Optional[List[str]] = None,
    exclude_tools: Optional[List[str]] = None,
    overwrite: bool = False,
    mine_tool_data: bool = False,
    mine_conversations_limit: int = 50,
) -> None:
    """Generates configurable YAML test templates for tools defined in
    the app.

    Parses the application's OpenAPI tool schemas or Python underlying
    functions to try and intelligently scaffold the request arguments and
    expected responses.

    Args:
        target_dir: The directory path where the generated YAML files will
            be saved. Defaults to 'tool_tests'.
        include_tools: An optional list of tool display names to restrict
            the generation. If None, all tools in the app are evaluated.
        exclude_tools: An optional list of tool display names (or prefixes)
            to exclude from generation. Matches if a tool's display name
            starts with any string in this list.
        overwrite: If True, existing YAML test templates in the target
            directory will be overwritten. If False, existing files are
            skipped.
        mine_tool_data: If True, queries recent conversations to populate
            generated tests with real tool payload arguments.
        mine_conversations_limit: The maximum number of conversations to
            scan when mining real tool arguments.
    """
    os.makedirs(target_dir, exist_ok=True)

    mined_data = {}
    if mine_tool_data:
        logger.info("Mining tool data from recent conversations...")
        mined_data = self._mine_tool_data(mine_conversations_limit)

    for display_name, tool_id in self.tool_map.items():
        if include_tools and display_name not in include_tools:
            continue

        if exclude_tools and any(
            display_name.startswith(ex) for ex in exclude_tools
        ):
            continue

        template_args = {}
        expected_returns = []
        # Try to build template args based on schema
        try:
            actual_tool_id = tool_id
            if "toolsets/" in tool_id and "/tools/" in tool_id:
                # For tools inside a toolset, we need the toolset object
                # to get the schema
                actual_tool_id, _ = tool_id.split("/tools/")

            tool_obj = self.tools_client.get_tool(actual_tool_id)
            tool_dict = (
                type(tool_obj).to_dict(tool_obj)
                if not isinstance(tool_obj, dict)
                else tool_obj
            )

            # Handle Python Tools
            if "toolsets/" not in tool_id:
                if "python_function" in tool_dict:
                    template_args, expected_returns = (
                        self._parse_python_function(tool_dict)
                    )
                elif not any(
                    key in tool_dict
                    for key in (
                        "data_store_spec",
                        "data_store_tool",
                        "google_search_tool",
                    )
                ):
                    logger.info(
                        f"Skipping test generation for '{display_name}' "
                        f"as it lacks a supported server-side execution "
                        f"implementation."
                    )
                    continue

            # Handle OpenAPI Toolsets
            else:
                template_args, _ = self._parse_openapi_toolset(
                    tool_dict, display_name
                )

        except Exception as e:
            logger.warning(
                f"Could not fetch tool schema for {display_name}: {e}"
            )

        self._write_tool_test_template(
            display_name,
            template_args,
            expected_returns,
            target_dir,
            overwrite,
            mined_data.get(display_name),
        )
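
For example, to scaffold templates for every tool except those prefixed internal_ (an illustrative prefix) and seed the args with data mined from recent conversations:

te.generate_tool_tests(
    target_dir="tool_tests",
    exclude_tools=["internal_"],
    mine_tool_data=True,
    mine_conversations_limit=25,
)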

tool_tests_to_dataframe

tool_tests_to_dataframe(results)

Converts tool test results to a pandas DataFrame for reporting.

Source code in src/cxas_scrapi/evals/tool_evals.py
def tool_tests_to_dataframe(
    self, results: List[Dict[str, Any]]
) -> pd.DataFrame:
    """Converts tool test results to a pandas DataFrame for reporting."""
    rows = []
    for res in results:
        errors = res.get("errors", [])
        error_str = "; ".join(errors) if errors else ""
        rows.append(
            {
                "test_name": res.get("test"),
                "tool": res.get("tool"),
                "status": res.get("status"),
                "latency (ms)": res.get("latency (ms)", 0.0),
                "app_display_name": res.get(
                    "app_display_name", "Unknown App"
                ),
                "tester": res.get("tester", "Unknown"),
                "errors": error_str,
            }
        )
    return pd.DataFrame(rows)

generate_report staticmethod

generate_report(results_df)

Generates a summary report DataFrame capturing key metrics from tool evaluation results.

Source code in src/cxas_scrapi/evals/tool_evals.py
@staticmethod
def generate_report(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates a summary report DataFrame capturing key metrics from tool
    evaluation results.
    """
    stats = ToolEvals._calculate_stats(results_df)

    report_data = {
        col: getattr(stats, col) for col in SUMMARY_SCHEMA_COLUMNS
    }
    return pd.DataFrame([report_data])

Operator

Bases: str, Enum

Operators for testing expectations.

ToolTestCase

Bases: BaseModel

Data model for a tool test case.

Expectation

Bases: BaseModel

Data model for a single test expectation.