Skip to content

SimulationEvals

SimulationEvals runs AI-driven end-to-end conversation simulations against your CXAS agent. Instead of scripting exact utterances, you describe goals and success criteria — and a Gemini model figures out what to say at each turn to try to achieve them. This is a great way to test how your agent handles realistic, messy, unpredictable conversations.

Here are the key concepts:

  • Step (Pydantic model) — a single stage of a simulation, defined by a goal, success_criteria, an optional response_guide, and a max_turns limit. A step can also include a static_utterance when you want a fixed first message, and inject_variables to seed session state.
  • StepStatus enum — tracks whether each step is NOT_STARTED, IN_PROGRESS, or COMPLETED.
  • simulate_conversation() — drives the full multi-turn loop, returning an LLMUserConversation object that contains the transcript, step progress, and expectation results.
  • generate_report() — produces a SimulationReport with two DataFrames: goal progress and expectation results. It renders as styled HTML in a Jupyter notebook.

Quick Example

from cxas_scrapi import SimulationEvals
from cxas_scrapi.utils.rate_limiter import RateLimiter

# Full resource name of the app under test. SimulationEvals reads the
# project ID and location from the 2nd and 4th "/"-separated segments.
app_name = "projects/my-project/locations/us/apps/my-app-id"

# Optional: configure a rate limiter to pace simulation turns and prevent quota exhaustion
limiter = RateLimiter(requests_per_minute=30.0)
sim = SimulationEvals(app_name=app_name, rate_limiter=limiter)

# A test case combines goal-driven steps for the simulated user with
# free-text expectations that are evaluated once the conversation is over.
test_case = {
    "steps": [
        {
            "goal": "User wants to check their account balance",
            "success_criteria": "Agent provides a numeric balance and account status",
            "max_turns": 5,
        },
        {
            "goal": "User asks to dispute a charge",
            "success_criteria": "Agent acknowledges the dispute and provides a reference number",
            "max_turns": 8,
        },
    ],
    "expectations": [
        "The agent should never ask for the full credit card number",
        "The agent should offer to escalate if it cannot resolve the dispute",
    ],
}

# Run the simulation (console_logging=True prints the transcript as it runs)
conversation = sim.simulate_conversation(
    test_case=test_case,
    console_logging=True,
)

# View the report
report = conversation.generate_report()
print(report)  # Colorized in terminal, styled HTML in Jupyter

Reference

SimulationEvals

SimulationEvals(app_name, rate_limiter=None, **kwargs)

Bases: Apps

Wrapper class to simulate entire multi-turn conversations with a CXAS Agent.

Source code in src/cxas_scrapi/evals/simulation_evals.py
def __init__(
    self,
    app_name: str,
    rate_limiter: RateLimiter | None = None,
    **kwargs,
):
    """Build the session, tool and Gemini clients for a single app.

    Args:
        app_name: App resource name. Its second and fourth "/"-separated
            segments are used as the project ID and the location, e.g.
            "projects/<project>/locations/<location>/apps/<app>".
        rate_limiter: Optional limiter handed to the Sessions client.
        **kwargs: Forwarded unchanged to the parent initializer and to
            the Sessions and Tools helpers.

    Raises:
        IndexError: If ``app_name`` has fewer than four "/"-separated
            segments.
    """
    self.app_name = app_name

    # Split once; index 1 is the project ID and index 3 the location.
    segments = app_name.split("/")
    super().__init__(
        project_id=segments[1], location=segments[3], **kwargs
    )

    self.sessions_client = Sessions(
        app_name, rate_limiter=rate_limiter, **kwargs
    )
    tools_client = Tools(app_name=app_name, **kwargs)
    self.tools_map = tools_client.get_tools_map()

    # The Gemini client is deliberately not given the app's location:
    # Vertex AI requires a specific region (e.g. global), whereas CXAS
    # Apps use 'us' or 'eu'.
    # NOTE(review): self.project_id and self.creds are presumably set by
    # the parent initializer above - confirm.
    self.genai_client = GeminiGenerate(
        project_id=self.project_id,
        location="global",
        credentials=self.creds,
    )

simulate_conversation

simulate_conversation(test_case, sim_user_model=_DEFAULT_GEMINI_MODEL, eval_model=_DEFAULT_GEMINI_MODEL, session_id=None, console_logging=True, modality='text', background_noise_file=None, burst_noise_files=None, use_tool_fakes=False)

Runs the simulated conversation loop.

Parameters:

Name Type Description Default
test_case dict[str, Any]

The test case dictionary defining evaluation steps.

required
sim_user_model str | None

The Gemini model used for the simulated user.

_DEFAULT_GEMINI_MODEL
eval_model str | None

The Gemini model used for evaluating expectations.

_DEFAULT_GEMINI_MODEL
console_logging bool

Whether to print interaction transcript to the console.

True
Source code in src/cxas_scrapi/evals/simulation_evals.py
def simulate_conversation(
    self,
    test_case: dict[str, Any],
    sim_user_model: str | None = _DEFAULT_GEMINI_MODEL,
    eval_model: str | None = _DEFAULT_GEMINI_MODEL,
    session_id: str | None = None,
    console_logging: bool = True,
    modality: str = "text",
    background_noise_file: str | None = None,
    burst_noise_files: list[str] | None = None,
    use_tool_fakes: bool = False,
) -> LLMUserConversation:
    """Runs the simulated conversation loop.

    Alternates between the simulated user and the agent until the
    simulated user produces no further utterance, a request returns no
    response, or the agent ends the session. Expectations are then
    evaluated against the collected trace.

    Args:
        test_case: The test case dictionary defining evaluation steps.
        sim_user_model: The Gemini model used for the simulated user.
            A falsy value falls back to ``_DEFAULT_GEMINI_MODEL``.
        eval_model: The Gemini model used for evaluating expectations.
            A falsy value falls back to ``_DEFAULT_GEMINI_MODEL``.
        session_id: Session ID used for every request of this
            conversation. A random UUID4 string is generated when None.
        console_logging: Whether to print interaction transcript to
            the console.
        modality: Passed through to the request helper; defaults to
            "text".
        background_noise_file: Passed through unchanged to the request
            helper (presumably only meaningful for audio - confirm).
        burst_noise_files: Passed through unchanged to the request
            helper (presumably only meaningful for audio - confirm).
        use_tool_fakes: Passed through to the request helper as a
            keyword argument.

    Returns:
        The ``LLMUserConversation`` that drove the simulation, with its
        ``detailed_trace`` attribute set to the list of trace strings
        collected during the run.
    """
    # An explicit None (or empty string) selects the default model.
    sim_user_model = sim_user_model or _DEFAULT_GEMINI_MODEL
    eval_model = eval_model or _DEFAULT_GEMINI_MODEL
    if session_id is None:
        session_id = str(uuid.uuid4())
    eval_conv = LLMUserConversation(
        genai_client=self.genai_client,
        genai_model=sim_user_model,
        test_case=test_case,
    )

    if console_logging:
        print(
            f"Starting simulated conversation with session ID: {session_id}"
        )

    # Initialize the first turn manually
    user_utterance, variables = eval_conv.next_user_utterance()
    # Variables returned by the simulated user are merged across turns and
    # the merged dict is sent with every request.
    accumulated_variables = {}
    if variables:
        accumulated_variables.update(variables)

    detailed_trace = []
    # NOTE(review): unlike the append inside the loop, this one is not
    # guarded, so a falsy first utterance is still recorded in the trace.
    detailed_trace.append(f"User: {user_utterance}")

    while user_utterance:
        response = self._send_request_with_retry(
            session_id,
            user_utterance,
            accumulated_variables,
            modality,
            console_logging,
            background_noise_file,
            burst_noise_files,
            use_tool_fakes=use_tool_fakes,
        )
        # A falsy response stops the conversation; expectations are still
        # evaluated on whatever trace was collected so far.
        if not response:
            break

        if console_logging:
            self.sessions_client.parse_result(response)

        agent_text, trace_chunks, session_ended = (
            self._parse_agent_response(response)
        )
        detailed_trace.append("\n".join(trace_chunks))

        if session_ended:
            if agent_text:
                eval_conv._add_agent_response(agent_text)
            # Ensure the final agent response is evaluated
            # so that steps_progress is updated on session end.
            eval_conv._next_user_utterance()
            if console_logging:
                print(
                    "\nSession has been closed by the Agent via "
                    "end_session tool."
                )
            break

        # Get the next simulated user utterance based on the agent's
        # response
        user_utterance, variables = eval_conv.next_user_utterance(
            agent_text
        )
        if variables:
            accumulated_variables.update(variables)
        # A falsy utterance ends the loop at the next condition check and
        # is not added to the trace.
        if user_utterance:
            detailed_trace.append(f"User: {user_utterance}")

    if console_logging:
        self._print_completion_status(eval_conv)

    self._evaluate_expectations(
        eval_conv, detailed_trace, eval_model, console_logging
    )
    # Attach the raw trace so callers can inspect it on the returned object.
    eval_conv.detailed_trace = detailed_trace
    return eval_conv

run_simulations

run_simulations(test_cases, runs=1, parallel=1, sim_user_model=_DEFAULT_GEMINI_MODEL, eval_model=_DEFAULT_GEMINI_MODEL, modality='text', verbose=False, background_noise_file=None, burst_noise_files=None, use_tool_fakes=False)

Runs multiple simulations, optionally in parallel.

Parameters:

Name Type Description Default
test_cases list[dict[str, Any]]

List of test case dictionaries.

required
runs int

Number of runs per test case.

1
parallel int

Number of parallel workers (capped at 25).

1
sim_user_model str | None

Gemini model to use for simulated user.

_DEFAULT_GEMINI_MODEL
eval_model str | None

Gemini model to use for evaluating expectations.

_DEFAULT_GEMINI_MODEL
modality str

'text' or 'audio'.

'text'
verbose bool

Whether to log to console (only active if parallel=1).

False
use_tool_fakes bool

Use fake tools for the session if available.

False
Source code in src/cxas_scrapi/evals/simulation_evals.py
def run_simulations(
    self,
    test_cases: list[dict[str, Any]],
    runs: int = 1,
    parallel: int = 1,
    sim_user_model: str | None = _DEFAULT_GEMINI_MODEL,
    eval_model: str | None = _DEFAULT_GEMINI_MODEL,
    modality: str = "text",
    verbose: bool = False,
    background_noise_file: str | None = None,
    burst_noise_files: list[str] | None = None,
    use_tool_fakes: bool = False,
) -> list[dict[str, Any]]:
    """Run every test case one or more times and collect the results.

    Args:
        test_cases: List of test case dictionaries.
        runs: Number of runs per test case.
        parallel: Number of parallel workers (capped at 25).
        sim_user_model: Gemini model to use for simulated user. A falsy
            value falls back to ``_DEFAULT_GEMINI_MODEL``.
        eval_model: Gemini model to use for evaluating expectations. A
            falsy value falls back to ``_DEFAULT_GEMINI_MODEL``.
        modality: 'text' or 'audio'.
        verbose: Whether to log to console (only active if parallel=1).
        background_noise_file: Passed through unchanged to the result
            aggregation helper.
        burst_noise_files: Passed through unchanged to the result
            aggregation helper.
        use_tool_fakes: Use fake tools for the session if available.

    Returns:
        The list of result dictionaries produced by the aggregation
        helper.
    """
    # An explicit None (or empty string) selects the default model.
    if not sim_user_model:
        sim_user_model = _DEFAULT_GEMINI_MODEL
    if not eval_model:
        eval_model = _DEFAULT_GEMINI_MODEL

    prepared_jobs = self._prepare_simulation_jobs(test_cases, runs)

    # Kept positional and in this exact order for the aggregation helper.
    run_settings = (
        runs,
        parallel,
        sim_user_model,
        eval_model,
        modality,
        verbose,
        background_noise_file,
        burst_noise_files,
    )
    return self._aggregate_simulation_results(
        prepared_jobs, *run_settings, use_tool_fakes=use_tool_fakes
    )

export_results_to_golden

export_results_to_golden(results, output_path=None)

Exports simulation results to a Golden Evaluation YAML file.

Fetches the full conversation trace for each simulation from the platform to ensure accuracy.

Parameters:

Name Type Description Default
results list[dict[str, Any]]

The list of results returned by run_simulations.

required
output_path str | None

Optional local path to save the generated YAML.

None

Returns:

Type Description
str

The generated YAML string.

Source code in src/cxas_scrapi/evals/simulation_evals.py
def export_results_to_golden(
    self,
    results: list[dict[str, Any]],
    output_path: str | None = None,
) -> str:
    """Convert simulation results into Golden Evaluation YAML.

    The turns of each result are obtained through ``self._get_turns``;
    results for which it returns nothing are left out of the export.

    Args:
        results: The list of results returned by run_simulations.
        output_path: Optional local path to save the generated YAML.
            Nothing is written when it is None or empty.

    Returns:
        The generated YAML string.
    """
    golden_entries = []
    for result in results:
        turns = self._get_turns(result)
        if turns:
            golden_entries.append(
                GoldenConversation(
                    conversation=result.get("name", "Simulated_Conversation"),
                    turns=turns,
                    # Only the expectation text is exported.
                    expectations=[
                        detail["expectation"]
                        for detail in result.get("expectation_details", [])
                    ],
                    session_parameters=result.get("session_parameters", {}),
                )
            )

    golden_set = GoldenConversations(conversations=golden_entries)
    payload = golden_set.model_dump(exclude_none=True)
    # sort_keys=False keeps the model's field order in the output.
    yaml_text = yaml.dump(payload, sort_keys=False, allow_unicode=True)

    if not output_path:
        return yaml_text

    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(yaml_text)
    return yaml_text

Step

Bases: BaseModel

StepStatus

Bases: str, Enum

SimulationReport

SimulationReport(goals_df, expectations_df=None)

A report containing both Goals and Expectations DataFrames.

Source code in src/cxas_scrapi/evals/simulation_evals.py
def __init__(
    self,
    goals_df: pd.DataFrame,
    expectations_df: pd.DataFrame | None = None,
):
    self.goals_df = goals_df
    self.expectations_df = expectations_df