SimulationEvals

SimulationEvals runs AI-driven end-to-end conversation simulations against your CXAS agent. Instead of scripting exact utterances, you describe goals and success criteria — and a Gemini model figures out what to say at each turn to try to achieve them. This is a great way to test how your agent handles realistic, messy, unpredictable conversations.

Here are the key concepts:

  • Step (Pydantic model) — a single goal within a simulation, with a goal, success_criteria, optional response_guide, and a max_turns limit. Steps can also include a static_utterance when you want a fixed first message, and inject_variables for seeding session state (see the sketch after this list).
  • StepStatus enum — tracks whether each step is NOT_STARTED, IN_PROGRESS, or COMPLETED.
  • simulate_conversation() — drives the full multi-turn loop, returning an LLMUserConversation object that contains the transcript, step progress, and expectation results.
  • generate_report() — produces a SimulationReport with two DataFrames: goal progress and expectation results. It renders as styled HTML in a Jupyter notebook.
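
For instance, a step that pins the opening message and seeds session state might look like the sketch below. The field names come from the Step model described above; the specific values and variable keys are illustrative.

step = {
    "goal": "User wants to check their account balance",
    "success_criteria": "Agent provides a numeric balance and account status",
    "response_guide": "Stay polite, but push for a concrete number",  # optional hint for the simulated user
    "static_utterance": "Hi, what's my current balance?",  # fixed first message instead of a generated one
    "inject_variables": {"customer_tier": "gold"},  # illustrative session-state seed
    "max_turns": 5,
}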

Quick Example

from cxas_scrapi import SimulationEvals

app_name = "projects/my-project/locations/us/apps/my-app-id"
sim = SimulationEvals(app_name=app_name)

test_case = {
    "steps": [
        {
            "goal": "User wants to check their account balance",
            "success_criteria": "Agent provides a numeric balance and account status",
            "max_turns": 5,
        },
        {
            "goal": "User asks to dispute a charge",
            "success_criteria": "Agent acknowledges the dispute and provides a reference number",
            "max_turns": 8,
        },
    ],
    "expectations": [
        "The agent should never ask for the full credit card number",
        "The agent should offer to escalate if it cannot resolve the dispute",
    ],
}

# Run the simulation
conversation = sim.simulate_conversation(
    test_case=test_case,
    console_logging=True,
)

# View the report
report = conversation.generate_report()
print(report)  # Colorized in terminal, styled HTML in Jupyter
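
generate_report() returns a SimulationReport, so besides printing it you can inspect the underlying DataFrames directly (attribute names per the SimulationReport reference below):

# Inspect the raw DataFrames behind the report
print(report.goals_df)
if report.expectations_df is not None:
    print(report.expectations_df)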

Reference

SimulationEvals

SimulationEvals(app_name, **kwargs)

Bases: Apps

Wrapper class to simulate entire multi-turn conversations with a CXAS Agent.

Source code in src/cxas_scrapi/evals/simulation_evals.py
def __init__(self, app_name: str, **kwargs):
    self.app_name = app_name
    project_id = app_name.split("/")[1]
    location = app_name.split("/")[3]
    super().__init__(project_id=project_id, location=location, **kwargs)
    self.sessions_client = Sessions(app_name, **kwargs)
    self.tools_map = Tools(app_name=app_name, **kwargs).get_tools_map()

    # Vertex AI requires a specific region (e.g. global), whereas CXAS
    # Apps use 'us' or 'eu'
    vertex_location = "global"

    self.genai_client = GeminiGenerate(
        project_id=self.project_id,
        location=vertex_location,
        credentials=self.creds,
    )
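
The constructor derives project_id and location from the app_name resource path purely by position, so the path must follow the projects/<project>/locations/<location>/apps/<app> pattern:

app_name = "projects/my-project/locations/us/apps/my-app-id"
parts = app_name.split("/")
project_id = parts[1]  # "my-project"
location = parts[3]    # "us"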

simulate_conversation

simulate_conversation(test_case, model=_DEFAULT_GEMINI_MODEL, session_id=None, console_logging=True, modality='text')

Runs the simulated conversation loop.

Parameters:

  • test_case (Dict[str, Any]): The test case dictionary defining evaluation steps. Required.
  • model (str): The Gemini model used for evaluating turns. Default: _DEFAULT_GEMINI_MODEL.
  • session_id (Optional[str]): Session ID to use; a new UUID is generated when None. Default: None.
  • console_logging (bool): Whether to print the interaction transcript to the console. Default: True.
  • modality (str): 'text' or 'audio'. Default: 'text'.
Source code in src/cxas_scrapi/evals/simulation_evals.py
def simulate_conversation(
    self,
    test_case: Dict[str, Any],
    model: str = _DEFAULT_GEMINI_MODEL,
    session_id: Optional[str] = None,
    console_logging: bool = True,
    modality: str = "text",
) -> LLMUserConversation:
    """Runs the simulated conversation loop.

    Args:
        test_case: The test case dictionary defining evaluation steps.
        model: The Gemini model used for evaluating turns.
        console_logging: Whether to print interaction transcript to
            the console.
    """
    if session_id is None:
        session_id = str(uuid.uuid4())
    eval_conv = LLMUserConversation(
        genai_client=self.genai_client,
        genai_model=model,
        test_case=test_case,
    )

    if console_logging:
        print(
            f"Starting simulated conversation with session ID: {session_id}"
        )

    # Initialize the first turn manually
    user_utterance, variables = eval_conv.next_user_utterance()

    detailed_trace = []
    detailed_trace.append(f"User: {user_utterance}")

    while user_utterance:
        response = self._send_request_with_retry(
            session_id, user_utterance, variables, modality, console_logging
        )
        if not response:
            break

        if console_logging:
            self.sessions_client.parse_result(response)

        agent_text, trace_chunks, session_ended = (
            self._parse_agent_response(response)
        )
        detailed_trace.append("\n".join(trace_chunks))

        if session_ended:
            if console_logging:
                print(
                    "\nSession has been closed by the Agent via "
                    "end_session tool."
                )
            break

        # Get the next simulated user utterance based on the agent's
        # response
        user_utterance, variables = eval_conv.next_user_utterance(
            agent_text
        )
        if user_utterance:
            detailed_trace.append(f"User: {user_utterance}")

    if console_logging:
        self._print_completion_status(eval_conv)

    self._evaluate_expectations(
        eval_conv, detailed_trace, model, console_logging
    )
    eval_conv.detailed_trace = detailed_trace
    return eval_conv
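
A minimal usage sketch, reusing sim and test_case from the Quick Example; the session_id override and console_logging flag are optional, and detailed_trace is the list of per-turn strings built in the loop above:

import uuid

conversation = sim.simulate_conversation(
    test_case=test_case,
    session_id=str(uuid.uuid4()),  # supply your own session ID instead of the auto-generated one
    console_logging=False,         # suppress per-turn printing
)
print("\n".join(conversation.detailed_trace))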

run_simulations

run_simulations(test_cases, runs=1, parallel=1, model=_DEFAULT_GEMINI_MODEL, modality='text', verbose=False)

Runs multiple simulations, optionally in parallel.

Parameters:

  • test_cases (List[Dict[str, Any]]): List of test case dictionaries. Required.
  • runs (int): Number of runs per test case. Default: 1.
  • parallel (int): Number of parallel workers (capped at 25). Default: 1.
  • model (str): Gemini model to use. Default: _DEFAULT_GEMINI_MODEL.
  • modality (str): 'text' or 'audio'. Default: 'text'.
  • verbose (bool): Whether to log to console (only active if parallel=1). Default: False.
Source code in src/cxas_scrapi/evals/simulation_evals.py
def run_simulations(
    self,
    test_cases: List[Dict[str, Any]],
    runs: int = 1,
    parallel: int = 1,
    model: str = _DEFAULT_GEMINI_MODEL,
    modality: str = "text",
    verbose: bool = False,
) -> List[Dict[str, Any]]:
    """Runs multiple simulations, optionally in parallel.

    Args:
        test_cases: List of test case dictionaries.
        runs: Number of runs per test case.
        parallel: Number of parallel workers (capped at 25).
        model: Gemini model to use.
        modality: 'text' or 'audio'.
        verbose: Whether to log to console (only active if parallel=1).
    """
    jobs = self._prepare_simulation_jobs(test_cases, runs)
    return self._aggregate_simulation_results(
        jobs, runs, parallel, model, modality, verbose
    )
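
For example, to run each of two test cases three times across five parallel workers (a sketch; another_test_case is a hypothetical second test-case dictionary shaped like the one in the Quick Example):

results = sim.run_simulations(
    test_cases=[test_case, another_test_case],
    runs=3,       # each test case is simulated 3 times
    parallel=5,   # up to 5 workers (the method caps this at 25)
)
print(len(results))  # list of result dictionaries (see export_results_to_golden below)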

export_results_to_golden

export_results_to_golden(results, output_path=None)

Exports simulation results to a Golden Evaluation YAML file.

Fetches the full conversation trace for each simulation from the platform to ensure accuracy.

Parameters:

  • results (List[Dict[str, Any]]): The list of results returned by run_simulations. Required.
  • output_path (Optional[str]): Optional local path to save the generated YAML. Default: None.

Returns:

  • str: The generated YAML string.

Source code in src/cxas_scrapi/evals/simulation_evals.py
def export_results_to_golden(
    self,
    results: List[Dict[str, Any]],
    output_path: Optional[str] = None,
) -> str:
    """Exports simulation results to a Golden Evaluation YAML file.

    Fetches the full conversation trace for each simulation from the
    platform to ensure accuracy.

    Args:
        results: The list of results returned by run_simulations.
        output_path: Optional local path to save the generated YAML.

    Returns:
        The generated YAML string.
    """
    conversations_list = []

    for res in results:
        turns = self._get_turns(res)
        if not turns:
            continue

        expectations = [
            e["expectation"] for e in res.get("expectation_details", [])
        ]
        params = res.get("session_parameters", {})

        conversations_list.append(
            GoldenConversation(
                conversation=res.get("name", "Simulated_Conversation"),
                turns=turns,
                expectations=expectations,
                session_parameters=params,
            )
        )

    dataset = GoldenConversations(conversations=conversations_list)
    yaml_content = yaml.dump(
        dataset.model_dump(exclude_none=True),
        sort_keys=False,
        allow_unicode=True,
    )

    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(yaml_content)

    return yaml_content
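
Chained with run_simulations, the export might look like this (the filename is illustrative):

yaml_text = sim.export_results_to_golden(
    results=results,
    output_path="golden_conversations.yaml",  # also writes the YAML to disk
)
print(yaml_text)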

Step

Bases: BaseModel

StepStatus

Bases: str, Enum
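
A rough sketch of these two models based on the fields listed in the key concepts above; the exact types, defaults, and enum member values are assumptions, so treat the source file as authoritative:

from enum import Enum
from typing import Any, Dict, Optional

from pydantic import BaseModel


class StepStatus(str, Enum):
    # Member values are illustrative
    NOT_STARTED = "not_started"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"


class Step(BaseModel):
    goal: str
    success_criteria: str
    response_guide: Optional[str] = None
    static_utterance: Optional[str] = None
    inject_variables: Optional[Dict[str, Any]] = None
    max_turns: int = 5  # default is illustrative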

SimulationReport

SimulationReport(goals_df, expectations_df=None)

A report containing both Goals and Expectations DataFrames.

Source code in src/cxas_scrapi/evals/simulation_evals.py
def __init__(
    self,
    goals_df: pd.DataFrame,
    expectations_df: Optional[pd.DataFrame] = None,
):
    self.goals_df = goals_df
    self.expectations_df = expectations_df