# Copyright 2025 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Multi-turn Chat Eval Recipe
This Eval Recipe demonstrates how to compare the quality of chat responses from two versions of Gemini using the Vertex AI Evaluation Service.
Use case: multi-turn conversation (Chat)
Metric: this eval uses a Pairwise Autorater (LLM Judge) to compare the quality of model responses.
The evaluation dataset is a subset of the Multi-turn Prompts Dataset. Each record in the dataset.jsonl file links to a JSON file with the history of a conversation between the user and the model. This dataset does not include any ground-truth labels.
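For illustration, a single record and the conversation file it points to might look like the example below. The field names chat_uri, role, and content match what the loading code further down expects; the URI and the messages themselves are hypothetical.

{"chat_uri": "gs://gemini_assets/multiturn_chat/chats/example_chat.json"}

# Contents of the referenced JSON file: a list of turns, ending with the user message that each model must answer.
[
  {"role": "user", "content": "Can you suggest a weekend hike near Seattle?"},
  {"role": "model", "content": "Sure! How far are you willing to drive, and how long a hike would you like?"},
  {"role": "user", "content": "Under two hours of driving and about five miles round trip."}
]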
Step 1 of 4: Configure all necessary parameters
%%writefile .env
PROJECT_ID=your-project-id # Google Cloud Project ID
LOCATION=us-central1 # Region for all required Google Cloud services
EXPERIMENT_NAME=eval-multiturn-chat # Creates Vertex AI Experiment to track the eval runs
MODEL_BASELINE=gemini-1.5-flash-001 # Name of your current model
MODEL_CANDIDATE=gemini-2.0-flash-001 # This model will be compared to the baseline model
DATASET_URI="gs://gemini_assets/multiturn_chat/dataset.jsonl" # Evaluation dataset in Google Cloud Storage
PROMPT_TEMPLATE_URI=gs://gemini_assets/multiturn_chat/prompt_template.txt # Text file in Google Cloud Storage
Step 2 of 4: Install all required Python libraries if not already installed.
try: # Skip installation and kernel restart if this cell has been executed.
    import dotenv
except ImportError:
    %pip install --upgrade --user --quiet python-dotenv google-genai google-cloud-aiplatform[evaluation]
    import IPython
    # The error "session crashed" is expected. Please ignore it and proceed to the next cell.
    IPython.Application.instance().kernel.do_shutdown(True)
Step 3 of 4: Authenticate to Google Cloud (requires permission to open a popup window)
import os
import sys
import vertexai
from dotenv import load_dotenv
from google.cloud import storage
load_dotenv(override=True)
if os.getenv("PROJECT_ID") == "your-project-id":
    raise ValueError("Please configure your Google Cloud Project ID in the first cell.")
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()
vertexai.init(project=os.getenv('PROJECT_ID'), location=os.getenv('LOCATION'))
Step 4 of 4: Run the eval on both models and compare the Accuracy scores
import json
import pandas as pd
from datetime import datetime
from google import genai
from google.genai.types import Content, Part
from IPython.display import clear_output
from vertexai.evaluation import EvalTask, EvalResult, MetricPromptTemplateExamples
from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory
def load_file(gcs_uri: str) -> str:
    blob = storage.Blob.from_string(gcs_uri, storage.Client())
    return blob.download_as_string().decode('utf-8')

def load_dataset(dataset_uri: str):
    jsonl = load_file(dataset_uri)
    samples = [json.loads(line) for line in jsonl.splitlines() if line.strip()]
    df = pd.DataFrame(samples)
    df['history'] = df['chat_uri'].apply(lambda document_uri: load_file(document_uri))
    return df[['history']]
def generate_chat_responses(project_id: str, location: str, model: str, dataset: pd.DataFrame, response_column_name: str) -> None:
    '''Generate the final model response for each conversation in the dataset using the specified model.'''
    client = genai.Client(vertexai=True, project=project_id, location=location)
    responses = []
    user_prompts = []
    for i, record in dataset.iterrows():
        print(f'Generating chat completion #{i+1} with {model}')
        messages = json.loads(record.get('history'))
        last_user_message = messages.pop()
        history = [
            Content(
                role=message['role'],
                parts=[Part(text=message['content'])],
            )
            for message in messages
        ]
        chat = client.chats.create(model=model, history=history)
        response = chat.send_message(message=[Part(text=last_user_message['content'])])
        user_prompts.append(last_user_message)
        responses.append(response.candidates[0].content.parts[0].text)
    dataset['prompt'] = user_prompts  # The last user message is required by the Autorater
    dataset[response_column_name] = responses
    print(f'{len(responses)} responses from model {model} are stored in dataset column "{response_column_name}"')
def run_eval(project_id: str, location: str, experiment_name: str, baseline_model: str, candidate_model: str, dataset_uri: str):
    vertexai.init(project=project_id, location=location)
    timestamp = f"{datetime.now().strftime('%b-%d-%H-%M-%S')}".lower()
    dataset = load_dataset(dataset_uri)
    generate_chat_responses(project_id, location, baseline_model, dataset, 'baseline_model_response')
    generate_chat_responses(project_id, location, candidate_model, dataset, 'response')
    task = EvalTask(
        dataset=dataset,
        metrics=[MetricPromptTemplateExamples.Pairwise.MULTI_TURN_CHAT_QUALITY],
        experiment=experiment_name
    )
    eval_results = task.evaluate(
        experiment_run_name=f"{timestamp}-{baseline_model.replace('.', '-')}"
    )
    clear_output()
    print(f"Baseline model win rate: {eval_results.summary_metrics['pairwise_multi_turn_chat_quality/baseline_model_win_rate']:.2f}")
    print(f"Candidate model win rate: {eval_results.summary_metrics['pairwise_multi_turn_chat_quality/candidate_model_win_rate']:.2f}")
    return eval_results  # Returned so the detailed results table can be inspected after the run
eval_results = run_eval(
    project_id=os.getenv('PROJECT_ID'),
    location=os.getenv('LOCATION'),
    experiment_name=os.getenv('EXPERIMENT_NAME'),
    baseline_model=os.getenv('MODEL_BASELINE'),
    candidate_model=os.getenv('MODEL_CANDIDATE'),
    dataset_uri=os.getenv('DATASET_URI')
)
You can access all prompts and model responses in eval_results.metrics_table.
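For example, a minimal sketch for inspecting the per-conversation results; the two verdict column names are an assumption based on the "metric_name/field" pattern used by summary_metrics above, not verified output:

# Inspect the per-conversation results: metrics_table is a pandas DataFrame with
# one row per conversation, containing the prompt, both model responses, and the
# autorater's verdict and explanation.
pd.set_option('display.max_colwidth', 200)
eval_results.metrics_table[[
    'prompt',
    'baseline_model_response',
    'response',
    'pairwise_multi_turn_chat_quality/pairwise_choice',   # assumed column name
    'pairwise_multi_turn_chat_quality/explanation',       # assumed column name
]].head()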
Please use our documentation to learn about all available metrics and customization options.
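As a starting point, a minimal sketch that lists the other prebuilt autorater templates bundled with the SDK (this assumes the list_example_metric_names helper available in recent versions of vertexai.evaluation):

# Print the names of all prebuilt metric prompt templates, e.g. to find other
# pairwise or pointwise metrics to plug into EvalTask.
print(MetricPromptTemplateExamples.list_example_metric_names())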