# Copyright 2025 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Summarization Eval Recipe
This Eval Recipe demonstrates how to compare the performance of two models on a summarization task using the Vertex AI Evaluation Service.
Use case: summarize a news article.
Metric: this eval uses an Autorater (LLM Judge) to rate Summarization Quality.
Evaluation Dataset is based on XSum. It includes 5 news articles stored as plain text files, and a JSONL file with ground truth labels: dataset.jsonl. Each record in this file includes 2 attributes (a quick way to inspect a record is sketched after this list):
- document: relative path to the plain text file containing the news article
- reference: ground truth label (short summary of the article)
Prompt Template is a zero-shot prompt located in prompt_template.txt with the variable document that gets populated from the corresponding dataset attribute.
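If you want to see the exact record layout and the prompt text for yourself, the short sketch below downloads the first dataset record and the prompt template. It is runnable only after completing Steps 2 and 3 below and requires read access to the bucket; the read_gcs_text helper exists only in this snippet.
import json
from google.cloud import storage

def read_gcs_text(uri: str) -> str:
    # Download a text object from Google Cloud Storage as a string.
    return storage.Blob.from_string(uri, storage.Client()).download_as_text()

# Print the first record of the evaluation dataset and the prompt template.
first_record = read_gcs_text("gs://gemini_assets/summarization/dataset.jsonl").splitlines()[0]
print(json.dumps(json.loads(first_record), indent=2))
print(read_gcs_text("gs://gemini_assets/summarization/prompt_template.txt"))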
Step 1 of 4: Configure eval settings
%%writefile .env
PROJECT_ID=your-project-id # Google Cloud Project ID
LOCATION=us-central1 # Region for all required Google Cloud services
EXPERIMENT_NAME=eval-summarization # Creates Vertex AI Experiment to track the eval runs
MODEL_BASELINE=gemini-1.0-pro-002 # Name of your current model
MODEL_CANDIDATE=gemini-2.0-flash-001 # This model will be compared to the baseline model
DATASET_URI="gs://gemini_assets/summarization/dataset.jsonl" # Evaluation dataset in Google Cloud Storage
PROMPT_TEMPLATE_URI=gs://gemini_assets/summarization/prompt_template.txt # Text file in Google Cloud Storage
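Optionally, you can confirm that the settings were written as intended before moving on; this minimal check simply prints the file created by the %%writefile magic above.
# Optional sanity check: show the contents of the .env file written above.
print(open(".env").read())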
Step 2 of 4: Install Python libraries
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation] plotly python-dotenv
# The error "session crashed" is expected. Please ignore it and proceed to the next cell.
import IPython
IPython.Application.instance().kernel.do_shutdown(True)
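Once the kernel has restarted, you can optionally confirm that the SDK installed above is importable (the version you see will vary).
# Optional: verify the Vertex AI SDK installed in the previous cell.
import google.cloud.aiplatform as aiplatform
print(aiplatform.__version__)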
Step 3 of 4: Authenticate to Google Cloud (requires permission to open a popup window)
import os
import sys
import vertexai
from dotenv import load_dotenv
from google.cloud import storage
load_dotenv(override=True)
if os.getenv("PROJECT_ID") == "your-project-id":
    raise ValueError("Please configure your Google Cloud Project ID in the first cell.")
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()
vertexai.init(project=os.getenv('PROJECT_ID'), location=os.getenv('LOCATION'))
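If you are running this notebook outside Colab, set up Application Default Credentials first (for example with gcloud auth application-default login in a terminal). The optional sketch below confirms that credentials resolve; google.auth is installed as a dependency of the Vertex AI SDK.
# Optional: confirm that Application Default Credentials are available.
import google.auth
credentials, detected_project = google.auth.default()
print(f"Authenticated. Default project: {detected_project}")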
Step 4 of 4: Run the eval on both models and compare the Summarization Quality scores
import json
import pandas as pd
from datetime import datetime
from IPython.display import clear_output
from vertexai.evaluation import EvalTask, EvalResult, MetricPromptTemplateExamples
from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory
def load_file(gcs_uri: str) -> str:
    """Download a text file from Google Cloud Storage."""
    blob = storage.Blob.from_string(gcs_uri, storage.Client())
    return blob.download_as_string().decode('utf-8')

def load_dataset(dataset_uri: str):
    """Load the JSONL dataset and replace each document URI with the document text."""
    jsonl = load_file(dataset_uri)
    samples = [json.loads(line) for line in jsonl.splitlines() if line.strip()]
    df = pd.DataFrame(samples)
    df['document_text'] = df['document_uri'].apply(lambda document_uri: load_file(document_uri))
    return df[['document_text', 'reference']]

def run_eval(model: str) -> EvalResult:
    """Run the Summarization Quality eval on the given model and log it as an Experiment run."""
    timestamp = f"{datetime.now().strftime('%b-%d-%H-%M-%S')}".lower()
    return EvalTask(
        dataset=load_dataset(os.getenv("DATASET_URI")),
        metrics=[MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY],
        experiment=os.getenv('EXPERIMENT_NAME')
    ).evaluate(
        model=GenerativeModel(model),
        prompt_template=load_file(os.getenv("PROMPT_TEMPLATE_URI")),
        experiment_run_name=f"{timestamp}-{model.replace('.', '-')}"
    )
baseline_results = run_eval(os.getenv("MODEL_BASELINE"))
candidate_results = run_eval(os.getenv("MODEL_CANDIDATE"))
clear_output()
print(f"Baseline model score: {baseline_results.summary_metrics['summarization_quality/mean']:.2f}")
print(f"Candidate model score: {candidate_results.summary_metrics['summarization_quality/mean']:.2f}")
You can access all prompts, model responses, and per-example scores in candidate_results.metrics_table.
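For example, the following displays the per-document rows (the exact column set depends on the SDK version):
# Inspect per-document prompts, responses, scores, and autorater explanations.
pd.set_option("display.max_colwidth", 120)
candidate_results.metrics_table.head()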
Dataset (XSum) citation: Shashi Narayan, Shay B. Cohen, and Mirella Lapata. "Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization." Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018.
Please refer to the Vertex AI Gen AI evaluation service documentation to learn about all available metrics and customization options.