# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Vertex AI: Gemini Evaluations Playbook
Evals Playbook: Experiment, Evaluate & Analyze¶
This notebook shows you how to define experiments, run evaluations to assess model performance, and analyze evaluation results, including side-by-side comparison of results across different experiments and runs. The notebook performs the following steps:
- Define the evaluation task
- Prepare evaluation dataset
- Define an experiment by:
- Configuring the model
- Setting prompt and system instruction
- Establishing evaluation criteria (metrics)
- Run evaluations using Vertex AI Rapid Eval SDK
- Log detailed results and summarize them through aggregated metrics
- Compare evaluation runs side-by-side for a comprehensive analysis
🚧 0. Pre-requisites¶
Make sure that you have prepared the environment by following the steps in 0_gemini_evals_playbook_setup.ipynb. If the 0_gemini_evals_playbook_setup notebook has been run successfully, the following are set up:
- GCP project and APIs to run the eval pipeline
- All the required IAM permissions
- Environment to run the notebooks
- BigQuery datasets and tables to track evaluation results
%load_ext autoreload
%autoreload 2
Read configurations¶
The configuration saved previously in 0_gemini_evals_playbook_setup.ipynb will be used for initializing variables.
import os
import sys
module_path = os.path.abspath(os.path.join(".."))
sys.path.append(module_path)
print(f"module_path: {module_path}")
# Import all the parameters
from utils.config import (LOCATION, PROJECT_ID, STAGING_BUCKET,
STAGING_BUCKET_URI)
from utils.evals_playbook import Evals, generate_uuid
Import libraries¶
import datetime
import itertools
import re
import pandas as pd
import vertexai
from datasets import Dataset, load_dataset
from vertexai.evaluation import (EvalTask, PointwiseMetric,
PointwiseMetricPromptTemplate, constants)
from vertexai.generative_models import (GenerativeModel, HarmBlockThreshold,
HarmCategory, SafetySetting)
Initialize Vertex AI SDK¶
vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=STAGING_BUCKET_URI)
print("Vertex AI SDK initialized.")
print(f"Vertex AI SDK version = {vertexai.__version__}")
# pandas display full column values
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Initialize evals object
evals = Evals()
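- Optionally, verify that the BigQuery tracking tables created by the setup notebook are reachable. This is a minimal sketch; the dataset name gemini_evals_playbook is an assumption inferred from the schema referenced later in this notebook, so substitute your own BQ_DATASET_ID if it differs.
# Optional sanity check: list the evaluation tracking tables.
# NOTE: the dataset name is an assumption; use the BQ_DATASET_ID from the setup notebook if different.
from google.cloud import bigquery
bq_client = bigquery.Client(project=PROJECT_ID)
eval_tables = list(bq_client.list_tables("gemini_evals_playbook"))
print("Tracking tables:", [t.table_id for t in eval_tables])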
🛠️ 1. Define and configure evaluation task and experiment¶
# create and log task
task_id = "task_summarization"
task = evals.Task(
task_id=task_id,
task_desc="summarize pubmed articles",
tags=["pubmed"],
create_datetime=datetime.datetime.now(),
update_datetime=datetime.datetime.now(),
)
evals.log_task(task)
- List all tasks available in the database (tasks are sorted by creation time in descending order)
evals.get_all_tasks()
Define Experiment¶
An experiment in Evals Playbook is defined by configuring
- Dataset
- Model and model configuration
- Prompt
Each experiment has an experiment_id and is associated with a task_id. This section defines the required components.
experiment_id = "Prompt with simple language summary and custom metrics"
# remove any special characters from experiment id
_experiment_id = re.sub("[^0-9a-zA-Z]", "-", experiment_id.lower())
experiment_desc = "Update system instruction to generate a simple summary with bullets"
tags = ["pubmed"]
metadata = {}
Configure Model¶
Define the Gemini model you want to evaluate your task on, including the model name and configuration settings such as temperature and safety settings.
- Add system instructions to give the model additional context to understand the task, provide more customized responses, and adhere to specific guidelines over the full user interaction with the model.
system_instruction = """Instruction: You are a medical researcher writing a plain language Summary of your Article for a layperson.
Translate any medical terms to simple english explanations.
Use first-person 'We'. Use short bullet points addressing following
- Purpose: What was the purpose of the study?
- Research: What did the researchers do?
- Findings: What did they find?
- Implications: What does this mean for me?
"""
- Define generation config and safety settings
generation_config = {
"temperature": 0.1,
}
safety_settings = [
SafetySetting(
category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
threshold=HarmBlockThreshold.BLOCK_NONE,
),
SafetySetting(
category=HarmCategory.HARM_CATEGORY_HATE_SPEECH,
threshold=HarmBlockThreshold.BLOCK_NONE,
),
SafetySetting(
category=HarmCategory.HARM_CATEGORY_HARASSMENT,
threshold=HarmBlockThreshold.BLOCK_NONE,
),
SafetySetting(
category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
threshold=HarmBlockThreshold.BLOCK_NONE,
),
]
model = GenerativeModel(
model_name="gemini-1.5-pro-002",
generation_config=generation_config,
safety_settings=safety_settings,
system_instruction=system_instruction,
# TODO: Add tools and tool_config
)
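- Optionally, run a quick smoke test to confirm that the model, system instruction, and safety settings work together before launching a full evaluation run. The sample article text is only a placeholder; uncommenting this makes a single billable API call.
# Optional smoke test with the configured model; uncomment to run (one billable call).
# response = model.generate_content("Article: Mitochondria produce ATP for the cell. \nSummary:")
# print(response.text)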
Prepare Prompt¶
- Prepare a prompt template for the experiment
prompt_id = "short bulleted list with format"
prompt_description = "instruction with short bullets addressing specific questions"
# Prompt Template
prompt_template = "Article: {context} \nSummary:"
evals.save_prompt_template(task_id, _experiment_id, prompt_id, prompt_template)
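- As a quick check, render the prompt template with a placeholder context to preview the exact prompt text the model will receive.
# Preview the final prompt by filling the template with a dummy context.
print(prompt_template.format(context="<article text goes here>"))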
- Configure prompt id, description for tracking
prompt = evals.Prompt(
prompt_id=prompt_id,
prompt_description=prompt_description,
prompt_type="single-turn", # single-turn, chat,
is_multimodal=False,
system_instruction=system_instruction,
prompt_template=prompt_template,
create_datetime=datetime.datetime.now(),
update_datetime=datetime.datetime.now(),
tags=tags,
)
evals.log_prompt(prompt)
Prepare evaluation dataset¶
This notebook uses a sample of PubMed articles that are hosted on HuggingFace.
- Download sample dataset (10 rows) of PubMed articles for the task.
# get sample dataset from PubMed articles
ds_stream = load_dataset(
"ccdv/pubmed-summarization", "document", split="test", streaming=True
)
num_rows = 10
dataset = Dataset.from_list(list(itertools.islice(ds_stream, num_rows)))
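- Before renaming columns in the next step, confirm what the raw HuggingFace dataset contains (for the ccdv/pubmed-summarization document config this is expected to be article and abstract, but it is worth verifying).
# Inspect the raw column names before renaming them for the evaluator.
print(dataset.column_names)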
- Pre-process and prepare dataset to use with the evaluator.
Prepare the dataset as a Pandas dataframe in the format expected by the Vertex AI Rapid Eval SDK.
Dataset column names:
- reference: The column containing the ground truth in the dataset.
- context: The column containing the article passed as the context.
- instruction: The system instruction configured to pass to the model.
# convert HuggingFace dataset to Pandas dataframe
eval_dataset = dataset.to_pandas()
# rename columns to the Vertex AI Rapid Eval SDK defaults
# (assumes the raw columns are article and abstract, in that order)
eval_dataset.columns = ["context", "reference"]
# add instruction for calculating metrics (not all metrics need instruction)
eval_dataset["instruction"] = system_instruction
# add prompt column
eval_dataset["prompt"] = eval_dataset["context"].apply(
lambda x: prompt_template.format(context=x)
)
# add a dataset row id for tracking
eval_dataset["dataset_row_id"] = [f"dataset_row_{i}" for i in eval_dataset.index]
- Verify a few samples in the prepared evaluation dataset
print(f"Number of rows: {eval_dataset.shape}")
eval_dataset.head(1)
- Optionally, save the dataset in Cloud Storage (or BigQuery) to reuse.
file_name = "pubmed_summary.csv"
gcs_file_path = f"gs://{STAGING_BUCKET}/{task_id}/data/{file_name}"
# Save dataset to Cloud Storage
eval_dataset.to_csv(gcs_file_path, index=False)
print(f"Dataset saved at {gcs_file_path} successfully!")
Configure Metrics¶
In this section, you configure the evaluation criteria for your task. You can choose from the built-in metrics (or metric bundles) from Vertex AI Rapid Eval SDK or define a custom metric.
- Define prebuilt/built-in metrics with Vertex GenAI Evaluation or bring your own metrics.
# Create custom metrics for pointwise evaluation.
# You can define a metric using either a template of criteria and a rating rubric,
# or a free-form prompt. One example of each is demonstrated below.
# Example 1: format adherence metric, to evaluate if the LLM strictly followed the required formatting
criteria = {
"First-person We": "The text is written in first person 'we'",
"Format": "The output is formatted in bullets",
"Completeness": "All four sections, purpose, research, findings and implications are addressed in the output",
}
pointwise_rating_rubric = {
"5": "Perfectly formatted: Text is in first person 'we', formatted in bullets and all four sections purpose, research, findings and implications are addressed in the output",
"4": "Mostly formatted: Content is formatted in bullets and all four sections purpose, research, findings and implications are addressed in the output, but failed to write in first person 'we' ",
"3": "Somewhat formatted: Content is formatted in bullets and but failed to address one of the four sections purpose, research, findings and implications",
"2": "Poorly formatted : Content is may or may not be formatted in bullets and failed to address two out of the four sections purpose, research, findings and implications",
"1": "Very poorly formatted: Content is not formatted in bullets and failed to address two or more out of the four sections purpose, research, findings and implications",
}
# The metric prompt template contains default prompts pre-defined for unspecified components.
format_adherence_metric_prompt_template = PointwiseMetricPromptTemplate(
criteria=criteria,
rating_rubric=pointwise_rating_rubric,
input_variables=["prompt", "reference"],
)
# Display the assembled prompt template that will be sent to Gen AI Eval Service
# along with the input data for model-based evaluation.
# print(format_adherence_metric_prompt_template.prompt_data)
# Register the custom "format_adherence" model-based metric.
format_adherence = PointwiseMetric(
metric="format_adherence",
metric_prompt_template=format_adherence_metric_prompt_template,
)
# Example 2: text quality and relevance to layperson
free_form_pointwise_metric_prompt = """
# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
We will provide you with the user prompt and an AI-generated response.
You should first read the user prompt carefully to analyze the task, and then evaluate the
quality of the response based on the Criteria provided in the Evaluation section below.
You will assign the response a score from 5, 4, 3, 2, 1, following the Rating Rubric and Evaluation Steps.
Give step-by-step explanations for your scoring, and only choose scores from 5, 4, 3, 2, 1.
# Evaluation
## Metric Definition
You will be assessing Text Quality and relevance to layperson, which measures how effectively the text conveys
clear, accurate, and engaging information that is easily understandable by a layperson and directly addresses
the user's prompt, considering factors like fluency, coherence, relevance, conciseness, and freedom from
complex medical language.
## Criteria
Coherence: The response presents ideas in a logical and organized manner, with clear transitions and a consistent focus, making it easy to follow and understand.
Fluency: The text flows smoothly and naturally, adhering to grammatical rules and using appropriate vocabulary.
Relevance to layperson: The response is easily understandable by a layperson as opposed to a medical professional
Groundedness: The response contains information included only in the context. The response does not reference any outside information.
Verbosity: The response is appropriately concise, providing sufficient detail in simple language to thoroughly address the prompt without being overly wordy or excessively brief.
## Rating Rubric
5: (Very good). Exceptionally clear, coherent, fluent, and concise. Free of complex Medical language
4: (Good). Well-written, coherent, and fluent. Easy to understand by a layperson. Minor room for improvement.
3: (Ok). Adequate writing with decent coherence and fluency. May contain some medical jargon and minor ungrounded information. Could be more concise.
2: (Bad). Poorly written, lacking coherence and fluency. Geared towards a medical professional rather than a layperson. May include ungrounded information.
1: (Very bad). Very poorly written, incoherent, and non-fluent. Geared towards a medical professional rather than a layperson. Contains substantial ungrounded information. Severely lacking in conciseness.
## Evaluation Steps
STEP 1: Assess the response in aspects of all criteria provided. Provide assessment according to each criterion.
STEP 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering each individual criterion.
# User Inputs and AI-generated Response
## User Inputs
### Prompt
{prompt}
## AI-generated Response
{reference}
"""
# Register the custom "text_quality_relevance_to_layperson" model-based metric.
text_quality_relevance_to_layperson = PointwiseMetric(
metric="text_quality_relevance_to_layperson",
metric_prompt_template=free_form_pointwise_metric_prompt,
)
Select the built-in metrics to compute for this task (see the sketch after the metric configuration cell for how to list every available metric constant):
# List of built in metrics
metrics = [
constants.Metric.ROUGE_1,
constants.Metric.ROUGE_L_SUM,
constants.Metric.BLEU,
constants.Metric.FLUENCY,
constants.Metric.COHERENCE,
constants.Metric.SAFETY,
constants.Metric.GROUNDEDNESS,
constants.Metric.SUMMARIZATION_QUALITY,
]
# build a metric config object for tracking
# Add built in metrics
metric_config = [
{"metric_name": metric, "type": "prebuilt", "metric_scorer": "Vertex AI"}
for metric in metrics
]
# Add custom metrics
metric_config.extend(
[
{
"metric_name": text_quality_relevance_to_layperson.metric_name,
"type": "custom",
"metric_scorer": "Vertex AI",
},
{
"metric_name": format_adherence.metric_name,
"type": "custom",
"metric_scorer": "Vertex AI",
},
]
)
metrics.extend([text_quality_relevance_to_layperson, format_adherence])
print(metric_config)
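- If you want to see every built-in metric name your installed SDK version exposes (the cells above use only a subset), a simple option is plain Python introspection on the constants.Metric class; the available attribute names can vary across SDK versions.
# List every metric constant exposed by the installed SDK version.
all_builtin_metrics = [name for name in dir(constants.Metric) if not name.startswith("_")]
print(all_builtin_metrics)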
Add Experiment¶
Now that you have defined the model, prompt, dataset, and evaluation criteria (metrics), let's add them to an experiment and start logging.
experiment = evals.log_experiment(
task_id=task_id,
experiment_id=experiment_id,
experiment_desc=experiment_desc,
prompt=prompt,
model=model,
metric_config=metric_config,
tags=tags,
)
- You can view the experiment details
evals.get_experiment(experiment_id=experiment_id)
- You can view the prompt and system instruction if set.
evals.get_prompt(prompt_id=prompt_id)
- List all experiments available
evals.get_all_experiments()
🚀 2. Run experiment(s) for an evaluation task¶
The experiment is now ready to run an evaluation task using the model, prompt, dataset and metrics configured.
- Define a Vertex AI Rapid Eval task. An evaluation task must contain an evaluation dataset and a list of metrics to evaluate.
_experiment_id = re.sub("[^0-9a-zA-Z]", "-", experiment_id.lower())
eval_task = EvalTask(dataset=eval_dataset, metrics=metrics, experiment=_experiment_id)
- Run the evaluation task with a run name, model, and prompt template. This step may take a few minutes depending on the size of the evaluation dataset.
experiment_run_name = generate_uuid(_experiment_id)
eval_result = eval_task.evaluate(
model=model,
prompt_template=prompt_template,
experiment_run_name=experiment_run_name,
)
- After the evaluation task is completed, the Vertex AI Rapid Eval SDK returns the result of the run, including summary metrics and a detailed metrics table with per-instance (that is, per-example) metrics.
summary_metrics = eval_result.summary_metrics
report_df = eval_result.metrics_table
report_df.head(1)
summary_metrics
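- Since summary_metrics is returned as a dictionary of metric names to values, a one-row dataframe can make it easier to scan.
# Optional: view the summary metrics as a single-row dataframe for readability.
pd.DataFrame([summary_metrics])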
- Log the run metrics (both summary and detail) to analyze or compare them in subsequent iterations.
run_path = f"{task_id}/prompts/{_experiment_id}/{experiment_run_name}"
evals.log_eval_run(
experiment_run_id=experiment_run_name,
experiment=experiment,
eval_result=eval_result,
run_path=run_path,
tags=tags,
metadata=metadata,
)
- View all evaluation runs for an experiment
evals.get_eval_runs(experiment_id=experiment_id)
- View all evaluation runs in the system across experiments
evals.get_all_eval_runs()
📊 3. Analyze results¶
This section shows a few ways to analyze and compare results. Since the results are stored in BigQuery tables, there are multiple ways to analyze them:
- Use BigQuery SQL queries (see the sketch after this list)
- Use Pandas dataframe and BigQuery
- Build Looker dashboards
- Use tools such as LLM Comparator from Google's PAIR team
- and more ...
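As an example of the first option, the following minimal sketch queries the run-tracking table directly from the notebook. The dataset and table names (gemini_evals_playbook.eval_runs) are assumptions based on the schema referenced later in this section; adjust them to match your setup.
# Query the eval run tracking table directly with BigQuery.
# Dataset/table names are assumptions; adjust to match your setup notebook.
from google.cloud import bigquery
bq_client = bigquery.Client(project=PROJECT_ID)
query = f"SELECT * FROM `{PROJECT_ID}.gemini_evals_playbook.eval_runs` LIMIT 10"
runs_df = bq_client.query(query).to_dataframe()
runs_df.head()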
Get experiments, runs and run details¶
- Define an Evals object to access helper functions
evals = Evals()
- Get all experiments
evals.get_all_experiments()
- Get a specific experiment using experiment_id
experiment_id = "Prompt with simple language summary"
evals.get_experiment(experiment_id=experiment_id)
evals.get_eval_runs(experiment_id=experiment_id)
Detailed metrics¶
You can get detailed eval results for a given experiment run at the example level. This helps you analyze and identify any loss patterns. To find the run_id for previous runs, see gemini_evals_playbook (schema) >> eval_runs (table) >> run_id (column) in BigQuery.
# Replace with a run_id from the eval_runs table
experiment_run_id = "[your-run_id]"
evals.get_eval_run_detail(experiment_run_id=experiment_run_id)
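- One way to surface loss patterns is to inspect the columns of the detail dataframe and sort by a metric score; the score column name in the commented line below is hypothetical, so check the actual columns returned for your metrics first.
# Inspect the per-example detail and sort by a metric score to surface weak examples.
run_detail_df = evals.get_eval_run_detail(experiment_run_id=experiment_run_id)
print(run_detail_df.columns.tolist())
# run_detail_df.sort_values("format_adherence_score").head()  # hypothetical column name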
Compare eval runs across experiments¶
Compare eval runs at summary level¶
You can compare summary metrics for multiple runs side-by-side, even across different experiments. For example, you can compare eval runs
- For the same prompt at different temperature settings
- For the same model settings but different prompt templates or system instructions
Pass a list of experiment run ids to compare them side-by-side
run_ids = [
"[your-run_id1]",
"[your-run_id2]",
]
# list of run ids - strings
evals.compare_eval_runs(run_ids)
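- Assuming the comparison helper returns a pandas dataframe like the other helpers in this playbook, you can also persist the side-by-side view for sharing.
# Persist the side-by-side comparison (assumes compare_eval_runs returns a dataframe).
comparison_df = evals.compare_eval_runs(run_ids)
comparison_df.to_csv("assets/run_comparison.csv", index=False)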
LLM Comparator for analyzing side-by-side LLM evaluation results¶
To visualize model responses from different runs, we use the LLM Comparator Python library from Google's PAIR team to compare model responses from two runs side-by-side. The tool coordinates the three phases of comparative evaluation (judging, bulletizing, and clustering), and the results can be uploaded to the LLM Comparator app for further viewing and analysis.
- Fetch run details for the two experiment run ids you would like to compare. Use evals.get_all_eval_runs() or evals.get_eval_runs(experiment_id=experiment_id) to get run ids.
# Prepare run details to compare
# @markdown ### Enter experiment run id 1
run_1 = "[your-run_id1]" # @param {type:"string"}
run_1_details = evals.get_eval_run_detail(experiment_run_id=run_1)
run_1_details = run_1_details[
["run_id", "dataset_row_id", "input_prompt_gcs_uri", "output_text"]
]
# @markdown ### Enter experiment run id 2
run_2 = "[your-run_id2]" # @param {type:"string"}
run_2_details = evals.get_eval_run_detail(experiment_run_id=run_2)
run_2_details = run_2_details[
["run_id", "dataset_row_id", "input_prompt_gcs_uri", "output_text"]
]
run1_run2 = pd.merge(
run_1_details,
run_2_details,
how="outer",
on=["dataset_row_id"],
suffixes=("_1", "_2"),
)
run1_run2 = run1_run2.rename(
columns={
"input_prompt_gcs_uri_1": "prompt",
"output_text_1": "response_a",
"output_text_2": "response_b",
}
)
- Prepare pairwise comparison file to visualize using LLM Comparator
from llm_comparator import (comparison, llm_judge_runner, model_helper,
rationale_bullet_generator,
rationale_cluster_generator)
inputs = run1_run2.to_dict(orient="records")
custom_fields_schema = [
{"name": "prompt_id", "type": "string"},
]
# Initialize the models-calling classes.
generator = model_helper.VertexGenerationModelHelper(model_name="gemini-1.5-pro")
embedder = model_helper.VertexEmbeddingModelHelper()
# Initialize the instances that run work on the models.
judge = llm_judge_runner.LLMJudgeRunner(generator)
bulletizer = rationale_bullet_generator.RationaleBulletGenerator(generator)
clusterer = rationale_cluster_generator.RationaleClusterGenerator(generator, embedder)
# Configure and run the comparative evaluation.
comparison_result = comparison.run(
inputs, judge, bulletizer, clusterer, judge_opts={"num_repeats": 2}
)
# Write the results to a JSON file that can be loaded in
# https://pair-code.github.io/llm-comparator
file_path = "assets/run1_run2_compare.json"
comparison.write(comparison_result, file_path)
- You can now upload this file to the LLM Comparator app at https://pair-code.github.io/llm-comparator/ and analyze the results. Refer to the documentation on how to use the tool.
Based on the analysis, you can identify loss patterns and seed ideas for the next experiment, for example changing the prompt template, system instruction, or model configuration. Add a new experiment and run evaluations until you meet the success criteria for the evaluation task.
🧹 Cleaning up¶
Uncomment the following cells to clean up resources created as part of the Evals Playbook.
# # Delete the BigQuery dataset using the bq utility
# # (make sure BQ_DATASET_ID is set, e.g. imported from utils.config, before running)
# ! bq rm -r -f -d {BQ_DATASET_ID}
# # Delete GCS bucket
# ! gcloud storage rm --recursive {STAGING_BUCKET_URI}