# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Vertex AI: Gemini Evaluations Playbook
Evals Playbook: Optimize with grid search of experiments¶
This notebook shows you how to systematically explore different experiment configurations by testing various prompt templates, model settings (such as temperature), or combinations of both, using a grid-search style approach. The notebook performs the following steps:
- Define the evaluation task
- Prepare evaluation dataset
- Define an experiment by:
- Configuring the model
- Setting prompt and system instruction
- Establishing evaluation criteria (metrics)
- Run evaluations using Vertex AI Rapid Eval SDK
- Log detailed results and summarize them through aggregated metrics.
- Compare evaluation runs side by side for a comprehensive analysis.
🚧 0. Pre-requisites¶
Make sure that you have completed the initial setup using 0_gemini_evals_playbook_setup.ipynb. If that notebook has run successfully, the following are set up:
- GCP project and APIs to run the eval pipeline
- All the required IAM permissions
- Environment to run the notebooks
- BigQuery datasets and tables to track evaluation results
%load_ext autoreload
%autoreload 2
Read configurations¶
The configuration saved previously in 0_gemini_evals_playbook_setup.ipynb will be used for initializing variables.
import os
import sys
module_path = os.path.abspath(os.path.join(".."))
sys.path.append(module_path)
print(f"module_path: {module_path}")
# Import all the parameters
from utils.config import (LOCATION, PROJECT_ID, STAGING_BUCKET,
STAGING_BUCKET_URI)
from utils.evals_playbook import Evals, generate_uuid
Import libraries¶
import datetime
import itertools
import re
import pandas as pd
import vertexai
from datasets import Dataset, load_dataset
from vertexai.evaluation import EvalTask, constants
from vertexai.generative_models import (GenerativeModel, HarmBlockThreshold,
HarmCategory, SafetySetting)
Initialize Vertex AI SDK¶
vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=STAGING_BUCKET_URI)
print("Vertex AI SDK initialized.")
print(f"Vertex AI SDK version = {vertexai.__version__}")
# pandas display full column values
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Define eval object
evals = Evals()
🛠️ 1. Configure parameter grid to run experiments¶
Define exploration space as grid¶
Define a dictionary with parameter names (str) as keys, such as prompt template or temperature, and for each key a list of settings to try as values. The grid is the Cartesian product of these lists, so every combination of parameter settings is explored. This is similar to defining a grid search in ML.
param_grid = {
"prompt": [ # Format: (prompt_id, prompt_description, prompt_template)
(
"prompt_template_1",
"Single Sentence",
"Summarize this PubMed article: {context}",
),
("prompt_template_2", "Structured", "Article: {context}. Summary:"),
],
"temperature": [0.0, 0.1, 0.2],
}
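As a quick sanity check, this grid expands to 2 prompt templates × 3 temperatures = 6 experiment configurations. The following is a minimal sketch (using the same sklearn ParameterGrid helper as in section 2) to preview the combinations before running them:
from sklearn.model_selection import ParameterGrid

# Preview the expanded grid: one dict per (prompt, temperature) combination.
for params in ParameterGrid(param_grid):
    prompt_id, prompt_desc, _ = params["prompt"]
    print(f"{prompt_id} ({prompt_desc}) @ temperature={params['temperature']}")
print(f"Total configurations: {len(ParameterGrid(param_grid))}")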
Configure Model¶
Define the Gemini model you want to evaluate your task on, including the model name, generation settings such as temperature, and safety settings.
system_instruction = """Instruction: You are a medical researcher writing a plain-language summary of your article for a layperson.
Translate any medical terms into simple English explanations.
Use first-person 'We'. Use short bullet points addressing the following:
- Purpose: What was the purpose of the study?
- Research: What did the researchers do?
- Findings: What did they find?
- Implications: What does this mean for me?
"""
safety_settings = [
SafetySetting(
category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
threshold=HarmBlockThreshold.BLOCK_NONE,
),
SafetySetting(
category=HarmCategory.HARM_CATEGORY_HATE_SPEECH,
threshold=HarmBlockThreshold.BLOCK_NONE,
),
SafetySetting(
category=HarmCategory.HARM_CATEGORY_HARASSMENT,
threshold=HarmBlockThreshold.BLOCK_NONE,
),
SafetySetting(
category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
threshold=HarmBlockThreshold.BLOCK_NONE,
),
]
model_name = "gemini-1.5-pro-002"
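Optionally, run a quick smoke test before launching the full grid to confirm that the model name, system instruction, and safety settings are wired up correctly. This is a minimal, commented-out sketch; the single test prompt below is only an illustration:
# # Optional smoke test: verify model access and configuration before running the grid
# test_model = GenerativeModel(
#     model_name=model_name,
#     safety_settings=safety_settings,
#     system_instruction=system_instruction,
# )
# test_response = test_model.generate_content("Summarize: Aspirin can reduce fever and mild pain.")
# print(test_response.text)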
Configure Metrics¶
In this section, you configure the evaluation criteria for your task. You can choose from the built-in metrics (or metric bundles) of the Vertex AI Rapid Eval SDK or define a custom metric.
metrics = [
constants.Metric.ROUGE_1,
constants.Metric.ROUGE_L_SUM,
constants.Metric.BLEU,
constants.Metric.FLUENCY,
constants.Metric.COHERENCE,
constants.Metric.SAFETY,
constants.Metric.GROUNDEDNESS,
constants.Metric.SUMMARIZATION_QUALITY,
]
# build a metric config object for tracking
metric_config = [
{"metric_name": metric, "type": "prebuilt", "metric_scorer": "Vertex AI"}
for metric in metrics
]
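If the prebuilt metrics do not cover your criteria, you can also register a model-based custom metric. The snippet below is a minimal, commented-out sketch, assuming the installed SDK version exposes PointwiseMetric in vertexai.evaluation; the metric name and criteria text are illustrative only:
# # Example custom model-based metric (uncomment to use); assumes PointwiseMetric
# # is available in your version of the Vertex AI SDK.
# from vertexai.evaluation import PointwiseMetric
# custom_simplicity = PointwiseMetric(
#     metric="plain_language_simplicity",  # illustrative metric name
#     metric_prompt_template=(
#         "Rate from 1 to 5 how well the following summary avoids medical jargon "
#         "and explains terms in plain language.\nSummary: {response}\nRating:"
#     ),
# )
# metrics.append(custom_simplicity)
# metric_config.append(
#     {"metric_name": "plain_language_simplicity", "type": "custom", "metric_scorer": "Vertex AI"}
# )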
Prepare evaluation dataset¶
# Prompt Template
prompt_template = "Article: {context} \nSummary:"
from google.cloud import storage
# # OPTION 1:
# # Load prepared dataset from GCS
# # Path to your CSV file in GCS
# file_name = "pubmed_summary.csv"
# file_path = f"gs://{STAGING_BUCKET}/{file_name}"
# # Read the CSV file into pandas DataFrame
# eval_dataset = pd.read_csv(file_path)
# OPTION 2:
# Load and prepare public dataset from HuggingFace
ds_stream = load_dataset(
"ccdv/pubmed-summarization", "document", split="test", streaming=True
)
num_rows = 10
dataset = Dataset.from_list(list(itertools.islice(ds_stream, num_rows)))
# convert HuggingFace dataset to Pandas dataframe
eval_dataset = dataset.to_pandas()
# rename columns as per Vertex AI Rapid Eval SDK defaults
eval_dataset.columns = ["context", "reference"]
# add instruction for calculating metrics (not all metrics need instruction)
eval_dataset["instruction"] = system_instruction
# add prompt column
eval_dataset["prompt"] = eval_dataset["context"].apply(
lambda x: prompt_template.format(context=x)
)
# add prompt id for tracking
eval_dataset["dataset_row_id"] = [f"dataset_row_{i}" for i in eval_dataset.index]
Define Evaluation task¶
# create and log task
task_id = "task_summarization"
task = evals.Task(
task_id=task_id,
task_desc="summarize pubmed articles",
create_datetime=datetime.datetime.now(),
update_datetime=datetime.datetime.now(),
tags=["pubmed"],
)
evals.log_task(task)
evals.get_all_tasks()
⏳ 2. Run experiments on the grid¶
# Note that this cell can take time to finish!
from sklearn.model_selection import ParameterGrid
grid = ParameterGrid(param_grid)
experiment_run_ids = []
# print(list(grid))
for indx, params in enumerate(grid):
prompt_id, prompt_description, prompt_template = params["prompt"]
temperature = params["temperature"]
# Print above parameters, one in each line
# print(f'prompt_id: {prompt_id}\nprompt_description: {prompt_description}\nprompt_template: {prompt_template}\ntemperature: {temperature}\n')
# Track status
print("Running ........")
print(f"{indx+1}. {params}")
# Set up the experiment
experiment_id = f"prompt-{prompt_id}-{temperature}"
experiment_desc = f"Simple language summary with prompt {prompt_id} and temperature {temperature} "
tags = ["pubmed"]
metadata = {}
# print(experiment_id, experiment_desc)
generation_config = {"temperature": temperature}
model = GenerativeModel(
model_name=model_name,
generation_config=generation_config,
safety_settings=safety_settings,
system_instruction=system_instruction,
# TODO: Add tools and tool_config
)
# Configure and log prompt
prompt = evals.Prompt(
prompt_id=prompt_id,
prompt_description=prompt_description,
prompt_type="single-turn",  # options: single-turn, chat
is_multimodal=False,
system_instruction=system_instruction,
prompt_template=prompt_template,
create_datetime=datetime.datetime.now(),
update_datetime=datetime.datetime.now(),
tags=tags,
)
evals.log_prompt(prompt)
# Configure and log experiment
experiment = evals.log_experiment(
task_id=task_id,
experiment_id=experiment_id,
experiment_desc=experiment_desc,
prompt=prompt,
model=model,
metric_config=metric_config,
tags=tags,
)
# Run Experiment
_experiment_id = re.sub("[^0-9a-zA-Z]", "-", experiment_id.lower())
eval_task = EvalTask(
dataset=eval_dataset, metrics=metrics, experiment=_experiment_id
)
experiment_run_name = generate_uuid(_experiment_id)
experiment_run_ids.append(experiment_run_name)
eval_result = eval_task.evaluate(
model=model,
prompt_template=prompt_template,
experiment_run_name=experiment_run_name,
)
run_path = f"{task_id}/prompts/{_experiment_id}/{experiment_run_name}"
evals.log_eval_run(
experiment_run_id=experiment_run_name,
experiment=experiment,
eval_result=eval_result,
run_path=run_path,
tags=tags,
metadata=metadata,
)
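Each call to eval_task.evaluate() returns an EvalResult object; after the loop, eval_result holds the result of the last run. A minimal sketch for inspecting it directly, assuming your SDK version exposes summary_metrics and metrics_table on the result (the playbook also persists every run via evals.log_eval_run above):
# Inspect the most recent EvalResult directly
print(eval_result.summary_metrics)  # aggregated metrics for the last run
eval_result.metrics_table.head(2)  # per-row metric scores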
- Fetch run details: use evals.get_all_eval_runs() or evals.get_eval_runs(experiment_id=experiment_id) to get run IDs.
evals.get_all_eval_runs()
# To find the run_id for previous runs, see
# gemini_evals_playbook (schema) >> eval_runs (table) >> run_id (column) in BigQuery
evals.get_eval_run_detail(
experiment_run_id="[your_run_id]"
)
🔍 3. Grid search¶
Search the grid for the optimal configuration with respect to the metrics of your choice.
# Set the task_id to perform the search
task_id = "task_summarization"
# Metrics to be used for grid search
opt_metrics = [
"ROUGE_1",
"BLEU",
] # Options: "ROUGE_1", "ROUGE_L_SUM", "BLEU", "FLUENCY", "COHERENCE", "SAFETY", "GROUNDEDNESS", "SUMMARIZATION_QUALITY", "SUMMARIZATION_VERBOSITY", "SUMMARIZATION_HELPFULNESS"
# Parameters to be retrieved from grid search
opt_params = [
"prompt_template",
"temperature",
] # Options: "experiment_desc", "prompt_template", "temperature", "system_instruction", "model_name"
# Use run_ids collected during grid search: experiment_run_ids
# Comparison of runs in the experiment grid
evals.compare_eval_runs(experiment_run_ids)
# Outcome of grid search
evals.grid_search(
task_id=task_id,
experiment_run_ids=experiment_run_ids,
opt_metrics=opt_metrics,
opt_params=opt_params,
)
🧹 Cleaning up¶
Uncomment the following cells to clean up resources created as part of the Evals Playbook.
# # Delete BigQuery Dataset using bq utility
# # (ensure BQ_DATASET_ID is defined in this notebook before running)
# ! bq rm -r -f -d {BQ_DATASET_ID}
# # Delete GCS bucket
# ! gcloud storage rm --recursive {STAGING_BUCKET_URI}