Instruction following eval
In [ ]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Instruction Following Eval Recipe
This Eval Recipe demonstrates how to compare the performance of two models on an Instruction Following dataset using the Vertex AI Evaluation Service.
Use case: Instruction Following
Metric: This eval uses a Pairwise Instruction Following template to evaluate the responses and pick the winning model.
Evaluation Dataset is based on the Instruction Following Evaluation Dataset. It includes 10 randomly sampled prompts in a JSONL file dataset.jsonl with the following structure:
prompt: the task, with the specific instructions to follow
Prompt Template is a zero-shot prompt located in prompt_template.txt with one prompt variable (prompt) that is automatically populated from our dataset.
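For illustration, a single record in dataset.jsonl could look like the following line, and the prompt template can be as simple as forwarding that text to the model. Both examples are hypothetical sketches of the expected structure, not the actual contents of the files in the gemini_assets bucket:

{"prompt": "Write a cover letter for a data analyst role in exactly three paragraphs and do not use the word 'experience'."}

Example prompt_template.txt content: {prompt}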
Configure Eval Settings
In [ ]:
%%writefile .env
PROJECT_ID=your-project-id # Google Cloud Project ID
LOCATION=us-central1 # Region for all required Google Cloud services
EXPERIMENT_NAME=instructionfollowing-eval-recipe-demo # Creates Vertex AI Experiment to track the eval runs
MODEL_BASELINE=gemini-1.5-flash # Name of your current model
MODEL_CANDIDATE=gemini-2.0-flash # This model will be compared to the baseline model
DATASET_URI="gs://gemini_assets/instruction_following/dataset.jsonl" # Evaluation dataset in Google Cloud Storage
PROMPT_TEMPLATE_URI="gs://gemini_assets/instruction_following/prompt_template.txt" # Text file in Google Cloud Storage
METRIC_NAME=pairwise_instruction_following # Name of the pairwise autorater metric
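If you want to confirm the settings before moving on, python-dotenv can parse the file back into a plain dictionary. This is an optional sanity check, not part of the original recipe:

from dotenv import dotenv_values

config = dotenv_values(".env")  # Reads the .env file without modifying os.environ
print(config["PROJECT_ID"], config["MODEL_BASELINE"], config["MODEL_CANDIDATE"])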
Install Python Libraries
In [ ]:
%pip install --upgrade --quiet google-cloud-aiplatform[evaluation] python-dotenv
# The error "session crashed" is expected. Please ignore it and proceed to the next cell.
import IPython
IPython.Application.instance().kernel.do_shutdown(True)
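Once the kernel has restarted, you can optionally confirm that the Vertex AI SDK installed correctly before running the imports below; this check is an addition and not part of the original recipe:

from google.cloud import aiplatform

print(aiplatform.__version__)  # Prints the installed google-cloud-aiplatform version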
In [ ]:
import os
import json
import pandas as pd
import sys
import vertexai
from dotenv import load_dotenv
from google.cloud import storage
from datetime import datetime
from IPython.display import clear_output
from vertexai.evaluation import EvalTask, EvalResult, PairwiseMetric, MetricPromptTemplateExamples
from vertexai.generative_models import GenerativeModel
Authenticate to Google Cloud (requires permission to open a popup window)
In [ ]:
load_dotenv(override=True)
if os.getenv("PROJECT_ID") == "your-project-id":
    raise ValueError("Please configure your Google Cloud Project ID in the first cell.")
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()
vertexai.init(project=os.getenv("PROJECT_ID"), location=os.getenv("LOCATION"))
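Outside of Colab the popup-based flow is not available; the SDK falls back to Application Default Credentials (for example, set up via the gcloud CLI). The snippet below is an optional sanity check that credentials can be resolved; it relies on the google-auth package that ships with the SDK:

import google.auth

credentials, detected_project = google.auth.default()  # Raises DefaultCredentialsError if no credentials are found
print("Authenticated for project:", detected_project or os.getenv("PROJECT_ID"))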
Run the eval on both models with the Pairwise Autorater
In [ ]:
def load_file(gcs_uri: str) -> str:
    blob = storage.Blob.from_string(gcs_uri, storage.Client())
    return blob.download_as_string().decode('utf-8')

def load_dataset(dataset_uri: str) -> pd.DataFrame:
    jsonl = load_file(dataset_uri)
    samples = [json.loads(line) for line in jsonl.splitlines() if line.strip()]
    return pd.DataFrame(samples)

def load_prompt_template() -> str:
    return load_file(os.getenv("PROMPT_TEMPLATE_URI"))

def run_eval(model: str) -> EvalResult:
    timestamp = f"{datetime.now().strftime('%b-%d-%H-%M-%S')}".lower()
    return EvalTask(
        dataset=os.getenv("DATASET_URI"),
        metrics=[
            PairwiseMetric(
                metric=os.getenv("METRIC_NAME"),
                metric_prompt_template=MetricPromptTemplateExamples.Pairwise.INSTRUCTION_FOLLOWING.metric_prompt_template,
                # Baseline model for pairwise comparison
                baseline_model=GenerativeModel(os.getenv("MODEL_BASELINE")),
            ),
        ],
        experiment=os.getenv("EXPERIMENT_NAME"),
    ).evaluate(
        model=GenerativeModel(model),
        prompt_template=load_prompt_template(),
        experiment_run_name=f"{timestamp}-{model.replace('.', '-')}",
    )

# The pairwise metric compares the candidate against the baseline within a single run.
metrics = run_eval(os.getenv("MODEL_CANDIDATE"))
clear_output()
print("Baseline model win rate:", round(metrics.summary_metrics[f'{os.getenv("METRIC_NAME")}/baseline_model_win_rate'], 3))
print("Candidate model win rate:", round(metrics.summary_metrics[f'{os.getenv("METRIC_NAME")}/candidate_model_win_rate'], 3))