# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Author(s): Renato Leite (renatoleite@), Egon Soares (egon@)
Last updated: 10/23/2023
Workflow for Evaluating LLM Performance in a Text Classification Task using Text-Bison and Vertex AI SDK¶
In this notebook, we will explore various aspects of running the Vertex LLM evaluation pipeline. Our journey will encompass the following key stages:
- Data Preparation: Before we begin the evaluation process, we will ensure that our data is prepared and ready for input into the pipeline.
- Evaluation with Model text-bison@001: We will execute the evaluation phase using the foundation model text-bison@001. This step is crucial for assessing the model's performance and establishing a baseline.
- Metric Retrieval: After completing the evaluation, we will extract the metrics generated as artifacts by the pipeline.
- Metric Visualization: We will present and visualize the collected metrics directly in this notebook.
- TensorBoard Upload and Visualization: We will upload the metrics to TensorBoard, where we can explore them dynamically and interactively.
- Vertex Experiments: In addition to TensorBoard, we will also upload and visualize our metrics in the Vertex AI Experiments environment.
Reference Architecture¶
Install required Python packages¶
# Install Vertex AI LLM SDK (Private Preview)
! pip install -U google-cloud-aiplatform
! pip install "shapely<2.0.0"
# Install HuggingFace Datasets
! pip install datasets
! pip install tensorflow
# OPTIONAL (if you are using Colab, restart the kernel at this point, then uncomment and execute the following code)
# from google.colab import auth as google_auth
# google_auth.authenticate_user()
Import Python packages and define project variables¶
import json
import matplotlib
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import vertexai
import uuid
from google.cloud import aiplatform
from datasets import load_dataset
from google.cloud import storage
from sklearn import metrics
from tabulate import tabulate
from vertexai.preview.language_models import (
TextGenerationModel,
EvaluationTextClassificationSpec,
EvaluationTextGenerationSpec,
EvaluationQuestionAnsweringSpec,
EvaluationTextSummarizationSpec,
)
Replace the values of the variables below according to your project specification.
# Project variables
PROJECT_ID = "<YOUR PROJECT ID>"
LOCATION = "us-central1"
STAGING_BUCKET = "gs://<YOUR BUCKET NAME>"
DATA_STAGING_GCS_LOCATION = "gs://<YOUR BUCKET NAME>"
storage_client = storage.Client()
vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=STAGING_BUCKET)
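Optionally, you can sanity-check that the staging bucket exists and is accessible before continuing. This is a minimal check and assumes STAGING_BUCKET has the form gs://<bucket-name>.
# OPTIONAL: verify that the staging bucket exists and is accessible
# (assumes STAGING_BUCKET has the form gs://<bucket-name>; get_bucket raises NotFound if it does not exist)
staging_bucket_name = STAGING_BUCKET.removeprefix("gs://").split("/")[0]
print(storage_client.get_bucket(staging_bucket_name).name)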
Create a Vertex AI TensorBoard instance¶
Create an instance of Vertex AI TensorBoard that will be used to upload the evaluation metrics.
If you want to reuse an existing instance, skip the following cell and set the tensorboard_id
variable to your instance's resource name.
Note that the instance must be in the same region where the evaluation data was written.
display_name = 'llm-eval-tensorboard'
tensorboard = aiplatform.Tensorboard.create(
display_name=display_name,
project=PROJECT_ID,
location=LOCATION
)
print(tensorboard.display_name)
print(tensorboard.resource_name)
# Example: projects/244831775715/locations/us-central1/tensorboards/1667462160080437248
# Replace with your TensorBoard resource name
tensorboard_id = '<YOUR TENSORBOARD RESOURCE NAME>'
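If you want to reuse an existing instance, you can look up its resource name by listing the TensorBoard instances in your project; a minimal sketch:
# OPTIONAL: list existing TensorBoard instances to find a resource name to reuse
for tb in aiplatform.Tensorboard.list(project=PROJECT_ID, location=LOCATION):
    print(tb.display_name, tb.resource_name)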
Prepare the dataset for evaluation¶
In this lab, you are going to evaluate the text-bison foundation model on a single-label text classification task. You are going to use the dair-ai/emotion dataset from HuggingFace.
Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise.
# Load the dataset from HuggingFace
dataset = load_dataset('dair-ai/emotion', split='test[:5%]')
print('Dataset structure:\n', dataset)
print('Sample:\n', dataset[0])
The evaluation dataset used for model evaluation includes prompt and ground truth pairs that align with the task that you want to evaluate. Your dataset must include a minimum of one prompt and ground truth pair, but we recommend at least 10 pairs for meaningful metrics. Generally speaking, the more examples you give, the more meaningful the results.
The dataset can be provided in one of two formats:
- Pandas DataFrame
- JSONL file on Google Cloud Storage
Next we will demonstrate both methods.
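For reference, each record in the evaluation dataset pairs a prompt with its ground truth label. After the preprocessing below, a single JSONL record will look roughly like this (the tweet text is a placeholder, not real data):
# Illustrative record shape only; the actual prompts are built by add_instructions below
# {"prompt": "Classify the text into one of the classes below:\n[sadness, joy, love, anger, fear, surprise]\nText:\n<tweet text>", "ground_truth": "joy"}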
class_labels = {
0: 'sadness',
1: 'joy',
2: 'love',
3: 'anger',
4: 'fear',
5: 'surprise'
}
instructions = f'''Classify the text into one of the classes below:
[{', '.join(class_labels.values())}]
Text:
'''
def add_instructions(example, instructions):
example["prompt"] = f'{instructions}{example["text"]}'
example["ground_truth"] = class_labels[example["label"]]
return example
eval_dataset = dataset.map(lambda x: add_instructions(x, instructions)).remove_columns(['text', 'label'])
print(eval_dataset)
print(eval_dataset[0])
# Export the dataset split to GCS
jsonl_filename = 'emotions-eval.jsonl'
gcs_uri = f'{DATA_STAGING_GCS_LOCATION}/{jsonl_filename}'
eval_dataset.to_json(jsonl_filename)
# Copy file to GCS
!gsutil cp {jsonl_filename} {gcs_uri}
# List GCS bucket to verify the file was copied successfully
!gsutil ls {DATA_STAGING_GCS_LOCATION}/*.jsonl
Run Vertex AI LLM Model Evaluation job¶
As mentioned before, you can start an evaluation job passing a Pandas Dataframe or a path to a JSONL file on GCS. You will explore both possibilities.
Option 1 - Run evaluation with JSONL on GCS¶
model = TextGenerationModel.from_pretrained("text-bison@001")
task_spec_classification = EvaluationTextClassificationSpec(
ground_truth_data=[gcs_uri],
class_names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'],
target_column_name='ground_truth'
)
# Store the results under a dedicated name so the sklearn `metrics` module is not shadowed
eval_metrics = model.evaluate(task_spec=task_spec_classification)
eval_metrics
Option 2 - Run evaluation on a Pandas Dataframe¶
# Use a pandas dataframe to submit your job
task_spec_classification = EvaluationTextClassificationSpec(
ground_truth_data=pd.DataFrame(eval_dataset),
class_names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'],
target_column_name='ground_truth'
)
eval_metrics = model.evaluate(task_spec=task_spec_classification)
eval_metrics
Metrics Visualization¶
# List all succeeded pipeline jobs whose name contains "evaluation-llm-classification-pipeline"
for job in aiplatform.PipelineJob.list(project=PROJECT_ID, filter="pipeline_name:*evaluation-llm-classification-pipeline*"):
    if job.state == 4:  # 4 == PIPELINE_STATE_SUCCEEDED
        print(job.resource_name)
target_field_name='ground_truth'
evaluation_class_labels=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise', 'UNKNOWN']
experiment_name = 'notebook1-experiment-llm-custom'
# Example: 'projects/244831775715/locations/us-central1/pipelineJobs/evaluation-llm-classification-pipeline-20230831205858'
# Copy one of the resource names from the listing above
pipeline_resource_name = '<YOUR PIPELINE JOB RESOURCE NAME>'
aiplatform.init(
project=PROJECT_ID,
location=LOCATION,
staging_bucket=STAGING_BUCKET,
experiment=experiment_name,
experiment_tensorboard=tensorboard_id)
pipeline_job = aiplatform.PipelineJob.get(resource_name=pipeline_resource_name)
Option 1 - Local visualization¶
# Define a function to read the evaluation metrics content from GCS
def get_metrics_blob(job):
    expected_task_name = "model-evaluation-classification"
    task_detail = None
    for detail in job.task_details:
        if detail.task_name == expected_task_name:
            task_detail = detail
    if not task_detail:
        print(f"Not able to find the task {expected_task_name}.")
        return None
    metrics_uri = None
    for k, v in task_detail.outputs.items():
        if k != "evaluation_metrics":
            continue
        for artifact in v.artifacts:
            if artifact.display_name == "evaluation_metrics":
                metrics_uri = artifact.uri[5:]  # strip the "gs://" prefix
    if not metrics_uri:
        print("Not able to find the evaluation metrics artifact.")
        return None
    # Download and parse the metrics JSON from GCS
    bucket_name, blob_name = metrics_uri.split("/", 1)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    with blob.open("r") as f:
        return json.loads(f.read())
overall_metrics = get_metrics_blob(pipeline_job)
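The returned dictionary is the raw classification evaluation output. The helper functions below rely on roughly the following structure (field names inferred from the parsing code; values omitted):
# Approximate shape of overall_metrics, as consumed by the helpers below (values omitted):
# {
#   "slicedMetrics": [
#     {
#       "singleOutputSlicingSpec": {...},   # has a "value" key for per-class slices; absent for the overall slice
#       "metrics": {
#         "classification": {
#           "auPrc": ..., "auRoc": ..., "logLoss": ...,
#           "confusionMatrix": {"rows": [{"dataItemCounts": [...]}, ...]},  # present on the overall slice
#           "confidenceMetrics": [{"confidenceThreshold": ..., "f1Score": ..., ...}, ...]
#         }
#       }
#     }
#   ]
# }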
# Define a function to build a table of classification metrics
def get_classification_metrics(overall_metrics):
    classification_metrics = overall_metrics['slicedMetrics']
    metric_names = ["Metric Slice", "auPrc", "auRoc", "logLoss"]
    f1_metrics = ["f1Score"]
    aggregated_f1_metrics = ["f1ScoreMicro", "f1ScoreMacro"]
    table = [metric_names + f1_metrics + aggregated_f1_metrics]
    for slice_metrics in classification_metrics:
        classification_metric = slice_metrics['metrics']['classification']
        # Per-class slices carry a 'value' key; the overall slice does not
        slice_name = "class - " + slice_metrics['singleOutputSlicingSpec']['value'] if 'value' in slice_metrics['singleOutputSlicingSpec'] else "Overall"
        slice_metric_values = [slice_name]
        slice_metric_values.extend([classification_metric.get(metric_name, 0) for metric_name in metric_names[1:]])
        # F1 scores are nested under confidenceMetrics; take the first entry
        slice_metric_values.extend([classification_metric['confidenceMetrics'][0].get(metric_name, 0) for metric_name in f1_metrics])
        slice_metric_values.extend([classification_metric['confidenceMetrics'][0].get(metric_name, 'n/a') for metric_name in aggregated_f1_metrics])
        table.append(slice_metric_values)
    return table
classification_metrics = get_classification_metrics(overall_metrics)
print(tabulate(classification_metrics, headers='firstrow', tablefmt='fancy_grid'))
# Define a function to extract the confusion matrix from the evaluation metrics
%matplotlib inline
def get_confusion_matrix(overall_metrics):
    confusion_matrix = []
    for slice_metric in overall_metrics['slicedMetrics']:
        # Only the overall slice (no 'value' key) carries a confusion matrix
        if 'value' in slice_metric['singleOutputSlicingSpec']:
            continue
        if 'confusionMatrix' not in slice_metric['metrics']['classification']:
            print("No confusion matrix found")
            print(f"Evaluation metrics is: {slice_metric}")
            return None
        for row in slice_metric['metrics']['classification']['confusionMatrix']['rows']:
            confusion_matrix.append(row['dataItemCounts'])
    # The matrix is plotted below with sklearn's ConfusionMatrixDisplay
    return confusion_matrix
confusion_matrix = get_confusion_matrix(overall_metrics)
confusion_matrix_plot = numpy.array(confusion_matrix)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_plot, display_labels=evaluation_class_labels)
fig, ax = plt.subplots(figsize=(8,8))
cm_display.plot(ax=ax)
plt.show()
# Define a function to extract metrics at (approximately) a given confidence threshold
def get_confidence_metrics(overall_metrics, expected_confidence_threshold):
    all_metrics = overall_metrics['slicedMetrics']
    confidence_metric_names = ["Metric Slice", "recall", "precision", "falsePositiveRate", "f1Score", "truePositiveCount", "falsePositiveCount"]
    table = [confidence_metric_names]
    for slice_metrics in all_metrics:
        classification_metric = slice_metrics['metrics']['classification']
        slice_name = "class - " + slice_metrics['singleOutputSlicingSpec']['value'] if 'value' in slice_metrics['singleOutputSlicingSpec'] else "Overall"
        slice_metric_values = [slice_name]
        # Pick the entry whose confidenceThreshold is closest to the requested threshold
        confidence_metrics = None
        found_threshold_distance = 1
        for candidate in classification_metric['confidenceMetrics']:
            confidence_threshold = candidate.get('confidenceThreshold', 0)
            if abs(expected_confidence_threshold - confidence_threshold) <= found_threshold_distance:
                confidence_metrics = candidate
                found_threshold_distance = abs(expected_confidence_threshold - confidence_threshold)
        slice_metric_values.extend([confidence_metrics.get(metric_name, 0) for metric_name in confidence_metric_names[1:]])
        table.append(slice_metric_values)
    return table
confidence_metrics = get_confidence_metrics(overall_metrics=overall_metrics, expected_confidence_threshold=0.9)
print(tabulate(confidence_metrics, headers='firstrow', tablefmt='fancy_grid'))
Option 2 - Start ExperimentRun and log metrics¶
run_name = "run-{}".format(uuid.uuid4())
with aiplatform.start_run(run=run_name) as my_run:
    # Row 1 of the table is the first slice (typically "Overall"); columns follow the header order:
    # [Metric Slice, auPrc, auRoc, logLoss, f1Score, f1ScoreMicro, f1ScoreMacro]
    run_metrics = {}
    run_metrics['auPrc'] = classification_metrics[1][1]
    run_metrics['auRoc'] = classification_metrics[1][2]
    run_metrics['logLoss'] = classification_metrics[1][3]
    run_metrics['f1Score'] = classification_metrics[1][4]
    run_metrics['f1ScoreMicro'] = classification_metrics[1][5]
    run_metrics['f1ScoreMacro'] = classification_metrics[1][6]
    my_run.log_metrics(run_metrics)
    aiplatform.log(pipeline_job=pipeline_job)
    aiplatform.log_classification_metrics(
        labels=evaluation_class_labels,
        matrix=confusion_matrix,
        display_name='confusion_matrix'
    )
Option 3 - Log metrics to Tensorboard¶
from datetime import datetime
import tensorflow as tf
logdir = "tf_logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
with file_writer.as_default(step=0):
    # Same column order as the table above: [Metric Slice, auPrc, auRoc, logLoss, f1Score, f1ScoreMicro, f1ScoreMacro]
    tf.summary.scalar(name='auPrc', data=classification_metrics[1][1])
    tf.summary.scalar(name='auRoc', data=classification_metrics[1][2])
    tf.summary.scalar(name='logLoss', data=classification_metrics[1][3])
    tf.summary.scalar(name='f1Score', data=classification_metrics[1][4])
    tf.summary.scalar(name='f1ScoreMicro', data=classification_metrics[1][5])
    tf.summary.scalar(name='f1ScoreMacro', data=classification_metrics[1][6])
aiplatform.upload_tb_log(
tensorboard_id=tensorboard_id,
tensorboard_experiment_name=experiment_name,
logdir=logdir
)
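Because the scalars are first written to a local log directory, you can also preview them with the TensorBoard notebook extension before (or after) uploading them to Vertex AI; uncomment to use:
# OPTIONAL: preview the local logs with the TensorBoard notebook extension
# %load_ext tensorboard
# %tensorboard --logdir tf_logs/scalars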