# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| Author(s) | Renato Leite (renatoleite@), Egon Soares (egon@) |
| --- | --- |
| Last updated | 09/01/2023 |
LLM Evaluation workflow for a Classification task using a tuned model and Vertex AI SDK
In this notebook, we will walk through the Vertex AI LLM evaluation pipeline for a classification task. The workflow covers the following key stages:
- Data Preparation: Make sure the data is properly formatted before it is fed into the tuning and evaluation pipelines.
- Model Tuning: Tune the foundation model and track the progress of the tuning job with a managed TensorBoard instance.
- Evaluation with the Tuned Model: After tuning, run the evaluation pipeline against the tuned model.
- Metric Analysis: Visualize the resulting metrics in the Vertex AI Model Registry to assess the effectiveness of the tuned model.
Reference Architecture
Install required Python packages
# Install Vertex AI LLM SDK (Private Preview)
! pip install -U google-cloud-aiplatform
! pip install "shapely<2.0.0"
# Install HuggingFace Datasets
! pip install datasets
# OPTIONAL: if you are using Colab, restart the kernel at this point, then uncomment and execute the following code
# from google.colab import auth as google_auth
# google_auth.authenticate_user()
Import Python packages and define project variables
import pandas as pd
import vertexai
from google.cloud import aiplatform
from datasets import load_dataset, DatasetDict
from google.cloud import storage
from tabulate import tabulate
from vertexai.preview.language_models import (
    TextGenerationModel,
    EvaluationTextClassificationSpec,
    TuningEvaluationSpec
)
Replace the values of the variables below with your project-specific values.
# Project variables
PROJECT_ID = "<YOUR PROJECT ID>"
ENDPOINT_LOCATION = "us-central1"
STAGING_BUCKET = "gs://<YOUR BUCKET NAME>" # In the same location as ENDPOINT_LOCATION
TUNING_JOB_LOCATION = "us-central1"
DATA_STAGING_GCS_LOCATION = "gs://<YOUR BUCKET NAME>" # In the same location as TUNING_JOB_LOCATION
storage_client = storage.Client()
vertexai.init(project=PROJECT_ID, location=ENDPOINT_LOCATION, staging_bucket=STAGING_BUCKET)
aiplatform.init(project=PROJECT_ID, location=ENDPOINT_LOCATION, staging_bucket=STAGING_BUCKET)
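Optionally, verify that the staging bucket exists and is reachable with the current credentials before continuing. The sketch below assumes STAGING_BUCKET has the form gs://<bucket-name> with no trailing path.
# Optional sanity check on the staging bucket (assumes STAGING_BUCKET is 'gs://<bucket-name>')
staging_bucket_name = STAGING_BUCKET.removeprefix('gs://').split('/')[0]
assert storage_client.bucket(staging_bucket_name).exists(), f'Bucket {staging_bucket_name} not found'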
Create a Vertex AI TensorBoard instance
The Adapter Tuning pipeline can log training metrics for tracking and retrospective analysis.
Create a Vertex AI TensorBoard instance that will be used by the tuning pipeline runs.
If you want to reuse an existing instance, skip the following cell and set the tensorboard_id
variable to your instance's resource name (or look it up by display name, as shown in the optional sketch below). Note that the instance must be in the same region where the tuning jobs will run.
display_name = 'llm-eval-tensorboard-notebook-2'
tensorboard = aiplatform.Tensorboard.create(
display_name=display_name,
project=PROJECT_ID,
location=TUNING_JOB_LOCATION,
)
print(tensorboard.display_name)
print(tensorboard.resource_name)
# Example: 'projects/244831775715/locations/us-central1/tensorboards/1704616857006243840'
# Replace with your TensorBoard resource name
tensorboard_id = '<YOUR TENSORBOARD RESOURCE NAME>'
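If you created an instance in a previous run, you can also look it up by display name instead of pasting the resource name. This is a minimal sketch that assumes the display_name defined above.
# Optional: reuse an existing TensorBoard instance by looking it up by display name
existing_tensorboards = aiplatform.Tensorboard.list(
    filter=f'display_name="{display_name}"',
    project=PROJECT_ID,
    location=TUNING_JOB_LOCATION,
)
if existing_tensorboards:
    tensorboard_id = existing_tensorboards[0].resource_name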
Prepare training dataset
In this lab, you are going to tune the text-bison foundation model for a single-label text classification task, using the dair-ai/emotion dataset from Hugging Face.
dataset = load_dataset('dair-ai/emotion')
print(dataset)
print(dataset['test'][0:2])
splits = {k: v for (k, v) in zip(
    ['train', 'validation', 'test'],
    load_dataset('dair-ai/emotion', split=['train[0:7200]', 'validation[0:256]', 'test[0:256]'])
)}
dataset = DatasetDict(splits)
dataset
Convert to the format required by the tuning pipeline
Your model tuning dataset must be in JSON Lines (JSONL) format, where each line contains a single tuning example. Each example is composed of an input_text field that contains the prompt to the model and an output_text field that contains an example response that the tuned model is expected to produce. The maximum token length for input_text is 8,192 and the maximum token length for output_text is 1,024. If either field exceeds its maximum token length, the excess tokens are truncated.
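For illustration, each line of the JSONL file is a standalone JSON object with these two fields (the values below are placeholders, not real dataset entries):
{"input_text": "<prompt sent to the model, including any instructions>", "output_text": "<response the tuned model should produce>"}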
The examples included in your dataset should match your expected production traffic. If your dataset contains specific formatting, keywords, instructions, or information, the production data should be formatted in the same way and contain the same instructions.
For example, if the examples in your dataset include a "question:" and a "context:", production traffic should also be formatted to include a "question:" and a "context:" in the same order as they appear in the dataset examples. If you exclude the context, the model will not recognize the pattern, even if the exact question was in an example in the dataset.
For tasks such as classification, it is possible to create a dataset of examples that don't contain instructions. However, excluding instructions from the examples in the dataset leads to worse performance after tuning than including instructions, especially for smaller datasets.
For our dataset, we are going to prepend the following instruction to each example:

Classify the following text into one of the following classes:
[sadness, joy, love, anger, fear, surprise]
Text:
class_labels = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}
class_labels.values()
instructions = f'''Classify the following text into one of the following classes:
[{', '.join(class_labels.values())}]
Text:
'''
def add_instructions(example, instructions):
    example["input_text"] = f'{instructions}{example["text"]}'
    example["output_text"] = class_labels[example["label"]]
    return example
tuning_dataset = dataset.map(lambda x: add_instructions(x, instructions)).remove_columns(['text', 'label'])
print(tuning_dataset)
print(tuning_dataset['train'][:1])
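Optionally, you can do a rough length check before exporting. Word counts are used here only as a crude proxy for tokens; the 8,192 / 1,024 limits mentioned above are measured with the model's tokenizer, not words.
# Rough sanity check on example lengths (word counts, not model tokens)
longest_input = max(len(example['input_text'].split()) for example in tuning_dataset['train'])
longest_output = max(len(example['output_text'].split()) for example in tuning_dataset['train'])
print(f'Longest input: {longest_input} words, longest output: {longest_output} words')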
Export the dataset splits to GCS
gcs_uris = {}
filename_prefix = 'emotion'
for split_name, split_data in tuning_dataset.items():
    jsonl_filename = f'{filename_prefix}-{split_name}.jsonl'
    gcs_uri = f'{DATA_STAGING_GCS_LOCATION}/{jsonl_filename}'
    gcs_uris[split_name] = gcs_uri
    split_data.to_json(jsonl_filename)
    !gsutil cp {jsonl_filename} {gcs_uri}
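If gsutil is not available in your environment, the same upload can be done with the storage_client created earlier. This is a minimal sketch that assumes DATA_STAGING_GCS_LOCATION has the form gs://<bucket-name> with no extra path components, so the blob name is just the file name.
# Optional alternative to gsutil: upload the JSONL files with the Cloud Storage client
data_bucket_name = DATA_STAGING_GCS_LOCATION.removeprefix('gs://').split('/')[0]
data_bucket = storage_client.bucket(data_bucket_name)
for split_name in tuning_dataset:
    jsonl_filename = f'{filename_prefix}-{split_name}.jsonl'
    data_bucket.blob(jsonl_filename).upload_from_filename(jsonl_filename)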
Run a tuning pipeline
The key parameters used to configure a run of the tuning pipeline are as follows:
- model_display_name - a display name of the deployed adapter
- location - a region where the adapter endpoint will be deployed
- dataset_uri - a GCS location of the training split
- evaluation_data_uri - a GCS location of the validation split
- train_steps - the number of steps to train for
- evaluation_interval - training metrics are generated every evaluation_interval steps
- tensorboard_resource_id - the ID of a TensorBoard instance to use for tracking
- large_model_reference - the name of the base foundation model to tune
Other parameters can be configured as well, including parameters controlling the learning rate. In this lab, we use the default values.
model = TextGenerationModel.from_pretrained("text-bison@001")
train_steps = 50
model_display_name = f"emotion-classification-demo-{train_steps}-steps"
tuning_eval_spec = TuningEvaluationSpec(
    evaluation_data=gcs_uris['validation'],
    evaluation_interval=20,
    tensorboard=tensorboard_id
)
model.tune_model(
    training_data=gcs_uris['train'],
    train_steps=train_steps,
    tuning_job_location=TUNING_JOB_LOCATION,
    tuned_model_location=ENDPOINT_LOCATION,
    model_display_name=model_display_name,
    tuning_evaluation_spec=tuning_eval_spec
)
Evaluating the tuned model
test_split_filename = 'emotion-test.jsonl'
test_split = load_dataset('json', data_files={'test': test_split_filename})
evaluation_dataset = test_split.rename_column('input_text', 'prompt').rename_column('output_text', 'ground_truth')
print(evaluation_dataset)
print(evaluation_dataset['test'][0])
model = TextGenerationModel.from_pretrained('text-bison@001')
tuned_model_names = model.list_tuned_model_names()
print(tuned_model_names)
# Replace with one of the tuned model resource names
# Example: tuned_model_name = 'projects/244831775715/locations/us-central1/models/1807691674063732736'
tuned_model_name = '<REPLACE WITH TUNED MODEL RESOURCE NAME>'
tuned_model = TextGenerationModel.get_tuned_model(tuned_model_name)
task_spec_classification = EvaluationTextClassificationSpec(
    ground_truth_data=pd.DataFrame(evaluation_dataset['test']),
    class_names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'],
    target_column_name='ground_truth'
)
metrics = tuned_model.evaluate(task_spec=task_spec_classification)
metrics
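As a convenience, you can print the returned metrics as a two-column table with tabulate (imported earlier). This is a sketch that assumes the returned object either is a dict or exposes its fields as instance attributes; adjust it to the shape your SDK version actually returns.
# Display the evaluation metrics as a simple two-column table
metrics_dict = metrics if isinstance(metrics, dict) else vars(metrics)
print(tabulate(metrics_dict.items(), headers=['metric', 'value']))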