# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| Author(s)    | Renato Leite (renatoleite@), Egon Soares (egon@) |
| ------------ | ------------------------------------------------ |
| Last updated | 09/01/2023 |
LLM Evaluation Workflow for a Classification Task using Text-Bison and Vertex AI Pipelines¶
In this notebook, we will explore various aspects of running the Vertex AI LLM evaluation pipeline. Our journey encompasses the following key stages:
- Data Preparation: Before we dive into the evaluation process, we'll make sure our data is prepped and ready to be fed into the pipeline.
- Evaluation with Model text-bison@001: We will execute the evaluation phase against the foundation model text-bison@001, initiating the evaluation job from the open-source pipeline definition.
- Metric Retrieval and Visualization: Once the evaluation has run, we'll extract the metrics generated as artifacts by the pipeline. These metrics are uploaded to a Vertex AI Experiments run and can also be visualized inside the pipeline run.
Reference Architecture¶
Install required python packages¶
# Install the Vertex AI SDK and Google Cloud Pipeline Components
! pip install -U google-cloud-aiplatform
! pip install -U google-cloud-pipeline-components
! pip install "shapely<2.0.0"
# Install HuggingFace Datasets
! pip install datasets
# OPTIONAL: if you are using Colab, restart the kernel at this point, then uncomment and execute the following code
# from google.colab import auth as google_auth
# google_auth.authenticate_user()
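If you are running this notebook outside Colab (for example, in a local Jupyter environment), a common alternative is to rely on application default credentials. The gcloud commands below are standard and are usually run once from a terminal; they are shown commented out here.
# OPTIONAL (non-Colab environments): authenticate with application default credentials
# ! gcloud auth application-default login
# ! gcloud config set project <YOUR PROJECT ID>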
Import python packages and define project variables¶
import vertexai
import uuid
from datasets import load_dataset
from google.cloud import aiplatform
from google.cloud import storage
from google_cloud_pipeline_components.preview.model_evaluation import evaluation_llm_classification_pipeline
from kfp import compiler
from kfp import dsl
from vertexai.preview.language_models import TextGenerationModel
Replace the values of the variables below with the values for your own project.
# Project variables
PROJECT_ID = "<YOUR PROJECT ID>"
ENDPOINT_LOCATION = "us-central1"
STAGING_BUCKET = "gs://<YOUR BUCKET NAME>" # Same location as your ENDPOINT_LOCATION
storage_client = storage.Client()
vertexai.init(project=PROJECT_ID, location=ENDPOINT_LOCATION, staging_bucket=STAGING_BUCKET)
aiplatform.init(project=PROJECT_ID, location=ENDPOINT_LOCATION, staging_bucket=STAGING_BUCKET)
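As a quick sanity check before running anything, the sketch below uses the storage_client created above to verify that the staging bucket exists and is located in the same region as ENDPOINT_LOCATION.
# Optional sanity check: the staging bucket must live in the same region as ENDPOINT_LOCATION
bucket = storage_client.get_bucket(STAGING_BUCKET.replace('gs://', ''))
print('Staging bucket location:', bucket.location)
assert bucket.location.lower() == ENDPOINT_LOCATION.lower(), \
    'STAGING_BUCKET must be in the same region as ENDPOINT_LOCATION'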
Prepare the dataset for evaluation¶
In this lab, you are going to evaluate the text-bison foundation model on a single-label text classification task, using the dair-ai/emotion dataset from Hugging Face.
# Load the dataset from HuggingFace
dataset = load_dataset('dair-ai/emotion', split='test[:5%]')
print('Dataset structure:\n', dataset)
print('Sample:\n', dataset[0])
The evaluation dataset includes prompt and ground truth pairs that align with the task you want to evaluate. Your dataset must include at least one prompt and ground truth pair, but we recommend at least 10 pairs for meaningful metrics; generally speaking, the more examples you provide, the more meaningful the results.
The dataset can be provided in two different formats:
- Pandas DataFrame
- JSONL file on Google Cloud Storage
Next, we prepare the dataset, take a quick look at the prepared split as a DataFrame (see the sketch after the mapping step below), and export it as a JSONL file to Cloud Storage for the evaluation jobs.
class_labels = {
0: 'sadness',
1: 'joy',
2: 'love',
3: 'anger',
4: 'fear',
5: 'surprise'
}
instructions = f'''Classify the following text into one of the following classes:
[{', '.join(class_labels.values())}]
Text:
'''
def add_instructions(example, instructions):
example["prompt"] = f'{instructions}{example["text"]}'
example["ground_truth"] = class_labels[example["label"]]
return example
eval_dataset = dataset.map(lambda x: add_instructions(x, instructions)).remove_columns(['text', 'label'])
print(eval_dataset)
print(eval_dataset[0])
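As noted above, a Pandas DataFrame is the other supported input format. Purely for illustration, the prepared split can be viewed as a DataFrame; the evaluation jobs below consume the JSONL export on Cloud Storage instead.
# Illustrative only: inspect the prepared split as a Pandas DataFrame
eval_df = eval_dataset.to_pandas()
print(eval_df[['prompt', 'ground_truth']].head())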
# Export the dataset split to GCS
jsonl_filename = 'emotions-eval.jsonl'
gcs_uri = f'{STAGING_BUCKET}/{jsonl_filename}'
eval_dataset.to_json(jsonl_filename)
# Copy file to GCS
!gsutil cp {jsonl_filename} {gcs_uri}
# List GCS bucket to verify the file was copied successfully
!gsutil ls {STAGING_BUCKET}/*.jsonl
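Optionally, peek at the first exported record to confirm it contains the prompt and ground_truth fields the pipeline expects.
# Optional: preview the first record of the exported JSONL file
!gsutil cat {gcs_uri} | head -1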
Run Vertex AI LLM Model Evaluation job¶
Option 1: Simple evaluation pipeline submission¶
classification_pipeline_path = 'classification_pipeline.json'
compiler.Compiler().compile(
pipeline_func=evaluation_llm_classification_pipeline,
package_path=classification_pipeline_path
)
base_model = TextGenerationModel.from_pretrained('text-bison@001')
model_name = base_model._model_resource_name
job_id = "base-model-evaluation-{}".format(uuid.uuid4())
experiment_name = 'tweet-emotion-classification'
target_field_name='ground_truth'
evaluation_class_labels=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
parameters = {
"project": PROJECT_ID,
"location": ENDPOINT_LOCATION,
"batch_predict_gcs_destination_output_uri": f'{STAGING_BUCKET}/output',
"evaluation_class_labels": evaluation_class_labels,
"batch_predict_gcs_source_uris": [gcs_uri],
"target_field_name": 'ground_truth',
"model_name": model_name
}
job = aiplatform.PipelineJob(
display_name=job_id,
template_path=classification_pipeline_path,
job_id=job_id,
pipeline_root=STAGING_BUCKET,
parameter_values=parameters,
enable_caching=False,
)
job.submit(experiment=experiment_name)
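The pipeline run can take a while to complete. Once it finishes, the evaluation metrics are logged to the experiment. The sketch below assumes the run associated with this pipeline job appears in Vertex AI Experiments under the job_id defined above, which is how PipelineJob runs submitted with an experiment are typically recorded.
# Block until the evaluation pipeline finishes (this can take a while)
job.wait()
# Retrieve all runs of the experiment as a DataFrame and inspect the run for this job
experiment_df = aiplatform.get_experiment_df(experiment_name)
print(experiment_df[experiment_df['run_name'] == job_id].T)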
Option 2: Evaluation pipeline with custom visualization¶
from google_cloud_pipeline_components.types import artifact_types
from kfp import dsl
from kfp.dsl import Input, Output, Markdown
@dsl.component(
packages_to_install=[
'google_cloud_pipeline_components',
'google-cloud-storage',
'pandas']
)
def record_metrics_component(
evaluation_class_labels: list,
evaluation_metrics: Input[artifact_types.ClassificationMetrics],
confusion_artifact: Output[dsl.ClassificationMetrics],
classification_artifact: Output[Markdown],
raw_metrics: Output[dsl.Metrics]
):
import json
from google.cloud import storage
import pandas as pd
storage_client = storage.Client()
# Read metrics content from GCS
def get_metrics_blob(metrics_uri):
splits = metrics_uri.split("/")
bucket_name = splits[2]
blob_name = '/'.join(splits[3:])
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_name)
with blob.open("r") as f:
return json.loads(f.read())
def get_confusion_matrix(overall_metrics):
confusion_matrix = []
for slice_metric in overall_metrics['slicedMetrics']:
if 'value' in slice_metric['singleOutputSlicingSpec']:
continue
for row in slice_metric['metrics']['classification']['confusionMatrix']['rows']:
confusion_matrix.append(row['dataItemCounts'])
return confusion_matrix
    # Build a metrics table with one row per slice (overall and per class)
def get_classification_metrics(overall_metrics):
all_metrics = overall_metrics['slicedMetrics']
metric_names = ["Metric Slice", "auPrc", "auRoc", "logLoss"]
f1_metrics = ["f1Score"]
aggregated_f1_metrics = ["f1ScoreMicro", "f1ScoreMacro"]
table = [metric_names + f1_metrics + aggregated_f1_metrics]
for metrics in all_metrics:
classification_metric = metrics['metrics']['classification']
slice_name = "class - " + metrics['singleOutputSlicingSpec']['value'] if 'value' in metrics['singleOutputSlicingSpec'] else "Overall"
slice_metric_values = [slice_name]
slice_metric_values.extend(
[classification_metric.get(metric_name, 0)
for metric_name in metric_names[1:]])
slice_metric_values.extend(
[classification_metric['confidenceMetrics'][0].get(metric_name, 0)
for metric_name in f1_metrics])
slice_metric_values.extend(
[classification_metric['confidenceMetrics'][0].get(metric_name, 'n/a')
for metric_name in aggregated_f1_metrics])
table.append(slice_metric_values)
return table
# Log Confusion Matrix artifact
overall_metrics = get_metrics_blob(metrics_uri=evaluation_metrics.uri)
confusion_matrix = get_confusion_matrix(overall_metrics)
evaluation_class_labels.append('UNKNOWN')
confusion_artifact.log_confusion_matrix(
categories=evaluation_class_labels,
matrix=confusion_matrix
)
# Log Classification metrics
metrics_table = get_classification_metrics(overall_metrics)
markdown_content = pd.DataFrame(metrics_table).to_markdown()
with open(classification_artifact.path, 'w') as fp:
fp.write(markdown_content)
    # Log raw scalar metrics: metrics_table[1] is the first slice in slicedMetrics
    # (typically the overall slice); columns 4, 5 and 6 hold f1Score, f1ScoreMicro and f1ScoreMacro
    raw_metrics.log_metric(
        metric='f1Score',
        value=metrics_table[1][4]
    )
    raw_metrics.log_metric(
        metric='f1ScoreMicro',
        value=metrics_table[1][5]
    )
    raw_metrics.log_metric(
        metric='f1ScoreMacro',
        value=metrics_table[1][6]
    )
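For reference, the helper functions above assume the evaluation metrics artifact is a JSON document roughly shaped like the sketch below. This is reconstructed only from the fields the component reads; the real artifact contains additional fields and one entry per class slice, and all values here are placeholders.
# Illustrative shape only, reconstructed from the fields parsed above (placeholder values)
example_overall_metrics = {
    "slicedMetrics": [
        {
            # Overall slice: no "value" key in the slicing spec
            "singleOutputSlicingSpec": {},
            "metrics": {
                "classification": {
                    "auPrc": 0.85,
                    "auRoc": 0.90,
                    "logLoss": 0.40,
                    "confusionMatrix": {"rows": [{"dataItemCounts": [10, 1, 0, 0, 0, 0, 0]}]},
                    "confidenceMetrics": [{"f1Score": 0.80, "f1ScoreMicro": 0.82, "f1ScoreMacro": 0.78}],
                }
            },
        },
        {
            # One slice per class label, identified by "value"
            "singleOutputSlicingSpec": {"value": "joy"},
            "metrics": {"classification": {"confidenceMetrics": [{"f1Score": 0.80}]}},
        },
    ]
}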
@dsl.pipeline
def custom_evaluation_pipeline(
project: str,
location: str,
batch_predict_gcs_destination_output_uri: str,
evaluation_class_labels: list,
batch_predict_gcs_source_uris: list,
model_name: str,
target_field_name: str
):
eval_pipeline = evaluation_llm_classification_pipeline(
project=project,
location=location,
batch_predict_gcs_destination_output_uri=batch_predict_gcs_destination_output_uri,
evaluation_class_labels=evaluation_class_labels,
batch_predict_gcs_source_uris=batch_predict_gcs_source_uris,
target_field_name=target_field_name,
model_name=model_name
)
record_metrics_component(
evaluation_class_labels=evaluation_class_labels,
evaluation_metrics=eval_pipeline.outputs['evaluation_metrics'])
base_model = TextGenerationModel.from_pretrained('text-bison@001')
model_name = base_model._model_resource_name
job_id = "notebooks3-custom-model-evaluation-{}".format(uuid.uuid4())
experiment_name = 'tweet-emotion-classification'
target_field_name='ground_truth'
evaluation_class_labels=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
custom_classification_pipeline_path = 'custom_evaluation_pipeline.json'
compiler.Compiler().compile(
pipeline_func=custom_evaluation_pipeline,
package_path=custom_classification_pipeline_path
)
parameters = {
"project": PROJECT_ID,
"location": ENDPOINT_LOCATION,
"batch_predict_gcs_destination_output_uri": f'{STAGING_BUCKET}/output',
"evaluation_class_labels": evaluation_class_labels,
"batch_predict_gcs_source_uris": [gcs_uri],
"target_field_name": 'ground_truth',
"model_name": model_name,
}
job = aiplatform.PipelineJob(
display_name=job_id,
    template_path=custom_classification_pipeline_path,
    job_id=job_id,
pipeline_root=STAGING_BUCKET,
parameter_values=parameters,
enable_caching=True,
)
job.submit(experiment=experiment_name)
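As with Option 1, the run is tracked under the same experiment. Once the pipeline completes, the confusion matrix, the Markdown metrics table, and the raw metrics logged by record_metrics_component are visible on the pipeline run page in the Google Cloud console and alongside the experiment run. A small sketch to block until completion and confirm the final state:
# Wait for the custom evaluation pipeline to finish and confirm its final state
job.wait()
print('Pipeline state:', job.state)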