# Copyright 2025 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Image-Prompt Alignment
This Eval Recipe demonstrates how to use a prompt alignment autorater to compare the image generation quality of two models (Imagen 2 and Imagen 3) using the Vertex AI Evaluation Service.
Use case: Image Generation
Dataset: This eval recipe uses two JSONL dataset files based on the same set of prompts; each file maps the prompts to the images generated by Imagen 2 or Imagen 3, respectively (an illustrative record is shown below).
Metric: We use an autorater inspired by Gecko that generates questions about all visually groundable aspects of the image, answers those questions, assigns a prompt alignment score based on the answers, and generates an explanation for each identified gap.
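For illustration, each dataset record is assumed to be a JSON object with at least a prompt field (the image generation prompt) and an image_uri field pointing to the generated image in Cloud Storage; these are the two fields the recipe reads. A hypothetical record might look like this:
{"prompt": "A red bicycle leaning against a brick wall at sunset", "image_uri": "gs://your-bucket/images/bicycle_001.png"}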
Step 1 of 4: Configure eval settings
%%writefile .env
PROJECT_ID=your-project-id # Google Cloud Project ID
LOCATION=us-central1 # Region for all required Google Cloud services
EXPERIMENT_NAME=eval-image-prompt-alignment # Creates Vertex AI Experiment to track the eval runs
MODEL_JUDGE=gemini-2.0-flash-001 # This model will run the autorater prompt
DATASET_URI_IMAGEN2="gs://gemini_assets/image_prompt_alignment/dataset_imagen2.jsonl" # Evaluation dataset for Imagen 2
DATASET_URI_IMAGEN3="gs://gemini_assets/image_prompt_alignment/dataset_imagen3.jsonl" # Evaluation dataset for Imagen 3
Step 2 of 4: Install Python libraries
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation] python-dotenv
# The error "session crashed" is expected. Please ignore it and proceed to the next cell.
import IPython
IPython.Application.instance().kernel.do_shutdown(True)
Step 3 of 4: Authenticate to Google Cloud (requires permission to open a popup window)
import os
import sys
import pandas as pd
import vertexai
from datetime import datetime
from dotenv import load_dotenv
from google import genai
from google.cloud import storage
from google.genai.types import Content, Part
from vertexai.evaluation import EvalTask, CustomMetric
load_dotenv(override=True)
if os.getenv("PROJECT_ID") == "your-project-id":
    raise ValueError("Please configure your Google Cloud Project ID in the first cell.")
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()
vertexai.init(project=os.getenv('PROJECT_ID'), location=os.getenv('LOCATION'))
_gemini_client = genai.Client(vertexai=True, project=os.getenv('PROJECT_ID'), location=os.getenv('LOCATION'))
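Optional sanity check (not part of the original recipe): the minimal sketch below assumes the dataset URIs configured in .env point to readable JSONL files and prints the first record of each one, so you can confirm access and record structure before running the evaluation. The preview_dataset helper is illustrative.
import os
from google.cloud import storage

def preview_dataset(gcs_uri: str) -> None:
    # Download the JSONL file from Cloud Storage and print its first record.
    blob = storage.Blob.from_string(gcs_uri, storage.Client())
    first_record = blob.download_as_text().splitlines()[0]
    print(f'{gcs_uri}:\n  {first_record}')

preview_dataset(os.getenv('DATASET_URI_IMAGEN2'))
preview_dataset(os.getenv('DATASET_URI_IMAGEN3'))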
Step 4 of 4: Evaluate images from Baseline and Candidate models and print the alignment scores
import json
import os
import pandas as pd
import vertexai
from datetime import datetime
from IPython.display import clear_output
from vertexai.evaluation import EvalTask, EvalResult, MetricPromptTemplateExamples
_AUTORATER_PROMPT_TEMPLATE = '''
You are an expert image analyst with a keen eye for detail and a deep understanding of linguistics and human perception.
# Definitions
- **Visually Groundable Requirement:** A specific claim or requirement within the image description that can be verified or refuted by examining the visual content of the image. This includes descriptions of objects (existence and attributes like color, size, shape, or text on the object), spatial relationships between objects, actions depicted, or overall scene characteristics like lighting conditions.
- **Gap:** A visually groundable requirement that is either contradicted by the image or cannot be directly confirmed based on the image.
# Instructions
Review the image and a description of that image located in the IMAGE_DESCRIPTION tag below.
Your goal is to rate the accuracy of the image description on a scale from 0 to 100.
You must use the following 6-step process and provide brief written notes for each step:
- Step 1. Identify all Visually Groundable Requirements contained in IMAGE_DESCRIPTION and save them to a numbered list.
- Step 2. Write a numbered list of true/false questions that should be asked about each of the identified requirements in order to verify whether each requirement is satisfied by the image or not.
- Step 3. For each of the questions created in Step 2 write a brief analysis of the most relevant information in the provided image and then write the final answer:
- True only if the image contains a clear positive answer to this question.
- False if the image clearly justifies a negative answer to this question OR does not have enough information to answer this question.
- Step 4. Calculate the number of questions that received the answer "True" in step 3.
- Step 5. Calculate the final accuracy score as the percentage of positively answered questions out of the total questions answered in Step 3, rounded to the nearest integer.
- Step 6. Write the final answer as a Markdown codeblock containing a single JSON object with two attributes:
- "score" with the integer value of the final accuracy score calculated in Step 5.
- "gaps" with a JSON array of strings that describe each gap (question that got a negative answer in Step 3). The description should be a one sentence statement that combines key information from the question and the analysis of relevant information from Step 3.
<IMAGE_DESCRIPTION>
{image_description}
</IMAGE_DESCRIPTION>
'''
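# For reference, the autorater's final answer (Step 6 above) is expected to be a Markdown code
# block containing a single JSON object, for example (illustrative values, not real model output):
# ```json
# {"score": 80, "gaps": ["The description mentions a red hat, but the hat in the image is blue."]}
# ```
# The parsing logic in image_prompt_alignment_autorater below relies on this format.
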
def load_text_file(gcs_uri: str) -> str:
    blob = storage.Blob.from_string(gcs_uri, storage.Client())
    return blob.download_as_string().decode('utf-8')

def load_image(gcs_uri: str) -> bytes:
    blob = storage.Blob.from_string(gcs_uri, storage.Client())
    return blob.download_as_bytes()

def load_dataset(dataset_uri: str) -> pd.DataFrame:
    '''Convert the dataset to a Pandas DataFrame and load all images into the "image" column.'''
    lines = load_text_file(dataset_uri).splitlines()
    data = [json.loads(line) for line in lines if line.strip()]
    df = pd.DataFrame(data)
    df['image'] = df['image_uri'].apply(lambda image_uri: load_image(image_uri))
    return df[['image_uri', 'prompt', 'image']]
def image_prompt_alignment_autorater(record: dict) -> dict:
    '''Custom metric function for scoring prompt alignment between the image and prompt from the given dataset record.'''
    response = _gemini_client.models.generate_content(
        model=os.getenv('MODEL_JUDGE'),
        contents=[
            Content(role='user', parts=[Part(text=_AUTORATER_PROMPT_TEMPLATE.format(image_description=record['prompt']))]),
            Content(role='user', parts=[Part.from_bytes(data=record['image'], mime_type='image/jpeg')])
        ]
    )
    # Extract the JSON object from the ```json code block that the autorater is instructed to emit.
    json_output = json.loads(response.text.split('```json\n')[1].split('\n```')[0])
    return {
        "image_prompt_alignment": json_output['score'],
        "explanation": '\n'.join(json_output['gaps'])
    }
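# Note: the EvalTask results expose the custom metric output in metrics_table as the
# "image_prompt_alignment/score" and "image_prompt_alignment/explanation" columns,
# which the helper below reads.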
def print_scores_and_explanations(title: str, eval_result: EvalResult) -> None:
    print(f'\n{"-"*80}\nRESULTS FOR {title}:')
    for _, row in eval_result.metrics_table.iterrows():
        gaps = row["image_prompt_alignment/explanation"]
        gaps = f', GAPS: {gaps}' if gaps else ''
        print(f'{row["image_uri"]}: SCORE={row["image_prompt_alignment/score"]}%{gaps}')
def run_eval(model: str, dataset_uri: str, experiment_name: str):
    '''Rate the alignment between image generation prompts and the generated images and identify gaps using a custom autorater.'''
    timestamp = datetime.now().strftime('%b-%d-%H-%M-%S').lower()
    dataset = load_dataset(dataset_uri)
    task = EvalTask(
        dataset=dataset,
        metrics=[CustomMetric(name="image_prompt_alignment", metric_function=image_prompt_alignment_autorater)],
        experiment=experiment_name
    )
    return task.evaluate(experiment_run_name=f"{timestamp}-{model.lower().replace('.', '-')}")
def compare_models(project_id: str, location: str, experiment_name: str, model_a: str, dataset_uri_a: str, model_b: str, dataset_uri_b: str) -> None:
    global _gemini_client
    _gemini_client = genai.Client(vertexai=True, project=project_id, location=location)
    vertexai.init(project=project_id, location=location)
    results_a = run_eval(model_a, dataset_uri_a, experiment_name)
    results_b = run_eval(model_b, dataset_uri_b, experiment_name)
    clear_output()
    print_scores_and_explanations(model_a, results_a)
    print_scores_and_explanations(model_b, results_b)
    print(f"\n{model_a} average alignment score = {results_a.summary_metrics['image_prompt_alignment/mean']:.1f}%")
    print(f"{model_b} average alignment score = {results_b.summary_metrics['image_prompt_alignment/mean']:.1f}%")
compare_models(
    project_id=os.getenv('PROJECT_ID'),
    location=os.getenv('LOCATION'),
    experiment_name=os.getenv('EXPERIMENT_NAME'),
    model_a="IMAGEN2",
    dataset_uri_a=os.getenv('DATASET_URI_IMAGEN2'),
    model_b="IMAGEN3",
    dataset_uri_b=os.getenv('DATASET_URI_IMAGEN3')
)
Learn more about the Vertex AI Gen AI Evaluation Service.