Copyright 2024 Google LLC.¶
#@title Licensed under the Apache License 2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
2D spatial reasoning with Gemini 2.0¶
This notebook demonstrates object detection and spatial reasoning with Gemini 2.0 in Vertex AI.
You'll learn how to use Gemini to detect objects in an image and plot the resulting bounding boxes.
You will find live examples of object detection combined with:
- overlaying information
- searching within an image
- translating and understanding things in multiple languages
- using Gemini's reasoning abilities
Please note
There's no "magical prompt". Feel free to experiment with different ones. You can use the samples included in this notebook or upload your own images and write your own prompts.
This notebook is based on a notebook published by the Gemini AI Studio team.
Setup¶
Please enter the PROJECT_ID of your Google Cloud Project
PROJECT_ID = '[your-project-id]' # @param {type: 'string'}
LOCATION = 'us-central1' # @param {type: 'string'}
Install or upgrade Vertex AI SDK and restart the Colab runtime (if necessary).
def install_or_upgrade_vertex_ai():
    package_name = "google-cloud-aiplatform"
    required_version = "1.73.0"
    try:
        import google.cloud.aiplatform as aiplatform
        from packaging.version import Version  # compare versions numerically, not as strings
        installed_version = aiplatform.__version__
        if Version(installed_version) < Version(required_version):
            print(f"Upgrading {package_name} from version {installed_version} to {required_version}...")
            !pip install google-cloud-aiplatform --upgrade --quiet --user
            print(f"Successfully upgraded {package_name}.")
            restart_runtime()
        else:
            print(f"{package_name} is already installed with version {installed_version}.")
    except ImportError:
        print(f"{package_name} is not installed. Installing version {required_version}...")
        !pip install google-cloud-aiplatform --upgrade --quiet --user
        print(f"Successfully installed {package_name}.")
        restart_runtime()

def restart_runtime():
    import IPython
    print('The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says "Your session crashed for an unknown reason." This is expected.')
    print('Once the runtime is restarted you can continue running individual cells or run the entire notebook with the "Run all" command.')
    IPython.Application.instance().kernel.do_shutdown(True)
install_or_upgrade_vertex_ai()
google-cloud-aiplatform is already installed with version 1.73.0.
import base64
import io
import os
import requests
import sys
from io import BytesIO
from PIL import Image
if 'google.colab' in sys.modules:
    from google.colab import auth
    auth.authenticate_user()
Initialize Vertex AI SDK client¶
Please rerun this cell if you ever get a Session Timeout error:
import vertexai
from vertexai.generative_models import (GenerativeModel, HarmBlockThreshold, HarmCategory, Part)
model_name = 'gemini-2.0-flash-exp' # This specific model is required
vertexai.init(project=PROJECT_ID, location=LOCATION)
Configure the model¶
The system instructions are mainly used to keep the prompts shorter, so you don't have to repeat the output format each time. They also tell the model how to deal with similar objects, which is a nice way to let it be creative.
bounding_box_system_instructions = """
Return bounding boxes as a JSON array with labels. Never return masks or code fencing. Limit to 25 objects.
If an object is present multiple times, name them according to their unique characteristics (colors, size, position, etc.)
"""
safety_settings = {
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

# Temperature above 0 is recommended to prevent repeated outputs
generation_config = {
    'temperature': 0.5,
    'max_output_tokens': 8192,
    'candidate_count': 1,
}

model = GenerativeModel(
    model_name=model_name,
    system_instruction=bounding_box_system_instructions,
    generation_config=generation_config,
    safety_settings=safety_settings,
)
Utils¶
These scripts will be needed to plot the bounding boxes. Of course they are just examples and you are free to use any other libraries.
# @title Bounding Box Visualization
# Get the Noto JP font to display Japanese characters
!apt-get install fonts-noto-cjk # For Noto Sans CJK JP
import json
import random
import io
from PIL import Image, ImageDraw, ImageFont
from PIL import ImageColor
additional_colors = [colorname for (colorname, colorcode) in ImageColor.colormap.items()]
def plot_bounding_boxes(im, bounding_boxes):
    """
    Plots bounding boxes on an image with a label for each box, using PIL,
    normalized coordinates, and different colors.

    Args:
        im: The PIL Image to draw on.
        bounding_boxes: A JSON string listing the detected objects, each with
            a "label" and a position in normalized [y1, x1, y2, x2] format.
    """
    img = im.resize((1024, 1024))
    width, height = img.size
    draw = ImageDraw.Draw(img)

    # Define a list of bounding box border colors
    colors = [
        'red',
        'green',
        'blue',
        'yellow',
        'orange',
        'pink',
        'purple',
        'brown',
        'gray',
        'beige',
        'turquoise',
        'cyan',
        'magenta',
        'lime',
        'navy',
        'maroon',
        'teal',
        'olive',
        'coral',
        'lavender',
        'violet',
        'gold',
        'silver',
    ] + additional_colors

    # Parse out the markdown fencing
    bounding_boxes = parse_json(bounding_boxes)

    font = ImageFont.truetype("NotoSansCJK-Regular.ttc", size=14)

    # Iterate over the bounding boxes
    for i, bounding_box in enumerate(json.loads(bounding_boxes)):
        # Select a color from the list
        color = colors[i % len(colors)]

        # Convert normalized coordinates to absolute coordinates
        abs_y1 = int(bounding_box["box_2d"][0] / 1000 * height)
        abs_x1 = int(bounding_box["box_2d"][1] / 1000 * width)
        abs_y2 = int(bounding_box["box_2d"][2] / 1000 * height)
        abs_x2 = int(bounding_box["box_2d"][3] / 1000 * width)

        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1
        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1

        # Draw the bounding box
        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=4
        )

        # Draw the label
        if "label" in bounding_box:
            draw.text((abs_x1 + 8, abs_y1 + 6), bounding_box["label"], fill=color, font=font)

    # Display the image
    img.show()
    return img
# @title Image encoder
import mimetypes

def encode_image(local_file_path: str) -> Part:
    # Infer the mime type from the extension (the samples include both JPEG and PNG)
    mime_type, _ = mimetypes.guess_type(local_file_path)
    with open(local_file_path, 'rb') as f:
        return Part.from_data(data=f.read(), mime_type=mime_type or 'image/jpeg')
# @title Model output parsing
def parse_json(json_output):
    # Parse out the markdown fencing
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line == "```json":
            json_output = "\n".join(lines[i+1:])  # Remove everything before "```json"
            json_output = json_output.split("```")[0]  # Remove everything after the closing "```"
            break  # Exit the loop once "```json" is found
    return json_output
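For instance, here is what parse_json strips away when the model wraps its answer in markdown fencing (the sample string below is a made-up response, purely for illustration):
# A hypothetical fenced response, to show what parse_json removes
sample = '```json\n[{"box_2d": [100, 200, 300, 400], "label": "cupcake"}]\n```'
print(parse_json(sample))  # -> [{"box_2d": [100, 200, 300, 400], "label": "cupcake"}]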
Download sample images¶
# Load sample images
!wget https://storage.googleapis.com/generativeai-downloads/images/socks.jpg -O Socks.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/vegetables.jpg -O Vegetables.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/Japanese_Bento.png -O Japanese_bento.png -q
!wget https://storage.googleapis.com/generativeai-downloads/images/Cupcakes.jpg -O Cupcakes.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/origamis.jpg -O Origamis.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/fruits.jpg -O Fruits.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/cat.jpg -O Cat.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/pumpkins.jpg -O Pumpkins.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/breakfast.jpg -O Breakfast.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/bookshelf.jpg -O Bookshelf.jpg -q
!wget https://storage.googleapis.com/generativeai-downloads/images/spill.jpg -O Spill.jpg -q
Object detection with Bounding Boxes¶
Let's start by loading an image:
image = "Cupcakes.jpg" # @param ["Socks.jpg","Vegetables.jpg","Japanese_bento.png","Cupcakes.jpg","Origamis.jpg","Fruits.jpg","Cat.jpg","Pumpkins.jpg","Breakfast.jpg","Bookshelf.jpg", "Spill.jpg"] {"allow-input":true}
Image.open(image).resize((400,400))
Now we can test a simple prompt to find all the cupcakes in the image.
To prevent the model from repeating itself, it is recommended to use a temperature above 0, in this case 0.5. Limiting the number of items (25 in the system instructions) also prevents the model from looping and speeds up the decoding of the bounding boxes.
prompt = "Detect the 2d bounding boxes of the cupcakes (with “label” as topping description”)" # @param {type:"string"}
# Run inference to find the bounding boxes
response = model.generate_content(
    contents=[prompt, encode_image(image)]
)
print(response.text)
As you can see, even without any instructions about the format, Gemini is trained to always respond with a "label" and the coordinates of the bounding box in a "box_2d" array.
Please note that the Y coordinates precede the X coordinates, which is a bit unusual.
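As a quick sanity check, here is how one entry maps to pixel coordinates, mirroring the math inside plot_bounding_boxes (the values below are hypothetical):
import json

# Hypothetical single-object output; box_2d is [y1, x1, y2, x2] on a 0-1000 scale
sample_response = '[{"box_2d": [275, 102, 504, 330], "label": "cupcake with pink frosting"}]'
y1, x1, y2, x2 = json.loads(sample_response)[0]["box_2d"]

width = height = 1024  # plot_bounding_boxes resizes the image to 1024x1024
top_left = (int(x1 / 1000 * width), int(y1 / 1000 * height))
bottom_right = (int(x2 / 1000 * width), int(y2 / 1000 * height))
print(top_left, bottom_right)  # (104, 281) (337, 516)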
plot_bounding_boxes(Image.open(image), response.text)
Search within an image¶
A more nuanced example of finding requested objects and displaying bounding boxes with additional information.
image = "Socks.jpg" # @param ["Socks.jpg","Vegetables.jpg","Japanese_bento.png","Cupcakes.jpg","Origamis.jpg","Fruits.jpg","Cat.jpg","Pumpkins.jpg","Breakfast.jpg","Bookshelf.jpg", "Spill.jpg"] {"allow-input":true}
prompt = "Find the sock that matches the one at the top and return the bounding box for that sock" # @param ["Detect all rainbow socks", "Show me the positions of the socks with the face","Find the sock that matches the one at the top and return the bounding box for that sock"] {"allow-input":true}
# Run inference to find bounding boxes
response = model.generate_content(
    contents=[prompt, encode_image(image)]
)
# Check output
print(response.text)
# Generate image with bounding boxes
plot_bounding_boxes(Image.open(image), response.text)
Try it with different images and prompts. Several samples are provided, but you can also write your own.
Multilingual capabilities¶
As Gemini is able to understand multiple languages, you can combine spatial reasoning with multilingual capabilities.
You can give it an image like this one and prompt it to label each item with Japanese characters and an English translation. The model reads the text, recognizes the items pictured in the image itself, and translates them.
image = "Japanese_bento.png" # @param ["Socks.jpg","Vegetables.jpg","Japanese_bento.png","Cupcakes.jpg","Origamis.jpg","Fruits.jpg","Cat.jpg","Pumpkins.jpg","Breakfast.jpg","Bookshelf.jpg", "Spill.jpg"] {"allow-input":true}
prompt = "Explain what those dishes are with a 5 words description" # @param ["Detect food items, with Japanese characters + english translation in \"label\".", "Show me the vegan dishes","Explain what those dishes are with a 5 words description","Find the dishes with allergens and label them accordingly"] {"allow-input":true}
# Run inference to find bounding boxes
response = model.generate_content(
    contents=[prompt, encode_image(image)]
)
# Generate image with bounding boxes
plot_bounding_boxes(Image.open(image), response.text)
Reasoning capabilities¶
The model can also reason based on the image. You can ask it about the positions of items, their purpose, or, as in this example, to find the shadow of a specific item.
image = "Origamis.jpg" # @param ["Socks.jpg","Vegetables.jpg","Japanese_bento.png","Cupcakes.jpg","Origamis.jpg","Fruits.jpg","Cat.jpg","Pumpkins.jpg","Breakfast.jpg","Bookshelf.jpg", "Spill.jpg"] {"allow-input":true}
prompt = "Draw a square around the fox' shadow" # @param ["Find the two origami animals.", "Where are the origamis' shadows?","Draw a square around the fox' shadow"] {"allow-input":true}
# Run inference to find bounding boxes
response = model.generate_content(
    contents=[prompt, encode_image(image)]
)
# Generate image with bounding boxes
plot_bounding_boxes(Image.open(image), response.text)
If you look back at the previous examples, the Japanese food one in particular, you'll find several other sample prompts for experimenting with Gemini's reasoning capabilities.
Summary¶
This notebook demonstrated a few ways to leverage Gemini 2.0's spatial reasoning across various tasks, including object detection, searching within an image, multilingual labeling, and reasoning about a scene.