# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Author(s) | Lei Pan |
Last updated | 01/22/2024 |
Generate Fine-tuning Dataset¶
Codey models are text-to-code models from Google AI, trained on a massive code related dataset. You can generate code related responses for different scenarios such as writing functions, unit tests, debugging, explaining code etc. Here is the overview of all the Codey APIs.
For some scenarios, fine-tuned codey models work better such as generating code using custom libraries it has never been trained before. In those use cases, you will need to create training dataset to be able to do fine-tuning. Here is the overview of codey fine-tuning.
In this notebook, we will show you how to generate fine-tuning dataset to tune codey models.
- Step 1: Set up basic input and output text
- Step 2: Simulate more examples based on the input texts
- Step 3: Automatically store json data to a JSONL File
- Step 4: Automatically upload JSONL to the GCS bucket
Caveat: this is done as an example only. In the real world practice, you want to generate more examples for different aspects of using APIs and twist around it to find optimal training datasets.
Install pre-requisites¶
If running in Colab install the pre-requisites into the runtime. Otherwise it is assumed that the notebook is running in Vertex Workbench. In that case it is recommended to install the pre-requistes from a terminal using the --user
option.
import sys
if 'google.colab' in sys.modules:
! pip install google-cloud-aiplatform
! pip install jsonlines
from google.colab import auth as google_auth
google_auth.authenticate_user()
import json
import os
from typing import Dict, List, Optional, Tuple
import jsonlines
import vertexai
from vertexai.language_models import TextGenerationModel
from google.cloud import storage
Initialize Vertex AI¶
Please set VERTEX_API_PROJECT and VERTEX_API_LOCATION below with your project id and location for Vertex AI. This should be the project in which you enabled Vertex AI.
vertexai.init(project="your project", location="your location")
Set Up Text Generation Function and GCS Bucket Upload Function¶
- We need text generation to simulate input training data
- We need GCS upload function to automatically upload generated training dataset to your GCS bucket
def paraphrase_input_text(input_text: str) -> str:
parameters = {
"temperature": 0.2,
"max_output_tokens": 256,
"top_p": 0.8,
"top_k": 40 ,
}
model = TextGenerationModel.from_pretrained("text-bison")
response = model.predict(
f"Paraphrase this sentence: {input_text}",
**parameters,
)
print(f"Response from Model: {response.text}")
return response.text
def upload_blob(bucket_name, source_file_name, destination_blob_name):
"""Uploads a file to the bucket."""
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
generation_match_precondition = None
blob.upload_from_filename(source_file_name, if_generation_match=generation_match_precondition)
print(
f"File {source_file_name} uploaded to {destination_blob_name}."
)
Step 1: Set Up Basic Input and Output Text¶
- Training datasets for fine-tuning codey models should be in a jsonline file and this is the format [ {input_text:"xxx",output_text:"xxx"} {input_text:"xxx",output_text:"xxx"} ..... ]
- Input_text means the prompts that you want the model to understand, output_text means the results/coding blocks you want the model to produce
- In this example, since we want codey model to know how to use vertex AI search API do to 1) basic search request and 2) more advanced function in the response - converting protobuf to dictionary, we will use the corresponding input and output text for model to learn how to use the API
Refer to this link to check out the full function of search_sample: https://github.com/GoogleCloudPlatform/python-docs-samples/blob/HEAD/discoveryengine/search_sample.py
We only use a few lines from that example as training dataset.
basic_input_text = "Create a function to perform search requests to the Vertex AI Search and Conversation API and return the search results."
basic_output_text = """def search_sample(
project_id: str,
location: str,
search_engine_id: str,
serving_config_id: str,
search_query: str,
) -> List[discoveryengine.SearchResponse.SearchResult]:
client = discoveryengine.SearchServiceClient()
serving_config = client.serving_config_path(
project=project_id,
location=location,
data_store=search_engine_id,
serving_config=serving_config_id,
)
request = discoveryengine.SearchRequest(
serving_config=serving_config,
query=search_query,
)
response = client.search(request)
return response"""
advance_input_text = "Create a function to send search requests to Vertex AI Search API, convert the protobuf search response to a dictionary, and return the dictionary result."
advance_output_text = """
def search_sample(
project_id: str,
location: str,
search_engine_id: str,
serving_config_id: str,
search_query: str,
) -> List[discoveryengine.SearchResponse.SearchResult]:
client = discoveryengine.SearchServiceClient()
serving_config = client.serving_config_path(
project=project_id,
location=location,
data_store=search_engine_id,
serving_config=serving_config_id,
)
request = discoveryengine.SearchRequest(
serving_config=serving_config,
query=search_query,
)
response = client.search(request)
results = [MessageToDict(result.document._pb) for result in response.results]
return results
"""
Step 2: Simulate More Examples Based on the Input Texts¶
- We got 2 examples above. That's not enough to fine-tune codey models. We're going to use text-bison model to simulate 8 more input_text variations to map to the same output/results we mentioned above.
- With 10 examples, models should be able to learn for each category of prompt, what output we are looking for.
Call text model to simulate more input text as examples
json_data =[{"input_text": basic_input_text,"output_text": basic_output_text},
{"input_text": advance_input_text,"output_text": advance_output_text}
]
def simulate_input_text_add_jsondata(temp_input,temp_output,json_data):
for i in range(4):
new_input_text = paraphrase_input_text(temp_input)
line_json = {"input_text": new_input_text,"output_text": temp_output}
json_data.append(line_json)
temp_input = new_input_text
return json_data
json_data = simulate_input_text_add_jsondata(basic_input_text,basic_output_text,json_data)
json_data = simulate_input_text_add_jsondata(advance_input_text,advance_output_text,json_data)
Step 3: Automatically Store Json Data to a JSONL File¶
with open('output.jsonl', 'w') as outfile:
for entry in json_data:
json.dump(entry, outfile)
outfile.write('\n')
jsonl_file = open("output.jsonl", "r")
print(jsonl_file.read())
{"input_text": "Create a function to perform search requests to the Vertex AI Search and Conversation API and return the search results.", "output_text": "def search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n\n return response"} {"input_text": "Create a function to send search requests to Vertex AI Search API, convert the protobuf search response to a dictionary, and return the dictionary result.", "output_text": "\ndef search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n results = [MessageToDict(result.document._pb) for result in response.results]\n\n return results\n"} {"input_text": " Develop a function that can send search queries to the Vertex AI Search and Conversation API and provide the search results.", "output_text": "def search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n\n return response"} {"input_text": " Create a function that can send search queries to the Vertex AI Search and Conversation API and return the search results.", "output_text": "def search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n\n return response"} {"input_text": " Write a function that can send search queries to the Vertex AI Search and Conversation API and then return the search results.", "output_text": "def search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n\n return response"} {"input_text": " Create a function that can send search queries to the Vertex AI Search and Conversation API and then return the search results.", "output_text": "def search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n\n return response"} {"input_text": " Write a function that sends search requests to the Vertex AI Search API, converts the protobuf search response into a dictionary, and returns the dictionary result.", "output_text": "\ndef search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n results = [MessageToDict(result.document._pb) for result in response.results]\n\n return results\n"} {"input_text": " Create a function that sends search requests to the Vertex AI Search API, converts the protobuf search response into a Python dictionary, and returns the dictionary result.", "output_text": "\ndef search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n results = [MessageToDict(result.document._pb) for result in response.results]\n\n return results\n"} {"input_text": " Write a function that:\n- Sends search requests to the Vertex AI Search API.\n- Converts the protobuf search response into a Python dictionary.\n- Returns the dictionary result.", "output_text": "\ndef search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n results = [MessageToDict(result.document._pb) for result in response.results]\n\n return results\n"} {"input_text": " Create a function that:\n- Makes search requests to the Vertex AI Search API.\n- Changes the protobuf search response into a Python dictionary.\n- Gives back the dictionary result.", "output_text": "\ndef search_sample(\n project_id: str,\n location: str,\n search_engine_id: str,\n serving_config_id: str,\n search_query: str,\n) -> List[discoveryengine.SearchResponse.SearchResult]:\n client = discoveryengine.SearchServiceClient()\n serving_config = client.serving_config_path(\n project=project_id,\n location=location,\n data_store=search_engine_id,\n serving_config=serving_config_id,\n )\n\n request = discoveryengine.SearchRequest(\n serving_config=serving_config,\n query=search_query,\n )\n response = client.search(request)\n results = [MessageToDict(result.document._pb) for result in response.results]\n\n return results\n"}
Step 4: Automatically Upload JSONL to the GCS Bucket¶
Replace "your bucket name" with your GCS bucket for fine-tuning code model
upload_blob("your bucket name", "output.jsonl", "output.jsonl")
File output.jsonl uploaded to output.jsonl.