Hi,
Below is the script I used to call the chirp model to get transcriptions. Each time I call the transcription API on the same audio file, I get different results (I mean I get 2 to 3 different responses).
NOTE: I am comparing the results against the words actually spoken in the audio files.
import argparse
import json
import logging
import os
from base64 import b64encode
from copy import deepcopy
from datetime import datetime
from typing import Dict, Union

import requests
from google.auth.exceptions import DefaultCredentialsError
from google.auth.transport.requests import Request
from google.oauth2 import service_account

log_dir = "../logs"
os.makedirs(log_dir, exist_ok=True)

# Set up logging
log_file = f"{log_dir}/log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(filename=log_file, level=logging.INFO)
class SpeechAPI:
    """
    Represents a client for the Google Cloud Speech-to-Text API.
    """

    def get_credentials(self):
        credentials = service_account.Credentials.from_service_account_file(
            self._secret_json_filepath,
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )
        return credentials

    def update_refresh_token(self):
        self.credentials.refresh(Request())
        self.token = self.credentials.token
        self.headers["Authorization"] = f"Bearer {self.token}"

    def __init__(self, secret_json_filepath: str) -> None:
        self._secret_json_filepath = secret_json_filepath
        # load the service-account key file
        with open(secret_json_filepath, "r") as f:
            self._secret_json = json.load(f)
        # request counter used to decide when to refresh the token
        self.count = 0
        # load the credentials from the JSON key file
        try:
            self.credentials = self.get_credentials()
            # refresh the credentials to obtain a new access token
            self.credentials.refresh(Request())
            # access token
            self.token = self.credentials.token
        except DefaultCredentialsError as e:
            logging.error(f"Error: {e}")
            raise SystemExit(1)
        # headers used to call the API
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.token}",
        }
        self.project_id = self._secret_json["project_id"]
        self.region = "us-central1"
        # self.region = "asia-southeast1"
        self.region_v2 = "eu"  # for telephony and en-US, en-IN
        self.url = {
            "v2": f"https://{self.region_v2}-speech.googleapis.com/v2/projects/{self.project_id}/locations/{self.region_v2}/recognizers/_:recognize",
            "v1": "https://speech.googleapis.com/v1/speech:recognize",
            "v1p1beta1": "https://speech.googleapis.com/v1p1beta1/speech:recognize",
            "chirp": f"https://{self.region}-speech.googleapis.com/v2/projects/{self.project_id}/locations/{self.region}/recognizers/_:recognize",
            "chirp_telephony": f"https://{self.region}-speech.googleapis.com/v2/projects/{self.project_id}/locations/{self.region}/recognizers/_:recognize",
        }
        # default configs per API version/model
        self.config_dict = {
            "v2": {
                "config": {
                    # possible models: "long", "short", "telephony", "telephony_short"
                    # (telephony doesn't support boosting/adaptation)
                    "model": "short",
                    "languageCodes": [
                        "en-US",
                        "en-IN",
                        "en-GB",
                    ],
                    "features": {
                        "maxAlternatives": 5,
                        "enableWordTimeOffsets": True,
                        "enableWordConfidence": True,
                        # separate recognition per channel
                        "multiChannelMode": "SEPARATE_RECOGNITION_PER_CHANNEL",
                    },
                    # "adaptation": {
                    #     "phraseSets": [
                    #         {
                    #             "inlinePhraseSet": {
                    #                 "phrases": [
                    #                     {
                    #                         "value": "$OOV_CLASS_DIGIT_SEQUENCE",
                    #                         "boost": 20,
                    #                     },
                    #                 ],
                    #             },
                    #         },
                    #     ],
                    # },
                    "transcriptNormalization": {},
                    "autoDecodingConfig": {},
                },
                "content": "",
            },
            "v1": {
                "config": {
                    "languageCode": "en-US",
                    "alternativeLanguageCodes": [
                        "en-IN",
                        "en-GB",
                    ],
                    # phrases must be a list of strings
                    "speechContexts": [
                        {"phrases": ["$OOV_CLASS_DIGIT_SEQUENCE"], "boost": 100}
                    ],
                    "enableWordConfidence": True,
                    "enableWordTimeOffsets": True,
                    "useEnhanced": True,
                    "maxAlternatives": 5,
                    # metadata is deprecated, so it has no effect
                    "metadata": {
                        "microphoneDistance": "NEARFIELD",
                        "originalMediaType": "VIDEO",
                        "interactionType": "VOICE_COMMAND",
                        "recordingDeviceType": "SMARTPHONE",
                    },
                    "audioChannelCount": 2,
                    "enableSeparateRecognitionPerChannel": True,
                },
                "audio": {"content": ""},
            },
            "chirp": {
                "config": {
                    "model": "chirp",
                    "languageCodes": [
                        "en-US",
                    ],
                    # "autoDecodingConfig": {},
                    "explicitDecodingConfig": {
                        "encoding": "FLAC",
                        "sampleRateHertz": 48000,
                        "audioChannelCount": 2,
                    },
                    "features": {
                        "maxAlternatives": 5,
                        "enableWordTimeOffsets": True,
                        # "enableWordConfidence": True,  # not supported by the "chirp" model
                        "multiChannelMode": "SEPARATE_RECOGNITION_PER_CHANNEL",
                    },
                    # adaptation is not supported by the "chirp" model
                },
                "content": "",
            },
        }

    def update_config_dict(self, config_dict: Dict, version: str = "v2") -> None:
        """
        Args:
            config_dict (Dict): configuration dictionary ({"config": {...}})
            version (str): which version's default config to overwrite
        NOTE: don't pass the audio content, just pass the {"config": {...}} dictionary
        """
        self.config_dict[version]["config"] = config_dict["config"]

    def get_transcription(
        self, audio_filepath: str, version="v2", language: str = None
    ) -> Dict[str, Union[str, Dict[str, str]]]:
        """
        Args:
            audio_filepath (str): path to the audio file
            version (str, optional): API version to use. Defaults to "v2".
                all possible versions: v2, v1, v1p1beta1, chirp, chirp_telephony
        Returns:
            Dict mapping the audio file extension to the recognition results
            ([] when the response has no results, None on error)
        """
        # refresh the access token every 500 requests
        self.count += 1
        if self.count % 500 == 0:
            self.update_refresh_token()
        with open(audio_filepath, "rb") as f:
            audio_b64 = b64encode(f.read()).decode()
        # get the URL
        if version in self.url:
            URL = self.url[version]
        else:
            raise ValueError(f"Invalid version: {version}")
        # audio file extension
        audio_path_extension = audio_filepath.split(".")[-1]
        # deepcopy so per-request tweaks don't mutate the shared defaults
        if version == "v2":
            config_dict = deepcopy(self.config_dict["v2"])
            config_dict["content"] = audio_b64
        elif version == "v1":
            config_dict = deepcopy(self.config_dict["v1"])
            config_dict["audio"]["content"] = audio_b64
        elif version == "v1p1beta1":
            config_dict = deepcopy(self.config_dict["v1"])
            config_dict["audio"]["content"] = audio_b64
            aac_config_dict = {"aac": {"encoding": "MP3", "sampleRateHertz": 16000}}
            if audio_path_extension in aac_config_dict:
                config_dict["config"].update(aac_config_dict[audio_path_extension])
        elif version == "chirp":
            config_dict = deepcopy(self.config_dict["chirp"])
            config_dict["content"] = audio_b64
            if language is not None:
                config_dict["config"]["languageCodes"] = [language]
        elif version == "chirp_telephony":
            config_dict = deepcopy(self.config_dict["chirp"])
            config_dict["content"] = audio_b64
            config_dict["config"]["model"] = "chirp_telephony"
            if language is not None:
                config_dict["config"]["languageCodes"] = [language]
        response = requests.post(
            URL,
            headers=self.headers,
            data=json.dumps(config_dict),
            verify=True,
        )
        if response.status_code == 200:
            response = response.json()
            if "results" in response:
                return {audio_path_extension: response["results"]}
            else:
                return {audio_path_extension: []}
        else:
            logging.error(f"Error: {response.text}")
            print(f"Error: {response.text}")
            return {audio_path_extension: None}
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process a single audio file using the Google Speech-to-Text API for all models."
    )
    parser.add_argument(
        "--audio_filepath", type=str, required=True, help="Path to the audio file"
    )
    parser.add_argument(
        "--output_json_path",
        type=str,
        default=None,
        help="Path to save the output JSON response (optional, saves in current directory if not given)",
    )
    args = parser.parse_args()
    secret_json_filepath = "secret_key.json"
    api = SpeechAPI(secret_json_filepath=secret_json_filepath)
    audio_filepath = args.audio_filepath
    output_json_path = args.output_json_path
    # VERSIONS = ["v2", "v1", "v1p1beta1", "chirp", "chirp_telephony"]
    VERSIONS = ["chirp"]
    language = "en-US"  # you can make the language configurable via argparse as well
    all_transcriptions = {}
    print(f"Processing audio file: {audio_filepath}")
    for version in VERSIONS:
        print(f"STT version: {version}")
        trans = api.get_transcription(
            audio_filepath=audio_filepath, version=version, language=language
        )
        all_transcriptions[version] = trans
        print(f"Transcription ({version}): {trans}")
        print("-" * 50)
    # prepare the output filename
    audio_filename_base = os.path.basename(audio_filepath).split(".")[0]
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_filename = f"{audio_filename_base}_{timestamp}_all_models.json"
    if output_json_path:
        # ensure the output directory exists
        output_dir = os.path.dirname(os.path.abspath(output_json_path))
        os.makedirs(output_dir, exist_ok=True)
        final_output_path = output_json_path
    else:
        final_output_path = output_filename
    with open(final_output_path, "w") as f:
        json.dump(all_transcriptions, f, indent=4)  # indent for pretty JSON
    print(f"All transcriptions saved to: {final_output_path}")
+++++++++++++
Please try it with audio files of any codec where digits are spoken.
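For anyone reproducing this, here is a minimal sketch (it assumes the SpeechAPI class above is in scope, secret_key.json is in the working directory, and sample.flac is a placeholder path) that calls the chirp model five times on the same file and prints the distinct top transcripts, which makes the variation easy to see:

api = SpeechAPI(secret_json_filepath="secret_key.json")

def top_transcript(results):
    # join the top alternative of each result segment into one string
    parts = []
    for res in results:
        alts = res.get("alternatives", [])
        if alts:
            parts.append(alts[0].get("transcript", ""))
    return " ".join(parts)

distinct = set()
for _ in range(5):
    out = api.get_transcription("sample.flac", version="chirp", language="en-US")
    results = out.get("flac") or []  # key is the file extension
    distinct.add(top_transcript(results))

print(f"{len(distinct)} distinct transcript(s) across 5 runs:")
for t in sorted(distinct):
    print(repr(t))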
Hi @mdiqbal,
Welcome to the Google Cloud Community!
It looks like you are encountering inconsistent transcriptions from the Google Cloud Speech-to-Text Chirp model: calling it multiple times with the exact same audio file produces different results, when you expect the same audio to yield the same transcription every time.
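As a quick sanity check, it can help to normalize the transcripts from repeated runs before comparing them, to see whether the outputs differ in actual wording or only in formatting such as casing and punctuation. A minimal sketch (the example strings are purely illustrative):

import re

def normalize(text: str) -> str:
    # lowercase, drop punctuation, and collapse whitespace so purely
    # cosmetic differences compare as equal
    text = re.sub(r"[^\w\s]", "", text.lower())
    return re.sub(r"\s+", " ", text).strip()

# hypothetical outputs from two runs of the same audio file
runs = ["Call 555 1234.", "call 555 1234"]
print(len({normalize(t) for t in runs}))  # prints 1 -> only formatting differs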
Here are some potential approaches that might help with your use case:
Was this helpful? If so, please accept this answer as “Solution”. If you need additional assistance, reply here within 2 business days and I’ll be happy to help.