
Google CHIRP model returning different transcriptions for each call

Hi,

Below is the script I use to call the chirp model to get transcriptions. Each time I call the transcription API with the same audio file, I get different results (typically 2 to 3 distinct responses).

NOTE: I am comparing the API output against the words actually spoken in the audio files.

import argparse
import copy
import json
import logging
import os
import sys
from base64 import b64encode
from datetime import datetime
from typing import Dict, Union

import requests
from google.auth.exceptions import DefaultCredentialsError
from google.auth.transport.requests import Request
from google.oauth2 import service_account

log_dir = "../logs"
os.makedirs(log_dir, exist_ok=True)  # exist_ok makes a separate existence check unnecessary

# Set up logging
log_file = f"{log_dir}/log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(filename=log_file, level=logging.INFO)


class SpeechAPI:
    """
    Represents a client for the Google Cloud Speech-to-Text API.
    """

    def get_credentials(self):
        credentials = service_account.Credentials.from_service_account_file(
            self._secret_json_filepath,
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )
        return credentials

    def update_refresh_token(self):
        """Refresh the access token and update the Authorization header."""
        self.credentials.refresh(Request())
        self.token = self.credentials.token
        self.headers["Authorization"] = f"Bearer {self.token}"

    def __init__(self, secret_json_filepath: str) -> None:
        self._secret_json_filepath = secret_json_filepath
        # loading secret_json_filepath
        with open(secret_json_filepath, "r") as f:
            self._secret_json = json.load(f)

        # count to track refresh token
        self.count = 0

        # load the credentials from json key file
        try:
            self.credentials = self.get_credentials()
            # Refresh the credentials to obtain a new access token
            self.credentials.refresh(Request())

            # Access token
            self.token = self.credentials.token
        except DefaultCredentialsError as e:
            logging.error(f"Error: {e}")
            sys.exit(1)

        # headers to call API
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.token}",
        }

        self.project_id = self._secret_json["project_id"]

        self.region = "us-central1"
        # self.region = "asia-southeast1"
        self.region_v2 = "eu"  # for telephony and en-US, en-IN

        self.url = {
            "v2": f"https://{self.region_v2}-speech.googleapis.com/v2/projects/{self.project_id}/locations/{self.region_v2}/recognizers/_:recognize",
            "v1": f"https://speech.googleapis.com/v1/speech:recognize",
            "v1p1beta1": f"https://speech.googleapis.com/v1p1beta1/speech:recognize",
            "chirp": f"https://{self.region}-speech.googleapis.com/v2/projects/{self.project_id}/locations/{self.region}/recognizers/_:recognize",
            "chirp_telephony": f"https://{self.region}-speech.googleapis.com/v2/projects/{self.project_id}/locations/{self.region}/recognizers/_:recognize",
        }

        # default request configs for each API version
        self.config_dict = {
            "v2": {
                "config": {
                    "model": "short",  # possibles: "long", "short", "telephony", "telephony_short"
                    # telephony doesn't support boosting/adaptation
                    # "model" is the parameter to tune; see the possibilities above
                    "languageCodes": [
                        "en-US",
                        "en-IN",
                        "en-GB",
                    ],
                    "features": {
                        "maxAlternatives": 5,
                        "enableWordTimeOffsets": True,
                        "enableWordConfidence": True,
                        "multi_channel_mode": "SEPARATE_RECOGNITION_PER_CHANNEL",  # to get separate recognition per channel
                    },
                    # "adaptation": {
                    #     "phraseSets": [
                    #         {
                    #             "inlinePhraseSet": {
                    #                 "phrases": [
                    #                     {
                    #                         "value": "$OOV_CLASS_DIGIT_SEQUENCE",
                    #                         "boost": 20,
                    #                     },
                    #                 ],
                    #             },
                    #         },
                    #     ],
                    # },
                    "transcriptNormalization": {},
                    "autoDecodingConfig": {},
                },
                "content": "",
            },
            "v1": {
                "config": {
                    "languageCode": "en-US",
                    "alternativeLanguageCodes": [
                        "en-IN",
                        "en-GB",
                    ],
                    "speechContexts": [
                        {"phrases": "$OOV_CLASS_DIGIT_SEQUENCE", "boost": 100}
                    ],
                    "enableWordConfidence": True,
                    "enableWordTimeOffsets": True,
                    "useEnhanced": True,
                    "maxAlternatives": 5,
                    # metadata is deprecated and has no effect
                    "metadata": {
                        "microphoneDistance": "NEARFIELD",
                        "originalMediaType": "VIDEO",
                        "interactionType": "VOICE_COMMAND",
                        "recordingDeviceType": "SMARTPHONE",
                    },
                    "audioChannelCount": 2,
                    "enableSeparateRecognitionPerChannel": True,
                },
                "audio": {"content": ""},
            },
            "chirp": {
                "config": {
                    "model": "chirp",
                    "languageCodes": [
                        "en-US",
                    ],
                    # "autoDecodingConfig": {},
                    "explicitDecodingConfig":{
                        "encoding": "FLAC",
                        "sample_rate_hertz": 48000,
                        "audio_channel_count": 2,
                    },
                    "features": {
                        "maxAlternatives": 5,
                        "enableWordTimeOffsets": True,
                        # "enableWordConfidence": True, # not supported by "chirp" model
                        "multi_channel_mode": "SEPARATE_RECOGNITION_PER_CHANNEL",
                    },
                    # adaptation is not supported by the "chirp" model
                },
                "content": "",
            },
        }

    def update_config_dict(self, config_dict: Dict, version: str = "v2") -> None:
        """
        Args:
            config_dict (Dict): Configuration dictionary
            version (str, optional): Which API version's config to update. Defaults to "v2".
            NOTE: don't pass the content, just pass the ({"config": {}}) dictionary
        """
        self.config_dict[version]["config"] = config_dict["config"]

    def get_transcription(
        self, audio_filepath: str, version: str = "v2", language: str = None
    ) -> Dict[str, Union[str, Dict[str, str]]]:
        """
        Args:
            audio_filepath (str): Path to the audio file
            version (str, optional): The version of the API to use. Defaults to "v2".
                all possible versions: v2, v1, v1p1beta1, chirp, chirp_telephony
            language (str, optional): Overrides the default language codes if given.
        Returns:
            Dict: Maps the audio file extension to the recognition results
                (a list of results, an empty list if none, or None on error).
        """
        # refresh the token periodically to avoid expiry mid-run
        self.count += 1
        if self.count % 500 == 0:
            self.update_refresh_token()

        with open(audio_filepath, "rb") as f:
            audio_b64 = b64encode(f.read()).decode()

        # get the URL
        if version in self.url:
            URL = self.url[version]
        else:
            raise ValueError(f"Invalid version: {version}")

        # get the audio file extension (used as the key in the returned dict)
        audio_path_extension = os.path.splitext(audio_filepath)[1].lstrip(".")

        if version == "v2":
            config_dict = self.config_dict["v2"].copy()
            config_dict["content"] = audio_b64

        elif version == "v1":
            config_dict = self.config_dict["v1"].copy()
            config_dict["audio"]["content"] = audio_b64

        elif version == "v1p1beta1":
            config_dict = self.config_dict["v1"].copy()
            config_dict["audio"]["content"] = audio_b64

            aac_config_dict = {"aac": {"encoding": "MP3", "sampleRateHertz": 16000}}
            if audio_path_extension in aac_config_dict:
                config_dict["config"].update(aac_config_dict[audio_path_extension])

        elif version == "chirp":
            config_dict = self.config_dict["chirp"].copy()
            config_dict["content"] = audio_b64
            if language is not None:
                config_dict["config"]["languageCodes"] = [language]

        elif version == "chirp_telephony":
            config_dict = self.config_dict["chirp"].copy()
            config_dict["content"] = audio_b64
            config_dict["config"]["model"] = "chirp_telephony"
            if language is not None:
                config_dict["config"]["languageCodes"] = [language]

        response = requests.post(
            URL,
            headers=self.headers,
            json=config_dict,  # requests serializes the dict to JSON
        )

        if response.status_code == 200:
            response = response.json()

            if "results" in response:
                return {audio_path_extension: response["results"]}
            else:
                return {audio_path_extension: []}
        else:
            logging.error(f"Error: {response.text}")
            print(f"Error: {response.text}")
            return {audio_path_extension: None}


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a single audio file using Google Speech-to-Text API for all models.")
    parser.add_argument(
        "--audio_filepath", type=str, required=True, help="Path to the audio file"
    )
    parser.add_argument(
        "--output_json_path",
        type=str,
        default=None,
        help="Path to save the output JSON response (optional, saves in current directory if not given)",
    )
    args = parser.parse_args()

    secret_json_filepath = "secret_key.json"
    api = SpeechAPI(secret_json_filepath=secret_json_filepath)

    audio_filepath = args.audio_filepath
    output_json_path = args.output_json_path

    # VERSIONS = ["v2", "v1", "v1p1beta1", "chirp", "chirp_telephony"]
    VERSIONS = ["chirp"]
    language = "en-US" # You can make language configurable via argparse as well.
    all_transcriptions = {}

    print(f"Processing audio file: {audio_filepath}")

    for version in VERSIONS:
        print(f"STT version: {version}")
        trans = api.get_transcription(
            audio_filepath=audio_filepath, version=version, language=language
        )
        all_transcriptions[version] = trans
        print(f"Transcription ({version}): {trans}")
        print("-" * 50)

    # Prepare output filename
    audio_filename_base = os.path.splitext(os.path.basename(audio_filepath))[0]
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_filename = f"{audio_filename_base}_{timestamp}_all_models.json"

    if output_json_path:
        # Ensure the output directory exists
        output_dir = os.path.dirname(os.path.abspath(output_json_path))
        os.makedirs(output_dir, exist_ok=True)
        final_output_path = output_json_path
    else:
        final_output_path = output_filename

    with open(final_output_path, "w") as f:
        json.dump(all_transcriptions, f, indent=4)  # indent for pretty JSON

    print(f"All transcriptions saved to: {final_output_path}")

 

 

 

 

+++++++++++++
Please try it with audio files of any codec in which digits are spoken.
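For reference, here is a minimal sketch of how I observe the differing outputs (assuming the script above is saved as speech_api.py; the module name and sample audio path are placeholders):

from speech_api import SpeechAPI  # hypothetical module name for the script above

api = SpeechAPI(secret_json_filepath="secret_key.json")

# Call the API with the same audio file several times and collect the
# distinct top transcripts; more than one entry demonstrates the issue.
seen = set()
for _ in range(5):
    result = api.get_transcription("sample.flac", version="chirp", language="en-US")
    for res in result.get("flac") or []:
        alternatives = res.get("alternatives", [])
        if alternatives:
            seen.add(alternatives[0].get("transcript", ""))

print(f"{len(seen)} distinct top transcripts across 5 identical calls:")
for transcript in sorted(seen):
    print(repr(transcript))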


Hi @mdiqbal,

Welcome to the Google Cloud Community!

It looks like you are getting inconsistent transcriptions from the Google Cloud Speech-to-Text CHIRP model when you call it multiple times with the exact same audio file. You expect the same audio to produce the same transcription every time, but that is not happening.

Here are some approaches that might help with your use case:

  • Improved Logging: Capture more detailed information around your API calls: log the exact request payload (the JSON you send to the API), the HTTP status code and response headers, and the precise time of each call. This detail can help you spot patterns related to time of day or server load (see the first sketch after this list).
  • Retry Mechanism with Exponential Backoff: Implement a robust retry mechanism. If you get a non-200 status code or an unexpected result, retry the API call with an increasing delay (e.g., 1 second, 2 seconds, 4 seconds, and so on). This can mitigate transient network issues or temporary server overload; limit the number of retries to prevent infinite loops (see the second sketch after this list).
  • Check for Updates to the Google Cloud SDK: Ensure you are using the latest version of the Google Cloud SDK or the google-cloud-speech Python library. Sometimes, updates contain bug fixes that resolve unexpected behavior.
  • Region Consideration: Experiment with different region settings. Although you specified "us-central1", it might be worth trying another region such as "us-east1" in case there are regional differences in CHIRP model behavior.
  • Thorough Token Refresh: Review your token refresh logic to ensure it handles all potential expiration scenarios. Consider refreshing the token more frequently (for example, every 100 API calls instead of every 500), and make sure the refresh happens before the request is constructed.
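For the logging suggestion, a minimal sketch might look like the following (post_with_logging is an illustrative name; it wraps the same requests.post call used in your script and omits the base64 audio from the log to keep it readable):

import json
import logging
import time

import requests


def post_with_logging(url, headers, payload):
    # Log the request time, payload (minus audio content), status, and headers.
    logging.info("Request time: %s", time.strftime("%Y-%m-%d %H:%M:%S"))
    loggable = {k: v for k, v in payload.items() if k != "content"}
    logging.info("Request payload: %s", json.dumps(loggable))
    response = requests.post(url, headers=headers, json=payload)
    logging.info("HTTP status: %s", response.status_code)
    logging.info("Response headers: %s", dict(response.headers))
    return response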

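For the retry suggestion, a simple exponential backoff wrapper (again an illustrative sketch, not production code) could look like this:

import logging
import time

import requests


def post_with_backoff(url, headers, payload, max_retries=5):
    # Retry with exponentially increasing delays: 1s, 2s, 4s, ...
    # Capping max_retries prevents infinite loops on persistent failures.
    delay = 1
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(url, headers=headers, json=payload)
            if response.status_code == 200:
                return response
            logging.warning("Attempt %d failed with status %d", attempt, response.status_code)
        except requests.RequestException as exc:
            logging.warning("Attempt %d raised %s", attempt, exc)
        time.sleep(delay)
        delay *= 2
    raise RuntimeError(f"All {max_retries} attempts failed for {url}")
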
Was this helpful? If so, please accept this answer as “Solution”. If you need additional assistance, reply here within 2 business days and I’ll be happy to help.