The Gemini Live API is not working consistently. At some point in the session the audio input is sent, but there is no response from the API: no error message, just a timeout.
There is no specific audio input that reproduces the issue.
The scenario is as follows: I have a Unity game that connects to a Python server, and the logic works like this:
Unity
Unity creates a session ID if it doesn't have one.
It captures audio while the user is actively speaking, splits it into chunks, and when the user stops it sends a sentinel end-of-speech message (a single 0x00 byte; see SendEndOfSpeechSignal in the code below) to signal that the turn is over.
At that point it waits for the API's response and blocks Unity from sending more audio.
Python
The server receives the chunks from Unity; when the end-of-speech sentinel arrives, it forwards the buffered audio to the Gemini Live API and waits for the response.
When the response arrives, it releases Unity to send more audio (a minimal sketch of this relay loop is below).
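
For reference, a minimal sketch of that relay loop, assuming a FastAPI/starlette WebSocket and the google-genai SDK; relay_turn is a hypothetical helper and the sample rate simply matches the client's capture rate, so this is not the exact server code:

from google.genai import types

END_OF_SPEECH = b"\x00"  # sentinel byte the Unity client sends after a turn

async def relay_turn(ws, session):
    # `session` is the object yielded by client.aio.live.connect(...) in the
    # google-genai SDK; `ws` is the Unity-facing WebSocket.
    # Buffer PCM chunks from Unity until the end-of-speech sentinel arrives.
    chunks = []
    while True:
        data = await ws.receive_bytes()
        if data == END_OF_SPEECH:
            break
        chunks.append(data)

    # Forward the whole turn to the Gemini Live session as realtime input.
    await session.send_realtime_input(
        audio=types.Blob(data=b"".join(chunks), mime_type="audio/pcm;rate=24000")
    )

    # Stream Gemini's audio reply back to Unity until the turn completes.
    async for message in session.receive():
        if message.data:
            await ws.send_bytes(message.data)
        if message.server_content and message.server_content.turn_complete:
            break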
The error occurs when
Unity sends the audio and waits for the response. At this point the server forwards the audio to the API and then stalls while waiting for the response, returning no error and no audio. This happens at random: the API responds correctly until this "crash" moment occurs. In other words, there is no specific trigger for the error; it happens at random points during the interaction.
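
One way to make the stall observable on the server is to bound each wait on the receive loop, so a hang surfaces as an empty frame that the Unity HandleTimeout() below can count. A sketch, assuming the same google-genai session as the sketch above; RESPONSE_TIMEOUT is an illustrative value:

import asyncio

RESPONSE_TIMEOUT = 15.0  # seconds, illustrative; tune to the expected latency

async def receive_with_watchdog(ws, session):
    # Pull messages one at a time so each individual wait can be bounded.
    stream = session.receive().__aiter__()
    while True:
        try:
            message = await asyncio.wait_for(stream.__anext__(), RESPONSE_TIMEOUT)
        except StopAsyncIteration:
            break  # Gemini closed the turn normally
        except asyncio.TimeoutError:
            # The silent stall: no message, no error. Tell Unity via an empty
            # frame, which its OnMessage handler counts as a timeout.
            await ws.send_bytes(b"")
            break
        if message.data:
            await ws.send_bytes(message.data)
        if message.server_content and message.server_content.turn_complete:
            break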
What has already been done
Audio quality validation via cloud STT
Token counting
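
The token counting was a pre-flight sanity check along these lines (a sketch assuming the google-genai SDK and WAV-encoded audio; the model name is illustrative):

from google import genai
from google.genai import types

client = genai.Client()  # reads the API key from the environment

def count_audio_tokens(wav_bytes: bytes) -> int:
    # Check that a turn's audio stays well under the context limit.
    response = client.models.count_tokens(
        model="gemini-2.0-flash",
        contents=[types.Part.from_bytes(data=wav_bytes, mime_type="audio/wav")],
    )
    return response.total_tokens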
Unity code below
using UnityEngine;
using System;
using System.Collections.Concurrent;
using NativeWebSocket;
using TMPro;

[RequireComponent(typeof(AudioSource))]
public class VoiceChatClient_Live : MonoBehaviour
{
    private enum ClientState { Idle, Connecting, WaitingForSpeech, StreamingSpeech, WaitingForAI }

    [Header("Connection & AI")]
    public string serverUrl = "ws://localhost:3000/ws/voicechat";
    public string conversationId = "";
    [TextArea(3, 10)] public string aiPersonality = "You are a helpful assistant.";
    public string ttsVoiceName = "kore";

    [Header("VAD & Turn-Taking")]
    [Range(0.001f, 0.2f)] public float vadThreshold = 0.01f;
    public float userSilenceTimeout = 1.0f;
    public float aiSilenceTimeout = 1.2f;

    [Header("UI & Debug")]
    public TMP_Text statusText;

    private WebSocket websocket;
    private AudioSource audioSource;
    private AudioClip recordedClip;
    private volatile ClientState currentState = ClientState.Idle;

    private const int INPUT_SAMPLE_RATE = 24000;
    private const int OUTPUT_SAMPLE_RATE = 24000;
    private const int RECORDING_BUFFER_SECONDS = 1;

    // Audio from the server, drained by OnAudioRead on the audio thread.
    private readonly ConcurrentQueue<float> receivedAudioQueue = new ConcurrentQueue<float>();
    // Status messages queued from socket callbacks, shown on the main thread in Update().
    private readonly ConcurrentQueue<string> statusMessageQueue = new ConcurrentQueue<string>();

    private int lastSamplePosition = 0;
    private float userSilenceTimer = 0f;
    private float aiResponseSilenceTimer = 0f;
    private float aiSilenceThreshold = 0.0001f;
    private int sampleSize = 512;
    private float[] samples;
    private volatile bool isAiCurrentlySpeaking = false;
    private int consecutiveTimeouts = 0;
    void Start()
    {
        Debug.Log("Starting VoiceChatClient...");
        audioSource = GetComponent<AudioSource>();
        // Streaming playback clip: OnAudioRead pulls samples from receivedAudioQueue.
        AudioClip playbackClip = AudioClip.Create("PlaybackClip", OUTPUT_SAMPLE_RATE, 1, OUTPUT_SAMPLE_RATE, true, OnAudioRead);
        audioSource.clip = playbackClip;
        audioSource.loop = true;
        audioSource.Play();
        ConnectWebSocket();
        samples = new float[sampleSize];
    }
    async void ConnectWebSocket()
    {
        if (string.IsNullOrEmpty(conversationId))
            conversationId = Guid.NewGuid().ToString();

        string fullUrl = $"{serverUrl}/{conversationId}?voiceName={Uri.EscapeDataString(ttsVoiceName)}&personality={Uri.EscapeDataString(aiPersonality)}";
        UpdateState(ClientState.Connecting, "Connecting to server...");
        websocket = new WebSocket(fullUrl);

        websocket.OnOpen += () => {
            Debug.Log("WebSocket connected");
            consecutiveTimeouts = 0;
            StartListening();
        };

        websocket.OnError += (e) => {
            Debug.LogError("WebSocket error: " + e);
            UpdateState(ClientState.Idle, "Connection error.");
        };

        websocket.OnClose += (e) => {
            Debug.LogWarning("WebSocket disconnected: " + e);
            StopListening();
            UpdateState(ClientState.Idle, "Disconnected. Trying to reconnect...");
            Invoke(nameof(ConnectWebSocket), 2f);
        };

        websocket.OnMessage += (bytes) => {
            // An empty frame is the server's "no response" signal (see HandleTimeout).
            if (bytes == null || bytes.Length == 0)
            {
                HandleTimeout();
                return;
            }
            aiResponseSilenceTimer = 0f;
            consecutiveTimeouts = 0;
            Debug.Log($"AI audio received ({bytes.Length} bytes)");
            float[] received = Pcm16ToFloat(bytes);
            foreach (var sample in received)
                receivedAudioQueue.Enqueue(sample);
        };

        await websocket.Connect();
    }
    void HandleTimeout()
    {
        consecutiveTimeouts++;
        Debug.LogWarning($"Timeout {consecutiveTimeouts}/3 detected!");
        if (consecutiveTimeouts >= 3)
        {
            Debug.LogError("Too many consecutive timeouts. Forcing reconnection.");
            Reconnect();
        }
    }

    void Reconnect()
    {
        if (websocket != null)
        {
            websocket.Close();
            websocket = null;
        }
        StopListening();
        UpdateState(ClientState.Idle, "Reconnecting after failures...");
        Invoke(nameof(ConnectWebSocket), 1f);
    }
    void Update()
    {
        // Flush status messages queued from WebSocket callbacks onto the UI thread.
        while (statusMessageQueue.TryDequeue(out var msg))
        {
            if (statusText) statusText.text = msg;
            Debug.Log("[Status] " + msg);
        }
        websocket?.DispatchMessageQueue();
        HandleTurnTaking();
        ProcessMicrophoneInput();
    }
    private void HandleTurnTaking()
    {
        if (audioSource.isPlaying)
        {
            // Measure playback RMS to decide whether the AI is audibly speaking.
            audioSource.GetOutputData(samples, 0);
            float sum = 0f;
            foreach (var s in samples) sum += s * s;
            float rms = Mathf.Sqrt(sum / samples.Length);
            isAiCurrentlySpeaking = rms > aiSilenceThreshold;
        }
        else isAiCurrentlySpeaking = false;

        if (!isAiCurrentlySpeaking)
        {
            aiResponseSilenceTimer += Time.deltaTime;
            // After aiSilenceTimeout of playback silence, hand the turn back to the user.
            if (aiResponseSilenceTimer > aiSilenceTimeout && currentState == ClientState.WaitingForAI)
            {
                UpdateState(ClientState.WaitingForSpeech, "Your turn. Say something.");
                aiResponseSilenceTimer = 0f;
            }
        }
    }
    private void ProcessMicrophoneInput()
    {
        if (isAiCurrentlySpeaking || !(currentState == ClientState.WaitingForSpeech || currentState == ClientState.StreamingSpeech)) return;
        if (!Microphone.IsRecording(null)) return;

        int currentPos = Microphone.GetPosition(null);
        if (currentPos == lastSamplePosition) return;

        // Handle wrap-around of the circular microphone buffer.
        int sampleCount = (currentPos - lastSamplePosition + recordedClip.samples) % recordedClip.samples;
        if (sampleCount == 0) return;

        float[] sampleChunk = new float[sampleCount];
        recordedClip.GetData(sampleChunk, lastSamplePosition);

        float rms = CalculateRMS(sampleChunk);
        if (currentState == ClientState.WaitingForSpeech && rms > vadThreshold)
        {
            Debug.Log($"Speech started (RMS={rms:F4})");
            UpdateState(ClientState.StreamingSpeech, "Speaking...");
            userSilenceTimer = 0f;
            SendAudioChunk(sampleChunk);
        }
        else if (currentState == ClientState.StreamingSpeech)
        {
            SendAudioChunk(sampleChunk);
            if (rms < vadThreshold)
            {
                userSilenceTimer += (float)sampleCount / INPUT_SAMPLE_RATE;
                if (userSilenceTimer > userSilenceTimeout)
                {
                    Debug.Log("Silence detected. Sending end of speech.");
                    SendEndOfSpeechSignal();
                    UpdateState(ClientState.WaitingForAI, "Processing speech...");
                }
            }
            else userSilenceTimer = 0f;
        }
        lastSamplePosition = currentPos;
    }
    private void SendAudioChunk(float[] sampleChunk)
    {
        if (websocket != null && websocket.State == WebSocketState.Open && sampleChunk.Length > 0 && !isAiCurrentlySpeaking)
        {
            byte[] pcmBytes = FloatToPcm16(sampleChunk);
            websocket.Send(pcmBytes);
            Debug.Log($"Audio chunk sent ({pcmBytes.Length} bytes)");
        }
    }

    private void SendEndOfSpeechSignal()
    {
        if (websocket != null && websocket.State == WebSocketState.Open)
        {
            // A single 0x00 byte: the sentinel the server treats as end of turn.
            Debug.Log("Sending end-of-speech byte (0x00)");
            websocket.Send(new byte[] { 0 });
        }
    }
    private void StartListening()
    {
        if (Microphone.IsRecording(null)) return;
        if (websocket?.State != WebSocketState.Open) return;
        // Record into a short circular buffer; ProcessMicrophoneInput drains it each frame.
        recordedClip = Microphone.Start(null, true, RECORDING_BUFFER_SECONDS, INPUT_SAMPLE_RATE);
        lastSamplePosition = 0;
        userSilenceTimer = 0f;
        Debug.Log("Started listening to the user.");
        UpdateState(ClientState.WaitingForSpeech, "Connected. You can speak.");
    }

    private void StopListening()
    {
        if (Microphone.IsRecording(null)) Microphone.End(null);
    }

    private void UpdateState(ClientState newState, string statusMessage)
    {
        currentState = newState;
        statusMessageQueue.Enqueue(statusMessage);
        Debug.Log($"State: {newState} > {statusMessage}");
    }
    // Streaming playback callback: runs on the audio thread and pulls decoded
    // samples from the queue, padding with silence when the queue is empty.
    private void OnAudioRead(float[] data)
    {
        for (int i = 0; i < data.Length; i++)
            data[i] = receivedAudioQueue.TryDequeue(out var sample) ? sample : 0f;
    }

    private float CalculateRMS(float[] samples)
    {
        float sum = 0f;
        foreach (var s in samples) sum += s * s;
        return Mathf.Sqrt(sum / samples.Length);
    }
    private byte[] FloatToPcm16(float[] samples)
    {
        byte[] pcmBytes = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp before conversion to avoid short overflow on samples outside [-1, 1].
            short intSample = (short)(Mathf.Clamp(samples[i], -1f, 1f) * 32767);
            BitConverter.GetBytes(intSample).CopyTo(pcmBytes, i * 2);
        }
        return pcmBytes;
    }

    private float[] Pcm16ToFloat(byte[] pcmData)
    {
        float[] floatData = new float[pcmData.Length / 2];
        for (int i = 0; i < floatData.Length; i++)
        {
            short sample = BitConverter.ToInt16(pcmData, i * 2);
            floatData[i] = sample / 32768f;
        }
        return floatData;
    }
    private async void OnApplicationQuit()
    {
        if (websocket != null && websocket.State == WebSocketState.Open)
            await websocket.Close();
    }
}