I want to use the Text-to-Speech (TTS) API to send text and receive audio simultaneously. Streaming seems like a good way to do that, but I found that I have to call CloseSend() every time: only after the text stream has finished and I have called CloseSend() do I receive the audio stream. If I don't call CloseSend(), I get an error.
So, actually, can it not send and receive simultaneously? It seems I can only send some text on the stream, call CloseSend(), and then receive the audio stream. Can someone tell me the right way to use the TTS API in Go? Thanks a lot.
Here is my log:
Receiving streaming responses...
2025/02/22 16:23:12 call close send
2025/02/22 16:23:12 Sent all requests.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.00800775s
2025/02/22 16:23:13 Received audio data: 1865
2025/02/22 16:23:13 first audio spent: 1.1212305s
2025/02/22 16:23:13 Received audio data: 2113
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.211656625s
2025/02/22 16:23:13 Received audio data: 2139
Audio data received.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.256695417s
2025/02/22 16:23:13 Received audio data: 2139
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.370593125s
2025/02/22 16:23:13 Received audio data: 2193
2025/02/22 16:23:13 first audio spent: 1.419397459s
2025/02/22 16:23:13 Received audio data: 2143
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.511148917s
2025/02/22 16:23:13 Received audio data: 2179
Audio data received.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.601768667s
2025/02/22 16:23:13 Received audio data: 2191
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.61639975s
2025/02/22 16:23:13 Received audio data: 1188
Streaming completed successfully.
2025/02/22 16:23:13 save audio data to file: google-20250222162312-output.pcm
2025/02/22 16:23:13 total time: 1.664321167s
Here is my Go code (I removed the private_key):
package main
import (
"context"
"fmt"
"io"
"log"
"os"
"time"
texttospeech "cloud.google.com/go/texttospeech/apiv1"
"cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
"google.golang.org/api/option"
)
// main runs the streaming text-to-speech demo and, if it reports an error,
// logs that error and exits via log.Fatalf.
func main() {
	err := GoogleTts()
	if err == nil {
		return
	}
	log.Fatalf("GoogleTts err: %v", err)
}
// GoogleTts sends a fixed, three-part English sentence to the Google Cloud
// Text-to-Speech StreamingSynthesize API and appends every audio chunk it
// receives to a timestamped "google-<time>-output.pcm" file in the current
// working directory.
//
// Requests are written from a separate goroutine while this goroutine reads
// responses, so both directions of the stream are serviced at the same time.
// The sender calls CloseSend once all requests have been written to signal
// the end of the input.
//
// It returns nil after the server has closed the stream and the sender has
// finished, or an error describing the first step that failed. Failures are
// returned rather than passed to log.Fatal so that the deferred client.Close
// runs and the caller decides how to report them.
func GoogleTts() error {
	start := time.Now()

	// Cancelling ctx when this function returns aborts the stream, which in
	// turn unblocks the sender goroutine if we leave early.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// NOTE(review): service-account credentials are embedded in the source.
	// Consider loading them from a file or the environment instead.
	credentialsJSON := `
{
"type": "service_account",
"project_id": "teamsun-workspace",
"private_key_id": "cc43756f5a0e46c30f34e53f8ed1343a2a92a34a",
"private_key": "xxxxxxxx",
"client_email": "teamsun-tts-test@teamsun-workspace.iam.gserviceaccount.com",
"client_id": "105978219795698433242",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/teamsun-tts-test%40teamsun-workspace.iam.gserviceaccount.com",
"universe_domain": "googleapis.com"
}
`
	client, err := texttospeech.NewClient(ctx, option.WithCredentialsJSON([]byte(credentialsJSON)))
	if err != nil {
		return fmt.Errorf("creating text-to-speech client: %w", err)
	}
	defer client.Close()

	// Open a streaming connection.
	stream, err := client.StreamingSynthesize(ctx)
	if err != nil {
		return fmt.Errorf("starting streaming synthesis: %w", err)
	}

	// The first request carries the streaming configuration; the remaining
	// requests each carry one piece of input text.
	requests := []*texttospeechpb.StreamingSynthesizeRequest{
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_StreamingConfig{
				StreamingConfig: &texttospeechpb.StreamingSynthesizeConfig{
					Voice: &texttospeechpb.VoiceSelectionParams{
						LanguageCode: "en-US",
						Name:         "en-US-Chirp-HD-F",
						SsmlGender:   texttospeechpb.SsmlVoiceGender_FEMALE,
					},
					StreamingAudioConfig: &texttospeechpb.StreamingAudioConfig{
						AudioEncoding:   texttospeechpb.AudioEncoding_OGG_OPUS,
						SampleRateHertz: 8000,
					},
				},
			},
		},
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_Input{
				Input: &texttospeechpb.StreamingSynthesisInput{
					InputSource: &texttospeechpb.StreamingSynthesisInput_Text{
						Text: "hello, ",
					},
				},
			},
		},
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_Input{
				Input: &texttospeechpb.StreamingSynthesisInput{
					InputSource: &texttospeechpb.StreamingSynthesisInput_Text{
						Text: "good to see you, ",
					},
				},
			},
		},
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_Input{
				Input: &texttospeechpb.StreamingSynthesisInput{
					InputSource: &texttospeechpb.StreamingSynthesisInput_Text{
						Text: "and welcome to the world of AI.",
					},
				},
			},
		},
	}

	// send writes every request and then half-closes the stream.
	send := func() error {
		for _, req := range requests {
			err := stream.Send(req)
			if err == io.EOF {
				// The stream has already ended; its status is reported
				// by Recv, so there is nothing more to send or report here.
				return nil
			}
			if err != nil {
				return fmt.Errorf("sending request: %w", err)
			}
		}
		log.Printf("call close send")
		if err := stream.CloseSend(); err != nil {
			return fmt.Errorf("closing send stream: %w", err)
		}
		log.Printf("Sent all requests.")
		return nil
	}

	// Run the sender concurrently with the receive loop below. The channel is
	// buffered so the goroutine can always deliver its result and exit.
	sendDone := make(chan error, 1)
	go func() {
		sendDone <- send()
	}()

	// Receive and handle streaming responses.
	fmt.Println("Receiving streaming responses...")
	// NOTE(review): the stream is configured for OGG_OPUS, so the ".pcm"
	// suffix does not describe the data written; the name is kept unchanged
	// to preserve the existing output path.
	filename := "google-" + time.Now().Format("20060102150405") + "-output.pcm"

	receive := func() error {
		firstAudio := true
		for {
			resp, err := stream.Recv()
			if err == io.EOF {
				return nil
			}
			if err != nil {
				return fmt.Errorf("receiving stream: %w", err)
			}
			if len(resp.AudioContent) == 0 {
				continue
			}
			// Report time-to-first-audio once, not for every chunk.
			if firstAudio {
				firstAudio = false
				log.Printf("first audio spent: %v", time.Since(start))
			}
			fmt.Println("Audio data received.")
			log.Printf("Received audio data: %v", len(resp.AudioContent))
			if err := appendToFile(filename, resp.AudioContent); err != nil {
				return fmt.Errorf("writing audio data to %q: %w", filename, err)
			}
		}
	}

	if err := receive(); err != nil {
		// Abort the stream so a sender blocked in Send returns, then wait
		// for it so no goroutine outlives this call.
		cancel()
		<-sendDone
		return err
	}
	if err := <-sendDone; err != nil {
		return err
	}

	fmt.Println("Streaming completed successfully.")
	log.Printf("save audio data to file: %v", filename)
	log.Printf("total time: %v", time.Since(start))
	return nil
}
// appendToFile appends data to the file named filename, creating the file
// with mode 0644 if it does not exist.
//
// It returns the first error encountered while opening, writing or closing
// the file. The close error is reported because a failed close after a write
// can mean the data did not reach the file.
func appendToFile(filename string, data []byte) error {
	f, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	if _, err := f.Write(data); err != nil {
		// The write error is the one worth reporting; the close here only
		// releases the descriptor.
		_ = f.Close()
		return err
	}
	return f.Close()
}