How to use the stream of tts in golang? i want to ... - Page 2

zhangchengcheng · 02-22-2025 12:26 AM

I want to use the TTS API to send text and receive audio simultaneously, and the streaming method seems like a good way to do that. However, I found that I need to call CloseSend() every time: only after the text stream is finished and I call CloseSend() can I receive the audio stream. If I don't call CloseSend(), I get an error.

So, actually, it cannot send and receive simultaneously? It seems I can only send some text into the stream, call CloseSend(), and then receive the audio stream. Can someone tell me the right way to use the TTS API in Golang? Thanks a lot.

here is my log:

Receiving streaming responses...
2025/02/22 16:23:12 call close send
2025/02/22 16:23:12 Sent all requests.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.00800775s
2025/02/22 16:23:13 Received audio data: 1865
2025/02/22 16:23:13 first audio spent: 1.1212305s
2025/02/22 16:23:13 Received audio data: 2113
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.211656625s
2025/02/22 16:23:13 Received audio data: 2139
Audio data received.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.256695417s
2025/02/22 16:23:13 Received audio data: 2139
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.370593125s
2025/02/22 16:23:13 Received audio data: 2193
2025/02/22 16:23:13 first audio spent: 1.419397459s
2025/02/22 16:23:13 Received audio data: 2143
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.511148917s
2025/02/22 16:23:13 Received audio data: 2179
Audio data received.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.601768667s
2025/02/22 16:23:13 Received audio data: 2191
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.61639975s
2025/02/22 16:23:13 Received audio data: 1188
Streaming completed successfully.
2025/02/22 16:23:13 save audio data to file: google-20250222162312-output.pcm
2025/02/22 16:23:13 total time: 1.664321167s

Here is my Golang code (I removed the private_key):

package main

import (
    "context"
    "fmt"
    "io"
    "log"
    "os"
    "time"

    texttospeech "cloud.google.com/go/texttospeech/apiv1"
    "cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
    "google.golang.org/api/option"
)

// main runs the streaming text-to-speech demo once. If GoogleTts reports an
// error, it is logged and the process exits with a non-zero status.
func main() {
	err := GoogleTts()
	if err == nil {
		return
	}
	log.Fatalf("GoogleTts err: %v", err)
}

// GoogleTts performs one StreamingSynthesize call against the Cloud
// Text-to-Speech API. A background goroutine sends the streaming config
// followed by three text fragments and then closes the send side, while the
// calling goroutine receives audio chunks and appends them to a timestamped
// output file in the working directory.
//
// Every failure (client creation, stream setup, send, receive, file write) is
// returned as a wrapped error instead of terminating the process, so the
// deferred client.Close always runs and the caller decides how to react.
func GoogleTts() error {
	start := time.Now()

	// Cancelling on return tears the stream down, which unblocks the sender
	// goroutine if this function exits early on a receive or write error.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// NOTE(review): service-account credentials are embedded in source here.
	// Consider loading them from a file or the environment instead of
	// committing them — confirm how this program is deployed.
	credentialsJSON := `
{
  "type": "service_account",
  "project_id": "teamsun-workspace",
  "private_key_id": "cc43756f5a0e46c30f34e53f8ed1343a2a92a34a",
  "private_key": "xxxxxxxx",
  "client_email": "teamsun-tts-test@teamsun-workspace.iam.gserviceaccount.com",
  "client_id": "105978219795698433242",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/teamsun-tts-test%40teamsun-workspace.iam.gserviceaccount.com",
  "universe_domain": "googleapis.com"
}
`

	client, err := texttospeech.NewClient(ctx, option.WithCredentialsJSON([]byte(credentialsJSON)))
	if err != nil {
		return fmt.Errorf("creating text-to-speech client: %w", err)
	}
	defer client.Close()

	// Open a streaming connection.
	stream, err := client.StreamingSynthesize(ctx)
	if err != nil {
		return fmt.Errorf("starting streaming synthesis: %w", err)
	}

	// textInput wraps one text fragment in a streaming input request.
	textInput := func(text string) *texttospeechpb.StreamingSynthesizeRequest {
		return &texttospeechpb.StreamingSynthesizeRequest{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_Input{
				Input: &texttospeechpb.StreamingSynthesisInput{
					InputSource: &texttospeechpb.StreamingSynthesisInput_Text{
						Text: text,
					},
				},
			},
		}
	}

	// The config request is sent first, followed by the text fragments in order.
	requests := []*texttospeechpb.StreamingSynthesizeRequest{
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_StreamingConfig{
				StreamingConfig: &texttospeechpb.StreamingSynthesizeConfig{
					Voice: &texttospeechpb.VoiceSelectionParams{
						LanguageCode: "en-US",
						Name:         "en-US-Chirp-HD-F",
						SsmlGender:   texttospeechpb.SsmlVoiceGender_FEMALE,
					},
					StreamingAudioConfig: &texttospeechpb.StreamingAudioConfig{
						AudioEncoding:   texttospeechpb.AudioEncoding_OGG_OPUS,
						SampleRateHertz: 8000,
					},
				},
			},
		},
		textInput("hello, "),
		textInput("good to see you, "),
		textInput("and welcome to the world of AI."),
	}

	// Send in a separate goroutine so receiving can proceed concurrently.
	// The channel is buffered so the sender never blocks on reporting its
	// result, even when the receiver has already returned with an error.
	sendDone := make(chan error, 1)
	go func() {
		sendDone <- func() error {
			for _, req := range requests {
				if err := stream.Send(req); err != nil {
					return fmt.Errorf("sending request: %w", err)
				}
			}

			// Mark the end of the request stream; the receive loop below keeps
			// reading until the server finishes.
			log.Printf("call close send")
			if err := stream.CloseSend(); err != nil {
				return fmt.Errorf("closing send stream: %w", err)
			}

			log.Printf("Sent all requests.")
			return nil
		}()
	}()

	// Receive and handle streaming responses.
	fmt.Println("Receiving streaming responses...")
	filename := "google-" + time.Now().Format("20060102150405") + "-output.pcm"
	firstAudioLogged := false
	for {
		resp, err := stream.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			return fmt.Errorf("receiving stream: %w", err)
		}

		if len(resp.AudioContent) == 0 {
			continue
		}

		// Report latency only for the first audio chunk; logging it for every
		// chunk made the "first audio" figure meaningless.
		if !firstAudioLogged {
			log.Printf("first audio spent: %v", time.Since(start))
			firstAudioLogged = true
		}

		fmt.Println("Audio data received.")
		log.Printf("Received audio data: %v", len(resp.AudioContent))
		if err := appendToFile(filename, resp.AudioContent); err != nil {
			return fmt.Errorf("writing audio data to %q: %w", filename, err)
		}
	}

	// The stream has ended, so the sender has finished or will fail promptly;
	// surface any send-side error that occurred.
	if err := <-sendDone; err != nil {
		return err
	}

	fmt.Println("Streaming completed successfully.")
	log.Printf("save audio data to file: %v", filename)
	log.Printf("total time: %v", time.Since(start))

	return nil
}

// appendToFile appends data to the file at filename, creating the file with
// mode 0644 if it does not exist yet.
//
// It returns the first error encountered while opening, writing, or closing
// the file. The close error is checked because a failed write may only be
// reported when the file is closed.
func appendToFile(filename string, data []byte) error {
	f, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}

	if _, err := f.Write(data); err != nil {
		// The write error is the one worth reporting; the close here is only
		// to release the descriptor.
		_ = f.Close()
		return err
	}
	return f.Close()
}

How to use the stream of tts in golang? i want to send text and receive audio simultaneously.