Get hands-on experience with 20+ free Google Cloud products and $300 in free credit for new customers.

TTS Long form Synthesis not working

Summary of the Issue:

I am trying to integrate long-form synthesis into my app, and every request returns the following error:

Google Long-form API error: {
"error": {
"code": 400,
"message": "Can't write to GCS uri `gs://XXXYYY-audio-output/tts-output-1749443435083.wav`. Make sure to follow the steps at https://cloud.google.com/text-to-speech/docs/create-audio-text-long-audio-synthesis#before-you-begin. Invalid authentication from policy (go/gcs-rpc-sp): Rejected by creds_policy: Permission 'auth.creds.useNormalUserEUC' not granted to cloud-ml-tts-frontend-prod-regional-jobs@prod.google.com, because it satisfies none of the 1 rules granting that permission.; RpcSecurityPolicy http://rpcsp/p/1hEJNqELrPdL_GAiNvmvq3Bhjsd04r_gQIKuyIT27SQ ",
"status": "INVALID_ARGUMENT"
}
}

This is probably because the service agent required by the Text-to-Speech API is not being created automatically. (I don't have an option to create a TTS service agent directly.)

Troubleshooting Steps Taken:

 

 

  • Confirmed our application's service account has all necessary Storage roles, including Service Usage Consumer and Service Account Token Creator.

  • Attempted to force-create the service agent using gcloud beta services identity create --service=texttospeech.googleapis.com --project=long-form-synthesis. This command consistently fails with the internal error SU_INTERNAL_GENERATE_SERVICE_IDENTITY.

  • Completely disabled and re-enabled the Text-to-Speech API for the project. The problem persists.

  • To rule out any project-specific misconfiguration, we created a brand new, clean project (long-form-synthesis). This new project exhibits the exact same SU_INTERNAL_GENERATE_SERVICE_IDENTITY error and failure to create the service agent.

Below is the code:

import { type NextRequest, NextResponse } from "next/server"
import fs from "fs"
import path from "path"
import * as jose from "jose"

// Region used for the long-form synthesis call: it is interpolated into both the
// regional API host name and the `locations/...` segment of the request path,
// and is echoed back in the route's JSON response and error messages.
const REGION = "us-central1" // You can change this to your preferred region

/**
 * Creates the signed JWT assertion used for the service-account token exchange.
 *
 * The token is signed with RS256 using `credentials.private_key`, names
 * `credentials.client_email` as both issuer and subject, targets the Google
 * OAuth token endpoint as audience, requests the cloud-platform scope, and
 * expires 3600 seconds after it is issued.
 *
 * @param credentials Parsed credentials object; `private_key` and
 *   `client_email` are read from it.
 * @returns The compact serialized, signed JWT.
 */
async function generateGoogleJwt(credentials: any) {
  const issuedAt = Math.floor(Date.now() / 1000)
  // Turn literal backslash-n sequences in the key into real newlines before importing it.
  const pem = credentials.private_key.replace(/\\n/g, "\n")

  const claims = {
    iss: credentials.client_email,
    sub: credentials.client_email,
    aud: "https://oauth2.googleapis.com/token",
    iat: issuedAt,
    exp: issuedAt + 3600,
    scope: "https://www.googleapis.com/auth/cloud-platform",
  }

  const signingKey = await jose.importPKCS8(pem, "RS256")
  const signer = new jose.SignJWT(claims).setProtectedHeader({ alg: "RS256" })

  return signer.sign(signingKey)
}

/**
 * Exchanges a signed service-account JWT for an OAuth 2.0 access token.
 *
 * Builds the assertion with `generateGoogleJwt`, posts it to the Google token
 * endpoint using the JWT-bearer grant type, and returns the `access_token`
 * field of the JSON response.
 *
 * @param credentials Parsed credentials object, forwarded to `generateGoogleJwt`.
 * @returns The access token string.
 * @throws Error if the token endpoint answers with a non-2xx status, or if a
 *   2xx response does not carry a non-empty string `access_token`.
 */
async function getAccessToken(credentials: any): Promise<string> {
  const jwtAssertion = await generateGoogleJwt(credentials)

  const tokenResponse = await fetch("https://oauth2.googleapis.com/token", {
    method: "POST",
    headers: {
      "Content-Type": "application/x-www-form-urlencoded",
    },
    body: new URLSearchParams({
      grant_type: "urn:ietf:params:oauth:grant-type:jwt-bearer",
      assertion: jwtAssertion,
    }),
  })

  if (!tokenResponse.ok) {
    const errorText = await tokenResponse.text()
    throw new Error(`Failed to get access token: ${tokenResponse.status} ${errorText}`)
  }

  const tokenData: unknown = await tokenResponse.json()
  const accessToken =
    typeof tokenData === "object" && tokenData !== null
      ? (tokenData as { access_token?: unknown }).access_token
      : undefined

  // Fail here rather than letting callers send "Authorization: Bearer undefined"
  // and receive a confusing authentication error from a later API call.
  if (typeof accessToken !== "string" || accessToken === "") {
    throw new Error("Failed to get access token: token endpoint response did not contain an access_token")
  }

  return accessToken
}

/**
 * "Kickstart" helper: sends one tiny standard (non-long-form) synthesis request.
 *
 * Per the original author's note, the intent is that a successful standard call
 * prompts creation of the Text-to-Speech service agent; that effect is not
 * verified by this code.
 *
 * @param token OAuth access token sent as a Bearer credential.
 * @param projectId Project ID sent in the `x-goog-user-project` header.
 * @returns `true` when the response JSON has a truthy `audioContent`, otherwise `false`.
 * @throws Error when the HTTP response status is not OK.
 */
async function triggerStandardSynthesis(token: string, projectId: string) {
  console.log("Attempting to 'kickstart' the API with a standard synthesis call...")

  const kickstartResponse = await fetch("https://texttospeech.googleapis.com/v1/text:synthesize", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${token}`,
      "Content-Type": "application/json",
      // The project ID travels in a header for this call.
      "x-goog-user-project": projectId,
    },
    body: JSON.stringify({
      // Deliberately minimal input: only the success of the call matters.
      input: { text: "hello" },
      voice: { languageCode: "en-US", name: "en-US-Neural2-A" },
      audioConfig: { audioEncoding: "MP3" },
    }),
  })

  if (!kickstartResponse.ok) {
    console.error("Standard synthesis trigger failed:", await kickstartResponse.text())
    throw new Error(`Failed to kickstart API. Status: ${kickstartResponse.status}`)
  }

  // The audio itself is discarded; its presence is used only as the success signal.
  const payload = await kickstartResponse.json()
  if (!payload.audioContent) {
    return false
  }

  console.log(" SUCCESS: Standard API call successful. The service agent should now exist.")
  return true
}

/**
 * Starts a long-form synthesis operation against the regional v1beta1
 * `synthesizeLongAudio` endpoint and returns the resulting operation name.
 *
 * The audio is written to a timestamped object in the `XXXYYY-audio-output`
 * bucket; the object's extension is chosen from `audioEncoding`
 * (`MP3` -> mp3, `LINEAR16` -> wav, anything else -> ogg).
 *
 * @param text Text to synthesize.
 * @param voice Voice selection to send to the API. It is used as-is when it is
 *   an object with a non-empty string `languageCode`; otherwise the default
 *   voice (en-US / en-US-Standard-A / FEMALE) is used. Previously this argument
 *   was accepted but ignored.
 * @param token OAuth access token sent as a Bearer credential.
 * @param audioEncoding Audio encoding placed in `audioConfig`; defaults to "LINEAR16".
 * @param projectId Project ID used in the request path.
 * @returns The operation name (`name` field of the response).
 * @throws Error when the HTTP response is not OK (the message embeds the
 *   status, region and response body), or when a successful response carries
 *   no operation name.
 */
async function startLongFormSynthesis(
  text: string,
  voice: any,
  token: string,
  audioEncoding = "LINEAR16",
  projectId: string,
) {
  // Timestamp keeps each output object name unique.
  const timestamp = Date.now()
  const fileExtension = audioEncoding === "MP3" ? "mp3" : audioEncoding === "LINEAR16" ? "wav" : "ogg"

  // Use the specified bucket: gs://XXXYYY-audio-output
  const outputGcsUri = `gs://XXXYYY-audio-output/tts-output-${timestamp}.${fileExtension}`

  console.log(`Using output GCS URI: ${outputGcsUri}`)

  // Honor the caller's voice when it is usable; fall back to the default otherwise
  // so requests that omit `voice` behave exactly as before.
  const hasCallerVoice =
    typeof voice === "object" &&
    voice !== null &&
    typeof voice.languageCode === "string" &&
    voice.languageCode !== ""
  const selectedVoice = hasCallerVoice
    ? voice
    : {
        languageCode: "en-US",
        name: "en-US-Standard-A",
        ssmlGender: "FEMALE",
      }

  const requestBody = {
    input: { text },
    voice: selectedVoice,
    audioConfig: {
      audioEncoding: audioEncoding,
    },
    outputGcsUri: outputGcsUri,
  }

  // v1beta1 endpoint on the regional host, with the full project/location path.
  const url = `https://${REGION}-texttospeech.googleapis.com/v1beta1/projects/${projectId}/locations/${REGION}:synthesizeLongAudio`

  if (hasCallerVoice) {
    console.log(`Starting long-form synthesis with requested voice (${selectedVoice.name ?? selectedVoice.languageCode})`)
  } else {
    console.log("Starting long-form synthesis with default voice (en-US-Standard-A)")
  }
  console.log(`Using correct v1beta1 endpoint: ${url}`)

  const response = await fetch(url, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${token}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(requestBody),
  })

  if (!response.ok) {
    const errorText = await response.text()
    console.error("Google Long-form API error:", errorText)

    // Pretty-print the body when it is JSON; otherwise pass the raw text through.
    let errorDetails: string
    try {
      errorDetails = JSON.stringify(JSON.parse(errorText), null, 2)
    } catch {
      errorDetails = errorText
    }

    throw new Error(`Google Long-form API Error (${response.status}) from ${REGION}: ${errorDetails}`)
  }

  const responseData = await response.json()
  const operationName = responseData?.name

  // A success response without an operation name cannot be tracked by the caller,
  // so report it instead of returning undefined.
  if (typeof operationName !== "string" || operationName === "") {
    throw new Error(`Google Long-form API response from ${REGION} did not include an operation name`)
  }

  return operationName
}

/**
 * Route handler that starts a long-form text-to-speech synthesis operation.
 *
 * Expects a JSON body with `text` (required, non-blank string) and optional
 * `voice` and `audioEncoding` (defaults to "LINEAR16"). Credentials are read
 * from `long-form-credentials.json` in the process working directory.
 *
 * Responses:
 * - 400 when the body is not valid JSON or `text` is missing, blank, or not a string.
 * - 500 when the credentials file is missing, or when any later step throws
 *   (the thrown message is returned in `details`).
 * - 200 with `{ success, operationName, region, projectId, message }` on success.
 *
 * @param request Incoming request whose JSON body is read.
 */
export async function POST(request: NextRequest) {
  console.log("Long-form text-to-speech API route called")

  try {
    // A malformed body is a client error, so answer 400 rather than falling
    // through to the generic 500 handler below.
    let body: any
    try {
      body = await request.json()
    } catch {
      return NextResponse.json({ error: "Request body must be valid JSON" }, { status: 400 })
    }

    // `body ?? {}` keeps a JSON `null` body from throwing during destructuring.
    const { text, voice, audioEncoding = "LINEAR16" } = body ?? {}

    if (typeof text !== "string" || text.trim() === "") {
      return NextResponse.json({ error: "Text is required" }, { status: 400 })
    }

    // Check if credentials file exists
    const credentialsPath = path.join(process.cwd(), "long-form-credentials.json")
    if (!fs.existsSync(credentialsPath)) {
      return NextResponse.json({ error: "Long-form credentials file not found" }, { status: 500 })
    }

    // Read credentials file
    const fileContent = fs.readFileSync(credentialsPath, "utf8")
    const credentials = JSON.parse(fileContent)

    if (!credentials.private_key || !credentials.client_email || !credentials.project_id) {
      throw new Error("Credentials file is missing required fields (private_key, client_email, project_id)")
    }

    // Validated above, so no second presence check is needed before use.
    const projectId = credentials.project_id

    // Generate access token for authentication
    const token = await getAccessToken(credentials)
    console.log("Access token generated successfully for long-form synthesis")

    // TEMPORARY: standard synthesis call made before the long-form request to
    // "kickstart" the API. A failure here aborts the request.
    await triggerStandardSynthesis(token, projectId)

    // Start long-form synthesis with correct project ID
    const operationName = await startLongFormSynthesis(text, voice, token, audioEncoding, projectId)
    console.log("Long-form synthesis started with operation:", operationName)

    return NextResponse.json({
      success: true,
      operationName: operationName,
      region: REGION,
      projectId: projectId,
      message: "Long-form synthesis started successfully",
    })
  } catch (error) {
    console.error("Error in long-form text-to-speech:", error)

    return NextResponse.json(
      {
        error: "Failed to start long-form synthesis",
        details: error instanceof Error ? error.message : "Unknown error",
      },
      { status: 500 },
    )
  }
}

 

0 0 29
0 REPLIES 0