
Unable to pass jar file properly to pyspark script to connect to AlloyDB

Below is the code I have, which triggers when a file is uploaded to GCS. The jar file is not being passed properly to the script that connects to the database; please help. The code submits a PySpark script as a Dataproc Serverless (serverless Spark) batch. The code highlighted in red is throwing the error.

from google.cloud import dataproc_v1
from google.cloud.dataproc_v1 import Batch, PySparkBatch


def submit_job_serverless(data, context):
    # Regional endpoint is required for Dataproc Serverless batch submission
    client = dataproc_v1.BatchControllerClient(client_options={"api_endpoint": "us-central1-dataproc.googleapis.com:443"})
    project_id = 'onyx-zodiac-xxxx13'
    region = 'us-central1'
    spark_script_path = 'gs://bucket/new_code_read_files.py'
    jar_file_uris = ['gs://bucket/postgresql-42.7.3.jar']

    # Define the PySpark batch job; jar_file_uris adds the JDBC driver jar to the
    # classpath of the Spark driver and executors
    batch = Batch(
        pyspark_batch=PySparkBatch(
            main_python_file_uri=spark_script_path,
            jar_file_uris=jar_file_uris,
        ),
        environment_config={
            'execution_config': {
                'service_account': 'xxxxx-compute@developer.gserviceaccount.com'
            }
        }
    )
   
    # Submit the batch job (an explicit batch_id can optionally be passed to create_batch)
    operation = client.create_batch(parent=f"projects/{project_id}/locations/{region}", batch=batch)
   
    # Wait for the batch job to complete
    response = operation.result()

    # Batch has no `reference` field; use the batch resource name instead
    batch_name = response.name

    # Print the batch job details
    print(f"Batch job finished with state: {response.state}")

    print(f"Submitted job to Dataproc Serverless with batch name {batch_name}")

    return f"Submitted job with batch name {batch_name}"