Hello guys!
I am migrating a project from Vision AI to Document AI because the former is being retired at the end of March.
I am using a Document AI batch processing request to parse PDF documents into JSON files containing the extracted information. I have one Google Cloud Storage bucket with several PDF files and another bucket where the output JSON files go.
However, when I use batch processing, the output JSON is not written directly to the destination: it ends up nested inside a folder with a randomly generated numeric name (the ID of the batch operation), which in turn contains a subfolder named 0. I want to save the output directly in the output bucket, without any additional folders.
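For illustration, instead of landing at the top level of the bucket, the result ends up at a path like this (bucket, operation ID, and file name are made up):

gs://my-output-bucket/15639964154561433072/0/my-document-0.json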
If somebody can help me with this, I would be very thankful.
P.S. Using ProcessRequest does not help either, because to upload the result to the bucket I have to convert the output into a string or bytes, which mangles the JSON formatting and makes it unusable afterwards.
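For reference, this is roughly the synchronous variant I tried; the function name and parameters are placeholders:

from google.api_core.client_options import ClientOptions
from google.cloud import documentai, storage

def parse_pdf_sync(
    project: str, location: str, processor_id: str,
    pdf_bytes: bytes, destination_blob: storage.Blob,
) -> None:
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    name = client.processor_path(project, location, processor_id)
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(
            content=pdf_bytes, mime_type="application/pdf"
        ),
    )
    result = client.process_document(request=request)
    # this is the step I dislike: the Document proto has to be serialized
    # to a string before the upload, and that is where the JSON gets mangled
    destination_blob.upload_from_string(
        documentai.Document.to_json(result.document),
        content_type="application/json",
    )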
This is my function that does the parsing using batch processing:
import logging

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError, RetryError
from google.cloud import documentai, storage

logger = logging.getLogger(__name__)

# Config and join_blob_paths are small helpers from our own codebase
def parse_pdf(
    config: Config,
    source_blob: storage.Blob,
    destination_blob: storage.Blob,
    parsing_timeout: int,
) -> documentai.BatchProcessResponse:
    project = config.PROJECT
    location = config.REGION
    processor_id = "the_id_of_the_processor"
    input_mime_type = "application/pdf"
    gcs_source_uri = f"gs://{join_blob_paths(source_blob.bucket.name, source_blob.name)}"
    gcs_destination_uri = f"gs://{join_blob_paths(destination_blob.bucket.name, destination_blob.name)}"
    # initialize the client for Document AI
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    # configure the source bucket location
    if not gcs_source_uri.endswith("/") and "." in gcs_source_uri:
        # a specific GCS URI to process an individual document
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_source_uri, mime_type=input_mime_type
        )
        # load the GCS input URI into a list of document files
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
    else:
        # a GCS URI prefix to process an entire directory
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_source_uri)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
    # configure the Cloud Storage URI for the output directory
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_destination_uri
    )
    # set the location where results are written
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
    # set the full resource name of the processor
    name = client.processor_path(project, location, processor_id)
    # build the batch process request
    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )
    # start the asynchronous processing of the PDF
    operation = client.batch_process_documents(request)
    # wait for the operation to complete
    try:
        logger.info(f"Waiting for the operation {operation.operation.name} to finish")
        result = operation.result(timeout=parsing_timeout)
        logger.info(f"Results saved in {gcs_destination_uri}")
    except (RetryError, InternalServerError) as e:
        logger.error(e.message)
        # re-raise so the caller never sees an unbound result
        raise
    return result
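For completeness, the only workaround I have found so far is to flatten the output myself after the operation finishes, which feels wasteful and is what I would like to avoid. A rough sketch, with a made-up prefix (rename_blob does a copy plus delete under the hood):

from google.cloud import storage

def flatten_output(bucket_name: str, operation_prefix: str) -> None:
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    # walk everything the batch operation wrote under its folder
    for blob in client.list_blobs(bucket_name, prefix=operation_prefix):
        if not blob.name.endswith(".json"):
            continue
        flat_name = blob.name.split("/")[-1]  # keep only the file name
        bucket.rename_blob(blob, flat_name)   # moves it to the bucket root

# e.g. flatten_output("my-output-bucket", "15639964154561433072/")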