Re: DocAI - Response in a single json file

UshaPravin · 11-17-2022 11:30 PM

Hello Experts,
I'm running a BatchProcessDocument request. I have an 18-page PDF file and tried to process it using the DocumentProcessorServiceClient API. After processing, I'm getting the response in a JSON file, which works as expected.
But the json output file is created only for the 5 pages of the source PDF file. Each 5 pages of the content are converted into a separate json file.

My question here is, is it possible to have a single output json file for a PDF source file?

ricconoel

Hi,

Just to confirm, when you said "Each 5 pages of the content are converted into a separate json file.", do you mean 1 JSON file per page, or 1 JSON file per 5 pages? Also, can you provide the code and a sample file that you are using? Please make sure there is no PII (Personally Identifiable Information) in your file when providing it here.

UshaPravin

Hello,

Thanks for your response. Actually, I have two points.

A PDF file should be processed and the response for this file should be returned in a single JSON file.
I have a file with a table of around 1000 rows. This table does not fit on a single page. I just want one JSON object for the whole table, but currently the code only works for objects within a single page.

Below is my sample source code. I could not attach my sample file to this discussion.

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import com.google.api.gax.core.FixedCredentialsProvider;

// [START documentai_batch_process_document]

import com.google.api.gax.longrunning.OperationFuture;
import com.google.api.gax.paging.Page;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.documentai.v1.BatchDocumentsInputConfig;
import com.google.cloud.documentai.v1.BatchProcessMetadata;
import com.google.cloud.documentai.v1.BatchProcessRequest;
import com.google.cloud.documentai.v1.BatchProcessResponse;
import com.google.cloud.documentai.v1.Document;
import com.google.cloud.documentai.v1.DocumentOutputConfig;
import com.google.cloud.documentai.v1.DocumentOutputConfig.GcsOutputConfig;
import com.google.cloud.documentai.v1.DocumentProcessorServiceClient;
import com.google.cloud.documentai.v1.DocumentProcessorServiceSettings;
import com.google.cloud.documentai.v1.GcsDocument;
import com.google.cloud.documentai.v1.GcsDocuments;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import com.google.common.collect.Lists;
import com.google.protobuf.util.JsonFormat;

/**
 * Sample that sends one PDF stored in Cloud Storage through a Document AI processor using the
 * batch (long-running) API, then reads back every JSON output file found under the output prefix
 * and prints the paragraphs and form fields of every page.
 *
 * <p>NOTE(review): the service writes batch output as one or more JSON shards per input document.
 * Newer client library versions appear to expose a sharding option (pages per shard) on
 * {@code GcsOutputConfig}; confirm against the installed library version before relying on it.
 */
public class CustomProcessDocument {

    /** OAuth scope requested for the service-account credentials. */
    private static final String CLOUD_PLATFORM_SCOPE = "https://www.googleapis.com/auth/cloud-platform";

    /** Maximum time, in seconds, to wait for the batch operation to finish. */
    private static final long OPERATION_TIMEOUT_SECONDS = 240;

    /** Returned by {@link #getText} when an anchor references no text at all. */
    private static final String NO_TEXT = "[NO TEXT]";

    /**
     * Program entry point; runs the sample with the placeholder configuration.
     *
     * @param a command-line arguments (unused)
     */
    public static void main(String a[]) {
        try {
            CustomProcess();
        } catch (IOException | InterruptedException | ExecutionException | TimeoutException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs {@link #CustomProcessDoc} with hard-coded placeholder values. Replace the placeholders
     * with real project, processor, bucket and credential settings before running.
     *
     * <p>The checked exceptions are declared for source compatibility with existing callers;
     * {@link #CustomProcessDoc} reports failures itself instead of propagating them.
     */
    public static void CustomProcess()
            throws IOException, InterruptedException, ExecutionException, TimeoutException {

        String projectId = "my-project-id";
        String location = "us"; // Format is "us" or "eu".
        String processorId = "my-processor-id";
        String outputGcsBucketName = "my-storage-bucket-name";
        String outputGcsPrefix = "my-output-path";
        String inputGcsUri = "gs://my-storage-bucket-name/sample-pdf-file.pdf";
        String tokenPath = "credentials-json-file-path";
        CustomProcessDoc(projectId, location, processorId, inputGcsUri, outputGcsBucketName, outputGcsPrefix,
                tokenPath);
    }

    /**
     * Batch-processes a single PDF and prints the text found in the resulting output files.
     *
     * <p>Failures are printed to standard error rather than thrown. If the waiting thread is
     * interrupted, its interrupt flag is restored before returning.
     *
     * @param projectId           Google Cloud project that owns the processor
     * @param location            processor location, for example "us" or "eu"
     * @param processorId         id of an existing Document AI processor
     * @param gcsInputUri         gs:// URI of the PDF to process
     * @param gcsOutputBucketName bucket that receives the JSON output
     * @param gcsOutputUriPrefix  object-name prefix (without trailing slash) for the output
     * @param tokenPath           path to the service-account credentials JSON file
     */
    public static void CustomProcessDoc(String projectId, String location, String processorId, String gcsInputUri,
            String gcsOutputBucketName, String gcsOutputUriPrefix, String tokenPath) {

        try {
            // Close the key file as soon as the credentials have been parsed.
            GoogleCredentials credentials;
            try (FileInputStream keyStream = new FileInputStream(tokenPath)) {
                credentials = GoogleCredentials.fromStream(keyStream)
                        .createScoped(Lists.newArrayList(CLOUD_PLATFORM_SCOPE));
            }

            DocumentProcessorServiceSettings settings = DocumentProcessorServiceSettings.newBuilder()
                    .setCredentialsProvider(FixedCredentialsProvider.create(credentials))
                    .build();

            // The full resource name of the processor, e.g.:
            // projects/project-id/locations/location/processors/processor-id
            // You must create new processors in the Cloud Console first.
            String processorName =
                    String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
            String outputGcsUri = String.format("gs://%s/%s/", gcsOutputBucketName, gcsOutputUriPrefix);

            // try-with-resources closes the client so its background resources are released
            // even when the operation fails or times out.
            try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create(settings)) {
                runBatchOperation(client, processorName, gcsInputUri, outputGcsUri);
            }

            Storage storage = StorageOptions.newBuilder()
                    .setCredentials(credentials)
                    .setProjectId(projectId)
                    .build()
                    .getService();
            printOutputDocuments(storage, gcsOutputBucketName, gcsOutputUriPrefix + "/");
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can react to it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException | TimeoutException | ExecutionException e) {
            e.printStackTrace();
        }
    }

    /** Submits the batch request for one PDF and blocks until it completes or the timeout elapses. */
    private static void runBatchOperation(DocumentProcessorServiceClient client, String processorName,
            String gcsInputUri, String outputGcsUri)
            throws InterruptedException, ExecutionException, TimeoutException {

        GcsDocument gcsDocument = GcsDocument.newBuilder()
                .setGcsUri(gcsInputUri)
                .setMimeType("application/pdf")
                .build();
        GcsDocuments gcsDocuments = GcsDocuments.newBuilder().addDocuments(gcsDocument).build();
        BatchDocumentsInputConfig inputConfig =
                BatchDocumentsInputConfig.newBuilder().setGcsDocuments(gcsDocuments).build();

        GcsOutputConfig gcsOutputConfig = GcsOutputConfig.newBuilder().setGcsUri(outputGcsUri).build();
        DocumentOutputConfig documentOutputConfig =
                DocumentOutputConfig.newBuilder().setGcsOutputConfig(gcsOutputConfig).build();

        // Configure the batch process request.
        BatchProcessRequest request = BatchProcessRequest.newBuilder()
                .setName(processorName)
                .setInputDocuments(inputConfig)
                .setDocumentOutputConfig(documentOutputConfig)
                .build();

        // Batch process document using a long-running operation.
        // Note: first request to the service takes longer than subsequent requests.
        OperationFuture<BatchProcessResponse, BatchProcessMetadata> future =
                client.batchProcessDocumentsAsync(request);
        System.out.println("Waiting for operation to complete...");
        future.get(OPERATION_TIMEOUT_SECONDS, TimeUnit.SECONDS);
        System.out.println("Document processing complete.");
    }

    /**
     * Lists every object under {@code prefix} in the bucket, parses each JSON object as a
     * {@link Document} and prints its contents.
     *
     * <p>The listing covers everything stored under the prefix, so output left behind by earlier
     * runs that used the same prefix is printed as well.
     */
    private static void printOutputDocuments(Storage storage, String bucketName, String prefix)
            throws IOException {

        // Listing through Storage directly avoids a null Bucket when the bucket lookup fails.
        Page<Blob> blobs = storage.list(bucketName, Storage.BlobListOption.prefix(prefix));
        int idx = 0;
        for (Blob blob : blobs.iterateAll()) {
            // Skip folder placeholders and anything that is not a JSON result file.
            if (blob.isDirectory() || !blob.getName().endsWith(".json")) {
                continue;
            }
            System.out.printf("Fetched file #%d\n", ++idx);
            printDocument(readDocument(blob));
        }
    }

    /** Downloads one output object into memory and parses its JSON into a {@link Document}. */
    private static Document readDocument(Blob blob) throws IOException {
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        String json = new String(blob.getContent(), StandardCharsets.UTF_8);
        Document.Builder builder = Document.newBuilder();
        // Tolerate fields the service may emit that this client library version does not know.
        JsonFormat.parser().ignoringUnknownFields().merge(json, builder);
        return builder.build();
    }

    /** Prints the paragraphs and the form key/value pairs of every page in the document. */
    private static void printDocument(Document document) {
        // All layout anchors index into this single text string.
        String text = document.getText();

        // Read the text recognition output from the processor.
        System.out.println("The document contains the following paragraphs:");
        for (Document.Page page : document.getPagesList()) {
            for (Document.Page.Paragraph paragraph : page.getParagraphsList()) {
                String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
                System.out.printf("Paragraph text:%s\n", paragraphText);
            }
        }

        // Form parsing provides additional output about form-formatted PDFs. You must create a
        // form processor in the Cloud Console to see full field details.
        System.out.println("The following form key/value pairs were detected:");
        for (Document.Page page : document.getPagesList()) {
            for (Document.Page.FormField field : page.getFormFieldsList()) {
                String fieldName = getText(field.getFieldName().getTextAnchor(), text);
                String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);

                System.out.println("Extracted form fields pair:");
                System.out.printf("\t(%s, %s)\n", fieldName, fieldValue);
            }
        }
    }

    /**
     * Resolves a text anchor against the document text.
     *
     * @param textAnchor anchor whose segments give start/end offsets into {@code text}
     * @param text       the full document text
     * @return the concatenation of all referenced segments, or {@code "[NO TEXT]"} when the
     *         anchor has no segments
     */
    private static String getText(Document.TextAnchor textAnchor, String text) {
        if (textAnchor.getTextSegmentsCount() == 0) {
            return NO_TEXT;
        }

        // An anchor may reference several non-contiguous segments; join all of them.
        StringBuilder joined = new StringBuilder();
        for (Document.TextAnchor.TextSegment segment : textAnchor.getTextSegmentsList()) {
            int startIdx = (int) segment.getStartIndex();
            int endIdx = (int) segment.getEndIndex();
            joined.append(text, startIdx, endIdx);
        }
        return joined.toString();
    }
}