Get hands-on experience with 20+ free Google Cloud products and $300 in free credit for new customers.

Python: Google Vision doesn't (cannot) read and convert photocopied books, from PDF into TXT

THIS CODE HAVE NO ERROR. The OUPTUT done.

The problem is that the converted files (.txt) are 0 bytes. Seems that google cloud vision cannot read and convert photocopied books, from PDF into TXT

 

import os
os
.environ["GOOGLE_APPLICATION_CREDENTIALS"]="d:/doc/doc/MY-KEY.json"

from google.cloud import vision

from google.cloud.vision_v1 import types

from google.oauth2.service_account import Credentials

# from google.cloud import storage
# client library
# storage_client = storage.Client()

# Set up the Google Cloud Vision client with service account credentials
# credentials = Credentials.from_service_account_file('d:/doc/doc/bebe-1084-992b240528be.json')
# client = vision.ImageAnnotatorClient(credentials=credentials)

#pip install google-cloud-vision

# Set up the Google Cloud Vision client

client = vision.ImageAnnotatorClient()

# Directory containing the PDF files

pdf_directory = "d:/doc/doc"

# Output directory for the TXT files

output_directory = "d:/doc/doc"

# Get a list of PDF files in the directory

pdf_files = [file for file in os.listdir(pdf_directory) if file.endswith(".pdf")]

# Process each PDF file

for pdf_file in pdf_files:

    pdf_path = os.path.join(pdf_directory, pdf_file)

    # Create the output TXT file path

    txt_file = os.path.splitext(pdf_file)[0] + ".txt"

    txt_path = os.path.join(output_directory, txt_file)

    # Read the PDF file as bytes

    with open(pdf_path, 'rb') as file:

        content = file.read()

    # Convert PDF to image using Google Cloud Vision API

    input_image = types.Image(content=content)

    response = client.document_text_detection(image=input_image)

    # Extract text from the response and save it as TXT

    text = response.full_text_annotation.text

    with open(txt_path, 'w', encoding='utf-8') as file:

        file.write(text)

    print(f"Converted {pdf_file} to {txt_file}")

0 1 1,108
1 REPLY 1