THIS CODE HAVE NO ERROR. The OUPTUT done.
The problem is that the converted files (.txt) are 0 bytes. Seems that google cloud vision cannot read and convert photocopied books, from PDF into TXT
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="d:/doc/doc/MY-KEY.json"
from google.cloud import vision
from google.cloud.vision_v1 import types
from google.oauth2.service_account import Credentials
# from google.cloud import storage
# client library
# storage_client = storage.Client()
# Set up the Google Cloud Vision client with service account credentials
# credentials = Credentials.from_service_account_file('d:/doc/doc/bebe-1084-992b240528be.json')
# client = vision.ImageAnnotatorClient(credentials=credentials)
#pip install google-cloud-vision
# Set up the Google Cloud Vision client
client = vision.ImageAnnotatorClient()
# Directory containing the PDF files
pdf_directory = "d:/doc/doc"
# Output directory for the TXT files
output_directory = "d:/doc/doc"
# Get a list of PDF files in the directory
pdf_files = [file for file in os.listdir(pdf_directory) if file.endswith(".pdf")]
# Process each PDF file
for pdf_file in pdf_files:
pdf_path = os.path.join(pdf_directory, pdf_file)
# Create the output TXT file path
txt_file = os.path.splitext(pdf_file)[0] + ".txt"
txt_path = os.path.join(output_directory, txt_file)
# Read the PDF file as bytes
with open(pdf_path, 'rb') as file:
content = file.read()
# Convert PDF to image using Google Cloud Vision API
input_image = types.Image(content=content)
response = client.document_text_detection(image=input_image)
# Extract text from the response and save it as TXT
text = response.full_text_annotation.text
with open(txt_path, 'w', encoding='utf-8') as file:
file.write(text)
print(f"Converted {pdf_file} to {txt_file}")