Get hands-on experience with 20+ free Google Cloud products and $300 in free credit for new customers.

Vertex AI Datastore - Linked unstructured documents (JSONL with metadata) from GCS

I want to create an automation for updating my Vertex AI Search datastore when a GCS source file is updated, added, or deleted. I did something similar with the discoveryengine API when I had a datastore for just unstructured documents. 

Since my data is now a JSON Lines file with metadata and a link to the unstructured txt file for each document, I don't know how to add/update/delete a single file without doing a full reindex every time (which I can currently do manually). 

 

Here are my update/delete functions from my last project:

async def datastore_import_documents(parent_datastore, gcs_input_uris):
    """Incrementally import documents from GCS into a Vertex AI Search datastore.

    Args:
        parent_datastore: Full branch resource name, e.g.
            ``projects/{p}/locations/{l}/dataStores/{d}/branches/default_branch``.
        gcs_input_uris: List of ``gs://`` URIs of the documents to import.
    """
    # Create a client
    client = discoveryengine_v1.DocumentServiceAsyncClient()

    # NOTE(review): data_schema='content' is for plain unstructured documents;
    # a JSONL-with-metadata source needs data_schema='custom' — confirm which
    # one matches the datastore's configuration.
    gcs_source_documents = discoveryengine_v1.GcsSource(
        input_uris=gcs_input_uris,
        data_schema='content',
    )

    # INCREMENTAL upserts only the documents referenced by input_uris and
    # leaves the rest of the datastore untouched (no full reindex).
    request = discoveryengine_v1.ImportDocumentsRequest(
        parent=parent_datastore,
        gcs_source=gcs_source_documents,
        reconciliation_mode=discoveryengine_v1.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL
    )

    print("Starting import Request")
    # Make the request — returns a long-running operation handle.
    operation = await client.import_documents(request=request)

    print("Finished Import Request")

    # Wait for the server-side import to actually complete.
    response = await operation.result()

    # Handle the response
    print(f"Finished Response: {response}")


async def datastore_delete_document(datastore_document_name):
    """Delete a single document from a Vertex AI Search datastore.

    Args:
        datastore_document_name: Full document resource name, e.g.
            ``projects/{p}/locations/{l}/dataStores/{d}/branches/{b}/documents/{doc_id}``.
    """
    # Create a client
    client = discoveryengine_v1.DocumentServiceAsyncClient()

    # Initialize request argument(s)
    request = discoveryengine_v1.DeleteDocumentRequest(
        name=datastore_document_name,
    )

    # Make the request
    await client.delete_document(request=request)
 
And here is the only place in the documentation where I see a reference to my type of use case — but I don't understand how I can use it, or what to do in the case of a deletion: 
 
from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine

# TODO(developer): Uncomment these variables before running the sample.
# project_id = "YOUR_PROJECT_ID"
# location = "YOUR_LOCATION" # Values: "global"
# data_store_id = "YOUR_DATA_STORE_ID"

# Examples:
# - Unstructured documents
#   - `gs://bucket/directory/file.pdf`
#   - `gs://bucket/directory/*.pdf`
# - Unstructured documents with JSONL Metadata
#   - `gs://bucket/directory/file.json`
# - Unstructured documents with CSV Metadata
#   - `gs://bucket/directory/file.csv`
# gcs_uri = "YOUR_GCS_PATH"

#  For more information, refer to:
# https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
# Non-global datastores must target the regional API endpoint.
client_options = (
    ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
    if location != "global"
    else None
)

# Create a client
client = discoveryengine.DocumentServiceClient(client_options=client_options)

# The full resource name of the search engine branch.
# e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
parent = client.branch_path(
    project=project_id,
    location=location,
    data_store=data_store_id,
    branch="default_branch",
)

request = discoveryengine.ImportDocumentsRequest(
    parent=parent,
    gcs_source=discoveryengine.GcsSource(
        # Multiple URIs are supported
        input_uris=[gcs_uri],
        # Options:
        # - `content` - Unstructured documents (PDF, HTML, DOC, TXT, PPTX)
        # - `custom` - Unstructured documents with custom JSONL metadata
        # - `document` - Structured documents in the discoveryengine.Document format.
        # - `csv` - Unstructured documents with CSV metadata
        # Fixed: the use case described in this thread (JSONL metadata linking
        # to unstructured files) requires `custom`, per the option list above.
        data_schema="custom",
    ),
    # Options: `FULL`, `INCREMENTAL`
    # INCREMENTAL upserts only the documents in `input_uris`. Deletions are
    # handled either by a FULL import (which, per the API docs, reconciles the
    # datastore against the source — confirm) or by a separate
    # DeleteDocumentRequest for the removed document's ID.
    reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
)

# Make the request — returns a long-running operation handle.
operation = client.import_documents(request=request)

print(f"Waiting for operation to complete: {operation.operation.name}")
response = operation.result()

# After the operation is complete,
# get information from operation metadata
metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

# Handle the response
print(response)
print(metadata)
0 2 250
2 REPLIES 2