
Vertex AI Pipeline Template Schema Issue/Question

Hi all,

When compiling our artifacts to a YAML file, we expect the following schema format for our runtime parameter values: https://github.com/kubeflow/pipelines/blob/master/api/v2alpha1/pipeline_spec.proto#L677

When I first uploaded my YAML file with the schema above and ran the pipeline template, the runtime treated every value as a string instead of converting it to its appropriate data type.

Here is the yaml file I tried to upload and run:
```
components:
  input_data:
    executorLabel: input_data_executor
    inputDefinitions:
      parameters:
        allow_large_results_flag:
          type: INT
        allow_pre_computation_flag:
          type: INT
        create_disposition:
          type: STRING
        custom_config:
          type: STRING
        labels:
          type: STRING
        non_artifact_input_table:
          type: STRING
        union_bq_shards_flag:
          type: INT
        write_disposition:
          type: STRING
    outputDefinitions:
      artifacts:
        output_table:
          artifactType:
            instanceSchema: 'title: tfx.String

              type: object

              '
deploymentSpec:
  executors:
    input_data_executor:
      container:
        args:
        - --executor_class_path
        - tfx_common.components.bigquery.executor.BigQueryComponentExecutor
        - --json_serialized_invocation_args
        - '{{$}}'
        - --project=projectid
        - --region=us-central1
        - --temp_location=gs://tmp
        - --runner=DataflowRunner
        - --experiments=use_runner_v2
        - --sdk_container_image=image:latest
        command:
        - python
        - -m
        - tfx.orchestration.kubeflow.v2.container.kubeflow_v2_run_executor
        image: image:latest
pipelineInfo:
  name: hello-world
root:
  dag:
    tasks:
      input_data:
        cachingOptions:
          enableCache: true
        componentRef:
          name: input_data
        inputs:
          parameters:
            allow_large_results_flag:
              runtimeValue:
                constantValue:
                  intValue: 1
            allow_pre_computation_flag:
              runtimeValue:
                constantValue:
                  intValue: 0
            create_disposition:
              runtimeValue:
                constantValue:
                  stringValue: CREATE_IF_NEEDED
            custom_config:
              componentInputParameter: custom-config
            labels:
              runtimeValue:
                constantValue:
                  stringValue: 'null'
            non_artifact_input_table:
              componentInputParameter: input-table
            union_bq_shards_flag:
              runtimeValue:
                constantValue:
                  intValue: 0
            write_disposition:
              runtimeValue:
                constantValue:
                  stringValue: WRITE_EMPTY
        taskInfo:
          name: input_data
  inputDefinitions:
    parameters:
      custom-config:
        type: STRING
      input-table:
        type: STRING
schemaVersion: 2.1.0
sdkVersion: tfx-1.12.0
```
 
Result:

[screenshot: the run's parameter values all rendered as strings]

But what I'm expecting is something like this (this run was submitted through the AI Platform Python library):

[screenshot: the expected run, with parameters in their proper types]
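
For reference, this is roughly how we submit that run through the google-cloud-aiplatform Python library; a minimal sketch, where the template file name and the parameter values are hypothetical placeholders:

```
from google.cloud import aiplatform

# Project/region taken from the executor args in the spec above.
aiplatform.init(project="projectid", location="us-central1")

job = aiplatform.PipelineJob(
    display_name="hello-world",
    template_path="pipeline.yaml",  # the compiled spec shown above (hypothetical filename)
    pipeline_root="gs://tmp",       # staging location; adjust to your bucket
    parameter_values={
        # Values are passed as native Python types, so their types survive submission.
        "custom-config": "{}",                   # hypothetical value
        "input-table": "project.dataset.table",  # hypothetical value
    },
)
job.run()
```

Submitted this way, `parameter_values` carries native Python types, which is the behavior we want from the uploaded template as well.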

As an alternative way to get the expected result shown above, I changed the YAML file as a proof of concept to the following:

```
components:
  input_data:
    executorLabel: input_data_executor
    inputDefinitions:
      parameters:
        allow_large_results_flag:
          type: INT
        allow_pre_computation_flag:
          type: INT
        create_disposition:
          type: STRING
        custom_config:
          type: STRING
        labels:
          type: STRING
        non_artifact_input_table:
          type: STRING
        union_bq_shards_flag:
          type: INT
        write_disposition:
          type: STRING
    outputDefinitions:
      artifacts:
        output_table:
          artifactType:
            instanceSchema: 'title: tfx.String

              type: object

              '
deploymentSpec:
  executors:
    input_data_executor:
      container:
        args:
        - --executor_class_path
        - tfx_common.components.bigquery.executor.BigQueryComponentExecutor
        - --json_serialized_invocation_args
        - '{{$}}'
        - --project=projectid
        - --region=us-central1
        - --temp_location=gs://tmp
        - --runner=DataflowRunner
        - --experiments=use_runner_v2
        - --sdk_container_image=image:latest
        command:
        - python
        - -m
        - tfx.orchestration.kubeflow.v2.container.kubeflow_v2_run_executor
        image: image:latest
pipelineInfo:
  name: hello-world
root:
  dag:
    tasks:
      input_data:
        cachingOptions:
          enableCache: true
        componentRef:
          name: input_data
        inputs:
          parameters:
            allow_large_results_flag:
              runtimeValue:
                constantValue: 1
            allow_pre_computation_flag:
              runtimeValue:
                constantValue: 0
            create_disposition:
              runtimeValue:
                constantValue: CREATE_IF_NEEDED
            custom_config:
              componentInputParameter: custom-config
            labels:
              runtimeValue:
                constantValue: 'null'
            non_artifact_input_table:
              componentInputParameter: input-table
            union_bq_shards_flag:
              runtimeValue:
                constantValue: 0
            write_disposition:
              runtimeValue:
                constantValue: WRITE_EMPTY
        taskInfo:
          name: input_data
  inputDefinitions:
    parameters:
      custom-config:
        type: STRING
      input-table:
        type: STRING
schemaVersion: 2.1.0
sdkVersion: tfx-1.12.0
```
 
and reran it. However, I got this instead:

[screenshot: the rerun, with numeric parameters shown as doubles]

It ran successfully in this case, but for other components we need an `int` to stay an `int`, not become a `double`. For example, when we ran the Chicago Taxi pipeline as a template, we got the following error:

```
The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=636071587074&resource=ml_job%2Fjob_id%2F1886953...
```

[two screenshots of the failed run]
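
Our working theory (an assumption on our side, not something confirmed by the docs) is that a constant carrying no explicit type field gets parsed like a `google.protobuf.Value`, which has no integer kind and stores every number as a double. A minimal sketch of that behavior:

```
from google.protobuf import json_format, struct_pb2

# google.protobuf.Value's only numeric kind is number_value, a double --
# so an integer constant parsed this way comes back as a float.
value = json_format.Parse("0", struct_pb2.Value())
print(value.WhichOneof("kind"))  # number_value
print(value.number_value)        # 0.0
```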

My question: when uploading our YAML file as a pipeline template to Vertex AI, is there a way to specify the parameter types as well? If so, how? If not, what are our alternative approaches? (Note: we are also using TFX to generate these artifacts.)

1 REPLY

Hi @hobb,

Welcome back to Google Cloud Community.

The issue you are facing is that the runtime treats every parameter value as a string instead of converting it to its appropriate data type. One possible solution is to modify your YAML file to use YAML tags that specify the data type of each runtime value: for example, the !!int tag marks an integer value, and the !!str tag marks a string value.

Here is an example of how to modify your YAML file to use YAML tags:

```
root:
  dag:
    tasks:
      input_data:
        inputs:
          parameters:
            allow_large_results_flag:
              runtimeValue:
                constantValue: !!int 1
            allow_pre_computation_flag:
              runtimeValue:
                constantValue: !!int 0
            create_disposition:
              runtimeValue:
                constantValue: !!str CREATE_IF_NEEDED
            custom_config:
              componentInputParameter: custom-config
            labels:
              runtimeValue:
                constantValue: !!str 'null'
            non_artifact_input_table:
              componentInputParameter: input-table
            union_bq_shards_flag:
              runtimeValue:
                constantValue: !!int 0
            write_disposition:
              runtimeValue:
                constantValue: !!str WRITE_EMPTY
```

In this modified YAML file, YAML tags specify the data type of each runtime value: the !!int tag marks allow_large_results_flag and allow_pre_computation_flag as integers, and the !!str tag marks create_disposition, labels, and write_disposition as strings.

With YAML tags specifying the data types, the Kubeflow Pipelines runtime should be able to interpret your runtime values correctly instead of treating everything as a string.
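
You can verify how these tags parse with any standard YAML 1.1 loader. Here is a quick sketch using PyYAML (assuming a Python environment with the pyyaml package installed):

```
import yaml

doc = """
allow_large_results_flag: !!int 1
create_disposition: !!str CREATE_IF_NEEDED
labels: !!str 'null'
"""

# safe_load honors the !!int / !!str tags when building Python objects.
for key, value in yaml.safe_load(doc).items():
    print(key, repr(value), type(value).__name__)
# allow_large_results_flag 1 int
# create_disposition 'CREATE_IF_NEEDED' str
# labels 'null' str
```

Note that !!str 'null' stays the literal string "null" rather than being coerced to YAML's null value.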

Here is some documentation that might help you:

https://cloud.google.com/vertex-ai/docs/pipelines/create-pipeline-template

https://cloud.google.com/vertex-ai/docs/pipelines/build-pipeline