Modified parameters for BigtableToParquet #1490

Merged
Changes from all commits
Commits
24 commits
b31bd4b
Modified parameters for BigQueryToParquet
Supriya-Koppa Apr 30, 2024
997dfd2
Modified parameters for BigQueryToParquet
Supriya-Koppa Apr 30, 2024
ae391ca
Modified parameters for BigtableToParquet
Supriya-Koppa Apr 30, 2024
0c932f8
Modified parameters for SpannerVectorEmbeddingExport and TextIOToBigQ…
Supriya-Koppa May 1, 2024
ec32669
Modified parameters for SpannerVectorEmbeddingExport and TextIOToBigQ…
Supriya-Koppa May 1, 2024
bd943b2
Modified parameters for DataStream to BigQuery template
Supriya-Koppa May 1, 2024
19e36d6
Modified parameters for MQTT to Pub/Sub template
Supriya-Koppa May 3, 2024
a7d68c1
Modified parameters for Pub/Sub to Splunk template
Supriya-Koppa May 6, 2024
cde388b
Modified parameters for Pub/Sub to Datadog template
Supriya-Koppa May 6, 2024
1feef9e
Modified parameters for MongoDB to BigQuery template
Supriya-Koppa May 6, 2024
5de7fde
Modified parameters for Datastore to Cloud Storage template
Supriya-Koppa May 6, 2024
a75b30e
Modified parameters for MySQL CDC to BigQuery template
Supriya-Koppa May 6, 2024
1342c6c
Modified parameters for BigTable to Parquet template (revised)
Supriya-Koppa May 6, 2024
948545e
Modified parameters for BigQuery to Parquet template (revised)
Supriya-Koppa May 6, 2024
704eb35
Merge branch 'main' into koppas-dataflowTemplates
sharan-malyala May 6, 2024
a1d02b5
Apply suggestions from code review
Supriya-Koppa May 7, 2024
acf5569
maven spotless
Supriya-Koppa May 7, 2024
b48de7a
Removing optional from helptext
gauravjain6633 May 8, 2024
ffd8d07
Update v1/src/main/java/com/google/cloud/teleport/templates/common/Da…
Supriya-Koppa May 8, 2024
a0744c0
Apply suggestions from code review
Supriya-Koppa May 8, 2024
6422bf7
Update v1/src/main/java/com/google/cloud/teleport/templates/common/Da…
Supriya-Koppa May 8, 2024
cf990d3
Modified parameters for Cloud Storage to BigQuery - re-done
Supriya-Koppa May 9, 2024
3276300
Modified parameters for Pub/Sub to Splunk - re-done
Supriya-Koppa May 9, 2024
f56eac6
spotless
Supriya-Koppa May 9, 2024
@@ -77,7 +77,7 @@ public interface Options extends PipelineOptions {
order = 1,
description = "Project ID",
helpText =
"The ID of the Google Cloud project of the Cloud Bigtable instance that you want to read data from")
"The ID of the Google Cloud project that contains the Cloud Bigtable instance that you want to read data from.")
ValueProvider<String> getBigtableProjectId();

@SuppressWarnings("unused")
@@ -87,7 +87,7 @@ public interface Options extends PipelineOptions {
order = 2,
regexes = {"[a-z][a-z0-9\\-]+[a-z0-9]"},
description = "Instance ID",
helpText = "The ID of the Cloud Bigtable instance that contains the table")
helpText = "The ID of the Cloud Bigtable instance that contains the table.")
ValueProvider<String> getBigtableInstanceId();

@SuppressWarnings("unused")
@@ -97,7 +97,7 @@ public interface Options extends PipelineOptions {
order = 3,
regexes = {"[_a-zA-Z0-9][-_.a-zA-Z0-9]*"},
description = "Table ID",
helpText = "The ID of the Cloud Bigtable table to export")
helpText = "The ID of the Cloud Bigtable table to export.")
ValueProvider<String> getBigtableTableId();

@SuppressWarnings("unused")
@@ -107,8 +107,7 @@ public interface Options extends PipelineOptions {
order = 4,
description = "Output file directory in Cloud Storage",
helpText =
"The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters.",
example = "gs://your-bucket/your-path")
"The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse the directory path for date and time formatters. For example: gs://your-bucket/your-path.")
ValueProvider<String> getOutputDirectory();

@SuppressWarnings("unused")
@@ -117,7 +116,8 @@ public interface Options extends PipelineOptions {
@TemplateParameter.Text(
order = 5,
description = "Parquet file prefix",
helpText = "The prefix of the Parquet file name. For example, \"table1-\"")
helpText =
"The prefix of the Parquet file name. For example, \"table1-\". Defaults to: part.")
@Default.String("part")
ValueProvider<String> getFilenamePrefix();

@@ -129,10 +129,7 @@ public interface Options extends PipelineOptions {
optional = true,
description = "Maximum output shards",
helpText =
"The maximum number of output shards produced when writing. A higher number of "
+ "shards means higher throughput for writing to Cloud Storage, but potentially higher "
+ "data aggregation cost across shards when processing output Cloud Storage files. "
+ "Default value is decided by Dataflow.")
"The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. The default value is decided by Dataflow.")
@Default.Integer(0)
ValueProvider<Integer> getNumShards();

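The hunks above standardize the helpText strings for the Bigtable-to-Parquet options. For readers who haven't worked with these templates, the following is a minimal sketch of the @TemplateParameter/ValueProvider getter-setter pattern those options interfaces follow, written in the revised helpText style (full sentences, trailing periods, defaults stated inline). The interface name is hypothetical and the com.google.cloud.teleport.metadata package path for TemplateParameter is an assumption; this is not code from the PR.

import com.google.cloud.teleport.metadata.TemplateParameter; // package path assumed
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;

/** Hypothetical options interface illustrating the revised parameter-doc style. */
public interface ExampleParquetOptions extends PipelineOptions {

  @TemplateParameter.Text(
      order = 1,
      optional = true,
      description = "Parquet file prefix",
      helpText = "The prefix of the Parquet file name. For example, \"table1-\". Defaults to: part.")
  @Default.String("part")
  ValueProvider<String> getFilenamePrefix();

  @SuppressWarnings("unused")
  void setFilenamePrefix(ValueProvider<String> filenamePrefix);
}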
@@ -88,7 +88,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
@TemplateParameter.ProjectId(
order = 10,
description = "Cloud Spanner Project Id",
helpText = "The project ID of the Cloud Spanner instance.")
helpText = "The project ID of the Spanner instance.")
ValueProvider<String> getSpannerProjectId();

void setSpannerProjectId(ValueProvider<String> value);
@@ -97,8 +97,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
order = 20,
regexes = {"[a-z][a-z0-9\\-]*[a-z0-9]"},
description = "Cloud Spanner instance ID",
helpText =
"The instance ID of the Cloud Spanner from which you want to export the vector embeddings.")
helpText = "The ID of the Spanner instance to export the vector embeddings from.")
ValueProvider<String> getSpannerInstanceId();

void setSpannerInstanceId(ValueProvider<String> spannerInstanceId);
@@ -107,8 +106,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
order = 30,
regexes = {"[a-z][a-z0-9_\\-]*[a-z0-9]"},
description = "Cloud Spanner database ID",
helpText =
"The database ID of the Cloud Spanner from which you want to export the vector embeddings.")
helpText = "The ID of the Spanner database to export the vector embeddings from.")
ValueProvider<String> getSpannerDatabaseId();

void setSpannerDatabaseId(ValueProvider<String> spannerDatabaseId);
@@ -117,7 +115,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
order = 40,
regexes = {"^.+$"},
description = "Spanner Table",
helpText = "Spanner Table to read from")
helpText = "The Spanner table to read from.")
ValueProvider<String> getSpannerTable();

void setSpannerTable(ValueProvider<String> table);
@@ -126,21 +124,16 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
order = 50,
description = "Columns to Export from Spanner Table",
helpText =
"Comma separated list of columns which are required for Vertex AI Vector Search Index."
+ " The `id` & `embedding` are required columns for Vertex Vector Search."
+ " If the column names don't precisely align with the Vertex AI Vector Search Index input structure,"
+ " you can establish column mappings using aliases. If you have the columns that don't match the"
+ " format expected by Vertex, you can use the notation `from:to`. For example, if the columns are"
+ " `id` and `my_embedding`, in which `id` matches what Vertex expects but the embedding column is named differently,"
+ " `id, my_embedding:embedding` should be specified.")
"A comma-separated list of required columns for the Vertex AI Vector Search index. The ID and embedding columns are required by Vector Search. If your column names don't match the Vertex AI Vector Search index input structure, create column mappings by using aliases. If the column names don't match the format expected by Vertex AI, use the notation from:to. For example, if you have columns named id and my_embedding, specify id, my_embedding:embedding.")
ValueProvider<String> getSpannerColumnsToExport();

void setSpannerColumnsToExport(ValueProvider<String> value);

@TemplateParameter.GcsWriteFolder(
order = 60,
description = "Output files folder in Cloud Storage",
helpText = "The Cloud Storage folder for writing output files. Must end with a slash.",
helpText =
"The Cloud Storage folder to write output files to. The path must end with a slash.",
example = "gs://your-bucket/folder1/")
ValueProvider<String> getGcsOutputFolder();

@@ -160,7 +153,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
optional = true,
description = "Cloud Spanner Endpoint to call",
helpText =
"The Cloud Spanner endpoint to call in the template. The default is set to https://batch-spanner.googleapis.com.",
"The Spanner endpoint to call in the template. The default value is https://batch-spanner.googleapis.com.",
example = "https://batch-spanner.googleapis.com")
@Default.String("https://batch-spanner.googleapis.com")
ValueProvider<String> getSpannerHost();
@@ -175,12 +168,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
},
description = "Timestamp to read stale data from a version in the past.",
helpText =
"If set, specifies the time when the database version must be taken."
+ " String is in the RFC 3339 format in UTC time. "
+ " Timestamp must be in the past and maximum timestamp staleness applies; see "
+ "<a href=\"https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness\">Maximum Timestamp Staleness</a>."
+ " If not set, strong bound is used to read the latest data; see "
+ "<a href=\"https://cloud.google.com/spanner/docs/timestamp-bounds#strong\">Timestamp Strong Bounds</a>.",
"If set, specifies the time when the database version must be taken. The value is a string in the RFC-3339 date format in Unix epoch time. For example: 1990-12-31T23:59:60Z. The timestamp must be in the past, and maximum timestamp staleness (https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness) applies. If not set, a strong bound (https://cloud.google.com/spanner/docs/timestamp-bounds#strong) is used to read the latest data. Defaults to empty.",
example = "1990-12-31T23:59:60Z")
@Default.String(value = "")
ValueProvider<String> getSpannerVersionTime();
@@ -192,10 +180,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
optional = true,
description = "Use independent compute resource (Spanner DataBoost).",
helpText =
"Use Spanner on-demand compute so the export job will run on independent compute"
+ " resources and have no impact to current Spanner workloads. This will incur"
+ " additional charges in Spanner."
+ " Refer <a href=\" https://cloud.google.com/spanner/docs/databoost/databoost-overview\">Data Boost Overview</a>.")
"When set to true, the template uses Spanner on-demand compute. The export job runs on independent compute resources that don't impact current Spanner workloads. Using this option incurs additional charges in Spanner. For more information, see Spanner Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: false.")
@Default.Boolean(false)
ValueProvider<Boolean> getSpannerDataBoostEnabled();

@@ -211,8 +196,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions {
optional = true,
description = "Priority for Spanner RPC invocations",
helpText =
"The request priority for Cloud Spanner calls. The value must be one of:"
+ " [HIGH,MEDIUM,LOW]. Defaults to: MEDIUM.")
"The request priority for Spanner calls. The allowed values are HIGH, MEDIUM, and LOW. The default value is MEDIUM.")
ValueProvider<RpcPriority> getSpannerPriority();

void setSpannerPriority(ValueProvider<RpcPriority> value);
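The from:to alias notation documented in spannerColumnsToExport above is the least obvious part of that hunk, so here is a small, self-contained sketch of how such a spec maps Spanner source columns to the names expected by Vertex AI Vector Search. It illustrates the notation only; it is not the template's actual parsing code, and the class name is hypothetical.

import java.util.LinkedHashMap;
import java.util.Map;

/** Illustrates the from:to column alias notation described in spannerColumnsToExport. */
public final class ColumnAliasSketch {

  /** Maps each Spanner source column to the name it should carry in the export. */
  public static Map<String, String> parseColumnsToExport(String spec) {
    Map<String, String> mapping = new LinkedHashMap<>();
    for (String entry : spec.split(",")) {
      String[] parts = entry.trim().split(":", 2);
      // "id" exports as "id"; "my_embedding:embedding" exports my_embedding as "embedding".
      mapping.put(parts[0].trim(), parts.length == 2 ? parts[1].trim() : parts[0].trim());
    }
    return mapping;
  }

  public static void main(String[] args) {
    // Prints {id=id, my_embedding=embedding}
    System.out.println(parseColumnsToExport("id, my_embedding:embedding"));
  }

  private ColumnAliasSketch() {}
}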
@@ -92,8 +92,7 @@ public interface DatadogOptions extends PipelineOptions {
optional = true,
description = "Logs API key.",
helpText =
"Datadog Logs API key. Must be provided if the apiKeySource is set to PLAINTEXT or KMS. "
+ "See: https://docs.datadoghq.com/account_management/api-app-keys")
"The Datadog API key. You must provide this value if the `apiKeySource` is set to `PLAINTEXT` or `KMS`. For more information, see API and Application Keys (https://docs.datadoghq.com/account_management/api-app-keys/) in the Datadog documentation.")
ValueProvider<String> getApiKey();

void setApiKey(ValueProvider<String> apiKey);
@@ -102,8 +101,7 @@ public interface DatadogOptions extends PipelineOptions {
order = 2,
description = "Datadog Logs API URL.",
helpText =
"Datadog Logs API URL. This should be routable from the VPC in which the pipeline runs. "
+ "See: https://docs.datadoghq.com/api/latest/logs/#send-logs",
"The Datadog Logs API URL. This URL must be routable from the VPC that the pipeline runs in. See Send logs (https://docs.datadoghq.com/api/latest/logs/#send-logs) in the Datadog documentation for more information.",
example = "https://http-intake.logs.datadoghq.com")
ValueProvider<String> getUrl();

@@ -114,7 +112,7 @@ public interface DatadogOptions extends PipelineOptions {
optional = true,
description = "Batch size for sending multiple events to Datadog Logs API.",
helpText =
"Batch size for sending multiple events to Datadog Logs API. Min is 10. Max is 1000. Defaults to 100.")
"The batch size for sending multiple events to Datadog. The default is `1` (no batching).")
ValueProvider<Integer> getBatchCount();

void setBatchCount(ValueProvider<Integer> batchCount);
@@ -123,7 +121,7 @@ public interface DatadogOptions extends PipelineOptions {
order = 4,
optional = true,
description = "Maximum number of parallel requests.",
helpText = "Maximum number of parallel requests. Default 1 (no parallelism).")
helpText = "The maximum number of parallel requests. The default is `1` (no parallelism).")
ValueProvider<Integer> getParallelism();

void setParallelism(ValueProvider<Integer> parallelism);
@@ -133,8 +131,7 @@ public interface DatadogOptions extends PipelineOptions {
optional = true,
description = "Include full Pub/Sub message in the payload.",
helpText =
"Include full Pub/Sub message in the payload (true/false). Defaults to false "
+ "(only data element is included in the payload).")
"Whether to include the full Pub/Sub message in the payload. The default is `false` (only the data element is included in the payload).")
ValueProvider<Boolean> getIncludePubsubMessage();

void setIncludePubsubMessage(ValueProvider<Boolean> includePubsubMessage);
@@ -147,12 +144,7 @@ public interface DatadogOptions extends PipelineOptions {
},
description = "Google Cloud KMS encryption key for the API key",
helpText =
"The Cloud KMS key to decrypt the Logs API key. This parameter must be "
+ "provided if the apiKeySource is set to KMS. If this parameter is provided, apiKey "
+ "string should be passed in encrypted. Encrypt parameters using the KMS API encrypt "
+ "endpoint. The Key should be in the format "
+ "projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. "
+ "See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt ",
"The Cloud KMS key to use to decrypt the API Key. You must provide this parameter if the `apiKeySource` is set to `KMS`. If the Cloud KMS key is provided, you must pass in an encrypted API Key.",
example =
"projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name")
ValueProvider<String> getApiKeyKMSEncryptionKey();
@@ -167,7 +159,7 @@ public interface DatadogOptions extends PipelineOptions {
},
description = "Google Cloud Secret Manager ID.",
helpText =
"Secret Manager secret ID for the apiKey. This parameter should be provided if the apiKeySource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}.",
"The Secret Manager secret ID for the API Key. You must provide this parameter if the `apiKeySource` is set to `SECRET_MANAGER`.",
example = "projects/your-project-id/secrets/your-secret/versions/your-secret-version")
ValueProvider<String> getApiKeySecretId();

@@ -183,11 +175,7 @@ public interface DatadogOptions extends PipelineOptions {
},
description = "Source of the API key passed. One of PLAINTEXT, KMS or SECRET_MANAGER.",
helpText =
"Source of the API key. One of PLAINTEXT, KMS or SECRET_MANAGER. This parameter "
+ "must be provided if secret manager is used. If apiKeySource is set to KMS, "
+ "apiKeyKMSEncryptionKey and encrypted apiKey must be provided. If apiKeySource is set to "
+ "SECRET_MANAGER, apiKeySecretId must be provided. If apiKeySource is set to PLAINTEXT, "
+ "apiKey must be provided.")
"The source of the API key. The following values are supported: `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. You must provide this parameter if you're using Secret Manager. If `apiKeySource` is set to `KMS`, you must also provide `apiKeyKMSEncryptionKey` and encrypted `API Key`. If `apiKeySource` is set to `SECRET_MANAGER`, you must also provide `apiKeySecretId`. If `apiKeySource` is set to `PLAINTEXT`, you must also provide `apiKey`.")
ValueProvider<String> getApiKeySource();

void setApiKeySource(ValueProvider<String> apiKeySource);
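The revised apiKeySource helpText above spells out which companion parameters each source requires. As a quick summary, here is a hedged sketch of those checks; the method, class name, and error messages are illustrative assumptions and are not taken from the Datadog template source.

import static com.google.common.base.Preconditions.checkArgument;

/** Illustrative required-parameter checks for the apiKeySource values documented above. */
final class ApiKeySourceChecks {

  static void validate(String apiKeySource, String apiKey, String kmsKey, String secretId) {
    switch (apiKeySource) {
      case "PLAINTEXT":
        // The API key itself must be supplied in plain text.
        checkArgument(apiKey != null && !apiKey.isEmpty(),
            "apiKey is required when apiKeySource=PLAINTEXT");
        break;
      case "KMS":
        // apiKeyKMSEncryptionKey plus a KMS-encrypted apiKey must both be supplied.
        checkArgument(kmsKey != null && apiKey != null,
            "apiKeyKMSEncryptionKey and an encrypted apiKey are required when apiKeySource=KMS");
        break;
      case "SECRET_MANAGER":
        // The Secret Manager secret ID for the API key must be supplied.
        checkArgument(secretId != null, "apiKeySecretId is required when apiKeySource=SECRET_MANAGER");
        break;
      default:
        throw new IllegalArgumentException("Unsupported apiKeySource: " + apiKeySource);
    }
  }

  private ApiKeySourceChecks() {}
}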
@@ -68,7 +68,8 @@ public interface DatastoreReadOptions extends PipelineOptions {
order = 1,
regexes = {"^.+$"},
description = "GQL Query",
helpText = "Specifies which Datastore entities to read. Ex: ‘SELECT * FROM MyKind’")
helpText =
"A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`.")
@Hidden
@Deprecated
ValueProvider<String> getDatastoreReadGqlQuery();
@@ -86,7 +87,8 @@ public interface DatastoreReadOptions extends PipelineOptions {
@TemplateParameter.ProjectId(
order = 2,
description = "Project ID",
helpText = "The Google Cloud project ID of the Datastore instance to read from")
helpText =
"The ID of the Google Cloud project that contains the Datastore instance that you want to read data from.")
@Hidden
@Deprecated
ValueProvider<String> getDatastoreReadProjectId();
@@ -107,7 +109,7 @@ public interface DatastoreReadOptions extends PipelineOptions {
regexes = {"^[0-9A-Za-z._-]{0,100}$"},
description = "Namespace",
helpText =
"Namespace of requested Datastore entities. Leave blank to use default namespace.")
"The namespace of the requested entities. To use the default namespace, leave this parameter blank.")
@Hidden
@Deprecated
ValueProvider<String> getDatastoreReadNamespace();
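The three Datastore read parameters above (GQL query, project ID, namespace) feed a Beam Datastore source. The snippet below is an assumed wiring sketch rather than the template's actual pipeline code; it presumes the DatastoreReadOptions interface shown above is importable and that the standard Beam DatastoreIO connector is on the classpath.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

/** Assumed wiring of the DatastoreReadOptions parameters into a Beam Datastore read. */
public class DatastoreReadSketch {
  public static void main(String[] args) {
    DatastoreReadOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(DatastoreReadOptions.class);
    Pipeline pipeline = Pipeline.create(options);

    pipeline.apply(
        "ReadEntities",
        DatastoreIO.v1()
            .read()
            .withProjectId(options.getDatastoreReadProjectId())
            // For example: SELECT * FROM MyKind
            .withLiteralGqlQuery(options.getDatastoreReadGqlQuery())
            // Leave blank to read from the default namespace.
            .withNamespace(options.getDatastoreReadNamespace()));

    pipeline.run();
  }
}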
@@ -105,7 +105,7 @@ void setJavascriptTextTransformFunctionName(
optional = true,
description = "JavaScript UDF auto-reload interval (minutes)",
helpText =
"Define the interval that workers may check for JavaScript UDF changes to reload the files.")
"Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0.")
@Default.Integer(0)
ValueProvider<Integer> getJavascriptTextTransformReloadIntervalMinutes();

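To make the reload-interval wording concrete, here is a small illustrative policy class showing how a value of 0 disables reloading while a positive value gates periodic re-reads of the UDF file. It is an assumption for illustration, not the template's implementation, and it covers only the interval gating; the real behavior also checks whether the file in Cloud Storage was actually modified.

import java.time.Duration;
import java.time.Instant;

/** Illustrative reload-interval gate matching the behavior described above. */
final class UdfReloadPolicy {
  private final Duration interval; // 0 minutes means reloading is disabled
  private Instant lastLoad = Instant.EPOCH;

  UdfReloadPolicy(int reloadIntervalMinutes) {
    this.interval = Duration.ofMinutes(reloadIntervalMinutes);
  }

  /** Returns true when enough time has elapsed to re-check the UDF file. */
  boolean shouldReload(Instant now) {
    if (interval.isZero()) {
      return false; // an interval of 0 disables periodic reloads
    }
    if (Duration.between(lastLoad, now).compareTo(interval) >= 0) {
      lastLoad = now;
      return true;
    }
    return false;
  }
}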