feat: Added SummaryOptions to ProcessOptions for the Summarizer processor

feat: Added `ListDocuments()` method for Document AI Workbench training documents PiperOrigin-RevId: 567684912
googleapis · Sep 22, 2023 · cac736f · cac736f
1 parent 54e225e
commit cac736f
Show file tree

Hide file tree

Showing 7 changed files with 216 additions and 28 deletions.
diff --git a/google/cloud/documentai/v1beta3/document.proto b/google/cloud/documentai/v1beta3/document.proto
@@ -955,7 +955,7 @@ message Document {
 message RevisionRef {
   // Some predefined revision cases.
   enum RevisionCase {
-    // Unspecified case, fallback to read the LATEST_HUMAN_REVIEW.
+    // Unspecified case; falls back to reading `LATEST_HUMAN_REVIEW`.
     REVISION_CASE_UNSPECIFIED = 0;
 
     // The latest revision made by a human.

diff --git a/google/cloud/documentai/v1beta3/document_io.proto b/google/cloud/documentai/v1beta3/document_io.proto
@@ -34,6 +34,13 @@ message RawDocument {
   // An IANA MIME type (RFC6838) indicating the nature and format of the
   // [content][google.cloud.documentai.v1beta3.RawDocument.content].
   string mime_type = 2;
+
+  // The display name of the document. It supports all Unicode characters except
+  // the following:
+  // `*`, `?`, `[`, `]`, `%`, `{`, `}`, `'`, `\"`, `,`
+  // `~`, `=` and `:` are reserved.
+  // If not specified, a default ID is generated.
+  string display_name = 3;
 }
 
 // Specifies a document stored on Cloud Storage.
@@ -117,8 +124,8 @@ message OcrConfig {
 
   // Configurations for premium OCR features.
   message PremiumFeatures {
-    // Turn on selection mark detector in OCR engine. Only available in OCR 2.0+
-    // processors.
+    // Turn on selection mark detector in OCR engine. Only available in OCR 2.0
+    // (and later) processors.
     bool enable_selection_mark_detection = 3;
 
     // Turn on font identification model and return font style information.
@@ -159,7 +166,7 @@ message OcrConfig {
   bool compute_style_info = 8 [deprecated = true];
 
   // Turn off character box detector in OCR engine. Character box detection is
-  // enabled by default in OCR 2.0+ processors.
+  // enabled by default in OCR 2.0 (and later) processors.
   bool disable_character_boxes_detection = 10;
 
   // Configurations for premium OCR features.

diff --git a/google/cloud/documentai/v1beta3/document_processor_service.proto b/google/cloud/documentai/v1beta3/document_processor_service.proto
@@ -348,16 +348,19 @@ message ProcessOptions {
     repeated int32 pages = 1 [(google.api.field_behavior) = OPTIONAL];
   }
 
-  // A subset of pages to process. If not specified, all pages will be
-  // processed. NOTICE: If any of the page range is set, we will extract and
-  // process only the given pages from the document. In the output document,
-  // the page_number is referring to the page number in the original document.
+  // A subset of pages to process. If not specified, all pages are processed.
+  // If a page range is set, only the given pages are extracted and processed
+  // from the document. In the output document,
+  // [Document.Page.page_number][google.cloud.documentai.v1beta3.Document.Page.page_number]
+  // refers to the page number in the original document. This configuration
+  // only applies to sync requests. `page_range` can be only one of the
+  // following:
   oneof page_range {
     // Which pages to process (1-indexed).
     IndividualPageSelector individual_page_selector = 5;
 
-    // Only process certain pages from the start, process all if the document
-    // has less pages.
+    // Only process certain pages from the start. Process all if the document
+    // has fewer pages.
     int32 from_start = 6;
 
     // Only process certain pages from the end, same as above.
@@ -367,6 +370,13 @@ message ProcessOptions {
   // Only applicable to `OCR_PROCESSOR`. Returns error if set on other
   // processor types.
   OcrConfig ocr_config = 1;
+
+  // Optional. Override the schema of the
+  // [ProcessorVersion][google.cloud.documentai.v1beta3.ProcessorVersion]. Will
+  // return an Invalid Argument error if this field is set when the underlying
+  // [ProcessorVersion][google.cloud.documentai.v1beta3.ProcessorVersion]
+  // doesn't support schema override.
+  DocumentSchema schema_override = 8 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Request message for the
@@ -1003,8 +1013,8 @@ message TrainProcessorVersionRequest {
   // Options to control the training of the Custom Document Extraction (CDE)
   // Processor.
   message CustomDocumentExtractionOptions {
-    // Training Method for CDE. TRAINING_METHOD_UNSPECIFIED will fallback to
-    // MODEL_BASED.
+    // Training Method for CDE. `TRAINING_METHOD_UNSPECIFIED` will fall back to
+    // `MODEL_BASED`.
     enum TrainingMethod {
       TRAINING_METHOD_UNSPECIFIED = 0;
 
@@ -1289,9 +1299,9 @@ message ListEvaluationsResponse {
 
 // The request message for the
 // [ImportProcessorVersion][google.cloud.documentai.v1beta3.DocumentProcessorService.ImportProcessorVersion]
-// method. Requirements:
+// method.
 //
-// - The Document AI [Service
+// The Document AI [Service
 // Agent](https://cloud.google.com/iam/docs/service-agents) of the destination
 // project must have [Document AI Editor
 // role](https://cloud.google.com/document-ai/docs/access-control/iam-roles) on
@@ -1300,8 +1310,10 @@ message ListEvaluationsResponse {
 // The destination project is specified as part of the
 // [parent][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.parent]
 // field. The source project is specified as part of the
-// [source][ImportProcessorVersionRequest.processor_version_source or
-// ImportProcessorVersionRequest.external_processor_version_source] field.
+// [source][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.processor_version_source]
+// or
+// [external_processor_version_source][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.external_processor_version_source]
+// field.
 message ImportProcessorVersionRequest {
   // The external source processor version.
   message ExternalProcessorVersionSource {
@@ -1321,7 +1333,7 @@ message ImportProcessorVersionRequest {
       type: "documentai.googleapis.com/ProcessorVersion"
     }];
 
-    // The source processor version to import from, and can be from different
+    // The source processor version to import from. It can be from a different
     // environment and region than the destination processor.
     ExternalProcessorVersionSource external_processor_version_source = 3;
   }

diff --git a/google/cloud/documentai/v1beta3/document_schema.proto b/google/cloud/documentai/v1beta3/document_schema.proto
@@ -24,10 +24,55 @@ option java_package = "com.google.cloud.documentai.v1beta3";
 option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
 option ruby_package = "Google::Cloud::DocumentAI::V1beta3";
 
+// Metadata for document summarization.
+message SummaryOptions {
+  // The desired length of the generated summary.
+  enum Length {
+    // Default.
+    LENGTH_UNSPECIFIED = 0;
+
+    // A brief summary of one or two sentences.
+    BRIEF = 1;
+
+    // A paragraph-length summary.
+    MODERATE = 2;
+
+    // The longest option available.
+    COMPREHENSIVE = 3;
+  }
+
+  // The desired output format of the generated summary.
+  enum Format {
+    // Default.
+    FORMAT_UNSPECIFIED = 0;
+
+    // Format the output in paragraphs.
+    PARAGRAPH = 1;
+
+    // Format the output in bullets.
+    BULLETS = 2;
+  }
+
+  // How long the summary should be.
+  Length length = 1;
+
+  // The format the summary should be in.
+  Format format = 2;
+}
+
+// Metadata for how this field value is extracted.
+message FieldExtractionMetadata {
+  // Summary options config.
+  SummaryOptions summary_options = 2;
+}
+
 // Metadata about a property.
 message PropertyMetadata {
   // Whether the property should be considered as "inactive".
   bool inactive = 3;
+
+  // Field extraction metadata on the property.
+  FieldExtractionMetadata field_extraction_metadata = 9;
 }
 
 // Metadata about an entity type.

diff --git a/google/cloud/documentai/v1beta3/document_service.proto b/google/cloud/documentai/v1beta3/document_service.proto
@@ -78,6 +78,15 @@ service DocumentService {
     option (google.api.method_signature) = "dataset";
   }
 
+  // Returns a list of documents present in the dataset.
+  rpc ListDocuments(ListDocumentsRequest) returns (ListDocumentsResponse) {
+    option (google.api.http) = {
+      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:listDocuments"
+      body: "*"
+    };
+    option (google.api.method_signature) = "dataset";
+  }
+
   // Deletes a set of documents.
   rpc BatchDeleteDocuments(BatchDeleteDocumentsRequest)
       returns (google.longrunning.Operation) {
@@ -114,7 +123,6 @@ service DocumentService {
 // referred to as splits: train, test.
 enum DatasetSplitType {
   // Default value if the enum is not set.
-  // go/protodosdonts#do-include-an-unspecified-value-in-an-enum
   DATASET_SPLIT_TYPE_UNSPECIFIED = 0;
 
   // Identifies the train documents.
@@ -127,6 +135,21 @@ enum DatasetSplitType {
   DATASET_SPLIT_UNASSIGNED = 3;
 }
 
+// Describes the labeling status of a document.
+enum DocumentLabelingState {
+  // Default value if the enum is not set.
+  DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;
+
+  // Document has been labeled.
+  DOCUMENT_LABELED = 1;
+
+  // Document has not been labeled.
+  DOCUMENT_UNLABELED = 2;
+
+  // Document has been auto-labeled.
+  DOCUMENT_AUTO_LABELED = 3;
+}
+
 message UpdateDatasetRequest {
   // Required. The `name` field of the `Dataset` is used to identify the
   // resource to be updated.
@@ -137,7 +160,7 @@ message UpdateDatasetRequest {
 }
 
 message UpdateDatasetOperationMetadata {
-  // The basic metadata of the long running operation.
+  // The basic metadata of the long-running operation.
   CommonOperationMetadata common_metadata = 1;
 }
 
@@ -198,9 +221,9 @@ message ImportDocumentsMetadata {
     DocumentId output_document_id = 4;
   }
 
-  // The validation status of each import config. Status is set to errors if
-  // there is no documents to import in the import_config, or OK if the
-  // operation will try to proceed at least one document.
+  // The validation status of each import config. Status is set to an error if
+  // there are no documents to import in the `import_config`, or `OK` if the
+  // operation will try to proceed with at least one document.
   message ImportConfigValidationResult {
     // The source Cloud Storage URI specified in the import config.
     string input_gcs_source = 1;
@@ -209,7 +232,7 @@ message ImportDocumentsMetadata {
     google.rpc.Status status = 2;
   }
 
-  // The basic metadata of the long running operation.
+  // The basic metadata of the long-running operation.
   CommonOperationMetadata common_metadata = 1;
 
   // The list of response details of each document.
@@ -249,6 +272,78 @@ message GetDocumentResponse {
   Document document = 1;
 }
 
+message ListDocumentsRequest {
+  // Required. The resource name of the dataset to be listed.
+  // Format:
+  // projects/{project}/locations/{location}/processors/{processor}/dataset
+  string dataset = 1 [
+    (google.api.field_behavior) = REQUIRED,
+    (google.api.resource_reference) = {
+      type: "documentai.googleapis.com/Dataset"
+    }
+  ];
+
+  // The maximum number of documents to return. The service may return
+  // fewer than this value.
+  // If unspecified, at most 20 documents will be returned.
+  // The maximum value is 100; values above 100 will be coerced to 100.
+  int32 page_size = 2;
+
+  // A page token, received from a previous `ListDocuments` call.
+  // Provide this to retrieve the subsequent page.
+  //
+  // When paginating, all other parameters provided to `ListDocuments`
+  // must match the call that provided the page token.
+  string page_token = 3;
+
+  // Optional. Query to filter the documents based on
+  // https://google.aip.dev/160.
+  // ## Currently supported query strings are:
+  //
+  // - `SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`
+  // - `LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`
+  // - `DisplayName=\"file_name.pdf\"`
+  // - `EntityType=abc/def`
+  // - `TagName=\"auto-labeling-running\"|\"sampled\"`
+  //
+  // Note:
+  // - Only `AND`, `=` and `!=` are supported.
+  //     e.g. `DisplayName=file_name AND EntityType!=abc` IS supported.
+  // - Wildcard `*` is supported only in the `DisplayName` filter.
+  // - No duplicate filter keys are allowed,
+  //     e.g. `EntityType=a AND EntityType=b` is NOT supported.
+  // - String matching is case-sensitive (for `DisplayName` and `EntityType`).
+  string filter = 4 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. Controls if the ListDocuments request requires a total size
+  // of matched documents. See ListDocumentsResponse.total_size.
+  //
+  // Enabling this flag may adversely impact performance.
+  //
+  // Defaults to false.
+  bool return_total_size = 6 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. Number of results to skip beginning from the `page_token` if
+  // provided. https://google.aip.dev/158#skipping-results. It must be a
+  // non-negative integer. Negative values will be rejected. Note that this is
+  // not the number of pages to skip. If this value causes the cursor to move
+  // past the end of results, `ListDocumentsResponse.document_metadata` and
+  // `ListDocumentsResponse.next_page_token` will be empty.
+  int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];
+}
+
+message ListDocumentsResponse {
+  // Document metadata corresponding to the listed documents.
+  repeated DocumentMetadata document_metadata = 1;
+
+  // A token, which can be sent as `page_token` to retrieve the next page.
+  // If this field is omitted, there are no subsequent pages.
+  string next_page_token = 2;
+
+  // Total count of documents queried.
+  int32 total_size = 3;
+}
+
 message BatchDeleteDocumentsRequest {
   // Required. The dataset resource name.
   // Format:
@@ -276,7 +371,7 @@ message BatchDeleteDocumentsMetadata {
     google.rpc.Status status = 2;
   }
 
-  // The basic metadata of the long running operation.
+  // The basic metadata of the long-running operation.
   CommonOperationMetadata common_metadata = 1;
 
   // The list of response details of each document.
@@ -323,3 +418,21 @@ message DocumentPageRange {
   // Last page number (one-based index) to be returned.
   int32 end = 2;
 }
+
+// Metadata about a document.
+message DocumentMetadata {
+  // Document identifier.
+  DocumentId document_id = 1;
+
+  // Number of pages in the document.
+  int32 page_count = 2;
+
+  // Type of the dataset split to which the document belongs.
+  DatasetSplitType dataset_type = 3;
+
+  // Labeling state of the document.
+  DocumentLabelingState labeling_state = 5;
+
+  // The display name of the document.
+  string display_name = 6;
+}
diff --git a/google/cloud/documentai/v1beta3/documentai_v1beta3.yaml b/google/cloud/documentai/v1beta3/documentai_v1beta3.yaml
@@ -102,10 +102,6 @@ publishing:
   github_label: 'api: documentai'
   organization: CLOUD
   library_settings:
-  - version: google.cloud.documentai.v1
-    dotnet_settings:
-      ignored_resources:
-      - documentai.googleapis.com/Location
   - version: google.cloud.documentai.v1beta3
     dotnet_settings:
       ignored_resources: