Skip to content

Commit

Permalink
feat: Added SummaryOptions to ProcessOptions for the Summarizer p…
Browse files Browse the repository at this point in the history
…rocessor

feat: Added `ListDocuments()` method for Document AI Workbench training documents

PiperOrigin-RevId: 567684912
  • Loading branch information
Google APIs authored and Copybara-Service committed Sep 22, 2023
1 parent 54e225e commit cac736f
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 28 deletions.
2 changes: 1 addition & 1 deletion google/cloud/documentai/v1beta3/document.proto
Expand Up @@ -955,7 +955,7 @@ message Document {
message RevisionRef {
// Some predefined revision cases.
enum RevisionCase {
// Unspecified case, fallback to read the LATEST_HUMAN_REVIEW.
// Unspecified case, fall back to read the `LATEST_HUMAN_REVIEW`.
REVISION_CASE_UNSPECIFIED = 0;

// The latest revision made by a human.
Expand Down
13 changes: 10 additions & 3 deletions google/cloud/documentai/v1beta3/document_io.proto
Expand Up @@ -34,6 +34,13 @@ message RawDocument {
// An IANA MIME type (RFC6838) indicating the nature and format of the
// [content][google.cloud.documentai.v1beta3.RawDocument.content].
string mime_type = 2;

// The display name of the document. It supports all Unicode characters except
// the following:
// `*`, `?`, `[`, `]`, `%`, `{`, `}`, `'`, `\"`, `,`
// `~`, `=` and `:` are reserved.
// If not specified, a default ID is generated.
string display_name = 3;
}

// Specifies a document stored on Cloud Storage.
Expand Down Expand Up @@ -117,8 +124,8 @@ message OcrConfig {

// Configurations for premium OCR features.
message PremiumFeatures {
// Turn on selection mark detector in OCR engine. Only available in OCR 2.0+
// processors.
// Turn on selection mark detector in OCR engine. Only available in OCR 2.0
// (and later) processors.
bool enable_selection_mark_detection = 3;

// Turn on font identification model and return font style information.
Expand Down Expand Up @@ -159,7 +166,7 @@ message OcrConfig {
bool compute_style_info = 8 [deprecated = true];

// Turn off character box detector in OCR engine. Character box detection is
// enabled by default in OCR 2.0+ processors.
// enabled by default in OCR 2.0 (and later) processors.
bool disable_character_boxes_detection = 10;

// Configurations for premium OCR features.
Expand Down
38 changes: 25 additions & 13 deletions google/cloud/documentai/v1beta3/document_processor_service.proto
Expand Up @@ -348,16 +348,19 @@ message ProcessOptions {
repeated int32 pages = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A subset of pages to process. If not specified, all pages will be
// processed. NOTICE: If any of the page range is set, we will extract and
// process only the given pages from the document. In the output document,
// the page_number is referring to the page number in the original document.
// A subset of pages to process. If not specified, all pages are processed.
// If a page range is set, only the given pages are extracted and processed
// from the document. In the output document,
// [Document.Page.page_number][google.cloud.documentai.v1beta3.Document.Page.page_number]
// refers to the page number in the original document. This configuration
// only applies to sync requests. `page_range` can be only one of the
// following:
oneof page_range {
// Which pages to process (1-indexed).
IndividualPageSelector individual_page_selector = 5;

// Only process certain pages from the start, process all if the document
// has less pages.
// Only process certain pages from the start. Process all if the document
// has fewer pages.
int32 from_start = 6;

// Only process certain pages from the end, same as above.
Expand All @@ -367,6 +370,13 @@ message ProcessOptions {
// Only applicable to `OCR_PROCESSOR`. Returns error if set on other
// processor types.
OcrConfig ocr_config = 1;

// Optional. Override the schema of the
// [ProcessorVersion][google.cloud.documentai.v1beta3.ProcessorVersion]. Will
// return an Invalid Argument error if this field is set when the underlying
// [ProcessorVersion][google.cloud.documentai.v1beta3.ProcessorVersion]
// doesn't support schema override.
DocumentSchema schema_override = 8 [(google.api.field_behavior) = OPTIONAL];
}

// Request message for the
Expand Down Expand Up @@ -1003,8 +1013,8 @@ message TrainProcessorVersionRequest {
// Options to control the training of the Custom Document Extraction (CDE)
// Processor.
message CustomDocumentExtractionOptions {
// Training Method for CDE. TRAINING_METHOD_UNSPECIFIED will fallback to
// MODEL_BASED.
// Training Method for CDE. `TRAINING_METHOD_UNSPECIFIED` will fall back to
// `MODEL_BASED`.
enum TrainingMethod {
TRAINING_METHOD_UNSPECIFIED = 0;

Expand Down Expand Up @@ -1289,9 +1299,9 @@ message ListEvaluationsResponse {

// The request message for the
// [ImportProcessorVersion][google.cloud.documentai.v1beta3.DocumentProcessorService.ImportProcessorVersion]
// method. Requirements:
// method.
//
// - The Document AI [Service
// The Document AI [Service
// Agent](https://cloud.google.com/iam/docs/service-agents) of the destination
// project must have [Document AI Editor
// role](https://cloud.google.com/document-ai/docs/access-control/iam-roles) on
Expand All @@ -1300,8 +1310,10 @@ message ListEvaluationsResponse {
// The destination project is specified as part of the
// [parent][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.parent]
// field. The source project is specified as part of the
// [source][ImportProcessorVersionRequest.processor_version_source or
// ImportProcessorVersionRequest.external_processor_version_source] field.
// [source][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.processor_version_source]
// or
// [external_processor_version_source][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.external_processor_version_source]
// field.
message ImportProcessorVersionRequest {
// The external source processor version.
message ExternalProcessorVersionSource {
Expand All @@ -1321,7 +1333,7 @@ message ImportProcessorVersionRequest {
type: "documentai.googleapis.com/ProcessorVersion"
}];

// The source processor version to import from, and can be from different
// The source processor version to import from. It can be from a different
// environment and region than the destination processor.
ExternalProcessorVersionSource external_processor_version_source = 3;
}
Expand Down
45 changes: 45 additions & 0 deletions google/cloud/documentai/v1beta3/document_schema.proto
Expand Up @@ -24,10 +24,55 @@ option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Metadata for document summarization: the desired length and output format
// of the summary.
message SummaryOptions {
// Desired length of the summary.
enum Length {
// Default (zero) value, used when no length is set.
LENGTH_UNSPECIFIED = 0;

// A brief summary of one or two sentences.
BRIEF = 1;

// A paragraph-length summary.
MODERATE = 2;

// The longest option available.
COMPREHENSIVE = 3;
}

// Desired output layout of the summary.
enum Format {
// Default (zero) value, used when no format is set.
FORMAT_UNSPECIFIED = 0;

// Format the output in paragraphs.
PARAGRAPH = 1;

// Format the output in bullets.
BULLETS = 2;
}

// How long the summary should be.
Length length = 1;

// The format the summary should be in.
Format format = 2;
}

// Metadata for how this field value is extracted.
message FieldExtractionMetadata {
// Summary options config: the length and format to use when the field value
// is produced by summarization.
SummaryOptions summary_options = 2;
}

// Metadata about a property.
message PropertyMetadata {
// Whether the property should be considered as "inactive".
bool inactive = 3;

// Field extraction metadata on the property, describing how the property's
// value is extracted.
FieldExtractionMetadata field_extraction_metadata = 9;
}

// Metadata about an entity type.
Expand Down
127 changes: 120 additions & 7 deletions google/cloud/documentai/v1beta3/document_service.proto
Expand Up @@ -78,6 +78,15 @@ service DocumentService {
option (google.api.method_signature) = "dataset";
}

// Returns a list of documents present in the dataset.
//
// Results are paginated: use `page_size` and `page_token` on the request and
// `next_page_token` on the response to walk through the pages.
rpc ListDocuments(ListDocumentsRequest) returns (ListDocumentsResponse) {
// Bound as a custom `:listDocuments` POST verb on the dataset resource; the
// whole request message is sent as the HTTP body.
option (google.api.http) = {
post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:listDocuments"
body: "*"
};
option (google.api.method_signature) = "dataset";
}

// Deletes a set of documents.
rpc BatchDeleteDocuments(BatchDeleteDocumentsRequest)
returns (google.longrunning.Operation) {
Expand Down Expand Up @@ -114,7 +123,6 @@ service DocumentService {
// referred to as splits: train, test.
enum DatasetSplitType {
// Default value if the enum is not set.
// go/protodosdonts#do-include-an-unspecified-value-in-an-enum
DATASET_SPLIT_TYPE_UNSPECIFIED = 0;

// Identifies the train documents.
Expand All @@ -127,6 +135,21 @@ enum DatasetSplitType {
DATASET_SPLIT_UNASSIGNED = 3;
}

// Describes the labeling status of a document.
enum DocumentLabelingState {
// Default value if the enum is not set.
DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;

// Document has been labeled.
DOCUMENT_LABELED = 1;

// Document has not been labeled.
DOCUMENT_UNLABELED = 2;

// Document has been auto-labeled.
DOCUMENT_AUTO_LABELED = 3;
}

message UpdateDatasetRequest {
// Required. The `name` field of the `Dataset` is used to identify the
// resource to be updated.
Expand All @@ -137,7 +160,7 @@ message UpdateDatasetRequest {
}

message UpdateDatasetOperationMetadata {
// The basic metadata of the long running operation.
// The basic metadata of the long-running operation.
CommonOperationMetadata common_metadata = 1;
}

Expand Down Expand Up @@ -198,9 +221,9 @@ message ImportDocumentsMetadata {
DocumentId output_document_id = 4;
}

// The validation status of each import config. Status is set to errors if
// there is no documents to import in the import_config, or OK if the
// operation will try to proceed at least one document.
// The validation status of each import config. Status is set to an error if
// there are no documents to import in the `import_config`, or `OK` if the
// operation will try to proceed with at least one document.
message ImportConfigValidationResult {
// The source Cloud Storage URI specified in the import config.
string input_gcs_source = 1;
Expand All @@ -209,7 +232,7 @@ message ImportDocumentsMetadata {
google.rpc.Status status = 2;
}

// The basic metadata of the long running operation.
// The basic metadata of the long-running operation.
CommonOperationMetadata common_metadata = 1;

// The list of response details of each document.
Expand Down Expand Up @@ -249,6 +272,78 @@ message GetDocumentResponse {
Document document = 1;
}

// Request message for the `ListDocuments` method.
message ListDocumentsRequest {
// Required. The resource name of the dataset to be listed.
// Format:
// projects/{project}/locations/{location}/processors/{processor}/dataset
string dataset = 1 [
(google.api.field_behavior) = REQUIRED,
(google.api.resource_reference) = {
type: "documentai.googleapis.com/Dataset"
}
];

// The maximum number of documents to return. The service may return
// fewer than this value.
// If unspecified, at most 20 documents will be returned.
// The maximum value is 100; values above 100 will be coerced to 100.
int32 page_size = 2;

// A page token, received from a previous `ListDocuments` call.
// Provide this to retrieve the subsequent page.
//
// When paginating, all other parameters provided to `ListDocuments`
// must match the call that provided the page token.
string page_token = 3;

// Optional. Query to filter the documents based on
// https://google.aip.dev/160.
// ## Currently supported query strings are:
//
// - `SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`
// - `LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`
// - `DisplayName=\"file_name.pdf\"`
// - `EntityType=abc/def`
// - `TagName=\"auto-labeling-running\"|\"sampled\"`
//
// Note:
// - Only `AND`, `=` and `!=` are supported.
// e.g. `DisplayName=file_name AND EntityType!=abc` IS supported.
// - Wildcard `*` is supported only in `DisplayName` filter.
// - No duplicate filter keys are allowed,
// e.g. `EntityType=a AND EntityType=b` is NOT supported.
// - String match is case sensitive (for filter `DisplayName` & `EntityType`).
string filter = 4 [(google.api.field_behavior) = OPTIONAL];

// Optional. Controls if the `ListDocuments` request requires a total size
// of matched documents. See `ListDocumentsResponse.total_size`.
//
// Enabling this flag may adversely impact performance.
//
// Defaults to false.
bool return_total_size = 6 [(google.api.field_behavior) = OPTIONAL];

// Optional. Number of results to skip beginning from the `page_token` if
// provided. https://google.aip.dev/158#skipping-results. It must be a
// non-negative integer. Negative values will be rejected. Note that this is
// not the number of pages to skip. If this value causes the cursor to move
// past the end of results, `ListDocumentsResponse.document_metadata` and
// `ListDocumentsResponse.next_page_token` will be empty.
int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];

// NOTE(review): field numbers 5 and 7 are not used in this message. Confirm
// whether they belong to removed or non-public fields and should be declared
// `reserved`.
}

// Response message for the `ListDocuments` method.
message ListDocumentsResponse {
// Document metadata corresponding to the listed documents.
repeated DocumentMetadata document_metadata = 1;

// A token, which can be sent as `page_token` to retrieve the next page.
// If this field is omitted, there are no subsequent pages.
string next_page_token = 2;

// Total count of documents queried.
// NOTE(review): presumably only populated when
// `ListDocumentsRequest.return_total_size` is set to true; confirm against
// the service behavior.
int32 total_size = 3;
}

message BatchDeleteDocumentsRequest {
// Required. The dataset resource name.
// Format:
Expand Down Expand Up @@ -276,7 +371,7 @@ message BatchDeleteDocumentsMetadata {
google.rpc.Status status = 2;
}

// The basic metadata of the long running operation.
// The basic metadata of the long-running operation.
CommonOperationMetadata common_metadata = 1;

// The list of response details of each document.
Expand Down Expand Up @@ -323,3 +418,21 @@ message DocumentPageRange {
// Last page number (one-based index) to be returned.
int32 end = 2;
}

// Metadata about a document.
message DocumentMetadata {
// Document identifier.
DocumentId document_id = 1;

// Number of pages in the document.
int32 page_count = 2;

// Type of the dataset split to which the document belongs.
DatasetSplitType dataset_type = 3;

// Labeling state of the document.
DocumentLabelingState labeling_state = 5;

// The display name of the document.
string display_name = 6;

// NOTE(review): field number 4 is not used in this message. Confirm whether
// it belongs to a removed or non-public field and should be declared
// `reserved`.
}
4 changes: 0 additions & 4 deletions google/cloud/documentai/v1beta3/documentai_v1beta3.yaml
Expand Up @@ -102,10 +102,6 @@ publishing:
github_label: 'api: documentai'
organization: CLOUD
library_settings:
- version: google.cloud.documentai.v1
dotnet_settings:
ignored_resources:
- documentai.googleapis.com/Location
- version: google.cloud.documentai.v1beta3
dotnet_settings:
ignored_resources:
Expand Down

0 comments on commit cac736f

Please sign in to comment.