From f95bbeab818f37a9885f6025af04ad102e3e2b25 Mon Sep 17 00:00:00 2001 From: Holt Skinner <13262395+holtskinner@users.noreply.github.com> Date: Fri, 7 Apr 2023 11:53:37 -0500 Subject: [PATCH] feat: Add Import Document from Batch Process Metadata & Operation (#88) * feat: Add utility functions for GCS URIs - Updates to types to fix lint errors - Add `files_to_display` optional parameter to `print_document_tree()` - Other formatting improvements for docs. * test: Add tests for gcs_uri functions. * feat: Add Import Document from batch process metadata * refactor: Moved `_get_storage_client` and `get_bytes` to utilities - Resolves Circular dependency * test: Attempt to fix mock patch * test: Attempt to fix test import errors * Change utility import in converter_helpers.py * Update utilities import * Added Inline samples for `from_document_path()` and `from_documentai_document()` * feat: Add utility functions for GCS URIs - Updates to types to fix lint errors - Add `files_to_display` optional parameter to `print_document_tree()` - Other formatting improvements for docs. * test: Add tests for gcs_uri functions. * feat: Add Import Document from batch process metadata * refactor: Moved `_get_storage_client` and `get_bytes` to utilities - Resolves Circular dependency * test: Attempt to fix mock patch * test: Attempt to fix test import errors * Change utility import in converter_helpers.py * Update utilities import * Added Inline samples for `from_document_path()` and `from_documentai_document()` * test: Add check for Failed BatchProcessMetadata * fix: Update imports based on Gal's feedback * refactor: Rename `utilities.py` to `gcs_utilities.py` * Add alias for gcs_utilities in `__init__.py` * Update mock.patch for gcs_utilities in `test_converter.py` * Removed alias for gcs_utilities. 
Changed Samples to follow * Added `Document.from_batch_process_operation()` - Gets operation information and passes it to `from_batch_process_metadata()` * Fixed mock.patch for `get_bytes_images_mock` * Remove underscore from `get_bytes` in `get_bytes_images_mock` --- docs/documentai_toolbox/utilities.rst | 2 +- google/cloud/documentai_toolbox/__init__.py | 4 +- .../converters/config/converter_helpers.py | 11 +- .../{utilities.py => gcs_utilities.py} | 122 +++++++++- .../documentai_toolbox/wrappers/document.py | 211 +++++++++++++----- .../cloud/documentai_toolbox/wrappers/page.py | 16 +- samples/snippets/create_batches_sample.py | 4 +- samples/snippets/quickstart_sample.py | 4 +- tests/unit/test_converter.py | 2 +- tests/unit/test_converter_helpers.py | 13 +- tests/unit/test_document.py | 170 ++++++++++---- tests/unit/test_utilities.py | 107 ++++++--- 12 files changed, 500 insertions(+), 166 deletions(-) rename google/cloud/documentai_toolbox/utilities/{utilities.py => gcs_utilities.py} (60%) diff --git a/docs/documentai_toolbox/utilities.rst b/docs/documentai_toolbox/utilities.rst index d6ecbe9f..fe9aff12 100644 --- a/docs/documentai_toolbox/utilities.rst +++ b/docs/documentai_toolbox/utilities.rst @@ -1,7 +1,7 @@ Document AI Toolbox Utilities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: google.cloud.documentai_toolbox.utilities.utilities +.. 
automodule:: google.cloud.documentai_toolbox.utilities.gcs_utilities :members: :private-members: :noindex: diff --git a/google/cloud/documentai_toolbox/__init__.py b/google/cloud/documentai_toolbox/__init__.py index 6c93a1c8..a92b2dfe 100644 --- a/google/cloud/documentai_toolbox/__init__.py +++ b/google/cloud/documentai_toolbox/__init__.py @@ -29,7 +29,7 @@ ) from .utilities import ( - utilities, + gcs_utilities, ) -__all__ = (document, page, entity, converter, utilities) +__all__ = (document, page, entity, converter, gcs_utilities) diff --git a/google/cloud/documentai_toolbox/converters/config/converter_helpers.py b/google/cloud/documentai_toolbox/converters/config/converter_helpers.py index b6788d71..9a83ddb8 100644 --- a/google/cloud/documentai_toolbox/converters/config/converter_helpers.py +++ b/google/cloud/documentai_toolbox/converters/config/converter_helpers.py @@ -28,7 +28,9 @@ _load_blocks_from_schema, ) -from google.cloud.documentai_toolbox import document, constants +from google.cloud.documentai_toolbox import constants +from google.cloud.documentai_toolbox.utilities import gcs_utilities + from google.cloud import documentai, storage @@ -86,7 +88,6 @@ def _get_entity_content( entity_id = 0 for block in blocks: - docai_entity = documentai.Document.Entity() if block.confidence: docai_entity.confidence = block.confidence @@ -233,7 +234,7 @@ def _get_bytes( """ - storage_client = document._get_storage_client() + storage_client = gcs_utilities._get_storage_client() bucket = storage_client.bucket(bucket_name=bucket_name) blobs = storage_client.list_blobs(bucket_or_name=bucket_name, prefix=prefix) @@ -287,7 +288,7 @@ def _upload_file( None. 
""" - storage_client = document._get_storage_client() + storage_client = gcs_utilities._get_storage_client() bucket = storage_client.bucket(bucket_name) blob = bucket.blob(output_prefix) @@ -494,7 +495,7 @@ def _convert_documents_with_config( if file_check: raise ValueError("gcs_prefix cannot contain file types") - storage_client = document._get_storage_client() + storage_client = gcs_utilities._get_storage_client() blob_list = storage_client.list_blobs(input_bucket, prefix=input_prefix) diff --git a/google/cloud/documentai_toolbox/utilities/utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py similarity index 60% rename from google/cloud/documentai_toolbox/utilities/utilities.py rename to google/cloud/documentai_toolbox/utilities/gcs_utilities.py index 51ec8e75..306ae415 100644 --- a/google/cloud/documentai_toolbox/utilities/utilities.py +++ b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py @@ -16,18 +16,113 @@ """Document AI utilities.""" import os import re -from typing import Dict, List, Optional +from typing import Dict, List, Tuple +from google.api_core import client_info from google.cloud import documentai +from google.cloud import storage +from google.cloud import documentai_toolbox from google.cloud.documentai_toolbox import constants -from google.cloud.documentai_toolbox.wrappers.document import _get_storage_client + + +def _get_storage_client(): + r"""Returns a Storage client with custom user agent header. + + Returns: + storage.Client. + + """ + user_agent = f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}" + + info = client_info.ClientInfo( + client_library_version=documentai_toolbox.__version__, + user_agent=user_agent, + ) + + return storage.Client(client_info=info) + + +def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]: + r"""Returns a list of bytes of json files from Cloud Storage. + + Args: + gcs_bucket_name (str): + Required. The name of the gcs bucket. 
+ + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`. + gcs_prefix (str): + Required. The prefix of the json files in the target_folder + + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. + Returns: + List[bytes]: + A list of bytes. + + """ + result = [] + + storage_client = _get_storage_client() + blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix) + + for blob in blob_list: + if ( + blob.name.endswith(constants.JSON_EXTENSION) + or blob.content_type == constants.JSON_MIMETYPE + ): + result.append(blob.download_as_bytes()) + + return result + + +def split_gcs_uri(gcs_uri: str) -> Tuple[str, str]: + r"""Splits a Cloud Storage uri into the bucket_name and prefix. + + Args: + gcs_uri (str): + Required. The full Cloud Storage URI. + + Format: `gs://{bucket_name}/{gcs_prefix}`. + Returns: + Tuple[str, str]: + The Cloud Storage Bucket and Prefix. + + """ + matches = re.match("gs://(.*?)/(.*)", gcs_uri) + + if not matches: + raise ValueError( + "gcs_uri must follow format 'gs://{bucket_name}/{gcs_prefix}'." + ) + bucket, prefix = matches.groups() + return str(bucket), str(prefix) + + +def create_gcs_uri(gcs_bucket_name: str, gcs_prefix: str) -> str: + r"""Creates a Cloud Storage uri from the bucket_name and prefix. + + Args: + gcs_bucket_name (str): + Required. The name of the gcs bucket. + + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`. + gcs_prefix (str): + Required. The prefix of the files in the target_folder. + + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. + Returns: + str + The full Cloud Storage uri. 
+ Format: `gs://{gcs_bucket_name}/{gcs_prefix}` + + """ + return f"gs://{gcs_bucket_name}/{gcs_prefix}" def list_gcs_document_tree( gcs_bucket_name: str, gcs_prefix: str ) -> Dict[str, List[str]]: - r"""Returns a list path to files in Cloud Storage folder and prints the tree to terminal. + r"""Returns a list of paths to files in a Cloud Storage folder. Args: gcs_bucket_name (str): @@ -64,8 +159,10 @@ def list_gcs_document_tree( return path_list -def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None: - r"""Prints a tree of filenames in Cloud Storage folder.. +def print_gcs_document_tree( + gcs_bucket_name: str, gcs_prefix: str, files_to_display: int = 4 +) -> None: + r"""Prints a tree of filenames in a Cloud Storage folder. Args: gcs_bucket_name (str): @@ -76,13 +173,14 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None: Required. The prefix of the json files in the target_folder. Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. + files_to_display (int): + Optional. The number of files to display. Default is `4`. Returns: None.
""" FILENAME_TREE_MIDDLE = "├──" FILENAME_TREE_LAST = "└──" - FILES_TO_DISPLAY = 4 path_list = list_gcs_document_tree( gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix @@ -93,18 +191,18 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None: dir_size = len(files) for idx, file_name in enumerate(files): if idx == dir_size - 1: - if dir_size > FILES_TO_DISPLAY: + if dir_size > files_to_display: print("│ ....") print(f"{FILENAME_TREE_LAST}{file_name}\n") break - if idx <= FILES_TO_DISPLAY: + if idx <= files_to_display: print(f"{FILENAME_TREE_MIDDLE}{file_name}") def create_batches( gcs_bucket_name: str, gcs_prefix: str, - batch_size: Optional[int] = constants.BATCH_MAX_FILES, + batch_size: int = constants.BATCH_MAX_FILES, ) -> List[documentai.BatchDocumentsInputConfig]: """Create batches of documents in Cloud Storage to process with `batch_process_documents()`. @@ -117,7 +215,7 @@ def create_batches( Required. The prefix of the json files in the `target_folder` Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`. - batch_size (Optional[int]): + batch_size (int): Optional. Size of each batch of documents. Default is `50`. Returns: @@ -143,7 +241,7 @@ def create_batches( print(f"Skipping file {blob.name}. Invalid Mime Type {blob.content_type}.") continue - if blob.size > constants.BATCH_MAX_FILE_SIZE: + if int(blob.size) > constants.BATCH_MAX_FILE_SIZE: print( f"Skipping file {blob.name}. File size must be less than {constants.BATCH_MAX_FILE_SIZE} bytes. File size is {blob.size} bytes." 
) @@ -159,7 +257,7 @@ def create_batches( batch.append( documentai.GcsDocument( - gcs_uri=f"gs://{gcs_bucket_name}/{blob.name}", + gcs_uri=create_gcs_uri(gcs_bucket_name, blob.name), mime_type=blob.content_type, ) ) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 7eb9a639..89619c72 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -20,13 +20,15 @@ import re from typing import Dict, List, Optional -from google.api_core import client_info +from google.api_core.client_options import ClientOptions + from google.cloud import bigquery from google.cloud import documentai -from google.cloud import storage -from google.cloud import documentai_toolbox from google.cloud.documentai_toolbox import constants + +from google.cloud.documentai_toolbox.utilities import gcs_utilities + from google.cloud.documentai_toolbox.wrappers.page import Page from google.cloud.documentai_toolbox.wrappers.page import FormField from google.cloud.documentai_toolbox.wrappers.entity import Entity @@ -42,6 +44,8 @@ PageInfo, ) +from google.longrunning.operations_pb2 import Operation, GetOperationRequest + from pikepdf import Pdf @@ -94,55 +98,6 @@ def _pages_from_shards(shards: List[documentai.Document]) -> List[Page]: return result -def _get_storage_client(): - r"""Returns a Storage client with custom user agent header. - - Returns: - storage.Client. - - """ - user_agent = f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}" - - info = client_info.ClientInfo( - client_library_version=documentai_toolbox.__version__, - user_agent=user_agent, - ) - - return storage.Client(client_info=info) - - -def _get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]: - r"""Returns a list of bytes of json files from Cloud Storage. - - Args: - gcs_bucket_name (str): - Required. The name of the gcs bucket. 
- - Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`. - gcs_prefix (str): - Required. The prefix of the json files in the target_folder - - Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. - Returns: - List[bytes]: - A list of bytes. - - """ - result = [] - - storage_client = _get_storage_client() - blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix) - - for blob in blob_list: - if ( - blob.name.endswith(constants.JSON_EXTENSION) - or blob.content_type == constants.JSON_MIMETYPE - ): - result.append(blob.download_as_bytes()) - - return result - - def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Document]: r"""Returns a list of documentai.Document shards from a Cloud Storage folder. @@ -167,7 +122,7 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume if file_check is not None: raise ValueError("gcs_prefix cannot contain file types") - byte_array = _get_bytes(gcs_bucket_name, gcs_prefix) + byte_array = gcs_utilities.get_bytes(gcs_bucket_name, gcs_prefix) for byte in byte_array: shards.append(documentai.Document.from_json(byte, ignore_unknown_fields=True)) @@ -231,14 +186,62 @@ def _convert_to_vision_annotate_file_response(text: str, pages: List[page.Page]) return vision_file_response +def _get_batch_process_metadata( + location: str, operation_name: str +) -> documentai.BatchProcessMetadata: + r"""Get `BatchProcessMetadata` from a `batch_process_documents()` long-running operation. + + Args: + location (str): + Required. The location of the processor used for `batch_process_documents()`. + + operation_name (str): + Required. The fully qualified operation name for a `batch_process_documents()` operation. + Returns: + documentai.BatchProcessMetadata: + Metadata from batch process. 
+ """ + client = documentai.DocumentProcessorServiceClient( + client_options=ClientOptions( + api_endpoint=f"{location}-documentai.googleapis.com" + ) + ) + + while True: + operation: Operation = client.get_operation( + request=GetOperationRequest(name=operation_name) + ) + + if operation.done: + break + + if not operation.metadata: + raise ValueError(f"Operation does not contain metadata: {operation}") + + metadata_type = ( + "type.googleapis.com/google.cloud.documentai.v1.BatchProcessMetadata" + ) + + if not operation.metadata.type_url or operation.metadata.type_url != metadata_type: + raise ValueError( + f"Operation metadata type is not `{metadata_type}`. Type is `{operation.metadata.type_url}`." + ) + + metadata: documentai.BatchProcessMetadata = ( + documentai.BatchProcessMetadata.deserialize(operation.metadata.value) + ) + + return metadata + + @dataclasses.dataclass class Document: - r"""Represents a wrapped Document. + r"""Represents a wrapped `Document`. - This class hides away the complexities of using Document protobuf - response outputted by BatchProcessDocuments or ProcessDocument + This class hides away the complexities of using `Document` protobuf + response outputted by `BatchProcessDocuments` or `ProcessDocument` methods and implements convenient methods for searching and - extracting information within the Document. + extracting information within the `Document`. 
Attributes: shards: (List[google.cloud.documentai.Document]): @@ -263,6 +266,7 @@ class Document: shards: List[documentai.Document] = dataclasses.field(repr=False) gcs_bucket_name: Optional[str] = dataclasses.field(default=None, repr=False) gcs_prefix: Optional[str] = dataclasses.field(default=None, repr=False) + gcs_input_uri: Optional[str] = dataclasses.field(default=None, repr=False) pages: List[Page] = dataclasses.field(init=False, repr=False) entities: List[Entity] = dataclasses.field(init=False, repr=False) @@ -280,6 +284,13 @@ def from_document_path( ): r"""Loads Document from local document_path. + .. code-block:: python + + from google.cloud.documentai_toolbox import document + + document_path = "/path/to/local/file.json" + wrapped_document = document.Document.from_document_path(document_path) + Args: document_path (str): Required. The path to the document.json file. @@ -300,6 +311,14 @@ def from_documentai_document( ): r"""Loads Document from local documentai_document. + .. code-block:: python + + from google.cloud import documentai + from google.cloud.documentai_toolbox import document + + documentai_document = client.process_document(request).document + wrapped_document = document.Document.from_documentai_document(documentai_document) + Args: documentai_document (documentai.Document): Optional. The Document.proto response. @@ -311,7 +330,7 @@ def from_documentai_document( return cls(shards=[documentai_document]) @classmethod - def from_gcs(cls, gcs_bucket_name: str, gcs_prefix: str): + def from_gcs(cls, gcs_bucket_name: str, gcs_prefix: str, gcs_input_uri: str = None): r"""Loads Document from Cloud Storage. Args: @@ -323,13 +342,87 @@ def from_gcs(cls, gcs_bucket_name: str, gcs_prefix: str): Required. The prefix to the location of the target folder. Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}` where gcs_prefix=`{optional_folder}/{target_folder}`. + gcs_input_uri (str): + Optional. The gcs uri to the original input file.
+ + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/{file_name}.pdf` Returns: Document: A document from gcs. """ shards = _get_shards(gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix) return cls( - shards=shards, gcs_prefix=gcs_prefix, gcs_bucket_name=gcs_bucket_name + shards=shards, + gcs_bucket_name=gcs_bucket_name, + gcs_prefix=gcs_prefix, + gcs_input_uri=gcs_input_uri, + ) + + @classmethod + def from_batch_process_metadata(cls, metadata: documentai.BatchProcessMetadata): + r"""Loads Documents from Cloud Storage, using the output from `BatchProcessMetadata`. + + .. code-block:: python + + from google.cloud import documentai + + operation = client.batch_process_documents(request) + operation.result(timeout=timeout) + metadata = documentai.BatchProcessMetadata(operation.metadata) + + Args: + metadata (documentai.BatchProcessMetadata): + Required. The operation metadata after a `batch_process_documents()` operation completes. + + Returns: + List[Document]: + A list of wrapped documents from gcs. Each document corresponds to an input file. + """ + if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED: + raise ValueError(f"Batch Process Failed: {metadata.state_message}") + + documents: List[Document] = [] + # Each process corresponds to one input document + for process in list(metadata.individual_process_statuses): + # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/ + gcs_bucket_name, gcs_prefix = gcs_utilities.split_gcs_uri( + process.output_gcs_destination + ) + + documents.append( + Document.from_gcs( + gcs_bucket_name, gcs_prefix, gcs_input_uri=process.input_gcs_source + ) + ) + + return documents + + @classmethod + def from_batch_process_operation(cls, location: str, operation_name: str): + r"""Loads Documents from Cloud Storage, using the operation name returned from `batch_process_documents()`. + + .. 
code-block:: python + + from google.cloud import documentai + + operation = client.batch_process_documents(request) + operation_name = operation.operation.name + + Args: + location (str): + Required. The location of the processor used for `batch_process_documents()`. + + operation_name (str): + Required. The fully qualified operation name for a `batch_process_documents()` operation. + + Returns: + List[Document]: + A list of wrapped documents from gcs. Each document corresponds to an input file. + """ + return cls.from_batch_process_metadata( + metadata=_get_batch_process_metadata( + location=location, operation_name=operation_name + ) ) def search_pages( diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index f740873b..c3116500 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -109,10 +109,10 @@ def _table_wrapper_from_documentai_table( """ header_rows = _table_rows_from_documentai_table_rows( - table_rows=documentai_table.header_rows, text=text + table_rows=list(documentai_table.header_rows), text=text ) body_rows = _table_rows_from_documentai_table_rows( - table_rows=documentai_table.body_rows, text=text + table_rows=list(documentai_table.body_rows), text=text ) return Table( @@ -292,7 +292,7 @@ def _get_form_fields( def _table_rows_from_documentai_table_rows( table_rows: List[documentai.Document.Page.Table.TableRow], text: str -) -> List[str]: +) -> List[List[str]]: r"""Returns a list of rows from table_rows. Args: @@ -303,17 +303,19 @@ def _table_rows_from_documentai_table_rows( from the document. Returns: - List[str]: + List[List[str]]: A list of table rows. 
""" - body_rows = [] + body_rows: List[List[str]] = [] for row in table_rows: row_text = [] for cell in row.cells: - row_text.append(_text_from_layout(layout=cell.layout, text=text)) + row_text.append( + _text_from_layout(layout=cell.layout, text=text).replace("\n", "") + ) - body_rows.append([x.replace("\n", "") for x in row_text]) + body_rows.append(row_text) return body_rows diff --git a/samples/snippets/create_batches_sample.py b/samples/snippets/create_batches_sample.py index 0847c170..6700f569 100644 --- a/samples/snippets/create_batches_sample.py +++ b/samples/snippets/create_batches_sample.py @@ -17,7 +17,7 @@ # [START documentai_toolbox_create_batches] from google.cloud import documentai -from google.cloud.documentai_toolbox import utilities +from google.cloud.documentai_toolbox import gcs_utilities # TODO(developer): Uncomment these variables before running the sample. # Given unprocessed documents in path gs://bucket/path/to/folder @@ -32,7 +32,7 @@ def create_batches_sample( batch_size: int = 50, ) -> None: # Creating batches of documents for processing - batches = utilities.create_batches( + batches = gcs_utilities.create_batches( gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix, batch_size=batch_size ) diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py index c3e41670..33ff8c0f 100644 --- a/samples/snippets/quickstart_sample.py +++ b/samples/snippets/quickstart_sample.py @@ -17,7 +17,7 @@ # [START documentai_toolbox_quickstart] from google.cloud.documentai_toolbox import document -from google.cloud.documentai_toolbox import utilities +from google.cloud.documentai_toolbox import gcs_utilities # TODO(developer): Uncomment these variables before running the sample. 
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder @@ -27,7 +27,7 @@ def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None: print("Document structure in Cloud Storage") - utilities.print_gcs_document_tree( + gcs_utilities.print_gcs_document_tree( gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix ) diff --git a/tests/unit/test_converter.py b/tests/unit/test_converter.py index 0be99429..0a5410e6 100644 --- a/tests/unit/test_converter.py +++ b/tests/unit/test_converter.py @@ -22,7 +22,7 @@ from google.cloud.documentai_toolbox.converters import converter -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") @mock.patch( "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_docproto_files", return_value=(["file1"], ["test_label"], []), diff --git a/tests/unit/test_converter_helpers.py b/tests/unit/test_converter_helpers.py index 845505c1..dfafc285 100644 --- a/tests/unit/test_converter_helpers.py +++ b/tests/unit/test_converter_helpers.py @@ -1,3 +1,4 @@ +# pylint: disable=protected-access # -*- coding: utf-8 -*- # Copyright 2023 Google LLC # @@ -210,7 +211,7 @@ def test_convert_to_docproto_with_config_with_error_and_retry(mock_ocr, capfd): assert "Could Not Convert test_document" in out -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_get_bytes(mock_storage): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -251,7 +252,7 @@ def test_get_bytes(mock_storage): ] -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_get_bytes_with_error(mock_storage): with pytest.raises(Exception, match="Fail"): client = mock_storage.Client.return_value @@ -272,7 +273,7 @@ def 
test_get_bytes_with_error(mock_storage): ) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_upload_file(mock_storage): client = mock_storage.Client.return_value @@ -284,7 +285,7 @@ def test_upload_file(mock_storage): ) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") @mock.patch( "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_bytes", return_value="file_bytes", @@ -324,7 +325,6 @@ def test_get_files(mock_storage, mock_get_bytes): "google.cloud.documentai_toolbox.converters.config.converter_helpers._convert_to_docproto_with_config", ) def test_get_docproto_files(mocked_convert_docproto): - mock_result = mock.Mock() mock_result.result.return_value = [ "annotated_bytes", @@ -367,7 +367,6 @@ def test_get_docproto_files(mocked_convert_docproto): "google.cloud.documentai_toolbox.converters.config.converter_helpers._convert_to_docproto_with_config", ) def test_get_docproto_files_with_no_docproto(mocked_convert_docproto): - mock_result = mock.Mock() mock_result.result.return_value = [ "annotated_bytes", @@ -425,7 +424,7 @@ def test_upload_with_file_error(): converter_helpers._upload(files, gcs_output_path="gs://output/path.json") -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") @mock.patch( "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_docproto_files", return_value=(["file1"], ["test_label"], ["document_2"]), diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index dd9e412b..0b358ab6 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -27,6 +27,7 @@ import glob from google.cloud.documentai_toolbox import document +from google.cloud.documentai_toolbox import 
gcs_utilities from google.cloud import documentai from google.cloud.vision import AnnotateFileResponse @@ -43,42 +44,49 @@ def get_bytes(file_name): @pytest.fixture def get_bytes_single_file_mock(): - with mock.patch.object(document, "_get_bytes") as byte_factory: + with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: byte_factory.return_value = get_bytes("tests/unit/resources/0") yield byte_factory @pytest.fixture def get_bytes_multiple_files_mock(): - with mock.patch.object(document, "_get_bytes") as byte_factory: + with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: byte_factory.return_value = get_bytes("tests/unit/resources/1") yield byte_factory @pytest.fixture def get_bytes_unordered_files_mock(): - with mock.patch.object(document, "_get_bytes") as byte_factory: + with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: byte_factory.return_value = get_bytes("tests/unit/resources/unordered_shards") yield byte_factory +@pytest.fixture(params=["tests/unit/resources/0", "tests/unit/resources/1"]) +def get_bytes_multiple_directories_mock(request): + with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: + byte_factory.return_value = get_bytes(request.param) + yield byte_factory + + @pytest.fixture def get_bytes_form_parser_mock(): - with mock.patch.object(document, "_get_bytes") as byte_factory: + with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: byte_factory.return_value = get_bytes("tests/unit/resources/form_parser") yield byte_factory @pytest.fixture def get_bytes_splitter_mock(): - with mock.patch.object(document, "_get_bytes") as byte_factory: + with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: byte_factory.return_value = get_bytes("tests/unit/resources/splitter") yield byte_factory @pytest.fixture def get_bytes_images_mock(): - with mock.patch.object(document, "_get_bytes") as byte_factory: + with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: 
byte_factory.return_value = get_bytes("tests/unit/resources/images") yield byte_factory @@ -127,6 +135,77 @@ def test_entities_from_shard(): assert actual[1].normalized_text == "140 USD" +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai") +def test_get_batch_process_metadata_with_valid_operation( + mock_docai, +): + mock_client = mock_docai.DocumentProcessorServiceClient.return_value + + metadata = documentai.BatchProcessMetadata( + state=documentai.BatchProcessMetadata.State.SUCCEEDED, + individual_process_statuses=[ + documentai.BatchProcessMetadata.IndividualProcessStatus( + input_gcs_source="gs://test-directory/documentai/input.pdf", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1/", + ) + ], + ) + + mock_operation = mock.Mock( + done=True, + metadata=mock.Mock( + type_url="type.googleapis.com/google.cloud.documentai.v1.BatchProcessMetadata", + value=documentai.BatchProcessMetadata.serialize(metadata), + ), + ) + + mock_client.get_operation.return_value = mock_operation + + location = "us" + operation_name = "projects/123456/locations/us/operations/7890123" + document._get_batch_process_metadata(location, operation_name) + + mock_client.get_operation.assert_called() + mock_docai.BatchProcessMetadata.deserialize.assert_called() + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai") +def test_get_batch_process_metadata_with_no_metadata(mock_docai): + with pytest.raises( + ValueError, + match="Operation does not contain metadata:", + ): + mock_client = mock_docai.DocumentProcessorServiceClient.return_value + + location = "us" + operation_name = "projects/123456/locations/us/operations/7890123" + mock_operation = mock.Mock(done=True, metadata=None) + mock_client.get_operation.return_value = mock_operation + + document._get_batch_process_metadata(location, operation_name) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai") +def 
test_document_from_batch_process_operation_with_invalid_metadata_type(mock_docai): + with pytest.raises( + ValueError, + match="Operation metadata type is not", + ): + mock_client = mock_docai.DocumentProcessorServiceClient.return_value + + location = "us" + operation_name = "projects/123456/locations/us/operations/7890123" + mock_operation = mock.Mock( + done=True, + metadata=mock.Mock( + type_url="type.googleapis.com/google.cloud.documentai.uiv1beta3.TrainProcessorVersionResponse", + ), + ) + mock_client.get_operation.return_value = mock_operation + + document._get_batch_process_metadata(location, operation_name) + + def test_document_from_document_path_with_single_shard(): actual = document.Document.from_document_path( document_path="tests/unit/resources/0/toolbox_invoice_test-0.json" @@ -180,6 +259,49 @@ def test_document_from_gcs_with_unordered_shards(get_bytes_unordered_files_mock) assert page.documentai_page.page_number == page_index + 1 +def test_document_from_batch_process_metadata_with_multiple_input_files( + get_bytes_multiple_directories_mock, +): + mock_metadata = mock.Mock( + state=documentai.BatchProcessMetadata.State.SUCCEEDED, + individual_process_statuses=[ + mock.Mock( + input_gcs_source="gs://test-directory/documentai/input.pdf", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1/", + ), + mock.Mock( + input_gcs_source="gs://test-directory/documentai/input2.pdf", + output_gcs_destination="gs://test-directory/documentai/output/123456789/2/", + ), + ], + ) + documents = document.Document.from_batch_process_metadata(mock_metadata) + + get_bytes_multiple_directories_mock.assert_called() + assert get_bytes_multiple_directories_mock.call_count == 2 + assert len(documents) == 2 + + assert documents[0].gcs_bucket_name == "test-directory" + assert documents[0].gcs_prefix == "documentai/output/123456789/1/" + assert documents[0].gcs_input_uri == "gs://test-directory/documentai/input.pdf" + + assert documents[1].gcs_bucket_name == 
"test-directory" + assert documents[1].gcs_prefix == "documentai/output/123456789/2/" + assert documents[1].gcs_input_uri == "gs://test-directory/documentai/input2.pdf" + + +def test_document_from_batch_process_metadata_with_failed_operation(): + with pytest.raises( + ValueError, + match="Batch Process Failed: Internal Error Occured", + ): + mock_metadata = mock.Mock( + state=documentai.BatchProcessMetadata.State.FAILED, + state_message="Internal Error Occured", + ) + document.Document.from_batch_process_metadata(mock_metadata) + + def test_search_page_with_target_string(get_bytes_single_file_mock): doc = document.Document.from_gcs( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0/" @@ -266,42 +388,6 @@ def test_get_entity_by_type(get_bytes_single_file_mock): assert actual[0].mention_text == "222 Main Street\nAnytown, USA" -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") -def test_get_bytes(mock_storage): - client = mock_storage.Client.return_value - mock_bucket = mock.Mock() - client.Bucket.return_value = mock_bucket - - mock_ds_store = mock.Mock(name=[]) - mock_ds_store.name = "DS_Store" - - mock_blob1 = mock.Mock(name=[]) - mock_blob1.name = "gs://test-directory/1/test-annotations.json" - mock_blob1.download_as_bytes.return_value = ( - "gs://test-directory/1/test-annotations.json" - ) - - mock_blob2 = mock.Mock(name=[]) - mock_blob2.name = "gs://test-directory/1/test-config.json" - mock_blob2.download_as_bytes.return_value = "gs://test-directory/1/test-config.json" - - mock_blob3 = mock.Mock(name=[]) - mock_blob3.name = "gs://test-directory/1/test.pdf" - mock_blob3.download_as_bytes.return_value = "gs://test-directory/1/test.pdf" - - client.list_blobs.return_value = [mock_ds_store, mock_blob1, mock_blob2, mock_blob3] - - actual = document._get_bytes( - gcs_bucket_name="bucket", - gcs_prefix="prefix", - ) - - assert actual == [ - "gs://test-directory/1/test-annotations.json", - 
"gs://test-directory/1/test-config.json", - ] - - def test_get_form_field_by_name(get_bytes_form_parser_mock): doc = document.Document.from_gcs( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py index fd9335d8..4c003a8a 100644 --- a/tests/unit/test_utilities.py +++ b/tests/unit/test_utilities.py @@ -16,7 +16,7 @@ import pytest from google.cloud import storage -from google.cloud.documentai_toolbox.utilities import utilities +from google.cloud.documentai_toolbox import gcs_utilities # try/except added for compatibility with python < 3.8 try: @@ -29,7 +29,60 @@ test_prefix = "documentai/input" -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") +def test_get_bytes(mock_storage): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_ds_store = mock.Mock(name=[]) + mock_ds_store.name = "DS_Store" + + mock_blob1 = mock.Mock(name=[]) + mock_blob1.name = "gs://test-directory/1/test-annotations.json" + mock_blob1.download_as_bytes.return_value = ( + "gs://test-directory/1/test-annotations.json" + ) + + mock_blob2 = mock.Mock(name=[]) + mock_blob2.name = "gs://test-directory/1/test-config.json" + mock_blob2.download_as_bytes.return_value = "gs://test-directory/1/test-config.json" + + mock_blob3 = mock.Mock(name=[]) + mock_blob3.name = "gs://test-directory/1/test.pdf" + mock_blob3.download_as_bytes.return_value = "gs://test-directory/1/test.pdf" + + client.list_blobs.return_value = [mock_ds_store, mock_blob1, mock_blob2, mock_blob3] + + actual = gcs_utilities.get_bytes( + gcs_bucket_name="bucket", + gcs_prefix="prefix", + ) + + assert actual == [ + "gs://test-directory/1/test-annotations.json", + "gs://test-directory/1/test-config.json", + ] + + +def test_split_gcs_uri_with_valid_format(): + gcs_uri = 
"gs://test-bucket/test-directory/1/" + bucket, prefix = gcs_utilities.split_gcs_uri(gcs_uri) + + assert bucket == "test-bucket" + assert prefix == "test-directory/1/" + + +def test_split_gcs_uri_with_invalid_format(): + with pytest.raises( + ValueError, + match="gcs_uri must follow format 'gs://{bucket_name}/{gcs_prefix}'.", + ): + gcs_uri = "test-bucket/test-directory/1/" + gcs_utilities.split_gcs_uri(gcs_uri) + + +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_list_gcs_document_tree_with_one_folder(mock_storage): client = mock_storage.Client.return_value @@ -54,7 +107,7 @@ def test_list_gcs_document_tree_with_one_folder(mock_storage): client.list_blobs.return_value = blobs - doc_list = utilities.list_gcs_document_tree( + doc_list = gcs_utilities.list_gcs_document_tree( gcs_bucket_name="test-directory", gcs_prefix="/" ) @@ -63,7 +116,7 @@ def test_list_gcs_document_tree_with_one_folder(mock_storage): assert "gs://test-directory/1" in list(doc_list.keys()) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): client = mock_storage.Client.return_value @@ -88,7 +141,7 @@ def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): client.list_blobs.return_value = blobs - doc_list = utilities.list_gcs_document_tree( + doc_list = gcs_utilities.list_gcs_document_tree( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" ) @@ -99,7 +152,7 @@ def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): assert "gs://test-directory/documentai/output/123456789/1" in list(doc_list.keys()) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): 
client = mock_storage.Client.return_value @@ -135,7 +188,7 @@ def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): ] client.list_blobs.return_value = blobs - doc_list = utilities.list_gcs_document_tree( + doc_list = gcs_utilities.list_gcs_document_tree( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" ) @@ -148,13 +201,13 @@ def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): def test_list_gcs_document_tree_with_gcs_uri_contains_file_type(): with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): - utilities.list_gcs_document_tree( + gcs_utilities.list_gcs_document_tree( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/test_file.json", ) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): client = mock_storage.Client.return_value @@ -179,7 +232,9 @@ def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): client.list_blobs.return_value = blobs - utilities.print_gcs_document_tree(gcs_bucket_name="test-directory", gcs_prefix="/") + gcs_utilities.print_gcs_document_tree( + gcs_bucket_name="test-directory", gcs_prefix="/" + ) mock_storage.Client.assert_called_once() @@ -193,7 +248,7 @@ def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): ) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): client = mock_storage.Client.return_value @@ -218,7 +273,7 @@ def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): client.list_blobs.return_value = blobs - utilities.print_gcs_document_tree( + gcs_utilities.print_gcs_document_tree( 
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" ) @@ -234,7 +289,7 @@ def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): ) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): client = mock_storage.Client.return_value @@ -270,7 +325,7 @@ def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): ] client.list_blobs.return_value = blobs - utilities.print_gcs_document_tree( + gcs_utilities.print_gcs_document_tree( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" ) @@ -292,13 +347,13 @@ def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): def test_print_gcs_document_tree_with_gcs_uri_contains_file_type(): with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): - utilities.print_gcs_document_tree( + gcs_utilities.print_gcs_document_tree( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/test_file.json", ) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_create_batches_with_empty_directory(mock_storage, capfd): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -309,7 +364,7 @@ def test_create_batches_with_empty_directory(mock_storage, capfd): client.list_blobs.return_value = [mock_blob] - actual = utilities.create_batches( + actual = gcs_utilities.create_batches( gcs_bucket_name=test_bucket, gcs_prefix=test_prefix ) @@ -320,7 +375,7 @@ def test_create_batches_with_empty_directory(mock_storage, capfd): assert len(actual) == 0 -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def 
test_create_batches_with_3_documents(mock_storage, capfd): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -335,7 +390,7 @@ def test_create_batches_with_3_documents(mock_storage, capfd): mock_blobs.append(mock_blob) client.list_blobs.return_value = mock_blobs - actual = utilities.create_batches( + actual = gcs_utilities.create_batches( gcs_bucket_name=test_bucket, gcs_prefix=test_prefix ) @@ -352,12 +407,12 @@ def test_create_batches_with_invalid_batch_size(): ValueError, match="Batch size must be less than 1000. You provided 1001.", ): - utilities.create_batches( + gcs_utilities.create_batches( gcs_bucket_name=test_bucket, gcs_prefix=test_prefix, batch_size=1001 ) -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_create_batches_with_large_folder(mock_storage, capfd): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -372,7 +427,7 @@ def test_create_batches_with_large_folder(mock_storage, capfd): mock_blobs.append(mock_blob) client.list_blobs.return_value = mock_blobs - actual = utilities.create_batches( + actual = gcs_utilities.create_batches( gcs_bucket_name=test_bucket, gcs_prefix=test_prefix, batch_size=50 ) @@ -385,7 +440,7 @@ def test_create_batches_with_large_folder(mock_storage, capfd): assert len(actual[1].gcs_documents.documents) == 46 -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_create_batches_with_invalid_file_type(mock_storage, capfd): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -397,7 +452,7 @@ def test_create_batches_with_invalid_file_type(mock_storage, capfd): mock_blob.name.endswith.return_value = False client.list_blobs.return_value = [mock_blob] - actual = utilities.create_batches( + actual = gcs_utilities.create_batches( gcs_bucket_name=test_bucket, 
gcs_prefix=test_prefix ) @@ -408,7 +463,7 @@ def test_create_batches_with_invalid_file_type(mock_storage, capfd): assert not actual -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") def test_create_batches_with_large_file(mock_storage, capfd): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -420,7 +475,7 @@ def test_create_batches_with_large_file(mock_storage, capfd): mock_blob.name.endswith.return_value = False client.list_blobs.return_value = [mock_blob] - actual = utilities.create_batches( + actual = gcs_utilities.create_batches( gcs_bucket_name=test_bucket, gcs_prefix=test_prefix )