feat: Add export merged sharded Document proto (#145)
* feat: Add export merged sharded Document proto

- `to_documentai_document` exports a documentai Document proto from all of the shards in the wrapped Document

* fix: Refactor `_apply_text_offset()` to use the original dictionary-based implementation.

- Found an issue with the implementation while updating test coverage

* chore: Update min python client library for documentai

* Update test constraints

* fix: Change test to not include indent

* fix: merge_document_shards_sample_test

* fix: Address lint error for type checking

* Fix lint error for incorrect typing

* Rename `to_documentai_document` to `to_merged_documentai_document`

* Change `to_merged_documentai_document()` to use a deepcopy instead of editing in place

* Add more specific type annotation to `_apply_text_offset()`

* fix: Fixed how template files are included in the library

- Fixes #156

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* refactor: Updated `from_document_path()` to additionally support a directory of shards

* fix: Fix type annotation

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
holtskinner and gcf-owl-bot[bot] committed Aug 9, 2023
1 parent e4db698 commit a5e1f5c
Showing 8 changed files with 236 additions and 24 deletions.
105 changes: 84 additions & 21 deletions google/cloud/documentai_toolbox/wrappers/document.py
@@ -15,14 +15,17 @@
#
"""Wrappers for Document AI Document type."""

import copy
import dataclasses
import glob
import os
import re
from typing import Dict, List, Optional, Type, Union

from google.api_core.client_options import ClientOptions
from google.cloud.vision import AnnotateFileResponse
from google.longrunning.operations_pb2 import GetOperationRequest, Operation

from jinja2 import Environment, PackageLoader
from pikepdf import Pdf

@@ -284,6 +287,39 @@ def _dict_to_bigquery(
    )


def _apply_text_offset(
    documentai_object: Union[Dict[str, Dict], List], text_offset: int
) -> None:
    r"""Applies a text offset to all text_segments in `documentai_object`.

    Args:
        documentai_object (Union[Dict[str, Dict], List]):
            Required. Document AI object to apply `text_offset` to.
        text_offset (int):
            Required. Text offset to apply. From `Document.shard_info.text_offset`.
    Returns:
        None
    """
    if isinstance(documentai_object, dict):
        for key, value in documentai_object.items():
            if key == "text_segments":
                documentai_object[key] = [
                    {
                        "start_index": int(text_segment.get("start_index", 0))
                        + text_offset,
                        "end_index": int(text_segment.get("end_index", 0))
                        + text_offset,
                    }
                    for text_segment in value
                ]
            else:
                _apply_text_offset(value, text_offset)
    elif isinstance(documentai_object, list):
        for item in documentai_object:
            _apply_text_offset(item, text_offset)
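To make the recursion concrete, here is a minimal sketch of the helper's effect on a hand-built fragment (toy values, not from the repository's fixtures). `_apply_text_offset` is private; it is imported here the same way the unit tests below call it:

from google.cloud.documentai_toolbox import document

# A toy shard fragment: "text_segments" nested under a "text_anchor",
# the shape the recursive walk looks for.
fragment = {
    "text_anchor": {
        "text_segments": [
            {"start_index": "0", "end_index": "6"},
            {"start_index": "7", "end_index": "12"},
        ]
    }
}

# Shift every segment by a shard offset of 100.
document._apply_text_offset(fragment, text_offset=100)

print(fragment["text_anchor"]["text_segments"])
# [{'start_index': 100, 'end_index': 106}, {'start_index': 107, 'end_index': 112}]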


@dataclasses.dataclass
class Document:
    r"""Represents a wrapped `Document`.
@@ -344,21 +380,31 @@ def from_document_path(
            from google.cloud.documentai_toolbox import document

            document_path = "/path/to/local/file.json
            document_path = "/path/to/local/file.json"

            wrapped_document = document.Document.from_document_path(document_path)

        Args:
            document_path (str):
                Required. The path to the `document.json` file.
                Required. The path to the `document.json` file or directory containing sharded `document.json` files.
        Returns:
            Document:
                A document from local `document_path`.
        """
        document_paths = [document_path]

        with open(document_path, "r", encoding="utf-8") as f:
            doc = documentai.Document.from_json(f.read(), ignore_unknown_fields=True)
        if os.path.isdir(document_path):
            document_paths = glob.glob(
                os.path.join(document_path, f"*{constants.JSON_EXTENSION}")
            )

        documents = []
        for file_path in document_paths:
            with open(file_path, "r", encoding="utf-8") as f:
                documents.append(
                    documentai.Document.from_json(f.read(), ignore_unknown_fields=True)
                )

        return cls(shards=[doc])
        return cls(shards=documents)
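A short usage sketch for the new directory support (the `shards/` layout below is hypothetical):

from google.cloud.documentai_toolbox import document

# "shards/" holds document-0.json, document-1.json, ... (hypothetical names);
# each *.json file in the directory is loaded as one shard of the wrapper.
wrapped_document = document.Document.from_document_path("shards/")
print(f"Loaded {len(wrapped_document.shards)} shard(s)")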

    @classmethod
    def from_documentai_document(
@@ -422,7 +468,7 @@ def from_gcs(
    @classmethod
    def from_batch_process_metadata(
        cls: Type["Document"], metadata: documentai.BatchProcessMetadata
    ) -> "Document":
    ) -> List["Document"]:
        r"""Loads Documents from Cloud Storage, using the output from `BatchProcessMetadata`.

        .. code-block:: python
@@ -444,26 +490,18 @@ def from_batch_process_metadata(
        if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
            raise ValueError(f"Batch Process Failed: {metadata.state_message}")

        documents: List[Document] = []
        # Each process corresponds to one input document
        for process in list(metadata.individual_process_statuses):
            # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
            gcs_bucket_name, gcs_prefix = gcs_utilities.split_gcs_uri(
                process.output_gcs_destination
            )

            documents.append(
                Document.from_gcs(
                    gcs_bucket_name, gcs_prefix, gcs_input_uri=process.input_gcs_source
                )
        return [
            Document.from_gcs(
                *gcs_utilities.split_gcs_uri(process.output_gcs_destination),
                gcs_input_uri=process.input_gcs_source,
            )

        return documents
            for process in list(metadata.individual_process_statuses)
        ]
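With the return type now `List["Document"]`, callers iterate over the result; a sketch, assuming `operation` is a completed `batch_process_documents()` long-running operation:

from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# `operation` is assumed to be a finished batch-processing LRO.
metadata = documentai.BatchProcessMetadata(operation.metadata)

# One wrapped Document per input file in the batch.
wrapped_documents = document.Document.from_batch_process_metadata(metadata)
for wrapped_document in wrapped_documents:
    print(f"{len(wrapped_document.shards)} shard(s), {len(wrapped_document.pages)} page(s)")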

    @classmethod
    def from_batch_process_operation(
        cls: Type["Document"], location: str, operation_name: str
    ) -> "Document":
    ) -> List["Document"]:
        r"""Loads Documents from Cloud Storage, using the operation name returned from `batch_process_documents()`.

        .. code-block:: python
@@ -771,3 +809,28 @@ def export_hocr_str(self, title: str) -> str:
        template = environment.get_template("hocr_document_template.xml.j2")
        content = template.render(pages=self.pages, title=title)
        return content

    def to_merged_documentai_document(self) -> documentai.Document:
        r"""Exports a documentai.Document from the wrapped document with shards merged.

        Args:
            None.
        Returns:
            documentai.Document:
                Document with all shards merged and text offsets applied.
        """
        if len(self.shards) == 1:
            return self.shards[0]

        merged_document = documentai.Document(text=self.text, pages=[], entities=[])
        for shard in self.shards:
            modified_shard = copy.deepcopy(shard)

            _apply_text_offset(
                documentai_object=modified_shard,
                text_offset=int(modified_shard.shard_info.text_offset),
            )
            merged_document.pages.extend(modified_shard.pages)
            merged_document.entities.extend(modified_shard.entities)

        return merged_document
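The deepcopy noted in the commit history buys idempotence: the wrapped shards are never mutated, so repeated merges agree. A small sketch, assuming `wrapped_document` wraps multiple shards as in the sample below:

merged_a = wrapped_document.to_merged_documentai_document()
merged_b = wrapped_document.to_merged_documentai_document()

# Each shard is deep-copied before its text offsets are applied,
# so calling the method twice cannot double-apply the offsets.
assert merged_a == merged_b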
44 changes: 44 additions & 0 deletions samples/snippets/merge_document_shards_sample.py
@@ -0,0 +1,44 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


# [START documentai_toolbox_merge_document_shards]

from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# output_file_name = "path/to/folder/file.json"


def merge_document_shards_sample(
    gcs_bucket_name: str, gcs_prefix: str, output_file_name: str
) -> None:
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
    )

    merged_document = wrapped_document.to_merged_documentai_document()

    with open(output_file_name, "w") as f:
        f.write(documentai.Document.to_json(merged_document))

    print(f"Document with {len(wrapped_document.shards)} shards successfully merged.")


# [END documentai_toolbox_merge_document_shards]
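A hypothetical invocation of the sample (bucket, prefix, and output path are placeholders):

merge_document_shards_sample(
    gcs_bucket_name="my-bucket",
    gcs_prefix="path/to/sharded/output/",
    output_file_name="merged_document.json",
)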
2 changes: 1 addition & 1 deletion samples/snippets/requirements.txt
@@ -1,4 +1,4 @@
google-cloud-bigquery==3.11.4
google-cloud-documentai==2.18.0
google-cloud-storage==2.10.0
google-cloud-documentai-toolbox==0.4.1a0
google-cloud-documentai-toolbox==0.9.0a0
45 changes: 45 additions & 0 deletions samples/snippets/test_merge_document_shards_sample.py
@@ -0,0 +1,45 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil

import pytest
from samples.snippets import merge_document_shards_sample

gcs_bucket_name = "documentai_toolbox_samples"
gcs_prefix = "output/987654321/1"
output_dir = "resources/output/"
output_path = f"{output_dir}merged_document.json"


def test_merge_document_shards_sample(capsys: pytest.CaptureFixture) -> None:
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    os.makedirs(output_dir)

    merge_document_shards_sample.merge_document_shards_sample(
        gcs_bucket_name=gcs_bucket_name,
        gcs_prefix=gcs_prefix,
        output_file_name=output_path,
    )

    out, _ = capsys.readouterr()

    assert "Document with 5 shards successfully merged." in out

    assert os.path.exists(output_dir)
    shutil.rmtree(output_dir)
5 changes: 4 additions & 1 deletion setup.py
@@ -43,6 +43,9 @@
    namespace_packages=("google", "google.cloud"),
    platforms="Posix; MacOS X; Windows",
    include_package_data=True,
    package_data={
        "google.cloud.documentai_toolbox": ["templates/*.xml.j2"],
    },
    install_requires=(
        "google-api-core >= 1.31.5, <3.0.0dev,!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0",
        "pandas >= 1.0.0, <3.0.0",
@@ -51,7 +54,7 @@
        "proto-plus >= 1.22.2, <2.0.0dev; python_version>='3.11'",
        "grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
        "google-cloud-bigquery >= 3.5.0, < 4.0.0dev",
        "google-cloud-documentai >= 1.2.1, < 3.0.0dev",
        "google-cloud-documentai >= 2.17.0, < 3.0.0dev",
        "google-cloud-storage >= 1.31.0, < 3.0.0dev",
        "google-cloud-vision >= 2.7.0, < 4.0.0dev ",
        "numpy >= 1.18.1",
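The `package_data` entry above is what makes the hOCR templates ship inside the wheel (the #156 fix). A quick check from an installed copy, mirroring how `export_hocr_str()` loads the template:

from jinja2 import Environment, PackageLoader

# Resolves only if templates/*.xml.j2 was packaged with the library.
environment = Environment(
    loader=PackageLoader("google.cloud.documentai_toolbox", "templates")
)
print(environment.get_template("hocr_document_template.xml.j2").name)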
2 changes: 1 addition & 1 deletion testing/constraints-3.7.txt
@@ -10,7 +10,7 @@ pandas==1.0.0
proto-plus==1.22.0
grpc-google-iam-v1==0.12.4
google-cloud-bigquery==3.5.0
google-cloud-documentai==2.12.0
google-cloud-documentai==2.17.0
google-cloud-storage==2.7.0
numpy==1.18.1
pikepdf==6.2.9
1 change: 1 addition & 0 deletions tests/unit/resources/merged_document/merged_shards.json

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions tests/unit/test_document.py
@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil

@@ -658,3 +659,58 @@ def test_export_hocr_str():
        expected = f.read()

    assert actual_hocr == expected


def test_document_to_merged_documentai_document(get_bytes_multiple_files_mock):
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/"
    )
    get_bytes_multiple_files_mock.assert_called_once()

    actual = documentai.Document.to_json(
        wrapped_document.to_merged_documentai_document()
    )
    with open("tests/unit/resources/merged_document/merged_shards.json", "r") as f:
        merged_document = documentai.Document.from_json(f.read())
        expected = documentai.Document.to_json(merged_document)

    assert actual == expected


def test_document_to_merged_documentai_document_one_shard():
    path = "tests/unit/resources/0/toolbox_invoice_test-0.json"

    with open(path, "r", encoding="utf-8") as f:
        documentai_document = documentai.Document.from_json(f.read())

    wrapped_document = document.Document.from_documentai_document(documentai_document)
    actual = wrapped_document.to_merged_documentai_document()

    assert actual == documentai_document

def test_apply_text_offset():
    path = "tests/unit/resources/1/toolbox_large_document_test-1.json"
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
        documentai_document = documentai.Document.from_json(content)

    assert documentai_document.shard_info.text_offset == 4350

    doc_dict = documentai.Document.to_dict(documentai_document)
    document._apply_text_offset(
        doc_dict, int(documentai_document.shard_info.text_offset)
    )

    actual = documentai.Document.from_json(json.dumps(doc_dict))
    # Each expected value is the shard-local index plus the shard's
    # text_offset of 4350 (e.g. 4616 == 266 + 4350, 4358 == 8 + 4350).
    assert actual.entities[0].text_anchor.text_segments[0].start_index == 4616
    assert actual.entities[0].text_anchor.text_segments[0].end_index == 4622
    assert actual.entities[0].text_anchor.text_segments[3].start_index == 4634
    assert actual.entities[0].text_anchor.text_segments[3].end_index == 4640

    assert (
        actual.pages[0].blocks[0].layout.text_anchor.text_segments[0].start_index
    ) == 4350
    assert (
        actual.pages[0].blocks[0].layout.text_anchor.text_segments[0].end_index == 4358
    )
