feat: Add export merged sharded Document proto #145

Merged · 22 commits from `export-doc` into `main` · Aug 9, 2023

Changes from 17 commits

Commits
cf547fe
feat: Add export merged sharded Document proto
holtskinner Jul 20, 2023
62d7758
Merge branch 'main' into export-doc
holtskinner Jul 21, 2023
5511a33
fix: Refactor `_apply_text_offset()` to use original implementation with…
holtskinner Jul 21, 2023
25c6af4
chore: Update min python client library for documentai
holtskinner Jul 24, 2023
6990662
Update test constraints
holtskinner Jul 24, 2023
85a1370
fix: Change test to not include indent
holtskinner Jul 24, 2023
dde22b9
fix: merge_document_shards_sample_test
holtskinner Jul 25, 2023
4a90472
Merge branch 'main' into export-doc
holtskinner Jul 25, 2023
b91559a
Merge branch 'main' into export-doc
holtskinner Jul 27, 2023
88fc3b9
Merge branch 'main' into export-doc
holtskinner Jul 31, 2023
4ad742f
fix: Address lint error for type checking
holtskinner Jul 31, 2023
98afe85
Merge branch 'main' into export-doc
holtskinner Aug 1, 2023
01928c8
Merge branch 'main' into export-doc
holtskinner Aug 3, 2023
85bb259
Fix lint error for incorrect typing
holtskinner Aug 8, 2023
2c12081
Rename `to_documentai_document` to `to_merged_documentai_document`
holtskinner Aug 8, 2023
9312153
Change `to_merged_documentai_document()` to use a deepcopy instead of…
holtskinner Aug 8, 2023
519e678
Merge branch 'main' into export-doc
holtskinner Aug 8, 2023
91e9988
Add more specific type annotation to `_apply_text_offset()`
holtskinner Aug 9, 2023
c3da29a
fix: Fixed how template files are included in the library
holtskinner Aug 9, 2023
ea34456
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Aug 9, 2023
3ef91ca
refactor: Updated `from_document_path()` to additionally support dire…
holtskinner Aug 9, 2023
6f26f10
fix: Fix type annotation
holtskinner Aug 9, 2023
82 changes: 66 additions & 16 deletions google/cloud/documentai_toolbox/wrappers/document.py
@@ -15,6 +15,7 @@
#
"""Wrappers for Document AI Document type."""

import copy
import dataclasses
import os
import re
@@ -23,6 +24,7 @@
from google.api_core.client_options import ClientOptions
from google.cloud.vision import AnnotateFileResponse
from google.longrunning.operations_pb2 import GetOperationRequest, Operation

from jinja2 import Environment, PackageLoader
from pikepdf import Pdf

@@ -284,6 +286,37 @@ def _dict_to_bigquery(
)


def _apply_text_offset(documentai_object: object, text_offset: int) -> None:
r"""Applies a text offset to all text_segments in `documentai_object`.

Args:
documentai_object (object):
Required. Document AI object to apply `text_offset` to.
text_offset (int):
Required. Text offset to apply. From `Document.shard_info.text_offset`.
Returns:
None

"""
if isinstance(documentai_object, dict):
for key, value in documentai_object.items():
if key == "text_segments":
documentai_object[key] = [
{
"start_index": int(text_segment.get("start_index", 0))
+ text_offset,
"end_index": int(text_segment.get("end_index", 0))
+ text_offset,
}
for text_segment in value
]
else:
_apply_text_offset(value, text_offset)
elif isinstance(documentai_object, list):
for item in documentai_object:
_apply_text_offset(item, text_offset)

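Reviewer note: a quick illustration of what the recursion above does, using toy data (the shapes here are assumptions for illustration — `_apply_text_offset` operates on the dict form of a shard, e.g. the output of `documentai.Document.to_dict()`):

    # Hypothetical shard fragment with one text segment covering [0, 6).
    shard_dict = {
        "entities": [
            {"text_anchor": {"text_segments": [{"start_index": 0, "end_index": 6}]}}
        ]
    }
    _apply_text_offset(shard_dict, text_offset=4350)
    # The function recursed through "entities" and "text_anchor" and rewrote
    # the segment, which now points into the merged text:
    # {"start_index": 4350, "end_index": 4356}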

@dataclasses.dataclass
class Document:
r"""Represents a wrapped `Document`.
@@ -422,7 +455,7 @@ def from_gcs(
@classmethod
def from_batch_process_metadata(
cls: Type["Document"], metadata: documentai.BatchProcessMetadata
-    ) -> "Document":
+    ) -> List["Document"]:
r"""Loads Documents from Cloud Storage, using the output from `BatchProcessMetadata`.

.. code-block:: python
@@ -444,26 +477,18 @@ def from_batch_process_metadata(
if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
raise ValueError(f"Batch Process Failed: {metadata.state_message}")

-        documents: List[Document] = []
-        # Each process corresponds to one input document
-        for process in list(metadata.individual_process_statuses):
-            # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
-            gcs_bucket_name, gcs_prefix = gcs_utilities.split_gcs_uri(
-                process.output_gcs_destination
-            )
-
-            documents.append(
-                Document.from_gcs(
-                    gcs_bucket_name, gcs_prefix, gcs_input_uri=process.input_gcs_source
-                )
-            )
-
-        return documents
+        return [
+            Document.from_gcs(
+                *gcs_utilities.split_gcs_uri(process.output_gcs_destination),
+                gcs_input_uri=process.input_gcs_source,
+            )
+            for process in list(metadata.individual_process_statuses)
+        ]
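Reviewer note: as the removed lines show, `gcs_utilities.split_gcs_uri()` returns a `(bucket, prefix)` tuple, so the `*` unpacking passes both positionally. A short sketch with placeholder URIs:

    bucket, prefix = gcs_utilities.split_gcs_uri("gs://bucket/prefix/123/0/")
    # bucket == "bucket", prefix == "prefix/123/0/"
    docs = Document.from_gcs(bucket, prefix, gcs_input_uri="gs://bucket/input.pdf")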

@classmethod
def from_batch_process_operation(
cls: Type["Document"], location: str, operation_name: str
-    ) -> "Document":
+    ) -> List["Document"]:
r"""Loads Documents from Cloud Storage, using the operation name returned from `batch_process_documents()`.

.. code-block:: python
@@ -771,3 +796,28 @@ def export_hocr_str(self, title: str) -> str:
template = environment.get_template("hocr_document_template.xml.j2")
content = template.render(pages=self.pages, title=title)
return content

def to_merged_documentai_document(self) -> documentai.Document:
r"""Exports a documentai.Document from the wrapped document with shards merged.

Args:
None.
Returns:
documentai.Document:
Document with all shards merged and text offsets applied.
"""
if len(self.shards) == 1:
return self.shards[0]

merged_document = documentai.Document(text=self.text, pages=[], entities=[])
for shard in self.shards:
modified_shard = copy.deepcopy(shard)

_apply_text_offset(
documentai_object=modified_shard,
text_offset=int(modified_shard.shard_info.text_offset),
)
merged_document.pages.extend(modified_shard.pages)
merged_document.entities.extend(modified_shard.entities)

return merged_document
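Reviewer note: going by commit 9312153 ("use a deepcopy instead of…"), offsetting a copy rather than the cached shard keeps the wrapper reusable. A hedged sketch of the intended behavior (names are illustrative):

    wrapped = document.Document.from_gcs(gcs_bucket_name="bucket", gcs_prefix="prefix/")
    first = wrapped.to_merged_documentai_document()
    second = wrapped.to_merged_documentai_document()
    # Each call applies offsets to a fresh copy, not to self.shards, so
    # repeated calls should not stack text offsets onto the wrapped shards.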
44 changes: 44 additions & 0 deletions samples/snippets/merge_document_shards_sample.py
@@ -0,0 +1,44 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


# [START documentai_toolbox_merge_document_shards]

from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto stored at gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# output_file_name = "path/to/folder/file.json"


def merge_document_shards_sample(
gcs_bucket_name: str, gcs_prefix: str, output_file_name: str
) -> None:
wrapped_document = document.Document.from_gcs(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)

merged_document = wrapped_document.to_merged_documentai_document()

with open(output_file_name, "w") as f:
f.write(documentai.Document.to_json(merged_document))

print(f"Document with {len(wrapped_document.shards)} shards successfully merged.")


# [END documentai_toolbox_merge_document_shards]
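A minimal invocation of the sample, reusing the placeholder values from the TODO block above:

    merge_document_shards_sample(
        gcs_bucket_name="bucket",
        gcs_prefix="path/to/folder",
        output_file_name="path/to/folder/file.json",
    )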
2 changes: 1 addition & 1 deletion samples/snippets/requirements.txt
@@ -1,4 +1,4 @@
google-cloud-bigquery==3.11.4
google-cloud-documentai==2.18.0
google-cloud-storage==2.10.0
-google-cloud-documentai-toolbox==0.4.1a0
+google-cloud-documentai-toolbox==0.9.0a0
45 changes: 45 additions & 0 deletions samples/snippets/test_merge_document_shards_sample.py
@@ -0,0 +1,45 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil

import pytest
from samples.snippets import merge_document_shards_sample

gcs_bucket_name = "documentai_toolbox_samples"
gcs_prefix = "output/987654321/1"
output_dir = "resources/output/"
output_path = f"{output_dir}merged_document.json"


def test_merge_document_shards_sample(capsys: pytest.CaptureFixture) -> None:
if os.path.exists(output_dir):
shutil.rmtree(output_dir)

os.makedirs(output_dir)

merge_document_shards_sample.merge_document_shards_sample(
gcs_bucket_name=gcs_bucket_name,
gcs_prefix=gcs_prefix,
output_file_name=output_path,
)

out, _ = capsys.readouterr()

assert "Document with 5 shards successfully merged." in out

assert os.path.exists(output_dir)
shutil.rmtree(output_dir)
2 changes: 1 addition & 1 deletion setup.py
@@ -51,7 +51,7 @@
"proto-plus >= 1.22.2, <2.0.0dev; python_version>='3.11'",
"grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
"google-cloud-bigquery >= 3.5.0, < 4.0.0dev",
"google-cloud-documentai >= 1.2.1, < 3.0.0dev",
"google-cloud-documentai >= 2.17.0, < 3.0.0dev",
"google-cloud-storage >= 1.31.0, < 3.0.0dev",
"google-cloud-vision >= 2.7.0, < 4.0.0dev ",
"numpy >= 1.18.1",
2 changes: 1 addition & 1 deletion testing/constraints-3.7.txt
@@ -10,7 +10,7 @@ pandas==1.0.0
proto-plus==1.22.0
grpc-google-iam-v1==0.12.4
google-cloud-bigquery==3.5.0
-google-cloud-documentai==2.12.0
+google-cloud-documentai==2.17.0
google-cloud-storage==2.7.0
numpy==1.18.1
pikepdf==6.2.9
1 change: 1 addition & 0 deletions tests/unit/resources/merged_document/merged_shards.json

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions tests/unit/test_document.py
@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil

@@ -658,3 +659,58 @@ def test_export_hocr_str():
expected = f.read()

assert actual_hocr == expected


def test_document_to_merged_documentai_document(get_bytes_multiple_files_mock):
wrapped_document = document.Document.from_gcs(
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/"
)
get_bytes_multiple_files_mock.assert_called_once()

actual = documentai.Document.to_json(
wrapped_document.to_merged_documentai_document()
)
with open("tests/unit/resources/merged_document/merged_shards.json", "r") as f:
merged_document = documentai.Document.from_json(f.read())
expected = documentai.Document.to_json(merged_document)

assert actual == expected


def test_document_to_merged_documentai_document_one_shard():
path = "tests/unit/resources/0/toolbox_invoice_test-0.json"

with open(path, "r", encoding="utf-8") as f:
documentai_document = documentai.Document.from_json(f.read())

wrapped_document = document.Document.from_documentai_document(documentai_document)
actual = wrapped_document.to_merged_documentai_document()

assert actual == documentai_document


def test_apply_text_offset():
path = "tests/unit/resources/1/toolbox_large_document_test-1.json"
with open(path, "r", encoding="utf-8") as f:
content = f.read()
documentai_document = documentai.Document.from_json(content)

assert documentai_document.shard_info.text_offset == 4350

doc_dict = documentai.Document.to_dict(documentai_document)
document._apply_text_offset(
doc_dict, int(documentai_document.shard_info.text_offset)
)

actual = documentai.Document.from_json(json.dumps(doc_dict))
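    # Sanity arithmetic for the assertions below: 4616 - 4350 == 266 and
    # 4622 - 4350 == 272, i.e. the fixture's original segment indices
    # shifted by shard_info.text_offset.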
assert actual.entities[0].text_anchor.text_segments[0].start_index == 4616
assert actual.entities[0].text_anchor.text_segments[0].end_index == 4622
assert actual.entities[0].text_anchor.text_segments[3].start_index == 4634
assert actual.entities[0].text_anchor.text_segments[3].end_index == 4640

assert (
actual.pages[0].blocks[0].layout.text_anchor.text_segments[0].start_index
) == 4350
assert (
actual.pages[0].blocks[0].layout.text_anchor.text_segments[0].end_index == 4358
)