feat: Add export merged sharded Document proto (#145)
* feat: Add export merged sharded Document proto

- `to_documentai_document` exports a documentai Document proto from all of the shards in the wrapped Document

* fix: Refactor `_apply_text_offset()` to use the original dictionary-based implementation.

- Found an issue with the implementation while updating test coverage

* chore: Update min python client library for documentai

* Update test constraints

* fix: Change test to not include indent

* fix: merge_document_shards_sample_test

* fix: Address lint error for type checking

* Fix lint error for incorrect typing

* Rename `to_documentai_document` to `to_merged_documentai_document`

* Change `to_merged_documentai_document()` to use a deepcopy instead of editing in place

* Add more specific type annotation to `_apply_text_offset()`

* fix: Fixed how template files are included in the library

- Fixes #156

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* refactor: Updated `from_document_path()` to additionally support a directory of shards

* fix: Fix type annotation

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
holtskinner and gcf-owl-bot[bot] committed Aug 9, 2023
1 parent e4db698 commit a5e1f5c
Showing 8 changed files with 236 additions and 24 deletions.
105 changes: 84 additions & 21 deletions google/cloud/documentai_toolbox/wrappers/document.py
@@ -15,14 +15,17 @@
#
"""Wrappers for Document AI Document type."""

import copy
import dataclasses
import glob
import os
import re
from typing import Dict, List, Optional, Type, Union

from google.api_core.client_options import ClientOptions
from google.cloud.vision import AnnotateFileResponse
from google.longrunning.operations_pb2 import GetOperationRequest, Operation

from jinja2 import Environment, PackageLoader
from pikepdf import Pdf

@@ -284,6 +287,39 @@ def _dict_to_bigquery(
    )


def _apply_text_offset(
    documentai_object: Union[Dict[str, Dict], List], text_offset: int
) -> None:
    r"""Applies a text offset to all text_segments in `documentai_object`.

    Args:
        documentai_object (Union[Dict[str, Dict], List]):
            Required. Document AI object to apply `text_offset` to.
        text_offset (int):
            Required. Text offset to apply. From `Document.shard_info.text_offset`.
    Returns:
        None
    """
    if isinstance(documentai_object, dict):
        for key, value in documentai_object.items():
            if key == "text_segments":
                documentai_object[key] = [
                    {
                        "start_index": int(text_segment.get("start_index", 0))
                        + text_offset,
                        "end_index": int(text_segment.get("end_index", 0))
                        + text_offset,
                    }
                    for text_segment in value
                ]
            else:
                _apply_text_offset(value, text_offset)
    elif isinstance(documentai_object, list):
        for item in documentai_object:
            _apply_text_offset(item, text_offset)
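To make the recursion concrete, here is a minimal sketch of the helper's effect on a hand-built fragment (toy values, not from the repository's fixtures). `_apply_text_offset` is private; it is imported here the same way the unit tests below call it:

from google.cloud.documentai_toolbox import document

# A toy shard fragment: "text_segments" nested under a "text_anchor",
# the shape the recursive walk looks for.
fragment = {
    "text_anchor": {
        "text_segments": [
            {"start_index": "0", "end_index": "6"},
            {"start_index": "7", "end_index": "12"},
        ]
    }
}

# Shift every segment by a shard offset of 100.
document._apply_text_offset(fragment, text_offset=100)

print(fragment["text_anchor"]["text_segments"])
# [{'start_index': 100, 'end_index': 106}, {'start_index': 107, 'end_index': 112}]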


@dataclasses.dataclass
class Document:
    r"""Represents a wrapped `Document`.
@@ -344,21 +380,31 @@ def from_document_path(
            from google.cloud.documentai_toolbox import document

            document_path = "/path/to/local/file.json
            document_path = "/path/to/local/file.json"

            wrapped_document = document.Document.from_document_path(document_path)

        Args:
            document_path (str):
                Required. The path to the `document.json` file.
                Required. The path to the `document.json` file or directory containing sharded `document.json` files.
        Returns:
            Document:
                A document from local `document_path`.
        """
        document_paths = [document_path]

        with open(document_path, "r", encoding="utf-8") as f:
            doc = documentai.Document.from_json(f.read(), ignore_unknown_fields=True)
        if os.path.isdir(document_path):
            document_paths = glob.glob(
                os.path.join(document_path, f"*{constants.JSON_EXTENSION}")
            )

        documents = []
        for file_path in document_paths:
            with open(file_path, "r", encoding="utf-8") as f:
                documents.append(
                    documentai.Document.from_json(f.read(), ignore_unknown_fields=True)
                )

        return cls(shards=[doc])
        return cls(shards=documents)
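A short usage sketch for the new directory support (the `shards/` layout below is hypothetical):

from google.cloud.documentai_toolbox import document

# "shards/" holds document-0.json, document-1.json, ... (hypothetical names);
# each *.json file in the directory is loaded as one shard of the wrapper.
wrapped_document = document.Document.from_document_path("shards/")
print(f"Loaded {len(wrapped_document.shards)} shard(s)")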

    @classmethod
    def from_documentai_document(
@@ -422,7 +468,7 @@ def from_gcs(
    @classmethod
    def from_batch_process_metadata(
        cls: Type["Document"], metadata: documentai.BatchProcessMetadata
    ) -> "Document":
    ) -> List["Document"]:
        r"""Loads Documents from Cloud Storage, using the output from `BatchProcessMetadata`.

        .. code-block:: python
@@ -444,26 +490,18 @@ def from_batch_process_metadata(
        if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
            raise ValueError(f"Batch Process Failed: {metadata.state_message}")

        documents: List[Document] = []
        # Each process corresponds to one input document
        for process in list(metadata.individual_process_statuses):
            # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
            gcs_bucket_name, gcs_prefix = gcs_utilities.split_gcs_uri(
                process.output_gcs_destination
            )

            documents.append(
                Document.from_gcs(
                    gcs_bucket_name, gcs_prefix, gcs_input_uri=process.input_gcs_source
                )
        return [
            Document.from_gcs(
                *gcs_utilities.split_gcs_uri(process.output_gcs_destination),
                gcs_input_uri=process.input_gcs_source,
            )

        return documents
            for process in list(metadata.individual_process_statuses)
        ]
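With the return type now `List["Document"]`, callers iterate over the result; a sketch, assuming `operation` is a completed `batch_process_documents()` long-running operation:

from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# `operation` is assumed to be a finished batch-processing LRO.
metadata = documentai.BatchProcessMetadata(operation.metadata)

# One wrapped Document per input file in the batch.
wrapped_documents = document.Document.from_batch_process_metadata(metadata)
for wrapped_document in wrapped_documents:
    print(f"{len(wrapped_document.shards)} shard(s), {len(wrapped_document.pages)} page(s)")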

    @classmethod
    def from_batch_process_operation(
        cls: Type["Document"], location: str, operation_name: str
    ) -> "Document":
    ) -> List["Document"]:
        r"""Loads Documents from Cloud Storage, using the operation name returned from `batch_process_documents()`.

        .. code-block:: python
@@ -771,3 +809,28 @@ def export_hocr_str(self, title: str) -> str:
        template = environment.get_template("hocr_document_template.xml.j2")
        content = template.render(pages=self.pages, title=title)
        return content

    def to_merged_documentai_document(self) -> documentai.Document:
        r"""Exports a documentai.Document from the wrapped document with shards merged.

        Args:
            None.
        Returns:
            documentai.Document:
                Document with all shards merged and text offsets applied.
        """
        if len(self.shards) == 1:
            return self.shards[0]

        merged_document = documentai.Document(text=self.text, pages=[], entities=[])
        for shard in self.shards:
            modified_shard = copy.deepcopy(shard)

            _apply_text_offset(
                documentai_object=modified_shard,
                text_offset=int(modified_shard.shard_info.text_offset),
            )
            merged_document.pages.extend(modified_shard.pages)
            merged_document.entities.extend(modified_shard.entities)

        return merged_document
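The deepcopy noted in the commit history buys idempotence: the wrapped shards are never mutated, so repeated merges agree. A small sketch, assuming `wrapped_document` wraps multiple shards as in the sample below:

merged_a = wrapped_document.to_merged_documentai_document()
merged_b = wrapped_document.to_merged_documentai_document()

# Each shard is deep-copied before its text offsets are applied,
# so calling the method twice cannot double-apply the offsets.
assert merged_a == merged_b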
44 changes: 44 additions & 0 deletions samples/snippets/merge_document_shards_sample.py
@@ -0,0 +1,44 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


# [START documentai_toolbox_merge_document_shards]

from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# output_file_name = "path/to/folder/file.json"


def merge_document_shards_sample(
    gcs_bucket_name: str, gcs_prefix: str, output_file_name: str
) -> None:
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
    )

    merged_document = wrapped_document.to_merged_documentai_document()

    with open(output_file_name, "w") as f:
        f.write(documentai.Document.to_json(merged_document))

    print(f"Document with {len(wrapped_document.shards)} shards successfully merged.")


# [END documentai_toolbox_merge_document_shards]
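A hypothetical invocation of the sample (bucket, prefix, and output path are placeholders):

merge_document_shards_sample(
    gcs_bucket_name="my-bucket",
    gcs_prefix="path/to/sharded/output/",
    output_file_name="merged_document.json",
)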
2 changes: 1 addition & 1 deletion samples/snippets/requirements.txt
@@ -1,4 +1,4 @@
google-cloud-bigquery==3.11.4
google-cloud-documentai==2.18.0
google-cloud-storage==2.10.0
google-cloud-documentai-toolbox==0.4.1a0
google-cloud-documentai-toolbox==0.9.0a0
45 changes: 45 additions & 0 deletions samples/snippets/test_merge_document_shards_sample.py
@@ -0,0 +1,45 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil

import pytest
from samples.snippets import merge_document_shards_sample

gcs_bucket_name = "documentai_toolbox_samples"
gcs_prefix = "output/987654321/1"
output_dir = "resources/output/"
output_path = f"{output_dir}merged_document.json"


def test_merge_document_shards_sample(capsys: pytest.CaptureFixture) -> None:
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    os.makedirs(output_dir)

    merge_document_shards_sample.merge_document_shards_sample(
        gcs_bucket_name=gcs_bucket_name,
        gcs_prefix=gcs_prefix,
        output_file_name=output_path,
    )

    out, _ = capsys.readouterr()

    assert "Document with 5 shards successfully merged." in out

    assert os.path.exists(output_dir)
    shutil.rmtree(output_dir)
5 changes: 4 additions & 1 deletion setup.py
@@ -43,6 +43,9 @@
    namespace_packages=("google", "google.cloud"),
    platforms="Posix; MacOS X; Windows",
    include_package_data=True,
    package_data={
        "google.cloud.documentai_toolbox": ["templates/*.xml.j2"],
    },
    install_requires=(
        "google-api-core >= 1.31.5, <3.0.0dev,!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0",
        "pandas >= 1.0.0, <3.0.0",
@@ -51,7 +54,7 @@
        "proto-plus >= 1.22.2, <2.0.0dev; python_version>='3.11'",
        "grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
        "google-cloud-bigquery >= 3.5.0, < 4.0.0dev",
        "google-cloud-documentai >= 1.2.1, < 3.0.0dev",
        "google-cloud-documentai >= 2.17.0, < 3.0.0dev",
        "google-cloud-storage >= 1.31.0, < 3.0.0dev",
        "google-cloud-vision >= 2.7.0, < 4.0.0dev ",
        "numpy >= 1.18.1",
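The `package_data` entry above is what makes the hOCR templates ship inside the wheel (the #156 fix). A quick check from an installed copy, mirroring how `export_hocr_str()` loads the template:

from jinja2 import Environment, PackageLoader

# Resolves only if templates/*.xml.j2 was packaged with the library.
environment = Environment(
    loader=PackageLoader("google.cloud.documentai_toolbox", "templates")
)
print(environment.get_template("hocr_document_template.xml.j2").name)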
2 changes: 1 addition & 1 deletion testing/constraints-3.7.txt
@@ -10,7 +10,7 @@ pandas==1.0.0
proto-plus==1.22.0
grpc-google-iam-v1==0.12.4
google-cloud-bigquery==3.5.0
google-cloud-documentai==2.12.0
google-cloud-documentai==2.17.0
google-cloud-storage==2.7.0
numpy==1.18.1
pikepdf==6.2.9
1 change: 1 addition & 0 deletions tests/unit/resources/merged_document/merged_shards.json

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions tests/unit/test_document.py
@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil

@@ -658,3 +659,58 @@ def test_export_hocr_str():
        expected = f.read()

    assert actual_hocr == expected


def test_document_to_merged_documentai_document(get_bytes_multiple_files_mock):
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/"
    )
    get_bytes_multiple_files_mock.assert_called_once()

    actual = documentai.Document.to_json(
        wrapped_document.to_merged_documentai_document()
    )
    with open("tests/unit/resources/merged_document/merged_shards.json", "r") as f:
        merged_document = documentai.Document.from_json(f.read())
        expected = documentai.Document.to_json(merged_document)

    assert actual == expected


def test_document_to_merged_documentai_document_one_shard():
    path = "tests/unit/resources/0/toolbox_invoice_test-0.json"

    with open(path, "r", encoding="utf-8") as f:
        documentai_document = documentai.Document.from_json(f.read())

    wrapped_document = document.Document.from_documentai_document(documentai_document)
    actual = wrapped_document.to_merged_documentai_document()

    assert actual == documentai_document

def test_apply_text_offset():
    path = "tests/unit/resources/1/toolbox_large_document_test-1.json"
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
        documentai_document = documentai.Document.from_json(content)

    assert documentai_document.shard_info.text_offset == 4350

    doc_dict = documentai.Document.to_dict(documentai_document)
    document._apply_text_offset(
        doc_dict, int(documentai_document.shard_info.text_offset)
    )

    actual = documentai.Document.from_json(json.dumps(doc_dict))
    # Each expected value is the shard-local index plus the shard's
    # text_offset of 4350 (e.g. 4616 == 266 + 4350, 4358 == 8 + 4350).
    assert actual.entities[0].text_anchor.text_segments[0].start_index == 4616
    assert actual.entities[0].text_anchor.text_segments[0].end_index == 4622
    assert actual.entities[0].text_anchor.text_segments[3].start_index == 4634
    assert actual.entities[0].text_anchor.text_segments[3].end_index == 4640

    assert (
        actual.pages[0].blocks[0].layout.text_anchor.text_segments[0].start_index
    ) == 4350
    assert (
        actual.pages[0].blocks[0].layout.text_anchor.text_segments[0].end_index == 4358
    )
