feat: Add export merged sharded Document proto #145

Merged · 22 commits from `export-doc` into `main` · Aug 9, 2023

Changes from 17 commits

Commits
cf547fe
feat: Add export merged sharded Document proto
holtskinner Jul 20, 2023
62d7758
Merge branch 'main' into export-doc
holtskinner Jul 21, 2023
5511a33
fix: Refactor `_apply_text_offset()` to use original implementation with…
holtskinner Jul 21, 2023
25c6af4
chore: Update min python client library for documentai
holtskinner Jul 24, 2023
6990662
Update test constraints
holtskinner Jul 24, 2023
85a1370
fix: Change test to not include indent
holtskinner Jul 24, 2023
dde22b9
fix: merge_document_shards_sample_test
holtskinner Jul 25, 2023
4a90472
Merge branch 'main' into export-doc
holtskinner Jul 25, 2023
b91559a
Merge branch 'main' into export-doc
holtskinner Jul 27, 2023
88fc3b9
Merge branch 'main' into export-doc
holtskinner Jul 31, 2023
4ad742f
fix: Address lint error for type checking
holtskinner Jul 31, 2023
98afe85
Merge branch 'main' into export-doc
holtskinner Aug 1, 2023
01928c8
Merge branch 'main' into export-doc
holtskinner Aug 3, 2023
85bb259
Fix lint error for incorrect typing
holtskinner Aug 8, 2023
2c12081
Rename `to_documentai_document` to `to_merged_documentai_document`
holtskinner Aug 8, 2023
9312153
Change `to_merged_documentai_document()` to use a deepcopy instead of…
holtskinner Aug 8, 2023
519e678
Merge branch 'main' into export-doc
holtskinner Aug 8, 2023
91e9988
Add more specific type annotation to `_apply_text_offset()`
holtskinner Aug 9, 2023
c3da29a
fix: Fixed how template files are included in the library
holtskinner Aug 9, 2023
ea34456
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Aug 9, 2023
3ef91ca
refactor: Updated `from_document_path()` to additionally support dire…
holtskinner Aug 9, 2023
6f26f10
fix: Fix type annotation
holtskinner Aug 9, 2023
82 changes: 66 additions & 16 deletions google/cloud/documentai_toolbox/wrappers/document.py
@@ -15,6 +15,7 @@
#
"""Wrappers for Document AI Document type."""

import copy
import dataclasses
import os
import re
@@ -23,6 +24,7 @@
from google.api_core.client_options import ClientOptions
from google.cloud.vision import AnnotateFileResponse
from google.longrunning.operations_pb2 import GetOperationRequest, Operation

from jinja2 import Environment, PackageLoader
from pikepdf import Pdf

@@ -284,6 +286,37 @@ def _dict_to_bigquery(
)


def _apply_text_offset(documentai_object: object, text_offset: int) -> None:
r"""Applies a text offset to all text_segments in `documentai_object`.

Args:
documentai_object (object):
Required. Document AI object to apply `text_offset` to.
text_offset (int):
Required. Text offset to apply. From `Document.shard_info.text_offset`.
Returns:
None

"""
if isinstance(documentai_object, dict):
for key, value in documentai_object.items():
if key == "text_segments":
documentai_object[key] = [
{
"start_index": int(text_segment.get("start_index", 0))
+ text_offset,
"end_index": int(text_segment.get("end_index", 0))
+ text_offset,
}
for text_segment in value
]
else:
_apply_text_offset(value, text_offset)
elif isinstance(documentai_object, list):
for item in documentai_object:
_apply_text_offset(item, text_offset)

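Reviewer note: a quick illustration of what the recursion above does, using toy data (the shapes here are assumptions for illustration — `_apply_text_offset` operates on the dict form of a shard, e.g. the output of `documentai.Document.to_dict()`):

    # Hypothetical shard fragment with one text segment covering [0, 6).
    shard_dict = {
        "entities": [
            {"text_anchor": {"text_segments": [{"start_index": 0, "end_index": 6}]}}
        ]
    }
    _apply_text_offset(shard_dict, text_offset=4350)
    # The function recursed through "entities" and "text_anchor" and rewrote
    # the segment, which now points into the merged text:
    # {"start_index": 4350, "end_index": 4356}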

@dataclasses.dataclass
class Document:
r"""Represents a wrapped `Document`.
@@ -422,7 +455,7 @@ def from_gcs(
@classmethod
def from_batch_process_metadata(
cls: Type["Document"], metadata: documentai.BatchProcessMetadata
-    ) -> "Document":
+    ) -> List["Document"]:
r"""Loads Documents from Cloud Storage, using the output from `BatchProcessMetadata`.

.. code-block:: python
@@ -444,26 +477,18 @@ def from_batch_process_metadata(
if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
raise ValueError(f"Batch Process Failed: {metadata.state_message}")

-        documents: List[Document] = []
-        # Each process corresponds to one input document
-        for process in list(metadata.individual_process_statuses):
-            # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
-            gcs_bucket_name, gcs_prefix = gcs_utilities.split_gcs_uri(
-                process.output_gcs_destination
-            )
-
-            documents.append(
-                Document.from_gcs(
-                    gcs_bucket_name, gcs_prefix, gcs_input_uri=process.input_gcs_source
-                )
-            )
-
-        return documents
+        return [
+            Document.from_gcs(
+                *gcs_utilities.split_gcs_uri(process.output_gcs_destination),
+                gcs_input_uri=process.input_gcs_source,
+            )
+            for process in list(metadata.individual_process_statuses)
+        ]
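Reviewer note: as the removed lines show, `gcs_utilities.split_gcs_uri()` returns a `(bucket, prefix)` tuple, so the `*` unpacking passes both positionally. A short sketch with placeholder URIs:

    bucket, prefix = gcs_utilities.split_gcs_uri("gs://bucket/prefix/123/0/")
    # bucket == "bucket", prefix == "prefix/123/0/"
    docs = Document.from_gcs(bucket, prefix, gcs_input_uri="gs://bucket/input.pdf")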

@classmethod
def from_batch_process_operation(
cls: Type["Document"], location: str, operation_name: str
-    ) -> "Document":
+    ) -> List["Document"]:
r"""Loads Documents from Cloud Storage, using the operation name returned from `batch_process_documents()`.

.. code-block:: python
@@ -771,3 +796,28 @@ def export_hocr_str(self, title: str) -> str:
template = environment.get_template("hocr_document_template.xml.j2")
content = template.render(pages=self.pages, title=title)
return content

def to_merged_documentai_document(self) -> documentai.Document:
r"""Exports a documentai.Document from the wrapped document with shards merged.

Args:
None.
Returns:
documentai.Document:
Document with all shards merged and text offsets applied.
"""
if len(self.shards) == 1:
return self.shards[0]

merged_document = documentai.Document(text=self.text, pages=[], entities=[])
for shard in self.shards:
modified_shard = copy.deepcopy(shard)

_apply_text_offset(
documentai_object=modified_shard,
text_offset=int(modified_shard.shard_info.text_offset),
)
merged_document.pages.extend(modified_shard.pages)
merged_document.entities.extend(modified_shard.entities)

return merged_document
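Reviewer note: going by commit 9312153 ("use a deepcopy instead of…"), offsetting a copy rather than the cached shard keeps the wrapper reusable. A hedged sketch of the intended behavior (names are illustrative):

    wrapped = document.Document.from_gcs(gcs_bucket_name="bucket", gcs_prefix="prefix/")
    first = wrapped.to_merged_documentai_document()
    second = wrapped.to_merged_documentai_document()
    # Each call applies offsets to a fresh copy, not to self.shards, so
    # repeated calls should not stack text offsets onto the wrapped shards.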
44 changes: 44 additions & 0 deletions samples/snippets/merge_document_shards_sample.py
@@ -0,0 +1,44 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


# [START documentai_toolbox_merge_document_shards]

from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto stored at gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# output_file_name = "path/to/folder/file.json"


def merge_document_shards_sample(
gcs_bucket_name: str, gcs_prefix: str, output_file_name: str
) -> None:
wrapped_document = document.Document.from_gcs(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)

merged_document = wrapped_document.to_merged_documentai_document()

with open(output_file_name, "w") as f:
f.write(documentai.Document.to_json(merged_document))

print(f"Document with {len(wrapped_document.shards)} shards successfully merged.")


# [END documentai_toolbox_merge_document_shards]
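A minimal invocation of the sample, reusing the placeholder values from the TODO block above:

    merge_document_shards_sample(
        gcs_bucket_name="bucket",
        gcs_prefix="path/to/folder",
        output_file_name="path/to/folder/file.json",
    )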
2 changes: 1 addition & 1 deletion samples/snippets/requirements.txt
@@ -1,4 +1,4 @@
google-cloud-bigquery==3.11.4
google-cloud-documentai==2.18.0
google-cloud-storage==2.10.0
-google-cloud-documentai-toolbox==0.4.1a0
+google-cloud-documentai-toolbox==0.9.0a0
45 changes: 45 additions & 0 deletions samples/snippets/test_merge_document_shards_sample.py
@@ -0,0 +1,45 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil

import pytest
from samples.snippets import merge_document_shards_sample

gcs_bucket_name = "documentai_toolbox_samples"
gcs_prefix = "output/987654321/1"
output_dir = "resources/output/"
output_path = f"{output_dir}merged_document.json"


def test_merge_document_shards_sample(capsys: pytest.CaptureFixture) -> None:
if os.path.exists(output_dir):
shutil.rmtree(output_dir)

os.makedirs(output_dir)

merge_document_shards_sample.merge_document_shards_sample(
gcs_bucket_name=gcs_bucket_name,
gcs_prefix=gcs_prefix,
output_file_name=output_path,
)

out, _ = capsys.readouterr()

assert "Document with 5 shards successfully merged." in out

assert os.path.exists(output_dir)
shutil.rmtree(output_dir)
2 changes: 1 addition & 1 deletion setup.py
@@ -51,7 +51,7 @@
"proto-plus >= 1.22.2, <2.0.0dev; python_version>='3.11'",
"grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
"google-cloud-bigquery >= 3.5.0, < 4.0.0dev",
"google-cloud-documentai >= 1.2.1, < 3.0.0dev",
"google-cloud-documentai >= 2.17.0, < 3.0.0dev",
"google-cloud-storage >= 1.31.0, < 3.0.0dev",
"google-cloud-vision >= 2.7.0, < 4.0.0dev ",
"numpy >= 1.18.1",
2 changes: 1 addition & 1 deletion testing/constraints-3.7.txt
@@ -10,7 +10,7 @@ pandas==1.0.0
proto-plus==1.22.0
grpc-google-iam-v1==0.12.4
google-cloud-bigquery==3.5.0
-google-cloud-documentai==2.12.0
+google-cloud-documentai==2.17.0
google-cloud-storage==2.7.0
numpy==1.18.1
pikepdf==6.2.9
1 change: 1 addition & 0 deletions tests/unit/resources/merged_document/merged_shards.json

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions tests/unit/test_document.py
@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil

@@ -658,3 +659,58 @@ def test_export_hocr_str():
expected = f.read()

assert actual_hocr == expected


def test_document_to_merged_documentai_document(get_bytes_multiple_files_mock):
wrapped_document = document.Document.from_gcs(
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/"
)
get_bytes_multiple_files_mock.assert_called_once()

actual = documentai.Document.to_json(
wrapped_document.to_merged_documentai_document()
)
with open("tests/unit/resources/merged_document/merged_shards.json", "r") as f:
merged_document = documentai.Document.from_json(f.read())
expected = documentai.Document.to_json(merged_document)

assert actual == expected


def test_document_to_merged_documentai_document_one_shard():
path = "tests/unit/resources/0/toolbox_invoice_test-0.json"

with open(path, "r", encoding="utf-8") as f:
documentai_document = documentai.Document.from_json(f.read())

wrapped_document = document.Document.from_documentai_document(documentai_document)
actual = wrapped_document.to_merged_documentai_document()

assert actual == documentai_document


def test_apply_text_offset():
path = "tests/unit/resources/1/toolbox_large_document_test-1.json"
with open(path, "r", encoding="utf-8") as f:
content = f.read()
documentai_document = documentai.Document.from_json(content)

assert documentai_document.shard_info.text_offset == 4350

doc_dict = documentai.Document.to_dict(documentai_document)
document._apply_text_offset(
doc_dict, int(documentai_document.shard_info.text_offset)
)

actual = documentai.Document.from_json(json.dumps(doc_dict))
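    # Sanity arithmetic for the assertions below: 4616 - 4350 == 266 and
    # 4622 - 4350 == 272, i.e. the fixture's original segment indices
    # shifted by shard_info.text_offset.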
assert actual.entities[0].text_anchor.text_segments[0].start_index == 4616
assert actual.entities[0].text_anchor.text_segments[0].end_index == 4622
assert actual.entities[0].text_anchor.text_segments[3].start_index == 4634
assert actual.entities[0].text_anchor.text_segments[3].end_index == 4640

assert (
actual.pages[0].blocks[0].layout.text_anchor.text_segments[0].start_index
) == 4350
assert (
actual.pages[0].blocks[0].layout.text_anchor.text_segments[0].end_index == 4358
)