Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add export merged sharded Document proto #145

Merged
merged 22 commits into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
cf547fe
feat: Add export merged sharded Document proto
holtskinner Jul 20, 2023
62d7758
Merge branch 'main' into export-doc
holtskinner Jul 21, 2023
5511a33
fix: Refactor `_apply_text_offset()` to use original implementation with…
holtskinner Jul 21, 2023
25c6af4
chore: Update min python client library for documentai
holtskinner Jul 24, 2023
6990662
Update test constraints
holtskinner Jul 24, 2023
85a1370
fix: Change test to not include indent
holtskinner Jul 24, 2023
dde22b9
fix: merge_document_shards_sample_test
holtskinner Jul 25, 2023
4a90472
Merge branch 'main' into export-doc
holtskinner Jul 25, 2023
b91559a
Merge branch 'main' into export-doc
holtskinner Jul 27, 2023
88fc3b9
Merge branch 'main' into export-doc
holtskinner Jul 31, 2023
4ad742f
fix: Address lint error for type checking
holtskinner Jul 31, 2023
98afe85
Merge branch 'main' into export-doc
holtskinner Aug 1, 2023
01928c8
Merge branch 'main' into export-doc
holtskinner Aug 3, 2023
85bb259
Fix lint error for incorrect typing
holtskinner Aug 8, 2023
2c12081
Rename `to_documentai_document` to `to_merged_documentai_document`
holtskinner Aug 8, 2023
9312153
Change `to_merged_documentai_document()` to use a deepcopy instead of…
holtskinner Aug 8, 2023
519e678
Merge branch 'main' into export-doc
holtskinner Aug 8, 2023
91e9988
Add more specific type annotation to `_apply_text_offset()`
holtskinner Aug 9, 2023
c3da29a
fix: Fixed how template files are included in the library
holtskinner Aug 9, 2023
ea34456
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Aug 9, 2023
3ef91ca
refactor: Updated `from_document_path()` to additionally support dire…
holtskinner Aug 9, 2023
6f26f10
fix: Fix type annotation
holtskinner Aug 9, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
54 changes: 54 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,37 @@ def _dict_to_bigquery(
)


def _apply_text_offset(documentai_object: object, text_offset: int) -> None:
holtskinner marked this conversation as resolved.
Show resolved Hide resolved
r"""Applies a text offset to all text_segments in `documentai_object`.

Args:
documentai_object (object):
Required. Document AI object to apply `text_offset` to.
text_offset (int):
Required. Text offset to apply. From `Document.shard_info.text_offset`.
Returns:
None

"""
if isinstance(documentai_object, dict):
for key, value in documentai_object.items():
if key == "text_segments":
documentai_object[key] = [
{
"start_index": int(text_segment.get("start_index", 0))
+ text_offset,
"end_index": int(text_segment.get("end_index", 0))
+ text_offset,
}
for text_segment in value
]
else:
_apply_text_offset(value, text_offset)
elif isinstance(documentai_object, list):
for item in documentai_object:
_apply_text_offset(item, text_offset)


@dataclasses.dataclass
class Document:
r"""Represents a wrapped `Document`.
Expand Down Expand Up @@ -771,3 +802,26 @@ def export_hocr_str(self, title: str) -> str:
template = environment.get_template("hocr_document_template.xml.j2")
content = template.render(pages=self.pages, title=title)
return content

def to_documentai_document(self) -> documentai.Document:
    r"""Exports a documentai.Document from the wrapped document with shards merged.

    Args:
        None.
    Returns:
        documentai.Document:
            Document with all shards merged and text offsets applied.
    """
    if len(self.shards) == 1:
        # Single shard: nothing to merge, no offsets to apply.
        return self.shards[0]

    # Local import keeps the module import block untouched.
    import copy

    merged_document = documentai.Document(text=self.text, pages=[], entities=[])
    for shard in self.shards:
        # Deep-copy before shifting so the wrapped shards are never mutated;
        # otherwise calling this method twice would re-apply the offsets.
        shard_copy = copy.deepcopy(shard)
        # NOTE(review): `_apply_text_offset` traverses dicts/lists, while a
        # shard is a proto-plus message — confirm the offset is actually
        # applied to the copied shard's text segments.
        _apply_text_offset(
            documentai_object=shard_copy,
            text_offset=int(shard_copy.shard_info.text_offset),
        )
        merged_document.pages.extend(shard_copy.pages)
        merged_document.entities.extend(shard_copy.entities)

    return merged_document
44 changes: 44 additions & 0 deletions samples/snippets/merge_document_shards_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


# [START documentai_toolbox_merge_document_shards]

from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# output_file_name = "path/to/folder/file.json"


def merge_document_shards_sample(
    gcs_bucket_name: str, gcs_prefix: str, output_file_name: str
) -> None:
    """Merges sharded Document protos from GCS and writes the result as JSON.

    Args:
        gcs_bucket_name: GCS bucket holding the (possibly sharded) document output.
        gcs_prefix: Path within the bucket to the folder containing the shards.
        output_file_name: Local path the merged JSON document is written to.
    Returns:
        None. Prints a summary of how many shards were merged.
    """
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
    )

    merged_document = wrapped_document.to_documentai_document()

    # Explicit UTF-8 so the JSON output is stable regardless of platform locale.
    with open(output_file_name, "w", encoding="utf-8") as f:
        f.write(documentai.Document.to_json(merged_document))

    print(f"Document with {len(wrapped_document.shards)} shards successfully merged.")


# [END documentai_toolbox_merge_document_shards]
2 changes: 1 addition & 1 deletion samples/snippets/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
google-cloud-bigquery==3.11.4
google-cloud-documentai==2.18.0
google-cloud-storage==2.10.0
google-cloud-documentai-toolbox==0.4.1a0
google-cloud-documentai-toolbox==0.9.0a0
45 changes: 45 additions & 0 deletions samples/snippets/test_merge_document_shards_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil

import pytest
from samples.snippets import merge_document_shards_sample

gcs_bucket_name = "documentai_toolbox_samples"
gcs_prefix = "output/987654321/1"
output_dir = "resources/output/"
output_path = f"{output_dir}merged_document.json"


def test_merge_document_shards_sample(capsys: pytest.CaptureFixture) -> None:
    """End-to-end check: merges the fixture shards and writes the merged JSON file."""
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    os.makedirs(output_dir)
    # try/finally so the output directory is removed even when an assert fails.
    try:
        merge_document_shards_sample.merge_document_shards_sample(
            gcs_bucket_name=gcs_bucket_name,
            gcs_prefix=gcs_prefix,
            output_file_name=output_path,
        )

        out, _ = capsys.readouterr()

        assert "Document with 5 shards successfully merged." in out

        # Assert on the merged file itself; the directory was created by this
        # test, so its existence alone proves nothing about the sample.
        assert os.path.exists(output_path)
    finally:
        shutil.rmtree(output_dir)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"proto-plus >= 1.22.2, <2.0.0dev; python_version>='3.11'",
"grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
"google-cloud-bigquery >= 3.5.0, < 4.0.0dev",
"google-cloud-documentai >= 1.2.1, < 3.0.0dev",
"google-cloud-documentai >= 2.17.0, < 3.0.0dev",
"google-cloud-storage >= 1.31.0, < 3.0.0dev",
"google-cloud-vision >= 2.7.0, < 4.0.0dev ",
"numpy >= 1.18.1",
Expand Down
2 changes: 1 addition & 1 deletion testing/constraints-3.7.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ pandas==1.0.0
proto-plus==1.22.0
grpc-google-iam-v1==0.12.4
google-cloud-bigquery==3.5.0
google-cloud-documentai==2.12.0
google-cloud-documentai==2.17.0
google-cloud-storage==2.7.0
numpy==1.18.1
pikepdf==6.2.9
1 change: 1 addition & 0 deletions tests/unit/resources/merged_document/merged_shards.json

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions tests/unit/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil

Expand Down Expand Up @@ -658,3 +659,56 @@ def test_export_hocr_str():
expected = f.read()

assert actual_hocr == expected


def test_document_to_documentai_document(get_bytes_multiple_files_mock):
    """Merging a multi-shard document matches the golden merged_shards.json."""
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/"
    )
    get_bytes_multiple_files_mock.assert_called_once()

    # Round-trip the golden file through the proto so both sides are
    # serialized identically before comparison.
    with open("tests/unit/resources/merged_document/merged_shards.json", "r") as f:
        golden = documentai.Document.from_json(f.read())
    expected = documentai.Document.to_json(golden)

    actual = documentai.Document.to_json(wrapped_document.to_documentai_document())

    assert actual == expected


def test_document_to_documentai_document_one_shard():
    """A single-shard document is exported unchanged (no merging needed)."""
    source_path = "tests/unit/resources/0/toolbox_invoice_test-0.json"

    with open(source_path, "r", encoding="utf-8") as shard_file:
        original = documentai.Document.from_json(shard_file.read())

    wrapped = document.Document.from_documentai_document(original)

    assert wrapped.to_documentai_document() == original


def test_apply_text_offset():
    """The shard_info offset is added to every text segment in the shard dict."""
    shard_path = "tests/unit/resources/1/toolbox_large_document_test-1.json"
    with open(shard_path, "r", encoding="utf-8") as shard_file:
        documentai_document = documentai.Document.from_json(shard_file.read())

    offset = int(documentai_document.shard_info.text_offset)
    assert offset == 4350

    doc_dict = documentai.Document.to_dict(documentai_document)
    document._apply_text_offset(doc_dict, offset)

    # Rehydrate through JSON so assertions run against proto field values.
    shifted = documentai.Document.from_json(json.dumps(doc_dict))

    entity_segments = shifted.entities[0].text_anchor.text_segments
    assert entity_segments[0].start_index == 4616
    assert entity_segments[0].end_index == 4622
    assert entity_segments[3].start_index == 4634
    assert entity_segments[3].end_index == 4640

    block_segment = shifted.pages[0].blocks[0].layout.text_anchor.text_segments[0]
    assert block_segment.start_index == 4350
    assert block_segment.end_index == 4358