Skip to content

Commit

Permalink
Powerpoint preview (#2598)
Browse files Browse the repository at this point in the history
  • Loading branch information
sir-sigurd committed Jan 25, 2022
1 parent 93f8a7b commit 1f50a2b
Show file tree
Hide file tree
Showing 11 changed files with 131 additions and 116 deletions.
2 changes: 1 addition & 1 deletion docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
* [Added] Add object-level metadata editor and move package metadata editor to popup ([#2510](https://github.com/quiltdata/quilt/pull/2510/))
* [Added] Video previews ([#2540](https://github.com/quiltdata/quilt/pull/2540))
* [Added] Audio previews ([#2547](https://github.com/quiltdata/quilt/pull/2547))
* [Added] Powerpoint (`.pptx`) preview ([#2626](https://github.com/quiltdata/quilt/pull/2626))
* [Added] PowerPoint (`.pptx`) preview ([#2598](https://github.com/quiltdata/quilt/pull/2598), [#2626](https://github.com/quiltdata/quilt/pull/2626))

# 3.6.0 - 2021-10-15
## Python API
Expand Down
9 changes: 9 additions & 0 deletions lambdas/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
**/*.pyc
**/__pycache__/
**/venv/
**/tests/
**/.pytest_cache/
**/*.egg-info/
**/build/
**/*Dockerfile
.dockerignore
2 changes: 1 addition & 1 deletion lambdas/shared/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
fcsparser==0.2.1
numpy==1.19.5
numpy==1.21.2
openpyxl==3.0.7
pandas==1.1.5
psutil==5.7.0
Expand Down
41 changes: 21 additions & 20 deletions lambdas/thumbnail/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
# The bare minimum Dockerfile to install aicsimageio==3.0.* and all its dependencies
FROM amazon/aws-lambda-python:3.8 as base

# Must start from this image for lambda support
FROM amazonlinux:2018.03
FROM base as pyenv_builder
RUN yum -y install gcc
COPY shared/requirements.txt /requirements/shared.txt
COPY thumbnail/requirements.txt /requirements/thumbnail.txt

# Install python
RUN yum install python36-devel.x86_64 -y
RUN yum install python36-pip.noarch -y
RUN pip install -U pip setuptools
RUN pip install --target /deps -r /requirements/shared.txt -r /requirements/thumbnail.txt

# Get gcc for Cython
RUN yum install gcc.noarch -y
COPY shared/ /src/shared/
COPY thumbnail/ /src/thumbnail/
RUN pip install --target /lambda --no-deps /src/shared/ /src/thumbnail/

# Set python3.6 to default python3
RUN ln -sf /usr/bin/python3.6 /usr/bin/python3
RUN ln -sf /usr/bin/pip-3.6 /usr/bin/pip3

# Get requirements file
COPY requirements.txt requirements.txt
FROM base
RUN yum -y install amazon-linux-extras && \
# amazon-linux-extras command is broken by the base image.
python2 -m amazon_linux_extras enable libreoffice && \
yum -y install libreoffice-impress poppler-utils && \
yum -y remove amazon-linux-extras && \
yum clean all

# Update pip
RUN pip3 install --upgrade pip
# Install Python environment.
COPY --from=pyenv_builder /deps/ $LAMBDA_TASK_ROOT
COPY --from=pyenv_builder /lambda/ $LAMBDA_TASK_ROOT

# Get Cython for numpy
RUN pip3 install Cython

# Install package
RUN pip3 install -r requirements.txt
CMD ["index.lambda_handler"]
49 changes: 32 additions & 17 deletions lambdas/thumbnail/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
import base64
import json
import os
import subprocess
import sys
import tempfile
from io import BytesIO
from math import sqrt
from typing import List, Tuple
Expand Down Expand Up @@ -65,7 +67,7 @@
'enum': list(SIZE_PARAMETER_MAP)
},
'input': {
'enum': ['pdf']
'enum': ['pdf', 'pptx']
},
'output': {
'enum': ['json', 'raw']
Expand Down Expand Up @@ -230,17 +232,26 @@ def format_aicsimage_to_prepped(img: AICSImage) -> np.ndarray:
return img.reader.data


def set_pdf_env():
    """Set env vars to support PDF binary, library, and font discovery.

    Appends the ``quilt_binaries/usr/bin`` and ``quilt_binaries/usr/lib64``
    directories under ``LAMBDA_TASK_ROOT`` to ``PATH`` and
    ``LD_LIBRARY_PATH`` respectively, and points ``FONTCONFIG_FILE`` at
    ``quilt_binaries/fonts/fonts.conf``.

    See https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html

    Raises:
        KeyError: if ``LAMBDA_TASK_ROOT`` is not set.
    """
    prefix = 'quilt_binaries'
    lambda_root = os.environ["LAMBDA_TASK_ROOT"]

    def _append_search_dir(var_name, directory):
        # Append `directory` to a pathsep-separated variable.  An unset or
        # empty variable is replaced outright: `+=` would raise KeyError when
        # the variable is missing, and joining onto an empty value would leave
        # a leading separator, i.e. an empty search-path entry (commonly
        # interpreted as the current working directory).
        current = os.environ.get(var_name)
        os.environ[var_name] = (
            os.pathsep.join((current, directory)) if current else directory
        )

    # binaries
    _append_search_dir("PATH", os.path.join(lambda_root, prefix, 'usr', 'bin'))
    # libs
    _append_search_dir("LD_LIBRARY_PATH", os.path.join(lambda_root, prefix, 'usr', 'lib64'))
    # fonts (always overwritten)
    os.environ["FONTCONFIG_FILE"] = os.path.join(lambda_root, prefix, 'fonts', 'fonts.conf')
def pptx_to_pdf(src: bytes) -> bytes:
    """Convert the bytes of a ``.pptx`` file into the bytes of a PDF.

    The input is written to a temporary directory, converted by invoking the
    ``soffice`` executable with ``--convert-to pdf``, and the resulting file
    is read back.  The temporary directory is removed before returning.

    Args:
        src: raw contents of the ``.pptx`` file.

    Returns:
        Raw contents of the PDF written by ``soffice``.

    Raises:
        subprocess.CalledProcessError: if ``soffice`` exits with a non-zero
            status.
    """
    stem = "file"
    target_ext = "pdf"

    with tempfile.TemporaryDirectory() as work_dir:
        pptx_path = os.path.join(work_dir, f"{stem}.pptx")
        pdf_path = os.path.join(work_dir, f"{stem}.{target_ext}")

        # "x" = exclusive creation: fail rather than overwrite an existing file.
        with open(pptx_path, "xb") as pptx_file:
            pptx_file.write(src)

        # This is needed because LibreOffice writes some stuff to $HOME/.config,
        # so HOME is redirected into the scratch directory for the child only.
        child_env = dict(os.environ)
        child_env["HOME"] = work_dir

        command = ("soffice", "--convert-to", target_ext, "--outdir", work_dir, pptx_path)
        subprocess.run(command, check=True, env=child_env)

        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

    return pdf_bytes


@api(cors_origins=get_default_origins())
Expand All @@ -267,18 +278,22 @@ def lambda_handler(request):
}
return make_json_response(resp.status_code, ret_val)

src_bytes = resp.content
if input_ == "pptx":
src_bytes = pptx_to_pdf(src_bytes)
input_ = "pdf"

try:
thumbnail_format = SUPPORTED_BROWSER_FORMATS.get(
imageio.get_reader(resp.content),
imageio.get_reader(src_bytes),
"PNG"
)
except ValueError:
thumbnail_format = "JPEG" if input_ == "pdf" else "PNG"
if input_ == "pdf":
set_pdf_env()
try:
pages = convert_from_bytes(
resp.content,
src_bytes,
# respect width but not necessarily height to preserve aspect ratio
size=(size[0], None),
fmt="JPEG",
Expand All @@ -300,14 +315,14 @@ def lambda_handler(request):
'thumbnail_size': preview.size,
}
if count_pages:
info['page_count'] = pdf2image.pdfinfo_from_bytes(resp.content)["Pages"]
info['page_count'] = pdf2image.pdfinfo_from_bytes(src_bytes)["Pages"]

thumbnail_bytes = BytesIO()
preview.save(thumbnail_bytes, thumbnail_format)
data = thumbnail_bytes.getvalue()
else:
# Read image data
img = AICSImage(resp.content)
img = AICSImage(src_bytes)
orig_size = list(img.reader.data.shape)
# Generate a formatted ndarray using the image data
# Makes some assumptions for n-dim data
Expand Down
4 changes: 0 additions & 4 deletions lambdas/thumbnail/quilt_binaries.json

This file was deleted.

20 changes: 19 additions & 1 deletion lambdas/thumbnail/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,26 @@ def pytest_addoption(parser):
default=False,
help="Indicates poppler tools (incl. pdftoppm) installed"
)
parser.addoption(
'--loffice',
action='store_true',
dest='loffice',
default=False,
help="Indicates LibreOffice installed"
)


def pytest_configure(config):
    """Deselect tests that need external tools not declared as installed.

    Tests marked ``poppler`` / ``loffice`` are excluded unless the matching
    ``--poppler`` / ``--loffice`` command-line flag was given.  The exclusion
    is expressed through the ``-m`` mark expression (``config.option.markexpr``).

    A mark expression supplied by the user is preserved: it is AND-ed with the
    exclusions instead of being overwritten, and is left untouched when
    nothing has to be excluded.
    """
    markers_to_exclude = []

    if not config.option.poppler:
        markers_to_exclude.append('poppler')

    if not config.option.loffice:
        markers_to_exclude.append('loffice')

    if not markers_to_exclude:
        # Every optional tool is available; keep whatever `-m` the user passed.
        return

    exclusions = ' and '.join(f'not {m}' for m in markers_to_exclude)
    user_expr = getattr(config.option, 'markexpr', '')
    # Parenthesise the user's expression so an `or` inside it cannot bypass
    # the exclusions through operator precedence.
    config.option.markexpr = f'({user_expr}) and {exclusions}' if user_expr else exclusions
Binary file added lambdas/thumbnail/tests/data/pptx/in.pptx
Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
120 changes: 48 additions & 72 deletions lambdas/thumbnail/tests/test_thumbnail.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import base64
import json
import os
from io import BytesIO
from pathlib import Path
from unittest.mock import patch

import numpy as np
import pytest
Expand Down Expand Up @@ -109,6 +107,18 @@ def test_403():
"pdf-page4-1024w.jpeg", None, [1024, 1450], 8, 200,
marks=pytest.mark.poppler
),
pytest.param(
"pptx/in.pptx",
{"size": "w1024h768", "input": "pptx", "page": "1", "countPages": "true"},
"pptx/out-page1-1024w.jpeg", None, [1024, 1450], 2, 200,
marks=(pytest.mark.poppler, pytest.mark.loffice),
),
pytest.param(
"pptx/in.pptx",
{"size": "w1024h768", "input": "pptx", "page": "2", "countPages": "true"},
"pptx/out-page2-1024w.jpeg", None, [1024, 1450], 2, 200,
marks=(pytest.mark.poppler, pytest.mark.loffice),
),
]
)
def test_generate_thumbnail(
Expand All @@ -121,74 +131,40 @@ def test_generate_thumbnail(
num_pages,
status
):
# don't actually modify the environment in tests
with patch.object(index, 'set_pdf_env', return_value=None) as set_env:
# Resolve the input file path
input_file = data_dir / input_file
# Mock the request
url = f"https://example.com/{input_file}"
responses.add(
responses.GET,
url=url,
body=input_file.read_bytes(),
status=200
)
# Create the lambda request event
event = _make_event({"url": url, **params})
# Get the response
response = index.lambda_handler(event, None)
# Assert the request was handled with no errors
assert response["statusCode"] == 200, f"response: {response}"
# only check the body and expected image if it's a successful call
# Parse the body / the returned thumbnail
body = json.loads(read_body(response))
# Assert basic metadata was filled properly
assert body["info"]["thumbnail_size"] == expected_thumb_size
if expected_original_size: # PDFs don't have an expected size
assert body["info"]["original_size"] == expected_original_size
if "countPages" in params:
assert body["info"]["page_count"] == num_pages
# Assert the produced image is the same as the expected
if params.get('input') == 'pdf':
actual = Image.open(BytesIO(base64.b64decode(body['thumbnail'])))
expected = Image.open(data_dir / expected_thumb)
actual_array = np.array(actual)
expected_array = np.array(expected)
assert set_env.call_count == 1
assert actual_array.shape == expected_array.shape
assert np.allclose(expected_array, actual_array, atol=15, rtol=0.1)
else:
actual = AICSImage(base64.b64decode(body['thumbnail'])).reader.data
expected = AICSImage(data_dir / expected_thumb).reader.data
assert np.array_equal(actual, expected)


@patch.dict(os.environ, {
'LAMBDA_TASK_ROOT': str(Path('/var/task')),
'PATH': str(Path('/one/two')) + os.pathsep + str(Path('/three')),
'LD_LIBRARY_PATH': str(Path('/lib64')) + os.pathsep + str(Path('/usr/lib64')),
# set_pdf_env() will blow this away
# it's only here to prevent side-effects on the test host
'FONTCONFIG_FILE': ''
})
def test_pdf_env():
"""test that env vars are set so that poppler, pdf2image work properly"""
index.set_pdf_env()
assert os.environ.get('FONTCONFIG_FILE') == os.path.join(
os.environ.get('LAMBDA_TASK_ROOT'),
'quilt_binaries',
'fonts',
'fonts.conf',
# Resolve the input file path
input_file = data_dir / input_file
# Mock the request
url = f"https://example.com/{input_file}"
responses.add(
responses.GET,
url=url,
body=input_file.read_bytes(),
status=200
)
assert os.environ.get('PATH') == os.pathsep.join([
str(Path('/one/two')),
str(Path('/three')),
str(Path('/var/task/quilt_binaries/usr/bin'))
])
assert os.environ.get('LD_LIBRARY_PATH') == os.pathsep.join([
str(Path('/lib64')),
str(Path('/usr/lib64')),
str(Path('/var/task/quilt_binaries/usr/lib64'))
])
# we should never mod this:
assert os.environ.get('LAMBDA_TASK_ROOT') == str(Path('/var/task'))
# Create the lambda request event
event = _make_event({"url": url, **params})
# Get the response
response = index.lambda_handler(event, None)
# Assert the request was handled with no errors
assert response["statusCode"] == 200, f"response: {response}"
# only check the body and expected image if it's a successful call
# Parse the body / the returned thumbnail
body = json.loads(read_body(response))
# Assert basic metadata was filled properly
assert body["info"]["thumbnail_size"] == expected_thumb_size
if expected_original_size: # PDFs don't have an expected size
assert body["info"]["original_size"] == expected_original_size
if "countPages" in params:
assert body["info"]["page_count"] == num_pages
# Assert the produced image is the same as the expected
if params.get('input') in ('pdf', "pptx"):
actual = Image.open(BytesIO(base64.b64decode(body['thumbnail'])))
expected = Image.open(data_dir / expected_thumb)
actual_array = np.array(actual)
expected_array = np.array(expected)
assert actual_array.shape == expected_array.shape
assert np.allclose(expected_array, actual_array, atol=15, rtol=0.1)
else:
actual = AICSImage(base64.b64decode(body['thumbnail'])).reader.data
expected = AICSImage(data_dir / expected_thumb).reader.data
assert np.array_equal(actual, expected)

0 comments on commit 1f50a2b

Please sign in to comment.