Powerpoint preview (#2598)

quiltdata · Jan 25, 2022 · 1f50a2b
1 parent 93f8a7b
commit 1f50a2b
Show file tree

Hide file tree

Showing 11 changed files with 131 additions and 116 deletions.
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -25,7 +25,7 @@
 * [Added] Add object-level metadata editor and move package metadata editor to popup ([#2510](https://github.com/quiltdata/quilt/pull/2510/))
 * [Added] Video previews ([#2540](https://github.com/quiltdata/quilt/pull/2540))
 * [Added] Audio previews ([#2547](https://github.com/quiltdata/quilt/pull/2547))
-* [Added] Powerpoint (`.pptx`) preview ([#2626](https://github.com/quiltdata/quilt/pull/2626))
+* [Added] Powerpoint (`.pptx`) preview ([#2598](https://github.com/quiltdata/quilt/pull/2598), [#2626](https://github.com/quiltdata/quilt/pull/2626))
 
 # 3.6.0 - 2021-10-15
 ## Python API

diff --git a/lambdas/.dockerignore b/lambdas/.dockerignore
@@ -0,0 +1,9 @@
+**/*.pyc
+**/__pycache__/
+**/venv/
+**/tests/
+**/.pytest_cache/
+**/*.egg-info/
+**/build/
+**/*Dockerfile
+.dockerignore
diff --git a/lambdas/shared/requirements.txt b/lambdas/shared/requirements.txt
@@ -1,5 +1,5 @@
 fcsparser==0.2.1
-numpy==1.19.5
+numpy==1.21.2
 openpyxl==3.0.7
 pandas==1.1.5
 psutil==5.7.0

diff --git a/lambdas/thumbnail/Dockerfile b/lambdas/thumbnail/Dockerfile
@@ -1,27 +1,28 @@
-# The bare minimum Dockerfile to install aicsimageio==3.0.* and all its dependencies
+FROM amazon/aws-lambda-python:3.8 as base
 
-# Must start from this image for lambda support
-FROM amazonlinux:2018.03
+FROM base as pyenv_builder
+RUN yum -y install gcc
+COPY shared/requirements.txt /requirements/shared.txt
+COPY thumbnail/requirements.txt /requirements/thumbnail.txt
 
-# Install python
-RUN yum install python36-devel.x86_64 -y
-RUN yum install python36-pip.noarch -y
+RUN pip install -U pip setuptools
+RUN pip install --target /deps -r /requirements/shared.txt -r /requirements/thumbnail.txt
 
-# Get gcc for Cython
-RUN yum install gcc.noarch -y
+COPY shared/ /src/shared/
+COPY thumbnail/ /src/thumbnail/
+RUN pip install --target /lambda --no-deps /src/shared/ /src/thumbnail/
 
-# Set python3.6 to default python3
-RUN ln -sf /usr/bin/python3.6 /usr/bin/python3
-RUN ln -sf /usr/bin/pip-3.6 /usr/bin/pip3
 
-# Get requirements file
-COPY requirements.txt requirements.txt
+FROM base
+RUN yum -y install amazon-linux-extras && \
+    # amazon-linux-extras command is broken by the base image.
+    python2 -m amazon_linux_extras enable libreoffice && \
+    yum -y install libreoffice-impress poppler-utils && \
+    yum -y remove amazon-linux-extras && \
+    yum clean all
 
-# Update pip
-RUN pip3 install --upgrade pip
+# Install Python environment.
+COPY --from=pyenv_builder /deps/ $LAMBDA_TASK_ROOT
+COPY --from=pyenv_builder /lambda/ $LAMBDA_TASK_ROOT
 
-# Get Cython for numpy
-RUN pip3 install Cython
-
-# Install package
-RUN pip3 install -r requirements.txt
+CMD ["index.lambda_handler"]
diff --git a/lambdas/thumbnail/index.py b/lambdas/thumbnail/index.py
@@ -9,7 +9,9 @@
 import base64
 import json
 import os
+import subprocess
 import sys
+import tempfile
 from io import BytesIO
 from math import sqrt
 from typing import List, Tuple
@@ -65,7 +67,7 @@
             'enum': list(SIZE_PARAMETER_MAP)
         },
         'input': {
-            'enum': ['pdf']
+            'enum': ['pdf', 'pptx']
         },
         'output': {
             'enum': ['json', 'raw']
@@ -230,17 +232,26 @@ def format_aicsimage_to_prepped(img: AICSImage) -> np.ndarray:
     return img.reader.data
 
 
-def set_pdf_env():
-    """set env vars to support PDF binary, library, font discovery
-    see https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html"""
-    prefix = 'quilt_binaries'
-    lambda_root = os.environ["LAMBDA_TASK_ROOT"]
-    # binaries
-    os.environ["PATH"] += os.pathsep + os.path.join(lambda_root, prefix, 'usr', 'bin')
-    # libs
-    os.environ["LD_LIBRARY_PATH"] += os.pathsep + os.path.join(lambda_root, prefix, 'usr', 'lib64')
-    # fonts
-    os.environ["FONTCONFIG_FILE"] = os.path.join(lambda_root, prefix, 'fonts', 'fonts.conf')
+def pptx_to_pdf(src: bytes) -> bytes:
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        file_name_base = "file"
+        output_ext = "pdf"
+        src_file_path = os.path.join(tmp_dir, f"{file_name_base}.pptx")
+        with open(src_file_path, "xb") as src_file:
+            src_file.write(src)
+
+        subprocess.run(
+            ("soffice", "--convert-to", output_ext, "--outdir", tmp_dir, src_file_path),
+            check=True,
+            env={
+                **os.environ,
+                # This is needed because LibreOffice writes some stuff to $HOME/.config.
+                "HOME": tmp_dir,
+            },
+        )
+
+        with open(os.path.join(tmp_dir, f"{file_name_base}.{output_ext}"), "rb") as out_file:
+            return out_file.read()
 
 
 @api(cors_origins=get_default_origins())
@@ -267,18 +278,22 @@ def lambda_handler(request):
         }
         return make_json_response(resp.status_code, ret_val)
 
+    src_bytes = resp.content
+    if input_ == "pptx":
+        src_bytes = pptx_to_pdf(src_bytes)
+        input_ = "pdf"
+
     try:
         thumbnail_format = SUPPORTED_BROWSER_FORMATS.get(
-            imageio.get_reader(resp.content),
+            imageio.get_reader(src_bytes),
             "PNG"
         )
     except ValueError:
         thumbnail_format = "JPEG" if input_ == "pdf" else "PNG"
     if input_ == "pdf":
-        set_pdf_env()
         try:
             pages = convert_from_bytes(
-                resp.content,
+                src_bytes,
                 # respect width but not necessarily height to preserve aspect ratio
                 size=(size[0], None),
                 fmt="JPEG",
@@ -300,14 +315,14 @@ def lambda_handler(request):
             'thumbnail_size': preview.size,
         }
         if count_pages:
-            info['page_count'] = pdf2image.pdfinfo_from_bytes(resp.content)["Pages"]
+            info['page_count'] = pdf2image.pdfinfo_from_bytes(src_bytes)["Pages"]
 
         thumbnail_bytes = BytesIO()
         preview.save(thumbnail_bytes, thumbnail_format)
         data = thumbnail_bytes.getvalue()
     else:
         # Read image data
-        img = AICSImage(resp.content)
+        img = AICSImage(src_bytes)
         orig_size = list(img.reader.data.shape)
         # Generate a formatted ndarray using the image data
         # Makes some assumptions for n-dim data

diff --git a/lambdas/thumbnail/quilt_binaries.json b/lambdas/thumbnail/quilt_binaries.json
diff --git a/lambdas/thumbnail/tests/conftest.py b/lambdas/thumbnail/tests/conftest.py
@@ -6,8 +6,26 @@ def pytest_addoption(parser):
         default=False,
         help="Indicates poppler tools (incl. pdftoppm) installed"
     )
+    parser.addoption(
+        '--loffice',
+        action='store_true',
+        dest='loffice',
+        default=False,
+        help="Indicates LibreOffice installed"
+    )
 
 
 def pytest_configure(config):
+    markers_to_exclude = []
+
     if not config.option.poppler:
-        setattr(config.option, 'markexpr', 'not poppler')
+        markers_to_exclude.append('poppler')
+
+    if not config.option.loffice:
+        markers_to_exclude.append('loffice')
+
+    setattr(
+        config.option,
+        'markexpr',
+        ' and '.join([f'not {m}' for m in markers_to_exclude])
+    )
diff --git a/lambdas/thumbnail/tests/data/pptx/in.pptx b/lambdas/thumbnail/tests/data/pptx/in.pptx
diff --git a/lambdas/thumbnail/tests/data/pptx/out-page1-1024w.jpeg b/lambdas/thumbnail/tests/data/pptx/out-page1-1024w.jpeg
diff --git a/lambdas/thumbnail/tests/data/pptx/out-page2-1024w.jpeg b/lambdas/thumbnail/tests/data/pptx/out-page2-1024w.jpeg
diff --git a/lambdas/thumbnail/tests/test_thumbnail.py b/lambdas/thumbnail/tests/test_thumbnail.py
@@ -1,9 +1,7 @@
 import base64
 import json
-import os
 from io import BytesIO
 from pathlib import Path
-from unittest.mock import patch
 
 import numpy as np
 import pytest
@@ -109,6 +107,18 @@ def test_403():
             "pdf-page4-1024w.jpeg", None, [1024, 1450], 8, 200,
             marks=pytest.mark.poppler
         ),
+        pytest.param(
+            "pptx/in.pptx",
+            {"size": "w1024h768", "input": "pptx", "page": "1", "countPages": "true"},
+            "pptx/out-page1-1024w.jpeg", None, [1024, 1450], 2, 200,
+            marks=(pytest.mark.poppler, pytest.mark.loffice),
+        ),
+        pytest.param(
+            "pptx/in.pptx",
+            {"size": "w1024h768", "input": "pptx", "page": "2", "countPages": "true"},
+            "pptx/out-page2-1024w.jpeg", None, [1024, 1450], 2, 200,
+            marks=(pytest.mark.poppler, pytest.mark.loffice),
+        ),
     ]
 )
 def test_generate_thumbnail(
@@ -121,74 +131,40 @@ def test_generate_thumbnail(
         num_pages,
         status
 ):
-    # don't actually modify the environment in tests
-    with patch.object(index, 'set_pdf_env', return_value=None) as set_env:
-        # Resolve the input file path
-        input_file = data_dir / input_file
-        # Mock the request
-        url = f"https://example.com/{input_file}"
-        responses.add(
-            responses.GET,
-            url=url,
-            body=input_file.read_bytes(),
-            status=200
-        )
-        # Create the lambda request event
-        event = _make_event({"url": url, **params})
-        # Get the response
-        response = index.lambda_handler(event, None)
-        # Assert the request was handled with no errors
-        assert response["statusCode"] == 200, f"response: {response}"
-        # only check the body and expected image if it's a successful call
-        # Parse the body / the returned thumbnail
-        body = json.loads(read_body(response))
-        # Assert basic metadata was filled properly
-        assert body["info"]["thumbnail_size"] == expected_thumb_size
-        if expected_original_size:  # PDFs don't have an expected size
-            assert body["info"]["original_size"] == expected_original_size
-        if "countPages" in params:
-            assert body["info"]["page_count"] == num_pages
-        # Assert the produced image is the same as the expected
-        if params.get('input') == 'pdf':
-            actual = Image.open(BytesIO(base64.b64decode(body['thumbnail'])))
-            expected = Image.open(data_dir / expected_thumb)
-            actual_array = np.array(actual)
-            expected_array = np.array(expected)
-            assert set_env.call_count == 1
-            assert actual_array.shape == expected_array.shape
-            assert np.allclose(expected_array, actual_array, atol=15, rtol=0.1)
-        else:
-            actual = AICSImage(base64.b64decode(body['thumbnail'])).reader.data
-            expected = AICSImage(data_dir / expected_thumb).reader.data
-            assert np.array_equal(actual, expected)
-
-
-@patch.dict(os.environ, {
-    'LAMBDA_TASK_ROOT': str(Path('/var/task')),
-    'PATH': str(Path('/one/two')) + os.pathsep + str(Path('/three')),
-    'LD_LIBRARY_PATH': str(Path('/lib64')) + os.pathsep + str(Path('/usr/lib64')),
-    # set_pdf_env() will blow this away
-    # it's only here to prevent side-effects on the test host
-    'FONTCONFIG_FILE': ''
-})
-def test_pdf_env():
-    """test that env vars are set so that poppler, pdf2image work properly"""
-    index.set_pdf_env()
-    assert os.environ.get('FONTCONFIG_FILE') == os.path.join(
-        os.environ.get('LAMBDA_TASK_ROOT'),
-        'quilt_binaries',
-        'fonts',
-        'fonts.conf',
+    # Resolve the input file path
+    input_file = data_dir / input_file
+    # Mock the request
+    url = f"https://example.com/{input_file}"
+    responses.add(
+        responses.GET,
+        url=url,
+        body=input_file.read_bytes(),
+        status=200
     )
-    assert os.environ.get('PATH') == os.pathsep.join([
-        str(Path('/one/two')),
-        str(Path('/three')),
-        str(Path('/var/task/quilt_binaries/usr/bin'))
-    ])
-    assert os.environ.get('LD_LIBRARY_PATH') == os.pathsep.join([
-        str(Path('/lib64')),
-        str(Path('/usr/lib64')),
-        str(Path('/var/task/quilt_binaries/usr/lib64'))
-    ])
-    # we should never mod this:
-    assert os.environ.get('LAMBDA_TASK_ROOT') == str(Path('/var/task'))
+    # Create the lambda request event
+    event = _make_event({"url": url, **params})
+    # Get the response
+    response = index.lambda_handler(event, None)
+    # Assert the request was handled with no errors
+    assert response["statusCode"] == 200, f"response: {response}"
+    # only check the body and expected image if it's a successful call
+    # Parse the body / the returned thumbnail
+    body = json.loads(read_body(response))
+    # Assert basic metadata was filled properly
+    assert body["info"]["thumbnail_size"] == expected_thumb_size
+    if expected_original_size:  # PDFs don't have an expected size
+        assert body["info"]["original_size"] == expected_original_size
+    if "countPages" in params:
+        assert body["info"]["page_count"] == num_pages
+    # Assert the produced image is the same as the expected
+    if params.get('input') in ('pdf', "pptx"):
+        actual = Image.open(BytesIO(base64.b64decode(body['thumbnail'])))
+        expected = Image.open(data_dir / expected_thumb)
+        actual_array = np.array(actual)
+        expected_array = np.array(expected)
+        assert actual_array.shape == expected_array.shape
+        assert np.allclose(expected_array, actual_array, atol=15, rtol=0.1)
+    else:
+        actual = AICSImage(base64.b64decode(body['thumbnail'])).reader.data
+        expected = AICSImage(data_dir / expected_thumb).reader.data
+        assert np.array_equal(actual, expected)