Merge pull request #23 from kotify/py312

Cleanup weasyprint
kotify · Mar 7, 2024 · 2550590 · 2550590
2 parents 96c5ddb + 08303f8
commit 2550590
Show file tree

Hide file tree

Showing 5 changed files with 123 additions and 70 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 /build
 /cdk.out
+/weasyprint/aws-lambda-rie/
diff --git a/README.md b/README.md
@@ -3,27 +3,14 @@
 This is a collection of AWS Lambda layers and functions to render pdf documents
 and images from HTML.
 
-Currently solutions based on these tools available:
-
-- [WeasyPrint](https://weasyprint.org/)
-- [wkhtmltopdf](https://wkhtmltopdf.org/)
-
-To build a layer you need **make** and **docker** installed on your system.
-The layers support only amazon linux 2 runtimes, eg. python3.8, nodejs12.x.
+Download layers from the releases section or build them yourself (requires **make** and **docker**).
+The layers support only Amazon Linux 2023 runtimes, e.g. python3.12.
 
 By default only dejavu fonts are installed, edit [build script](fonts/layer_builder.sh) to install others.
 
 ## WeasyPrint
 
 [WeasyPrint](https://weasyprint.org/) is python based pdf/png print service.
 
-Run `make build/weasyprint-layer-python3.8.zip` to build a layer, for details
-see related [readme](weasyprint/README.md).
-
-## wkhtmltopdf
-
-[wkhtmltopdf](https://wkhtmltopdf.org/) is a comand line tool that renders HTML
-into PDF and various image formats using the Qt WebKit rendering engine.
-
-Run `make build/wkhtmltox-layer.zip` to build a layer, for details
-see related [readme](wkhtmltox/README.md).
+Run `make build/weasyprint-layer-python3.12.zip` to build a layer, for details
+and docker lambda example see related [readme](weasyprint/README.md).
diff --git a/weasyprint/Dockerfile b/weasyprint/Dockerfile
@@ -1,7 +1,7 @@
 # Define global args
 ARG FUNCTION_DIR="/home/app/"
-ARG RUNTIME_VERSION="3.11"
-ARG DISTRO_VERSION="3.18"
+ARG RUNTIME_VERSION="3.12"
+ARG DISTRO_VERSION="3.19"
 
 # Stage 1 - bundle base image + runtime
 # Grab a fresh copy of the image and install GCC
@@ -20,21 +20,22 @@ RUN apk add --no-cache \
     automake \
     elfutils-dev \
     make \
-    cmake 
+    cmake \
+    libffi-dev
 # Include global args in this stage of the build
 ARG FUNCTION_DIR
 ARG RUNTIME_VERSION
 # Create function directory
 RUN mkdir -p ${FUNCTION_DIR}
-# Copy handler function
-COPY lambda_function.py ${FUNCTION_DIR}
 
 # Install Lambda Runtime Interface Client for Python
 RUN python${RUNTIME_VERSION} -m pip install awslambdaric --target ${FUNCTION_DIR}
 
-RUN apk add --no-cache libffi-dev
 RUN python${RUNTIME_VERSION} -m pip install weasyprint boto3 --target ${FUNCTION_DIR}
 
+# Copy handler function
+COPY lambda_function.py ${FUNCTION_DIR}
+
 # Stage 3 - final runtime image
 # Grab a fresh copy of the Python image
 FROM python-alpine
@@ -48,6 +49,9 @@ COPY --from=build-image ${FUNCTION_DIR} ${FUNCTION_DIR}
 RUN apk add --no-cache pango zlib-dev jpeg-dev openjpeg-dev libffi-dev
 RUN apk add --no-cache msttcorefonts-installer && update-ms-fonts
 RUN fc-cache --really-force --verbose
+## uncomment for post processing support
+# RUN apk add --no-cache ghostscript
+
 # (Optional) Add Lambda Runtime Interface Emulator and use a script in the ENTRYPOINT for simpler local runs
 # ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/bin/aws-lambda-rie
 # COPY entry.sh /

diff --git a/weasyprint/README.md b/weasyprint/README.md
@@ -1,12 +1,10 @@
 # WeasyPrint AWS Lambda
 
-**WARNING** Native lambda layer can run only legacy WeasyPrint v52, as a workaround you can run your lambda function as a [docker container](https://docs.aws.amazon.com/lambda/latest/dg/images-create.html).
-
-## Native Lambda Layer
+## Native layer
 
 Build layer:
 
-    $ make build/weasyprint-layer-python3.8.zip
+    $ make build/weasyprint-layer-python3.12.zip
 
     # to test your build run
     $ make test.weasyprint
@@ -16,16 +14,14 @@ Deploy layer:
     $ aws lambda publish-layer-version \
         --region <region> \
         --layer-name <name> \
-        --zip-file fileb://build/weasyprint-layer-python3.8.zip
+        --zip-file fileb://build/weasyprint-layer-python3.12.zip
 
-Environment variables expected by layer:
+Lambda must be configured with these env vars:
 
     GDK_PIXBUF_MODULE_FILE="/opt/lib/loaders.cache"
     FONTCONFIG_PATH="/opt/fonts"
     XDG_DATA_DIRS="/opt/lib"
 
-For python3.9 use instructions: https://github.com/kotify/cloud-print-utils/issues/10#issuecomment-1367774956
-
 ## Docker Lambda
 
 Build layer:
@@ -48,8 +44,8 @@ it requires `BUCKET=<bucket name>` env variable if files stored on s3.
 
 Example payload to print pdf from url and return link to s3:
 
-    {"url": "https://kotify.github.io/cloud-print-utils/samples/report/", "filename": "report.pdf"}
+    {"url": "https://kotify.github.io/cloud-print-utils/samples/report/", "filename": "/path/on/s3/report.pdf"}
 
-Example paylod to print pdf from html and css data and return pdf content encoded in base64:
+Example payload to print pdf from html and css data and return pdf content encoded as base64:
 
     {"html": "<html><h1>Header</h1></html>", "css": "h1 { color: red }", "filename": "report.pdf", "return": "base64"}
diff --git a/weasyprint/lambda_function.py b/weasyprint/lambda_function.py
@@ -1,48 +1,113 @@
 #!/usr/bin/env python
+import subprocess
+import logging
 import base64
 import os
 
+import urllib.request
+from functools import partial
+import tempfile
 from weasyprint import CSS, HTML
+import pathlib
+from urllib.parse import urlparse
+import concurrent.futures
+import boto3
+import uuid
+
+logger = logging.getLogger(__name__)
+s3 = boto3.client("s3")
+
+
+def gen_pdf_name(tmpdir):
+    return tmpdir / f"{uuid.uuid4()}.pdf"
+
+
+def download(tmpdir, url):
+    name = gen_pdf_name(tmpdir)
+    urllib.request.urlretrieve(url, name)
+    return name
+
+
+def fetch_attachments(downloader, pdfs):
+    if not pdfs:
+        return []
+
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=min(len(pdfs), 6)
+    ) as executor:
+        return list(executor.map(downloader, pdfs))
+
+
+def postprocess(tmpdir, document, attachments):
+    pdfs = fetch_attachments(partial(download, tmpdir), attachments)
+    tmpfile = gen_pdf_name(tmpdir)
+    subprocess.check_call(
+        [
+            "gs",
+            "-q",
+            "-sDEVICE=pdfwrite",
+            "-dPDFSETTINGS=/prepress",
+            "-dFIXEDMEDIA",
+            "-sPAPERSIZE=a4",
+            "-dPDFFitPage",
+            "-dAutoRotatePages=/PageByPage",
+            "-o",
+            f"{tmpfile}",
+            document,
+            *pdfs,
+        ]
+    )
+    return tmpfile
 
 
 def lambda_handler(event, context):
     filename = event["filename"]
+    attachments = [
+        a
+        for a in event.get("attachments", [])
+        if urlparse(a).path.lower().endswith(".pdf")
+    ]
+    always_postprocess = event.get("always_postprocess", False)
+    return_base64 = event.get("return") == "base64"
     basename = os.path.basename(filename)
-    tmpfile = f"/tmp/{basename}"
-    if "url" in event:
-        HTML(url=event["url"]).write_pdf(target=tmpfile)
-    else:
-        HTML(string=event["html"]).write_pdf(
-            target=tmpfile,
-            stylesheets=[CSS(string=event["css"])] if "css" in event else None,
-        )
-    if event.get("return") == "base64":
-        with open(tmpfile, "rb") as f:
-            data = f.read()
-        return {
-            "statusCode": 200,
-            "headers": {
-                "Content-type": "application/pdf",
-                "Content-Disposition": f"attachment;filename={basename}",
-            },
-            "isBase64Encoded": True,
-            "body": base64.b64encode(data).decode("utf-8"),
-        }
-    else:
-        import boto3
-
-        s3 = boto3.client("s3")
-        bucket = os.environ["BUCKET"]
-        with open(tmpfile, "rb") as f:
-            s3.upload_fileobj(
-                open(tmpfile, "rb"),
-                bucket,
-                filename,
-                ExtraArgs={"ContentType": "application/pdf"},
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmpdir = pathlib.Path(tmpdir)
+        document = gen_pdf_name(tmpdir)
+        if "url" in event:
+            HTML(url=event["url"]).write_pdf(target=document)
+        else:
+            HTML(string=event["html"]).write_pdf(
+                target=document,
+                stylesheets=[CSS(string=event["css"])] if "css" in event else None,
+            )
+
+        if attachments or always_postprocess:
+            document = postprocess(tmpdir, document, attachments)
+        if return_base64:
+            with open(document, "rb") as f:
+                data = f.read()
+            return {
+                "statusCode": 200,
+                "headers": {
+                    "Content-type": "application/pdf",
+                    "Content-Disposition": f"attachment;filename={basename}",
+                },
+                "isBase64Encoded": True,
+                "body": base64.b64encode(data).decode("utf-8"),
+            }
+        else:
+            bucket = os.environ["BUCKET"]
+            with open(document, "rb") as f:
+                s3.upload_fileobj(
+                    f,
+                    bucket,
+                    filename,
+                    ExtraArgs={"ContentType": "application/pdf"},
+                )
+            url = s3.generate_presigned_url(
+                ClientMethod="get_object",
+                Params={"Bucket": bucket, "Key": filename},
+                ExpiresIn=3600,
             )
-        url = s3.generate_presigned_url(
-            ClientMethod="get_object",
-            Params={"Bucket": bucket, "Key": filename},
-            ExpiresIn=3600,
-        )
-        return {"statusCode": 200, "body": url}
+            return {"statusCode": 200, "body": url}