weasyprint/__init__.py

"""The Awesome Document Factory.

The public API is what is accessible from this "root" package without
importing sub-modules.

"""

import contextlib
from datetime import datetime
from os.path import getctime, getmtime
from pathlib import Path
from urllib.parse import urljoin

import cssselect2
import html5lib
import tinycss2

# The version is exposed under both names; both are listed in ``__all__``.
VERSION = __version__ = '62.0'

#: Default values for command-line and Python API options. See
#: :func:`__main__.main` to learn more about specific options for
#: command-line.
#:
#: :param list stylesheets:
#:     An optional list of user stylesheets. The list can include
#:     :class:`CSS` objects, filenames, URLs, or file-like
#:     objects. (See :ref:`Stylesheet Origins`.)
#: :param str media_type:
#:     Media type to use for @media.
#: :param list attachments:
#:     A list of additional file attachments for the generated PDF
#:     document or :obj:`None`. The list's elements are
#:     :class:`Attachment` objects, filenames, URLs or file-like objects.
#: :param bytes pdf_identifier:
#:     A bytestring used as PDF file identifier.
#: :param str pdf_variant:
#:     A PDF variant name.
#: :param str pdf_version:
#:     A PDF version number.
#: :param bool pdf_forms:
#:     Whether PDF forms have to be included.
#: :param bool uncompressed_pdf:
#:     Whether PDF content should be left uncompressed.
#: :param bool custom_metadata:
#:     Whether custom HTML metadata should be stored in the generated PDF.
#: :param bool presentational_hints:
#:     Whether HTML presentational hints are followed.
#: :param bool optimize_images:
#:     Whether size of embedded images should be optimized, with no quality
#:     loss.
#: :param int jpeg_quality:
#:     JPEG quality, from 0 (worst) to 95 (best).
#: :param int dpi:
#:     Maximum resolution of images embedded in the PDF.
#: :param bool full_fonts:
#:     Whether unmodified font files should be embedded when possible.
#: :param bool hinting:
#:     Whether hinting information should be kept in embedded fonts.
#: :type cache: :obj:`dict`, :class:`pathlib.Path` or :obj:`str`
#: :param cache:
#:     A dictionary used to cache images in memory, or a folder path where
#:     images are temporarily stored.
DEFAULT_OPTIONS = {
    'stylesheets': None,
    'media_type': 'print',
    'attachments': None,
    'pdf_identifier': None,
    'pdf_variant': None,
    'pdf_version': None,
    'pdf_forms': None,
    'uncompressed_pdf': False,
    'custom_metadata': False,
    'presentational_hints': False,
    'optimize_images': False,
    'jpeg_quality': None,
    'dpi': None,
    'full_fonts': False,
    'hinting': False,
    'cache': None,
}

# Names exported by ``from weasyprint import *``. ``Document`` and ``Page``
# are only imported at the very end of this module (circular imports).
__all__ = [
    'HTML', 'CSS', 'DEFAULT_OPTIONS', 'Attachment', 'Document', 'Page',
    'default_url_fetcher', 'VERSION', '__version__']


# Import after setting the version, as the version is used in other modules
from .urls import (  # noqa: I001, E402
    fetch, default_url_fetcher, path2url, ensure_url, url_is_absolute)
from .logger import LOGGER, PROGRESS_LOGGER  # noqa: E402
# Some imports are at the end of the file (after the CSS class)
# to work around circular imports.


def _find_base_url(html_document, fallback_base_url):
    """Return the base URL for the document.

    The base URL comes from the first ``<base>`` element, in tree order,
    that has an ``href`` attribute. ``<base>`` elements without ``href``
    (for example ``<base target="_blank">``) are skipped instead of hiding a
    later ``<base href="...">``, as the HTML specification requires.

    See https://www.w3.org/TR/html5/urls.html#document-base-url

    :param html_document:
        The parsed document root, an object providing an ElementTree-like
        ``iter(tag)`` method.
    :param fallback_base_url:
        The URL returned when the document gives no usable base URL, and
        against which a relative ``href`` is resolved. May be :obj:`None`.
    :returns:
        The ``href`` of the selected ``<base>`` element joined with
        ``fallback_base_url``, or ``fallback_base_url`` itself when there
        is no ``<base>`` element with an ``href``, or when that ``href`` is
        empty or only made of whitespace.

    """
    for base_element in html_document.iter('base'):
        href = base_element.get('href')
        if href is None:
            # No href attribute: this element doesn't define a base URL.
            continue
        href = href.strip()
        if href:
            return urljoin(fallback_base_url, href)
        # Only the first element with an href counts, even if it is empty:
        # an empty href means the fallback URL is used as is.
        break
    return fallback_base_url


class HTML:
    """An HTML document, parsed with html5lib.

    The simplest way to create an instance is to give one positional
    argument: ``doc = HTML(something)``
    The class then guesses whether the input is a filename, an absolute
    URL, or a :term:`file object`.

    To avoid any guessing, give exactly **one** of these named arguments
    instead:

    :type filename: str or pathlib.Path
    :param filename:
        A filename, absolute or relative to the current directory.
    :param str url:
        An absolute, fully qualified URL.
    :type file_obj: :term:`file object`
    :param file_obj:
        Any object with a ``read`` method.
    :param str string:
        A string of HTML source.

    Giving more than one input, as in
    ``HTML(filename="foo.html", url="localhost://bar.html")``,
    raises a :obj:`TypeError`.

    These optional named arguments are also accepted:

    :param str encoding:
        Force the source character encoding.
    :type base_url: str or pathlib.Path
    :param base_url:
        The base used to resolve relative URLs (e.g. in
        ``<img src="../foo.png">``). If not provided, try to use the input
        filename, URL, or ``name`` attribute of
        :term:`file objects <file object>`.
    :type url_fetcher: :term:`callable`
    :param url_fetcher:
        A function or other callable with the same signature as
        :func:`default_url_fetcher` called to fetch external resources such as
        stylesheets and images. (See :ref:`URL Fetchers`.)
    :param str media_type:
        The media type to use for ``@media``. Defaults to ``'print'``.

    **Note:** In some cases like ``HTML(string=foo)`` relative URLs will be
    invalid if ``base_url`` is not provided.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, media_type='print'):
        # Human-readable description of the input, only used for logging.
        label = (
            guess or filename or url or
            getattr(file_obj, 'name', 'HTML string'))
        PROGRESS_LOGGER.info('Step 1 - Fetching and parsing HTML - %s', label)

        if isinstance(base_url, Path):
            base_url = str(base_url)

        selected = _select_source(
            guess, filename, url, file_obj, string, base_url, url_fetcher)
        with selected as (source_type, source, base_url, protocol_encoding):
            parser_options = {'namespaceHTMLElements': False}
            if not isinstance(source, str):
                # Encoding hints are only given to the parser when the
                # source is not already a decoded string.
                if protocol_encoding is not None:
                    parser_options['transport_encoding'] = protocol_encoding
                if encoding is not None:
                    parser_options['override_encoding'] = encoding
            root = html5lib.parse(source, **parser_options)

        # A <base> element in the document may override the base URL.
        self.base_url = _find_base_url(root, base_url)
        self.url_fetcher = url_fetcher
        self.media_type = media_type
        self.wrapper_element = cssselect2.ElementWrapper.from_html_root(
            root, content_language=None)
        self.etree_element = self.wrapper_element.etree_element

    def _ua_stylesheets(self, forms=False):
        """Return the user-agent stylesheets, with form styles if asked."""
        stylesheets = [HTML5_UA_STYLESHEET]
        if forms:
            stylesheets.append(HTML5_UA_FORM_STYLESHEET)
        return stylesheets

    def _ua_counter_style(self):
        """Return a list holding a copy of the user-agent counter styles."""
        counter_style = HTML5_UA_COUNTER_STYLE.copy()
        return [counter_style]

    def _ph_stylesheets(self):
        """Return the list of presentational hints stylesheets."""
        return [HTML5_PH_STYLESHEET]

    def render(self, font_config=None, counter_style=None, **options):
        """Lay out and paginate the document, without exporting it.

        The returned :class:`document.Document` object gives access to
        individual pages and various meta-data.
        See :meth:`write_pdf` to get a PDF directly.

        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config:
            A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
        :param counter_style:
            A dictionary storing ``@counter-style`` rules.
        :param options:
            The ``options`` parameter includes by default the
            :data:`DEFAULT_OPTIONS` values.
        :returns: A :class:`document.Document` object.

        """
        # Given options take precedence over the default ones.
        merged_options = {**DEFAULT_OPTIONS, **options}
        return Document._render(
            self, font_config, counter_style, merged_options)

    def write_pdf(self, target=None, zoom=1, finisher=None,
                  font_config=None, counter_style=None, **options):
        """Render the document to a PDF file.

        Shortcut for a call to :meth:`render` followed by a call to
        :meth:`Document.write_pdf() <document.Document.write_pdf>`.

        :type target:
            :class:`str`, :class:`pathlib.Path` or :term:`file object`
        :param target:
            A filename where the PDF file is generated, a file object, or
            :obj:`None`.
        :param float zoom:
            The zoom factor in PDF units per CSS units.  **Warning**:
            All CSS units are affected, including physical units like
            ``cm`` and named sizes like ``A4``.  For values other than
            1, the physical CSS units will thus be "wrong".
        :type finisher: :term:`callable`
        :param finisher:
            A finisher function or callable that accepts the document and a
            :class:`pydyf.PDF` object as parameters. Can be passed to perform
            post-processing on the PDF right before the trailer is written.
        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config:
            A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
        :param counter_style:
            A dictionary storing ``@counter-style`` rules.
        :param options:
            The ``options`` parameter includes by default the
            :data:`DEFAULT_OPTIONS` values.
        :returns:
            The PDF as :obj:`bytes` if ``target`` is not provided or
            :obj:`None`, otherwise :obj:`None` (the PDF is written to
            ``target``).

        """
        # Given options take precedence over the default ones; the same
        # merged options are used for both rendering and PDF generation.
        merged_options = {**DEFAULT_OPTIONS, **options}
        document = self.render(font_config, counter_style, **merged_options)
        return document.write_pdf(target, zoom, finisher, **merged_options)


class CSS:
    """CSS stylesheet parsed by tinycss2.

    An instance is created in the same way as :class:`HTML`, with the same
    arguments.

    An additional argument called ``font_config`` must be provided to handle
    ``@font-face`` rules. The same ``text.fonts.FontConfiguration`` object
    must be used for different ``CSS`` objects applied to the same document.

    ``CSS`` objects have no public attributes or methods. They are only meant
    to be used in the :meth:`HTML.write_pdf` and :meth:`HTML.render` methods
    of :class:`HTML` objects.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, _check_mime_type=False,
                 media_type='print', font_config=None, counter_style=None,
                 matcher=None, page_rules=None):
        # Take ``guess`` into account for the logged label, as HTML does, so
        # that a filename or URL given as positional argument is not
        # reported as "CSS string".
        PROGRESS_LOGGER.info(
            'Step 2 - Fetching and parsing CSS - %s',
            guess or filename or url or
            getattr(file_obj, 'name', 'CSS string'))
        result = _select_source(
            guess, filename, url, file_obj, string,
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=_check_mime_type)
        with result as (source_type, source, base_url, protocol_encoding):
            if source_type == 'file_obj':
                # Read the whole content while the source is still open.
                source = source.read()
            if isinstance(source, str):
                # unicode, no encoding
                stylesheet = tinycss2.parse_stylesheet(source)
            else:
                stylesheet, encoding = tinycss2.parse_stylesheet_bytes(
                    source, environment_encoding=encoding,
                    protocol_encoding=protocol_encoding)
        self.base_url = base_url
        # A matcher and a list of page rules can be given by the caller; the
        # given ``page_rules`` list is passed as is to the preprocessing.
        self.matcher = matcher or cssselect2.Matcher()
        self.page_rules = [] if page_rules is None else page_rules
        preprocess_stylesheet(
            media_type, base_url, stylesheet, url_fetcher, self.matcher,
            self.page_rules, font_config, counter_style)


class Attachment:
    """A file attached to a PDF document.

    Instances are created in the same way as :class:`HTML` ones, except
    that the HTML specific arguments (``encoding`` and ``media_type``) are
    not supported.

    :param str description:
        A description of the attachment to be included in the PDF document.
        May be :obj:`None`.
    :type created: :obj:`datetime.datetime`
    :param created:
        Creation date and time. When omitted, the value comes from the
        file's ``getctime`` if ``filename`` is given, and is the current
        date and time otherwise.
    :type modified: :obj:`datetime.datetime`
    :param modified:
        Modification date and time. When omitted, the value comes from the
        file's ``getmtime`` if ``filename`` is given, and is the current
        date and time otherwise.
    :param str relationship:
        A string that represents the relationship between the attachment and
        the PDF it is embedded in. Default is 'Unspecified', other common
        values are defined in ISO-32000-2:2020, 7.11.3.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, base_url=None, url_fetcher=default_url_fetcher,
                 description=None, created=None, modified=None,
                 relationship='Unspecified'):
        # The source is kept as a context manager, nothing is entered here.
        self.source = _select_source(
            guess=guess, filename=filename, url=url, file_obj=file_obj,
            string=string, base_url=base_url, url_fetcher=url_fetcher)
        self.description = description
        self.relationship = relationship
        self.md5 = None

        # File timestamps are only used with an explicit ``filename``
        # argument, not with a filename given through ``guess``.
        if created is None:
            created = (
                datetime.fromtimestamp(getctime(filename)) if filename
                else datetime.now())
        if modified is None:
            modified = (
                datetime.fromtimestamp(getmtime(filename)) if filename
                else datetime.now())
        self.created, self.modified = created, modified


@contextlib.contextmanager
def _select_source(guess=None, filename=None, url=None, file_obj=None,
                   string=None, base_url=None, url_fetcher=default_url_fetcher,
                   check_css_mime_type=False):
    """If only one input is given, yield it with normalized ``base_url``.

    This is a context manager: nothing is checked, opened or fetched before
    it is entered, and files or fetched resources opened here are closed
    when it is exited.

    :param guess:
        An input of unknown kind: objects with a ``read`` attribute are
        handled as ``file_obj``, :class:`pathlib.Path` objects as
        ``filename``, absolute URLs as ``url`` and anything else as
        ``filename``.
    :param filename: A filename, opened in binary mode.
    :param url: A URL, fetched with ``url_fetcher``.
    :param file_obj: A file object, yielded as is.
    :param string: A string, yielded as is.
    :type base_url: str or pathlib.Path
    :param base_url:
        The base URL. When :obj:`None`, it is deduced from the filename, the
        fetched URL (after redirection) or the file object's ``name``.
    :param url_fetcher: The callable used to fetch ``url``.
    :param bool check_css_mime_type:
        Whether a fetched resource whose MIME type is not ``text/css`` is
        logged as an error and replaced by an empty string.
    :returns:
        A context manager yielding a
        ``(source_type, source, base_url, protocol_encoding)`` tuple, where
        ``source_type`` is ``'file_obj'`` or ``'string'``.
    :raises TypeError:
        When entered, if zero or more than one input is given.

    """
    if isinstance(base_url, Path):
        # Same conversion as in HTML.__init__, done here too so that other
        # callers can also give a Path before the URL helpers are called.
        base_url = str(base_url)
    if base_url is not None:
        base_url = ensure_url(base_url)

    candidates = {
        'guess': guess, 'filename': filename, 'url': url,
        'file_obj': file_obj, 'string': string}
    selected_names = [
        name for name, value in candidates.items() if value is not None]
    if len(selected_names) != 1:
        # Report parameter names, not values: values may be objects that
        # ``str.join`` rejects (file objects, paths) or whole documents.
        source = ', '.join(selected_names) or 'nothing'
        raise TypeError(f'Expected exactly one source, got {source}')
    elif guess is not None:
        if hasattr(guess, 'read'):
            type_ = 'file_obj'
        elif isinstance(guess, Path):
            type_ = 'filename'
        elif url_is_absolute(guess):
            type_ = 'url'
        else:
            type_ = 'filename'
        # Delegate to the branch handling the guessed kind of input.
        result = _select_source(
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=check_css_mime_type,
            **{type_: guess})
        with result as result:
            yield result
    elif filename is not None:
        if base_url is None:
            base_url = path2url(filename)
        with open(filename, 'rb') as file_obj:
            yield 'file_obj', file_obj, base_url, None
    elif url is not None:
        with fetch(url_fetcher, url) as result:
            if check_css_mime_type and result['mime_type'] != 'text/css':
                LOGGER.error(
                    'Unsupported stylesheet type %s for %s',
                    result['mime_type'], result['redirected_url'])
                yield 'string', '', base_url, None
            else:
                proto_encoding = result.get('encoding')
                if base_url is None:
                    base_url = result.get('redirected_url', url)
                if 'string' in result:
                    yield 'string', result['string'], base_url, proto_encoding
                else:
                    yield (
                        'file_obj', result['file_obj'], base_url,
                        proto_encoding)
    elif file_obj is not None:
        if base_url is None:
            # filesystem file-like objects have a 'name' attribute.
            name = getattr(file_obj, 'name', None)
            if isinstance(name, Path):
                name = str(name)
            # Some streams have a .name like '<stdin>', not a filename, and
            # a .name is not always a string (it can be a file descriptor
            # number): only use it when it looks like a real filename.
            if isinstance(name, str) and name and not name.startswith('<'):
                base_url = ensure_url(name)
        yield 'file_obj', file_obj, base_url, None
    else:
        assert string is not None
        yield 'string', string, base_url, None

# Work around circular imports.
from .css import preprocess_stylesheet  # noqa: I001, E402
from .html import (  # noqa: E402
    HTML5_UA_COUNTER_STYLE, HTML5_UA_STYLESHEET, HTML5_UA_FORM_STYLESHEET,
    HTML5_PH_STYLESHEET)
from .document import Document, Page  # noqa: E402