Source code for gummy.utils.outfmt_utils

# coding: utf-8
""" Utility programs for creating HTML or PDF."""
import os
import re
import warnings
import unicodedata
import pdfkit
from jinja2 import Environment, FileSystemLoader

from . import TEMPLATES_DIR
from .generic_utils import str_strip
from .coloring_utils import toRED, toBLUE, toGREEN

def sanitize_filename(fp, dirname=None, ext=None, allow_unicode=False):
    """Return ``fp`` with its file-name part cleaned up for use on disk.

    The directory part is left untouched; only the file-name part is
    normalized, passed through ``str_strip`` and stripped of the characters
    backslash, slash, ``?``, ``*``, ``|``, ``<``, ``>``, ``"``, ``:`` and
    ``;``.

    Args:
        fp (str)             : File path.
        dirname (str)        : Directory part of ``fp``. When omitted, ``fp``
                               is divided with ``os.path.split``. When given,
                               everything in ``fp`` relative to ``dirname`` is
                               treated as the file name, so path separators
                               inside it are removed as well.
        ext (str)            : Required file extension (a leading ``"."`` is
                               added if missing). It is appended unless the
                               file name already ends with it.
        allow_unicode (bool) : If ``True`` the name is NFKC-normalized and
                               non-ASCII characters are kept; otherwise the
                               name is NFKD-normalized and every non-ASCII
                               character is dropped.

    Returns:
        str : Sanitized file path (``os.path.normpath`` of directory + name).

    Examples:
        The results below assume ``str_strip`` only tidies whitespace.

        >>> from gummy.utils import sanitize_filename
        >>> sanitize_filename("path/to/image\u2013.png")
        'path/to/image.png'
        >>> sanitize_filename("path/to/image\u2013.jpg", ext=".png")
        'path/to/image.jpg.png'
    """
    # Decide which part of ``fp`` is the directory and which is the name.
    if dirname is not None:
        basename = os.path.relpath(fp, start=dirname)
    else:
        dirname, basename = os.path.split(fp)

    # Unicode handling: keep (NFKC) or drop (NFKD + ASCII-only) non-ASCII.
    if allow_unicode:
        basename = unicodedata.normalize("NFKC", basename)
    else:
        ascii_bytes = unicodedata.normalize("NFKD", basename).encode("ascii", "ignore")
        basename = ascii_bytes.decode("ascii")

    basename = str_strip(basename)
    # Drop characters that are reserved or awkward in file names.
    basename = re.sub(r'[\\\/\?\*\|<>":;]+', '', basename)

    if ext is not None:
        suffix = ext if ext.startswith(".") else "." + ext
        if not basename.endswith(suffix):
            basename = basename + suffix

    return os.path.normpath(os.path.join(dirname, basename))

def get_jinja_all_attrs(string, keyname):
    """Collect the attribute names a template reads from each item of ``keyname``.

    The template text is scanned for ``{% for <var> in <keyname> %}`` loops.
    For every loop variable found, each ``{{ <var>.attr }}`` or
    ``{{ <var>['attr'] }}`` expression anywhere in ``string`` contributes
    ``attr`` to the result. Whitespace inside the delimiters is optional and
    the Jinja whitespace-control marker ``-`` is accepted.

    Only plain ``{{ ... }}`` expressions are inspected; attribute accesses
    inside statements such as ``{% if var.attr %}`` are not detected.

    Args:
        string (str)  : Content strings in template file.
        keyname (str) : keyname which is used in ``template.render(keyname=...)``

    Returns:
        set : Attribute names (``str``) looked up on the loop variable(s).

    Examples:
        >>> from gummy.utils import get_jinja_all_attrs
        >>> html = "{% for c in contents %}{{ c.head }}{{ c['en'] }}{% endfor %}"
        >>> sorted(get_jinja_all_attrs(string=html, keyname="contents"))
        ['en', 'head']
    """
    # The loop variable is captured as an identifier (``\w+``) rather than a
    # greedy ``.+`` so that several tags on one line cannot be swallowed into
    # a single bogus "variable". ``keyname`` is escaped because it is data,
    # not a regular expression.
    for_pattern = re.compile(
        r"\{%-?\s*for\s+(\w+)\s+in\s+" + re.escape(keyname) + r"\s*-?%\}"
    )

    attributes = set()
    for loop_var in set(for_pattern.findall(string)):
        # get 'bar' from {{ arg.bar }} or {{ arg['bar'] }}
        attr_pattern = re.compile(
            r"\{\{-?\s*"
            + re.escape(loop_var)
            + r"(?:\.(.+?)|\[['\"](.+?)['\"]\])\s*-?\}\}"
        )
        # Exactly one of the two groups is non-empty for every match.
        attributes.update(
            dotted or bracketed
            for dotted, bracketed in attr_pattern.findall(string)
        )
    return attributes

def check_contents(path, contents):
    """Warn about mismatches between a template's attributes and ``contents``.

    The template at ``path`` is read and the attributes it uses on each item
    of ``contents`` are collected with ``get_jinja_all_attrs``. A warning is
    issued for every key that appears in ``contents`` but not in the template,
    and for every attribute the template uses that no element of ``contents``
    provides.

    Args:
        path (str)      : path/to/template.file
        contents (list) : Each element in ``contents`` should be dictionary.

    Raises:
        TypeError : If ``contents`` is not a list, or any element of it is
                    not a dictionary.
    """
    # Validate every element, not just the first one, so a stray non-dict
    # further down the list is reported with the intended error.
    if not isinstance(contents, list) or \
            not all(isinstance(content, dict) for content in contents):
        raise TypeError("`contents` should be list, and each element in `contents` should be dictionary.")

    with open(path, mode="r", encoding="utf-8") as f:
        html = f.read()
    # All attributes in template.
    attributes = get_jinja_all_attrs(string=html, keyname="contents")
    # All keys in contents list.
    content_keys = {key for content in contents for key in content}

    # Warn about keys which are in content_keys but not in attributes.
    for key in content_keys.difference(attributes):
        warnings.warn(f"An attribute {toGREEN(key)} is not used in {toBLUE(path)}.")
    # Warn about keys which are in attributes but not in content_keys.
    for key in attributes.difference(content_keys):
        warnings.warn(f"An attribute {toGREEN(key)} is not used in this contents, but used in {toBLUE(path)}.")

def tohtml(path, title="", contents=None, searchpath=TEMPLATES_DIR, template="paper.html", verbose=True):
    """ Arrange ``title`` and ``contents`` in html format.

    Args:
        path (str)       : path/to/output.html (a ``.pdf`` extension is swapped for ``.html``).
        title (str)      : title for html.
        contents (list)  : Contents which are passed to the render method of the ``jinja2.environment.Template`` instance. ``None`` (default) means an empty list.
        searchpath (str) : Loader will find templates from the file system, and this directory is a base.
        template (str)   : template filename. Loader will find ``f"{searchpath}/{template}"``
        verbose (bool)   : Whether to print message or not. (default= ``True``)

    Returns:
        str : path/to/output.html (after ``sanitize_filename`` has been applied)
    """
    if contents is None:
        contents = []

    env = Environment(loader=FileSystemLoader(searchpath=searchpath))
    template = env.get_template(template)

    # TODO: Check nested all variables.
    # check_contents(path=template.filename, contents=contents)

    root, ext = os.path.splitext(path)
    if ext == ".pdf":
        path = root + ".html"
    path = sanitize_filename(fp=path, ext=".html")

    # Render before opening the file so that a template error does not leave
    # an empty, truncated output file behind.
    output = template.render(title=title, contents=contents)

    # ``errors="replace"`` is the best-effort fallback for text that cannot be
    # encoded as UTF-8 (e.g. lone surrogates). Writing ``bytes`` to a
    # text-mode handle, as was attempted before, raises ``TypeError``.
    with open(path, mode="w", encoding="utf-8", errors="replace") as f:
        f.write(output)

    if verbose: print(f"Save HTML file at {toBLUE(path)}")
    return path

def html2pdf(path, delete_html=True, verbose=True, options=None):
    """Convert from HTML to PDF.

    Args:
        path (str)         : path/to/input.html
        delete_html (bool) : Whether you want to delete html file. (default= ``True``)
        verbose (bool)     : Whether to print message or not. (default= ``True``)
        options (dict)     : options for wkhtmltopdf. See https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
                             The dictionary is copied, never modified. ``None`` (default) means no extra options.

    Returns:
        str : path/to/output.pdf
    """
    # Work on a copy: updating the argument in place would modify the caller's
    # dictionary and, with a ``{}`` default, leak state between calls.
    pdf_options = dict(options) if options else {}
    # NOTE(review): these values take precedence over the same keys passed in
    # ``options`` (existing behaviour, preserved) -- confirm this is intended.
    pdf_options.update({
        "page-size"           : "A4",
        "encoding"            : "UTF-8",
        # "quiet"               : not verbose,
        "header-html"         : os.path.join(TEMPLATES_DIR, "header.html"),
        # "include-in-outline"  : True,
        # "load-error-handling" : "ignore",
        # "footer-center"       : "Page  [page]  of  [toPage]",
        "--print-media-type" : None,
    })

    # Swap only a trailing ".html" extension; ``str.replace`` would also strip
    # ".html" from directory names or the middle of the file name.
    root, ext = os.path.splitext(path)
    html_removed_path = root if ext == ".html" else path
    pdf_path = html_removed_path + ".pdf"

    pdfkit.from_file(input=path, output_path=pdf_path, options=pdf_options)
    if verbose: print(f"Save PDF file at {toBLUE(pdf_path)}")
    if delete_html:
        os.remove(path)
        if verbose: print(f"Delete original HTML file at {toRED(path)}")
    return pdf_path

def toPDF(path, title="", contents=None, searchpath=TEMPLATES_DIR, template="paper.html", verbose=True, options=None):
    """ Arrange ``title`` and ``contents`` in html format, then convert it to PDF.

    An intermediate HTML file is written with ``tohtml`` and converted with
    ``html2pdf``, which is asked to delete it afterwards.

    Args:
        path (str)       : path/to/output.pdf
        title (str)      : title for html.
        contents (list)  : Contents which are passed to the render method of the ``jinja2.environment.Template`` instance. ``None`` (default) means an empty list.
        searchpath (str) : Loader will find templates from the file system, and this directory is a base.
        template (str)   : template filename. Loader will find ``f"{searchpath}/{template}"``
        verbose (bool)   : Whether to print message or not. (default= ``True``)
        options (dict)   : options for wkhtmltopdf. See https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
                           ``None`` (default) means no extra options.

    Returns:
        str : path/to/output.pdf
    """
    # ``str`` has no ``remove`` method, so the extension is swapped with
    # ``os.path.splitext``; only a trailing ".pdf" is dropped.
    root, ext = os.path.splitext(path)
    pdf_removed_path = root if ext == ".pdf" else path
    html_path = pdf_removed_path + ".html"

    html_path = tohtml(
        path=html_path,
        title=title,
        contents=[] if contents is None else contents,
        searchpath=searchpath,
        template=template,
        verbose=verbose,
    )
    pdf_path = html2pdf(
        path=html_path,
        delete_html=True,
        verbose=verbose,
        # Hand over a fresh dictionary so the caller's options are never modified.
        options={} if options is None else dict(options),
    )
    return pdf_path
Source code for gummy.utils.outfmt_utils

Other contents

Social link