# Source code for gummy.utils.download_utils

# coding: utf-8
""" Utility programs for downloading """
import base64
import mimetypes
import os
import re
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request

import bs4

from ._path import IMG_NOT_FOUND_SRC, GUMMY_DIR
from .coloring_utils  import toBLUE, toGREEN, toRED
from .compress_utils import extract_from_compressed, is_compressed
from .generic_utils import readable_bytes
from .monitor_utils import progress_reporthook_create
from .driver_utils import download_PDF_with_driver

# Extension lookup consulted first by ``decide_extension`` (keyed by the value
# passed as ``content_encoding``). Besides the ``x-gzip`` encoding token, the
# keys are image MIME types.
CONTENT_ENCODING2EXT = {
    # compression
    "x-gzip": ".gz",
    # images
    "image/jpeg": ".jpg",
    "image/jpx": ".jpx",
    "image/png": ".png",
    "image/gif": ".gif",
    "image/webp": ".webp",
    "image/x-canon-cr2": ".cr2",
    "image/tiff": ".tif",
    "image/bmp": ".bmp",
    "image/vnd.ms-photo": ".jxr",
    "image/vnd.adobe.photoshop": ".psd",
    "image/x-icon": ".ico",
    "image/heic": ".heic",
}

# Extension lookup consulted second by ``decide_extension`` (keyed by the value
# passed as ``content_type``). All keys are ``application/*`` MIME types.
CONTENT_TYPE2EXT = {
    # archives / compressed containers
    "application/epub+zip": ".epub",
    "application/zip": ".zip",
    "application/x-tar": ".tar",
    "application/x-rar-compressed": ".rar",
    "application/gzip": ".gz",
    "application/x-bzip2": ".bz2",
    "application/x-7z-compressed": ".7z",
    "application/x-xz": ".xz",
    # documents, executables and miscellaneous binaries
    "application/pdf": ".pdf",
    "application/x-msdownload": ".exe",
    "application/x-shockwave-flash": ".swf",
    "application/rtf": ".rtf",
    # NOTE(review): "application/octet-stream" is the generic binary type, so
    # every such response resolves to ".eot" here — confirm this is intended.
    "application/octet-stream": ".eot",
    "application/postscript": ".ps",
    "application/x-sqlite3": ".sqlite",
    "application/x-nintendo-nes-rom": ".nes",
    "application/x-google-chrome-extension": ".crx",
    "application/vnd.ms-cab-compressed": ".cab",
    "application/x-deb": ".deb",
    "application/x-unix-archive": ".ar",
    "application/x-compress": ".Z",
    "application/x-lzip": ".lz",
}

def _normalize_header_value(value):
    """Drop ``;``-separated parameters, surrounding blanks and letter case from a header value."""
    if not isinstance(value, str):
        return value
    return value.split(";", 1)[0].strip().lower()


def decide_extension(content_encoding=None, content_type=None, filename=None):
    """Decide File Extension based on ``content_encoding`` and ``content_type``

    The lookup order is ``content_encoding`` (in ``CONTENT_ENCODING2EXT``), then
    ``content_type`` (in ``CONTENT_TYPE2EXT``), then the text after the last ``"."``
    of ``filename``.

    Args:
        content_encoding (str) : The Content-Encoding entity header is used to compress the media-type.
        content_type (str)     : The MIME type of the resource or the data. Parameters (e.g. ``; charset=utf-8``) and letter case are ignored.
        filename (str)         : The filename, used only when neither header value is recognised.

    Returns:
        ext (str): Starts with ".". An empty string is returned when nothing is recognised and ``filename`` is ``None``.

    Examples:
        >>> from gummy.utils import decide_extension
        >>> decide_extension(content_encoding="x-gzip", content_type="application/zip")
        '.gz'
        >>> decide_extension(content_encoding="image/png", content_type=None)
        '.png'
        >>> decide_extension(content_encoding=None, content_type="application/pdf")
        '.pdf'
        >>> decide_extension(content_type="application/pdf; charset=binary")
        '.pdf'
    """
    # All table keys are lower-case and parameter-free, so normalising the
    # header values only ever turns a miss into a hit.
    ext = (
        CONTENT_ENCODING2EXT.get(_normalize_header_value(content_encoding))
        or CONTENT_TYPE2EXT.get(_normalize_header_value(content_type))
    )
    if ext:
        return ext
    if filename is None:
        # Previously this produced the bogus extension ".None".
        return ""
    # A name without any "." is returned whole with a leading "."; this is kept
    # as-is because ``download_file`` builds its destination as stem + extension.
    return "." + str(filename).split(".")[-1]

def download_file(url, dirname=".", path=None, bar_width=20, verbose=True):
    """Download a file.

    The response headers are inspected first to choose the file extension (see
    ``decide_extension``), then the body is fetched with ``urllib.request.urlretrieve``.
    If a ``urllib.error.URLError`` is raised, the download is retried through
    ``download_PDF_with_driver``.

    Args:
        url (str)       : File URL.
        dirname (str)   : The directory where downloaded data will be saved.
        path (str)      : path/to/downloaded_file. When ``None``, it is built from ``dirname``, the URL's last path segment and the guessed extension.
        bar_width (int) : The width of progress bar.
        verbose (bool)  : Whether print verbose or not.

    Returns:
        path (str) : path/to/downloaded_file, or ``None`` when the webdriver fallback also raised ``urllib.error.URLError``.

    Examples:
        >>> from gummy.utils import download_file
        >>> download_file(url="https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_eye.xml")
        Download a file from https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_eye.xml
                    * Content-Encoding : None
                    * Content-Length   : 333.4 [MB]
                    * Content-Type     : text/plain; charset=utf-8
                    * Save Destination : ./haarcascade_eye.xml
        haarcascade_eye.xml	100.0%[####################] 0.1[s] 5.5[GB/s]	eta -0.0[s]
        './haarcascade_eye.xml'
    """
    try:
        with urllib.request.urlopen(url) as web_file:
            # Get Information from webfile header. The headers object does
            # case-insensitive lookups, unlike a dict built from the raw list.
            headers = web_file.headers
            content_encoding = headers.get("Content-Encoding")
            content_type     = headers.get("Content-Type")
            raw_length       = headers.get("Content-Length", 0)
        content_length, unit = readable_bytes(int(raw_length))
        content_length       = f"{content_length:.1f} [{unit}]"
        # Take the name from the URL *path* so that "?query" / "#fragment" do not
        # leak into the file name; fall back to the raw last segment when the
        # path has no final component (e.g. "https://example.com").
        fn = urllib.parse.urlsplit(url).path.split("/")[-1] or url.split("/")[-1]
        if path is None:
            name = fn.rpartition(".")[0]
            guessed_ext = decide_extension(content_encoding, content_type, fn)
            path = os.path.join(dirname, name+guessed_ext)
        if verbose:
            print(f"""Download a file from {toBLUE(url)}
            * Content-Encoding : {toGREEN(content_encoding)}
            * Content-Length   : {toGREEN(content_length)}
            * Content-Type     : {toGREEN(content_type)}
            * Save Destination : {toBLUE(path)}""")
        urllib.request.urlretrieve(url=url, filename=path, reporthook=progress_reporthook_create(filename=fn, bar_width=bar_width, verbose=verbose))
    except urllib.error.URLError as e:
        if verbose:
            print(f"{toRED(e)} : url={toBLUE(url)}")
            print(f"Try to download using webdriver {toRED('(Open Browser)')}")
        try:
            path = download_PDF_with_driver(url=url, dirname=dirname, verbose=verbose)
        except urllib.error.URLError as e:
            if verbose: print(f"{toRED(e)}")
            path = None
    return path

def src2base64(src, base=None):
    """Create base64 encoded img tag from src url or <img> tag element.

    For an ``<img>`` element, the attributes ``src``, ``data-src`` and
    ``data-original`` are tried in this order and the first value that is not a
    ``javascript:`` / ``data:`` URI is used. The image is then fetched and
    embedded as a ``data:`` URI. Any failure is reported on stdout and a tag
    pointing at ``IMG_NOT_FOUND_SRC`` is returned instead of raising.

    Args:
        src (str, bs4.element.Tag) : Image src url, or ``<img>`` tag element.
        base (str)                 : Base URL. Join a base URL and a possibly relative URL to form an absolute interpretation of the latter.

    Returns:
        str : base64 encoded img tag

    Examples:
        >>> from gummy.utils import src2base64
        >>> img_tag = src2base64(src="https://iwasakishuto.github.io/images/contents-icon/Translation-Gummy.png")
        >>> with open("sample.html", mode="w") as f:
        ...     f.write(img_tag)
        >>> # open sample.html to check the results.
        >>> img_tag = src2base64(src="https://iwasakishuto.github.io/images/XXX/XXXXX.png")
        Tried to get an image but got an error: HTTP Error 404: Not Found
        >>> with open("error.html", mode="w") as f:
        ...     f.write(img_tag)
        >>> # open error.html to check the results.
    """
    if isinstance(src, bs4.element.Tag) and src.name == "img":
        for target in ["src", "data-src", "data-original"]:
            s = src.get(target)
            if (s is not None) and (not re.match(pattern=r"^(javascript:|data:).+", string=s)):
                break
        else:
            # No attribute qualified: do not let the last rejected value leak out.
            s = None
        src = s
    try:
        if not src:
            # ``urljoin(base, None)`` would return ``base`` itself, so the base
            # page would be downloaded and embedded as if it were the image.
            raise ValueError("No usable image source was found.")
        url = urllib.parse.urljoin(base=base, url=src)
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0"})
        with urllib.request.urlopen(request) as web_file:
            mime = web_file.headers.get_content_type()
            data = base64.b64encode(web_file.read()).decode('utf-8')
        if not mime.startswith("image/"):
            # Missing or non-image Content-Type: keep the historical label.
            mime = "image/jpeg"
        img_tag = f'<img src="data:{mime};base64,{data}"/>'
    except Exception as e:
        # Deliberate best-effort: a broken image must not abort the caller.
        print(f"Tried to get an image but got an error: {toRED(e)}")
        img_tag = f'<img src="{IMG_NOT_FOUND_SRC}"/>'
    return img_tag

def path2base64(path):
    """Create base64 encoded img tag from local image.

    The MIME type in the ``data:`` URI is guessed from the file extension of
    ``path``; when the guess is missing or not an ``image/*`` type,
    ``image/jpeg`` is used. If the file cannot be read, the error is printed and
    a tag pointing at ``IMG_NOT_FOUND_SRC`` is returned instead of raising.

    Args:
        path (str) : path/to/image.

    Returns:
        str : base64 encoded img tag

    Examples:
        >>> from gummy.utils import path2base64, download_file
        >>> path = download_file(url="https://iwasakishuto.github.io/images/contents-icon/Translation-Gummy.png")
        Download a file from https://iwasakishuto.github.io/images/contents-icon/Translation-Gummy.png
                    * Content-Encoding : None
                    * Content-Length   : 21.4 [MB]
                    * Content-Type     : image/png
                    * Save Destination : ./Translation-Gummy.png
        Translation-Gummy.png	100.0%[####################] 0.0[s] 3.4[GB/s]	eta -0.0[s]
        >>> img_tag = path2base64(path=path)
        >>> with open("sample.html", mode="w") as f:
        ...     f.write(img_tag)
        >>> # open sample.html to check the results.
    """
    try:
        # Guessed inside the ``try`` so that an invalid ``path`` (e.g. ``None``)
        # still ends up in the not-found branch below.
        mime, _ = mimetypes.guess_type(path)
        if mime is None or not mime.startswith("image/"):
            mime = "image/jpeg"
        with open(path, "rb") as image_file:
            data = base64.b64encode(image_file.read()).decode('utf-8')
        img_tag = f'<img src="data:{mime};base64,{data}"/>'
    except Exception as e:
        # Deliberate best-effort: a broken image must not abort the caller.
        print(toRED(f"[{str(e)}]\nCould not load data from {toBLUE(path)}"))
        img_tag = f'<img src="{IMG_NOT_FOUND_SRC}" />'
    return img_tag

def match2path(file, dirname=GUMMY_DIR):
    """Match url or path to path while downloading if ``file`` is url.

    A string that is not an existing local path is treated as a URL and
    downloaded with ``download_file``. If the downloaded file is a compressed
    archive (according to ``is_compressed``), the PDF is extracted from it.
    Anything else (an existing path, or non-string data) is returned unchanged.

    Args:
        file (data, str) : url or path or data of PDF.
        dirname (str)    : if ``file`` is url, download and save it to ``dirname``. (default= ``GUMMY_DIR``)

    Returns:
        str : path to a PDF. ``None`` when the download failed or the downloaded archive yielded no PDF.
    """
    if not isinstance(file, str) or os.path.exists(file):
        return file

    path = download_file(url=file, dirname=dirname)
    if path is None:
        print(toRED(f"Failed to download PDF from {toBLUE(file)}"))
        # Bail out here: continuing would call ``.split`` on ``None``.
        return None

    ext = "." + path.split(".")[-1]
    if is_compressed(ext):
        # presumably returns a sequence of extracted ``.pdf`` paths — verify
        # against ``extract_from_compressed``.
        extracted_file_paths = extract_from_compressed(path, ext=".pdf", dirname=dirname)
        if not extracted_file_paths:
            print(toRED(f"No PDF was extracted from {toBLUE(path)}"))
            return None
        path = extracted_file_paths[0]
    return path
# Source code for gummy.utils.download_utils
#
# Other contents
#
# Social link