Source code for gummy.utils.download_utils

# coding: utf-8
""" Utility programs for downloading """
import base64
import mimetypes
import os
import re
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request

import bs4

from ._path import IMG_NOT_FOUND_SRC, GUMMY_DIR
from .coloring_utils  import toBLUE, toGREEN, toRED
from .compress_utils import extract_from_compressed, is_compressed
from .generic_utils import readable_bytes
from .monitor_utils import progress_reporthook_create
from .driver_utils import download_PDF_with_driver

# Extension lookup consulted first by ``decide_extension``.
# Keys are header values; apart from the "x-gzip" content-coding token they
# are image media types. Values are file extensions including the leading dot.
CONTENT_ENCODING2EXT = {
    "x-gzip": ".gz",
    "image/jpeg": ".jpg",
    "image/jpx": ".jpx",
    "image/png": ".png",
    "image/gif": ".gif",
    "image/webp": ".webp",
    "image/x-canon-cr2": ".cr2",
    "image/tiff": ".tif",
    "image/bmp": ".bmp",
    "image/vnd.ms-photo": ".jxr",
    "image/vnd.adobe.photoshop": ".psd",
    "image/x-icon": ".ico",
    "image/heic": ".heic",
}

# Extension lookup consulted second by ``decide_extension``.
# Keys are ``application/*`` media types; values are file extensions
# including the leading dot.
CONTENT_TYPE2EXT = {
    "application/epub+zip": ".epub",
    "application/zip": ".zip",
    "application/x-tar": ".tar",
    "application/x-rar-compressed": ".rar",
    "application/gzip": ".gz",
    "application/x-bzip2": ".bz2",
    "application/x-7z-compressed": ".7z",
    "application/x-xz": ".xz",
    "application/pdf": ".pdf",
    "application/x-msdownload": ".exe",
    "application/x-shockwave-flash": ".swf",
    "application/rtf": ".rtf",
    # NOTE(review): "application/octet-stream" is the generic binary type, so
    # every such download is mapped to ".eot" - confirm this is intended.
    "application/octet-stream": ".eot",
    "application/postscript": ".ps",
    "application/x-sqlite3": ".sqlite",
    "application/x-nintendo-nes-rom": ".nes",
    "application/x-google-chrome-extension": ".crx",
    "application/vnd.ms-cab-compressed": ".cab",
    "application/x-deb": ".deb",
    "application/x-unix-archive": ".ar",
    "application/x-compress": ".Z",
    "application/x-lzip": ".lz",
}

def decide_extension(content_encoding=None, content_type=None, filename=None):
    """Decide a file extension from HTTP header values, falling back to the filename.

    The lookup order is ``CONTENT_ENCODING2EXT`` (keyed by ``content_encoding``),
    then ``CONTENT_TYPE2EXT`` (keyed by ``content_type``), then the text after
    the last ``"."`` of ``filename``.

    Header values are normalized before the lookup: any parameters after
    ``";"`` are dropped and the value is lower-cased, so
    ``"Application/PDF; charset=binary"`` matches ``"application/pdf"``.

    Args:
        content_encoding (str) : Value looked up in ``CONTENT_ENCODING2EXT`` (checked first).
        content_type (str)     : Value looked up in ``CONTENT_TYPE2EXT`` (checked second).
        filename (str)         : The filename, used only when neither table matches.

    Returns:
        ext (str): The extension, starting with ``"."``. When neither table
        matches, this is ``"."`` plus the text after the last ``"."`` of
        ``filename`` (the whole name if it contains no dot), or ``""`` if
        ``filename`` is ``None``.

    Examples:
        >>> from gummy.utils import decide_extension
        >>> decide_extension(content_encoding="x-gzip", content_type="application/zip")
        '.gz'
        >>> decide_extension(content_encoding="image/png", content_type=None)
        '.png'
        >>> decide_extension(content_encoding=None, content_type="application/pdf; charset=binary")
        '.pdf'
        >>> decide_extension(filename="paper.v2.pdf")
        '.pdf'
    """
    def _normalize(header_value):
        # Media types are case-insensitive and may carry ";"-separated
        # parameters; the tables only hold the bare lower-case form.
        if not isinstance(header_value, str):
            return header_value
        return header_value.split(";", 1)[0].strip().lower()

    ext = (
        CONTENT_ENCODING2EXT.get(_normalize(content_encoding))
        or CONTENT_TYPE2EXT.get(_normalize(content_type))
    )
    if ext:
        return ext
    if filename is None:
        # Nothing to derive an extension from; avoid returning ".None".
        return ""
    return "." + str(filename).split(".")[-1]
def download_file(url, dirname=".", path=None, bar_width=20, verbose=True):
    """Download a file.

    The response headers are inspected first to choose an extension (via
    ``decide_extension``) and to report metadata, then the file is fetched with
    ``urllib.request.urlretrieve``. If that raises ``urllib.error.URLError``,
    the download is retried through ``download_PDF_with_driver``.

    Args:
        url (str)       : File URL.
        dirname (str)   : The directory where downloaded data will be saved.
        path (str)      : path/to/downloaded_file. When ``None``, it is built from
                          ``dirname``, the last segment of ``url`` and the guessed extension.
        bar_width (int) : The width of progress bar.
        verbose (bool)  : Whether print verbose or not.

    Returns:
        path (str) : path/to/downloaded_file, or ``None`` if the webdriver
        fallback also fails with ``urllib.error.URLError``.

    Examples:
        >>> from gummy.utils import download_file
        >>> download_file(url="https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_eye.xml")
        Download a file from https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_eye.xml
            * Content-Encoding : None
            * Content-Length   : 333.4 [MB]
            * Content-Type     : text/plain; charset=utf-8
            * Save Destination : ./haarcascade_eye.xml
        haarcascade_eye.xml	100.0%[####################] 0.1[s] 5.5[GB/s]	eta -0.0[s]
        './haarcascade_eye.xml'
    """
    try:
        with urllib.request.urlopen(url) as web_file:
            # Use the message object's own ``get``: it matches header names
            # case-insensitively, unlike a plain dict built from the raw pairs.
            headers = web_file.headers
            content_encoding = headers.get("Content-Encoding")
            content_type = headers.get("Content-Type")
            raw_length = headers.get("Content-Length", 0)
        try:
            n_bytes = int(raw_length)
        except (TypeError, ValueError):
            # A malformed Content-Length only affects the printed summary.
            n_bytes = 0
        content_length, unit = readable_bytes(n_bytes)
        content_length = f"{content_length:.1f} [{unit}]"
        fn = url.split("/")[-1]
        if path is None:
            # Everything before the last "." of the URL's final segment
            # (empty when that segment contains no dot).
            name = fn.rpartition(".")[0]
            guessed_ext = decide_extension(content_encoding, content_type, fn)
            path = os.path.join(dirname, name + guessed_ext)
        if verbose:
            print(
                f"Download a file from {toBLUE(url)}\n"
                f"    * Content-Encoding : {toGREEN(content_encoding)}\n"
                f"    * Content-Length   : {toGREEN(content_length)}\n"
                f"    * Content-Type     : {toGREEN(content_type)}\n"
                f"    * Save Destination : {toBLUE(path)}"
            )
        urllib.request.urlretrieve(
            url=url,
            filename=path,
            reporthook=progress_reporthook_create(filename=fn, bar_width=bar_width, verbose=verbose),
        )
    except urllib.error.URLError as e:
        if verbose:
            print(f"{toRED(e)} : url={toBLUE(url)}")
            print(f"Try to download using webdriver {toRED('(Open Browser)')}")
        try:
            # The fallback chooses its own destination inside ``dirname``.
            path = download_PDF_with_driver(url=url, dirname=dirname, verbose=verbose)
        except urllib.error.URLError as e:
            if verbose:
                print(f"{toRED(e)}")
            path = None
    return path
def src2base64(src, base=None):
    """Create base64 encoded img tag from src url or <img> tag element.

    Args:
        src (str, bs4.element.Tag) : Image src url, or ``<img>`` tag element. For a tag,
                                     the first non-empty value among ``src``, ``data-src``
                                     and ``data-original`` that is not a ``javascript:`` /
                                     ``data:`` URI is used.
        base (str)                 : Base URL. Join a base URL and a possibly relative URL
                                     to form an absolute interpretation of the latter.

    Returns:
        str : base64 encoded img tag. The data URI uses the response's content
        type when it is an ``image/*`` type and ``image/jpeg`` otherwise. On any
        error (including no usable source) an ``<img>`` tag pointing at
        ``IMG_NOT_FOUND_SRC`` is returned and the error is printed.

    Examples:
        >>> from gummy.utils import src2base64
        >>> img_tag = src2base64(src="https://iwasakishuto.github.io/images/contents-icon/Translation-Gummy.png")
        >>> with open("sample.html", mode="w") as f:
        ...     f.write(img_tag)
        >>> # open sample.html to check the results.
        >>> img_tag = src2base64(src="https://iwasakishuto.github.io/images/XXX/XXXXX.png")
        Tried to get an image but got an error: HTTP Error 404: Not Found
        >>> with open("error.html", mode="w") as f:
        ...     f.write(img_tag)
        >>> # open error.html to check the results.
    """
    if isinstance(src, bs4.element.Tag) and src.name == "img":
        # Presumably ``data-src`` / ``data-original`` hold the real URL on
        # lazy-loading pages whose ``src`` is only a placeholder.
        candidates = (src.get(attr) for attr in ("src", "data-src", "data-original"))
        src = next(
            (
                value for value in candidates
                if value and not re.match(pattern=r"^(javascript:|data:).+", string=value)
            ),
            None,  # No usable attribute: handled as an error below.
        )
    try:
        if not src:
            # Without this guard ``urljoin`` would return ``base`` itself and
            # the base page would be embedded as if it were an image.
            raise ValueError("no usable image source")
        url = urllib.parse.urljoin(base=base, url=src)
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0"})
        with urllib.request.urlopen(request) as web_file:
            mime_type = web_file.headers.get_content_type()
            data = base64.b64encode(web_file.read()).decode('utf-8')
        if not mime_type.startswith("image/"):
            # Keep the historical default when the server gives no image type.
            mime_type = "image/jpeg"
        img_tag = f'<img src="data:{mime_type};base64,{data}"/>'
    except Exception as e:
        # Best-effort by design: any failure degrades to the placeholder image.
        print(f"Tried to get an image but got an error: {toRED(e)}")
        img_tag = f'<img src="{IMG_NOT_FOUND_SRC}"/>'
    return img_tag
def path2base64(path):
    """Create base64 encoded img tag from local image.

    Args:
        path (str) : path/to/image.

    Returns:
        str : base64 encoded img tag. The data URI's media type is guessed from
        the file extension when that yields an ``image/*`` type and is
        ``image/jpeg`` otherwise. If the file cannot be read, an ``<img>`` tag
        pointing at ``IMG_NOT_FOUND_SRC`` is returned and the error is printed.

    Examples:
        >>> from gummy.utils import path2base64, download_file
        >>> path = download_file(url="https://iwasakishuto.github.io/images/contents-icon/Translation-Gummy.png")
        Download a file from https://iwasakishuto.github.io/images/contents-icon/Translation-Gummy.png
            * Content-Encoding : None
            * Content-Length   : 21.4 [MB]
            * Content-Type     : image/png
            * Save Destination : ./Translation-Gummy.png
        Translation-Gummy.png	100.0%[####################] 0.0[s] 3.4[GB/s]	eta -0.0[s]
        >>> img_tag = path2base64(path=path)
        >>> with open("sample.html", mode="w") as f:
        ...     f.write(img_tag)
        >>> # open sample.html to check the results.
    """
    try:
        mime_type, _ = mimetypes.guess_type(path)
        if not (mime_type and mime_type.startswith("image/")):
            # Unknown or non-image extension: keep the historical default.
            mime_type = "image/jpeg"
        with open(path, "rb") as image_file:
            data = base64.b64encode(image_file.read()).decode('utf-8')
        img_tag = f'<img src="data:{mime_type};base64,{data}"/>'
    except Exception as e:
        # Best-effort by design: any failure degrades to the placeholder image.
        print(toRED(f"[{str(e)}]\nCould not load data from {toBLUE(path)}"))
        img_tag = f'<img src="{IMG_NOT_FOUND_SRC}" />'
    return img_tag
def match2path(file, dirname=GUMMY_DIR):
    """Match url or path to path while downloading if ``file`` is url.

    A string that is not an existing local path is treated as a URL and
    downloaded with ``download_file``. If the downloaded file's extension is
    reported as compressed by ``is_compressed``, the archive is unpacked with
    ``extract_from_compressed`` and the first extracted ``.pdf`` is used.

    Args:
        file (data, str) : url or path or data of PDF.
        dirname (str)    : if ``file`` is url, download and save it to ``dirname``. (default= ``GUMMY_DIR``)

    Returns:
        str : path to a PDF. ``file`` itself is returned unchanged when it is
        not a string or already exists on disk. ``None`` is returned (after
        printing a message) when the download fails or the archive yields no PDF.
    """
    needs_download = isinstance(file, str) and (not os.path.exists(file))
    if not needs_download:
        return file

    path = download_file(url=file, dirname=dirname)
    if path is None:
        print(toRED(f"Failed to download PDF from {toBLUE(file)}"))
        # Stop here: there is no path to inspect.
        return None

    ext = "." + path.split(".")[-1]
    if is_compressed(ext):
        extracted_file_paths = extract_from_compressed(path, ext=".pdf", dirname=dirname)
        # Assumes a sequence of extracted paths is returned - TODO confirm.
        if not extracted_file_paths:
            print(toRED(f"No PDF found in {toBLUE(path)}"))
            return None
        path = extracted_file_paths[0]
    return path