Source code for gummy.utils.generic_utils

# coding: utf-8
""" Utility programs that can be used in general."""
import os
import re
import shutil
import datetime
import argparse
# Probe both NLTK tokenizers once at import time. If a probe call raises
# LookupError (presumably because the tokenizer data is not installed yet),
# download the 'punkt' resource and import the tokenizers again.
try:
    from nltk.tokenize import sent_tokenize, word_tokenize
    _ = sent_tokenize(text="gummy")
    _ = word_tokenize(text="gummy")
except LookupError:
    print("You have to download some resources for using NLTK.")
    import nltk
    nltk.download('punkt')
    from nltk.tokenize import sent_tokenize, word_tokenize

from .coloring_utils import toRED, toBLUE, toGREEN, toACCENT
from ._exceptions import KeyError

def handleKeyError(lst, **kwargs):
    """Check that every value in ``kwargs`` is one of the candidates in ``lst``.

    Args:
        lst (iterable) : Candidates. A one-shot iterator (e.g. a generator) is
                         materialized once so that every keyword argument is
                         checked against the full set of candidates.
        kwargs         : ``key`` is the varname that is easy to understand when an error occurs

    Examples:
        >>> from gummy.utils import handleKeyError
        >>> handleKeyError(lst=range(3), val=1)
        >>> handleKeyError(lst=range(3), val=100)
        KeyError: Please choose the argment val from ['0', '1', '2']. you chose '100'
        >>> handleKeyError(lst=range(3), val1=1, val2=2)
        >>> handleKeyError(lst=range(3), val1=1, val2=100)
        KeyError: Please choose the argment val2 from ['0', '1', '2']. you chose '100'

    Raises:
        KeyError: If any of ``kwargs.values()`` is not in ``lst``.
    """
    from collections.abc import Iterator

    # ``in`` consumes an iterator, which would leave nothing for the next
    # keyword argument (or for the error message). Containers are left as they
    # are so their own membership semantics and speed are preserved.
    if isinstance(lst, Iterator):
        lst = list(lst)
    for k, v in kwargs.items():
        if v not in lst:
            str_candidates = ', '.join([f"'{toGREEN(e)}'" for e in lst])
            raise KeyError(f"Please choose the argment {toBLUE(k)} from [{str_candidates}]. you chose '{toRED(v)}'")
def class2str(class_):
    """Return the name of a class as it appears inside ``str(class_)``.

    ``str(class_)`` looks like ``<class 'name'>``; every such wrapper found in
    the text is replaced by the quoted name alone.

    Args:
        class_ (class): class object

    Returns:
        str: ``str(class_)`` with the ``<class '...'>`` wrapper removed.

    Examples:
        >>> from gummy.utils import class2str
        >>> class2str(str)
        'str'
        >>> class2str(tuple)
        'tuple'
    """
    wrapper = re.compile(r"<class '(.*?)'>")
    text = str(class_)
    return wrapper.sub(r"\1", text)
def handleTypeError(types, **kwargs):
    """Check that the type of every value in ``kwargs`` matches one of ``types``.

    Args:
        types (iterable) : Candidate types. Materialized once, so a one-shot
                           iterator is checked correctly against every keyword argument.
        kwargs           : ``key`` is the varname that is easy to understand when an error occurs

    Examples:
        >>> from gummy.utils import handleTypeError
        >>> handleTypeError(types=[str], val="foo")
        >>> handleTypeError(types=[str, int], val=1)
        >>> handleTypeError(types=[str, int], val=1.)
        TypeError: val must be one of ['str', 'int'], not float
        >>> handleTypeError(types=[str], val1="foo", val2="bar")
        >>> handleTypeError(types=[str, int], val1="foo", val2=1.)
        TypeError: val2 must be one of ['str', 'int'], not float

    Raises:
        TypeError: If the type of any of ``kwargs.values()`` is none of the ``types``
    """
    # ``isinstance`` accepts a tuple of types directly; building it once also
    # keeps a one-shot iterator from being exhausted by the first check.
    accepted = tuple(types)
    for k, v in kwargs.items():
        if not isinstance(v, accepted):
            str_true_types = ', '.join([f"'{toGREEN(class2str(t))}'" for t in accepted])
            str_false_type = class2str(type(v))
            if len(accepted) == 1:
                err_msg = f"must be {str_true_types}"
            else:
                err_msg = f"must be one of [{str_true_types}]"
            raise TypeError(f"{toBLUE(k)} {err_msg}, not {toRED(str_false_type)}")
def str_strip(string):
    """Collapse whitespace runs into a single ``' '`` and trim both ends.

    All consecutive whitespace characters are converted to one half-width
    space, then leading and trailing whitespace is removed. For ``str``
    patterns ``\\s`` already matches Unicode whitespace (including the
    full-width space ``U+3000``), so no literal whitespace characters are
    needed in the pattern.

    Args:
        string (str) : string. Non-string objects are converted with ``str()`` first.

    Returns:
        str: The normalized string.

    Example:
        >>> from gummy.utils import str_strip
        >>> str_strip(" hoge   ")
        'hoge'
        >>> str_strip(" ho    ge   ")
        'ho ge'
        >>> str_strip("\\u3000ho\\u3000\\u3000g \\u3000e")
        'ho g e'
    """
    return re.sub(pattern=r"\s+", repl=" ", string=str(string)).strip()
def now_str(tz=None, fmt="%Y-%m-%d@%H.%M.%S"):
    """Format the current time as a string.

    Args:
        tz (datetime.timezone) : Timezone object passed to ``datetime.datetime.now``.
                                 If no ``tz`` is specified, uses local timezone.
        fmt (str)              : format string. See `Python Documentation <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes>`_

    Returns:
        str: The current time rendered with ``fmt``.

    Example:
        >>> from gummy.utils import now_str
        >>> now_str()
        '2020-09-14@22.31.17'
        >>> now_str(fmt="%A, %d. %B %Y %I:%M%p")
        'Monday, 14. September 2020 10:31PM'
        >>> now_str(tz=datetime.timezone.utc)
        '2020-09-14@13.31.17'
    """
    current_time = datetime.datetime.now(tz=tz)
    return current_time.strftime(fmt)
def mk_class_get(all_classes={}, gummy_abst_class=[], genre=""):
    """Build a ``get`` function that resolves an identifier to an instance.

    The returned ``get(identifier, **kwargs)`` first type-checks ``identifier``
    against ``gummy_abst_class`` plus ``str``. A string is lower-cased, checked
    against the keys of ``all_classes`` and the matching entry is called with
    ``**kwargs``; any other accepted object is returned unchanged.

    Args:
        all_classes (dict)      : Dictionary of ``identifier`` -> callable that creates the instance.
        gummy_abst_class (list) : The list of GummyAbstClass names. A single non-list value is wrapped in a list.
        genre (str)             : Genre of the class. Only used (capitalized) in the generated docstring.

    Returns:
        function: The ``get`` function, with a docstring generated for ``genre``.
    """
    # NOTE: the mutable defaults are never mutated below; new lists are built instead.
    accepted_types = gummy_abst_class if isinstance(gummy_abst_class, list) else [gummy_abst_class]
    accepted_types = accepted_types + [str]

    # Create a get function.
    def get(identifier, **kwargs):
        handleTypeError(types=accepted_types, identifier=identifier)
        if not isinstance(identifier, str):
            # Already an instance of one of the accepted classes.
            return identifier
        identifier = identifier.lower()
        handleKeyError(lst=list(all_classes.keys()), identifier=identifier)
        return all_classes.get(identifier)(**kwargs)

    # Set a docstrings.
    genre = genre.capitalize()
    class_str = ", ".join([class2str(e) for e in accepted_types])
    get.__doc__ = f"""Retrieves a Translation-Gummy {genre} instance.

    Args:
        identifier ({class_str}) : {genre} identifier, string name of a {genre}, or
        {' '*len(class_str)} a Translation-Gummy {genre} instance.

    Returns:
        {class2str(accepted_types[0])} : A Translation-Gummy {genre} instance.
    """
    return get
def recreate_dir(path, exist_ok=True):
    """Super-mkdir. Create a fresh leaf directory and all intermediate ones.

    Anything that already exists at ``path`` (a directory tree or a file) is
    deleted first when ``exist_ok`` is ``True``, then the directory is created.

    Args:
        path (str)      : Path to the target directory.
        exist_ok (bool) : If the target already exists, delete it when ``True``;
                          raise a ``FileExistsError`` when ``False``.

    Raises:
        FileExistsError: If ``path`` exists and ``exist_ok`` is ``False``.
    """
    if os.path.exists(path):
        if not exist_ok:
            raise FileExistsError(f"[Errno 17] File exists: '{path}'")
        if os.path.isdir(path):
            print(toRED("Delete existing directory"))
            shutil.rmtree(path)
        else:
            print(toRED("Delete existing file."))
            os.remove(path)
    # The target was removed above (or never existed), so it must not exist now.
    os.makedirs(path, exist_ok=False)
def readable_bytes(size):
    """Unit conversion for readability.

    The size is divided by 1024 until it drops below 1024 or the largest
    supported unit (``GB``) is reached.

    Args:
        size (int): File size expressed in bytes

    Returns:
        tuple (float, str): (size, unit), where unit is one of ``"B"``, ``"KB"``, ``"MB"``, ``"GB"``.

    Examples:
        >>> from gummy.utils import readable_bytes
        >>> size, unit = readable_bytes(1e2)
        >>> print(f"{size:.2f}[{unit}]")
        100.00[B]
        >>> size, unit = readable_bytes(1e5)
        >>> print(f"{size:.2f}[{unit}]")
        97.66[KB]
        >>> size, unit = readable_bytes(1e10)
        >>> print(f"{size:.2f}[{unit}]")
        9.31[GB]
    """
    # Start from plain bytes: a value that was never divided must not be
    # labelled "KB". "G" is the cap, so the last unit is handled after the loop.
    for prefix in ("", "K", "M"):
        if abs(size) < 1024.0:
            return (size, prefix + "B")
        size /= 1024.0
    return (size, "GB")
def splitted_query_generator(query, maxsize=5000):
    """Use `Natural Language Toolkit <https://www.nltk.org/index.html>`_ to split text wisely.

    Sentences are packed greedily into chunks of at most ``maxsize``
    characters (joined by a single space). A sentence longer than ``maxsize``
    is split into words; a single token that is still longer than ``maxsize``
    is cut into ``maxsize``-character pieces, so the generator always terminates.

    Args:
        query (str)   : English texts.
        maxsize (int) : Number of English characters that this generator can yield at one time.

    Yields:
        str: The next chunk of ``query``, at most ``maxsize`` characters long.

    Raises:
        ValueError: If ``maxsize`` is smaller than 1 (raised when iteration starts).

    Examples:
        >>> from gummy.utils import splitted_query_generator
        >>> gen = splitted_query_generator(query="I have a pen. I have an apple. Apple pen! I have a pen. I have a pineapple. Pineapple pen! Applepen… pineapplepen… Pen-Pineapple-Apple-Pen! Pen-Pineapple-Apple-Pen!", maxsize=25)
        >>> for i,text in enumerate(gen):
        ...     print(i, text)
        0 I have a pen.
        1 I have an apple.
        2 Apple pen! I have a pen.
        3 I have a pineapple.
        4 Pineapple pen! Applepen…
        5 pineapplepen…
        6 Pen-Pineapple-Apple-Pen !
        7 Pen-Pineapple-Apple-Pen!
    """
    from collections import deque

    if maxsize < 1:
        raise ValueError(f"maxsize must be a positive integer, got {maxsize}")

    # A deque gives O(1) pops and pushes at the left end.
    pending = deque(sent_tokenize(query))
    while pending:
        pieces = []
        num_allowed_chars = maxsize
        while pending:
            sentence = pending.popleft()
            len_sentence = len(sentence)
            if num_allowed_chars >= len_sentence:
                pieces.append(sentence)
                # +1 accounts for the joining space.
                num_allowed_chars -= len_sentence + 1
            elif len_sentence > maxsize:
                # Too long even for an empty chunk: split it into words.
                tokens = word_tokenize(sentence)
                # If tokenizing made no progress (a single token, or a token
                # no shorter than the input), fall back to fixed-size slices;
                # re-queuing the same text would loop forever.
                if len(tokens) < 2 or max(map(len, tokens)) >= len_sentence:
                    tokens = [sentence[i:i + maxsize] for i in range(0, len_sentence, maxsize)]
                pending.extendleft(reversed(tokens))
            else:
                # Fits in a chunk, just not this one: carry it over.
                pending.appendleft(sentence)
                break
        if pieces:
            yield " ".join(pieces).rstrip(" ")
def get_latest_filename(dirname=".", ext=None):
    """Returns the most recently edited or added file path.

    "Most recent" means the entry with the greatest ``os.path.getctime`` value.

    Args:
        dirname (str) : Directory whose entries are examined.
        ext (str)     : Consider only entries whose name ends with this extension.
                        If ``None``, all entries are considered.

    Returns:
        str or None: Path (``dirname`` joined with the entry name) of the latest
        entry, or ``None`` if the directory is empty or nothing matches ``ext``.

    Examples:
        >>> from gummy.utils import UTILS_DIR, get_latest_filename
        >>> get_latest_filename(UTILS_DIR)
        '/Users/iwasakishuto/Github/portfolio/Translation-Gummy/gummy/utils/__pycache__'
        >>> get_latest_filename(UTILS_DIR, ext=".py")
        '/Users/iwasakishuto/Github/portfolio/Translation-Gummy/gummy/utils/generic_utils.py'
    """
    candidates = [
        os.path.join(dirname, fn)
        for fn in os.listdir(dirname)
        if ext is None or fn.endswith(ext)
    ]
    # ``default`` covers both an empty directory and "no entry matches ext",
    # which would otherwise make ``max`` raise ValueError.
    return max(candidates, key=os.path.getctime, default=None)
class DictParamProcessor(argparse.Action):
    """Receive an argument as a dictionary.

    Each occurrence of the option must look like ``key=value`` and adds one
    entry to the dictionary stored on the namespace. A value written as
    ``[a, b, c]`` is stored as a list of stripped strings; any other value is
    stored as a stripped string.

    Raises:
        ValueError: You must give one argument for each one keyword.

    Examples:
        >>> import argparse
        >>> from gummy.utils import DictParamProcessor
        >>> parser = argparse.ArgumentParser()
        >>> parser.add_argument("--dict_params", action=DictParamProcessor)
        >>> args = parser.parse_args(args=["--dict_params", "foo = [a, b, c]", "--dict_params", "bar=d"])
        >>> args.dict_params
        {'foo': ['a', 'b', 'c'], 'bar': 'd'}
        >>> args = parser.parse_args(args=["--dict_params", "foo=a, bar=b"])
        ValueError: too many values to unpack (expected 2)

    Note:
        If you run from the command line, execute as follows::

        $ python app.py --dict_params "foo = [a, b, c]" --dict_params bar=c
    """

    def __call__(self, parser, namespace, values, option_strings=None):
        # Reuse the dictionary collected by earlier occurrences, if any.
        collected = getattr(namespace, self.dest) or {}
        # Exactly one "=" is expected; anything else fails the unpacking.
        key, raw_value = values.split("=")
        raw_value = str_strip(raw_value)
        bracketed = re.match(pattern=r"\[(.+)\]", string=raw_value)
        if bracketed is None:
            parsed = raw_value
        else:
            parsed = [str_strip(item) for item in bracketed.group(1).split(",")]
        collected[str_strip(key)] = parsed
        setattr(namespace, self.dest, collected)
def ListParamProcessorCreate(type=str):
    """Create a ListParamProcessor

    Args:
        type (type) : type of each element in list.

    Returns:
        ListParamProcessor (argparse.Action) : Processor which receives list arguments.

    Examples:
        >>> import argparse
        >>> from gummy.utils import ListParamProcessorCreate
        >>> parser = argparse.ArgumentParser()
        >>> parser.add_argument("--list_params", action=ListParamProcessorCreate())
        >>> args = parser.parse_args(args=["--list_params", "[あ, い, う]"])
        >>> args.list_params
        ['あ', 'い', 'う']
    """

    class ListParamProcessor(argparse.Action):
        """Receive List arguments.

        A value enclosed in ``[...]`` or ``(...)`` is split on commas and each
        stripped element is converted with ``type``; any other value becomes a
        one-element list.

        Examples:
            >>> import argparse
            >>> from gummy.utils import ListParamProcessorCreate
            >>> parser = argparse.ArgumentParser()
            >>> parser.add_argument("--list_params", action=ListParamProcessorCreate())
            >>> args = parser.parse_args(args=["--list_params", "[あ, い, う]"])
            >>> args.list_params
            ['あ', 'い', 'う']

        Note:
            If you run from the command line, execute as follows::

            $ python app.py --list_params "[あ, い, う]"
        """

        def __call__(self, parser, namespace, values, option_strings=None, **kwargs):
            enclosed = re.match(pattern=r"(?:\[|\()(.+)(?:\]|\))", string=values)
            if not enclosed:
                converted = [type(values)]
            else:
                converted = [type(str_strip(item)) for item in enclosed.group(1).split(",")]
            setattr(namespace, self.dest, converted)

    return ListParamProcessor
def try_wrapper(func, *args, ret_=None, msg_="", verbose_=True, **kwargs):
    """Wrap ``func(*args, **kwargs)`` with ``try-`` and ``except`` blocks.

    Args:
        func (functions) : functions.
        args (tuple)     : ``*args`` for ``func``.
        kwargs (kwargs)  : ``**kwargs`` for ``func``.
        ret_ (any)       : default ret val, returned when ``func`` raises.
        msg_ (str)       : message to print.
        verbose_ (bool)  : Whether to print message or not. (default= ``True``)

    Returns:
        any: The return value of ``func``, or ``ret_`` if ``func`` raised an ``Exception``.

    Examples:
        >>> from gummy.utils import try_wrapper
        >>> ret = try_wrapper(lambda x,y: x/y, 1, 2, msg_="divide")
        * Succeeded to divide
        >>> ret
        0.5
        >>> ret = try_wrapper(lambda x,y: x/y, 1, 0, msg_="divide")
        * Failed to divide (ZeroDivisionError: division by zero)
        >>> ret is None
        True
        >>> ret = try_wrapper(lambda x,y: x/y, 1, 0, ret_=1, msg_="divide")
        * Failed to divide (ZeroDivisionError: division by zero)
        >>> ret is None
        False
        >>> ret
        1
    """
    try:
        # Only the wrapped call is guarded; ``ret_`` keeps its default on failure.
        ret_ = func(*args, **kwargs)
    except Exception as e:
        prefix = toRED("Failed to ")
        suffix = f" ({toRED(e.__class__.__name__)}: {toACCENT(e)})"
    else:
        prefix = toGREEN("Succeeded to ")
        suffix = ""
    if verbose_:
        print("* " + prefix + msg_ + suffix)
    return ret_