Source code for gummy.models

# coding: utf-8
"""This file defines a model that integrates all of :mod:`journals <gummy.journals>`,
:mod:`translators <gummy.translators>`, and :mod:`gateways <gummy.gateways>`,
making it possible to do all of the following at once.

1. Determine the ``journal_type`` of paper from the ``url`` or file extension.
2. If necessary, use a ``GummyGateway`` to access non-open content of the journal.
3. Parse the paper using ``GummyJournals`` and obtain the contents.
4. Translate the English obtained using ``GummyTranslators`` to Japanese.
5. Arrange Japanese and English according to the `templates <https://github.com/iwasakishuto/Translation-Gummy/tree/master/gummy/templates>`_ .
6. Convert the obtained HTML to PDF.

You can import ``TranslationGummy`` in either of the following two ways.

.. code-block:: python

    >>> from gummy.models import TranslationGummy
    >>> from gummy import TranslationGummy
"""

import os
import time

from . import gateways
from . import journals
from . import translators

from .utils._path import GUMMY_DIR, TEMPLATES_DIR
from .utils.coloring_utils import toACCENT, toBLUE, toGREEN
from .utils.driver_utils import get_driver
from .utils.journal_utils import whichJournal
from .utils.outfmt_utils import tohtml, html2pdf, sanitize_filename
from .utils.download_utils import match2path
from .utils.driver_utils import get_driver
from .utils.pdf_utils import createHighlight, addHighlightToPage

class TranslationGummy():
    """This class integrates all of the following

    - :mod:`journals <gummy.journals>`
    - :mod:`translators <gummy.translators>`
    - :mod:`gateways <gummy.gateways>`

    Args:
        chrome_options (ChromeOptions)    : Instance of ChromeOptions. (default= :meth:`get_chrome_options() <gummy.utils.driver_utils.get_chrome_options>` )
        browser (bool)                    : Whether you want to run Chrome with GUI browser. (default= ``False`` )
        driver (WebDriver)                : Selenium WebDriver. If not given, one is created with ``get_driver(chrome_options=chrome_options, browser=browser)``.
        gateway (str, GummyGateWay)       : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= `"useless"`)
        translator (str, GummyTranslator) : identifier of the Gummy Translator Class. See :mod:`translators <gummy.translators>`. (default= `"deepl"`)
        maxsize (int)                     : Number of English characters that we can send a request at one time. (default= ``5000``)
        specialize (bool)                 : Whether to support multiple languages or specialize. (default= ``True``) If you want to specialize in translating between specific languages, set ``from_lang`` and ``to_lang`` arguments.
        from_lang (str)                   : Language before translation. (default= ``"en"``)
        to_lang (str)                     : Language after translation. (default= ``"ja"``)
        verbose (bool)                    : Whether you want to print output or not. (default= ``True`` )
        translator_verbose (bool)         : Whether you want to print translator's output or not. (default= ``True`` )
    """
    def __init__(self, chrome_options=None, browser=False, driver=None,
                 gateway="useless", translator="deepl", maxsize=5000,
                 specialize=True, from_lang="en", to_lang="ja",
                 verbose=True, translator_verbose=True):
        # Reuse the caller's driver when one is supplied; otherwise create one.
        self.driver = driver or get_driver(chrome_options=chrome_options, browser=browser)
        # Kept as given; used as the fallback gateway in ``get_contents``.
        self.gateway = gateway
        self.translator = translators.get(
            translator, maxsize=maxsize, specialize=specialize,
            from_lang=from_lang, to_lang=to_lang, verbose=translator_verbose,
        )
        self.verbose = verbose

    def translate(self, query, barname=None, from_lang="en", to_lang="ja", correspond=False):
        """Translate English into Japanese. See :meth:`translate <gummy.translators.translate>`.

        Args:
            query (str)        : English to be translated.
            barname (str)      : Bar name for :meth:`ProgressMonitor <gummy.utils.monitor_utils.ProgressMonitor>`.
            from_lang (str)    : Language before translation.
            to_lang (str)      : Language after translation.
            correspond (bool)  : Whether to correspond the location of ``from_lang`` correspond to that of ``to_lang``.

        Returns:
            The value returned by the translator's ``translate`` method.

        Examples:
            >>> from gummy import TranslationGummy
            >>> model = TranslationGummy()
            >>> ja = model.translate("This is a pen.")
            DeepLTranslator (query1) 03/30 [##------------------] 10.00% - 3.243[s]
            >>> print(ja)
            'これはペンです。'
        """
        return self.translator.translate(
            query=query, driver=self.driver, barname=barname,
            from_lang=from_lang, to_lang=to_lang, correspond=correspond,
        )

    def get_contents(self, url, journal_type=None, crawl_type=None, gateway=None, **gatewaykwargs):
        """Get contents of the journal.

        Args:
            url (str)                   : URL of a paper or ``path/to/local.pdf``.
            journal_type (str)          : Journal type. If not specified, ``"pdf"`` is used when ``url`` is an existing local path, otherwise it is judged by analyzing ``url``.
            crawl_type (str)            : Crawling type, if you not specify, use recommended crawling type.
            gateway (str, GummyGateWay) : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. If ``None`` (or falsy), the gateway given to the constructor is used. (default= ``None``)
            gatewaykwargs (dict)        : Gateway keywargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            tuple (str, dict) : (title, content)

        Examples:
            >>> from gummy import TranslationGummy
            >>> model = TranslationGummy()
            >>> title, texts = model.get_contents("https://www.nature.com/articles/ncb0800_500")
            Estimated Journal Type : Nature
            Crawling Type: soup
                :
            >>> print(title)
            Formation of the male-specific muscle in female by ectopic expression
            >>> print(texts[:1])
            [{'head': 'Abstract', 'en': 'The  () gene product Fru has been ... for the sexually dimorphic actions of the gene.'}]
        """
        if journal_type is None:
            if os.path.exists(url):
                # An existing local path is treated as a PDF file.
                journal_type = "pdf"
            else:
                journal_type = whichJournal(url, driver=self.driver, verbose=self.verbose)
        gateway = gateway or self.gateway
        crawler = journals.get(journal_type, gateway=gateway, sleep_for_loading=3, verbose=self.verbose)
        title, texts = crawler.get_contents(url=url, driver=self.driver, crawl_type=crawl_type, **gatewaykwargs)
        return title, texts

    def toHTML(self, url, path=None, out_dir=GUMMY_DIR,
               from_lang="en", to_lang="ja", correspond=True,
               journal_type=None, crawl_type=None, gateway=None,
               searchpath=TEMPLATES_DIR, template="paper.html",
               **gatewaykwargs):
        """Get contents from URL and create a HTML.

        Args:
            url (str)                   : URL of a paper or ``path/to/local.pdf``.
            path/out_dir (str)          : Where you save a created HTML. If path is None, it is built from ``out_dir`` and the sanitized title. (default= ``GUMMY_DIR``)
            from_lang (str)             : Language before translation.
            to_lang (str)               : Language after translation.
            correspond (bool)           : Whether to correspond the location of ``from_lang`` correspond to that of ``to_lang``.
            journal_type (str)          : Journal type, if you specify, use ``journal_type`` journal crawler. (default= `None`)
            crawl_type (str)            : Crawling type, if you not specify, use recommended crawling type. (default= `None`)
            gateway (str, GummyGateWay) : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= `None`)
            searchpath/template (str)   : Use a ``<searchpath>/<template>`` tpl for creating HTML. (default= `TEMPLATES_DIR/paper.html`)
            gatewaykwargs (dict)        : Gateway keywargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            The value returned by ``tohtml`` (the path of the created HTML).
        """
        title, contents = self.get_contents(
            url=url, journal_type=journal_type, crawl_type=crawl_type,
            gateway=gateway, **gatewaykwargs
        )
        # NOTE: this header is printed regardless of ``self.verbose``.
        print(f"\nTranslation: {toACCENT(self.translator.name)}\n{'='*30}")
        len_contents = len(contents)
        # Zero-padding width of the running index in the progress-bar name.
        width = len(str(len_contents))
        translate_kwargs = dict(from_lang=from_lang, to_lang=to_lang, correspond=correspond)
        if crawl_type=="pdf":
            # Combine split text for faster translation: non-empty ``raw``
            # fragments are accumulated (and removed from their entries) until
            # an entry with an empty ``raw`` is met, which then receives the
            # translation of everything accumulated so far.
            raw = ""
            for i,content in enumerate(contents):
                barname = f"[{i+1:>0{width}}/{len_contents}] " + toACCENT(content.get("head","\t"))
                if "raw" in content:
                    if content["raw"]=="":
                        content["raw"], content["translated"] = self.translator.translate_wrapper(query=raw, barname=barname, **translate_kwargs)
                        raw = ""
                    else:
                        raw += " "+content.pop("raw")
                elif "img" in content and self.verbose:
                    print(barname + "<img>")
            if len(raw)>0:
                # Text left over after the last empty ``raw`` entry is attached
                # to the final entry. ``raw`` can only be non-empty if the loop
                # ran, so ``content`` and ``barname`` are bound here.
                content["raw"], content["translated"] = self.translator.translate_wrapper(query=raw, barname=barname, **translate_kwargs)
        else:
            for i,content in enumerate(contents):
                barname = f"[{i+1:>0{width}}/{len_contents}] " + toACCENT(content.get("head","\t"))
                if "raw" in content:
                    content["raw"], content["translated"] = self.translator.translate_wrapper(query=content["raw"], barname=barname, **translate_kwargs)
                elif "img" in content and self.verbose:
                    print(barname + "<img>")
        if path is None:
            path = os.path.join(out_dir, sanitize_filename(fp=title, dirname="."))
        htmlpath = tohtml(
            path=path, title=title, contents=contents,
            searchpath=searchpath, template=template, verbose=self.verbose
        )
        return htmlpath

    def toPDF(self, url, path=None, out_dir=GUMMY_DIR,
              from_lang="en", to_lang="ja", correspond=True,
              journal_type=None, crawl_type=None, gateway=None,
              searchpath=TEMPLATES_DIR, template="paper.html",
              delete_html=True, options=None,
              **gatewaykwargs):
        """Get contents from URL and create a PDF.

        The HTML is created first with :meth:`toHTML` and then converted with ``html2pdf``.

        Args:
            url (str)                   : URL of a paper or ``path/to/local.pdf``.
            path/out_dir (str)          : Forwarded to :meth:`toHTML` to decide where the intermediate HTML is saved. (default= ``GUMMY_DIR``)
            from_lang (str)             : Language before translation.
            to_lang (str)               : Language after translation.
            correspond (bool)           : Whether to correspond the location of ``from_lang`` correspond to that of ``to_lang``.
            journal_type (str)          : Journal type, if you specify, use ``journal_type`` journal crawler. (default= `None`)
            crawl_type (str)            : Crawling type, if you not specify, use recommended crawling type. (default= `None`)
            gateway (str, GummyGateWay) : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= `None`)
            searchpath/template (str)   : Use a ``<searchpath>/<template>`` tpl for creating HTML. (default= `TEMPLATES_DIR/paper.html`)
            delete_html (bool)          : Whether you want to delete an intermediate html file. (default= `True`)
            options (dict)              : Options for wkhtmltopdf. See https://wkhtmltopdf.org/usage/wkhtmltopdf.txt (default= `None`, which is treated as `{}`)
            gatewaykwargs (dict)        : Gateway keywargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            The value returned by ``html2pdf`` (the path of the created PDF).
        """
        # A fresh dict per call: a ``{}`` default in the signature would be one
        # shared object that a callee could mutate across calls.
        if options is None:
            options = {}
        htmlpath = self.toHTML(
            url=url, path=path, out_dir=out_dir,
            from_lang=from_lang, to_lang=to_lang, correspond=correspond,
            journal_type=journal_type, crawl_type=crawl_type, gateway=gateway,
            searchpath=searchpath, template=template,
            **gatewaykwargs
        )
        if self.verbose: print(f"\nConvert from HTML to PDF\n{'='*30}")
        pdfpath = html2pdf(path=htmlpath, delete_html=delete_html, verbose=self.verbose, options=options)
        return pdfpath

    def highlight(self, url, path=None, out_dir=GUMMY_DIR,
                  from_lang="en", to_lang="ja", journal_type=None, gateway=None,
                  ignore_length=10, highlight_color=None,
                  **gatewaykwargs):
        """Get contents from a PDF and create a copy with translated highlights.

        Contents are crawled with ``crawl_type="pdf"``; each text whose length is
        at least ``ignore_length`` is translated and attached to the page as a
        highlight built from the content's ``bbox``.

        Args:
            url (str)                   : URL of a paper or ``path/to/local.pdf``.
            path/out_dir (str)          : Where you save the highlighted PDF. If path is None, save at ``<out_dir>/<name>_higlighted<ext>`` where ``<name><ext>`` is the basename of the source PDF. (default= ``GUMMY_DIR``)
            from_lang (str)             : Language before translation.
            to_lang (str)               : Language after translation.
            journal_type (str)          : Journal type, if you specify, use ``journal_type`` journal crawler. (default= `None`)
            gateway (str, GummyGateWay) : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= `None`)
            ignore_length (int)         : If the number of English characters is smaller than ``ignore_length`` , do not highlight
            highlight_color (list)      : The highlight color. (default= `None`, which is treated as `[1,1,0]`)
            gatewaykwargs (dict)        : Gateway keywargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            str : Path of the written PDF.
        """
        from PyPDF2 import PdfFileWriter, PdfFileReader
        # A fresh list per call instead of a shared mutable default argument.
        if highlight_color is None:
            highlight_color = [1, 1, 0]
        title, contents = self.get_contents(
            url=url, journal_type=journal_type, crawl_type="pdf",
            gateway=gateway, **gatewaykwargs
        )
        path_ = match2path(url, dirname=out_dir)
        # The (misspelled) suffix is kept as is so existing output names do not change.
        out_path = path or os.path.join(out_dir, "_higlighted".join(os.path.splitext(os.path.basename(path_))))
        with open(path_, "rb") as inPdf:
            pdfInput  = PdfFileReader(inPdf)
            pdfOutput = PdfFileWriter()
            page_no = 0
            page = None
            len_contents = len(contents)
            # Zero-padding width of the running index in the progress-bar name.
            width = len(str(len_contents))
            for i,content in enumerate(contents):
                if "head" in content:
                    # A "head" entry starts the next page: flush the previous
                    # page (if any) and load the next one from the input.
                    if page_no>0:
                        pdfOutput.addPage(page)
                    page = pdfInput.getPage(page_no)
                    page_no += 1
                raw = content.get("raw", "")
                if raw=="" or len(raw)<ignore_length: continue
                barname = f"[page.{page_no} {i+1:>0{width}}/{len_contents}] "
                translated = self.translator.translate(query=raw, barname=barname, from_lang=from_lang, to_lang=to_lang, correspond=False)
                highlight = createHighlight(bbox=content["bbox"], contents=translated, color=highlight_color)
                addHighlightToPage(highlight, page, pdfOutput)
            # ``page`` is still None when no entry carried a "head" (e.g. empty
            # contents); there is then no pending page to flush.
            if page is not None:
                pdfOutput.addPage(page)
            with open(out_path, "wb") as outPdf:
                pdfOutput.write(outPdf)
            print(f"{toBLUE(out_path)} is created.")
        return out_path
Source code for gummy.models

Other contents

Social link