Source code for gummy.models

# coding: utf-8
"""This file defines a model that integrates all of :mod:`journals <gummy.journals>`, 
:mod:`translators <gummy.translators>`, and :mod:`gateways <gummy.gateways>`,
so that all of the following can be done at once.

1. Determine the ``journal_type`` of paper from the ``url`` or file extension.
2. If necessary, use a ``GummyGateway`` to access non-open content of the journal.
3. Parse the paper using ``GummyJournals`` and obtain the contents.
4. Translate the English obtained using ``GummyTranslators`` to Japanese.
5. Arrange Japanese and English according to the `templates <https://github.com/iwasakishuto/Translation-Gummy/tree/master/gummy/templates>`_ .
6. Convert the obtained HTML to PDF.

You can import ``TranslationGummy`` in either of the following two ways.

.. code-block:: python

    >>> from gummy.models import TranslationGummy
    >>> from gummy import TranslationGummy
"""

import os
import time

from . import gateways
from . import journals
from . import translators

from .utils._path import GUMMY_DIR, TEMPLATES_DIR
from .utils.coloring_utils import toACCENT, toBLUE, toGREEN
from .utils.driver_utils import get_driver
from .utils.journal_utils import whichJournal
from .utils.outfmt_utils import tohtml, html2pdf, sanitize_filename
from .utils.download_utils import match2path
from .utils.driver_utils import get_driver
from .utils.pdf_utils import createHighlight, addHighlightToPage

class TranslationGummy():
    """Integrate journals, translators and gateways into a single pipeline.

    This class integrates all of the followings

    - :mod:`journals <gummy.journals>`
    - :mod:`translators <gummy.translators>`
    - :mod:`gateways <gummy.gateways>`

    Args:
        chrome_options (ChromeOptions) : Instance of ChromeOptions. (default= :meth:`get_chrome_options() <gummy.utils.driver_utils.get_chrome_options>` )
        browser (bool)                 : Whether you want to run Chrome with GUI browser. (default= ``False`` )
        driver (WebDriver)             : Selenium WebDriver. If not given, one is created with ``get_driver(chrome_options=chrome_options, browser=browser)``.
        gateway (str, GummyGateWay)    : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= ``"useless"``)
        translator (str, GummyTranslator) : identifier of the Gummy Translator Class. See :mod:`translators <gummy.translators>`. (default= ``"deepl"``)
        maxsize (int)                  : Number of English characters that we can send a request at one time. (default= ``5000``)
        specialize (bool)              : Whether to support multiple languages or specialize. (default= ``True``) If you want to specialize in translating between specific languages, set ``from_lang`` and ``to_lang`` arguments.
        from_lang (str)                : Language before translation. (default= ``"en"``)
        to_lang (str)                  : Language after translation. (default= ``"ja"``)
        verbose (bool)                 : Whether you want to print output or not. (default= ``True`` )
        translator_verbose (bool)      : Whether you want to print translator's output or not. (default= ``True`` )
    """
    def __init__(self, chrome_options=None, browser=False, driver=None,
                 gateway="useless", translator="deepl", maxsize=5000,
                 specialize=True, from_lang="en", to_lang="ja",
                 verbose=True, translator_verbose=True):
        # Reuse the caller's driver when one is supplied; otherwise build one.
        self.driver = driver or get_driver(chrome_options=chrome_options, browser=browser)
        self.gateway = gateway
        self.translator = translators.get(
            translator, maxsize=maxsize, specialize=specialize,
            from_lang=from_lang, to_lang=to_lang, verbose=translator_verbose
        )
        self.verbose = verbose

    @staticmethod
    def _progress_label(index, total, width, prefix=""):
        """Build the ``[<prefix><index>/<total>] `` label used as a progress bar name.

        Args:
            index (int)  : 1-based position of the current content.
            total (int)  : Total number of contents.
            width (int)  : Zero-padding width for ``index``.
            prefix (str) : Text inserted right after the opening bracket.

        Returns:
            str : The formatted label (with a trailing space).
        """
        return f"[{prefix}{index:>0{width}}/{total}] "

    def translate(self, query, barname=None, from_lang="en", to_lang="ja", correspond=False):
        """Translate English into Japanese. See :meth:`translate <gummy.translators.translate>`.

        The call is delegated to ``self.translator.translate`` with this
        instance's WebDriver (``self.driver``).

        Args:
            query (str)       : English to be translated.
            barname (str)     : Bar name for :meth:`ProgressMonitor <gummy.utils.monitor_utils.ProgressMonitor>`.
            from_lang (str)   : Language before translation.
            to_lang (str)     : Language after translation.
            correspond (bool) : Whether to correspond the location of ``from_lang`` correspond to that of ``to_lang``.

        Returns:
            Whatever ``self.translator.translate`` returns for ``query``.

        Examples:
            >>> from gummy import TranslationGummy
            >>> model = TranslationGummy()
            >>> ja = model.translate("This is a pen.")
            DeepLTranslator (query1) 03/30 [##------------------] 10.00% - 3.243[s]
            >>> print(ja)
            'これはペンです。'
        """
        return self.translator.translate(
            query=query, driver=self.driver, barname=barname,
            from_lang=from_lang, to_lang=to_lang, correspond=correspond
        )

    def get_contents(self, url, journal_type=None, crawl_type=None, gateway=None, **gatewaykwargs):
        """Get contents of the journal.

        Args:
            url (str)                   : URL of a paper or ``path/to/local.pdf``.
            journal_type (str)          : Journal type, if you not specify, judge by analyzing from ``url``
                                          ( ``"pdf"`` when ``url`` is an existing local path).
            crawl_type (str)            : Crawling type, if you not specify, use recommended crawling type.
            gateway (str, GummyGateWay) : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`.
                                          Falls back to the gateway given at construction. (default= ``None``)
            gatewaykwargs (dict)        : Gateway keywargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            tuple (str, dict) : (title, content)

        Examples:
            >>> from gummy import TranslationGummy
            >>> model = TranslationGummy()
            >>> title, texts = model.get_contents("https://www.nature.com/articles/ncb0800_500")
            Estimated Journal Type : Nature
            Crawling Type: soup
                :
            >>> print(title)
            Formation of the male-specific muscle in female by ectopic expression
            >>> print(texts[:1])
            [{'head': 'Abstract', 'en': 'The () gene product Fru has been ... for the sexually dimorphic actions of the gene.'}]
        """
        if journal_type is None:
            if os.path.exists(url):
                # An existing local path is treated as a PDF file.
                journal_type = "pdf"
            else:
                journal_type = whichJournal(url, driver=self.driver, verbose=self.verbose)
        gateway = gateway or self.gateway
        crawler = journals.get(journal_type, gateway=gateway, sleep_for_loading=3, verbose=self.verbose)
        title, texts = crawler.get_contents(url=url, driver=self.driver, crawl_type=crawl_type, **gatewaykwargs)
        return title, texts

    def toHTML(self, url, path=None, out_dir=GUMMY_DIR,
               from_lang="en", to_lang="ja", correspond=True,
               journal_type=None, crawl_type=None, gateway=None,
               searchpath=TEMPLATES_DIR, template="paper.html",
               **gatewaykwargs):
        """Get contents from URL and create a HTML.

        Args:
            url (str)                   : URL of a paper or ``path/to/local.pdf``.
            path/out_dir (str)          : Where you save a created HTML. If path is None, save at ``<out_dir>/<title>.html`` (default= ``GUMMY_DIR``)
            from_lang (str)             : Language before translation.
            to_lang (str)               : Language after translation.
            correspond (bool)           : Whether to correspond the location of ``from_lang`` correspond to that of ``to_lang``.
            journal_type (str)          : Journal type, if you specify, use ``journal_type`` journal crawler. (default= ``None``)
            crawl_type (str)            : Crawling type, if you not specify, use recommended crawling type. (default= ``None``)
            gateway (str, GummyGateWay) : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= ``None``)
            searchpath/template (str)   : Use a ``<searchpath>/<template>`` tpl for creating HTML. (default= ``TEMPLATES_DIR/paper.html``)
            gatewaykwargs (dict)        : Gateway keywargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            The value returned by ``tohtml`` (stored as ``htmlpath``).
        """
        title, contents = self.get_contents(
            url=url, journal_type=journal_type, crawl_type=crawl_type,
            gateway=gateway, **gatewaykwargs
        )
        print(f"\nTranslation: {toACCENT(self.translator.name)}\n{'='*30}")
        len_contents = len(contents)
        # Zero-padding width for the progress labels; invariant over the loop.
        width = len(str(len_contents))
        translate_kwargs = dict(from_lang=from_lang, to_lang=to_lang, correspond=correspond)

        if crawl_type == "pdf":
            # Combine split text for faster translation: non-empty ``raw``
            # fragments are accumulated (and removed from their entry) until an
            # entry with an empty ``raw`` is met, which receives the translation
            # of everything accumulated so far.
            raw = ""
            for i, content in enumerate(contents):
                barname = self._progress_label(i + 1, len_contents, width) + toACCENT(content.get("head", "\t"))
                if "raw" in content:
                    if content["raw"] == "":
                        content["raw"], content["translated"] = self.translator.translate_wrapper(
                            query=raw, barname=barname, **translate_kwargs
                        )
                        raw = ""
                    else:
                        raw += " " + content.pop("raw")
                elif "img" in content and self.verbose:
                    print(barname + "<img>")
            # Flush whatever is still buffered into the last visited entry.
            if len(raw) > 0:
                content["raw"], content["translated"] = self.translator.translate_wrapper(
                    query=raw, barname=barname, **translate_kwargs
                )
        else:
            for i, content in enumerate(contents):
                barname = self._progress_label(i + 1, len_contents, width) + toACCENT(content.get("head", "\t"))
                if "raw" in content:
                    content["raw"], content["translated"] = self.translator.translate_wrapper(
                        query=content["raw"], barname=barname, **translate_kwargs
                    )
                elif "img" in content and self.verbose:
                    print(barname + "<img>")

        if path is None:
            path = os.path.join(out_dir, sanitize_filename(fp=title, dirname="."))
        htmlpath = tohtml(
            path=path, title=title, contents=contents,
            searchpath=searchpath, template=template, verbose=self.verbose
        )
        return htmlpath

    def toPDF(self, url, path=None, out_dir=GUMMY_DIR,
              from_lang="en", to_lang="ja", correspond=True,
              journal_type=None, crawl_type=None, gateway=None,
              searchpath=TEMPLATES_DIR, template="paper.html",
              delete_html=True, options=None,
              **gatewaykwargs):
        """Get contents from URL and create a PDF.

        The HTML is first produced by :meth:`toHTML` and then converted with
        ``html2pdf``.

        Args:
            url (str)                   : URL of a paper or ``path/to/local.pdf``.
            path/out_dir (str)          : Where you save a created HTML. If path is None, save at ``<out_dir>/<title>.html`` (default= ``GUMMY_DIR``)
            from_lang (str)             : Language before translation.
            to_lang (str)               : Language after translation.
            correspond (bool)           : Whether to correspond the location of ``from_lang`` correspond to that of ``to_lang``.
            journal_type (str)          : Journal type, if you specify, use ``journal_type`` journal crawler. (default= ``None``)
            crawl_type (str)            : Crawling type, if you not specify, use recommended crawling type. (default= ``None``)
            gateway (str, GummyGateWay) : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= ``None``)
            searchpath/template (str)   : Use a ``<searchpath>/<template>`` tpl for creating HTML. (default= ``TEMPLATES_DIR/paper.html``)
            delete_html (bool)          : Whether you want to delete an intermediate html file. (default= ``True``)
            options (dict)              : Options for wkhtmltopdf. See https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
                                          ``None`` means an empty dict. (default= ``None``)
            gatewaykwargs (dict)        : Gateway keywargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            The value returned by ``html2pdf`` (stored as ``pdfpath``).
        """
        # A fresh dict per call: a ``{}`` default would be shared by all calls.
        if options is None:
            options = {}
        htmlpath = self.toHTML(
            url=url, path=path, out_dir=out_dir,
            from_lang=from_lang, to_lang=to_lang, correspond=correspond,
            journal_type=journal_type, crawl_type=crawl_type, gateway=gateway,
            searchpath=searchpath, template=template,
            **gatewaykwargs
        )
        if self.verbose:
            print(f"\nConvert from HTML to PDF\n{'='*30}")
        pdfpath = html2pdf(path=htmlpath, delete_html=delete_html, verbose=self.verbose, options=options)
        return pdfpath

    def highlight(self, url, path=None, out_dir=GUMMY_DIR,
                  from_lang="en", to_lang="ja",
                  journal_type=None, gateway=None,
                  ignore_length=10, highlight_color=None,
                  **gatewaykwargs):
        """Get contents from URL and create a copy of the PDF annotated with translations.

        Each text block of at least ``ignore_length`` characters is translated
        and attached to its page as a highlight built by ``createHighlight``
        from the block's ``bbox``.

        Args:
            url (str)                   : URL of a paper or ``path/to/local.pdf``.
            path/out_dir (str)          : Where you save the highlighted PDF. If path is None, save at
                                          ``<out_dir>/<name>_higlighted<ext>`` where ``<name><ext>`` is the base name of
                                          the path returned by ``match2path(url, dirname=out_dir)``. (default= ``GUMMY_DIR``)
            from_lang (str)             : Language before translation.
            to_lang (str)               : Language after translation.
            journal_type (str)          : Journal type, if you specify, use ``journal_type`` journal crawler. (default= ``None``)
            gateway (str, GummyGateWay) : identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= ``None``)
            ignore_length (int)         : If the number of English characters is smaller than ``ignore_length`` , do not highlight. (default= ``10``)
            highlight_color (list)      : The highlight color. ``None`` means ``[1,1,0]``. (default= ``None``)
            gatewaykwargs (dict)        : Gateway keywargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            str : Path of the written PDF.
        """
        from PyPDF2 import PdfFileWriter, PdfFileReader

        # A fresh list per call: a list default would be shared by all calls.
        if highlight_color is None:
            highlight_color = [1, 1, 0]

        _title, contents = self.get_contents(
            url=url, journal_type=journal_type, crawl_type="pdf",
            gateway=gateway, **gatewaykwargs
        )
        path_ = match2path(url, dirname=out_dir)
        # NOTE(review): the "_higlighted" suffix is misspelled, but it is kept
        # as-is because existing callers may rely on the generated file name.
        out_path = path or os.path.join(out_dir, "_higlighted".join(os.path.splitext(os.path.basename(path_))))
        len_contents = len(contents)
        width = len(str(len_contents))

        with open(path_, "rb") as inPdf:
            pdfInput = PdfFileReader(inPdf)
            pdfOutput = PdfFileWriter()
            page_no = 0
            page = None
            for i, content in enumerate(contents):
                # An entry carrying "head" starts the next page: the finished
                # page is appended to the output before moving on.
                if "head" in content:
                    if page_no > 0:
                        pdfOutput.addPage(page)
                    page = pdfInput.getPage(page_no)
                    page_no += 1
                raw = content.get("raw", "")
                if raw == "" or len(raw) < ignore_length:
                    continue
                barname = self._progress_label(i + 1, len_contents, width, prefix=f"page.{page_no} ")
                # Go through ``self.translate`` so the translator receives
                # ``self.driver``, like every other translation in this class.
                translated = self.translate(
                    query=raw, barname=barname,
                    from_lang=from_lang, to_lang=to_lang, correspond=False
                )
                highlight = createHighlight(bbox=content["bbox"], contents=translated, color=highlight_color)
                addHighlightToPage(highlight, page, pdfOutput)
            # ``page`` is still None when no entry had a "head" (e.g. empty contents).
            if page is not None:
                pdfOutput.addPage(page)
            # Written while the source PDF is still open.
            # NOTE(review): presumably PyPDF2 resolves page objects lazily from the input stream — confirm.
            with open(out_path, "wb") as outPdf:
                pdfOutput.write(outPdf)
        print(f"{toBLUE(out_path)} is created.")
        return out_path