Source code for gummy.journals

# coding: utf-8
"""These classes get contents from paper pages ( ``html`` ) or files ( ``PDF``, ``TeX``)

Supported journals are listed `here (Supported journals · iwasakishuto/Translation-Gummy Wiki) <https://github.com/iwasakishuto/Translation-Gummy/wiki/Supported-journals>`_, 
and if you want to support for new journals, please request on twitter DM |twitter badge| or `Github issues <https://github.com/iwasakishuto/Translation-Gummy/issues>`_.

You can easily get (import) a ``Journal Crawler Class`` in the following ways.

.. code-block:: python

    >>> from gummy import journals
    >>> crawler = journals.get("nature")
    >>> crawler
    <gummy.journals.NatureCrawler at 0x1256777c0>
    >>> from gummy.journals import NatureCrawler
    >>> nature = NatureCrawler()
    >>> nature
    <gummy.journals.NatureCrawler at 0x1253da9a0>
    >>> crawler = journals.get(nature)
    >>> id(crawler) == id(nature)
    True

.. |twitter badge| image:: https://img.shields.io/badge/twitter-Requests-1da1f2?style=flat-square&logo=twitter
   :target: https://www.twitter.com/messages/compose?recipient_id=1042783905697288193&text=Please%20support%20this%20journal%3A%20
"""
import io
import re
import os
import sys
import time
import urllib
import datetime
import requests
import warnings
from abc import ABCMeta
from bs4 import BeautifulSoup
from pylatexenc.latex2text import LatexNodes2Text
from requests.exceptions import RequestException

from . import gateways
from .utils._exceptions import JournalTypeIndistinguishableError
from .utils._path import GUMMY_DIR
from .utils.coloring_utils import toGREEN, toBLUE, toRED, toACCENT
from .utils.compress_utils import extract_from_compressed, is_compressed
from .utils.download_utils import download_file, decide_extension, src2base64
from .utils.driver_utils import wait_until_all_elements, try_find_element_click, scrollDown
from .utils.generic_utils import mk_class_get, handleKeyError, str_strip, now_str
from .utils.journal_utils import canonicalize, whichJournal
from .utils.monitor_utils import ProgressMonitor
from .utils.outfmt_utils import sanitize_filename
from .utils.pdf_utils import get_pdf_contents
from .utils.soup_utils import str2soup, split_section, group_soup_with_head, replace_soup_tag, find_target_text, find_target_id, kwargs2tag

SUPPORTED_CRAWL_TYPES = ["soup", "tex", "pdf"]

class GummyAbstJournal(metaclass=ABCMeta):
    """If you want to define your own journal crawlers, please inherit this class and define these methods:

    - if ``crawl_type`` == ``"tex"``:

        - (required) :meth:`get_contents_tex(self, url, driver=None) <gummy.journals.GummyAbstJournal.get_contents_tex>`
        - (required) ``get_sections_from_tex(tex)``
        - (required) ``get_contents_from_tex_sections(tex_sections)``

    - if ``crawl_type`` == ``"soup"``:

        - :meth:`get_soup_source(self, url, driver=None, **gatewaykwargs) <gummy.journals.GummyAbstJournal.get_soup_source>`
        - :meth:`get_contents_soup(self, url, driver=None, **gatewaykwargs) <gummy.journals.GummyAbstJournal.get_contents_soup>`
        - (if necessary) :meth:`get_contents_from_soup_sections(self, soup_sections) <gummy.journals.GummyAbstJournal.get_contents_from_soup_sections>`
        - (required) :meth:`get_title_from_soup(self, soup) <gummy.journals.GummyAbstJournal.get_title_from_soup>`
        - (required) :meth:`get_sections_from_soup(self, soup) <gummy.journals.GummyAbstJournal.get_sections_from_soup>`
        - (required) :meth:`get_head_from_section(self, section) <gummy.journals.GummyAbstJournal.get_head_from_section>`
        - (if necessary) :meth:`make_elements_visible(self, driver) <gummy.journals.GummyAbstJournal.make_elements_visible>`
        - :meth:`decompose_soup_tags(self, soup) <gummy.journals.GummyAbstJournal.decompose_soup_tags>`
        - :meth:`organize_soup_section(self, section, head="", head_is_not_added=True) <gummy.journals.GummyAbstJournal.organize_soup_section>`

    - if ``crawl_type`` == ``"pdf"``:

        - :meth:`get_contents_pdf(self, url, driver=None) <gummy.journals.GummyAbstJournal.get_contents_pdf>`
        - :meth:`get_pdf_source(self, url, driver=None) <gummy.journals.GummyAbstJournal.get_pdf_source>`
        - :meth:`get_title_from_pdf(self, pdf_gen) <gummy.journals.GummyAbstJournal.get_title_from_pdf>`
        - :meth:`get_contents_from_pdf_pages(self, pdf_pages) <gummy.journals.GummyAbstJournal.get_contents_from_pdf_pages>`

    A minimal illustrative subclass for the ``"soup"`` crawl type is sketched in a comment after this class definition.

    Args:
        crawl_type (str)            : Crawling type. If not specified, the crawler's recommended type is used. (default= ``"soup"`` )
        gateway (str, GummyGateWay) : Identifier of the Gummy Gateway Class. See :mod:`gateways <gummy.gateways>`. (default= ``"useless"`` )
        sleep_for_loading (int)     : Number of seconds to wait for a web page to load. (default= ``3`` )
        verbose (bool)              : Whether to print verbose output or not. (default= ``True`` )
        DecomposeTexTags (list)     : TeX tags to be removed in advance for easier analysis. (default= ``["<cit.>","\xa0","<ref>"]`` )
        DecomposeSoupTags (list)    : HTML tags to be removed in advance for easier analysis. Each element is a ``dict`` of kwargs for ``soup.find_all``. (default: the ``link``, ``meta``, ``noscript``, ``script``, ``style``, and ``sup`` tags)
        subheadTags (list)          : HTML tag names used to identify the subheadings.
        kwargs (dict)               : There is no use for it so far.

    Attributes:
        crawling_logs (dict) : Crawling logs.
    """
    def __init__(self, crawl_type="soup", gateway="useless", sleep_for_loading=3, verbose=True,
                 DecomposeTexTags=["<cit.>", "\xa0", "<ref>"],
                 DecomposeSoupTags=[{'name': 'link'}, {'name': 'meta'}, {'name': 'noscript'}, {'name': 'script'}, {'name': 'style'}, {'name': 'sup'}],
                 subheadTags=[], **kwargs):
        handleKeyError(lst=SUPPORTED_CRAWL_TYPES, crawl_type=crawl_type)
        self.crawl_type = crawl_type
        self.gateway = gateways.get(gateway, verbose=verbose)
        self.sleep_for_loading = sleep_for_loading
        self.verbose = verbose
        self.DecomposeTexTags = DecomposeTexTags
        self.DecomposeSoupTags = DecomposeSoupTags
        self.subheadTags = subheadTags
        self.crawling_logs = {}
        self.__dict__.update(kwargs)

    @property
    def class_name(self):
        """Same as ``self.__class__.__name__``."""
        return self.__class__.__name__

    @property
    def name(self):
        """Journal crawler name (the class name without the ``Crawler`` suffix)."""
        return self.class_name.replace("Crawler", "")

    @property
    def journal_type(self):
        """Journal type."""
        return self.name.lower()

    def _store_crawling_logs(self, **kwargs):
        """Store ``kwargs`` in ``self.crawling_logs``."""
        self.crawling_logs.update(kwargs)

    @property
    def default_title(self):
        """Default title."""
        return sanitize_filename(fp=self.crawling_logs.get("url", now_str()))

    def get_contents(self, url, driver=None, crawl_type=None, **gatewaykwargs):
        """Get contents using the method determined based on ``crawl_type``.

        Args:
            url (str)            : URL of a paper or ``path/to/local.file``.
            driver (WebDriver)   : Selenium WebDriver.
            crawl_type (str)     : Crawling type. If not specified, ``self.crawl_type`` is used.
            gatewaykwargs (dict) : Gateway kwargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            tuple (str, dict) : (title, content)

        Examples:
            >>> from gummy import journals
            >>> crawler = journals.get("nature")
            >>> title, texts = crawler.get_contents(url="https://www.nature.com/articles/ncb0800_500")
            Crawling Type: soup
            >>> print(title)
            Formation of the male-specific muscle in female by ectopic expression
            >>> print(texts[:1])
            [{'head': 'Abstract', 'raw': 'The () gene product Fru has been ... for the sexually dimorphic actions of the gene.'}]
        """
        self._store_crawling_logs(url=url, start_time=now_str())
        crawl_type = crawl_type or self.crawl_type
        handleKeyError(lst=SUPPORTED_CRAWL_TYPES, crawl_type=crawl_type)
        if self.verbose:
            print(f"Crawling Type: {toACCENT(crawl_type)}")
        get_contents_func = getattr(self, f"get_contents_{crawl_type}")
        title, contents = get_contents_func(url=url, driver=driver, **gatewaykwargs)
        return (title, contents)

    def _get_contents_from_sections_base(self, sections):
        """Base method for ``get_contents_from_XXX``."""
        contents = []
        if self.verbose:
            print(f"\nShow contents of the paper.\n{'='*30}")
        return contents

    # ================== #
    # crawl_type="soup"  #
    # ================== #

    @staticmethod
    def get_soup_url(url):
        """Convert the URL to the URL of the web page accessed when ``crawl_type=="soup"``."""
        return url

    def get_contents_soup(self, url, driver=None, **gatewaykwargs):
        """Get contents from the URL of a web page using ``BeautifulSoup``.

        Args:
            url (str)            : URL of a paper.
            driver (WebDriver)   : Selenium WebDriver.
            gatewaykwargs (dict) : Gateway kwargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            tuple (str, dict) : (title, content)
        """
        soup = self.get_soup_source(url=self.get_soup_url(url), driver=driver, **gatewaykwargs)
        title = self.get_title_from_soup(soup)
        soup_sections = self.get_sections_from_soup(soup)
        contents = self.get_contents_from_soup_sections(soup_sections)
        self._store_crawling_logs(soup=soup, title=title, soup_sections=soup_sections, contents=contents)
        return (title, contents)

    def get_soup_source(self, url, driver=None, **gatewaykwargs):
        """Scrape and get the page source from ``url``.

        Args:
            url (str)            : URL of a paper.
            driver (WebDriver)   : Selenium WebDriver.
            gatewaykwargs (dict) : Gateway kwargs. See :meth:`passthrough <gummy.gateways.GummyAbstGateWay.passthrough>`.

        Returns:
            BeautifulSoup : A data structure representing a parsed HTML or XML document.
        """
        # If url is None, we will use crawled information.
        cano_url = canonicalize(url=url, driver=driver)
        self._store_crawling_logs(cano_url=cano_url)
        # If driver is None, we cannot use the gateway service.
        if driver is None:
            html = requests.get(url=cano_url).content
            if self.verbose:
                print(f"Get HTML content from {toBLUE(cano_url)}")
        else:
            driver, fmt_url_func = self.gateway.passthrough(driver=driver, url=cano_url, journal_type=self.journal_type, **gatewaykwargs)
            gateway_fmt_url = fmt_url_func(cano_url=cano_url)
            driver.get(gateway_fmt_url)
            if self.verbose:
                print(f"Get HTML content from {toBLUE(gateway_fmt_url)}")
            wait_until_all_elements(driver=driver, timeout=self.sleep_for_loading, verbose=self.verbose)
            self.make_elements_visible(driver)
            html = driver.page_source.encode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        soup = self.decompose_soup_tags(soup=soup)
        return soup

    def make_elements_visible(self, driver):
        """Make all elements of the page visible.

        Args:
            driver (WebDriver) : Selenium WebDriver.
        """
        scrollDown(driver=driver, verbose=self.verbose)

    def decompose_soup_tags(self, soup):
        """This function is not necessary for all journals, but you can trim ``DecomposeSoupTags`` from the soup, which helps with debugging.

        Args:
            soup (BeautifulSoup) : A data structure representing a parsed HTML or XML document.

        Returns:
            BeautifulSoup : The soup with the ``DecomposeSoupTags`` removed.
        """
        if self.verbose:
            print(f"\nDecompose unnecessary tags.\n{'='*30}")
        decoCounts = {}
        for decoKwargs in self.DecomposeSoupTags:
            decoTags = soup.find_all(**decoKwargs)
            print(f"Decomposed {len(decoTags)} {kwargs2tag(**decoKwargs)} tag{'s' if len(decoTags)!=1 else ''}.")
            for decoTag in decoTags:
                decoTag.decompose()
        return soup

    def register_decompose_soup_tags(self, **kwargs):
        """Register ``DecomposeSoupTags``.

        Args:
            \*\*kwargs : Kwargs for the ``soup.find_all`` method.
        """
        self.DecomposeSoupTags.append(kwargs)

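    # Illustrative usage (the tag below is a hypothetical example, not taken from any journal):
    #   >>> crawler.register_decompose_soup_tags(name="div", class_="share-buttons")
    # The registered kwargs are passed to ``soup.find_all`` in ``decompose_soup_tags``.
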
    def get_title_from_soup(self, soup):
        """Get the page title from the page source.

        Args:
            soup (BeautifulSoup) : A data structure representing a parsed HTML or XML document.

        Returns:
            str : A page title.
        """
        title = find_target_text(soup=soup, name="h1", default=self.default_title)
        return title

    def get_sections_from_soup(self, soup):
        """Get sections from the page source.

        Args:
            soup (BeautifulSoup) : A data structure representing a parsed HTML or XML document.

        Returns:
            list : Page sections. Each element is a ``bs4.element.Tag``.
        """
        sections = soup.find_all("section")
        return sections

    def get_head_from_section(self, section):
        """Get the head from a page section.

        Args:
            section (bs4.element.Tag) : Represents an HTML or XML tag that is part of a parse tree, along with its attributes and contents.

        Returns:
            bs4.element.Tag : A section head tag.
        """
        head = section.find(name="h2")
        return head

    def get_contents_from_soup_sections(self, soup_sections):
        """Get contents from each soup section.

        Args:
            soup_sections (list) : Each element is a ``bs4.element.Tag``.

        Returns:
            list : Each element is a ``dict`` whose keys are among ``["raw", "head", "subhead", "img"]``.
        """
        contents = self._get_contents_from_sections_base(soup_sections)
        len_soup_sections = len(soup_sections)
        for i, section in enumerate(soup_sections):
            headTag = self.get_head_from_section(section)
            if headTag is not None:
                head = str_strip(headTag.get_text())
                headTag.decompose()
            else:
                head = ""
            contents.extend(self.organize_soup_section(section=section, head=head))
            if self.verbose:
                print(f"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {head}")
        return contents

    def organize_soup_section(self, section, head="", head_is_not_added=True):
        """Organize a soup section:

        - Extract an image and display it in ``base64`` format in the section.
        - Add ``head`` only to the initial content.

        Args:
            section (bs4.element.Tag) : Represents an HTML or XML tag that is part of a parse tree, along with its attributes and contents.
            head (str)                : Head word.
            head_is_not_added (bool)  : Whether the head has not been added yet. (default= ``True``)

        Returns:
            list : Each element is a ``dict`` whose keys are among ``["raw", "head", "subhead", "img"]``.
        """
        contents = []
        splitted_soup = split_section(section=section, name=self.subheadTags+["img"])
        for element in splitted_soup:
            content = {}
            if head_is_not_added:
                content["head"] = head
                head_is_not_added = False
            if element.name == "img":
                content["img"] = src2base64(base=self.crawling_logs.get("cano_url"), src=element)
            elif element.name in self.subheadTags:
                content["subhead"] = str_strip(element.get_text())
            else:
                content["raw"] = self.arrange_english(element.get_text())
            contents.append(content)
        return contents

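    # For reference, ``organize_soup_section`` returns a list of dicts such as (illustrative values only):
    #   [{"head": "Introduction", "raw": "..."}, {"subhead": "Background"}, {"raw": "..."}, {"img": "<base64-encoded image>"}]
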
    @staticmethod
    def arrange_english(english):
        """Get rid of extra characters from the body text (English). This method is used in :meth:`organize_soup_section <gummy.journals.GummyAbstJournal.organize_soup_section>`.

        Args:
            english (str) : Raw English text.

        Returns:
            str : Arranged English text.
        """
        return english

    # ================== #
    #  crawl_type="tex"  #
    # ================== #

    @staticmethod
    def get_tex_url(url):
        """Convert the URL to the URL of the TeX page accessed when ``crawl_type=="tex"``."""
        return url

    def get_contents_tex(self, url, driver=None):
        """Get contents from ``url`` by parsing TeX sources.

        Args:
            url (str)          : URL of a paper.
            driver (WebDriver) : Selenium WebDriver.

        Returns:
            tuple (str, dict) : (title, content)
        """
        tex = self.get_tex_source(url=self.get_tex_url(url), driver=driver)
        title = self.get_title_from_tex(tex)
        # NOTE: If we can scrape the "title" from the soup, prioritize it.
        if self.get_pdf_url(url) != self.get_soup_url(url):
            soup = self.get_soup_source(url=self.get_soup_url(url), driver=None)
            title = self.get_title_from_soup(soup)
        tex_sections = self.get_sections_from_tex(tex)
        contents = self.get_contents_from_tex_sections(tex_sections)
        self._store_crawling_logs(tex=tex, title=title, tex_sections=tex_sections, contents=contents)
        return (title, contents)

    def get_tex_source(self, url, driver=None):
        """Download and get the TeX source from ``url``.

        Args:
            url (str)          : URL of a TeX source or ``path/to/local.tex``.
            driver (WebDriver) : Selenium WebDriver.

        Returns:
            str : Plain text extracted from the TeX source.
        """
        if not os.path.exists(url):
            path = download_file(url=url, dirname=GUMMY_DIR)
            ext = "." + path.split(".")[-1]
            if is_compressed(ext):
                extracted_file_paths = extract_from_compressed(path, ext=".tex", dirname=GUMMY_DIR)
                path = extracted_file_paths[0]
        else:
            path = url
        with open(path, mode="r") as ftex:
            tex = LatexNodes2Text().latex_to_text(ftex.read())
        for decompose in self.DecomposeTexTags:
            tex = tex.replace(decompose, "")
        tex = re.sub('[  ]+', ' ', tex)
        return tex

    def get_title_from_tex(self, tex):
        """Get a title from the TeX source.

        Args:
            tex (str) : Plain text extracted from the TeX source.

        Returns:
            str : TeX title.
        """
        title = "title"
        return title

    def get_sections_from_tex(self, tex):
        """Get sections from the TeX source.

        Args:
            tex (str) : Plain text extracted from the TeX source.

        Returns:
            list : Each element is plain text (str).
        """
        sections = [tex]
        return sections

    def get_contents_from_tex_sections(self, tex_sections):
        """Get text for each TeX section.

        Args:
            tex_sections (list) : Each element is plain text (str).

        Returns:
            list : Each element is a ``dict`` whose keys are among ``["raw", "head", "subhead", "img"]``.
        """
        contents = self._get_contents_from_sections_base(tex_sections)
        len_tex_sections = len(tex_sections)
        for i, section in enumerate(tex_sections):
            content = {}
            first_nl = section.index("\n")
            head = str_strip(section[:first_nl]).capitalize()
            content["head"] = head
            content["raw"] = section[first_nl:].replace("\n", "")
            contents.append(content)
            if self.verbose:
                print(f"[{i+1:>0{len(str(len_tex_sections))}}/{len_tex_sections}] {head}")
        return contents

    # ================== #
    #  crawl_type="pdf"  #
    # ================== #

    @staticmethod
    def get_pdf_url(url):
        """Convert the URL to the URL of the PDF page accessed when ``crawl_type=="pdf"``."""
        return url

    def get_contents_pdf(self, url, driver=None):
        """Get contents from ``url`` by parsing a PDF file.

        Args:
            url (str)          : URL of a paper or ``path/to/local.pdf``.
            driver (WebDriver) : Selenium WebDriver.

        Returns:
            tuple (str, dict) : (title, content)
        """
        pdf_pages = self.get_pdf_source(url=self.get_pdf_url(url), driver=driver)
        title = self.get_title_from_pdf(pdf_pages)
        # NOTE: If we can scrape the "title" from the soup, prioritize it.
        if self.get_pdf_url(url) != self.get_soup_url(url):
            soup = self.get_soup_source(url=self.get_soup_url(url), driver=None)
            title = self.get_title_from_soup(soup)
        contents = self.get_contents_from_pdf_pages(pdf_pages)
        self._store_crawling_logs(title=title, pdf_pages=pdf_pages, contents=contents)
        return (title, contents)

    def get_pdf_source(self, url, driver=None):
        """Download and get the PDF source from ``url``.

        Args:
            url (str)          : URL of a PDF file or ``path/to/local.pdf``.
            driver (WebDriver) : Selenium WebDriver.

        Returns:
            list : Each element is a list which contains [text, bbox(x0,y0,x1,y1)].
        """
        pdf_pages = get_pdf_contents(file=url)
        return pdf_pages

    def get_title_from_pdf(self, pdf_pages):
        """Get a title from the PDF source.

        Args:
            pdf_pages (list) : Each element is a list which contains [text, bbox(x0,y0,x1,y1)].

        Returns:
            str : PDF title.
        """
        title = "title"
        return title

    def get_contents_from_pdf_pages(self, pdf_pages):
        """Get contents from each page.

        Args:
            pdf_pages (list) : Each element is a list which contains [text, bbox(x0,y0,x1,y1)].

        Returns:
            list : Each element is a ``dict`` whose keys are among ``["raw", "head", "img", "bbox"]``.
        """
        contents = []
        len_pdf_pages = len(pdf_pages)
        digit = len(str(len_pdf_pages))
        for i, page_texts in enumerate(pdf_pages):
            page_no = f"Page.{i+1:>0{digit}}/{len_pdf_pages}"
            header = {"head": page_no, "raw": "", "bbox": (0, 0, 1, 1)}
            contents.append(header)
            for text, bbox in page_texts:
                content = {"raw": "", "bbox": bbox}
                if text.startswith('<img src="data:image/jpeg;base64'):
                    content["img"] = text
                else:
                    content["raw"] = text.replace("-\n", "").replace("\n", " ")
                contents.append(content)
            if self.verbose:
                print(page_no)
        return contents

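# --------------------------------------------------------------------------------- #
# Illustrative sketch (not part of the library): a minimal custom crawler for the
# "soup" crawl type only needs the three required methods listed in the
# GummyAbstJournal docstring. The class name, URL, and CSS selector below are
# hypothetical assumptions.
#
# class ExampleJournalCrawler(GummyAbstJournal):
#     """Hypothetical crawler for https://example-journal.org (illustrative only)."""
#     def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs):
#         super().__init__(crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose)
#
#     def get_title_from_soup(self, soup):
#         return find_target_text(soup=soup, name="h1", class_="article-title", default=self.default_title)
#
#     def get_sections_from_soup(self, soup):
#         return soup.find_all(name="section")
#
#     def get_head_from_section(self, section):
#         return section.find(name="h2")
# --------------------------------------------------------------------------------- #
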
class PDFCrawler(GummyAbstJournal):
    def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs):
        super().__init__(
            crawl_type="pdf",
            gateway="useless",
            sleep_for_loading=3,
        )

    @staticmethod
    def get_soup_url(url):
        return url

    @staticmethod
    def get_pdf_url(url):
        return url

    def get_contents_pdf(self, url, driver=None):
        _, contents = super().get_contents_pdf(url=url, driver=driver)
        if hasattr(url, "filename"):
            url = getattr(url, "filename")
        title = str(url).split("/")[-1].rstrip(".pdf")
        return title, contents

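# Illustrative usage of PDFCrawler (the file path below is a placeholder, not part of the library):
#   >>> from gummy.journals import PDFCrawler
#   >>> crawler = PDFCrawler()
#   >>> title, contents = crawler.get_contents(url="path/to/local.pdf")
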
class NatureCrawler(GummyAbstJournal):
    """
    URL:
        - https://www.nature.com

    Attributes:
        crawl_type (str)      : :meth:`NatureCrawler's <gummy.journals.NatureCrawler>` default ``crawl_type`` is ``"soup"``.
        AvoidAriaLabel (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.NatureCrawler.get_sections_from_soup>`
    """
    def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs):
        super().__init__(
            crawl_type="soup",
            gateway=gateway,
            sleep_for_loading=sleep_for_loading,
            verbose=verbose,
            subheadTags=["h3"],
        )
        self.AvoidAriaLabel = ['Ack1', 'Bib1', 'additional-information', 'article-comments', 'article-info', 'author-information', 'ethics', 'further-reading', 'rightslink']  # ,None]

    def get_title_from_soup(self, soup):
        title = find_target_text(soup=soup, name="h1", attrs={"class": "c-article-title"}, default=self.default_title)
        return title

    def get_sections_from_soup(self, soup):
        sections = [e for e in soup.find_all(name="section") if e.get("aria-labelledby") not in self.AvoidAriaLabel]
        # If the paper is an old version, the contents may not be obtainable in the above way.
        if len(sections) == 0:
            sections = soup.find_all(name="div", class_="c-article-section__content")
        return sections

    def get_head_from_section(self, section):
        head = section.find(name="h2")
        return head

[docs]class arXivCrawler(GummyAbstJournal): """ URL: - https://arxiv.org Attributes: crawl_type (str) : :meth:`arXivCrawler's <gummy.journals.arXivCrawler>` default ``crawl_type`` is ``"pdf"``. AvoidAriaLabel (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_pdf <gummy.journals.arXivCrawler.get_sections_from_pdf>` """ def __init__(self, sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="pdf", gateway="useless", sleep_for_loading=sleep_for_loading, verbose=verbose, DecomposeTexTags=["<cit.>", "\xa0", "<ref>"], ) self.AvoidAriaLabel = [None, 'Bib1', 'additional-information', 'article-comments', 'article-info', 'author-information', 'ethics', 'further-reading', 'rightslink']
[docs] @staticmethod def get_pdf_url(url): return f"https://arxiv.org/pdf/{arXivCrawler.get_arXivNo(url)}.pdf"
[docs] @staticmethod def get_soup_url(url): return f"https://arxiv.org/abs/{arXivCrawler.get_arXivNo(url)}"
[docs] @staticmethod def get_tex_url(url): return f"https://arxiv.org/e-print/{arXivCrawler.get_arXivNo(url)}"
[docs] @staticmethod def get_arXivNo(url): return re.sub(pattern=r"^.+\/((?:\d|\.|v)+)(?:\.pdf)?$", repl=r"\1", string=url)
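    # For example (the arXiv identifier below is only a placeholder):
    #   get_arXivNo("https://arxiv.org/abs/2005.14165")     -> "2005.14165"
    #   get_arXivNo("https://arxiv.org/pdf/2005.14165.pdf") -> "2005.14165"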
[docs] def get_sections_from_tex(self, tex): sections = tex.replace("§.§", "§").split("§") return sections
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"class" : "title mathjax"}, default=self.default_title).lstrip("Title:") return title
[docs]class NCBICrawler(GummyAbstJournal): """ URL: - https://www.ncbi.nlm.nih.gov Attributes: crawl_type (str) : :meth:`NCBICrawler's <gummy.journals.NCBICrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags = ["h3"], ) # self.AvoidIdsPatterns = [r"^idm[0-9]+", r"^S49$", r"^ass-data$"] # self.AvoidHead = ["References", "References and Notes"]
[docs] @staticmethod def arrange_english(english): return english[6:] if english.startswith("Go to:") else english
# @property # def AvoidIdsPattern(self): # return f"(?:{'|'.join([f'(?:{pat})' for pat in self.AvoidIdsPatterns])})"
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="content-title", default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="div", class_="tsec") if not find_target_text(soup=e, name="h2", default="OK").lower().startswith("reference")] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2")#, class_="c-article-section__title") return head
[docs]class PubMedCrawler(GummyAbstJournal): """ URL: - https://pubmed.ncbi.nlm.nih.gov Attributes: crawl_type (str) : :meth:`PubMedCrawler's <gummy.journals.PubMedCrawler>` default ``crawl_type`` is ``"soup"``. AvoidIdsPatterns (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.PubMedCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, ) self.AvoidIdsPatterns = [r"^idm[0-9]+", r"^S49$", r"^ass-data$"]
    def get_contents_soup(self, url, driver=None, **gatewaykwargs):
        soup = self.get_soup_source(url=url, driver=driver, **gatewaykwargs)
        a_doi = soup.find(name="a", class_="id-link")
        if a_doi is not None:
            cano_url = canonicalize(url=a_doi.get("href"), driver=driver, sleep_for_loading=5)
            if cano_url != url:
                try:
                    journal_type = whichJournal(url=cano_url, driver=None, verbose=self.verbose)
                    if journal_type != "pubmed":  # Prevent an infinite loop.
                        crawler = get(journal_type, gateway=self.gateway, sleep_for_loading=3, verbose=self.verbose)
                        return crawler.get_contents(url=cano_url, driver=driver)
                except JournalTypeIndistinguishableError:
                    if self.verbose:
                        print(f"{toGREEN('gummy.utils.journal_utils.whichJournal')} could not distinguish the journal type, so scraping from PubMed.")
        title = self.get_title_from_soup(soup)
        soup_sections = self.get_sections_from_soup(soup)
        contents = self.get_contents_from_soup_sections(soup_sections)
        self._store_crawling_logs(soup=soup, title=title, soup_sections=soup_sections, contents=contents)
        return (title, contents)
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="heading-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="abstract") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2", class_="title") return head
[docs]class OxfordAcademicCrawler(GummyAbstJournal): """ URL: - https://academic.oup.com Attributes: crawl_type (str) : :meth:`OxfordAcademicCrawler's <gummy.journals.OxfordAcademicCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3","h4"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article-title-main", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] article_full_text = soup.find(name="div", class_="widget-items", attrs={"data-widgetname": "ArticleFulltext"}) if article_full_text is not None: sections.extend(group_soup_with_head(soup=article_full_text, name="h2")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class ScienceDirectCrawler(GummyAbstJournal): """ URL: - https://www.sciencedirect.com Attributes: crawl_type (str) : :meth:`ScienceDirectCrawler's <gummy.journals.ScienceDirectCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.register_decompose_soup_tags(name="ol", class_="links-for-figure")
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="span", class_="title-text", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="Abstracts") body = soup.find(name="div", attrs={"id": "body"}) if body is not None: sections.extend(body.find_all(name="section")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class SpringerCrawler(GummyAbstJournal): """ URL: - https://link.springer.com Attributes: crawl_type (str) : :meth:`SpringerCrawler's <gummy.journals.SpringerCrawler>` default ``crawl_type`` is ``"soup"``. AvoidAriaLabel (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.SpringerCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, ) self.AvoidAriaLabel = [None,'Ack1','Bib1','additional-information','article-comments','article-info','author-information','ethics','further-reading','rightslink']
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="c-article-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="section") if e.get("aria-labelledby") not in self.AvoidAriaLabel] if len(sections)==0: sections = [e for e in soup.find_all(name="section") if e.get("Abs1") not in self.AvoidAriaLabel] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2")#, class_="c-article-section__title") return head
[docs]class MDPICrawler(GummyAbstJournal): """ URL: - https://www.mdpi.com Attributes: crawl_type (str) : :meth:`MDPICrawler's <gummy.journals.MDPICrawler>` default ``crawl_type`` is ``"soup"``. AvoidAriaLabel (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.MDPICrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h4"], ) self.AvoidAriaLabel = [None,'Ack1','Bib1','additional-information','article-comments','article-info','author-information','ethics','further-reading','rightslink']
[docs] def get_soup_source(self, url, driver=None, **gatewaykwargs): # If url is None, we will use crawled information. cano_url = canonicalize(url=url, driver=driver) cano_url = cano_url.rstrip("/htm")+"/htm" soup = super().get_soup_source(url=cano_url, driver=driver, **gatewaykwargs) return soup
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", attrs={"type" : "other"}) sections = [e for e in soup.find_all(name="section") if e.get("aria-labelledby") not in self.AvoidAriaLabel] abst = soup.find(name="div", class_="art-abstract") if abst is not None: abst_section = soup.new_tag(name="section", attrs={"type" : "other"}) abst_h2Tag = soup.new_tag(name="h2") abst_h2Tag.string = "0. Abstract" abst_section.append(abst_h2Tag) abst_section.append(abst) sections.insert(0, abst_section) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class UniOKLAHOMACrawler(GummyAbstJournal): """ URL: - https://www.ou.edu Attributes: crawl_type (str) : :meth:`UniOKLAHOMACrawler's <gummy.journals.UniOKLAHOMACrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): sections = soup.find_all(name="p", class_="a") if len(sections)>2: title = sections[1].get_text() else: title = self.default_title return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="p", class_=("a", "a0", "MsoNormal"))[2:] return sections
[docs] def get_contents_from_soup_sections(self, soup_sections): contents = super()._get_contents_from_sections_base(soup_sections) len_soup_sections = len(soup_sections) for i,section in enumerate(soup_sections): contents.extend(self.organize_soup_section(section=section, head="", head_is_not_added=False)) if self.verbose: print(f"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}]") return contents
[docs]class LungCancerCrawler(GummyAbstJournal): """ URL: - https://www.lungcancerjournal.info Attributes: crawl_type (str) : :meth:`LungCancerCrawler's <gummy.journals.LungCancerCrawler>` default ``crawl_type`` is ``"soup"``. AvoidHead (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.LungCancerCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.AvoidHead = ["", "Keywords", "References", "Article Info", "Publication History", "Identification", "Copyright", "ScienceDirect"]
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article-header__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="section") if find_target_text(soup=e, name=("h2", "h3"), default="") not in self.AvoidHead] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class CellPressCrawler(GummyAbstJournal): """ URL: - https://www.cell.com Attributes: crawl_type (str) : :meth:`CellPressCrawler's <gummy.journals.CellPressCrawler>` default ``crawl_type`` is ``"soup"``. AvoidDataLeftHandNavs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.CellPressCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.AvoidDataLeftHandNavs = [None, "Acknowledgements", "References"] self.register_decompose_soup_tags(name="div", class_="dropBlock reference-citations")
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article-header__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="section") if find_target_id(soup=e, key="data-left-hand-nav", name="h2", default=None) not in self.AvoidDataLeftHandNavs] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class WileyOnlineLibraryCrawler(GummyAbstJournal): """ URL: - https://onlinelibrary.wiley.com - https://febs.onlinelibrary.wiley.com Attributes: crawl_type (str) : :meth:`WileyOnlineLibraryCrawler's <gummy.journals.WileyOnlineLibraryCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.register_decompose_soup_tags(name="a")
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", class_=("article-section__abstract", "article-section__content")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class JBCCrawler(GummyAbstJournal): """ URL: - https://www.jbc.org Attributes: crawl_type (str) : :meth:`JBCCrawler's <gummy.journals.JBCCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id": "article-title-1"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_=("section abstract")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class BiologistsCrawler(GummyAbstJournal): """ URL: - https://bio.biologists.org - https://dev.biologists.org - https://jcs.biologists.org Attributes: crawl_type (str) : :meth:`BiologistsCrawler's <gummy.journals.BiologistsCrawler>` default ``crawl_type`` is ``"soup"``. AvoidIDs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.BiologistsCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.AvoidIDs = ["ack", "fn-group", "ref-list"]
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="div", class_="highwire-cite-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="div", class_="section") if not any([e.get("id", self.AvoidIDs[0]).startswith(id) for id in self.AvoidIDs])] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class BioMedCentralCrawler(GummyAbstJournal): """ URL: - https://biologydirect.biomedcentral.com - https://bmcbioinformatics.biomedcentral.com - https://bmcevolbiol.biomedcentral.com - https://bmcgenomics.biomedcentral.com - https://genomebiology.biomedcentral.com - https://retrovirology.biomedcentral.com Attributes: crawl_type (str) : :meth:`BioMedCentralCrawler's <gummy.journals.BioMedCentralCrawler>` default ``crawl_type`` is ``"soup"``. AvoidAriaLabel (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.BioMedCentralCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.AvoidAriaLabel = ['Ack1','Bib1','additional-information','article-comments','article-info','author-information','ethics','further-reading','rightslink','Sec2','Sec3']#,None]
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="c-article-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="section") if e.get("aria-labelledby") not in self.AvoidAriaLabel] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class IEEEXploreCrawler(GummyAbstJournal): """ URL: - https://ieeexplore.ieee.org Attributes: crawl_type (str) : :meth:`IEEEXploreCrawler's <gummy.journals.IEEEXploreCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=10, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="document-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] # Abstract abst = soup.find(name="div", class_="abstract-text") if abst is not None: abst_div_section = soup.new_tag(name="div", class_="section") abst_strong = abst.find(name="strong") if abst_strong is not None: abst_strong.decompose() abst_divTag = soup.new_tag(name="div", class_="header") abst_divTag.string = "Abstract" abst_div_section.append(abst_divTag) abst_div_section.append(abst) sections.append(abst_div_section) # Other article article = soup.find(name="div", id="article") if article is not None: sections.extend(article.find_all(name="div", class_="section")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="div", class_="header") return head
[docs]class JSTAGECrawler(GummyAbstJournal): """ URL: - https://www.jstage.jst.go.jp Attributes: crawl_type (str) : :meth:`JSTAGECrawler's <gummy.journals.JSTAGECrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_soup_source(self, url, driver=None, **gatewaykwargs): cano_url = canonicalize(url=url, driver=driver) if self.verbose: print(f"You can download PDF from {toBLUE(cano_url.replace('_article', '_pdf/-char/en'))}") soup = super().get_soup_source(url=cano_url, driver=driver, **gatewaykwargs) return soup
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="div", class_="global-article-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", attrs={"id": "article-overiew-abstract-wrap"}) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="div") return head
[docs]class ACSPublicationsCrawler(GummyAbstJournal): """ URL: - https://pubs.acs.org/ Attributes: crawl_type (str) : :meth:`ACSPublicationsCrawler's <gummy.journals.ACSPublicationsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h4"], )
[docs] def get_soup_source(self, url, driver=None, **gatewaykwargs): cano_url = canonicalize(url=url, driver=driver) if self.verbose: print(f"You can download PDF from {toBLUE(cano_url.replace('/doi/', '/doi/pdf/'))}") soup = super().get_soup_source(url=cano_url, driver=driver, **gatewaykwargs) return soup
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article_header-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="article_abstract") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class StemCellsCrawler(GummyAbstJournal): """ URL: - https://stemcellsjournals.onlinelibrary.wiley.com Attributes: crawl_type (str) : :meth:`StemCellsCrawler's <gummy.journals.StemCellsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", class_=("article-section__abstract", "article-section__content")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class UniKeioCrawler(GummyAbstJournal): """ URL: - https://keio.pure.elsevier.com Attributes: crawl_type (str) : :meth:`UniKeioCrawler's <gummy.journals.UniKeioCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
    def get_contents_soup(self, url, driver=None, **gatewaykwargs):
        soup = self.get_soup_source(url=url, driver=driver, **gatewaykwargs)
        div_doi = soup.find(name="div", class_="doi")
        if div_doi is not None:
            a_doi = div_doi.find(name="a")
            if a_doi is not None:
                cano_url = canonicalize(url=a_doi.get("href"), driver=driver, sleep_for_loading=5)
                if cano_url != url:
                    try:
                        journal_type = whichJournal(url=cano_url, driver=None, verbose=self.verbose)
                        if journal_type != "unikeio":  # Prevent an infinite loop.
                            crawler = get(journal_type, gateway=self.gateway, sleep_for_loading=3, verbose=self.verbose)
                            return crawler.get_contents(url=cano_url, driver=driver)
                    except JournalTypeIndistinguishableError:
                        if self.verbose:
                            print(f"{toGREEN('gummy.utils.journal_utils.whichJournal')} could not distinguish the journal type, so scraping from this page.")
        title = self.get_title_from_soup(soup)
        soup_sections = self.get_sections_from_soup(soup)
        contents = self.get_contents_from_soup_sections(soup_sections)
        self._store_crawling_logs(soup=soup, title=title, soup_sections=soup_sections, contents=contents)
        return (title, contents)
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="rendering_contributiontojournal_abstractportal") return sections
[docs] def get_contents_from_soup_sections(self, soup_sections): contents = super()._get_contents_from_sections_base(soup_sections) len_soup_sections = len(soup_sections) for i,section in enumerate(soup_sections): head = section.get("class", ["Abstract"])[-1] contents.extend(self.organize_soup_section(section=section, head=head)) if self.verbose: print(f"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {head}") return contents
[docs]class PLOSONECrawler(GummyAbstJournal): """ URL: - https://journals.plos.org Attributes: crawl_type (str) : :meth:`PLOSONECrawler's <gummy.journals.PLOSONECrawler>` default ``crawl_type`` is ``"soup"``. AvoidIDs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.PLOSONECrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.AvoidIDs = ["authcontrib", "references"]
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id": "artTitle"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="div", class_="toc-section") if e.find(name="a") is not None and e.find(name="a").get("id") not in self.AvoidIDs] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class frontiersCrawler(GummyAbstJournal): """ URL: - https://www.frontiersin.org Attributes: crawl_type (str) : :meth:`frontiersCrawler's <gummy.journals.frontiersCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h2","h3","h4"] ) self.register_decompose_soup_tags(name="div", class_="authors") self.register_decompose_soup_tags(name="ul", class_="notes")
[docs] def get_title_from_soup(self, soup): abst = soup.find(name="div", class_="JournalAbstract") if abst is None: abst = soup title = find_target_text(soup=abst, name="h1", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="JournalAbstract") text_section = soup.find(name="div", class_="JournalFullText") if text_section is not None: sections.extend(group_soup_with_head(soup=text_section, name="h2")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h1") return head
[docs]class RNAjournalCrawler(GummyAbstJournal): """ URL: - https://rnajournal.cshlp.org Attributes: crawl_type (str) : :meth:`RNAjournalCrawler's <gummy.journals.RNAjournalCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_soup_source(self, url, driver=None, **gatewaykwargs): cano_url = canonicalize(url=url, driver=driver) if self.verbose: print(f"You can download PDF from {toBLUE(cano_url + '.full.pdf')}") soup = super().get_soup_source(url=cano_url, driver=driver, **gatewaykwargs) return soup
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id": "article-title-1", "itemprop": "headline"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="section") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class IntechOpenCrawler(GummyAbstJournal): """ URL: - https://www.intechopen.com Attributes: crawl_type (str) : :meth:`IntechOpenCrawler's <gummy.journals.IntechOpenCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="intro") body = soup.find(name="div", class_="reader-body") if body is not None: sections.extend(body.find_all(name="div", class_="section")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class NRCResearchPressCrawler(GummyAbstJournal): """ URL: - https://www.nrcresearchpress.com Attributes: crawl_type (str) : :meth:`NRCResearchPressCrawler's <gummy.journals.NRCResearchPressCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, # subheadTags=["span"], <span class="title2"> )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="box-pad border-lightgray margin-bottom") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class SpandidosCrawler(GummyAbstJournal): """ URL: - https://www.spandidos-publications.com Attributes: crawl_type (str) : :meth:`SpandidosCrawler's <gummy.journals.SpandidosCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h5"] )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id": "titleId"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", attrs={"id": "articleAbstract"}) try: sections.extend(group_soup_with_head(soup=sections[0].next.next.next, name=("h4", "h5"))) except AttributeError: if self.verbose: print("Use only Abstract.") except IndexError: if self.verbose: print(toRED("Couldn't scrape well.")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h4") return head
[docs]class TaylorandFrancisOnlineCrawler(GummyAbstJournal): """ URL: - https://www.tandfonline.com Attributes: crawl_type (str) : :meth:`TaylorandFrancisOnlineCrawler's <gummy.journals.TaylorandFrancisOnlineCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id": "titleId"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_=("hlFld-Abstract","NLM_sec NLM_sec-type_intro NLM_sec_level_1")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class bioRxivCrawler(GummyAbstJournal): """ URL: - https://www.biorxiv.org Attributes: crawl_type (str) : :meth:`bioRxivCrawler's <gummy.journals.bioRxivCrawler>` default ``crawl_type`` is ``"soup"``. AvoidIDs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.bioRxivCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"] ) self.AvoidIDs = ["ref-list-1"]
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"(.+?)(:?\.full)?", string=url, repl=r"\1")+".full"
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id":"page-title", "class": "highwire-cite-title"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="article fulltext-view") if len(sections)>0: sections = [e for e in sections[0].find_all(name="div", class_="section") if e.get("id") not in self.AvoidIDs] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class RSCPublishingCrawler(GummyAbstJournal): """ URL: - https://pubs.rsc.org Attributes: crawl_type (str) : :meth:`RSCPublishingCrawler's <gummy.journals.RSCPublishingCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h2", class_="capsule__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="capsule__column-wrapper") text_section = soup.find(name="div", attrs={"id": "pnlArticleContent"}) if text_section is not None: sections.extend(group_soup_with_head(soup=text_section, name="h2")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class JSSECrawler(GummyAbstJournal): """ URL: - https://www.jsse.org Attributes: crawl_type (str) : :meth:`JSSECrawler's <gummy.journals.JSSECrawler>` default ``crawl_type`` is ``"pdf"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="pdf", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_jsseNo(url): return re.sub(pattern=r"^.*\/index.php\/jsse\/article\/(?:view|download)\/([0-9]+)(?:\/[0-9]+)?$", repl=r"\1", string=url)
[docs] @staticmethod def get_soup_url(url): return f"https://www.jsse.org/index.php/jsse/article/view/{JSSECrawler.get_jsseNo(url)}"
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="page_title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="item abstract") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h3") return head
[docs]class ScienceAdvancesCrawler(GummyAbstJournal): """ URL: - https://advances.sciencemag.org Attributes: crawl_type (str) : :meth:`ScienceAdvancesCrawler's <gummy.journals.ScienceAdvancesCrawler>` default ``crawl_type`` is ``"soup"``. AvoidIDs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.ScienceAdvancesCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.AvoidIDs = ["ref-list-1"]
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article__headline", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): article = soup.find(name="article", class_="primary primary--content") if article is not None: soup = article sections = [e for e in soup.find_all(name="div", class_="section") if e.get("id") not in self.AvoidIDs] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class medRxivCrawler(GummyAbstJournal): """ URL: - https://www.medrxiv.org Attributes: crawl_type (str) : :meth:`medRxivCrawler's <gummy.journals.medRxivCrawler>` default ``crawl_type`` is ``"pdf"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="pdf", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"(^.*\/.*?v[0-9]+)(\..+)?", repl=r"\1", string=url)
[docs] @staticmethod def get_pdf_url(url): return medRxivCrawler.get_soup_url(url) + ".full.pdf"
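As a sketch, the two helpers above are intended to map between the abstract page and the full-text PDF; the DOI-style path below is fabricated for illustration:

.. code-block:: python

    >>> medRxivCrawler.get_soup_url("https://www.medrxiv.org/content/10.1101/2020.01.01.20000001v1.full.pdf")
    'https://www.medrxiv.org/content/10.1101/2020.01.01.20000001v1'
    >>> medRxivCrawler.get_pdf_url("https://www.medrxiv.org/content/10.1101/2020.01.01.20000001v1")
    'https://www.medrxiv.org/content/10.1101/2020.01.01.20000001v1.full.pdf'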
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"class": "highwire-cite-title", "id": "page-title"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="section") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class ACLAnthologyCrawler(GummyAbstJournal): """ URL: - https://www.aclweb.org Attributes: crawl_type (str) : :meth:`ACLAnthologyCrawler's <gummy.journals.ACLAnthologyCrawler>` default ``crawl_type`` is ``"pdf"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="pdf", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"\.pdf$", repl="", string=url.rstrip("/"))
[docs] @staticmethod def get_pdf_url(url): return ACLAnthologyCrawler.get_soup_url(url) + ".pdf"
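A minimal sketch of the expected round trip between the landing page and the PDF, assuming a typical anthology path (the paper ID is illustrative):

.. code-block:: python

    >>> ACLAnthologyCrawler.get_soup_url("https://www.aclweb.org/anthology/2020.acl-main.1.pdf")
    'https://www.aclweb.org/anthology/2020.acl-main.1'
    >>> ACLAnthologyCrawler.get_pdf_url("https://www.aclweb.org/anthology/2020.acl-main.1")
    'https://www.aclweb.org/anthology/2020.acl-main.1.pdf'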
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h2", attrs={"id": "title"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="card-body acl-abstract") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h5") return head
[docs]class PNASCrawler(GummyAbstJournal): """ URL: - https://www.pnas.org Attributes: crawl_type (str) : :meth:`PNASCrawler's <gummy.journals.PNASCrawler>` default ``crawl_type`` is ``"soup"``. AvoidIDs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.PNASCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], ) self.AvoidIDs = ["fn-group-1", "ref-list-1"]
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"\.full\.pdf$", repl="", string=url.rstrip("/"))
[docs] @staticmethod def get_pdf_url(url): return PNASCrawler.get_soup_url(url) + ".full.pdf"
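A short sketch of the intended soup/PDF URL pairing (volume, issue, and page below are placeholders):

.. code-block:: python

    >>> PNASCrawler.get_soup_url("https://www.pnas.org/content/117/1/100.full.pdf")
    'https://www.pnas.org/content/117/1/100'
    >>> PNASCrawler.get_pdf_url("https://www.pnas.org/content/117/1/100")
    'https://www.pnas.org/content/117/1/100.full.pdf'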
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"class": "highwire-cite-title", "id": "page-title"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="executive-summary") sections += [e for e in soup.find_all(name="div", class_="section") if e.get("id") not in self.AvoidIDs] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class AMSCrawler(GummyAbstJournal): """ URL: - https://journals.ametsoc.org Attributes: crawl_type (str) : :meth:`AMSCrawler's <gummy.journals.AMSCrawler>` default ``crawl_type`` is ``"soup"``. AvoidIDs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.AMSCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"] ) self.AvoidIDs = ["fn-group-1", "ref-list-1"]
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1",class_="wi-article-title article-title-main", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] article = soup.find(name="div", attrs={"class": "widget-items", "data-widgetname": "ArticleFulltext"}) if article is not None: head = article.find(name=("h2","h3")) while head is not None: section = BeautifulSoup(features="lxml").new_tag(name="section") article = head.find_next_sibling(name=("div")) section.append(head) head = article.find_next_sibling(name=("h2","h3")) if head is None or head.get_text() == "REFERENCES" or head.get("class") == "backreferences-title": break section.append(article) sections.append(section) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class ACMCrawler(GummyAbstJournal): """NOTE: If you want to download the PDF, you must run the driver with a GUI browser. URL: - https://dl.acm.org Attributes: crawl_type (str) : :meth:`ACMCrawler's <gummy.journals.ACMCrawler>` default ``crawl_type`` is ``"pdf"``. AvoidIDs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.ACMCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="pdf", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, ) self.AvoidIDs = ["sec-ref", "sec-terms", "sec-comments"]
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"\/doi(?:\/(abs|e?pdf))?\/", repl=lambda m: m.group(0).replace(m.group(1), "abs") if m.group(1) is not None else m.group(0).replace("/doi/", "/doi/abs/"), string=url)
[docs] @staticmethod def get_pdf_url(url): return re.sub(pattern=r"\/doi(?:\/(abs|e?pdf))?\/", repl=lambda m: m.group(0).replace(m.group(1), "pdf") if m.group(1) is not None else m.group(0).replace("/doi/", "/doi/pdf/"), string=url)
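The two regexes above rewrite the ``/doi/``, ``/doi/abs/``, ``/doi/pdf/``, and ``/doi/epdf/`` URL variants into the abstract or PDF form; a sketch with a placeholder DOI:

.. code-block:: python

    >>> ACMCrawler.get_soup_url("https://dl.acm.org/doi/10.1145/1234567.1234567")
    'https://dl.acm.org/doi/abs/10.1145/1234567.1234567'
    >>> ACMCrawler.get_pdf_url("https://dl.acm.org/doi/abs/10.1145/1234567.1234567")
    'https://dl.acm.org/doi/pdf/10.1145/1234567.1234567'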
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1",class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="div", class_="article__section") if e.find("h2") is None or e.find("h2").get("id") not in self.AvoidIDs] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class APSCrawler(GummyAbstJournal): """ URL: - https://journals.aps.org Attributes: crawl_type (str) : :meth:`APSCrawler's <gummy.journals.APSCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h6"], )
[docs] @staticmethod def get_soup_url(url): return url.replace("/pdf/", "/abstract/")
[docs] @staticmethod def get_pdf_url(url): return url.replace("/abstract/", "/pdf/")
[docs] def make_elements_visible(self, driver): driver.implicitly_wait(3) try_find_element_click(driver=driver, identifier="//section[@class='article fulltext']/h4[@class='title']/span[@class='right']/i[@class='fi-plus']", by="xpath") while True: hidden_elements = driver.find_elements(by="xpath", value="//i[@class='fi-plus'][contains(@style,'display: block;')]") for e in hidden_elements: try_find_element_click(driver=driver, target=e) if len(hidden_elements)==0: break
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h3", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", class_="article-section") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h5", class_="section title") return head
[docs]class ASIPCrawler(GummyAbstJournal): """ URL: - https://ajp.amjpathol.org Attributes: crawl_type (str) : :meth:`ASIPCrawler's <gummy.journals.ASIPCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], )
[docs] @staticmethod def get_asipID(url): return re.sub(pattern=r"^.+\/(?:(?:article\/(.+)\/)|(?:action\/showPdf\?pii=(.+))).*$", repl=lambda m: m.group(1) or m.group(2), string=url)
[docs] @staticmethod def get_soup_url(url): return f"https://ajp.amjpathol.org/article/{ASIPCrawler.get_asipID(url)}/fulltext"
[docs] @staticmethod def get_pdf_url(url): return f"https://ajp.amjpathol.org/action/showPdf?pii={ASIPCrawler.get_asipID(url)}"
[docs] def get_soup_source(self, url, driver=None, **gatewaykwargs): asipID = self.get_asipID(url) soup = super().get_soup_source(url=self.get_soup_url(asipID), driver=driver, **gatewaykwargs) return soup
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article-header__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] article = soup.find(name="div", class_="article__sections") if article is not None: for section in article.find_all(name="section"): if find_target_text(soup=section, name="h2", default="OK").lower().startswith("reference"): break sections.append(section) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class AnatomyPubsCrawler(GummyAbstJournal): """ URL: - https://anatomypubs.onlinelibrary.wiley.com Attributes: crawl_type (str) : :meth:`AnatomyPubsCrawler's <gummy.journals.AnatomyPubsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3", "h4"], )
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"\/doi(?:\/(full|e?pdf))?\/", repl=lambda m: m.group(0).replace(m.group(1), "full") if m.group(1) is not None else m.group(0).replace("/doi/", "/doi/full/"), string=url)
[docs] @staticmethod def get_pdf_url(url): return re.sub(pattern=r"\/doi(?:\/(full|e?pdf))?\/", repl=lambda m: m.group(0).replace(m.group(1), "pdf") if m.group(1) is not None else m.group(0).replace("/doi/", "/doi/pdf/"), string=url)
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", class_="article-section__content") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class RenalPhysiologyCrawler(GummyAbstJournal): """ URL: - https://journals.physiology.org Attributes: crawl_type (str) : :meth:`RenalPhysiologyCrawler's <gummy.journals.RenalPhysiologyCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h2"], )
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"\/doi(?:\/(full|e?pdf))?\/", repl=lambda m: m.group(0).replace(m.group(1), "full") if m.group(1) is not None else m.group(0).replace("/doi/", "/doi/full/"), string=url)
[docs] @staticmethod def get_pdf_url(url): return re.sub(pattern=r"\/doi(?:\/(full|e?pdf))?\/", repl=lambda m: m.group(0).replace(m.group(1), "pdf") if m.group(1) is not None else m.group(0).replace("/doi/", "/doi/pdf/"), string=url)
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="hlFld-Abstract") body_section = soup.find(name="div", class_="hlFld-Fulltextl") if body_section is not None: sections.extend(group_soup_with_head(soup=body_section, name="h1")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h1") return head
[docs]class GeneticsCrawler(GummyAbstJournal): """ URL: - https://www.genetics.org Attributes: crawl_type (str) : :meth:`GeneticsCrawler's <gummy.journals.GeneticsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h4"], )
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"\.full\.pdf$", repl="", string=url).replace("/content/genetics/", "/content/")
[docs] @staticmethod def get_pdf_url(url): return GeneticsCrawler.get_soup_url(url).replace("/content/", "/content/genetics/")+".full.pdf"
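A minimal sketch of the intended mapping between the HTML page and the ``/content/genetics/`` PDF form (volume and page numbers are placeholders):

.. code-block:: python

    >>> GeneticsCrawler.get_soup_url("https://www.genetics.org/content/genetics/214/1/1.full.pdf")
    'https://www.genetics.org/content/214/1/1'
    >>> GeneticsCrawler.get_pdf_url("https://www.genetics.org/content/214/1/1")
    'https://www.genetics.org/content/genetics/214/1/1.full.pdf'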
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"class": "highwire-cite-title", "id": "page-title"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] article = soup.find(name="div", class_="article fulltext-view") if article is not None: article_sections = group_soup_with_head(soup=article, name="h2") for article_section in article_sections: h2Tag = article_section.find(name="h2") if (h2Tag is not None) and h2Tag.get_text().lower().startswith("reference"): break sections.append(article_section) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class GeneDevCrawler(GummyAbstJournal): """ URL: - https://genesdev.cshlp.org Attributes: crawl_type (str) : :meth:`GeneDevCrawler's <gummy.journals.GeneDevCrawler>` default ``crawl_type`` is ``"pdf"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="pdf", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_soup_url(url): return url.rstrip("+html").rstrip(".full.pdf")
[docs] @staticmethod def get_pdf_url(url): return GeneDevCrawler.get_soup_url(url) + ".full.pdf"
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id": "article-title-1", "itemprop": "headline"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="section abstract") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class JAMANetworkCrawler(GummyAbstJournal): """ URL: - https://jamanetwork.com Attributes: crawl_type (str) : :meth:`JAMANetworkCrawler's <gummy.journals.JAMANetworkCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["strong"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="meta-article-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] article = soup.find(name="div", class_="article-full-text") if article is not None: article_sections = group_soup_with_head(soup=article, name="div", class_="h3") for article_section in article_sections: if article_section.find(name="div", class_="references") is not None: break sections.append(article_section) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="div", class_="h3") return head
[docs]class SAGEjournalsCrawler(GummyAbstJournal): """ URL: - https://journals.sagepub.com Attributes: crawl_type (str) : :meth:`SAGEjournalsCrawler's <gummy.journals.SAGEjournalsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="hlFld-Abstract") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class MolCellBioCrawler(GummyAbstJournal): """ URL: - https://mcb.asm.org Attributes: crawl_type (str) : :meth:`MolCellBioCrawler's <gummy.journals.MolCellBioCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["span"], )
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"\.full\.pdf$", repl="", string=url)
[docs] @staticmethod def get_pdf_url(url): return MolCellBioCrawler.get_soup_url(url) + ".full.pdf"
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="highwire-cite-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] article = soup.find(name="div", class_="article fulltext-view") if article is not None: article_sections = group_soup_with_head(soup=article, name="h2") for article_section in article_sections: h2Tag = article_section.find(name="h2") if (h2Tag is not None) and h2Tag.get_text().lower().startswith("reference"): break sections.append(article_section) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class JKMSCrawler(GummyAbstJournal): """ URL: - https://jkms.org Attributes: crawl_type (str) : :meth:`JKMSCrawler's <gummy.journals.JKMSCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_pdf_url(url): if not url.endswith(".pdf"): soup = BeautifulSoup(requests.get(url=url).content, "html.parser") PDF_urls = [a.get("href") for a in soup.find_all(name="a") if a.get_text().upper()=="PDF"] if len(PDF_urls)>0: url = PDF_urls[0] return url
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="span", class_="tl-document", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_=("fm", "body")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="div", class_="tl-main-part") return head
[docs]class JKNSCrawler(GummyAbstJournal): """ URL: - https://www.jkns.or.kr Attributes: crawl_type (str) : :meth:`JKNSCrawler's <gummy.journals.JKNSCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_pdf_url(url): if not url.endswith(".pdf"): soup = BeautifulSoup(requests.get(url=url).content, "html.parser") PDF_urls = [a.get("onclick") for a in soup.find_all(name="a") if a.get_text().strip() == "PDF Links"] if len(PDF_urls)>0: url = re.sub(pattern=r'journal_download\("(.+?)",\s*"(.+?)",\s*"(.+?)"\)', repl=r"https://www.jkns.or.kr/upload/pdf/\3", string=PDF_urls[0]) return url
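Inside ``get_pdf_url``, the ``onclick`` value of the "PDF Links" anchor is rewritten into a direct file URL; a sketch of just that substitution, with fabricated ``journal_download`` arguments:

.. code-block:: python

    >>> import re
    >>> onclick = 'journal_download("kjns","10.3340/jkns.2020.0000","JKNS-20-000.pdf")'
    >>> re.sub(pattern=r'journal_download\("(.+?)",\s*"(.+?)",\s*"(.+?)"\)', repl=r"https://www.jkns.or.kr/upload/pdf/\3", string=onclick)
    'https://www.jkns.or.kr/upload/pdf/JKNS-20-000.pdf'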
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h3", class_="PubTitle", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="abstract-layer") article = soup.find(name="div", class_="abstract-layer") if article is not None: sections.extend(article.find_all(name="div", class_="section")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h3") return head
[docs]class BioscienceCrawler(GummyAbstJournal): """ URL: - https://www.bioscience.org Attributes: crawl_type (str) : :meth:`BioscienceCrawler's <gummy.journals.BioscienceCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_soup_url(url): soup = BeautifulSoup(requests.get(url=url).content, "html.parser") frame_urls = [e.get("src") for e in soup.find_all(name="frame") if re.search(pattern=r"\.html?", string=e.get("src")) is not None] if len(frame_urls)>0: url = re.sub(pattern=r"(^.*\/)(.+?\.html?)$", repl=r"\1", string=url) + frame_urls[0] return url
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="p", attrs={"align":"JUSTIFY"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="p", attrs={"align" : "JUSTIFY"}) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="b") return head
[docs]class RadioGraphicsCrawler(GummyAbstJournal): """ URL: - https://pubs.rsna.org Attributes: crawl_type (str) : :meth:`RadioGraphicsCrawler's <gummy.journals.RadioGraphicsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_soup_url(url): return url.replace("/doi/pdf/", "/doi/")
[docs] @staticmethod def get_pdf_url(url): return RadioGraphicsCrawler.get_soup_url(url).replace("/doi/", "/doi/pdf/")
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="hlFld-Abstract") article = soup.find(name="div", class_="hlFld-Fulltext") if article is not None: sections.extend(group_soup_with_head(soup=article, name="h2", class_="article-section__title section__title")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class PediatricSurgeryCrawler(GummyAbstJournal): """ URL: - https://www.jpedsurg.org Attributes: crawl_type (str) : :meth:`PediatricSurgeryCrawler's <gummy.journals.PediatricSurgeryCrawler>` default ``crawl_type`` is ``"soup"``. AvoidDataLeftHandNavs (list) : Markers indicating the extra section to remove in :meth:`get_sections_from_soup <gummy.journals.PediatricSurgeryCrawler.get_sections_from_soup>` """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, ) self.AvoidDataLeftHandNavs = [None, "References"]
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article-header__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [e for e in soup.find_all(name="section") if (e.find(name="h2") is not None) and (e.find(name="h2").get("data-left-hand-nav") not in self.AvoidDataLeftHandNavs)] return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class AGUPublicationsCrawler(GummyAbstJournal): """ URL: - https://agupubs.onlinelibrary.wiley.com Attributes: crawl_type (str) : :meth:`AGUPublicationsCrawler's <gummy.journals.AGUPublicationsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"\/(?:abs|pdf)\/", repl="/full/", string=url)
[docs] @staticmethod def get_pdf_url(url): return re.sub(pattern=r"\/(?:abs|full)\/", repl="/pdf/", string=url)
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", class_=("article-section__abstract", "article-section__content")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class NEJMCrawler(GummyAbstJournal): """ URL: - https://www.nejm.org Attributes: crawl_type (str) : :meth:`NEJMCrawler's <gummy.journals.NEJMCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h2"], )
[docs] @staticmethod def get_soup_url(url): return url.replace("/pdf/", "/full/")
[docs] @staticmethod def get_pdf_url(url): return url.replace("/full/", "/pdf/")
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="span", class_="title_default", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", class_="o-article-body__section") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class LWWJournalsCrawler(GummyAbstJournal): """ URL: - https://journals.lww.com Attributes: crawl_type (str) : :meth:`LWWJournalsCrawler's <gummy.journals.LWWJournalsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags = ["h3"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="header", class_="ejp-article-header", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", attrs={"id" : "abstractWrap"}) article = soup.find(name="section", attrs={"id": "ArticleBody"}) if article is not None: sections.extend(group_soup_with_head(soup=article, name="h2")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name=("h1","h2")) return head
[docs]class ARVOJournalsCrawler(GummyAbstJournal): """ URL: - https://iovs.arvojournals.org/ - https://jov.arvojournals.org/ - https://tvst.arvojournals.org/ Attributes: crawl_type (str) : :meth:`ARVOJournalsCrawler's <gummy.journals.ARVOJournalsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags = ["strong"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="div", class_="wi-article-title article-title-main", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] section = soup.find(name="div", attrs={"class":"widget-items", "data-widgetname":"ArticleFulltext"}) if section is not None: section = replace_soup_tag(soup=section, new_name="strong", old_name="div", old_attrs={"class": "h7"}) for sec in split_section(section=section, name="div", class_="h6"): if str_strip(sec).lower().startswith("reference"): break sections.append(sec) return sections
[docs] def get_head_from_section(self, section): if section.name == "div" and "h6" in section.get_attribute_list("class"): return section else: return None
[docs]class LearningMemoryCrawler(GummyAbstJournal): """ URL: - https://learnmem.cshlp.org/ Attributes: crawl_type (str) : :meth:`LearningMemoryCrawler's <gummy.journals.LearningMemoryCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags = ["h3", "h4"], )
[docs] @staticmethod def get_soup_url(url): return re.sub(pattern=r"(\.full)?(\.((html)|(pdf)))", repl=".full.html", string=url)
[docs] @staticmethod def get_pdf_url(url): return re.sub(pattern=r"(\.full)?(\.((html)|(pdf)))", repl=".full.pdf", string=url)
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id": "article-title-1"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] section = soup.find(name="div", class_="article fulltext-view") if section is not None: for sec in group_soup_with_head(soup=section, name="h2"): if find_target_text(soup=sec, name="h2").lower().startswith("reference"): break sections.append(sec) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class ScienceMagCrawler(GummyAbstJournal): """ URL: - https://science.sciencemag.org/ Attributes: crawl_type (str) : :meth:`ScienceMagCrawler's <gummy.journals.ScienceMagCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article__headline", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] abstract = soup.find(name="div", class_="section abstract") if abstract is not None: sections.append(abstract.__copy__()) article = soup.find(name="div", class_="article fulltext-view") if article is not None: if abstract is not None: abstract.decompose() sections.extend(article.find_all(name=("p", "figure"))) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class PsyChiArtistCrawler(GummyAbstJournal): """ URL: - https://www.psychiatrist.com/ Attributes: crawl_type (str) : :meth:`PsyChiArtistCrawler's <gummy.journals.PsyChiArtistCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="p", class_="title-left", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] article = soup.find(name="div", attrs={"id":"articlecontent"}) if article is not None: for e in article.find_all(name=("p","div")): if e.get_text().lower().startswith("reference"): break sections.append(e) return sections
[docs] def get_head_from_section(self, section): return None
[docs]class OncotargetCrawler(GummyAbstJournal): """ URL: - https://www.oncotarget.com/ Attributes: crawl_type (str) : :meth:`OncotargetCrawler's <gummy.journals.OncotargetCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags = ["h3"] )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", attrs={"id":"articleTitle"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] abst_body = soup.find(name="p", class_="BodyText") if abst_body is not None: abst = BeautifulSoup(markup="", features="lxml").new_tag(name="section") abst.append(str2soup("<h2>ABSTRACT</h2>")) abst.append(abst_body) sections.append(abst) section = soup.find(name="div", class_="sns") if section is not None: article_sections = group_soup_with_head(soup=section, name="h2") for sec in article_sections: if find_target_text(soup=sec, name="h2", default="head").lower().startswith("reference"): break sections.append(sec) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return None
[docs]class ClinicalEndoscopyCrawler(GummyAbstJournal): """ URL: - https://www.e-ce.org/ Attributes: crawl_type (str) : :meth:`ClinicalEndoscopyCrawler's <gummy.journals.ClinicalEndoscopyCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], # <h3 class="section-title"> )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h3", class_="PubTitle", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] abst = soup.find(name="div", class_="abstract-layer") if abst is not None: sections.append(abst) article = soup.find(name="div", attrs={"id": "article-body"}) if article is not None: sections.extend(article.find_all(name="div", class_="section")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h3", class_="main-title") return head
[docs]class EMBOPressCrawler(GummyAbstJournal): """ URL: - https://www.embopress.org Attributes: crawl_type (str) : :meth:`EMBOPressCrawler's <gummy.journals.EMBOPressCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="section", class_="article-section") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h3", class_="article-section__header") return head
[docs]class ASPBCrawler(GummyAbstJournal): """ URL: - http://www.plantphysiol.org/ Attributes: crawl_type (str) : :meth:`ASPBCrawler's <gummy.journals.ASPBCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["h3"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="highwire-cite-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] article = soup.find(name="div", class_="article fulltext-view") if article is not None: ref = article.find(name="div", class_="section ref-list") if ref is not None: ref.decompose() sections.extend(group_soup_with_head(article, name="h2")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class BiomedGridCrawler(GummyAbstJournal): """ URL: - https://biomedgrid.com/ Attributes: crawl_type (str) : :meth:`BiomedGridCrawler's <gummy.journals.BiomedGridCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="farticle-title citation_title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] for sec in soup.find_all(name="section", class_="lighten-4"): head = self.get_head_from_section(sec) if (head is not None) and head.get_text().lower().startswith("reference"): break sections.append(sec) return sections
[docs] def get_head_from_section(self, section): head = section.find(name=("h2", "h4")) return head
[docs]class NRRCrawler(GummyAbstJournal): """ URL: - https://www.nrronline.org/ Attributes: crawl_type (str) : :meth:`NRRCrawler's <gummy.journals.NRRCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, subheadTags=["b"], )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="font", class_="sTitle", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] start = False article = soup.find(name="table", class_="articlepage") for sec in group_soup_with_head(article, name="table"): if find_target_text(soup=sec, name="table").lower().startswith("abstract"): start = True if find_target_text(soup=sec, name="table").lower().startswith("reference"): break if start: sections.append(sec) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="td", class_="pageSub") return head
[docs]class YMJCrawler(GummyAbstJournal): """ URL: - https://eymj.org/ Attributes: crawl_type (str) : :meth:`YMJCrawler's <gummy.journals.YMJCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, # subheadTags=["p"], <p class="tl-lowest-section"> )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="span", class_="tl-document", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] for block in soup.find_all(name="div", attrs={"id": ("article-level-0-front", "article-level-0-body")}): sections.extend(group_soup_with_head(soup=block, name="div", class_="tl-main-part")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="div", class_="tl-main-part") return head
[docs]class TheLancetCrawler(GummyAbstJournal): """ URL: - https://www.thelancet.com/ Attributes: crawl_type (str) : :meth:`TheLancetCrawler's <gummy.journals.TheLancetCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, # subheadTags=["p"], <p class="tl-lowest-section"> )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article-header__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] for sec in soup.find_all(name="section"): if find_target_text(soup=sec, name="h2", default="head").lower().startswith("reference"): break sections.append(sec) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class FutureScienceCrawler(GummyAbstJournal): """ URL: - https://www.future-science.com/ Attributes: crawl_type (str) : :meth:`FutureScienceCrawler's <gummy.journals.FutureScienceCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="citation__title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] body = soup.find(name="div", class_="article__body") if body is not None: sections.extend(group_soup_with_head(soup=body, name="h2")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class ScitationCrawler(GummyAbstJournal): """ URL: - https://aip.scitation.org/ - https://www.scitation.org/ Attributes: crawl_type (str) : :meth:`ScitationCrawler's <gummy.journals.ScitationCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_pdf_url(url): return url.replace("/doi/", "/doi/pdf/")
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="hlFld-Abstract") article = soup.find(name="div", class_="hlFld-Fulltext") if article is not None: for sec in group_soup_with_head(article, name="div", class_="sectionHeading"): if find_target_text(soup=sec, name="div", class_="sectionHeading").lower().startswith("reference"): break sections.append(sec) return sections
[docs] def get_head_from_section(self, section): head = section.find(name=None, class_="sectionHeading") return head
[docs]class IOPScienceCrawler(GummyAbstJournal): """ URL: - https://iopscience.iop.org/ TODO: Deal with **ShieldSquare Captcha**. .. image:: _images/ss_captcha.png Attributes: crawl_type (str) : :meth:`IOPScienceCrawler's <gummy.journals.IOPScienceCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="wd-jnl-art-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="article-content") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class AACRPublicationsCrawler(GummyAbstJournal): """ URL: - https://aacrjournals.org/ - https://bloodcancerdiscov.aacrjournals.org/ - https://cancerdiscovery.aacrjournals.org - https://cebp.aacrjournals.org - https://cancerimmunolres.aacrjournals.org/ - https://cancerpreventionresearch.aacrjournals.org/ - https://cancerres.aacrjournals.org/ - https://clincancerres.aacrjournals.org/ - https://mcr.aacrjournals.org/ - https://mct.aacrjournals.org/ Attributes: crawl_type (str) : :meth:`AACRPublicationsCrawler's <gummy.journals.AACRPublicationsCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="highwire-cite-title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] for sec in soup.find_all(name="div", class_=("section", "boxed-text")): head = self.get_head_from_section(sec) if (head is not None) and head.get_text().lower().startswith("reference"): break sections.append(sec) return sections
[docs] def get_head_from_section(self, section): head = section.find(name=("h2", "h3")) return head
[docs]class PsycNetCrawler(GummyAbstJournal): """ URL: - https://psycnet.apa.org/ Attributes: crawl_type (str) : :meth:`PsycNetCrawler's <gummy.journals.PsycNetCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def make_elements_visible(self, driver): wait_until_all_elements(driver=driver, timeout=5, verbose=self.verbose)
[docs] @staticmethod def is_request_successful(soup): """Check whether request is successful or not. Args: soup (BeautifulSoup) : A data structure representing a parsed HTML or XML document. """ if soup.get_text().startswith("Request unsuccessful"): raise RequestException(toRED("[403 Forbidden]\n") + soup.get_text() + toGREEN("\nPlease run Chrome with GUI browser."))
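A minimal sketch of the failure path, using a fabricated blocked-response body (only the leading "Request unsuccessful" text matters to the check):

.. code-block:: python

    >>> from bs4 import BeautifulSoup
    >>> blocked = BeautifulSoup("Request unsuccessful.", "html.parser")
    >>> PsycNetCrawler.is_request_successful(blocked)  # raises RequestException with a "403 Forbidden" message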
[docs] def get_title_from_soup(self, soup): self.is_request_successful(soup) title = self.default_title article_title = soup.find(name="div", class_="articleTitleGroup") if article_title is not None: title = find_target_text(soup=article_title, name="h1", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] abst = soup.find(name="div", class_="abstractsContainer") if abst is not None: abst.append(str2soup("<h3>Abstract</h3>")) sections.append(abst) body = soup.find(name="div", class_="articleBody") if body is not None: sections.extend(body.find_all(name="div", class_="section ftJumpToAnchor")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h3") return head
[docs]class MinervaMedicaCrawler(GummyAbstJournal): """ URL: - https://www.minervamedica.it/ Attributes: crawl_type (str) : :meth:`MinervaMedicaCrawler's <gummy.journals.MinervaMedicaCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_soup_url(url): return f"https://www.minervamedica.it/en/journals/minerva-anestesiologica/article.php?cod={list(urllib.parse.parse_qs(url).values())[0][0]}"
[docs] def make_elements_visible(self, driver): try_find_element_click(driver=driver, by="xpath", identifier='//*[@id="pluto"]/div/p/a[1]') wait_until_all_elements(driver=driver, timeout=self.sleep_for_loading, verbose=self.verbose)
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = [] siteloader = soup.find(name="div", attrs={"id": "siteloader"}) if siteloader is not None: sections.extend(siteloader.find_all(name=None, class_=("Abstract-Head", "Abstract", "Head1", "Paragraph-Post-Head", "Paragraph", "BOX-IMMAGINE"))) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="p", class_=("Head1", "Abstract-Head")) return head
[docs]class JNeurosciCrawler(GummyAbstJournal): """ URL: - https://www.jneurosci.org/ Attributes: crawl_type (str) : :meth:`JNeurosciCrawler's <gummy.journals.JNeurosciCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_soup_url(url): url = re.sub(pattern=r"(?:\.full\.pdf|\.full|\.abstract)$", repl="", string=url) return url.replace("/content/jneuro/", "/content/")
[docs] @staticmethod def get_pdf_url(url): return JNeurosciCrawler.get_soup_url(url).replace("/content/", "/content/jneuro/")+".full.pdf"
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="highwire-cite-title", attrs={"id": "page-title"}, strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): sections = soup.find_all(name="div", class_="section") return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
[docs]class HindawiCrawler(GummyAbstJournal): """ URL: - https://www.hindawi.com/ Attributes: crawl_type (str) : :meth:`HindawiCrawler's <gummy.journals.HindawiCrawler>` default ``crawl_type`` is ``"soup"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="soup", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", class_="article_title", strip=True, default=self.default_title) return title
[docs] def get_sections_from_soup(self, soup): """ .. code-block:: html <article> <div class="xml-content"> paper content </div> <div class="xml-content"> References </div> <div class="xml-content"> Copyright </div> </article> """ sections = [] article = soup.find(name="article", class_="article_body") if article is not None: xml_content = article.find(name="div", class_="xml-content") if xml_content is not None: sections.extend(group_soup_with_head(soup=xml_content, name="h4")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h4") return head
[docs]class ChemRxivCrawler(GummyAbstJournal): """ URL: - https://chemrxiv.org/ Attributes: crawl_type (str) : :meth:`ChemRxivCrawler's <gummy.journals.ChemRxivCrawler>` default ``crawl_type`` is ``"pdf"``. """ def __init__(self, gateway="useless", sleep_for_loading=3, verbose=True, **kwargs): super().__init__( crawl_type="pdf", gateway=gateway, sleep_for_loading=sleep_for_loading, verbose=verbose, )
[docs] @staticmethod def get_pdf_url(url): soup = BeautifulSoup(markup=requests.get(ChemRxivCrawler.get_soup_url(url)).content, features="html.parser") if soup is not None: div = soup.find(name="div", class_="_2FHUU") if div is not None: a = div.find(name="a", class_=("_2PSMA","_1jrLT","_3kcSK","_27tHP","_2BFqb")) if a is not None: url = a.get("href") return url
[docs] def get_title_from_soup(self, soup): title = find_target_text(soup=soup, name="h1", strip=True, default=self.default_title) # class_="_3lGK4" return title
[docs] def get_sections_from_soup(self, soup): sections = [] div = soup.find(name="div", class_="VCg-j") if div is not None: sections.extend(div.find_all(name="div", class_="_3nx2Q")) return sections
[docs] def get_head_from_section(self, section): head = section.find(name="h2") return head
all = TranslationGummyJournalCrawlers = { "pdf" : PDFCrawler, "arxiv" : arXivCrawler, "nature" : NatureCrawler, "ncbi" : NCBICrawler, "pubmed" : PubMedCrawler, "oxfordacademic" : OxfordAcademicCrawler, "sciencedirect" : ScienceDirectCrawler, "springer" : SpringerCrawler, "mdpi" : MDPICrawler, "unioklahoma" : UniOKLAHOMACrawler, "lungcancer" : LungCancerCrawler, "cellpress" : CellPressCrawler, "wileyonlinelibrary" : WileyOnlineLibraryCrawler, "jbc" : JBCCrawler, "biologists" : BiologistsCrawler, "biomedcentral" : BioMedCentralCrawler, "ieeexplore" : IEEEXploreCrawler, "jstage" : JSTAGECrawler, "acspublications" : ACSPublicationsCrawler, "stemcells" : StemCellsCrawler, "unikeio" : UniKeioCrawler, "plosone" : PLOSONECrawler, "frontiers" : frontiersCrawler, "rnajournal" : RNAjournalCrawler, "intechopen" : IntechOpenCrawler, "nrcresearchpress" : NRCResearchPressCrawler, "spandidos" : SpandidosCrawler, "taylorandfrancisonline" : TaylorandFrancisOnlineCrawler, "biorxiv" : bioRxivCrawler, "rscpublishing" : RSCPublishingCrawler, "jsse" : JSSECrawler, "scienceadvances" : ScienceAdvancesCrawler, "medrxiv" : medRxivCrawler, "aclanthology" : ACLAnthologyCrawler, "pnas" : PNASCrawler, "ams" : AMSCrawler, "acm" : ACMCrawler, "aps" : APSCrawler, "asip" : ASIPCrawler, "anatomypubs" : AnatomyPubsCrawler, "renalphysiology" : RenalPhysiologyCrawler, "genetics" : GeneticsCrawler, "genedev" : GeneDevCrawler, "jamanetwork" : JAMANetworkCrawler, "sagejournals" : SAGEjournalsCrawler, "molcellbio" : MolCellBioCrawler, "jkms" : JKMSCrawler, "jkns" : JKNSCrawler, "bioscience" : BioscienceCrawler, "radiographics" : RadioGraphicsCrawler, "pediatricsurgery" : PediatricSurgeryCrawler, "agupublications" : AGUPublicationsCrawler, "nejm" : NEJMCrawler, "lwwjournals" : LWWJournalsCrawler, "arvojournals" : ARVOJournalsCrawler, "learningmemory" : LearningMemoryCrawler, "sciencemag" : ScienceMagCrawler, "psychiartist" : PsyChiArtistCrawler, "oncotarget" : OncotargetCrawler, "clinicalendoscopy" : ClinicalEndoscopyCrawler, "embopress" : EMBOPressCrawler, "aspb" : ASPBCrawler, "biomedgrid" : BiomedGridCrawler, "nrr" : NRRCrawler, "ymj" : YMJCrawler, "thelancet" : TheLancetCrawler, "futurescience" : FutureScienceCrawler, "scitation" : ScitationCrawler, "iopscience" : IOPScienceCrawler, "aacrpublications" : AACRPublicationsCrawler, "psycnet" : PsycNetCrawler, "minervamedica" : MinervaMedicaCrawler, "jneurosci" : JNeurosciCrawler, "hindawi" : HindawiCrawler, "chemrxiv" : ChemRxivCrawler, } get = mk_class_get( all_classes=TranslationGummyJournalCrawlers, gummy_abst_class=[GummyAbstJournal], genre="journals" )