Source code for pycharmers.utils.soup_utils

#coding: utf-8
import re
import requests
from bs4 import BeautifulSoup
from .generic_utils import str_strip, handleKeyError

def str2soup(string):
    """Convert a string to soup, removing extra tags such as ``<html>``, ``<body>``, and ``<head>``.

    Args:
        string (str) : strings

    Returns:
        bs4.BeautifulSoup : A data structure representing a parsed HTML or XML document.

    Examples:
        >>> from pycharmers.utils import str2soup
        >>> string = "<title>Python-Charmers</title>"
        >>> type(string)
        str
        >>> soup = str2soup(string)
        >>> soup
        <title>Python-Charmers</title>
        >>> type(soup)
        bs4.BeautifulSoup
        >>> from bs4 import BeautifulSoup
        >>> BeautifulSoup(string)
        <html><head><title>Python-Charmers</title></head></html>
    """
    soup = BeautifulSoup(markup=string, features="html5lib")
    # Remove the wrapper tags that html5lib adds around fragments.
    for attr in ["html", "body", "head"]:
        if hasattr(soup, attr) and getattr(soup, attr) is not None:
            getattr(soup, attr).unwrap()
    return soup

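# Illustrative sketch (not part of the original module): ``str2soup`` also
# round-trips fragments with multiple top-level tags, which a plain
# ``BeautifulSoup`` call would wrap in <html><head></head><body>...</body></html>.
# Assumes html5lib is installed (``str2soup`` itself requires it).
def _demo_str2soup():
    fragment = "<h2>Title</h2><p>body</p>"
    print(str2soup(fragment))                            # <h2>Title</h2><p>body</p>
    print(BeautifulSoup(fragment, features="html5lib"))  # wrapped in <html>...</html>
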
def split_section(section, name=None, attrs={}, recursive=True, text=None, **kwargs):
    """Split ``bs4.BeautifulSoup`` at the matched tags.

    Args:
        section (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
        name (str)                  : A filter on tag name.
        attrs (dict)                : A dictionary of filters on attribute values.
        recursive (bool)            : If this is ``True``, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
        text (str)                  : An inner text.
        kwargs (dict)               : A dictionary of filters on attribute values.

    Returns:
        list : The elements of ``section``, split at (and including) the matched tags.

    Examples:
        >>> from bs4 import BeautifulSoup
        >>> from pycharmers.utils import split_section
        >>> section = BeautifulSoup(\"\"\"
        ... <section>
        ...   <div>
        ...     <h2>Title</h2>
        ...     <div>
        ...       <p>aaaaaaaaaaaaaaaaaaaaaa</p>
        ...       <div>
        ...         <img/>
        ...       </div>
        ...       <p>bbbbbbbbbbbbbbbbbbbbbb</p>
        ...     </div>
        ...   </div>
        ... </section>
        ... \"\"\")
        >>> len(split_section(section, name="img"))
        3
        >>> split_section(section, name="img")
        [<section>
        <div>
        <h2>Title</h2>
        <div>
        <p>aaaaaaaaaaaaaaaaaaaaaa</p>
        <div>
        </div></div></div></section>,
        <img/>,
        <p>bbbbbbbbbbbbbbbbbbbbbb</p>
        ]
    """
    str_section = str(section)
    page_elements = []
    delimiters = section.find_all(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
    # Initialization (prevents an error when the for-loop hits ``continue`` at the beginning (i=0)).
    end = 0
    for i, delimiter in enumerate(delimiters):
        str_delimiter = str(delimiter)
        # Search from ``end`` so that identical delimiters are matched in document order.
        start = str_section.find(str_delimiter, end)
        if start == -1:
            continue
        page_elements.append(str2soup(string=str_section[end:start]))
        page_elements.append(delimiter)
        end = start + len(str_delimiter)
    page_elements.append(str2soup(string=str_section[end:]))
    return page_elements

def group_soup_with_head(soup, name=None, attrs={}, recursive=True, text=None, **kwargs):
    """Group ``bs4.BeautifulSoup`` elements based on their heads.

    Args:
        soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
        name (str)               : A filter on tag name.
        attrs (dict)             : A dictionary of filters on attribute values.
        recursive (bool)         : If this is ``True``, ``.find_all`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
        text (str)               : An inner text.
        kwargs (dict)            : A dictionary of filters on attribute values.

    Returns:
        list : A list of ``<section>`` elements, each containing one head and the content that follows it.

    Examples:
        >>> from bs4 import BeautifulSoup
        >>> from pycharmers.utils import group_soup_with_head
        >>> section = BeautifulSoup(\"\"\"
        ... <h2>AAA</h2>
        ... <div>
        ...   <p>aaaaaaaaaaaaaaaaaaaaaa</p>
        ... </div>
        ... <h2>BBB</h2>
        ... <div>
        ...   <p>bbbbbbbbbbbbbbbbbbbbbb</p>
        ... </div>
        ... \"\"\")
        >>> sections = group_soup_with_head(section, name="h2")
        >>> len(sections)
        2
        >>> sections
        [<section><h2>AAA</h2><div>
        <p>aaaaaaaaaaaaaaaaaaaaaa</p>
        </div>
        </section>, <section><h2>BBB</h2><div>
        <p>bbbbbbbbbbbbbbbbbbbbbb</p>
        </div>
        </section>]
    """
    str_soup = str(soup)
    sections = []
    heads = soup.find_all(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
    # Initialization (prevents an error when the for-loop hits ``continue`` at the beginning (i=0)).
    end = 0
    section = BeautifulSoup(markup="", features="lxml").new_tag(name="section")
    if len(heads) > 0:
        for i, head in enumerate(heads):
            str_head = str(head)
            # Search from ``end`` so that identical heads are matched in document order.
            start = str_soup.find(str_head, end)
            if start == -1:
                continue
            if i > 0:
                body = str2soup(string=str_soup[end:start])
                section.append(body)
                sections.append(section)
            end = start + len(str_head)
            section = BeautifulSoup(markup="", features="lxml").new_tag(name="section")
            section.append(head)
        body = str2soup(string=str_soup[end:])
        section.append(body)
        sections.append(section)
    return sections

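# Illustrative sketch (not part of the original module): the <section> wrappers
# produced by ``group_soup_with_head`` combine with ``find_target_text`` (defined
# below) to pull out each section's heading. Assumes lxml and html5lib are installed.
def _demo_group_soup_with_head():
    soup = BeautifulSoup("<h2>AAA</h2><p>aaa</p><h2>BBB</h2><p>bbb</p>", features="lxml")
    for sec in group_soup_with_head(soup, name="h2"):
        print(find_target_text(soup=sec, name="h2"))  # "AAA", then "BBB"
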
def replace_soup_tag(soup, new_name, new_namespace=None, new_nsprefix=None, new_attrs={},
                     new_sourceline=None, new_sourcepos=None, new_kwattrs={},
                     old_name=None, old_attrs={}, old_recursive=True, old_text=None,
                     old_limit=None, old_kwargs={}, **kwargs):
    """Replace old tags with new tags.

    - Args named ``old_XXX`` specify "how to find the old tags".
    - Args named ``new_XXX`` specify "how to create the new tags".

    Args:
        soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
        old_name (str)           : A filter on tag name.
        old_attrs (dict)         : A dictionary of filters on attribute values.
        old_recursive (bool)     : If this is ``True``, ``.find_all`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
        old_text (str)           : An inner text.
        old_limit (int)          : Stop looking after finding this many results.
        old_kwargs (dict)        : A dictionary of filters on attribute values.
        new_name (str)           : The name of the new Tag.
        new_namespace (str)      : The URI of the new Tag's XML namespace, if any.
        new_nsprefix (str)       : The prefix for the new Tag's XML namespace, if any.
        new_attrs (dict)         : A dictionary of this Tag's attribute values; can be used instead of ``new_kwattrs`` for attributes like 'class' that are reserved words in Python.
        new_sourceline (str)     : The line number where this tag was (purportedly) found in its source document.
        new_sourcepos (str)      : The character position within ``new_sourceline`` where this tag was (purportedly) found.
        new_kwattrs (dict)       : Keyword arguments for the new Tag's attribute values.

    Returns:
        bs4.BeautifulSoup : The ``soup`` with the matched tags replaced.

    Examples:
        >>> from bs4 import BeautifulSoup
        >>> from pycharmers.utils import replace_soup_tag
        >>> section = BeautifulSoup(\"\"\"
        ... <h2>AAA</h2>
        ... <div>
        ...   <p>aaaaaaaaaaaaaaaaaaaaaa</p>
        ... </div>
        ... <h3>BBB</h3>
        ... <div>
        ...   <p>bbbbbbbbbbbbbbbbbbbbbb</p>
        ... </div>
        ... \"\"\")
        >>> section = replace_soup_tag(soup=section, old_name="h3", new_name="h2")
        >>> section
        <html><body><h2>AAA</h2>
        <div>
        <p>aaaaaaaaaaaaaaaaaaaaaa</p>
        </div>
        <h2>BBB</h2>
        <div>
        <p>bbbbbbbbbbbbbbbbbbbbbb</p>
        </div>
        </body></html>
    """
    for old in soup.find_all(name=old_name, attrs=old_attrs, recursive=old_recursive, text=old_text, limit=old_limit, **old_kwargs):
        new = BeautifulSoup(markup="", features="lxml").new_tag(
            name=new_name, namespace=new_namespace, nsprefix=new_nsprefix, attrs=new_attrs,
            sourceline=new_sourceline, sourcepos=new_sourcepos, **new_kwattrs
        )
        # Move the old tag's children into the new tag before swapping the tags.
        new.extend(list(old.children))
        old.replace_with(new)
    return soup

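# Illustrative sketch (not part of the original module): ``new_attrs`` attaches
# attributes to the replacement tag while renaming, here demoting <h1> to
# <h2 class="demoted">. Assumes lxml is installed.
def _demo_replace_soup_tag():
    soup = BeautifulSoup("<h1>Top</h1><p>text</p>", features="lxml")
    soup = replace_soup_tag(soup=soup, old_name="h1", new_name="h2", new_attrs={"class": "demoted"})
    print(soup)  # <html><body><h2 class="demoted">Top</h2><p>text</p></body></html>
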
def find_target_text(soup, name=None, attrs={}, recursive=True, text=None, default="__NOT_FOUND__", strip=True, **kwargs):
    """Find the target element, and get all child strings from it.

    Args:
        soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
        name (str)               : A filter on tag name.
        attrs (dict)             : A dictionary of filters on attribute values.
        recursive (bool)         : If this is ``True``, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
        text (str)               : An inner text.
        default (str)            : Default return value if the element is not found.
        strip (bool)             : Whether to use :func:`str_strip <pycharmers.utils.generic_utils.str_strip>`
        kwargs (dict)            : A dictionary of filters on attribute values.

    Returns:
        str : text

    Examples:
        >>> from bs4 import BeautifulSoup
        >>> from pycharmers.utils import find_target_text
        >>> section = BeautifulSoup(\"\"\"
        ... <h2>AAA</h2>
        ... <div> <p>aaaaaaaaaaaaaaaaaaaaaa</p></div>
        ... \"\"\")
        >>> find_target_text(soup=section, name="div")
        'aaaaaaaaaaaaaaaaaaaaaa'
        >>> find_target_text(soup=section, name="div", strip=False)
        ' aaaaaaaaaaaaaaaaaaaaaa '
        >>> find_target_text(soup=section, name="divdiv", default="not found")
        'not found'
    """
    target = soup.find(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
    if target is None:
        text = default
    else:
        text = target.text
        if strip:
            text = str_strip(string=text)
    return text

def find_all_target_text(soup, name=None, attrs={}, recursive=True, text=None, default="__NOT_FOUND__", strip=True, joint="", **kwargs):
    """Find all target elements, and join the child strings of each.

    Args:
        soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
        name (str)               : A filter on tag name.
        attrs (dict)             : A dictionary of filters on attribute values.
        recursive (bool)         : If this is ``True``, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
        text (str)               : An inner text.
        default (str)            : Default return value if no element is found.
        strip (bool)             : Whether to use :func:`str_strip <pycharmers.utils.generic_utils.str_strip>`
        joint (str)              : Inserted between target strings.
        kwargs (dict)            : A dictionary of filters on attribute values.

    Returns:
        str : text

    Examples:
        >>> from bs4 import BeautifulSoup
        >>> from pycharmers.utils import find_all_target_text
        >>> section = BeautifulSoup(\"\"\"
        ... <div>
        ...   <p class="lang en">Hello</p>
        ...   <p class="lang zh-CN">你好</p>
        ...   <p class="lang es">Hola</p>
        ...   <p class="lang fr">Bonjour</p>
        ...   <p class="lang ja">こんにちは</p>
        ... </div>
        ... \"\"\")
        >>> find_all_target_text(soup=section, name="p", class_="lang", joint=", ")
        'Hello, 你好, Hola, Bonjour, こんにちは'
        >>> find_all_target_text(soup=section, name="p", class_="es", joint=", ")
        'Hola'
    """
    targets = soup.find_all(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
    # Fall back to ``default`` when nothing matches (as documented above).
    if len(targets) == 0:
        return default
    texts = []
    for target in targets:
        text = target.text
        if strip:
            text = str_strip(string=text)
        texts.append(text)
    return joint.join(texts)

def find_target_id(soup, key, name=None, attrs={}, recursive=True, text=None, default=None, strip=True, **kwargs):
    """Find the target element, and get the value of the attribute named ``key`` from it.

    Args:
        soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
        key (str)                : The attribute name (e.g. ``"id"`` or ``"src"``).
        name (str)               : A filter on tag name.
        attrs (dict)             : A dictionary of filters on attribute values.
        recursive (bool)         : If this is ``True``, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
        text (str)               : An inner text.
        default (str)            : Default return value if the element is not found.
        strip (bool)             : Whether to use :func:`str_strip <pycharmers.utils.generic_utils.str_strip>`
        kwargs (dict)            : A dictionary of filters on attribute values.

    Returns:
        str : The attribute value.

    Examples:
        >>> from bs4 import BeautifulSoup
        >>> from pycharmers.utils import find_target_id
        >>> section = BeautifulSoup(\"\"\"
        ... <h2>IMAGE</h2>
        ... <div>
        ...   <img id="apple-touch-icon" src="https://iwasakishuto.github.io/images/apple-touch-icon/Python-Charmers.png">
        ... </div>
        ... \"\"\")
        >>> find_target_id(soup=section, name="img", key="id")
        'apple-touch-icon'
        >>> find_target_id(soup=section, name="img", key="src")
        'https://iwasakishuto.github.io/images/apple-touch-icon/Python-Charmers.png'
    """
    target = soup.find(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
    if target is None:
        id_ = default
    else:
        id_ = target.get(key=key, default=default)
        # Guard against ``None`` so ``str_strip`` is only applied to actual strings.
        if strip and id_ is not None:
            id_ = str_strip(string=id_)
    return id_

def get_soup(url, driver=None, features="lxml", timeout=1):
    """Scrape and get the page source from ``url``.

    Args:
        url (str)          : URL.
        driver (WebDriver) : Selenium webdriver. If given, the page is rendered in the browser (JavaScript included) before parsing.
        features (str)     : Desirable features of the parser to be used. This may be the name of a specific parser ("lxml", "lxml-xml", "html.parser", or "html5lib") or it may be the type of markup to be used ("html", "html5", "xml"). It's recommended that you name a specific parser, so that Beautiful Soup gives you the same results across platforms and virtual environments.
        timeout (int)      : Maximum number of seconds to wait until all elements are found (used only when ``driver`` is given).

    Returns:
        BeautifulSoup : A data structure representing a parsed HTML or XML document.
    """
    handleKeyError(lst=["lxml", "lxml-xml", "html.parser", "html5lib", "html", "html5", "xml"], features=features)
    if driver is None:
        html = requests.get(url=url).content
    else:
        from .driver_utils import scrollDown, wait_until_all_elements
        driver.get(url)
        wait_until_all_elements(driver=driver, timeout=timeout, verbose=False)
        scrollDown(driver=driver, verbose=False)
        html = driver.page_source.encode("utf-8")
    return BeautifulSoup(markup=html, features=features)
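
# Illustrative sketch (not part of the original module): fetch a page with plain
# requests (driver=None), or pass a Selenium WebDriver for pages that need
# JavaScript. The URL below is a placeholder; any reachable page works.
def _demo_get_soup():
    soup = get_soup(url="https://iwasakishuto.github.io/", features="lxml")
    print(find_target_text(soup=soup, name="title", default="no title"))
    # With a driver (optional; requires the selenium package):
    #   from selenium import webdriver
    #   with webdriver.Chrome() as driver:
    #       soup = get_soup(url="https://iwasakishuto.github.io/", driver=driver)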