#coding: utf-8
import re
import requests
from bs4 import BeautifulSoup
from .generic_utils import str_strip, handleKeyError
def str2soup(string):
"""Convert strings to soup, and removed extra tags such as ``<html>``, ``<body>``, and ``<head>``.
Args:
string (str) : strings
Returns:
bs4.BeautifulSoup : A data structure representing a parsed HTML or XML document.
Examples:
>>> from pycharmers.utils import str2soup
>>> string = "<title>Python-Charmers</title>"
>>> type(string)
str
>>> soup = str2soup(string)
>>> soup
<title>Python-Charmers</title>
>>> type(soup)
bs4.BeautifulSoup
>>> from bs4 import BeautifulSoup
>>> BeautifulSoup(string)
<html><head><title>Python-Charmers</title></head></html>
"""
soup = BeautifulSoup(markup=string, features="html5lib")
for attr in ["html", "body", "head"]:
if hasattr(soup, attr) and getattr(soup, attr) is not None:
getattr(soup, attr).unwrap()
return soup
def split_section(section, name=None, attrs={}, recursive=True, text=None, **kwargs):
""" Split ``bs4.BeautifulSoup``.
Args:
section (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
name (str) : A filter on tag name.
attrs (dict) : A dictionary of filters on attribute values.
recursive (bool) : If this is True, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
text (str) : An inner text.
kwargs (dict) : A dictionary of filters on attribute values.
Returns:
list : A list of the elements between matched tags, with the matched tag elements themselves included in order.
Examples:
>>> from bs4 import BeautifulSoup
>>> from pycharmers.utils import split_section
>>> section = BeautifulSoup(\"\"\"
... <section>
... <div>
... <h2>Title</h2>
... <div>
... <p>aaaaaaaaaaaaaaaaaaaaaa</p>
... <div>
... <img/>
... </div>
... <p>bbbbbbbbbbbbbbbbbbbbbb</p>
... </div>
... </div>
... </section>
>>> \"\"\")
>>> len(split_section(section, name="img"))
3
>>> split_section(section, name="img")
[<section>
<div>
<h2>Title</h2>
<div>
<p>aaaaaaaaaaaaaaaaaaaaaa</p>
<div>
</div></div></div></section>,
<img/>,
<p>bbbbbbbbbbbbbbbbbbbbbb</p>
]
"""
str_section = str(section)
page_elements = []
delimiters = section.find_all(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
# Initialization (prevents an error when the for-loop starts with ``continue`` on its first iteration (i=0))
end = 0
for i,delimiter in enumerate(delimiters):
str_delimiter = str(delimiter)
start = str_section.find(str_delimiter)
if start==-1:
continue
page_elements.append(str2soup(string=str_section[end:start]))
page_elements.append(delimiter)
end = start + len(str_delimiter)
page_elements.append(str2soup(string=str_section[end:]))
return page_elements
def group_soup_with_head(soup, name=None, attrs={}, recursive=True, text=None, **kwargs):
""" Gouping ``bs4.BeautifulSoup`` based on head.
Args:
soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
name (str) : A filter on tag name.
attrs (dict) : A dictionary of filters on attribute values.
recursive (bool) : If this is True, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
text (str) : An inner text.
kwargs (dict) : A dictionary of filters on attribute values.
Returns:
list : A list of ``<section>`` elements, each containing a matched head and the content that follows it up to the next head.
Examples:
>>> from bs4 import BeautifulSoup
>>> from pycharmers.utils import group_soup_with_head
>>> section = BeautifulSoup(\"\"\"
... <h2>AAA</h2>
... <div>
... <p>aaaaaaaaaaaaaaaaaaaaaa</p>
... </div>
... <h2>BBB</h2>
... <div>
... <p>bbbbbbbbbbbbbbbbbbbbbb</p>
... </div>
>>> \"\"\")
>>> sections = group_soup_with_head(section, name="h2")
>>> len(sections)
2
>>> sections
[<section><h2>AAA</h2><div>
<p>aaaaaaaaaaaaaaaaaaaaaa</p>
</div>
</section>,
<section><h2>BBB</h2><div>
<p>bbbbbbbbbbbbbbbbbbbbbb</p>
</div>
</section>]
"""
str_soup = str(soup)
sections = []
heads = soup.find_all(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
# Initialization (prevents an error when the for-loop starts with ``continue`` on its first iteration (i=0))
end = 0
section = BeautifulSoup(markup="", features="lxml").new_tag(name="section")
if len(heads)>0:
for i,head in enumerate(heads):
str_head = str(head)
start = str_soup.find(str_head)
if start==-1:
continue
if i>0:
body = str2soup(string=str_soup[end:start])
section.append(body)
sections.append(section)
end = start + len(str_head)
section = BeautifulSoup(markup="", features="lxml").new_tag(name="section")
section.append(head)
body = str2soup(string=str_soup[end:])
section.append(body)
sections.append(section)
return sections
def replace_soup_tag(soup,
new_name, new_namespace=None, new_nsprefix=None, new_attrs={}, new_sourceline=None,
new_sourcepos=None, new_kwattrs={},
old_name=None, old_attrs={}, old_recursive=True, old_text=None, old_limit=None, old_kwargs={}, **kwargs):
"""Replace Old tag with New tag.
- Args named ``old_XXX`` specifies "How to find old tags"
- Args named ``new_XXX`` specifies "How to create new tags"
Args:
soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
old_name (str) : A filter on tag name.
old_attrs (dict) : A dictionary of filters on attribute values.
old_recursive (bool) : If this is True, ``.find_all`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
old_text (str) : An inner text.
old_limit (int) : Stop looking after finding this many results.
old_kwargs (dict) : A dictionary of filters on attribute values.
new_name (str) : The name of the new Tag.
new_namespace (str) : The URI of the new Tag's XML namespace, if any.
new_nsprefix (str) : The prefix for the new Tag's XML namespace, if any.
new_attrs (dict) : A dictionary of this Tag's attribute values; can be used instead of ``new_kwattrs`` for attributes like 'class' that are reserved words in Python.
new_sourceline (str) : The line number where this tag was (purportedly) found in its source document.
new_sourcepos (str) : The character position within ``sourceline`` where this tag was (purportedly) found.
new_kwattrs (dict) : Keyword arguments for the new Tag's attribute values.
Returns:
bs4.BeautifulSoup : The ``soup`` with the matched tags replaced.
Examples:
>>> from bs4 import BeautifulSoup
>>> from pycharmers.utils import replace_soup_tag
>>> section = BeautifulSoup(\"\"\"
... <h2>AAA</h2>
... <div>
... <p>aaaaaaaaaaaaaaaaaaaaaa</p>
... </div>
... <h3>BBB</h3>
... <div>
... <p>bbbbbbbbbbbbbbbbbbbbbb</p>
... </div>
>>> \"\"\")
>>> section = replace_soup_tag(soup=section, old_name="h3", new_name="h2")
>>> section
<html><body><h2>AAA</h2>
<div>
<p>aaaaaaaaaaaaaaaaaaaaaa</p>
</div>
<h2>BBB</h2>
<div>
<p>bbbbbbbbbbbbbbbbbbbbbb</p>
</div>
</body></html>
"""
for old in soup.find_all(name=old_name, attrs=old_attrs, recursive=old_recursive, text=old_text, limit=old_limit, **old_kwargs):
new = BeautifulSoup(markup="", features="lxml").new_tag(name=new_name, namespace=new_namespace, nsprefix=new_nsprefix, attrs=new_attrs, sourceline=new_sourceline, sourcepos=new_sourcepos, **new_kwattrs)
new.extend(list(old.children))
old.replace_with(new)
return soup
def find_target_text(soup, name=None, attrs={}, recursive=True, text=None, default="__NOT_FOUND__", strip=True, **kwargs):
"""Find target element, and get all child strings from it.
Args:
soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
name (str) : A filter on tag name.
attrs (dict) : A dictionary of filters on attribute values.
recursive (bool) : If this is True, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
text (str) : An inner text.
default (str) : Default return value if element not found.
strip (bool) : Whether to use :func:`str_strip <pycharmers.utils.generic_utils.str_strip>`
kwargs (dict) : A dictionary of filters on attribute values.
Returns:
str : The text of the first matched element, or ``default`` if nothing matches.
Examples:
>>> from bs4 import BeautifulSoup
>>> from pycharmers.utils import find_target_text
>>> section = BeautifulSoup(\"\"\"
... <h2>AAA</h2>
... <div> <p>aaaaaaaaaaaaaaaaaaaaaa</p></div>
>>> \"\"\")
>>> find_target_text(soup=section, name="div")
'aaaaaaaaaaaaaaaaaaaaaa'
>>> find_target_text(soup=section, name="div", strip=False)
' aaaaaaaaaaaaaaaaaaaaaa '
>>> find_target_text(soup=section, name="divdiv", default="not found")
'not found'
"""
target = soup.find(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
if target is None:
text = default
else:
text = target.text
if strip:
text = str_strip(string=text)
return text
def find_all_target_text(soup, name=None, attrs={}, recursive=True, text=None, default="__NOT_FOUND__", strip=True, joint="", **kwargs):
"""Find target element, and get all child strings from it.
Args:
soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
name (str) : A filter on tag name.
attrs (dict) : A dictionary of filters on attribute values.
recursive (bool) : If this is True, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
text (str) : An inner text.
default (str) : Default return value if element not found.
strip (bool) : Whether to use :func:`str_strip <pycharmers.utils.generic_utils.str_strip>`
joint (str) : Inserted between target strings.
kwargs (dict) : A dictionary of filters on attribute values.
Returns:
str : The texts of all matched elements, joined with ``joint``, or ``default`` if nothing matches.
Examples:
>>> from bs4 import BeautifulSoup
>>> from pycharmers.utils import find_all_target_text
>>> section = BeautifulSoup(\"\"\"
... <div>
... <p class="lang en">Hello</p>
... <p class="lang zh-CN">你好</p>
... <p class="lang es">Hola</p>
... <p class="lang fr">Bonjour</p>
... <p class="lang ja">こんにちは</p>
... </div>
>>> \"\"\")
>>> find_all_target_text(soup=section, name="p", class_="lang", joint=", ")
'Hello, 你好, Hola, Bonjour, こんにちは'
>>> find_all_target_text(soup=section, name="p", class_="es", joint=", ")
'Hola'
"""
texts = []
for target in soup.find_all(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs):
text = target.text
if strip:
text = str_strip(string=text)
texts.append(text)
# Honor the documented ``default`` when no element matches.
return joint.join(texts) if len(texts) > 0 else default
def find_target_id(soup, key, name=None, attrs={}, recursive=True, text=None, default=None, strip=True, **kwargs):
"""Find target element, and get id from it.
Args:
soup (bs4.BeautifulSoup) : A data structure representing a parsed HTML or XML document.
key (str) : An attribute name (e.g. ``"id"`` or ``"src"``).
name (str) : A filter on tag name.
attrs (dict) : A dictionary of filters on attribute values.
recursive (bool) : If this is True, ``.find`` will perform a recursive search of this PageElement's children. Otherwise, only the direct children will be considered.
text (str) : An inner text.
default (str) : Default return value if element not found.
strip (bool) : Whether to use :func:`str_strip <pycharmers.utils.generic_utils.str_strip>`
kwargs (dict) : A dictionary of filters on attribute values.
Returns:
str : The attribute value, or ``default`` if the element or attribute is not found.
Examples:
>>> from bs4 import BeautifulSoup
>>> from pycharmers.utils import find_target_id
>>> section = BeautifulSoup(\"\"\"
... <h2>IMAGE</h2>
... <div>
... <img id="apple-touch-icon" src="https://iwasakishuto.github.io/images/apple-touch-icon/Python-Charmers.png">
... </div>
>>> \"\"\")
>>> find_target_id(soup=section, name="img", key="id")
'apple-touch-icon'
>>> find_target_id(soup=section, name="img", key="src")
'https://iwasakishuto.github.io/images/apple-touch-icon/Python-Charmers.png'
"""
target = soup.find(name=name, attrs=attrs, recursive=recursive, text=text, **kwargs)
if target is None:
id_ = default
else:
id_ = target.get(key=key, default=default)
if strip and id_ is not None:
    id_ = str_strip(string=id_)
return id_
def get_soup(url, driver=None, features="lxml", timeout=1):
""" Scrape and get page source from ``url``.
Args:
url (str) : URL.
driver (WebDriver) : A Selenium webdriver. If ``None``, the page is fetched with ``requests``.
features (str) : Desirable features of the parser to be used. This may be the name of a specific parser ("lxml", "lxml-xml", "html.parser", or "html5lib") or it may be the type of markup to be used ("html", "html5", "xml"). It's recommended that you name a specific parser, so that Beautiful Soup gives you the same results across platforms and virtual environments.
timeout (int) : The number of seconds to wait for all elements to load (used only when ``driver`` is given).
Returns:
BeautifulSoup : A data structure representing a parsed HTML or XML document.
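Examples:
>>> # Illustrative sketch only: requires network access, and the URL is just an example.
>>> from pycharmers.utils import get_soup
>>> soup = get_soup(url="https://iwasakishuto.github.io/")
>>> type(soup)
bs4.BeautifulSoup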
"""
handleKeyError(lst=["lxml", "lxml-xml", "html.parser", "html5lib", "html", "html5", "xml"], features=features)
if driver is None:
html = requests.get(url=url).content
else:
from .driver_utils import scrollDown, wait_until_all_elements
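# Render the page with Selenium: wait until the page's elements are loaded,
# scroll down to trigger lazy loading, then take the final page source.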
driver.get(url)
wait_until_all_elements(driver=driver, timeout=timeout, verbose=False)
scrollDown(driver=driver, verbose=False)
html = driver.page_source.encode("utf-8")
return BeautifulSoup(markup=html, features=features)