# Source code for teilab.seedmatch

# coding: utf-8
import re

import pandas as pd
import requests

from .utils._config import SEEDMATCH_URL


def get_matched_mRNAs(seedseq: str) -> pd.DataFrame:
    """Get a list of mRNAs with ``seedseq`` in 3'UTR

    The query is lower-cased and sent as the ``seedseq`` form field of a POST
    request to ``SEEDMATCH_URL``. The response body is then scanned for lines of
    the form ``<name><TAB><digits>`` and every such line becomes one row.

    Args:
        seedseq (str): A query sequence. It is lower-cased before being sent.

    Returns:
        pd.DataFrame: A data frame with a column named ``"SystematicName"`` meaning Accession numbers for each mRNA and a column named ``"NumHits"`` meaning how many ``seedseq`` sequences are in its 3'UTR. Both columns hold the strings captured from the response text (no numeric conversion is performed). The frame is empty when no line of the response matches.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status code.

    Examples:
        >>> from teilab.seedmatch import get_matched_mRNAs
        >>> df_matched_mRNAs = get_matched_mRNAs(seedseq="gagttca")
        >>> print(df_matched_mRNAs.to_markdown())
        |      | SystematicName   |   NumHits |
        |-----:|:-----------------|----------:|
        |    0 | NM_001004713     |         1 |
        |    1 | NM_173860        |         1 |
        |    2 | NM_001005493     |         1 |
        |    : |     :            |         : |
        | 3643 | NM_015139        |         1 |
        | 3644 | NM_015463        |         1 |
        | 3645 | NM_007189        |         1 |
        >>> from teilab.utils import now_str
        >>> now_str()
        '2021-06-10@23.36.12'

    You can also get the data with the command like ``curl`` .

    .. code-block:: shell

        $ curl -d "seedseq=gagttca" <SEEDMATCH_URL>

    """
    ret = requests.post(url=SEEDMATCH_URL, data={"seedseq": seedseq.lower()})
    # Fail loudly on an HTTP error instead of parsing an error page into an
    # empty (or misleading) table.
    ret.raise_for_status()
    # Each match must be preceded by a newline, so text on the very first line
    # of the response is never captured. ``.`` does not cross newlines, so a
    # match always stays within a single line.
    rows = re.findall(pattern=r"\n(.+)\t([0-9]+)", string=ret.text)
    return pd.DataFrame(data=rows, columns=["SystematicName", "NumHits"])