# coding: utf-8
"""This module handles the datasets used in the lecture. The table below describes the meaning of the values in each column of the data used in the lecture. If you want to go straight to the classes and methods, click :py:class:`HERE <teilab.datasets.Samples>` to skip the table.
Reference: `Agilent Feature Extraction 12.0 Reference Guide <https://www.agilent.com/cs/library/usermanuals/public/G4460-90052_FE_RefGuide.pdf>`_
########
FEATURES
########
.. code-block:: python
>>> from teilab.datasets import TeiLabDataSets
>>> datasets = TeiLabDataSets(verbose=True)
>>> df_data = datasets.read_data(no=0)
>>> df_data.columns
Index(['FEATURES', 'FeatureNum', 'Row', 'Col', 'accessions', ... ], dtype='object')
+--------+--------+------------------------------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| G2505C | G2600D | Features | Types | Description |

| 1 | 1 | ``FeatureNum`` | int | Feature number. |
+--------+--------+------------------------------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 2 | 2 | ``Row`` | int | Feature location: row. |

| 3 | 3 | ``Col`` | int | Feature location: column. |

| 4 | | ``accessions`` | str | Gene accession numbers. |

| 5 | | ``chr_coord`` | str | Chromosome coordinates of the feature. |
+--------+--------+------------------------------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 6 | 4 | ``SubTypeMask`` | int | Numeric code defining the subtype of any control feature. |

| 7 | | ``SubTypeName`` | str | Name of the subtype of any control feature. |

| 8 | | ``Start`` | int | Indicates the place in the transcript where the probe sequence starts. |

| 9 | | ``Sequence`` | str | The sequence of bases printed on the array. |

| 10 | | ``ProbeUID`` | int | Unique integer for each unique probe in a design. |

| 11 | 5 | ``ControlType`` | int | Feature control type. |
+--------+--------+------------------------------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 12 | 6 | ``ProbeName`` | str | An Agilent-assigned identifier for the probe synthesized on the microarray. |
+--------+--------+------------------------------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 13 | | ``GeneName`` | str | This is an identifier for the gene for which the probe provides expression information. The target sequence identified by the systematic name is normally a representative or consensus sequence for the gene. |
+--------+--------+------------------------------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 14 | 7 | ``SystematicName`` | str | This is an identifier for the target sequence that the probe was designed to hybridize with. Where possible, a public database identifier is used (e.g., TAIR locus identifier for Arabidopsis). ``SystematicName`` is reported ONLY if Gene name and Systematic name are different. |

| 15 | | ``Description`` | str | Description of gene. |

| 16 | 8 | ``PositionX`` | float | Found coordinates (X) of the feature centroid in microns. |

| 17 | 9 | ``PositionY`` | float | Found coordinates (Y) of the feature centroid in microns. |

| 18 | | ``gSurrogateUsed`` | float | The g surrogate value used. A non-zero value means that this surrogate value was used; ``0`` means that no surrogate value was used. |

| 19 | | ``gIsFound`` | bool | A boolean used to flag found features. The flag is applied independently in each channel. (``1`` = IsFound, ``0`` = IsNotFound) A feature is considered Found if two conditions are true: 1) the difference between the feature signal and the local background signal is more than 1.5 times the local background noise and 2) the spot diameter is at least 0.30 times the nominal spot diameter. |

| 20 | 10 | ``gProcessedSignal`` | float | The signal left after all the Feature Extraction processing steps have been completed. In the case of one color, ``ProcessedSignal`` contains the Multiplicatively Detrended BackgroundSubtracted Signal if the detrending is selected and helps. If the detrending does not help, this column will contain the ``BackgroundSubtractedSignal``. |

| 21 | 11 | ``gProcessedSigError`` | float | The universal or propagated error left after all the processing steps of Feature Extraction have been completed. In the case of one color, ``ProcessedSignalError`` has had the Error Model applied and will contain at least the larger of the universal (UEM) error or the propagated error. If multiplicative detrending is performed, ``ProcessedSignalError`` contains the error propagated from detrending. This is done by dividing the error by the normalized ``MultDetrendSignal``. |

| 22 | | ``gNumPixOLHi`` | int | Number of outlier pixels per feature with intensity > upper threshold set via the pixel outlier rejection method. The number is computed independently in each channel. These pixels are omitted from all subsequent calculations. |

| 23 | | ``gNumPixOLLo`` | int | Number of outlier pixels per feature with intensity < lower threshold set via the pixel outlier rejection method. The number is computed independently in each channel. These pixels are omitted from all subsequent calculations. NOTE: The pixel outlier method is the ONLY step that removes data in Feature Extraction. |

| 24 | | ``gNumPix`` | int | Total number of pixels used to compute feature statistics; i.e. total number of inlier pixels/per spot; same in both channels. |

| 25 | | ``gMeanSignal`` | float | Raw mean signal of feature from inlier pixels in green and/or red channel. |

| 26 | 12 | ``gMedianSignal`` | float | Raw median signal of feature from inlier pixels in green and/or red channel. |

| 27 | | ``gPixSDev`` | float | Standard deviation of all inlier pixels per feature; this is computed independently in each channel. |

| 28 | | ``gPixNormIQR`` | float | The normalized Inter-quartile range of all of the inlier pixels per feature. The range is computed independently in each channel. |

| 29 | | ``gBGNumPix`` | int | Total number of pixels used to compute local BG statistics per spot; i.e. total number of BG inlier pixels; same in both channels. |

| 30 | 25 | ``gBGMeanSignal`` | float | Mean local background signal (local to corresponding feature) computed per channel (inlier pixels). |

| 31 | 13 | ``gBGMedianSignal`` | float | Median local background signal (local to corresponding feature) computed per channel (inlier pixels). |

| 32 | 14 | ``gBGPixSDev`` | float | Standard deviation of all inlier pixels per local BG of each feature, computed independently in each channel. |

| 33 | | ``gBGPixNormIQR`` | float | The normalized Inter-quartile range of all of the inlier pixels per local BG of each feature. The range is computed independently in each channel. |

| 34 | | ``gNumSatPix`` | int | Total number of saturated pixels per feature, computed per channel. |

| 35 | 15 | ``gIsSaturated`` | bool | Boolean flag indicating if a feature is saturated or not. A feature is saturated if 50% of the pixels in the feature are above the saturation threshold. (``1`` = Saturated, ``0`` = Not saturated). |

| 36 | 16 | ``gIsFeatNonUnifOL`` | bool | Boolean flag indicating if a feature is a NonUniformity Outlier or not. A feature is non-uniform if the pixel noise of feature exceeds a threshold established for a "uniform" feature. ``g(r)IsFeatNonUnifOL`` = ``1`` indicates Feature is a non-uniformity outlier in g(r). |

| 37 | 17 | ``gIsBGNonUnifOL`` | bool | The same concept as above but for background. ``g(r)IsBGNonUnifOL`` = ``1`` indicates Local background is a non-uniformity outlier in g(r). |

| 38 | 18 | ``gIsFeatPopnOL`` | bool | Boolean flag indicating if a feature is a Population Outlier or not. Probes with replicate features on a microarray are examined using population statistics. A feature is a population outlier if its signal is less than a lower threshold or exceeds an upper threshold determined using a multiplier (1.42) times the interquartile range (i.e., IQR) of the population. ``g(r)IsFeatPopnOL`` = ``1`` indicates Feature is a population outlier in g(r). |

| 39 | 19 | ``gIsBGPopnOL`` | bool | The same concept as above but for background. ``g(r)IsBGPopnOL`` = ``1`` indicates local background is a population outlier in g(r). |

| 40 | 20 | ``IsManualFlag`` | bool | Boolean to flag features for downstream filtering in third party gene expression software. |

| 41 | 21 | ``gBGSubSignal`` | float | Background-subtracted signal. To display the values used to calculate this variable using different background signals and settings of spatial detrend and global background adjust, see `Table 34 on page 254 <https://www.agilent.com/cs/library/usermanuals/public/G4460-90052_FE_RefGuide.pdf>`_ . ``g(r)BGSubSignal`` = ``g(r)MeanSignal`` - ``g(r)BGUsed``. |
+--------+--------+------------------------------------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 42 | | ``gBGSubSigError`` | float | Propagated standard error as computed on net g(r) background-subtracted signal. For one color, the error model is applied to the background-subtracted signal. This will contain the larger of the universal (UEM) error or the propagated error. |

| 43 | 22 | ``gIsPosAndSignif`` | bool | Boolean flag, established via a 2-sided t-test, indicates if the mean signal of a feature is greater than the corresponding background (selected by user) and if this difference is significant. ``g(r)isPosAndSignif`` = 1 indicates Feature is positive and significant above background. |

| 44 | | ``gPValFeatEqBG`` | float | pValue from t-test of significance between g(r)Mean signal and g(r) background (selected by user). |

| 45 | | ``gNumBGUsed`` | int | Number of local background regions or features used to calculate the background used for background subtraction on this feature. |

| 46 | 23 | ``gIsWellAboveBG`` | bool | Boolean flag indicating if a feature is WellAbove Background or not, feature passes ``g(r)IsPosAndSignif`` and additionally the g(r)BGSubSignal is greater than ``2.6*g(r)BG_SD``. You can change the multiplier ``2.6``. |

| 47 | | ``gBGUsed`` | float | Background used to subtract from the MeanSignal; variable also used in t-test. To display the values used to calculate this variable using different background signals and settings of spatial detrend and global background adjust, see `Table 34 on page 254 <https://www.agilent.com/cs/library/usermanuals/public/G4460-90052_FE_RefGuide.pdf>`_ . ``g(r)BGSubSignal`` = ``g(r)MeanSignal`` - ``g(r)BGUsed``. |

| 48 | | ``gBGSDUsed`` | float | Standard deviation of background used in g(r) channel; variable also used in t-test and surrogate algorithms. To display the values used to calculate this variable using different background signals and settings of spatial detrend and global background adjust, see `Table 34 on page 254 <https://www.agilent.com/cs/library/usermanuals/public/G4460-90052_FE_RefGuide.pdf>`_ . |

| 49 | | ``ErrorModel`` | bool | Indicates the error model that you chose for Feature Extraction or that the software uses if you have chosen the "Most Conservative" option. ``0`` = Propagated model chosen by you or by software. ``1`` = Universal error model chosen by you or by software. |

| 50 | | ``gSpatialDetrendIsInFilteredSet`` | bool | Set to true for a given feature if it is part of the filtered set used to detrend the background. This feature is considered part of the locally weighted lowest x% of features as defined by the DetrendLowPassPercentage. ``1`` = Feature in filtered set. ``0`` = Feature not in filtered set. |

| 51 | | ``gSpatialDetrendSurfaceValue`` | float | Value of the smoothed surface calculated by the Spatial detrend algorithm. |

| 52 | 24 | ``SpotExtentX`` | float | Diameter of the spot (X-axis). |

| 53 | | ``SpotExtentY`` | float | Diameter of the spot (Y-axis). |

| 54 | | ``gNetSignal`` | float | ``MeanSignal`` minus ``DarkOffset``. |

| 55 | | ``gMultDetrendSignal`` | float | A surface is fitted through the log of the background-subtracted signal to look for multiplicative gradients. A normalized version of that surface interpolated at each point of the microarray is stored in ``MultDetrendSignal``. The surface is normalized by dividing each point by the overall average of the surface. That average is stored in ``MultDetrendSurfaceAverage`` as a statistic. 1-color only. |

| 56 | | ``gProcessedBackground`` | float | Indicates the Background signal that was selected to be used (Mean or Median). |

| 57 | | ``gProcessedBkngError`` | float | Indicates the Background error that was selected to be used (PixSD or NormIQR) . |

| 58 | | ``IsUsedBGAdjust`` | bool | A Boolean used to flag features used for computation of global BG offset. ``1`` = Feature used. ``0`` = Feature not used. |

| 59 | | ``gInterpolatedNegCtrlSub`` | float | Value at the polynomial fit of the negative controls. |

| 60 | | ``gIsInNegCtrlRange`` | bool | Set to true for a given feature if its signal intensity is in the negative control range. |

| 61 | | ``gIsUsedInMD`` | bool | Indicates whether this feature was included in the set used to generate the multiplicative detrend surface. |

**********************
Background Subtraction
**********************
The feature background-subtracted signal, ``BGSubSignal``, is calculated by subtracting a value called the ``BGUsed`` from the feature mean signal.
.. math::
\\text{ BGSubSignal } = \\text{ MeanSignal } - \\text{ BGUsed }
where ``BGSubSignal`` and ``BGUsed`` depend on the type of background method and the settings for spatial detrend and global background adjust. See the following table.
+-------------------------------+---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| Background Subtraction Method | Background Subtraction Variable | SpDe OFF / GBA OFF | SpDe ON / GBA OFF | SpDe OFF / GBA ON | SpDe ON / GBA ON |
+===============================+=================================+=============================+================================+=================================+=============================================+
| No background subtract | ``BGUsed`` = | ``BGMeanSignal`` | ``SDSV`` | ``BGAdjust`` | ``SDSV`` + ``BGAdjust`` |
+ +---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| | ``BGSDUsed`` = | ``BGPixSDev`` | ``BGPixSDev`` | ``BGPixSDev`` | ``BGPixSDev`` |
+ +---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| | ``BGSubSignal`` = | ``MeanSignal`` | ``MeanSignal`` - ``BGUsed`` | ``MeanSignal`` - ``BGUsed`` | ``MeanSignal`` - ``BGUsed`` |
+-------------------------------+---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| Local Background | ``BGUsed`` = | ``BGMeanSignal`` | ``BGMeanSignal`` + ``SDSV`` | ``BGMeanSignal`` + ``BGAdjust`` | ``BGMeanSignal`` + ``SDSV`` + ``BGAdjust`` |
+ +---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| | ``BGSDUsed`` = | ``BGPixSDev`` | ``BGPixSDev`` | ``BGPixSDev`` | ``BGPixSDev`` |
+ +---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| | ``BGSubSignal`` = | ``MeanSignal`` - ``BGUsed`` | ``MeanSignal`` - ``BGUsed`` | ``MeanSignal`` - ``BGUsed`` | ``MeanSignal`` - ``BGUsed`` |
+-------------------------------+---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| Global Background method | ``BGUsed`` = | ``GBGIA`` | ``GBGIA`` + ``SDSV`` | ``GBGIA`` + ``BGAdjust`` | ``GBGIA`` + ``SDSV`` + ``BGAdjust`` |
+ +---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| | ``BGSDUsed`` = | ``GBGISD`` | ``GBGISD`` | ``GBGISD`` | ``GBGISD`` |
+ +---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
| | ``BGSubSignal`` = | ``MeanSignal`` - ``BGUsed`` | ``MeanSignal`` - ``BGUsed`` | ``MeanSignal`` - ``BGUsed`` | ``MeanSignal`` - ``BGUsed`` |
+-------------------------------+---------------------------------+-----------------------------+--------------------------------+---------------------------------+---------------------------------------------+
- SpDe: Spatial Detrend
- SDSV: ``SpatialDetrendSurfaceValue``
- GBA: Global Bkgnd Adjust
- GBGIA: ``GlobalBGInlierAve``
- GBGISD: ``GlobalBGInlierSDev``
.. code-block:: python
>>> from teilab.datasets import TeiLabDataSets
>>> datasets = TeiLabDataSets(verbose=False)
>>> df_meta = datasets.read_meta(no=-1)
>>> df_meta["BGSubtractor_BGSubMethod"].values[0]
7
The BGSubMethod of ``7`` corresponds to the "No Background Subtraction" method (see `Table 17 on page 129 <https://www.agilent.com/cs/library/usermanuals/public/G4460-90052_FE_RefGuide.pdf>`_ ).
- Global Background Adjustment is turned Off.
- Spatial Detrending is turned On.
.. code-block:: python
>>> from teilab.datasets import TeiLabDataSets
>>> datasets = TeiLabDataSets(verbose=False)
>>> df_data = datasets.read_data(no=-1)
>>> all(df_data["gBGUsed"] == df_data["gSpatialDetrendSurfaceValue"])
True
>>> all(abs((df_data["gBGSDUsed"]-df_data["gBGPixSDev"])/df_data["gBGSDUsed"])<1e-3)
True
>>> all(abs((df_data["gBGSubSignal"]-(df_data["gMeanSignal"]-df_data["gBGUsed"]))/df_data["gBGSubSignal"]) < 1.4e-1)
True
"""
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import pandas as pd
import requests
from nptyping import NDArray
from requests import Response
from tabulate import tabulate
from .utils._config import GAS_WEBAPP_URL
from .utils._path import DATA_DIR, SAMPLE_LIST_PATH
from .utils.download_utils import decide_downloader
from .utils.generic_utils import verbose2print
class Samples:
    """Utility class that holds the sample list used in this lecture.

    Every column of the CSV file at ``sample_list_path`` is exposed as an
    instance attribute holding that column's values, and the samples are
    grouped by the prefix of their file names (see :meth:`grouping`).

    Args:
        sample_list_path (str) : Path to the CSV file that lists the samples.

    Attributes:
        _df (pd.DataFrame)                     : Sample information described in text file at ``sample_list_path`` .
        SampleNumber (NDArray[Any,str])        : Index numbers for each sample.
        FileName (NDArray[Any,str])            : File names for each sample.
        Condition (NDArray[Any,str])           : Experimental conditions for each sample.
        _group_names (NDArray[Any,str])        : ``i-th`` group's filename prefix.
        _group_numbers (NDArray[Any,np.uint8]) : Which group (``no``) the ``j-th`` sample belongs to.

    Examples:
        >>> from teilab.datasets import Samples
        >>> from teilab.utils._path import SAMPLE_LIST_PATH
        >>> samples = Samples(sample_list_path=SAMPLE_LIST_PATH)
        >>> print(sorted(samples.__dict__.keys()))
        ['Condition', 'FileName', 'SampleNumber', '_df', '_group_names', '_group_numbers']
    """

    def __init__(self, sample_list_path: str):
        self._df = pd.read_csv(sample_list_path)
        # Expose every CSV column (e.g. ``FileName``) as an attribute of the same name.
        for col in self._df.columns:
            setattr(self, col, self._df[col].values)
        self.grouping()

    def grouping(self) -> None:
        """Group the samples based on their filenames.

        The group name of a sample is its file name with the last three
        ``_``-separated fields removed. Groups are numbered in order of first
        appearance, and the results are stored in ``_group_names`` and
        ``_group_numbers``.
        """
        # dict keeps insertion order, so it doubles as the ordered list of group names.
        group_index: Dict[str, int] = {}
        group_numbers: List[int] = []
        for fn in self.FileName:
            gn: str = "_".join(fn.split("_")[:-3])
            group_numbers.append(group_index.setdefault(gn, len(group_index)))
        self._group_names: NDArray[Any, str] = np.asarray(list(group_index))
        # NOTE: ``np.uint8`` can only represent group numbers 0-255.
        self._group_numbers: NDArray[Any, np.uint8] = np.asarray(group_numbers, dtype=np.uint8)

    @property
    def groups(self) -> List[list]:
        """One row ``[idx, Condition, gn, GroupName, FileName]`` per sample."""
        return [
            [i, cnd, gn, self._group_names[gn], fn]
            for i, (gn, fn, cnd) in enumerate(zip(self._group_numbers, self.FileName, self.Condition))
        ]

    def show_groups(self, tablefmt: str = "simple") -> None:
        """Show groups neatly.

        Prints :attr:`groups` as a table with the columns ``idx``, ``Condition``,
        ``gn``, ``GroupName`` and ``FileName``.

        Args:
            tablefmt (str, optional) : Table formats. Please choose from [``"plain"``, ``"simple"``, ``"grid"``, ``"pipe"``, ``"orgtbl"``, ``"rst"``, ``"mediawiki"``, ``"latex"``, ``"latex_raw"``, ``"latex_booktabs"``, ``"latex_longtable"``, ``"tsv"``] . Defaults to ``"simple"``.

        Examples:
            >>> from teilab.datasets import Samples
            >>> from teilab.utils._path import SAMPLE_LIST_PATH
            >>> samples = Samples(sample_list_path=SAMPLE_LIST_PATH)
            >>> samples.show_groups()
              idx  Condition      gn  GroupName                              FileName
            -----  -----------  ----  -------------------------------------  ---------------------------------------------------
                0  (condition)     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_1.txt
                1  (condition)     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_2.txt
                :  :               :  :                                      :
               11  (condition)     1  US91503671_253949442637_S01_GE1_105    US91503671_253949442637_S01_GE1_105_Dec08_1_4.txt
               12  (condition)     1  US91503671_253949442637_S01_GE1_105    US91503671_253949442637_S01_GE1_105_Dec08_2_2.txt
        """
        print(
            tabulate(
                tabular_data=self.groups,
                tablefmt=tablefmt,
                headers=["idx", "Condition", "gn", "GroupName", "FileName"],
            )
        )

    def get_group_numbers(self, group_no: Optional[int] = None, group_name: Optional[str] = None) -> List[int]:
        """Get the indices of the samples that belong to the specified group.

        Args:
            group_no (Optional[int], optional)   : Group number to look up. Defaults to ``None``.
            group_name (Optional[str], optional) : Group name to look up. When given, it takes precedence over ``group_no``. Defaults to ``None``.

        Returns:
            List[int]: Indices of the samples in the group. Empty when neither argument is given or no sample matches ``group_no``.

        Raises:
            ValueError: When ``group_name`` is not one of the known group names.

        Examples:
            >>> from teilab.datasets import Samples
            >>> from teilab.utils._path import SAMPLE_LIST_PATH
            >>> samples = Samples(sample_list_path=SAMPLE_LIST_PATH)
            >>> samples._group_names.tolist()
            ['SG19378659_257236339458_S001_GE1_1200', 'US91503671_253949442637_S01_GE1_105']
            >>> samples._group_numbers.tolist()
            [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
            >>> samples.get_group_numbers(0)
            [0, 1, 2, 3, 4]
            >>> samples.get_group_numbers(group_name="US91503671_253949442637_S01_GE1_105")
            [5, 6, 7, 8, 9, 10, 11, 12]
        """
        if group_name is not None:
            # ``_group_names`` is a NumPy array, which has no ``.index`` method;
            # convert to a list first.
            group_no = self._group_names.tolist().index(group_name)
        if group_no is None:
            return []
        return [i for i, gn in enumerate(self._group_numbers) if gn == group_no]
[docs]class TeiLabDataSets:
"""Utility Datasets Class for this lecture.
Args:
verbose (bool, optional) : Whether print verbose or not. Defaults to ``True``.
Attributes:
verbose (bool) : Whether print verbose or not. Defaults to ``True``.
print (callable) : Print function.
samples (Samples) : Datasets Samples.
root (Path) : Root Directory for Datasets. ( ``DATA_DIR`` )
Examples:
>>> from teilab.datasets import TeiLabDataSets
>>> datasets = TeiLabDataSets(verbose=False)
There are not enough datasets. Use ``.get_data`` to prepare all the required datasets.
>>> datasets.satisfied
False
>>> datasets.get_data(password="PASSWORD1")
>>> datasets.get_data(password="PASSWORD2")
>>> datasets.satisfied
True
"""
# Class-level constants. The ``#:`` comments are picked up by Sphinx as attribute documentation.
TARGET_GeneName: str = "VIM" #: TARGET_GeneName (str) ``GeneName`` of the target RNA (vimentin)
TARGET_SystematicName: str = (
"NM_003380" #: TARGET_SystematicName (str) ``SystematicName`` of the target RNA (vimentin)
)
# Columns treated as annotation (as opposed to expression values).
ANNO_COLNAMES: List[str] = [
"FeatureNum",
"ProbeName",
"SystematicName",
] #: ANNO_COLNAMES (List[str]) Column names for annotation.
TARGET_COLNAME: str = "gProcessedSignal" #: TARGET_COLNAME (str) Column name for expression data.
def __init__(self, verbose: bool = True):
    """Store the verbosity flag, build the matching print function, then run :meth:`init`.

    Args:
        verbose (bool, optional) : Whether print verbose or not. Defaults to ``True``.
    """
    printer: callable = verbose2print(verbose=verbose)
    self.verbose = verbose
    self.print = printer
    self.init()
def init(self):
    """Load the sample list, set the data root directory, and report missing datasets."""
    self.samples: Samples = Samples(SAMPLE_LIST_PATH)
    self.root: Path = Path(DATA_DIR)
    if self.satisfied:
        # Every listed data file is present; nothing to report.
        return
    self.print(f"There are not enough datasets. Use ``.get_data`` to prepare all the required datasets.")
def get_data(self, password: str) -> str:
    """Get data which is necessary for this lecture.

    The password is posted to the GAS web app at ``GAS_WEBAPP_URL``; the JSON
    response carries the data URL (``dataURL``) and a ``message``. If a URL is
    returned and the file is not already present, it is downloaded with the
    downloader chosen by ``decide_downloader``.

    Args:
        password (str) : Password. (Because some data are unpublished.)

    Returns:
        str: The path to the downloaded file, or ``""`` when no valid URL was returned for ``password``.

    Examples:
        >>> from teilab.datasets import TeiLabDataSets
        >>> datasets = TeiLabDataSets()
        >>> path = datasets.get_data(password="PASSWORD")
        Try to get data from <SECRET_URL>
        This is our unpublished data, so please treat it confidential.
        [Download] URL: <SECRET_URL>
        * Content-Encoding : None
        * Content-Length   : 45.9 [MB]
        * Content-Type     : application/zip
        * Save Destination : PATH/TO/PASSWORD.zip
        ===== Progress =====
        <SECRET FILENAME>   100.0%[####################] 45.3[s] 1.0[MB/s] eta -0.0[s]
        Save data at PATH/TO/PASSWORD.zip
        [Unzip] Show file contents:
        * <SECRET_FILE_1>
        * <SECRET_FILE_2>
        * :
        * <SECRET_FILE_N>
        >>> path
        'PATH/TO/PASSWORD.zip'

    Below is the code for the GAS(Google Apps Script) API server.

    .. code-block:: js

        const P = PropertiesService.getScriptProperties();
        const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(P.getProperty("sheetname"))
        const values = sheet.getRange("A2:C").getValues();
        var Password2dataURL = {};
        for (let i=0; i<values.length; i++){
          Password2dataURL[values[i][0]] = values[i].slice(1);
        }
        function doPost(e) {
          var password = e.parameter.password;
          var response = {
            message : "Invalid Password",
            dataURL : "",
            password : password
          };
          if (password in Password2dataURL){
            let data = Password2dataURL[password]
            response.dataURL = data[0]
            response.message = data[1]
          }
          var output = ContentService.createTextOutput();
          output.setMimeType(ContentService.MimeType.JSON);
          output.setContent(JSON.stringify(response));
          return output;
        }

    You can also get the data with the command like ``curl`` .

    .. code-block:: shell

        $ curl -L <GAS_WEBAPP_URL> \\
               -d password=<PASSWORD> \\
               -H "Content-Type: application/x-www-form-urlencoded"
    """
    # Get the target data URL.
    response: Response = requests.post(url=GAS_WEBAPP_URL, data={"password": password})
    data: Dict[str, str] = response.json()
    dataURL: str = data.get("dataURL", "")
    message: str = data.get("message", "")
    if not dataURL:
        # Covers both an empty string and a missing/null URL.
        self.print(f"Could not get the valid URL.\n{message}")
        return ""

    self.print(f"Try to get data from {dataURL}\n{message}")
    # The URL is confidential, so it only goes through the verbosity-aware printer.
    self.print(f"Data URL: {dataURL}")
    # One downloader serves both the path preparation and the download itself.
    downloader = decide_downloader(url=dataURL)
    prepared: Tuple[str, str] = downloader.prepare_for_download(
        url=dataURL, basename=password, dirname=DATA_DIR, verbose=False
    )
    # The second item of the prepared pair is used as the save path.
    path = prepared[1]
    if os.path.exists(path):
        self.print("Data already exists, so do nothing here.")
    else:
        # Use the specific ``Downloader`` to download the target data.
        path = downloader.download_file(url=dataURL, path=path, verbose=self.verbose, expand=True)
        self.print(f"Saved data at {path}")
    self.samples.grouping()
    return path
def get_filePaths(self) -> NDArray[Any, Path]:
    """Get the path list of files used in the lecture.

    Searches ``self.root`` recursively for ``*.txt`` files whose names appear
    in ``self.samples.FileName`` and returns them in sample-list order.

    Returns:
        NDArray[Path]: The path lists for datasets.

    Examples:
        >>> from teilab.datasets import TeiLabDataSets
        >>> datasets = TeiLabDataSets()
        >>> filelists = datasets.get_filePaths()
        >>> len(filelists)
        13
        >>> filelists[0].name
        'US91503671_253949442637_S01_GE1_105_Dec08_1_1.txt'
    """
    # Map each expected file name to its position in the sample list. The first
    # occurrence wins (same result as ``list.index``), but lookups are O(1).
    order: Dict[str, int] = {}
    for position, name in enumerate(self.samples.FileName.tolist()):
        order.setdefault(name, position)
    found = [path for path in self.root.glob("**/*.txt") if path.name in order]
    found.sort(key=lambda path: order[path.name])
    return np.asarray(found)
@property
def filePaths(self) -> NDArray[Any, Path]:
    """The path lists for datasets, recomputed by :meth:`get_filePaths` on every access."""
    paths = self.get_filePaths()
    return paths
@property
def satisfied(self) -> bool:
    """Whether there are as many entries in ``filePaths`` as names in ``samples.FileName``."""
    n_available = len(self.filePaths)
    n_required = len(self.samples.FileName)
    return n_available == n_required
def read(
    self,
    no: Union[int, str, List[int]],
    sep: Optional[str] = "\t",
    header: Union[int, List[int], str] = "infer",
    nrows: Optional[int] = None,
    usecols: Optional[Union[List[str], callable]] = None,
    **kwargs,
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
    """Read sample(s) data as ``pd.DataFrame``

    Args:
        no (Union[int,str,List[int]])                           : Target sample number(s) or ``"all"`` . A ``tuple`` or ``range`` of numbers is treated like a list.
        sep (Optional[str], optional)                           : Delimiter to use. Defaults to ``"\\t"``
        header (Union[int,List[int],str], optional)             : Row number(s) to use as the column names, and the start of the data. Defaults to ``"infer"``.
        nrows (Optional[int], optional)                         : Number of rows of file to read. Useful for reading pieces of large files. Defaults to ``None``.
        usecols (Optional[Union[List[str],callable]], optional) : Return a subset of the columns. Defaults to ``None``.
        **kwargs (dict)                                         : Other keyword arguments for ``pd.read_csv`` .

    Raises:
        TypeError: When argument ``no`` is a string other than ``"all"`` .

    Returns:
        Union[pd.DataFrame, List[pd.DataFrame]]: A single DataFrame when ``no`` is one number, otherwise a list with one DataFrame per requested sample.

    Examples:
        >>> from teilab.datasets import TeiLabDataSets
        >>> datasets = TeiLabDataSets(verbose=False)
        >>> dfs = datasets.read(no="all", header=9)
        >>> len(dfs)
        13
        >>> type(dfs[0])
        pandas.core.frame.DataFrame
    """
    # The explicit parameters always override same-named entries in ``kwargs``.
    kwargs.update({"sep": sep, "header": header, "nrows": nrows, "usecols": usecols})
    if isinstance(no, str):
        if no != "all":
            raise TypeError("Please specify the sample number.")
        # ``"all"`` expands to every available file.
        no = list(range(len(self.filePaths)))
    if isinstance(no, (list, tuple, range)):
        return [self._read_csv(no=n, **kwargs) for n in no]
    return self._read_csv(no=no, **kwargs)
def _read_csv(self, no: int, **kwargs) -> pd.DataFrame:
"""Read the sample as ``pd.DataFrame``
Args:
no (int) : Target sample number.
kwargs (dict) : Keyword arguments for ``pd.read_csv`` .
Returns:
pd.DataFrame: DataFrame of the specified sample.
Examples:
>>> from teilab.datasets import TeiLabDataSets
>>> datasets = TeiLabDataSets(verbose=False)
>>> df = datasets._read_csv(no=0, header=9)
>>> len(df)
62976
>>> type(df)
pandas.core.frame.DataFrame
"""
filepath = self.filePaths[no]
self.print(f"Read data from '{filepath.relative_to(self.root)}'")
return pd.read_csv(filepath_or_buffer=filepath, **kwargs)
def read_data(self, no: Union[int, str, List[int]], **kwargs) -> Union[pd.DataFrame, List[pd.DataFrame]]:
    """Read sample(s) 'expression' data as ``pd.DataFrame``

    Delegates to :meth:`read` with ``header=9``; any other keyword argument is
    forwarded unchanged.

    Args:
        no (Union[int,str,List[int]]) : Target sample number(s) or ``"all"`` .

    Returns:
        Union[pd.DataFrame, List[pd.DataFrame]]: DataFrame of the specified sample(s) 'expression' data.

    Examples:
        >>> from teilab.datasets import TeiLabDataSets
        >>> datasets = TeiLabDataSets(verbose=False)
        >>> dfs = datasets.read_data(no=[0,1,2])
        >>> len(dfs)
        3
        >>> len(dfs[0])
        62976
    """
    # Header row passed to ``read`` for the expression table.
    header_row = 9
    frames = self.read(no=no, header=header_row, **kwargs)
    return frames
def read_summary(self, no: Union[int, str, List[int]], **kwargs) -> Union[pd.DataFrame, List[pd.DataFrame]]:
    """Read sample(s) 'summary' data as ``pd.DataFrame``

    Delegates to :meth:`read` with ``header=5`` and ``nrows=1``; any other
    keyword argument is forwarded unchanged.

    Args:
        no (Union[int,str,List[int]]) : Target sample number(s) or ``"all"`` .

    Returns:
        Union[pd.DataFrame, List[pd.DataFrame]]: DataFrame of the specified sample(s) 'summary' data.

    Examples:
        >>> from teilab.datasets import TeiLabDataSets
        >>> datasets = TeiLabDataSets(verbose=False)
        >>> dfs = datasets.read_summary(no=[0,3,8,9])
        >>> len(dfs)
        4
        >>> len(dfs[0])
        1
    """
    # Header row and row count passed to ``read`` for the summary table.
    header_row, n_rows = 5, 1
    frames = self.read(no=no, header=header_row, nrows=n_rows, **kwargs)
    return frames
[docs] @staticmethod
def reliable_filter(df: pd.DataFrame, name: Optional[str] = None) -> pd.DataFrame:
"""Create a dataframe which means whether data is reliable or not.
Args:
df (pd.DataFrame) : Input dataframe.
name (Optional[str], optional) : The column name. Defaults to ``None``.
Returns:
pd.DataFrame: Filter DataFrame which means whether data is reliable or not.
Examples:
>>> import pandas as pd
>>> from teilab.datasets import TeiLabDataSets
>>> datasets = TeiLabDataSets(verbose=False)
>>> df_sg = datasets.read_data(0)
>>> len(df_sg), datasets.reliable_filter(df_sg).sum().values[0]
(62976, 30385)
>>> df_us = datasets.read_data(-1)
>>> len(df_us), datasets.reliable_filter(df_us).sum().values[0]
(62976, 23434)
"""
control = df.ControlType == 0
present = df.gIsPosAndSignif == 1
uniform = df.gIsFeatNonUnifOL == 0
wellabove = df.gIsWellAboveBG == 1
saturated = df.gIsSaturated == 0
popnol = df.gIsFeatPopnOL == 0
return df[(control & present & uniform & wellabove & saturated & popnol)].index