# coding: utf-8
from numbers import Number
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from nptyping import NDArray
def assign_rank(arr: "NDArray[Any, Number]", method: str = "average") -> "NDArray[Any, float]":
    """Assign rank to data, dealing with ties appropriately.

    The input is converted with :func:`numpy.asarray` and flattened before ranking,
    so array-likes (e.g. lists) and multi-dimensional arrays are accepted.

    Args:
        arr (NDArray[Any,Number]) : The array of values to be ranked.
        method (str, optional)    : The method used to assign ranks to tied elements. Defaults to ``"average"``.

            The following ``method``\\ s are available.

            * ``"average"`` : The average of the ranks that would have been assigned to all the tied values is assigned to each value.
            * ``"min"``     : The minimum of the ranks that would have been assigned to all the tied values is assigned to each value.
            * ``"max"``     : The maximum of the ranks that would have been assigned to all the tied values is assigned to each value.
            * ``"dense"``   : Like ``"min"``, but the rank of the next highest element is assigned the rank immediately after those assigned to the tied elements.
            * ``"ordinal"`` : All values are given a distinct rank, corresponding to the order that the values occur in ``arr``.

    Returns:
        NDArray[Any,float]: A 1-D array of size equal to the size of ``arr``, containing rank scores (1-based).
        The ``"average"`` method yields floating point ranks; the other methods yield integer ranks.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.

    Examples:
        >>> import numpy as np
        >>> from teilab.utils import assign_rank
        >>> arr = np.asarray([0,2,3,2])
        >>> assign_rank(arr, method="average")
        array([1. , 2.5, 4. , 2.5])
        >>> assign_rank(arr, method="min")
        array([1, 2, 4, 2])
        >>> assign_rank(arr, method="max")
        array([1, 3, 4, 3])
        >>> assign_rank(arr, method="dense")
        array([1, 2, 3, 2])
        >>> assign_rank(arr, method="ordinal")
        array([1, 2, 4, 3])

    .. seealso::
        - :fa:`wikipedia-w` `Ranking <https://en.wikipedia.org/wiki/Ranking>`_
        - `scipy.stats.rankdata(a, method='average') <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html>`_
    """
    methods = ("average", "min", "max", "dense", "ordinal")
    if method not in methods:
        # Previously an unknown name silently behaved like "average".
        raise ValueError(f"Unknown method {method!r}; expected one of {methods}.")

    # Accept any array-like and rank over the flattened values.
    arr = np.ravel(np.asarray(arr))

    # arr[sorter[i]] is the i-th smallest value. "ordinal" needs a stable sort so
    # that tied values keep their order of appearance.
    sorter = np.argsort(arr, kind="mergesort" if method == "ordinal" else "quicksort")

    # inv[i] is the 0-based position of the original arr[i] in the sorted order.
    inv = np.empty(sorter.size, dtype=np.intp)
    inv[sorter] = np.arange(sorter.size, dtype=np.intp)
    if method == "ordinal":
        return inv + 1  # 1-based ranking.

    arr = arr[sorter]  # Sorted copy of the input.
    # obs[i] == 1 where sorted arr[i] differs from arr[i-1] (start of a new unique value).
    obs = np.r_[True, arr[1:] != arr[:-1]].astype(np.intp)
    # dense[i] is the 1-based rank of the original arr[i] among the unique values.
    dense = obs.cumsum()[inv]
    if method == "dense":
        return dense

    # count[k] is the number of elements strictly below the (k+1)-th unique value;
    # the final entry is the total number of elements.
    count = np.r_[np.nonzero(obs)[0], len(obs)]
    if method == "max":
        return count[dense]
    if method == "min":
        return count[dense - 1] + 1
    # "average": midpoint between the min and max ranks of each tie group.
    return 0.5 * (count[dense] + count[dense - 1] + 1)
def tiecorrect(ranks: "NDArray[Any, Number]") -> float:
    """Tie correction factor for Mann-Whitney U and Kruskal-Wallis H tests.

    The factor is ``1 - sum(t**3 - t) / (n**3 - n)``, where ``t`` runs over the sizes
    of the groups of equal values in ``ranks`` and ``n`` is the number of ranks.

    Args:
        ranks (NDArray[Any,Number]): A 1-D sequence of ranks. Typically this will be the array returned by :func:`assign_rank <teilab.utils.math_utils.assign_rank>` .

    Returns:
        float: Correction factor for ``U`` or ``H`` . ``1.0`` is returned when ``ranks`` has fewer than two elements.

    Examples:
        >>> import numpy as np
        >>> from teilab.utils import tiecorrect, assign_rank
        >>> tiecorrect(np.asarray([0,2,3,2]))
        0.9
        >>> ranks = assign_rank(np.asarray([1,3,2,4,5,7,2,8,4]), method="average")
        >>> ranks
        array([1. , 4. , 2.5, 5.5, 7. , 8. , 2.5, 9. , 5.5])
        >>> tiecorrect(ranks)
        0.9833333333333333

    .. seealso::
        - :fa:`home` `Nonparametric Statistics for the Behavioral Sciences. <https://www.amazon.co.jp/-/en/Sidney-Siegel/dp/0070573484>`_
        - `scipy.stats.tiecorrect(rankvals) <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.tiecorrect.html>`_
    """
    ordered = np.sort(ranks)
    total = np.float64(ordered.size)
    if total < 2:
        # The denominator n**3 - n would be zero; no correction applies.
        return 1.0

    # Positions where a new run of equal values starts, plus one past the end,
    # so that consecutive differences are the sizes of the tie groups.
    boundaries = np.nonzero(np.r_[True, ordered[1:] != ordered[:-1], True])[0]
    group_sizes = np.diff(boundaries).astype(np.float64)

    # Always hand back a builtin float, consistent with the short-input path.
    return float(1.0 - (group_sizes ** 3 - group_sizes).sum() / (total ** 3 - total))
def optimize_linear(X: "NDArray[Any, Number]", Y: "NDArray[Any, Number]") -> Tuple[float, float, callable]:
    r"""Optimize linear function using least-squares method.

    Fits :math:`y = ax + b` with the closed-form solution

    .. math::
        \begin{cases}
            a&=\frac{\displaystyle n\sum x_iy_i-\sum x_i\sum y_i}{\displaystyle n\sum x^2_i-\left( \sum x_i \right)^2}\\
            b&=\frac{\displaystyle \sum x^2_i\sum y_i-\sum x_iy_i\sum x_i}{\displaystyle n\sum x^2_i-\left( \sum x_i \right)^2}
        \end{cases}

    Inputs are converted with :func:`numpy.asarray`, so array-likes such as lists are accepted.
    Integer and boolean inputs are promoted to ``float64`` so that the products and sums
    above cannot overflow a fixed-width integer type.

    Args:
        X (NDArray[Any,Number]): explanatory variables.
        Y (NDArray[Any,Number]): objective variables.

    Returns:
        Tuple[float,float,callable]: ``(a, b, func)`` where ``a`` is the slope, ``b`` is the intercept and
        ``func`` is a :class:`numpy.vectorize` wrapper evaluating ``a * x + b`` .

    Note:
        The shared denominator is zero when all ``X`` values are equal (or ``X`` has fewer than
        two elements); ``a`` and ``b`` are then ``nan`` / ``inf`` as produced by NumPy division.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    # Promote bool / signed / unsigned integer data to float64 before multiplying.
    if X.dtype.kind in "biu":
        X = X.astype(np.float64)
    if Y.dtype.kind in "biu":
        Y = Y.astype(np.float64)

    n = len(X)
    # Each sum appears several times in the formulas; compute it once.
    sum_x = np.sum(X)
    sum_y = np.sum(Y)
    sum_xx = np.sum(X * X)
    sum_xy = np.sum(X * Y)
    denominator = n * sum_xx - np.square(sum_x)

    a = (n * sum_xy - sum_x * sum_y) / denominator
    b = (sum_xx * sum_y - sum_xy * sum_x) / denominator
    return (a, b, np.vectorize(lambda x: a * x + b))