# -*- coding: utf-8 -*-
"""Stats.py.
Hotelling's T-Squared multivariate test for one sample or two independent samples
See:
Hotelling, Harold. (1931). The Generalization of Student's Ratio. Ann. Math. Statist. 2,
no. 3, 360--378. doi:10.1214/aoms/1177732979.
https://projecteuclid.org/euclid.aoms/1177732979
"""
from warnings import warn
import numpy as np
from scipy.stats import f
[docs]def bessel_correction(x, y=None):
"""bessel_correction.
Sampling tends to underestimate variability of a population. This is due to the fact that we are more likely to
sample around the mean than near the extremities. Bessel's correction uses n−1 instead of n which is used to
calculate variance etc, in order to correct for the bias in the estimation of the population variance.
:param x: array-like, samples of observations
:param y: array-like, samples of observations, optional
:return: returns x_n - 1, y_n - 1
"""
n1 = x.shape[0] - 1
try:
n1 = n1.compute()
except AttributeError:
pass
if y is None:
n2 = 0
else:
n2 = y.shape[0] - 1
try:
n2 = n2.compute()
except AttributeError:
pass
return n1, n2
[docs]def inverse_covariance_matrix(x, y, bessel=True):
"""inverse_covariance_matrix.
:param x: array-like, samples of observations
:param y: array-like, samples of observations
:param bessel: bool, apply bessel correction (default)
:return: float, the pooled variance inverse, the pooled variance
"""
_, *p = x.shape
p = p[0] if p else 1
s = pooled_covariance_matrix(x, y, bessel)
try:
ident_p = np.identity(p).compute
except AttributeError:
ident_p = np.identity(p)
inv = np.linalg.solve(s, ident_p)
return inv, s
[docs]def pooled_covariance_matrix(x, y, bessel=True):
r"""pooled_covariance.
Compute the pooled covariance matrix
Equation:
The pooled covariance matrix is defined as:
.. math::
S = \\frac{n_xS_x + n_yS_y}{n_x+n_y}
And with bessel correction as:
.. math::
S = \\frac{(n_x-1)S_x + (n_y-1)S_y}{n_x+n_y-2}
Reference
---------
see: https://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution#Pooled_covariance_matrix
:param x: array-like, samples of observations
:param y: array-like, samples of observations
:param bessel: bool, apply bessel correction (default)
:return: float, the pooled variance
"""
if bessel:
n1, n2 = bessel_correction(x, y)
else:
try:
n1 = x.shape[0]
n1 = n1.compute()
n2 = y.shape[0]
n2 = n2.compute()
except AttributeError:
n1 = x.shape[0]
n2 = y.shape[0]
try:
s1 = n1 * x.cov().compute()
except AttributeError:
s1 = n1 * np.cov(x, rowvar=False)
try:
s2 = n2 * y.cov().compute()
except AttributeError:
s2 = n2 * np.cov(y, rowvar=False)
s = (s1 + s2) / (n1 + n2)
return s
[docs]def hotelling_t2(x, y=None, bessel=True, S=None):
r"""hotelling_t2.
Compute the Hotelling (T2) test statistic.
It is the multivariate extension of the Student's t-test.
Test the null hypothesis that two multivariate samples have the same underlying
probability distribution, when specifying samples for x and y. The number of samples do not have
to be the same, but the number of features does have to be equal.
Equation:
Hotelling's t-squared statistic is defined as:
.. math::
T^2 = n (\\bar{x} - {\mu})^{T} S^{-1} (\\bar{x} - {\mu})
Where S is the pooled covariance matrix and ᵀ represents the transpose.
The two sample t-squared statistic is defined as:
.. math::
T^2 = (\\bar{x} - \\bar{y})^{T} [S(\\frac1 n_x +\\frac 1 n_y)]^{-1} (\\bar{x}̄ - \\bar{y})
References:
- Hotelling, Harold. (1931). The Generalization of Student's Ratio. Ann. Math. Statist. 2, no. 3, 360--378.
doi:10.1214/aoms/1177732979. https://projecteuclid.org/euclid.aoms/1177732979
- Hotelling, Harold. (1955) Les Rapports entre les Methodes Statistiques recentes portant sur des Variables Multiples
et l'Analyse Factorielle. 107-119.
In: L'Analyse Factorielle et ses Applications. Centre National de la Recherche Scientifique, Paris.
- Anderson T.W. (1992) Introduction to Hotelling (1931) The Generalization of Student’s Ratio.
In: Kotz S., Johnson N.L. (eds) Breakthroughs in Statistics.
Springer Series in Statistics (Perspectives in Statistics). Springer, New York, NY
:param x: array-like, samples of observations for one or two sample test (required)
:param y: for two sample test, array-like, samples of observations (optional), for one sample, list of means to test
:param bessel: bool, apply bessel correction (default)
:return:
statistic: float,
the t2 statistic
f_value: float,
the f value
p_value: float,
the p value
s: 2d array,
the pooled variance
""" # noqa: W605
try:
nx, p = x.shape
except AttributeError as ex:
if "list" in str(ex):
x = np.asarray(x)
nx, *p = x.shape
p = p[0] if p else 1
y = np.asarray(y)
else:
warn("Error: The two samples must be in arrays or dataframes format.")
raise ValueError
# samples observed means
try:
nx = nx.compute()
x_bar = x.mean(0).compute()
except AttributeError: # series has no attribute compute
x_bar = x.mean(0)
one_sample = False
if y is None:
# One sample T-squared
one_sample = True
y = np.zeros(p)
ny = None
py = p
diff_bar = x_bar - y
else:
ny, *py = y.shape
if len(py) == 0:
one_sample = True
py = p
diff_bar = x_bar - y
else:
# Two sample T-squared
py = py[0] if py else 1
try:
ny = ny.compute()
y_bar = y.mean(0).compute()
except AttributeError: # series has no attribute compute
y_bar = y.mean(0)
# difference of means
diff_bar = x_bar - y_bar
if p != py:
warn(
f"Error: the two samples must have the same number of features ({p} != {py})."
)
raise ValueError
# bessel correction ( -1 )
if bessel:
n1, n2 = bessel_correction(x, y)
else:
n1 = nx
n2 = ny
if one_sample:
n = nx
else:
n = n1 + n2
# calculate the T2 statistics
# Technically, we use diff_bar.T for the transpose, but with Pandas, a 1 dimensional dataframe
# is automatically aligned for @ and is not required
if one_sample:
if S is not None:
cov = S
else:
try:
cov = x.cov().compute()
except AttributeError:
try:
cov = x.cov()
except AttributeError:
cov = np.cov(x, rowvar=False)
inv_cov = np.linalg.pinv(cov)
# for f test
# term = (n - p) / (p * (n - 1)) # getting different results
t2_stat = n * (diff_bar.T @ inv_cov @ diff_bar)
if S is not None:
return t2_stat
# f statistic
# TODO: use chi square instead of f statistic for large sample
f_value = (n - p) * t2_stat / ((n - 1) * p)
else:
# pooled covariance
inv_s, s = inverse_covariance_matrix(x, y, bessel)
t2_stat = nx * ny / (nx + ny) * (diff_bar.T @ inv_s @ diff_bar)
# f statistic
# TODO: use chi square instead of f statistic for large sample
f_value = (nx + ny - p - 1) * t2_stat / (n * p)
# p-value
p_value = f.sf(f_value, p, n - p) # survival function, 1 - cdf
# return the list of results
return t2_stat, f_value, p_value, cov if one_sample else s
[docs]def hotelling_dict(x, y=None, bessel=True):
"""hotelling_dict.
returns the same values as `hotelling_t2`, but in a dictionary - for API etc
:param x: array-like, samples of observations for one or two sample test (required)
:param y: for two sample test, array-like, samples of observations (optional), for one sample, list of means to test
:return: dict
"""
t2_stat, f_stat, p_value, s = hotelling_t2(x, y, bessel)
return dict(t2_stat=t2_stat, f_stat=f_stat, p_value=p_value, pooled_var=s)