Source code for hotelling.stats

# -*- coding: utf-8 -*-
"""Stats.py.

Hotelling's T-Squared multivariate test for one sample or two independent samples

See:
 Hotelling, Harold. (1931). The Generalization of Student's Ratio. Ann. Math. Statist. 2,
  no. 3, 360--378. doi:10.1214/aoms/1177732979.

https://projecteuclid.org/euclid.aoms/1177732979
"""
from warnings import warn

import numpy as np
from scipy.stats import f


[docs]def bessel_correction(x, y=None):
    """bessel_correction.

    Sampling tends to underestimate variability of a population. This is due to the fact that we are more likely to
    sample around the mean than near the extremities. Bessel's correction uses n−1 instead of n which is used to
    calculate variance etc, in order to correct for the bias in the estimation of the population variance.

    :param x: array-like, samples of observations
    :param y: array-like, samples of observations, optional
    :return: returns x_n - 1, y_n - 1
    """
    n1 = x.shape[0] - 1
    try:
        n1 = n1.compute()
    except AttributeError:
        pass
    if y is None:
        n2 = 0
    else:
        n2 = y.shape[0] - 1
        try:
            n2 = n2.compute()
        except AttributeError:
            pass
    return n1, n2


[docs]def inverse_covariance_matrix(x, y, bessel=True):
    """inverse_covariance_matrix.

    :param x: array-like, samples of observations
    :param y: array-like, samples of observations
    :param bessel: bool, apply bessel correction (default)
    :return: float, the pooled variance inverse, the pooled variance
    """
    _, *p = x.shape
    p = p[0] if p else 1
    s = pooled_covariance_matrix(x, y, bessel)
    try:
        ident_p = np.identity(p).compute
    except AttributeError:
        ident_p = np.identity(p)
    inv = np.linalg.solve(s, ident_p)
    return inv, s


[docs]def pooled_covariance_matrix(x, y, bessel=True):
    r"""pooled_covariance.

    Compute the pooled covariance matrix

    Equation:

    The pooled covariance matrix is defined as:

    .. math::
        S =  \\frac{n_xS_x + n_yS_y}{n_x+n_y}

    And with bessel correction as:

    .. math::
        S =  \\frac{(n_x-1)S_x + (n_y-1)S_y}{n_x+n_y-2}

    Reference
    ---------
    see: https://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution#Pooled_covariance_matrix

    :param x: array-like, samples of observations
    :param y: array-like, samples of observations
    :param bessel: bool, apply bessel correction (default)
    :return: float, the pooled variance
    """
    if bessel:
        n1, n2 = bessel_correction(x, y)
    else:
        try:
            n1 = x.shape[0]
            n1 = n1.compute()
            n2 = y.shape[0]
            n2 = n2.compute()
        except AttributeError:
            n1 = x.shape[0]
            n2 = y.shape[0]
    try:
        s1 = n1 * x.cov().compute()

    except AttributeError:
        s1 = n1 * np.cov(x, rowvar=False)
    try:
        s2 = n2 * y.cov().compute()
    except AttributeError:
        s2 = n2 * np.cov(y, rowvar=False)
    s = (s1 + s2) / (n1 + n2)

    return s


[docs]def hotelling_t2(x, y=None, bessel=True, S=None):
    r"""hotelling_t2.

    Compute the Hotelling (T2) test statistic.

    It is the multivariate extension of the Student's t-test.
    Test the null hypothesis that two multivariate samples have the same underlying
    probability distribution, when specifying samples for x and y. The number of samples do not have
    to be the same, but the number of features does have to be equal.

    Equation:

    Hotelling's t-squared statistic is defined as:

    .. math::
        T^2 = n (\\bar{x} - {\mu})^{T} S^{-1} (\\bar{x} - {\mu})

    Where S is the pooled covariance matrix and ᵀ represents the transpose.

    The two sample t-squared statistic is defined as:

    .. math::
        T^2 = (\\bar{x} - \\bar{y})^{T} [S(\\frac1 n_x +\\frac 1 n_y)]^{-1} (\\bar{x}̄ - \\bar{y})

    References:
        - Hotelling, Harold. (1931). The Generalization of Student's Ratio. Ann. Math. Statist. 2, no. 3, 360--378.
          doi:10.1214/aoms/1177732979. https://projecteuclid.org/euclid.aoms/1177732979

        - Hotelling, Harold. (1955) Les Rapports entre les Methodes Statistiques recentes portant sur des Variables Multiples
          et l'Analyse Factorielle. 107-119.
          In: L'Analyse Factorielle et ses Applications. Centre National de la Recherche Scientifique, Paris.

        - Anderson T.W. (1992) Introduction to Hotelling (1931) The Generalization of Student’s Ratio.
          In: Kotz S., Johnson N.L. (eds) Breakthroughs in Statistics.
          Springer Series in Statistics (Perspectives in Statistics). Springer, New York, NY

    :param x: array-like, samples of observations for one or two sample test (required)
    :param y: for two sample test, array-like, samples of observations (optional), for one sample, list of means to test
    :param bessel: bool, apply bessel correction (default)
    :return:
        statistic: float,
            the t2 statistic
        f_value: float,
            the f value
        p_value: float,
            the p value
        s: 2d array,
            the pooled variance
    """  # noqa: W605
    try:
        nx, p = x.shape
    except AttributeError as ex:
        if "list" in str(ex):
            x = np.asarray(x)
            nx, *p = x.shape
            p = p[0] if p else 1
            y = np.asarray(y)
        else:
            warn("Error: The two samples must be in arrays or dataframes format.")
            raise ValueError

    # samples observed means
    try:
        nx = nx.compute()
        x_bar = x.mean(0).compute()
    except AttributeError:  # series has no attribute compute
        x_bar = x.mean(0)

    one_sample = False
    if y is None:
        # One sample T-squared
        one_sample = True
        y = np.zeros(p)
        ny = None
        py = p
        diff_bar = x_bar - y

    else:
        ny, *py = y.shape
        if len(py) == 0:
            one_sample = True
            py = p
            diff_bar = x_bar - y
        else:
            # Two sample T-squared
            py = py[0] if py else 1
            try:
                ny = ny.compute()
                y_bar = y.mean(0).compute()
            except AttributeError:  # series has no attribute compute
                y_bar = y.mean(0)
            # difference of means
            diff_bar = x_bar - y_bar
    if p != py:
        warn(
            f"Error: the two samples must have the same number of features ({p} != {py})."
        )
        raise ValueError

    # bessel correction ( -1 )
    if bessel:
        n1, n2 = bessel_correction(x, y)
    else:
        n1 = nx
        n2 = ny
    if one_sample:
        n = nx
    else:
        n = n1 + n2

    # calculate the T2 statistics
    # Technically, we use diff_bar.T for the transpose, but with Pandas, a 1 dimensional dataframe
    # is automatically aligned for @ and is not required
    if one_sample:
        if S is not None:
            cov = S
        else:
            try:
                cov = x.cov().compute()
            except AttributeError:
                try:
                    cov = x.cov()
                except AttributeError:
                    cov = np.cov(x, rowvar=False)
        inv_cov = np.linalg.pinv(cov)
        # for f test
        # term = (n - p) / (p * (n - 1))  # getting different results
        t2_stat = n * (diff_bar.T @ inv_cov @ diff_bar)
        if S is not None:
            return t2_stat
        # f statistic
        # TODO: use chi square instead of f statistic for large sample
        f_value = (n - p) * t2_stat / ((n - 1) * p)
    else:
        # pooled covariance
        inv_s, s = inverse_covariance_matrix(x, y, bessel)
        t2_stat = nx * ny / (nx + ny) * (diff_bar.T @ inv_s @ diff_bar)
        # f statistic
        # TODO: use chi square instead of f statistic for large sample
        f_value = (nx + ny - p - 1) * t2_stat / (n * p)

    # p-value
    p_value = f.sf(f_value, p, n - p)  # survival function, 1 - cdf

    # return the list of results
    return t2_stat, f_value, p_value, cov if one_sample else s


[docs]def hotelling_dict(x, y=None, bessel=True):
    """hotelling_dict.

    returns the same values as `hotelling_t2`, but in a dictionary - for API etc

    :param x: array-like, samples of observations for one or two sample test (required)
    :param y: for two sample test, array-like, samples of observations (optional), for one sample, list of means to test
    :return: dict
    """
    t2_stat, f_stat, p_value, s = hotelling_t2(x, y, bessel)
    return dict(t2_stat=t2_stat, f_stat=f_stat, p_value=p_value, pooled_var=s)
Source code for hotelling.stats

Hotelling T2

Navigation

Related Topics