Source code for hotelling.plots

# -*- coding: utf-8 -*-
"""plot.py.

Hotelling's T-Squared multivariate control charts

See:

  - Hotelling, Harold. (1931). The Generalization of Student's Ratio. Ann. Math. Statist. 2,
    no. 3, 360--378. doi:10.1214/aoms/1177732979.
  - Tukey, J. W. (1960). A survey of sampling from contaminated distributions. In: Contributions
    to Probability and Statistics. Stanford Univ. Press. 448-85
  - Gnanadesikan, R. and J.R. Kettenring (1972). Robust Estimates, Residuals, and Outlier Detection
    with Multiresponse Data. Biometrics 28, 81-124

"""
from warnings import warn

import matplotlib.pyplot as plt
import pandas as pd

try:
    from plotly.offline import iplot
    from plotly.subplots import make_subplots
    import plotly.tools as tls

    plotly_module = True
except ModuleNotFoundError:
    plotly_module = False
from scipy import stats

from hotelling.stats import hotelling_t2


[docs]def control_interval(m, n, f, phase=1, alpha=0.001):
    """control_interval.

    For Hotelling control charts, phase 1 is using Qi. This follows a beta distribution, not an F distribution. For
    phase 2 uses future observations. These would follow a known distribution ~ F (Seber, 1984).
    The lower and upper lines are based on the quantiles of the distribution (aka `percent point function`)
    for α and 1 - α, while the center line is the median (50%).

    See:
        - Seber, G (1984). Multivariate Observations. John Wiley & Sons.
        - Nola D. Tracy, John C. Young & Robert L. Mason (1992) Multivariate Control Charts for individual Observations,
          Journal or Quality Technology, 24:2, 88-95, DOI:10.1080/00224065.1992.12015232

    :param m: sample groups (between 1 and n)
    :param n: number of samples
    :param f: number of features in the multivariate samples
    :param phase: 1 or 2 - phase 1 is within initial sample, phase 2 is measuring implemented control
    :param alpha: significance level - used to calculate control lines at α/2 and 1-α/2
    :return:
    """
    if phase == 1:
        lcl = float(
            ((m - 1) * (n - 1) / m)
            * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(alpha / 2)),
        )
        cl = float(
            ((m - 1) * (n - 1) / m) * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(0.5)),
        )
        ucl = float(
            ((m - 1) * (n - 1) / m)
            * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(1 - alpha / 2)),
        )
    else:
        lcl = float(
            (f * (m - 1) * (m + 1)) / (m * (m - f)) * stats.f(f, m - f).ppf(alpha / 2)
        )
        cl = float((f * (m - 1) * (m + 1)) / (m * (m - f)) * stats.f(f, m - f).ppf(0.5))
        ucl = float(
            (f * (m - 1) * (m + 1))
            / (m * (m - f))
            * stats.f(f, m - f).ppf(1 - alpha / 2)
        )
    return lcl, cl, ucl


[docs]def control_stats(x):
    """control_stats.

    Compute the sample mean vector and the covariance matrix

    :param x: pandas dataframe, uni or multivariate
    :return: sample mean, sample covariance
    """
    try:
        return x.mean(0).compute(), x.cov().compute()
    except AttributeError:
        return x.mean(0), x.cov()


[docs]def control_chart(
    x,
    phase=1,
    alpha=0.001,
    x_bar=None,
    s=None,
    legend_right=False,
    interactive=False,
    width=10,
    cusum=False,
    template="none",
    marker="o",
    ooc_marker="x",
    random_state=42,
    limit=1000,
    no_display=False,
):
    """control_chart.

    Hotelling Control Chart based on Q / T^2.

    See also `control_interval` for more detail

    :param x: pandas dataframe, uni or multivariate
    :param phase: 1 or 2 - phase 1 is within initial sample, phase 2 is measuring implemented control
    :param alpha: significance level - used to calculate control lines at α/2 and 1-α/2
    :param x_bar: sample mean (optional, required with s)
    :param s: sample covariance (optional, required with x_bar)
    :param legend_right: default to 'left', can specify 'right'
    :param interactive: if  True and plotly is available, renders as interactive plot in notebook. False, render image.
    :param width: how many units wide. defaults to 10, good for notebooks
    :param cusum: use cumulative sum instead of average
    :param template: plotly template, defaults to 'none', matching default matplotlib
    :param marker: default marker symbol - one valid for matplotlib
    :param ooc_marker: out of control marker symbol (x) - one valid for matplotlib
    :param random_state: seed for sample (n > limit)
    :param limit: max number of points to plot, defaults to 1000
    :return: matplotlib ax / plotly fig
    """
    n, subset = limit_display(x, limit, random_state)
    m = n

    # computing each individual values to the mean and covariance of the whole dataset
    if x_bar is None and s is None:
        x_bar, s = control_stats(x)
    elif x_bar is None or s is None:
        raise ValueError("Error: must specify both x_bar and s, or none at all.")

    # data might be a subset (sample), but control stats above are calculated on the whole dataset
    points, f = subset.shape
    qi = [hotelling_t2(subset[i:i + 1], x_bar, S=s) for i in range(points)]

    df = pd.DataFrame({"qi": qi})

    lcl, cl, ucl = control_interval(m, n, f, phase=phase, alpha=alpha)

    cusum_text = ""
    if cusum:
        df["deviation"] = (df["qi"] - cl).cumsum() + cl
        cusum_text = f" w/deviation (ref={cl:.3f})"
    ax = df.plot(
        title=f"Hotelling Control Chart (α={alpha}, phase={phase}{cusum_text})",
        marker=marker,
        figsize=(width, width / 2),
    )
    ax.set_xlabel("samples")

    try:
        df[(df["qi"] > ucl) | (df["qi"] < lcl)].plot(
            ax=ax, marker=ooc_marker, linestyle="None", color="red", legend=None
        )
    except TypeError:
        # nothing to plot
        pass
    x_pos = 0
    align = "left"
    if legend_right:
        x_pos = len(qi)
        align = "right"
    font_dict = {"family": "serif", "color": "red", "size": 10}
    if not interactive:
        ax.hlines(
            ucl,
            xmin=0,
            xmax=len(qi),
            linestyles="dashed",
            color="r",
            label=f"UCL={ucl}",
        )
    plt.text(
        x_pos,
        ucl + 0.1,
        s=f"UCL={ucl:.3f}",
        fontdict=font_dict,
        horizontalalignment=align,
    )
    if not interactive:
        ax.hlines(
            cl, xmin=0, xmax=len(qi), linestyles="dashed", color="k", label=f"CL={cl}"
        )
    font_dict = {"family": "serif", "color": "black", "size": 10}
    plt.text(
        x_pos, cl + 0.1, s=f"CL={cl:.3f}", fontdict=font_dict, horizontalalignment=align
    )
    if not interactive:
        ax.hlines(
            lcl,
            xmin=0,
            xmax=len(qi),
            linestyles="dashed",
            color="r",
            label=f"LCL={lcl}",
        )
    font_dict = {"family": "serif", "color": "red", "size": 10}
    plt.text(
        x_pos,
        lcl + 0.1,
        s=f"LCL={lcl:.3f}",
        fontdict=font_dict,
        horizontalalignment=align,
    )
    if plotly_module and interactive:
        fig = tls.mpl_to_plotly(ax.get_figure())
        for var, col in [(ucl, "Red"), (lcl, "Red"), (cl, "Black")]:
            fig.add_shape(
                type="line",
                x0=0,
                y0=var,
                x1=len(qi),
                y1=var,
                line=dict(color=col, width=4, dash="dashdot",),
            )
        fig.update_layout(template=template)
        if no_display is False:
            iplot(fig)
        return fig
    else:
        return ax


[docs]def limit_display(x, limit, random_state):
    """limit_displau.

    Convenient way to get around the issue of very large datasets. We can't show everything, so we display
    a subset. The tests and stats like T2, F and P values are not affected, because we calculate them on all
    the data.

    :param x: dask or pandas dataframe, uni or multivariate
    :param random_state: seed for sample (n > limit)
    :param limit: max number of points to plot, defaults to 1000
    :return: returns original number of rows and limited dataframe
    """
    n, *f = x.shape

    try:
        n = n.compute()
    except AttributeError:
        pass
    if n > limit:
        try:
            frac = 1000 / n
            subset = x.sample(frac=frac, random_state=random_state).compute()
        except AttributeError:
            subset = x.sample(n=1000, random_state=random_state)
    else:
        # The whole thing
        try:
            subset = x.compute()
        except AttributeError:
            subset = x

    return n, subset


[docs]def univariate_control_chart(
    x,
    var=None,
    sigma=3,
    legend_right=False,
    interactive=False,
    connected=True,
    width=10,
    cusum=False,
    cusum_only=False,
    template="none",
    marker="o",
    ooc_marker="x",
    limit=1000,
    random_state=42,
    no_display=False,
):
    """univariate_control_chart.

    :param x: dask or pandas dataframe, uni or multivariate
    :param var: optional, variable to plot (default to all)
    :param sigma: default to 3 sigma from mean for upper and lower control lines
    :param legend_right: default to 'left', can specify 'right'
    :param interactive: if plotly is available, renders as interactive plot in notebook. False to render image.
    :param connected: defaults to True. Appropriate when time related /consecutive batches, else, should be False
    :param width: how many units wide. defaults to 10, good for notebooks
    :param cusum: use cumulative sum instead of average
    :param cusum_only: don't display values, just cusum referenced to 0
    :param template: plotly template, defaults to 'none', matching default matplotlib
    :param marker: default marker symbol (o) - one valid for matplotlib
    :param ooc_marker: out of control marker symbol (x) - one valid for matplotlib
    :param random_state: seed for sample (n > limit)
    :param limit: max number of points to plot, defaults to 1000
    :return: returns matplotlib figure or array of plotly figures
    """
    n, *f, df = limit_display(x, limit, random_state)
    num_plots = len(df.columns)
    k = sigma  # 3 sigma default
    if interactive:
        fig = make_subplots(rows=num_plots, cols=1)
    else:
        fig = plt.figure(figsize=(width, num_plots * width / 2))

    ax = list(range(num_plots))

    layout = num_plots * 100 + 11
    features = df.columns if var is None else [var]
    x_pos = 0
    align = "left"
    if legend_right:
        x_pos = n
        align = "right"

    plotly_figs = []
    for i, var in enumerate(features):
        x_bar = df[var].mean()
        cusum_text = ""
        columns = var
        if cusum:
            if cusum_only:
                columns = "deviation"
                df["deviation"] = (df[var] - x_bar).cumsum()
                cusum_text = f" cumulative deviation (ref={x_bar:.3f})"
            else:
                columns = [var, "deviation"]
                df["deviation"] = (df[var] - x_bar).cumsum() + (x_bar)
                cusum_text = f" w/deviation (ref={x_bar:.3f})"
        ucl = x_bar + k * df[var].std()
        lcl = x_bar - k * df[var].std()
        if interactive:
            mpl_fig, ax[i] = plt.subplots(figsize=(width, width / 2))
        else:
            ax[i] = fig.add_subplot(layout + i)
        if connected:
            df[columns].plot(ax=ax[i], marker=marker)
        else:
            df[columns].plot(ax=ax[i], marker=marker, linestyle="None")
        try:
            if cusum_only is False:
                df[var][(x[var] > ucl) | (x[var] < lcl)].plot(
                    ax=ax[i], marker=ooc_marker, linestyle="None", color="red"
                )
        except TypeError:
            # no outliers
            pass
        x_min = df.index.min()
        x_max = df.index.max()
        if cusum_only is False:
            y_low = min(df[var].min(), lcl) - 0.1 * abs(df[var].min())
            y_high = max(df[var].max(), ucl) + 0.1 * abs(df[var].max())
        elif cusum:
            y_low = min(df["deviation"].min() - 0.1 * abs(df["deviation"].min()), 0)
            y_high = df["deviation"].max() + 0.1 * abs(df["deviation"].max())
        else:
            warn("Error: must specify cusum=True when using cusum_only=True.")

        if plotly_module and interactive and cusum_only is False:
            ucl_text = dict(
                x=x_pos,
                y=ucl + 0.2,
                showarrow=False,
                text=f"UCL={ucl:.3f}",
                xref="x",
                yref="y",
                font=dict(family="serif", color="red", size=10),
            )
            mean_text = dict(
                x=x_pos,
                y=x_bar + 0.2,
                showarrow=False,
                text=f"mean={x_bar:.3f}",
                xref="x",
                yref="y",
                font=dict(family="serif", color="black", size=10),
            )
            lcl_text = dict(
                x=x_pos,
                y=lcl + 0.2,
                showarrow=False,
                text=f"LCL={lcl:.3f}",
                xref="x",
                yref="y",
                font=dict(family="serif", color="red", size=10),
            )
        elif cusum_only is False:
            ax[i].hlines(
                ucl, xmin=x_min, xmax=x_max, linestyles="dashed", color="r", label="UCL"
            )
            font_dict = {"family": "serif", "color": "red", "size": 10}
            plt.text(
                x_pos,
                ucl + 0.2,
                s=f"UCL={ucl:.3f}",
                fontdict=font_dict,
                horizontalalignment=align,
            )
            ax[i].hlines(
                x_bar,
                xmin=x_min,
                xmax=x_max,
                linestyles="dashed",
                color="k",
                label="mean",
            )
            font_dict = {"family": "serif", "color": "black", "size": 10}
            plt.text(
                x_pos,
                x_bar + 0.2,
                s=f"mean={x_bar:.3f}",
                fontdict=font_dict,
                horizontalalignment=align,
            )

            ax[i].hlines(
                lcl, xmin=x_min, xmax=x_max, linestyles="dashed", color="r", label="LCL"
            )
            font_dict = {"family": "serif", "color": "red", "size": 10}
            plt.text(
                x_pos,
                lcl + 0.2,
                s=f"LCL={ucl:.3f}",
                fontdict=font_dict,
                horizontalalignment=align,
            )

        ax[i].title.set_text(
            f"Univariate Control Chart for {var}{cusum_text} (σ={sigma})"
        )
        plt.tight_layout()
        if plotly_module and interactive:
            pfig = tls.mpl_to_plotly(mpl_fig)
            if cusum_only is False:
                for var, col in [(ucl, "Red"), (lcl, "Red"), (x_bar, "Black")]:
                    pfig.add_shape(
                        type="line",
                        x0=x_min,
                        y0=var,
                        x1=x_max,
                        y1=var,
                        line=dict(color=col, width=4, dash="dashdot",),
                    )
            pfig.update_xaxes(range=(x_min - 1, x_max + 1))
            pfig.update_yaxes(range=(y_low, y_high))
            annotations = None if cusum_only else [ucl_text, mean_text, lcl_text]
            pfig.update_layout(margin=dict(l=1, r=1),  # noqa
                yaxis_tickmode="auto",
                annotations=annotations,
                template=template,
            )
            if no_display is False:
                iplot(pfig)
            plotly_figs.append(pfig)
    if interactive:
        return plotly_figs
    else:
        return fig
Source code for hotelling.plots

Hotelling T2

Navigation

Related Topics