Source code for hotelling.plots

# -*- coding: utf-8 -*-
"""plot.py.

Hotelling's T-Squared multivariate control charts

See:

  - Hotelling, Harold. (1931). The Generalization of Student's Ratio. Ann. Math. Statist. 2,
    no. 3, 360--378. doi:10.1214/aoms/1177732979.
  - Tukey, J. W. (1960). A survey of sampling from contaminated distributions. In: Contributions
    to Probability and Statistics. Stanford Univ. Press. 448-85
  - Gnanadesikan, R. and J.R. Kettenring (1972). Robust Estimates, Residuals, and Outlier Detection
    with Multiresponse Data. Biometrics 28, 81-124

"""
from warnings import warn

import matplotlib.pyplot as plt
import pandas as pd

try:
    from plotly.offline import iplot
    from plotly.subplots import make_subplots
    import plotly.tools as tls

    plotly_module = True
except ModuleNotFoundError:
    plotly_module = False
from scipy import stats

from hotelling.stats import hotelling_t2


[docs]def control_interval(m, n, f, phase=1, alpha=0.001): """control_interval. For Hotelling control charts, phase 1 is using Qi. This follows a beta distribution, not an F distribution. For phase 2 uses future observations. These would follow a known distribution ~ F (Seber, 1984). The lower and upper lines are based on the quantiles of the distribution (aka `percent point function`) for α and 1 - α, while the center line is the median (50%). See: - Seber, G (1984). Multivariate Observations. John Wiley & Sons. - Nola D. Tracy, John C. Young & Robert L. Mason (1992) Multivariate Control Charts for individual Observations, Journal or Quality Technology, 24:2, 88-95, DOI:10.1080/00224065.1992.12015232 :param m: sample groups (between 1 and n) :param n: number of samples :param f: number of features in the multivariate samples :param phase: 1 or 2 - phase 1 is within initial sample, phase 2 is measuring implemented control :param alpha: significance level - used to calculate control lines at α/2 and 1-α/2 :return: """ if phase == 1: lcl = float( ((m - 1) * (n - 1) / m) * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(alpha / 2)), ) cl = float( ((m - 1) * (n - 1) / m) * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(0.5)), ) ucl = float( ((m - 1) * (n - 1) / m) * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(1 - alpha / 2)), ) else: lcl = float( (f * (m - 1) * (m + 1)) / (m * (m - f)) * stats.f(f, m - f).ppf(alpha / 2) ) cl = float((f * (m - 1) * (m + 1)) / (m * (m - f)) * stats.f(f, m - f).ppf(0.5)) ucl = float( (f * (m - 1) * (m + 1)) / (m * (m - f)) * stats.f(f, m - f).ppf(1 - alpha / 2) ) return lcl, cl, ucl
[docs]def control_stats(x): """control_stats. Compute the sample mean vector and the covariance matrix :param x: pandas dataframe, uni or multivariate :return: sample mean, sample covariance """ try: return x.mean(0).compute(), x.cov().compute() except AttributeError: return x.mean(0), x.cov()
[docs]def control_chart( x, phase=1, alpha=0.001, x_bar=None, s=None, legend_right=False, interactive=False, width=10, cusum=False, template="none", marker="o", ooc_marker="x", random_state=42, limit=1000, no_display=False, ): """control_chart. Hotelling Control Chart based on Q / T^2. See also `control_interval` for more detail :param x: pandas dataframe, uni or multivariate :param phase: 1 or 2 - phase 1 is within initial sample, phase 2 is measuring implemented control :param alpha: significance level - used to calculate control lines at α/2 and 1-α/2 :param x_bar: sample mean (optional, required with s) :param s: sample covariance (optional, required with x_bar) :param legend_right: default to 'left', can specify 'right' :param interactive: if True and plotly is available, renders as interactive plot in notebook. False, render image. :param width: how many units wide. defaults to 10, good for notebooks :param cusum: use cumulative sum instead of average :param template: plotly template, defaults to 'none', matching default matplotlib :param marker: default marker symbol - one valid for matplotlib :param ooc_marker: out of control marker symbol (x) - one valid for matplotlib :param random_state: seed for sample (n > limit) :param limit: max number of points to plot, defaults to 1000 :return: matplotlib ax / plotly fig """ n, subset = limit_display(x, limit, random_state) m = n # computing each individual values to the mean and covariance of the whole dataset if x_bar is None and s is None: x_bar, s = control_stats(x) elif x_bar is None or s is None: raise ValueError("Error: must specify both x_bar and s, or none at all.") # data might be a subset (sample), but control stats above are calculated on the whole dataset points, f = subset.shape qi = [hotelling_t2(subset[i:i + 1], x_bar, S=s) for i in range(points)] df = pd.DataFrame({"qi": qi}) lcl, cl, ucl = control_interval(m, n, f, phase=phase, alpha=alpha) cusum_text = "" if cusum: df["deviation"] = (df["qi"] - cl).cumsum() + cl cusum_text = f" w/deviation (ref={cl:.3f})" ax = df.plot( title=f"Hotelling Control Chart (α={alpha}, phase={phase}{cusum_text})", marker=marker, figsize=(width, width / 2), ) ax.set_xlabel("samples") try: df[(df["qi"] > ucl) | (df["qi"] < lcl)].plot( ax=ax, marker=ooc_marker, linestyle="None", color="red", legend=None ) except TypeError: # nothing to plot pass x_pos = 0 align = "left" if legend_right: x_pos = len(qi) align = "right" font_dict = {"family": "serif", "color": "red", "size": 10} if not interactive: ax.hlines( ucl, xmin=0, xmax=len(qi), linestyles="dashed", color="r", label=f"UCL={ucl}", ) plt.text( x_pos, ucl + 0.1, s=f"UCL={ucl:.3f}", fontdict=font_dict, horizontalalignment=align, ) if not interactive: ax.hlines( cl, xmin=0, xmax=len(qi), linestyles="dashed", color="k", label=f"CL={cl}" ) font_dict = {"family": "serif", "color": "black", "size": 10} plt.text( x_pos, cl + 0.1, s=f"CL={cl:.3f}", fontdict=font_dict, horizontalalignment=align ) if not interactive: ax.hlines( lcl, xmin=0, xmax=len(qi), linestyles="dashed", color="r", label=f"LCL={lcl}", ) font_dict = {"family": "serif", "color": "red", "size": 10} plt.text( x_pos, lcl + 0.1, s=f"LCL={lcl:.3f}", fontdict=font_dict, horizontalalignment=align, ) if plotly_module and interactive: fig = tls.mpl_to_plotly(ax.get_figure()) for var, col in [(ucl, "Red"), (lcl, "Red"), (cl, "Black")]: fig.add_shape( type="line", x0=0, y0=var, x1=len(qi), y1=var, line=dict(color=col, width=4, dash="dashdot",), ) fig.update_layout(template=template) if no_display is False: iplot(fig) return fig else: return ax
[docs]def limit_display(x, limit, random_state): """limit_displau. Convenient way to get around the issue of very large datasets. We can't show everything, so we display a subset. The tests and stats like T2, F and P values are not affected, because we calculate them on all the data. :param x: dask or pandas dataframe, uni or multivariate :param random_state: seed for sample (n > limit) :param limit: max number of points to plot, defaults to 1000 :return: returns original number of rows and limited dataframe """ n, *f = x.shape try: n = n.compute() except AttributeError: pass if n > limit: try: frac = 1000 / n subset = x.sample(frac=frac, random_state=random_state).compute() except AttributeError: subset = x.sample(n=1000, random_state=random_state) else: # The whole thing try: subset = x.compute() except AttributeError: subset = x return n, subset
[docs]def univariate_control_chart( x, var=None, sigma=3, legend_right=False, interactive=False, connected=True, width=10, cusum=False, cusum_only=False, template="none", marker="o", ooc_marker="x", limit=1000, random_state=42, no_display=False, ): """univariate_control_chart. :param x: dask or pandas dataframe, uni or multivariate :param var: optional, variable to plot (default to all) :param sigma: default to 3 sigma from mean for upper and lower control lines :param legend_right: default to 'left', can specify 'right' :param interactive: if plotly is available, renders as interactive plot in notebook. False to render image. :param connected: defaults to True. Appropriate when time related /consecutive batches, else, should be False :param width: how many units wide. defaults to 10, good for notebooks :param cusum: use cumulative sum instead of average :param cusum_only: don't display values, just cusum referenced to 0 :param template: plotly template, defaults to 'none', matching default matplotlib :param marker: default marker symbol (o) - one valid for matplotlib :param ooc_marker: out of control marker symbol (x) - one valid for matplotlib :param random_state: seed for sample (n > limit) :param limit: max number of points to plot, defaults to 1000 :return: returns matplotlib figure or array of plotly figures """ n, *f, df = limit_display(x, limit, random_state) num_plots = len(df.columns) k = sigma # 3 sigma default if interactive: fig = make_subplots(rows=num_plots, cols=1) else: fig = plt.figure(figsize=(width, num_plots * width / 2)) ax = list(range(num_plots)) layout = num_plots * 100 + 11 features = df.columns if var is None else [var] x_pos = 0 align = "left" if legend_right: x_pos = n align = "right" plotly_figs = [] for i, var in enumerate(features): x_bar = df[var].mean() cusum_text = "" columns = var if cusum: if cusum_only: columns = "deviation" df["deviation"] = (df[var] - x_bar).cumsum() cusum_text = f" cumulative deviation (ref={x_bar:.3f})" else: columns = [var, "deviation"] df["deviation"] = (df[var] - x_bar).cumsum() + (x_bar) cusum_text = f" w/deviation (ref={x_bar:.3f})" ucl = x_bar + k * df[var].std() lcl = x_bar - k * df[var].std() if interactive: mpl_fig, ax[i] = plt.subplots(figsize=(width, width / 2)) else: ax[i] = fig.add_subplot(layout + i) if connected: df[columns].plot(ax=ax[i], marker=marker) else: df[columns].plot(ax=ax[i], marker=marker, linestyle="None") try: if cusum_only is False: df[var][(x[var] > ucl) | (x[var] < lcl)].plot( ax=ax[i], marker=ooc_marker, linestyle="None", color="red" ) except TypeError: # no outliers pass x_min = df.index.min() x_max = df.index.max() if cusum_only is False: y_low = min(df[var].min(), lcl) - 0.1 * abs(df[var].min()) y_high = max(df[var].max(), ucl) + 0.1 * abs(df[var].max()) elif cusum: y_low = min(df["deviation"].min() - 0.1 * abs(df["deviation"].min()), 0) y_high = df["deviation"].max() + 0.1 * abs(df["deviation"].max()) else: warn("Error: must specify cusum=True when using cusum_only=True.") if plotly_module and interactive and cusum_only is False: ucl_text = dict( x=x_pos, y=ucl + 0.2, showarrow=False, text=f"UCL={ucl:.3f}", xref="x", yref="y", font=dict(family="serif", color="red", size=10), ) mean_text = dict( x=x_pos, y=x_bar + 0.2, showarrow=False, text=f"mean={x_bar:.3f}", xref="x", yref="y", font=dict(family="serif", color="black", size=10), ) lcl_text = dict( x=x_pos, y=lcl + 0.2, showarrow=False, text=f"LCL={lcl:.3f}", xref="x", yref="y", font=dict(family="serif", color="red", size=10), ) elif cusum_only is False: ax[i].hlines( ucl, xmin=x_min, xmax=x_max, linestyles="dashed", color="r", label="UCL" ) font_dict = {"family": "serif", "color": "red", "size": 10} plt.text( x_pos, ucl + 0.2, s=f"UCL={ucl:.3f}", fontdict=font_dict, horizontalalignment=align, ) ax[i].hlines( x_bar, xmin=x_min, xmax=x_max, linestyles="dashed", color="k", label="mean", ) font_dict = {"family": "serif", "color": "black", "size": 10} plt.text( x_pos, x_bar + 0.2, s=f"mean={x_bar:.3f}", fontdict=font_dict, horizontalalignment=align, ) ax[i].hlines( lcl, xmin=x_min, xmax=x_max, linestyles="dashed", color="r", label="LCL" ) font_dict = {"family": "serif", "color": "red", "size": 10} plt.text( x_pos, lcl + 0.2, s=f"LCL={ucl:.3f}", fontdict=font_dict, horizontalalignment=align, ) ax[i].title.set_text( f"Univariate Control Chart for {var}{cusum_text} (σ={sigma})" ) plt.tight_layout() if plotly_module and interactive: pfig = tls.mpl_to_plotly(mpl_fig) if cusum_only is False: for var, col in [(ucl, "Red"), (lcl, "Red"), (x_bar, "Black")]: pfig.add_shape( type="line", x0=x_min, y0=var, x1=x_max, y1=var, line=dict(color=col, width=4, dash="dashdot",), ) pfig.update_xaxes(range=(x_min - 1, x_max + 1)) pfig.update_yaxes(range=(y_low, y_high)) annotations = None if cusum_only else [ucl_text, mean_text, lcl_text] pfig.update_layout(margin=dict(l=1, r=1), # noqa yaxis_tickmode="auto", annotations=annotations, template=template, ) if no_display is False: iplot(pfig) plotly_figs.append(pfig) if interactive: return plotly_figs else: return fig