Coverage for /home/runner/work/hotelling/hotelling/hotelling/plots.py : 90%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# -*- coding: utf-8 -*-
Hotelling's T-Squared multivariate control charts
See:
- Hotelling, Harold. (1931). The Generalization of Student's Ratio. Ann. Math. Statist. 2, no. 3, 360--378. doi:10.1214/aoms/1177732979. - Tukey, J. W. (1960). A survey of sampling from contaminated distributions. In: Contributions to Probability and Statistics. Stanford Univ. Press. 448-85 - Gnanadesikan, R. and J.R. Kettenring (1972). Robust Estimates, Residuals, and Outlier Detection with Multiresponse Data. Biometrics 28, 81-124
"""
except ModuleNotFoundError: plotly_module = False
"""control_interval.
For Hotelling control charts, phase 1 uses Qi, which follows a beta distribution, not an F distribution. Phase 2 uses future observations, which would follow a known distribution ~ F (Seber, 1984). The lower and upper lines are based on the quantiles of the distribution (aka `percent point function`) for α/2 and 1 - α/2, while the center line is the median (50%).
See: - Seber, G (1984). Multivariate Observations. John Wiley & Sons. - Nola D. Tracy, John C. Young & Robert L. Mason (1992) Multivariate Control Charts for Individual Observations, Journal of Quality Technology, 24:2, 88-95, DOI:10.1080/00224065.1992.12015232
:param m: sample groups (between 1 and n) :param n: number of samples :param f: number of features in the multivariate samples :param phase: 1 or 2 - phase 1 is within initial sample, phase 2 is measuring implemented control :param alpha: significance level - used to calculate control lines at α/2 and 1-α/2 :return: """ ((m - 1) * (n - 1) / m) * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(alpha / 2)), ) ((m - 1) * (n - 1) / m) * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(0.5)), ) ((m - 1) * (n - 1) / m) * (stats.beta(f / 2, ((m - f - 1) / 2)).ppf(1 - alpha / 2)), ) else: lcl = float( (f * (m - 1) * (m + 1)) / (m * (m - f)) * stats.f(f, m - f).ppf(alpha / 2) ) cl = float((f * (m - 1) * (m + 1)) / (m * (m - f)) * stats.f(f, m - f).ppf(0.5)) ucl = float( (f * (m - 1) * (m + 1)) / (m * (m - f)) * stats.f(f, m - f).ppf(1 - alpha / 2) )
"""control_stats.
Compute the sample mean vector and the covariance matrix
:param x: pandas dataframe, uni or multivariate :return: sample mean, sample covariance """
x, phase=1, alpha=0.001, x_bar=None, s=None, legend_right=False, interactive=False, width=10, cusum=False, template="none", marker="o", ooc_marker="x", random_state=42, limit=1000, no_display=False, ): """control_chart.
Hotelling Control Chart based on Q / T^2.
See also `control_interval` for more detail
:param x: pandas dataframe, uni or multivariate :param phase: 1 or 2 - phase 1 is within initial sample, phase 2 is measuring implemented control :param alpha: significance level - used to calculate control lines at α/2 and 1-α/2 :param x_bar: sample mean (optional, required with s) :param s: sample covariance (optional, required with x_bar) :param legend_right: default to 'left', can specify 'right' :param interactive: if True and plotly is available, renders as interactive plot in notebook. False, render image. :param width: how many units wide. defaults to 10, good for notebooks :param cusum: use cumulative sum instead of average :param template: plotly template, defaults to 'none', matching default matplotlib :param marker: default marker symbol - one valid for matplotlib :param ooc_marker: out of control marker symbol (x) - one valid for matplotlib :param random_state: seed for sample (n > limit) :param limit: max number of points to plot, defaults to 1000 :return: matplotlib ax / plotly fig """
# computing each individual values to the mean and covariance of the whole dataset
# data might be a subset (sample), but control stats above are calculated on the whole dataset
title=f"Hotelling Control Chart (α={alpha}, phase={phase}{cusum_text})", marker=marker, figsize=(width, width / 2), )
ax=ax, marker=ooc_marker, linestyle="None", color="red", legend=None ) except TypeError: # nothing to plot pass ucl, xmin=0, xmax=len(qi), linestyles="dashed", color="r", label=f"UCL={ucl}", ) x_pos, ucl + 0.1, s=f"UCL={ucl:.3f}", fontdict=font_dict, horizontalalignment=align, ) cl, xmin=0, xmax=len(qi), linestyles="dashed", color="k", label=f"CL={cl}" ) x_pos, cl + 0.1, s=f"CL={cl:.3f}", fontdict=font_dict, horizontalalignment=align ) lcl, xmin=0, xmax=len(qi), linestyles="dashed", color="r", label=f"LCL={lcl}", ) x_pos, lcl + 0.1, s=f"LCL={lcl:.3f}", fontdict=font_dict, horizontalalignment=align, ) type="line", x0=0, y0=var, x1=len(qi), y1=var, line=dict(color=col, width=4, dash="dashdot",), ) iplot(fig) else:
"""limit_displau.
Convenient way to get around the issue of very large datasets. We can't show everything, so we display a subset. The tests and stats like T2, F and P values are not affected, because we calculate them on all the data.
:param x: dask or pandas dataframe, uni or multivariate :param random_state: seed for sample (n > limit) :param limit: max number of points to plot, defaults to 1000 :return: returns original number of rows and limited dataframe """
try: frac = 1000 / n subset = x.sample(frac=frac, random_state=random_state).compute() except AttributeError: subset = x.sample(n=1000, random_state=random_state) else: # The whole thing
x, var=None, sigma=3, legend_right=False, interactive=False, connected=True, width=10, cusum=False, cusum_only=False, template="none", marker="o", ooc_marker="x", limit=1000, random_state=42, no_display=False, ): """univariate_control_chart.
:param x: dask or pandas dataframe, uni or multivariate :param var: optional, variable to plot (default to all) :param sigma: default to 3 sigma from mean for upper and lower control lines :param legend_right: default to 'left', can specify 'right' :param interactive: if plotly is available, renders as interactive plot in notebook. False to render image. :param connected: defaults to True. Appropriate when time related /consecutive batches, else, should be False :param width: how many units wide. defaults to 10, good for notebooks :param cusum: use cumulative sum instead of average :param cusum_only: don't display values, just cusum referenced to 0 :param template: plotly template, defaults to 'none', matching default matplotlib :param marker: default marker symbol (o) - one valid for matplotlib :param ooc_marker: out of control marker symbol (x) - one valid for matplotlib :param random_state: seed for sample (n > limit) :param limit: max number of points to plot, defaults to 1000 :return: returns matplotlib figure or array of plotly figures """ else:
else: else: else: df[columns].plot(ax=ax[i], marker=marker, linestyle="None") ax=ax[i], marker=ooc_marker, linestyle="None", color="red" ) except TypeError: # no outliers pass else: warn("Error: must specify cusum=True when using cusum_only=True.")
x=x_pos, y=ucl + 0.2, showarrow=False, text=f"UCL={ucl:.3f}", xref="x", yref="y", font=dict(family="serif", color="red", size=10), ) x=x_pos, y=x_bar + 0.2, showarrow=False, text=f"mean={x_bar:.3f}", xref="x", yref="y", font=dict(family="serif", color="black", size=10), ) x=x_pos, y=lcl + 0.2, showarrow=False, text=f"LCL={lcl:.3f}", xref="x", yref="y", font=dict(family="serif", color="red", size=10), ) ucl, xmin=x_min, xmax=x_max, linestyles="dashed", color="r", label="UCL" ) x_pos, ucl + 0.2, s=f"UCL={ucl:.3f}", fontdict=font_dict, horizontalalignment=align, ) x_bar, xmin=x_min, xmax=x_max, linestyles="dashed", color="k", label="mean", ) x_pos, x_bar + 0.2, s=f"mean={x_bar:.3f}", fontdict=font_dict, horizontalalignment=align, )
lcl, xmin=x_min, xmax=x_max, linestyles="dashed", color="r", label="LCL" ) x_pos, lcl + 0.2, s=f"LCL={ucl:.3f}", fontdict=font_dict, horizontalalignment=align, )
f"Univariate Control Chart for {var}{cusum_text} (σ={sigma})" ) type="line", x0=x_min, y0=var, x1=x_max, y1=var, line=dict(color=col, width=4, dash="dashdot",), ) yaxis_tickmode="auto", annotations=annotations, template=template, ) iplot(pfig) else: |