From: John Darrington Date: Sat, 4 May 2019 06:20:09 +0000 (+0200) Subject: EXAMINE: Implement the Shapiro-Wilk Test. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0b7e8882ce9bf2166c6bcc0db1334357ba5a76d4;p=pspp EXAMINE: Implement the Shapiro-Wilk Test. Closes bug #42511 --- diff --git a/NEWS b/NEWS index 75fc620e70..e745639b4c 100644 --- a/NEWS +++ b/NEWS @@ -18,7 +18,10 @@ Changes from 1.2.0 to 1.3.0: * Bug fix for CVE-2018-20230. - * The REGRESSION command now supports the /STATISTICS=TOL which + * The EXAMINE command will now perform the Shapiro-Wilk test when + one or more plots are requested. + +* The REGRESSION command now supports the /STATISTICS=TOL which outputs tolerance and variance inflation factor metrics for the data. * A bug where the GUI would crash when T-TEST was executed whilst diff --git a/doc/statistics.texi b/doc/statistics.texi index 9e9ed9081b..259c9abe53 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -331,6 +331,15 @@ Zero, however is a special value. If @var{t} is 0 or is omitted, then data will be transformed by taking its natural logarithm instead of raising to the power of @var{t}. +@cindex Shapiro-Wilk +When one or more plots are requested, @subcmd{EXAMINE} also performs the +Shapiro-Wilk test for each category. +There are however a number of provisos: +@itemize +@item All weight values must be integer. +@item The cumulative weight value must be in the range [3, 5000] +@end itemize + The @subcmd{COMPARE} subcommand is only relevant if producing boxplots, and it is only useful there is more than one dependent variable and at least one factor. If diff --git a/src/language/stats/examine.c b/src/language/stats/examine.c index cb78ebd4f8..20fb46f3be 100644 --- a/src/language/stats/examine.c +++ b/src/language/stats/examine.c @@ -1,6 +1,6 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2012, 2013, 2016 Free Software Foundation, Inc. + Copyright (C) 2012, 2013, 2016, 2019 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -47,6 +47,7 @@ #include "math/sort.h" #include "math/order-stats.h" #include "math/percentiles.h" +#include "math/shapiro-wilk.h" #include "math/tukey-hinges.h" #include "math/trimmed-mean.h" @@ -95,7 +96,6 @@ enum #define PLOT_NPPLOT 0x4 #define PLOT_SPREADLEVEL 0x8 - struct examine { struct pool *pool; @@ -181,6 +181,7 @@ struct exploratory_stats struct trimmed_mean *trimmed_mean; struct percentile *quartiles[3]; struct percentile **percentiles; + struct shapiro_wilk *shapiro_wilk; struct tukey_hinges *hinges; @@ -662,6 +663,72 @@ percentiles_report (const struct examine *cmd, int iact_idx) pivot_table_submit (table); } +static void +normality_report (const struct examine *cmd, int iact_idx) +{ + struct pivot_table *table = pivot_table_create (N_("Tests of Normality")); + table->omit_empty = true; + + struct pivot_dimension *test = + pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Shapiro-Wilk"), + N_("Statistic"), + N_("df"), PIVOT_RC_COUNT, + N_("Sig.")); + + test->root->show_label = true; + + const struct interaction *iact = cmd->iacts[iact_idx]; + struct pivot_footnote *missing_footnote = create_missing_footnote (table); + create_interaction_dimensions (table, cmd->cats, iact, missing_footnote); + + struct pivot_dimension *dep_dim = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Dependent Variables")); + + size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes); + + size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + for (size_t v = 0; v < cmd->n_dep_vars; ++v) + { + indexes[table->n_dimensions - 1] = + pivot_category_create_leaf (dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v])); + + for (size_t i = 0; i < n_cats; ++i) + { + indexes[1] = i; + + const struct exploratory_stats *es + = categoricals_get_user_data_by_category_real ( + cmd->cats, iact_idx, i); + + struct shapiro_wilk *sw = es[v].shapiro_wilk; + + if (sw == NULL) + continue; + + double w = shapiro_wilk_calculate (sw); + + int j = 0; + indexes[0] = j; + + pivot_table_put (table, indexes, table->n_dimensions, + pivot_value_new_number (w)); + + indexes[0] = ++j; + pivot_table_put (table, indexes, table->n_dimensions, + pivot_value_new_number (sw->n)); + + indexes[0] = ++j; + pivot_table_put (table, indexes, table->n_dimensions, + pivot_value_new_number (shapiro_wilk_significance (sw->n, w))); + } + } + + free (indexes); + + pivot_table_submit (table); +} + + static void descriptives_report (const struct examine *cmd, int iact_idx) { @@ -1137,6 +1204,7 @@ calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data) es[v].percentiles = pool_calloc (examine->pool, examine->n_percentiles, sizeof (*es[v].percentiles)); es[v].trimmed_mean = trimmed_mean_create (es[v].cc, 0.05); + es[v].shapiro_wilk = NULL; os = xcalloc (n_os, sizeof *os); os[0] = &es[v].trimmed_mean->parent; @@ -1178,6 +1246,23 @@ calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data) EX_WT, EX_VAL); } + if (examine->plot) + { + double mean; + + moments_calculate (es[v].mom, NULL, &mean, NULL, NULL, NULL); + + es[v].shapiro_wilk = shapiro_wilk_create (es[v].non_missing, mean); + + if (es[v].shapiro_wilk) + { + struct order_stats *os = &es[v].shapiro_wilk->parent; + order_stats_accumulate_idx (&os, 1, + casereader_clone (es[v].sorted_reader), + EX_WT, EX_VAL); + } + } + if (examine->plot & PLOT_NPPLOT) { double n, mean, var; @@ -1339,6 +1424,9 @@ run_examine (struct examine *cmd, struct casereader *input) if (cmd->descriptives) descriptives_report (cmd, i); + + if (cmd->plot) + normality_report (cmd, i); } cleanup_exploratory_stats (cmd); diff --git a/src/math/automake.mk b/src/math/automake.mk index 36e1f5e304..36a73530a8 100644 --- a/src/math/automake.mk +++ b/src/math/automake.mk @@ -1,5 +1,5 @@ # PSPP - a program for statistical analysis. -# Copyright (C) 2017 Free Software Foundation, Inc. +# Copyright (C) 2017, 2019 Free Software Foundation, Inc. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -46,6 +46,7 @@ src_math_libpspp_math_la_SOURCES = \ src/math/random.c src/math/random.h \ src/math/statistic.h \ src/math/sort.c src/math/sort.h \ + src/math/shapiro-wilk.c src/math/shapiro-wilk.h \ src/math/trimmed-mean.c src/math/trimmed-mean.h \ src/math/tukey-hinges.c src/math/tukey-hinges.h \ src/math/wilcoxon-sig.c src/math/wilcoxon-sig.h diff --git a/src/math/shapiro-wilk.c b/src/math/shapiro-wilk.c new file mode 100644 index 0000000000..f48156e97c --- /dev/null +++ b/src/math/shapiro-wilk.c @@ -0,0 +1,198 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2019 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "math/shapiro-wilk.h" + +#include +#include +#include +#include "libpspp/cast.h" +#include "libpspp/compiler.h" +#include "libpspp/message.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) +#define N_(msgid) msgid + +/* Return the sum of coeff[i] * x^i for all i in the range [0,order). + It is the caller's responsibility to ensure that coeff points to a + large enough array containing the desired coefficients. */ +static double +polynomial (const double *coeff, int order, double x) +{ + double result = 0; + for (int i = 0; i < order; ++i) + result += coeff[i] * pow (x, i); + + return result; +} + +static double +m_i (struct shapiro_wilk *sw, int i) +{ + assert (i > 0); + assert (i <= sw->n); + double x = (i - 0.375) / (sw->n + 0.25); + + return gsl_cdf_ugaussian_Pinv (x); +} + +static double +a_i (struct shapiro_wilk *sw, int i) +{ + assert (i > 0); + assert (i <= sw->n); + + if (i < sw->n / 2.0) + return -a_i (sw, sw->n - i + 1); + else if (i == sw->n) + return sw->a_n1; + else if (i == sw->n - 1) + return sw->a_n2; + else + return m_i (sw, i) / sqrt (sw->epsilon); +} + +struct ccase; + +static void +acc (struct statistic *s, const struct ccase *cx UNUSED, double c, + double cc, double y) +{ + struct shapiro_wilk *sw = UP_CAST (s, struct shapiro_wilk, parent.parent); + + double int_part, frac_part; + frac_part = modf (c, &int_part); + + if (frac_part != 0 && !sw->warned) + { + msg (MW, N_ ("One or more weight values are non-integer." + " Fractional parts will be ignored when calculating the Shapiro-Wilk statistic.")); + sw->warned = true; + } + + for (int i = 0; i < int_part; ++i) + { + double a_ii = a_i (sw, cc - c + i + 1); + double x_ii = y; + + sw->numerator += a_ii * x_ii; + sw->denominator += pow (x_ii - sw->mean, 2); + } +} + +static void +destroy (struct statistic *s) +{ + struct shapiro_wilk *sw = UP_CAST (s, struct shapiro_wilk, parent.parent); + free (sw); +} + + + +double +shapiro_wilk_calculate (const struct shapiro_wilk *sw) +{ + return sw->numerator * sw->numerator / sw->denominator; +} + +/* Inititialse a SW ready for calculating the statistic for + a dataset of size N. */ +struct shapiro_wilk * +shapiro_wilk_create (int n, double mean) +{ + struct shapiro_wilk *sw = xzalloc (sizeof (*sw)); + struct order_stats *os = &sw->parent; + struct statistic *stat = &os->parent; + + const double c1[] = {0, 0.221157, -0.147981, + -2.071190, 4.434685, -2.706056}; + + const double c2[] = {0, 0.042981, -0.293762, + -1.752461, 5.682633, -3.582633}; + + sw->n = n; + + if (n < 3 || n > 5000) + return NULL; + + const double u = 1.0 / sqrt (sw->n); + + double m = 0; + for (int i = 1; i <= sw->n; ++i) + { + double m_ii = m_i (sw, i); + m += m_ii * m_ii; + } + + double m_n1 = m_i (sw, sw->n); + double m_n2 = m_i (sw, sw->n - 1); + + sw->a_n1 = polynomial (c1, 6, u); + sw->a_n1 += m_n1 / sqrt (m); + + sw->a_n2 = polynomial (c2, 6, u); + sw->a_n2 += m_n2 / sqrt (m); + sw->mean = mean; + + sw->epsilon = m - 2 * pow (m_n1, 2) - 2 * pow (m_n2, 2); + sw->epsilon /= 1 - 2 * pow (sw->a_n1, 2) - 2 * pow (sw->a_n2, 2); + + sw->warned = false; + + stat->accumulate = acc; + stat->destroy = destroy; + + return sw; +} + +double +shapiro_wilk_significance (double n, double w) +{ + const double g[] = {-2.273, 0.459}; + const double c3[] = {.544,-.39978,.025054,-6.714e-4}; + const double c4[] = {1.3822,-.77857,.062767,-.0020322}; + const double c5[] = {-1.5861,-.31082,-.083751,.0038915 }; + const double c6[] = {-.4803,-.082676,.0030302}; + + double m, s; + double y = log (1 - w); + if (n == 3) + { + double pi6 = 6.0 / M_PI; + double stqr = asin(sqrt (3/ 4.0)); + double p = pi6 * (asin (sqrt (w)) - stqr); + return (p < 0) ? 0 : p; + } + else if (n <= 11) + { + double gamma = polynomial (g, 2, n); + y = - log (gamma - y); + m = polynomial (c3, 4, n); + s = exp (polynomial (c4, 4, n)); + } + else + { + double xx = log(n); + m = polynomial (c5, 4, xx); + s = exp (polynomial (c6, 3, xx)); + } + + double p = gsl_cdf_gaussian_Q (y - m, s); + return p; +} diff --git a/src/math/shapiro-wilk.h b/src/math/shapiro-wilk.h new file mode 100644 index 0000000000..9cae3608cf --- /dev/null +++ b/src/math/shapiro-wilk.h @@ -0,0 +1,46 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2019 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef __SHAPIRO_WILK_H__ +#define __SHAPIRO_WILK_H__ + +#include "order-stats.h" + +struct shapiro_wilk +{ + struct order_stats parent; + int n; + double a_n1; + double a_n2; + double epsilon; + + double mean; + double numerator; + double denominator; + + bool warned; +}; + + +struct shapiro_wilk * shapiro_wilk_create (int n, double mean); + +double shapiro_wilk_calculate (const struct shapiro_wilk *sw); + +double shapiro_wilk_significance (double n, double w); + + + +#endif diff --git a/tests/language/stats/examine.at b/tests/language/stats/examine.at index dbd11437fd..86297931d4 100644 --- a/tests/language/stats/examine.at +++ b/tests/language/stats/examine.at @@ -249,7 +249,6 @@ V1,Highest,1,21,20.00 AT_CLEANUP - AT_SETUP([EXAMINE -- extremes with fractional weights]) AT_KEYWORDS([categorical categoricals]) AT_DATA([extreme.sps], [dnl @@ -734,8 +733,8 @@ examine x . ]) -AT_CHECK([pspp -O format=csv examine-id.sps], [0], -[Table: Case Processing Summary +AT_CHECK([pspp -O format=csv examine-id.sps], [0], [dnl +Table: Case Processing Summary ,Cases,,,,, ,Valid,,Missing,,Total, ,N,Percent,N,Percent,N,Percent @@ -753,6 +752,11 @@ x,Highest,1,threehundred,300.00 ,,3,three,3.00 ,,4,four,4.00 ,,5,five,5.00 + +Table: Tests of Normality +,Shapiro-Wilk,, +,Statistic,df,Sig. +x,.37,14,.00 ]) AT_CLEANUP @@ -1251,6 +1255,7 @@ Weight in kilograms ,Highest,1,13,92.1 AT_CLEANUP + AT_SETUP([EXAMINE -- Crash on unrepresentable graphs]) AT_DATA([examine.sps], [dnl data list notable list /x * g *. @@ -1265,3 +1270,102 @@ examine x by g dnl This bug only manifested itself on cairo based drivers. AT_CHECK([pspp -O format=pdf examine.sps], [1], [ignore]) AT_CLEANUP + + +dnl This example comes from the web site: +dnl https://www.spsstests.com/2018/11/shapiro-wilk-normality-test-spss.html +AT_SETUP([EXAMINE -- shapiro-wilk 1]) +AT_KEYWORDS([shapiro wilk]) +AT_DATA([shapiro-wilk.sps], [dnl +data list notable list /x * g *. +begin data. +96 1 +98 1 +95 1 +89 1 +90 1 +92 1 +94 1 +93 1 +97 1 +100 1 +99 2 +96 2 +80 2 +89 2 +91 2 +92 2 +93 2 +94 2 +99 2 +80 2 +end data. + +set format F22.3. + +examine x by g + /nototal + /plot = all. +]) + +AT_CHECK([pspp -O format=csv shapiro-wilk.sps], [0],[dnl +Table: Case Processing Summary +,,Cases,,,,, +,,Valid,,Missing,,Total, +,g,N,Percent,N,Percent,N,Percent +x,1.00,10,100.0%,0,.0%,10,100.0% +,2.00,10,100.0%,0,.0%,10,100.0% + +Table: Tests of Normality +,,Shapiro-Wilk,, +,g,Statistic,df,Sig. +x,1.00,.984,10,.983 +,2.00,.882,10,.136 +]) + +AT_CLEANUP + + +dnl This example comes from the web site: +dnl http://www.real-statistics.com/tests-normality-and-symmetry/statistical-tests-normality-symmetry/shapiro-wilk-expanded-test/ +dnl It uses a dataset larger than 11 samples. Hence the alternative method for +dnl signficance is used. +AT_SETUP([EXAMINE -- shapiro-wilk 2]) +AT_KEYWORDS([shapiro wilk]) +AT_DATA([shapiro-wilk2.sps], [dnl +data list notable list /x *. +begin data. +65 +61 +63 +86 +70 +55 +74 +35 +72 +68 +45 +58 +end data. + +set format F22.3. + +examine x + /plot = boxplot. +]) + +AT_CHECK([pspp -O format=csv shapiro-wilk2.sps], [0],[dnl +Table: Case Processing Summary +,Cases,,,,, +,Valid,,Missing,,Total, +,N,Percent,N,Percent,N,Percent +x,12,100.0%,0,.0%,12,100.0% + +Table: Tests of Normality +,Shapiro-Wilk,, +,Statistic,df,Sig. +x,.971,12,.922 +]) + +AT_CLEANUP