From 7acdff072470c7e87ac9ba7c2f135e4c08fd0e02 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Fri, 1 Jul 2011 18:21:13 +0200 Subject: [PATCH] Implemented the McNemar test. Closes bug #33242 --- NEWS | 5 +- doc/statistics.texi | 28 ++++ src/language/stats/automake.mk | 2 + src/language/stats/mcnemar.c | 295 +++++++++++++++++++++++++++++++++ src/language/stats/mcnemar.h | 35 ++++ src/language/stats/npar.c | 49 +++++- tests/language/stats/npar.at | 55 ++++++ 7 files changed, 465 insertions(+), 4 deletions(-) create mode 100644 src/language/stats/mcnemar.c create mode 100644 src/language/stats/mcnemar.h diff --git a/NEWS b/NEWS index 57d2d7b5..bd228d91 100644 --- a/NEWS +++ b/NEWS @@ -44,8 +44,9 @@ Changes from 0.6.2 to 0.7.8: - MISSING VALUES can now assign missing values to long string variables. - - NPAR TESTS has new KRUSKAL-WALLIS, SIGN, WILCOXON, and RUNS - subcommands. + - The following new subcommands to NPAR TESTS have been implemented: + COCHRAN, FRIEDMAN, KENDALL, KRUSKAL-WALLIS, MANN-WHITNEY, MCNEMAR, + SIGN, WILCOXON, and RUNS - SET and SHOW no longer have ENDCMD, NULLINE, PROMPT, CPROMPT, and DPROMPT subcommands. The defaults are now fixed values. diff --git a/doc/statistics.texi b/doc/statistics.texi index e5e299a4..18fa79ac 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -686,6 +686,7 @@ is used. * KENDALL:: Kendall's W Test * KRUSKAL-WALLIS:: Kruskal-Wallis Test * MANN-WHITNEY:: Mann Whitney U Test +* MCNEMAR:: McNemar Test * RUNS:: Runs Test * SIGN:: The Sign Test * WILCOXON:: Wilcoxon Signed Ranks Test @@ -859,6 +860,33 @@ Cases for which the @var{var} value is neither @var{group1} or @var{group2} will The value of the Mann-Whitney U statistic, the Wilcoxon W, and the significance will be printed. The abbreviated subcommand M-W may be used in place of MANN-WHITNEY. +@node MCNEMAR +@subsection McNemar Test +@vindex MCNEMAR +@cindex McNemar test + +@display + [ /MCNEMAR varlist [ WITH varlist [ (PAIRED) ]]] +@end display + +Use McNemar's test to analyse the significance of the difference between +pairs of correlated proportions. + +If the @code{WITH} keyword is omitted, then tests for all +combinations of the listed variables are performed. +If the @code{WITH} keyword is given, and the @code{(PAIRED)} keyword +is also given, then the number of variables preceding @code{WITH} +must be the same as the number following it. +In this case, tests for each respective pair of variables are +performed. +If the @code{WITH} keyword is given, but the +@code{(PAIRED)} keyword is omitted, then tests for each combination +of variable preceding @code{WITH} against variable following +@code{WITH} are performed. + +The data in each variable must be dichotomous. If there are more +than two distinct variables an error will occur and the test will +not be run. @node RUNS @subsection Runs Test diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index f7fd6373..9beacb5d 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -32,6 +32,8 @@ language_stats_sources = \ src/language/stats/kruskal-wallis.h \ src/language/stats/mann-whitney.c \ src/language/stats/mann-whitney.h \ + src/language/stats/mcnemar.c \ + src/language/stats/mcnemar.h \ src/language/stats/npar.c \ src/language/stats/npar.h \ src/language/stats/npar-summary.c \ diff --git a/src/language/stats/mcnemar.c b/src/language/stats/mcnemar.c new file mode 100644 index 00000000..53e9a52f --- /dev/null +++ b/src/language/stats/mcnemar.c @@ -0,0 +1,295 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "mcnemar.h" + +#include +#include + +#include "data/casereader.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/missing-values.h" +#include "data/variable.h" +#include "language/stats/npar.h" +#include "libpspp/str.h" +#include "output/tab.h" +#include "libpspp/message.h" + +#include "gl/minmax.h" +#include "gl/xalloc.h" + +#include "data/value.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) + + +struct mcnemar +{ + union value val0; + union value val1; + + double n00; + double n01; + double n10; + double n11; +}; + +static void +output_freq_table (variable_pair *vp, + const struct mcnemar *param, + const struct dictionary *dict); + + +static void +output_statistics_table (const struct two_sample_test *t2s, + const struct mcnemar *param, + const struct dictionary *dict); + + +void +mcnemar_execute (const struct dataset *ds, + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact UNUSED, + double timer UNUSED) +{ + int i; + bool warn = true; + + const struct dictionary *dict = dataset_dict (ds); + + const struct two_sample_test *t2s = UP_CAST (test, const struct two_sample_test, parent); + struct ccase *c; + + struct casereader *r = input; + + struct mcnemar *mc = xcalloc (sizeof *mc, t2s->n_pairs); + + for (i = 0 ; i < t2s->n_pairs; ++i ) + { + mc[i].val0.f = mc[i].val1.f = SYSMIS; + } + + for (; (c = casereader_read (r)) != NULL; case_unref (c)) + { + const double weight = dict_get_case_weight (dict, c, &warn); + + for (i = 0 ; i < t2s->n_pairs; ++i ) + { + variable_pair *vp = &t2s->pairs[i]; + const union value *value0 = case_data (c, (*vp)[0]); + const union value *value1 = case_data (c, (*vp)[1]); + + if (var_is_value_missing ((*vp)[0], value0, exclude)) + continue; + + if (var_is_value_missing ((*vp)[1], value1, exclude)) + continue; + + + if ( mc[i].val0.f == SYSMIS) + { + if (mc[i].val1.f != value0->f) + mc[i].val0.f = value0->f; + else if (mc[i].val1.f != value1->f) + mc[i].val0.f = value1->f; + } + + if ( mc[i].val1.f == SYSMIS) + { + if (mc[i].val0.f != value1->f) + mc[i].val1.f = value1->f; + else if (mc[i].val0.f != value0->f) + mc[i].val1.f = value0->f; + } + + if (mc[i].val0.f == value0->f && mc[i].val0.f == value1->f) + { + mc[i].n00 += weight; + } + else if ( mc[i].val0.f == value0->f && mc[i].val1.f == value1->f) + { + mc[i].n10 += weight; + } + else if ( mc[i].val1.f == value0->f && mc[i].val0.f == value1->f) + { + mc[i].n01 += weight; + } + else if ( mc[i].val1.f == value0->f && mc[i].val1.f == value1->f) + { + mc[i].n11 += weight; + } + else + { + msg (ME, _("The McNemar test is appropriate only for dichotomous variables")); + } + } + } + + casereader_destroy (r); + + for (i = 0 ; i < t2s->n_pairs ; ++i) + output_freq_table (&t2s->pairs[i], mc + i, dict); + + output_statistics_table (t2s, mc, dict); + + free (mc); +} + + +static void +output_freq_table (variable_pair *vp, + const struct mcnemar *param, + const struct dictionary *dict) +{ + const int header_rows = 2; + const int header_cols = 1; + + struct tab_table *table = tab_create (header_cols + 2, header_rows + 2); + + const struct variable *wv = dict_get_weight (dict); + const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : & F_8_0; + + + struct string pair_name; + struct string val1str ; + struct string val0str ; + + ds_init_empty (&val0str); + ds_init_empty (&val1str); + + var_append_value_name ((*vp)[0], ¶m->val0, &val0str); + var_append_value_name ((*vp)[1], ¶m->val1, &val1str); + + ds_init_cstr (&pair_name, var_to_string ((*vp)[0])); + ds_put_cstr (&pair_name, " & "); + ds_put_cstr (&pair_name, var_to_string ((*vp)[1])); + + tab_title (table, ds_cstr (&pair_name)); + + ds_destroy (&pair_name); + + tab_headers (table, header_cols, 0, header_rows, 0); + + /* Vertical lines inside the box */ + tab_box (table, 0, 0, -1, TAL_1, + 1, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + /* Box around entire table */ + tab_box (table, TAL_2, TAL_2, -1, -1, + 0, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + tab_vline (table, TAL_2, header_cols, 0, tab_nr (table) - 1); + tab_hline (table, TAL_2, 0, tab_nc (table) - 1, header_rows); + + tab_text (table, 0, 0, TAB_CENTER, var_to_string ((*vp)[0])); + + tab_joint_text (table, 1, 0, 2, 0, TAB_CENTER, var_to_string ((*vp)[1])); + tab_hline (table, TAL_1, 1, tab_nc (table) - 1, 1); + + + tab_text (table, 0, header_rows + 0, TAB_LEFT, ds_cstr (&val0str)); + tab_text (table, 0, header_rows + 1, TAB_LEFT, ds_cstr (&val1str)); + + tab_text (table, header_cols + 0, 1, TAB_LEFT, ds_cstr (&val0str)); + tab_text (table, header_cols + 1, 1, TAB_LEFT, ds_cstr (&val1str)); + + tab_double (table, header_cols + 0, header_rows + 0, TAB_RIGHT, param->n00, wfmt); + tab_double (table, header_cols + 0, header_rows + 1, TAB_RIGHT, param->n01, wfmt); + tab_double (table, header_cols + 1, header_rows + 0, TAB_RIGHT, param->n10, wfmt); + tab_double (table, header_cols + 1, header_rows + 1, TAB_RIGHT, param->n11, wfmt); + + tab_submit (table); + + ds_destroy (&val0str); + ds_destroy (&val1str); +} + + +static void +output_statistics_table (const struct two_sample_test *t2s, + const struct mcnemar *mc, + const struct dictionary *dict) +{ + int i; + + struct tab_table *table = tab_create (5, t2s->n_pairs + 1); + + const struct variable *wv = dict_get_weight (dict); + const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : & F_8_0; + + tab_title (table, _("Test Statistics")); + + tab_headers (table, 0, 1, 0, 1); + + tab_hline (table, TAL_2, 0, tab_nc (table) - 1, 1); + tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1); + + + /* Vertical lines inside the box */ + tab_box (table, -1, -1, -1, TAL_1, + 0, 0, + tab_nc (table) - 1, tab_nr (table) - 1); + + /* Box around entire table */ + tab_box (table, TAL_2, TAL_2, -1, -1, + 0, 0, tab_nc (table) - 1, + tab_nr (table) - 1); + + tab_text (table, 1, 0, TAT_TITLE | TAB_CENTER, + _("N")); + + tab_text (table, 2, 0, TAT_TITLE | TAB_CENTER, + _("Exact Sig. (2-tailed)")); + + tab_text (table, 3, 0, TAT_TITLE | TAB_CENTER, + _("Exact Sig. (1-tailed)")); + + tab_text (table, 4, 0, TAT_TITLE | TAB_CENTER, + _("Point Probability")); + + for (i = 0 ; i < t2s->n_pairs; ++i) + { + double sig; + variable_pair *vp = &t2s->pairs[i]; + + struct string pair_name; + ds_init_cstr (&pair_name, var_to_string ((*vp)[0])); + ds_put_cstr (&pair_name, " & "); + ds_put_cstr (&pair_name, var_to_string ((*vp)[1])); + + tab_text (table, 0, 1 + i, TAB_LEFT, ds_cstr (&pair_name)); + ds_destroy (&pair_name); + + tab_double (table, 1, 1 + i, TAB_RIGHT, mc[i].n00 + mc[i].n01 + mc[i].n10 + mc[i].n11, wfmt); + + sig = gsl_cdf_binomial_P (mc[i].n01, 0.5, mc[i].n01 + mc[i].n10); + + tab_double (table, 2, 1 + i, TAB_RIGHT, 2 * sig, NULL); + tab_double (table, 3, 1 + i, TAB_RIGHT, sig, NULL); + + tab_double (table, 4, 1 + i, TAB_RIGHT, gsl_ran_binomial_pdf (mc[i].n01, 0.5, mc[i].n01 + mc[i].n10), + NULL); + } + + tab_submit (table); +} diff --git a/src/language/stats/mcnemar.h b/src/language/stats/mcnemar.h new file mode 100644 index 00000000..41724a3a --- /dev/null +++ b/src/language/stats/mcnemar.h @@ -0,0 +1,35 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#if !mcnemar_h +#define mcnemar_h 1 + + +#include +#include "data/missing-values.h" + +struct casereader; +struct dataset; +struct npar_test; + +void mcnemar_execute (const struct dataset *ds, + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact, + double timer); + +#endif diff --git a/src/language/stats/npar.c b/src/language/stats/npar.c index d34ec119..ce0c3d6d 100644 --- a/src/language/stats/npar.c +++ b/src/language/stats/npar.c @@ -38,6 +38,7 @@ #include "language/stats/friedman.h" #include "language/stats/kruskal-wallis.h" #include "language/stats/mann-whitney.h" +#include "language/stats/mcnemar.h" #include "language/stats/npar-summary.h" #include "language/stats/runs.h" #include "language/stats/sign.h" @@ -89,6 +90,7 @@ struct cmd_npar_tests int kendall; int kruskal_wallis; int mann_whitney; + int mcnemar; int missing; int method; int statistics; @@ -132,6 +134,7 @@ static int npar_wilcoxon (struct lexer *, struct dataset *, struct npar_specs *) static int npar_sign (struct lexer *, struct dataset *, struct npar_specs *); static int npar_kruskal_wallis (struct lexer *, struct dataset *, struct npar_specs *); static int npar_mann_whitney (struct lexer *, struct dataset *, struct npar_specs *); +static int npar_mcnemar (struct lexer *, struct dataset *, struct npar_specs *); static int npar_method (struct lexer *, struct npar_specs *); /* Command parsing functions. */ @@ -148,6 +151,7 @@ parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests npt->friedman = 0; npt->kruskal_wallis = 0; npt->mann_whitney = 0; + npt->mcnemar = 0; npt->runs = 0; npt->sign = 0; npt->wilcoxon = 0; @@ -276,6 +280,23 @@ parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests NOT_REACHED (); } } + else if (lex_match_phrase (lexer, "MCNEMAR")) + { + lex_match (lexer, T_EQUALS); + npt->mcnemar++; + switch (npar_mcnemar (lexer, ds, nps)) + { + case 0: + goto lossage; + case 1: + break; + case 2: + lex_error (lexer, NULL); + goto lossage; + default: + NOT_REACHED (); + } + } else if (lex_match_phrase (lexer, "M-W") || lex_match_phrase (lexer, "MANN-WHITNEY")) { @@ -1045,8 +1066,6 @@ npar_wilcoxon (struct lexer *lexer, struct dataset *ds, struct npar_specs *specs ) { - - struct two_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp)); struct npar_test *nt = &tp->parent; nt->execute = wilcoxon_execute; @@ -1090,6 +1109,8 @@ npar_mann_whitney (struct lexer *lexer, } + + static int npar_sign (struct lexer *lexer, struct dataset *ds, struct npar_specs *specs) @@ -1112,6 +1133,30 @@ npar_sign (struct lexer *lexer, struct dataset *ds, return 1; } + +static int +npar_mcnemar (struct lexer *lexer, struct dataset *ds, + struct npar_specs *specs) +{ + struct two_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp)); + struct npar_test *nt = &tp->parent; + + nt->execute = mcnemar_execute; + + if (!parse_two_sample_related_test (lexer, dataset_dict (ds), + tp, specs->pool) ) + return 0; + + specs->n_tests++; + specs->test = pool_realloc (specs->pool, + specs->test, + sizeof (*specs->test) * specs->n_tests); + specs->test[specs->n_tests - 1] = nt; + + return 1; +} + + static int npar_kruskal_wallis (struct lexer *lexer, struct dataset *ds, struct npar_specs *specs) diff --git a/tests/language/stats/npar.at b/tests/language/stats/npar.at index e9c118d8..e7ddc78b 100644 --- a/tests/language/stats/npar.at +++ b/tests/language/stats/npar.at @@ -1093,3 +1093,58 @@ Asymp. Sig.,.001 ]) AT_CLEANUP + + + +AT_SETUP([NPAR TESTS McNemar]) + +AT_DATA([mcnemar.sps], [dnl +set format = F12.3. +data list notable list /v1 * v2 * junk *. +begin data. +0 0 0 +0 0 0 +0 0 0 +0 0 0 +0 1 0 +0 1 0 +0 1 0 +0 1 0 +0 1 1 +0 1 1 +0 1 1 +0 1 1 +0 1 1 +1 0 1 +1 0 1 +1 1 1 +1 1 1 +1 1 0 +1 1 0 +1 1 1 +end data. + +npar tests + /mcnemar = v1 WITH v2 junk. +]) + +AT_CHECK([pspp -O format=csv mcnemar.sps], [0], [dnl +Table: v1 & v2 +v1,v2, +,.000,1.000 +.000,4,9 +1.000,2,5 + +Table: v1 & junk +v1,junk, +,.000,1.000 +.000,8,5 +1.000,2,5 + +Table: Test Statistics +,N,Exact Sig. (2-tailed),Exact Sig. (1-tailed),Point Probability +v1 & v2,20,.065,.033,.027 +v1 & junk,20,.453,.227,.164 +]) + +AT_CLEANUP -- 2.30.2