From 1f247f9a51625cdaae7e48b5f723310f05ff6627 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Mon, 25 Oct 2010 20:40:14 +0200 Subject: [PATCH] First attempt at Mann-Whitney U test. Currently only asymptotic significance is calculated. --- src/language/stats/automake.mk | 1 + src/language/stats/mann-whitney.c | 288 ++++++++++++++++++++++++++++++ src/language/stats/mann-whitney.h | 41 +++++ src/language/stats/npar.c | 58 +++++- 4 files changed, 383 insertions(+), 5 deletions(-) create mode 100644 src/language/stats/mann-whitney.c create mode 100644 src/language/stats/mann-whitney.h diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 55952622..32a5c228 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -23,6 +23,7 @@ language_stats_sources = \ src/language/stats/friedman.c src/language/stats/friedman.h \ src/language/stats/glm.c \ src/language/stats/kruskal-wallis.c src/language/stats/kruskal-wallis.h \ + src/language/stats/mann-whitney.c src/language/stats/mann-whitney.h \ src/language/stats/npar.c src/language/stats/npar.h \ src/language/stats/npar-summary.c src/language/stats/npar-summary.h \ src/language/stats/oneway.c \ diff --git a/src/language/stats/mann-whitney.c b/src/language/stats/mann-whitney.c new file mode 100644 index 00000000..b837177b --- /dev/null +++ b/src/language/stats/mann-whitney.c @@ -0,0 +1,288 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "mann-whitney.h" + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* Calculates the adjustment necessary for tie compensation: each call adds (T^3 - T) / 12 to the double that AUX points to. mann_whitney_execute passes this function, together with the address of its tiebreaker accumulator, to casereader_create_append_rank. NOTE(review): T is presumably the number of cases sharing one distinct value; confirm against the rank reader. */ +static void +distinct_callback (double v UNUSED, casenumber t, double w UNUSED, void *aux) +{ + double *tiebreaker = aux; + + *tiebreaker += (pow3 (t) - t) / 12.0; +} +/* Results for one test variable. Index 0 is the group whose grouping value equals nst->val1 and index 1 the group equal to nst->val2, as assigned in mann_whitney_execute. */ +struct mw +{ + double rank_sum[2]; /* Sum of the ranks of the cases in each group */ + double n[2]; /* Sum of the case weights in each group */ + + + double u; /* The Mann-Whitney U statistic */ + double w; /* The Wilcoxon Rank Sum W statistic */ + double z; /* Tie-adjusted standardised U used for the asymptotic significance */ +}; + +static void show_ranks_box (const struct n_sample_test *nst, const struct mw *mw); +static void show_statistics_box (const struct n_sample_test *nst, const struct mw *mw, bool exact); + +/* Runs the Mann-Whitney U test described by TEST on the cases read from INPUT, once for each variable in nst->vars, then submits the Ranks and Test Statistics tables. EXCLUDE is the class of missing values that causes a case to be skipped. EXACT is only forwarded to show_statistics_box. TIMER is not read. INPUT is destroyed before returning. NOTE(review): the rank reader is built from an unfiltered clone of INPUT, so cases that are skipped in the loop below (missing value, or in neither group) still appear to take part in the ranking and the tie adjustment; confirm whether that is intended. */ +void +mann_whitney_execute (const struct dataset *ds, + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact, + double timer) +{ + int i; + const struct dictionary *dict = dataset_dict (ds); + const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent); + + const struct caseproto *proto = casereader_get_proto (input); + size_t rank_idx = caseproto_get_n_widths (proto); /* index from which the rank is read in each case returned by the rank reader */ + + struct mw *mw = xcalloc (nst->n_vars, sizeof *mw); + + for (i = 0; i < nst->n_vars; ++i) + { + double tiebreaker = 0.0; + bool warn = true; + enum rank_error rerr = 0; + struct casereader *rr; + struct ccase *c; + const struct variable *var = nst->vars[i]; + + struct casereader *reader = + sort_execute_1var (casereader_clone (input), var); + + rr = casereader_create_append_rank (reader, var, + dict_get_weight (dict), + &rerr, + distinct_callback, &tiebreaker); + + for (; (c = casereader_read (rr)); case_unref (c)) + { + const union value *val = case_data (c, var); + const union value *group = case_data (c, nst->indep_var); + const 
size_t group_var_width = var_get_width (nst->indep_var); + const double rank = case_data_idx (c, rank_idx)->f; + + if ( var_is_value_missing (var, val, exclude)) + continue; +/* A case whose grouping value equals neither val1 nor val2 adds nothing to either group */ + if ( value_equal (group, &nst->val1, group_var_width)) + { + mw[i].rank_sum[0] += rank; + mw[i].n[0] += dict_get_case_weight (dict, c, &warn); + } + else if ( value_equal (group, &nst->val2, group_var_width)) + { + mw[i].rank_sum[1] += rank; + mw[i].n[1] += dict_get_case_weight (dict, c, &warn); + } + } + casereader_destroy (rr); +/* U starts as n0*n1 + n0*(n0+1)/2 - R0 and is replaced by n0*n1 - U when it exceeds n0*n1/2; W is R0, or R1 when U was replaced */ + { + double n; + double denominator; + struct mw *mwv = &mw[i]; + + mwv->u = mwv->n[0] * mwv->n[1] ; + mwv->u += mwv->n[0] * (mwv->n[0] + 1) / 2.0; + mwv->u -= mwv->rank_sum[0]; + + mwv->w = mwv->rank_sum[0]; + if ( mwv->u > mwv->n[0] * mwv->n[1] / 2.0) + { + mwv->u = mwv->n[0] * mwv->n[1] - mwv->u; + mwv->w = mwv->rank_sum[1]; + } + mwv->z = mwv->u - mwv->n[0] * mwv->n[1] / 2.0; /* numerator of z; never positive because U was folded to at most n0*n1/2 above */ + n = mwv->n[0] + mwv->n[1]; + denominator = pow3(n) - n; /* the next lines build ((n^3 - n)/12 - tiebreaker) * n0*n1 / (n*(n-1)) step by step */ + denominator /= 12; + denominator -= tiebreaker; + denominator *= mwv->n[0] * mwv->n[1]; + denominator /= n * (n - 1); + + mwv->z /= sqrt (denominator); + } + } + casereader_destroy (input); /* INPUT itself is destroyed here; only clones of it were read above */ + + show_ranks_box (nst, mw); + show_statistics_box (nst, mw, exact); + + free (mw); +} + + + +#include +#include "gettext.h" +#define _(msgid) gettext (msgid) +/* Builds and submits the Ranks table: one row per variable in nst->vars showing N for each group and their total, the mean rank of each group, and the sum of ranks of each group, all taken from the MWV array. */ +static void +show_ranks_box (const struct n_sample_test *nst, const struct mw *mwv) +{ + int i; + const int row_headers = 1; + const int column_headers = 2; + struct tab_table *table = + tab_create (row_headers + 7, column_headers + nst->n_vars); + + tab_headers (table, row_headers, 0, column_headers, 0); + + tab_title (table, _("Ranks")); + + /* Vertical lines inside the box */ + tab_box (table, 1, 0, -1, TAL_1, + row_headers, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + /* Box around the table */ + tab_box (table, TAL_2, TAL_2, -1, -1, + 0, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers); + tab_vline (table, 
TAL_2, row_headers, 0, tab_nr (table) - 1); + + tab_hline (table, TAL_1, row_headers, tab_nc (table) -1, 1); +/* Heading cells: columns 1 to 3 sit under N, columns 4 and 5 under Mean Rank, columns 6 and 7 under Sum of Ranks */ + tab_text (table, 1, 1, TAT_TITLE | TAB_CENTER, _("group1")); + tab_text (table, 2, 1, TAT_TITLE | TAB_CENTER, _("group2")); + tab_text (table, 3, 1, TAT_TITLE | TAB_CENTER, _("Total")); + tab_joint_text (table, 1, 0, 3, 0, + TAT_TITLE | TAB_CENTER, _("N")); + tab_vline (table, TAL_2, 4, 0, tab_nr (table) - 1); + + tab_text (table, 4, 1, TAT_TITLE | TAB_CENTER, _("group1")); + tab_text (table, 5, 1, TAT_TITLE | TAB_CENTER, _("group2")); + tab_joint_text (table, 4, 0, 5, 0, + TAT_TITLE | TAB_CENTER, _("Mean Rank")); + tab_vline (table, TAL_2, 6, 0, tab_nr (table) - 1); + + tab_text (table, 6, 1, TAT_TITLE | TAB_CENTER, _("group1")); + tab_text (table, 7, 1, TAT_TITLE | TAB_CENTER, _("group2")); + tab_joint_text (table, 6, 0, 7, 0, + TAT_TITLE | TAB_CENTER, _("Sum of Ranks")); + + for (i = 0 ; i < nst->n_vars ; ++i) + { /* one table row per test variable, placed below the column_headers heading rows */ + const struct mw *mw = &mwv[i]; + tab_text (table, 0, column_headers + i, TAT_TITLE, + var_to_string (nst->vars[i])); + + tab_double (table, 1, column_headers + i, 0, + mw->n[0], 0); + + tab_double (table, 2, column_headers + i, 0, + mw->n[1], 0); + + tab_double (table, 3, column_headers + i, 0, + mw->n[1] + mw->n[0], 0); + + /* Mean Ranks. NOTE(review): each division uses the group N with no check for an empty group. */ + tab_double (table, 4, column_headers + i, 0, + mw->rank_sum[0] / mw->n[0], 0); + + tab_double (table, 5, column_headers + i, 0, + mw->rank_sum[1] / mw->n[1], 0); + + /* Sum of Ranks */ + tab_double (table, 6, column_headers + i, 0, + mw->rank_sum[0], 0); + + tab_double (table, 7, column_headers + i, 0, + mw->rank_sum[1], 0); + } + + tab_submit (table); +} +/* Builds and submits the Test Statistics table: one row per variable in nst->vars showing the U, W and Z values from MWV and an asymptotic two-tailed significance computed from Z. When EXACT is true the table is created two columns wider and those columns receive headings, but nothing in this function writes values into them. */ +static void +show_statistics_box (const struct n_sample_test *nst, const struct mw *mwv, bool exact) +{ + int i; + const int row_headers = 1; + const int column_headers = 1; + struct tab_table *table = + tab_create (row_headers + (exact ? 
6 : 4), column_headers + nst->n_vars); + + tab_headers (table, row_headers, 0, column_headers, 0); + + tab_title (table, _("Test Statistics")); + + /* Vertical lines inside the box */ + tab_box (table, 1, 0, -1, TAL_1, + row_headers, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + /* Box around the table */ + tab_box (table, TAL_2, TAL_2, -1, -1, + 0, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers); + tab_vline (table, TAL_2, row_headers, 0, tab_nr (table) - 1); + + tab_text (table, 1, 0, TAT_TITLE | TAB_CENTER, _("Mann-Whitney U")); + tab_text (table, 2, 0, TAT_TITLE | TAB_CENTER, _("Wilcoxon W")); + tab_text (table, 3, 0, TAT_TITLE | TAB_CENTER, _("Z")); + tab_text (table, 4, 0, TAT_TITLE | TAB_CENTER, _("Asymp. Sig. (2-tailed)")); + + if (exact) + { /* headings only; the loop below writes columns 1 to 4 */ + tab_text (table, 5, 0, TAT_TITLE | TAB_CENTER, _("Exact Sig. (2-tailed)")); + tab_text (table, 6, 0, TAT_TITLE | TAB_CENTER, _("Point Probability")); + } + + for (i = 0 ; i < nst->n_vars ; ++i) + { + const struct mw *mw = &mwv[i]; + + tab_text (table, 0, column_headers + i, TAT_TITLE, + var_to_string (nst->vars[i])); + + tab_double (table, 1, column_headers + i, 0, + mw->u, 0); + + tab_double (table, 2, column_headers + i, 0, + mw->w, 0); + + tab_double (table, 3, column_headers + i, 0, + mw->z, 0); + + tab_double (table, 4, column_headers + i, 0, + 2.0 * gsl_cdf_ugaussian_P (mw->z), 0); /* twice the unit Gaussian lower-tail probability at z */ + } + + tab_submit (table); +} diff --git a/src/language/stats/mann-whitney.h b/src/language/stats/mann-whitney.h new file mode 100644 index 00000000..00da9f87 --- /dev/null +++ b/src/language/stats/mann-whitney.h @@ -0,0 +1,41 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010 Free Software Foundation, Inc. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#if !mann_whitney_h +#define mann_whitney_h 1 + +#include +#include +#include + +/* Test descriptor whose only member is a struct two_sample_test. NOTE(review): elsewhere in this patch mann_whitney_execute and npar_mann_whitney work with struct n_sample_test rather than this type; confirm it is still needed. */ +struct mann_whitney_test +{ + struct two_sample_test parent; +}; + +struct casereader; +struct dataset; +/* Runs the Mann-Whitney U test; npar_mann_whitney in npar.c installs it as the execute member of the test. The definition in mann-whitney.c destroys INPUT and does not read TIMER. */ +void mann_whitney_execute (const struct dataset *ds, + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact, + double timer + ); + +#endif diff --git a/src/language/stats/npar.c b/src/language/stats/npar.c index a9eba5dd..053f021d 100644 --- a/src/language/stats/npar.c +++ b/src/language/stats/npar.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,7 @@ struct cmd_npar_tests int runs; int friedman; int kruskal_wallis; + int mann_whitney; /* incremented in parse_npar_tests each time the M-W or MANN-WHITNEY subcommand is matched */ int missing; int method; int statistics; @@ -124,6 +126,7 @@ static int npar_friedman (struct lexer *, struct dataset *, struct npar_specs *) static int npar_wilcoxon (struct lexer *, struct dataset *, struct npar_specs *); static int npar_sign (struct lexer *, struct dataset *, struct npar_specs *); static int npar_kruskal_wallis (struct lexer *, struct dataset *, struct npar_specs *); +static int npar_mann_whitney (struct lexer *, struct dataset *, struct npar_specs *); static int npar_method (struct lexer *, struct npar_specs *); /* Command parsing functions. 
*/ @@ -134,12 +137,14 @@ static int parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests *npt, struct npar_specs *nps) { - npt->chisquare = 0; npt->binomial = 0; - npt->wilcoxon = 0; - npt->runs = 0; + npt->chisquare = 0; npt->friedman = 0; + npt->kruskal_wallis = 0; + npt->mann_whitney = 0; + npt->runs = 0; npt->sign = 0; + npt->wilcoxon = 0; npt->missing = 0; npt->miss = MISS_ANALYSIS; npt->method = 0; @@ -231,6 +236,24 @@ parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests NOT_REACHED (); } } + else if (lex_match_hyphenated_word (lexer, "M-W") || + lex_match_hyphenated_word (lexer, "MANN-WHITNEY")) + { + lex_match (lexer, '='); /* an equals sign after the keyword is matched without checking the result */ + npt->mann_whitney++; + switch (npar_mann_whitney (lexer, ds, nps)) + { + case 0: /* npar_mann_whitney returns 0 when its parse step fails */ + goto lossage; + case 1: + break; + case 2: /* NOTE(review): npar_mann_whitney as added by this patch only returns 0 or 1 */ + lex_error (lexer, NULL); + goto lossage; + default: + NOT_REACHED (); + } + } else if (lex_match_hyphenated_word (lexer, "WILCOXON")) { lex_match (lexer, '='); @@ -911,8 +934,7 @@ parse_n_sample_related_test (struct lexer *lexer, return false; } - if ( ! lex_force_match (lexer, ',')) - return false; + lex_match (lexer, ','); /* the comma before the second value is no longer required: a missing one does not make the parse fail */ value_init (&nst->val2, var_get_width (nst->indep_var)); if ( ! 
parse_value (lexer, &nst->val2, var_get_width (nst->indep_var))) @@ -951,6 +973,32 @@ npar_wilcoxon (struct lexer *lexer, return 1; } +/* Parses a Mann-Whitney specification into a struct n_sample_test allocated from SPECS->pool, sets its insert_variables and execute members, and appends it to SPECS->test. Returns 1 on success, or 0 if parse_n_sample_related_test fails, in which case SPECS->test and SPECS->n_tests are left unchanged. */ +static int +npar_mann_whitney (struct lexer *lexer, + struct dataset *ds, + struct npar_specs *specs ) +{ + struct n_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp)); + struct npar_test *nt = &tp->parent; + + nt->insert_variables = n_sample_insert_variables; + nt->execute = mann_whitney_execute; + + if (!parse_n_sample_related_test (lexer, dataset_dict (ds), + tp, specs->pool) ) + return 0; + + specs->n_tests++; /* grow the test array by one slot and store the new test in the last position */ + specs->test = pool_realloc (specs->pool, + specs->test, + sizeof (*specs->test) * specs->n_tests); + specs->test[specs->n_tests - 1] = nt; + + return 1; +} + + static int npar_sign (struct lexer *lexer, struct dataset *ds, struct npar_specs *specs) -- 2.30.2