pintos-os.org Git - pspp/blob - src/language/stats/wilcoxon.c

   1 /* Pspp - a program for statistical analysis.
   2    Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17
  18
  19 #include <config.h>
  20
  21 #include "language/stats/wilcoxon.h"
  22
  23 #include <gsl/gsl_cdf.h>
  24 #include <math.h>
  25
  26 #include "data/casereader.h"
  27 #include "data/casewriter.h"
  28 #include "data/dataset.h"
  29 #include "data/dictionary.h"
  30 #include "data/format.h"
  31 #include "data/subcase.h"
  32 #include "data/variable.h"
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/message.h"
  35 #include "libpspp/misc.h"
  36 #include "math/sort.h"
  37 #include "math/wilcoxon-sig.h"
  38 #include "output/pivot-table.h"
  39
  40 #include "gl/minmax.h"
  41 #include "gl/xalloc.h"
  42
  43 #include "gettext.h"
  44 #define N_(msgid) msgid
  45 #define _(msgid) gettext (msgid)
  46
  47 static double
  48 append_difference (const struct ccase *c, casenumber n UNUSED, void *aux)
  49 {
  50   const variable_pair *vp = aux;
  51
  52   return case_num (c, (*vp)[0]) - case_num (c, (*vp)[1]);
  53 }
  54
  55 static void show_ranks_box (const struct wilcoxon_state *,
  56                             const struct two_sample_test *,
  57                             const struct dictionary *);
  58
  59 static void show_tests_box (const struct wilcoxon_state *,
  60                             const struct two_sample_test *,
  61                             bool exact, double timer);
  62
  63
  64
  65 static void
  66 distinct_callback (double v UNUSED, casenumber n, double w UNUSED, void *aux)
  67 {
  68   struct wilcoxon_state *ws = aux;
  69
  70   ws->tiebreaker += pow3 (n) - n;
  71 }
  72
  73 #define WEIGHT_IDX 2
  74
  75 void
  76 wilcoxon_execute (const struct dataset *ds,
  77                   struct casereader *input,
  78                   enum mv_class exclude,
  79                   const struct npar_test *test,
  80                   bool exact,
  81                   double timer)
  82 {
  83   int i;
  84   bool warn = true;
  85   const struct dictionary *dict = dataset_dict (ds);
  86   const struct two_sample_test *t2s = UP_CAST (test, const struct two_sample_test, parent);
  87
  88   struct wilcoxon_state *ws = XCALLOC (t2s->n_pairs,  struct wilcoxon_state);
  89   const struct variable *weight = dict_get_weight (dict);
  90   struct variable *weightx = dict_create_internal_var (WEIGHT_IDX, 0);
  91   struct caseproto *proto;
  92
  93   input =
  94     casereader_create_filter_weight (input, dict, &warn, NULL);
  95
  96   proto = caseproto_create ();
  97   proto = caseproto_add_width (proto, 0);
  98   proto = caseproto_add_width (proto, 0);
  99   if (weight != NULL)
 100     proto = caseproto_add_width (proto, 0);
 101
 102   for (i = 0 ; i < t2s->n_pairs; ++i)
 103     {
 104       struct casereader *r = casereader_clone (input);
 105       struct casewriter *writer;
 106       struct ccase *c;
 107       struct subcase ordering;
 108       variable_pair *vp = &t2s->pairs[i];
 109
 110       ws[i].sign = dict_create_internal_var (0, 0);
 111       ws[i].absdiff = dict_create_internal_var (1, 0);
 112
 113       r = casereader_create_filter_missing (r, *vp, 2,
 114                                             exclude,
 115                                             NULL, NULL);
 116
 117       subcase_init_var (&ordering, ws[i].absdiff, SC_ASCEND);
 118       writer = sort_create_writer (&ordering, proto);
 119       subcase_uninit (&ordering);
 120
 121       for (; (c = casereader_read (r)) != NULL; case_unref (c))
 122         {
 123           struct ccase *output = case_create (proto);
 124           double d = append_difference (c, 0, vp);
 125
 126           if (d > 0)
 127             *case_num_rw (output, ws[i].sign) = 1.0;
 128           else if (d < 0)
 129             *case_num_rw (output, ws[i].sign) = -1.0;
 130           else
 131             {
 132               double w = 1.0;
 133               if (weight)
 134                 w = case_num (c, weight);
 135
 136               /* Central point values should be dropped */
 137               ws[i].n_zeros += w;
 138               case_unref (output);
 139               continue;
 140             }
 141
 142           *case_num_rw (output, ws[i].absdiff) = fabs (d);
 143
 144           if (weight)
 145            *case_num_rw (output, weightx) = case_num (c, weight);
 146
 147           casewriter_write (writer, output);
 148         }
 149       casereader_destroy (r);
 150       ws[i].reader = casewriter_make_reader (writer);
 151     }
 152   caseproto_unref (proto);
 153
 154   for (i = 0 ; i < t2s->n_pairs; ++i)
 155     {
 156       struct casereader *rr ;
 157       struct ccase *c;
 158       enum rank_error err = 0;
 159
 160       rr = casereader_create_append_rank (ws[i].reader, ws[i].absdiff,
 161                                           weight ? weightx : NULL, &err,
 162                                           distinct_callback, &ws[i]
 163                                         );
 164
 165       for (; (c = casereader_read (rr)) != NULL; case_unref (c))
 166         {
 167           double sign = case_num (c, ws[i].sign);
 168           double rank = case_num_idx (c, weight ? 3 : 2);
 169           double w = weight ? case_num (c, weightx) : 1.0;
 170
 171           if (sign > 0)
 172             {
 173               ws[i].positives.sum += rank * w;
 174               ws[i].positives.n += w;
 175             }
 176           else if (sign < 0)
 177             {
 178               ws[i].negatives.sum += rank * w;
 179               ws[i].negatives.n += w;
 180             }
 181           else
 182             NOT_REACHED ();
 183         }
 184
 185       casereader_destroy (rr);
 186     }
 187
 188   casereader_destroy (input);
 189
 190   dict_destroy_internal_var (weightx);
 191
 192   show_ranks_box (ws, t2s, dict);
 193   show_tests_box (ws, t2s, exact, timer);
 194
 195   for (i = 0 ; i < t2s->n_pairs; ++i)
 196     {
 197       dict_destroy_internal_var (ws[i].sign);
 198       dict_destroy_internal_var (ws[i].absdiff);
 199     }
 200
 201   free (ws);
 202 }
 203 \f
 204 static void
 205 put_row (struct pivot_table *table, int var_idx, int sign_idx,
 206          double n, double sum)
 207 {
 208   pivot_table_put3 (table, 0, sign_idx, var_idx, pivot_value_new_number (n));
 209   if (sum != SYSMIS)
 210     {
 211       pivot_table_put3 (table, 1, sign_idx, var_idx,
 212                         pivot_value_new_number (sum / n));
 213       pivot_table_put3 (table, 2, sign_idx, var_idx,
 214                         pivot_value_new_number (sum));
 215     }
 216 }
 217
 218 static int
 219 add_pair_leaf (struct pivot_dimension *dimension, variable_pair *pair)
 220 {
 221   char *label = xasprintf ("%s - %s", var_to_string ((*pair)[0]),
 222                            var_to_string ((*pair)[1]));
 223   return pivot_category_create_leaf (
 224     dimension->root,
 225     pivot_value_new_user_text_nocopy (label));
 226 }
 227
 228 static void
 229 show_ranks_box (const struct wilcoxon_state *ws,
 230                 const struct two_sample_test *t2s,
 231                 const struct dictionary *dict)
 232 {
 233   struct pivot_table *table = pivot_table_create (N_("Ranks"));
 234   pivot_table_set_weight_var (table, dict_get_weight (dict));
 235
 236   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
 237                           N_("N"), PIVOT_RC_COUNT,
 238                           N_("Mean Rank"), PIVOT_RC_OTHER,
 239                           N_("Sum of Ranks"), PIVOT_RC_OTHER);
 240
 241   pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Sign"),
 242                           N_("Negative Ranks"), N_("Positive Ranks"),
 243                           N_("Ties"), N_("Total"));
 244
 245   struct pivot_dimension *pairs = pivot_dimension_create (
 246     table, PIVOT_AXIS_ROW, N_("Pairs"));
 247
 248   for (size_t i = 0 ; i < t2s->n_pairs; ++i)
 249     {
 250       variable_pair *vp = &t2s->pairs[i];
 251       int pair_idx = add_pair_leaf (pairs, vp);
 252
 253       const struct wilcoxon_state *w = &ws[i];
 254       put_row (table, pair_idx, 0, w->negatives.n, w->negatives.sum);
 255       put_row (table, pair_idx, 1, w->positives.n, w->positives.sum);
 256       put_row (table, pair_idx, 2, w->n_zeros, SYSMIS);
 257       put_row (table, pair_idx, 3,
 258                w->n_zeros + w->positives.n + w->negatives.n, SYSMIS);
 259     }
 260
 261   pivot_table_submit (table);
 262 }
 263
 264
 265 static void
 266 show_tests_box (const struct wilcoxon_state *ws,
 267                 const struct two_sample_test *t2s,
 268                 bool exact,
 269                 double timer UNUSED
 270                 )
 271 {
 272   struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
 273
 274   struct pivot_dimension *statistics = pivot_dimension_create (
 275     table, PIVOT_AXIS_ROW, N_("Statistics"),
 276     N_("Z"), PIVOT_RC_OTHER,
 277     N_("Asymp. Sig. (2-tailed)"), PIVOT_RC_SIGNIFICANCE);
 278   if (exact)
 279     pivot_category_create_leaves (
 280       statistics->root,
 281       N_("Exact Sig. (2-tailed)"), PIVOT_RC_SIGNIFICANCE,
 282       N_("Exact Sig. (1-tailed)"), PIVOT_RC_SIGNIFICANCE);
 283
 284   struct pivot_dimension *pairs = pivot_dimension_create (
 285     table, PIVOT_AXIS_COLUMN, N_("Pairs"));
 286
 287   struct pivot_footnote *too_many_pairs = pivot_table_create_footnote (
 288     table, pivot_value_new_text (
 289       N_("Too many pairs to calculate exact significance")));
 290
 291   for (size_t i = 0 ; i < t2s->n_pairs; ++i)
 292     {
 293       variable_pair *vp = &t2s->pairs[i];
 294       int pair_idx = add_pair_leaf (pairs, vp);
 295
 296       double n = ws[i].positives.n + ws[i].negatives.n;
 297       double z = MIN (ws[i].positives.sum, ws[i].negatives.sum);
 298       z -= n * (n + 1)/ 4.0;
 299       z /= sqrt (n * (n + 1) * (2*n + 1)/24.0 - ws[i].tiebreaker / 48.0);
 300
 301       double entries[4];
 302       int n_entries = 0;
 303       entries[n_entries++] = z;
 304       entries[n_entries++] = 2.0 * gsl_cdf_ugaussian_P (z);
 305
 306       int footnote_idx = -1;
 307       if (exact)
 308         {
 309           double p = LevelOfSignificanceWXMPSR (ws[i].positives.sum, n);
 310           if (p < 0)
 311             {
 312               footnote_idx = n_entries;
 313               entries[n_entries++] = SYSMIS;
 314             }
 315           else
 316             {
 317               entries[n_entries++] = p;
 318               entries[n_entries++] = p / 2.0;
 319             }
 320         }
 321
 322       for (int j = 0; j < n_entries; j++)
 323         {
 324           struct pivot_value *value = pivot_value_new_number (entries[j]);
 325           if (j == footnote_idx)
 326             pivot_value_add_footnote (value, too_many_pairs);
 327           pivot_table_put2 (table, j, pair_idx, value);
 328         }
 329     }
 330
 331   pivot_table_submit (table);
 332 }