pintos-os.org Git - pspp/blob - src/language/stats/jonckheere-terpstra.c

   1 /* Pspp - a program for statistical analysis.
   2    Copyright (C) 2012 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17
  18 #include <config.h>
  19
  20 #include "jonckheere-terpstra.h"
  21
  22 #include <gsl/gsl_cdf.h>
  23 #include <math.h>
  24
  25 #include "data/casegrouper.h"
  26 #include "data/casereader.h"
  27 #include "data/casewriter.h"
  28 #include "data/dataset.h"
  29 #include "data/dictionary.h"
  30 #include "data/format.h"
  31 #include "data/subcase.h"
  32 #include "data/variable.h"
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/hmap.h"
  35 #include "libpspp/message.h"
  36 #include "libpspp/misc.h"
  37 #include "math/sort.h"
  38 #include "output/tab.h"
  39
  40 #include "gl/minmax.h"
  41 #include "gl/xalloc.h"
  42
  43 /* Returns true iff the independent variable lies in the
  44    between val1 and val2. Regardless of which is the greater value.
  45 */
  46 static bool
  47 include_func_bi (const struct ccase *c, void *aux)
  48 {
  49   const struct n_sample_test *nst = aux;
  50   const union value *bigger = NULL;
  51   const union value *smaller = NULL;
  52
  53   if (0 > value_compare_3way (&nst->val1, &nst->val2, var_get_width (nst->indep_var)))
  54     {
  55       bigger = &nst->val2;
  56       smaller = &nst->val1;
  57     }
  58   else
  59     {
  60       smaller = &nst->val2;
  61       bigger = &nst->val1;
  62     }
  63
  64   if (0 < value_compare_3way (smaller, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
  65     return false;
  66
  67   if (0 > value_compare_3way (bigger, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
  68     return false;
  69
  70   return true;
  71 }
  72
  73 struct group_data
  74 {
  75   /* The total of the caseweights in the group */
  76   double cc;
  77
  78   /* A casereader containing the group data.
  79      This casereader contains just two values:
  80      0: The raw value of the data
  81      1: The cumulative caseweight
  82    */
  83   struct casereader *reader;
  84 };
  85
  86
  87 static double
  88 u (const struct group_data *grp0, const struct group_data *grp1)
  89 {
  90   struct ccase *c0;
  91
  92   struct casereader *r0 = casereader_clone (grp0->reader);
  93   double usum = 0;
  94   double prev_cc0 = 0.0;
  95   for (; (c0 = casereader_read (r0)); case_unref (c0))
  96     {
  97       struct ccase *c1;
  98       struct casereader *r1 = casereader_clone (grp1->reader);
  99       double x0 = case_data_idx (c0, 0)->f;
 100       double cc0 = case_data_idx (c0, 1)->f;
 101       double w0 = cc0 - prev_cc0;
 102
 103       double prev_cc1 = 0;
 104
 105       for (; (c1 = casereader_read (r1)); case_unref (c1))
 106         {
 107           double x1 = case_data_idx (c1, 0)->f;
 108           double cc1 = case_data_idx (c1, 1)->f;
 109
 110           if (x0 > x1)
 111             {
 112               /* Do nothing */
 113             }
 114           else if (x0 < x1)
 115             {
 116               usum += w0 * (grp1->cc - prev_cc1);
 117               case_unref (c1);
 118               break;
 119             }
 120           else
 121             {
 122 #if 1
 123               usum += w0 * ( (grp1->cc - prev_cc1) / 2.0);
 124 #else
 125               usum += w0 * (grp1->cc - (prev_cc1 + cc1) / 2.0);
 126 #endif
 127               case_unref (c1);
 128               break;
 129             }
 130
 131           prev_cc1 = cc1;
 132         }
 133       casereader_destroy (r1);
 134       prev_cc0 = cc0;
 135     }
 136   casereader_destroy (r0);
 137
 138   return usum;
 139 }
 140
 141
 142 typedef double func_f (double e_l);
 143
 144 /*
 145    These 3 functions are used repeatedly in the calculation of the
 146    variance of the JT statistic.
 147    Having them explicitly defined makes the variance calculation
 148    a lot simpler.
 149 */
 150 static  double
 151 ff1 (double e)
 152 {
 153   return e * (e - 1) * (2*e + 5);
 154 }
 155
 156 static  double
 157 ff2 (double e)
 158 {
 159   return e * (e - 1) * (e - 2);
 160 }
 161
 162 static  double
 163 ff3 (double e)
 164 {
 165   return e * (e - 1) ;
 166 }
 167
 168 static  func_f *mff[3] =
 169   {
 170     ff1, ff2, ff3
 171   };
 172
 173
 174 /*
 175   This function does the following:
 176   It creates an ordered set of *distinct* values from IR.
 177   For each case in that set, it calls f[0..N] passing it the caseweight.
 178   It returns the sum of f[j] in result[j].
 179
 180   result and f must be allocated prior to calling this function.
 181  */
 182 static
 183 void variance_calculation (struct casereader *ir, const struct variable *var,
 184                            const struct dictionary *dict,
 185                            func_f **f, double *result, size_t n)
 186 {
 187   int i;
 188   struct casereader *r = casereader_clone (ir);
 189   struct ccase *c;
 190   const struct variable *wv = dict_get_weight (dict);
 191   const int w_idx = wv ?
 192     var_get_case_index (wv) :
 193     caseproto_get_n_widths (casereader_get_proto (r)) ;
 194
 195   r = sort_execute_1var (r, var);
 196
 197   r = casereader_create_distinct (r, var, dict_get_weight (dict));
 198
 199   for (; (c = casereader_read (r)); case_unref (c))
 200     {
 201       double w = case_data_idx (c, w_idx)->f;
 202
 203       for (i = 0; i < n; ++i)
 204         result[i] += f[i] (w);
 205     }
 206
 207   casereader_destroy (r);
 208 }
 209
 210 struct jt
 211 {
 212   int levels;
 213   double n;
 214   double obs;
 215   double mean;
 216   double stddev;
 217 };
 218
 219 static void show_jt (const struct n_sample_test *nst, const struct jt *jt, const struct variable *wv);
 220
 221
 222 void
 223 jonckheere_terpstra_execute (const struct dataset *ds,
 224                         struct casereader *input,
 225                         enum mv_class exclude,
 226                         const struct npar_test *test,
 227                         bool exact UNUSED,
 228                         double timer UNUSED)
 229 {
 230   int v;
 231   bool warn = true;
 232   const struct dictionary *dict = dataset_dict (ds);
 233   const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent);
 234
 235   struct caseproto *proto = caseproto_create ();
 236   proto = caseproto_add_width (proto, 0);
 237   proto = caseproto_add_width (proto, 0);
 238
 239   /* If the independent variable is missing, then we ignore the case */
 240   input = casereader_create_filter_missing (input,
 241                                             &nst->indep_var, 1,
 242                                             exclude,
 243                                             NULL, NULL);
 244
 245   /* Remove cases with invalid weigths */
 246   input = casereader_create_filter_weight (input, dict, &warn, NULL);
 247
 248   /* Remove all those cases which are outside the range (val1, val2) */
 249   input = casereader_create_filter_func (input, include_func_bi, NULL,
 250         CONST_CAST (struct n_sample_test *, nst), NULL);
 251
 252   /* Sort the data by the independent variable */
 253   input = sort_execute_1var (input, nst->indep_var);
 254
 255   for (v = 0; v < nst->n_vars ; ++v)
 256   {
 257     struct jt jt;
 258     double variance;
 259     int g0;
 260     double nn = 0;
 261     int i;
 262     double sums[3] = {0,0,0};
 263     double e_sum[3] = {0,0,0};
 264
 265     struct group_data *grp = NULL;
 266     double ccsq_sum = 0;
 267
 268     struct casegrouper *grouper;
 269     struct casereader *group;
 270     struct casereader *vreader= casereader_clone (input);
 271
 272     /* Get a few values into e_sum - we'll be needing these later */
 273     variance_calculation (vreader, nst->vars[v], dict, mff, e_sum, 3);
 274
 275     grouper =
 276       casegrouper_create_vars (vreader, &nst->indep_var, 1);
 277
 278     jt.obs = 0;
 279     jt.levels = 0;
 280     jt.n = 0;
 281     for (; casegrouper_get_next_group (grouper, &group);
 282          casereader_destroy (group) )
 283       {
 284         struct casewriter *writer = autopaging_writer_create (proto);
 285         struct ccase *c;
 286         double cc = 0;
 287
 288         group = sort_execute_1var (group, nst->vars[v]);
 289         for (; (c = casereader_read (group)); case_unref (c))
 290           {
 291             struct ccase *c_out = case_create (proto);
 292             const union value *x = case_data (c, nst->vars[v]);
 293
 294             case_data_rw_idx (c_out, 0)->f = x->f;
 295
 296             cc += dict_get_case_weight (dict, c, &warn);
 297             case_data_rw_idx (c_out, 1)->f = cc;
 298             casewriter_write (writer, c_out);
 299           }
 300
 301         grp = xrealloc (grp, sizeof *grp * (jt.levels + 1));
 302
 303         grp[jt.levels].reader = casewriter_make_reader (writer);
 304         grp[jt.levels].cc = cc;
 305
 306         jt.levels++;
 307         jt.n += cc;
 308         ccsq_sum += pow2 (cc);
 309       }
 310
 311     casegrouper_destroy (grouper);
 312
 313     for (g0 = 0; g0 < jt.levels; ++g0)
 314       {
 315         int g1;
 316         for (g1 = g0 +1 ; g1 < jt.levels; ++g1)
 317           {
 318             double uu = u (&grp[g0], &grp[g1]);
 319             jt.obs += uu;
 320           }
 321         nn += pow2 (grp[g0].cc) * (2 * grp[g0].cc + 3);
 322
 323         for (i = 0; i < 3; ++i)
 324           sums[i] += mff[i] (grp[g0].cc);
 325
 326         casereader_destroy (grp[g0].reader);
 327       }
 328
 329     free (grp);
 330
 331     variance = (mff[0](jt.n) - sums[0] - e_sum[0]) / 72.0;
 332     variance += sums[1] * e_sum[1] / (36.0 * mff[1] (jt.n));
 333     variance += sums[2] * e_sum[2] / (8.0 * mff[2] (jt.n));
 334
 335     jt.stddev = sqrt (variance);
 336
 337     jt.mean = (pow2 (jt.n) - ccsq_sum) / 4.0;
 338
 339     show_jt (nst, &jt, dict_get_weight (dict));
 340   }
 341
 342   casereader_destroy (input);
 343   caseproto_unref (proto);
 344 }
 345
 346 \f
 347
 348 #include "gettext.h"
 349 #define _(msgid) gettext (msgid)
 350
 351 static void
 352 show_jt (const struct n_sample_test *nst, const struct jt *jt, const struct variable *wv)
 353 {
 354   int i;
 355   const int row_headers = 1;
 356   const int column_headers = 1;
 357   const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : & F_8_0;
 358
 359   struct tab_table *table =
 360     tab_create (row_headers + 7, column_headers + nst->n_vars);
 361
 362   tab_headers (table, row_headers, 0, column_headers, 0);
 363
 364   tab_title (table, _("Jonckheere-Terpstra Test"));
 365
 366   /* Vertical lines inside the box */
 367   tab_box (table, 1, 0, -1, TAL_1,
 368            row_headers, 0, tab_nc (table) - 1, tab_nr (table) - 1 );
 369
 370   /* Box around the table */
 371   tab_box (table, TAL_2, TAL_2, -1, -1,
 372            0,  0, tab_nc (table) - 1, tab_nr (table) - 1 );
 373
 374   tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers);
 375   tab_vline (table, TAL_2, row_headers, 0, tab_nr (table) - 1);
 376
 377   tab_text_format (table, 1, 0, TAT_TITLE | TAB_CENTER,
 378                    _("Number of levels in %s"),
 379                    var_to_string (nst->indep_var));
 380   tab_text (table, 2, 0, TAT_TITLE | TAB_CENTER, _("N"));
 381   tab_text (table, 3, 0, TAT_TITLE | TAB_CENTER, _("Observed J-T Statistic"));
 382   tab_text (table, 4, 0, TAT_TITLE | TAB_CENTER, _("Mean J-T Statistic"));
 383   tab_text (table, 5, 0, TAT_TITLE | TAB_CENTER, _("Std. Deviation of J-T Statistic"));
 384   tab_text (table, 6, 0, TAT_TITLE | TAB_CENTER, _("Std. J-T Statistic"));
 385   tab_text (table, 7, 0, TAT_TITLE | TAB_CENTER, _("Asymp. Sig. (2-tailed)"));
 386
 387
 388   for (i = 0; i < nst->n_vars; ++i)
 389     {
 390       double std_jt;
 391
 392       tab_text (table, 0, i + row_headers, TAT_TITLE,
 393                 var_to_string (nst->vars[i]) );
 394
 395       tab_double (table, 1, i + row_headers, TAT_TITLE,
 396                   jt[0].levels, &F_8_0);
 397
 398       tab_double (table, 2, i + row_headers, TAT_TITLE,
 399                   jt[0].n, wfmt);
 400
 401       tab_double (table, 3, i + row_headers, TAT_TITLE,
 402                   jt[0].obs, 0);
 403
 404       tab_double (table, 4, i + row_headers, TAT_TITLE,
 405                   jt[0].mean, 0);
 406
 407       tab_double (table, 5, i + row_headers, TAT_TITLE,
 408                   jt[0].stddev, 0);
 409
 410       std_jt = (jt[0].obs - jt[0].mean) / jt[0].stddev;
 411       tab_double (table, 6, i + row_headers, TAT_TITLE,
 412                   std_jt, 0);
 413
 414       tab_double (table, 7, i + row_headers, TAT_TITLE,
 415                   2.0 * ((std_jt > 0) ? gsl_cdf_ugaussian_Q (std_jt) : gsl_cdf_ugaussian_P (std_jt)), 0);
 416     }
 417
 418   tab_submit (table);
 419 }