pintos-os.org Git - pspp/blob - src/language/stats/jonckheere-terpstra.c

   1 /* Pspp - a program for statistical analysis.
   2    Copyright (C) 2012 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17
  18 #include <config.h>
  19
  20 #include "jonckheere-terpstra.h"
  21
  22 #include <gsl/gsl_cdf.h>
  23 #include <math.h>
  24
  25 #include "data/casegrouper.h"
  26 #include "data/casereader.h"
  27 #include "data/casewriter.h"
  28 #include "data/dataset.h"
  29 #include "data/dictionary.h"
  30 #include "data/format.h"
  31 #include "data/subcase.h"
  32 #include "data/variable.h"
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/hmap.h"
  35 #include "libpspp/message.h"
  36 #include "libpspp/misc.h"
  37 #include "math/sort.h"
  38 #include "output/pivot-table.h"
  39
  40 #include "gl/minmax.h"
  41 #include "gl/xalloc.h"
  42
  43 #include "gettext.h"
  44 #define N_(msgid) msgid
  45 #define _(msgid) gettext (msgid)
  46
  47
  48 /* Returns true iff the independent variable lies in the
  49    between val1 and val2. Regardless of which is the greater value.
  50 */
  51 static bool
  52 include_func_bi (const struct ccase *c, void *aux)
  53 {
  54   const struct n_sample_test *nst = aux;
  55   const union value *bigger = NULL;
  56   const union value *smaller = NULL;
  57
  58   if (0 > value_compare_3way (&nst->val1, &nst->val2, var_get_width (nst->indep_var)))
  59     {
  60       bigger = &nst->val2;
  61       smaller = &nst->val1;
  62     }
  63   else
  64     {
  65       smaller = &nst->val2;
  66       bigger = &nst->val1;
  67     }
  68
  69   if (0 < value_compare_3way (smaller, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
  70     return false;
  71
  72   if (0 > value_compare_3way (bigger, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
  73     return false;
  74
  75   return true;
  76 }
  77
  78 struct group_data
  79 {
  80   /* The total of the caseweights in the group */
  81   double cc;
  82
  83   /* A casereader containing the group data.
  84      This casereader contains just two values:
  85      0: The raw value of the data
  86      1: The cumulative caseweight
  87    */
  88   struct casereader *reader;
  89 };
  90
  91
  92 static double
  93 u (const struct group_data *grp0, const struct group_data *grp1)
  94 {
  95   struct ccase *c0;
  96
  97   struct casereader *r0 = casereader_clone (grp0->reader);
  98   double usum = 0;
  99   double prev_cc0 = 0.0;
 100   for (; (c0 = casereader_read (r0)); case_unref (c0))
 101     {
 102       struct ccase *c1;
 103       struct casereader *r1 = casereader_clone (grp1->reader);
 104       double x0 = case_num_idx (c0, 0);
 105       double cc0 = case_num_idx (c0, 1);
 106       double w0 = cc0 - prev_cc0;
 107
 108       double prev_cc1 = 0;
 109
 110       for (; (c1 = casereader_read (r1)); case_unref (c1))
 111         {
 112           double x1 = case_num_idx (c1, 0);
 113           double cc1 = case_num_idx (c1, 1);
 114
 115           if (x0 > x1)
 116             {
 117               /* Do nothing */
 118             }
 119           else if (x0 < x1)
 120             {
 121               usum += w0 * (grp1->cc - prev_cc1);
 122               case_unref (c1);
 123               break;
 124             }
 125           else
 126             {
 127 #if 1
 128               usum += w0 * ((grp1->cc - prev_cc1) / 2.0);
 129 #else
 130               usum += w0 * (grp1->cc - (prev_cc1 + cc1) / 2.0);
 131 #endif
 132               case_unref (c1);
 133               break;
 134             }
 135
 136           prev_cc1 = cc1;
 137         }
 138       casereader_destroy (r1);
 139       prev_cc0 = cc0;
 140     }
 141   casereader_destroy (r0);
 142
 143   return usum;
 144 }
 145
 146
 147 typedef double func_f (double e_l);
 148
 149 /*
 150    These 3 functions are used repeatedly in the calculation of the
 151    variance of the JT statistic.
 152    Having them explicitly defined makes the variance calculation
 153    a lot simpler.
 154 */
 155 static  double
 156 ff1 (double e)
 157 {
 158   return e * (e - 1) * (2*e + 5);
 159 }
 160
 161 static  double
 162 ff2 (double e)
 163 {
 164   return e * (e - 1) * (e - 2);
 165 }
 166
 167 static  double
 168 ff3 (double e)
 169 {
 170   return e * (e - 1) ;
 171 }
 172
 173 static  func_f *mff[3] =
 174   {
 175     ff1, ff2, ff3
 176   };
 177
 178
 179 /*
 180   This function does the following:
 181   It creates an ordered set of *distinct* values from IR.
 182   For each case in that set, it calls f[0..N] passing it the caseweight.
 183   It returns the sum of f[j] in result[j].
 184
 185   result and f must be allocated prior to calling this function.
 186  */
 187 static
 188 void variance_calculation (struct casereader *ir, const struct variable *var,
 189                            const struct dictionary *dict,
 190                            func_f **f, double *result, size_t n)
 191 {
 192   int i;
 193   struct casereader *r = casereader_clone (ir);
 194   struct ccase *c;
 195   const struct variable *wv = dict_get_weight (dict);
 196   const int w_idx = wv ?
 197     var_get_case_index (wv) :
 198     caseproto_get_n_widths (casereader_get_proto (r)) ;
 199
 200   r = sort_execute_1var (r, var);
 201
 202   r = casereader_create_distinct (r, var, dict_get_weight (dict));
 203
 204   for (; (c = casereader_read (r)); case_unref (c))
 205     {
 206       double w = case_num_idx (c, w_idx);
 207
 208       for (i = 0; i < n; ++i)
 209         result[i] += f[i] (w);
 210     }
 211
 212   casereader_destroy (r);
 213 }
 214
 215 struct jt
 216 {
 217   int levels;
 218   double n;
 219   double obs;
 220   double mean;
 221   double stddev;
 222 };
 223
 224 static void show_jt (const struct n_sample_test *, const struct jt *,
 225                      const struct fmt_spec *wfmt);
 226
 227
 228 void
 229 jonckheere_terpstra_execute (const struct dataset *ds,
 230                         struct casereader *input,
 231                         enum mv_class exclude,
 232                         const struct npar_test *test,
 233                         bool exact UNUSED,
 234                         double timer UNUSED)
 235 {
 236   int v;
 237   bool warn = true;
 238   const struct dictionary *dict = dataset_dict (ds);
 239   const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent);
 240
 241   struct caseproto *proto = caseproto_create ();
 242   proto = caseproto_add_width (proto, 0);
 243   proto = caseproto_add_width (proto, 0);
 244
 245   /* If the independent variable is missing, then we ignore the case */
 246   input = casereader_create_filter_missing (input,
 247                                             &nst->indep_var, 1,
 248                                             exclude,
 249                                             NULL, NULL);
 250
 251   /* Remove cases with invalid weigths */
 252   input = casereader_create_filter_weight (input, dict, &warn, NULL);
 253
 254   /* Remove all those cases which are outside the range (val1, val2) */
 255   input = casereader_create_filter_func (input, include_func_bi, NULL,
 256         CONST_CAST (struct n_sample_test *, nst), NULL);
 257
 258   /* Sort the data by the independent variable */
 259   input = sort_execute_1var (input, nst->indep_var);
 260
 261   for (v = 0; v < nst->n_vars ; ++v)
 262   {
 263     struct jt jt;
 264     double variance;
 265     int g0;
 266     double nn = 0;
 267     int i;
 268     double sums[3] = {0,0,0};
 269     double e_sum[3] = {0,0,0};
 270
 271     struct group_data *grp = NULL;
 272     double ccsq_sum = 0;
 273
 274     struct casegrouper *grouper;
 275     struct casereader *group;
 276     struct casereader *vreader= casereader_clone (input);
 277
 278     /* Get a few values into e_sum - we'll be needing these later */
 279     variance_calculation (vreader, nst->vars[v], dict, mff, e_sum, 3);
 280
 281     grouper =
 282       casegrouper_create_vars (vreader, &nst->indep_var, 1);
 283
 284     jt.obs = 0;
 285     jt.levels = 0;
 286     jt.n = 0;
 287     for (; casegrouper_get_next_group (grouper, &group);
 288          casereader_destroy (group))
 289       {
 290         struct casewriter *writer = autopaging_writer_create (proto);
 291         struct ccase *c;
 292         double cc = 0;
 293
 294         group = sort_execute_1var (group, nst->vars[v]);
 295         for (; (c = casereader_read (group)); case_unref (c))
 296           {
 297             struct ccase *c_out = case_create (proto);
 298
 299             *case_num_rw_idx (c_out, 0) = case_num (c, nst->vars[v]);
 300
 301             cc += dict_get_case_weight (dict, c, &warn);
 302             *case_num_rw_idx (c_out, 1) = cc;
 303             casewriter_write (writer, c_out);
 304           }
 305
 306         grp = xrealloc (grp, sizeof *grp * (jt.levels + 1));
 307
 308         grp[jt.levels].reader = casewriter_make_reader (writer);
 309         grp[jt.levels].cc = cc;
 310
 311         jt.levels++;
 312         jt.n += cc;
 313         ccsq_sum += pow2 (cc);
 314       }
 315
 316     casegrouper_destroy (grouper);
 317
 318     for (g0 = 0; g0 < jt.levels; ++g0)
 319       {
 320         int g1;
 321         for (g1 = g0 +1 ; g1 < jt.levels; ++g1)
 322           {
 323             double uu = u (&grp[g0], &grp[g1]);
 324             jt.obs += uu;
 325           }
 326         nn += pow2 (grp[g0].cc) * (2 * grp[g0].cc + 3);
 327
 328         for (i = 0; i < 3; ++i)
 329           sums[i] += mff[i] (grp[g0].cc);
 330
 331         casereader_destroy (grp[g0].reader);
 332       }
 333
 334     free (grp);
 335
 336     variance = (mff[0](jt.n) - sums[0] - e_sum[0]) / 72.0;
 337     variance += sums[1] * e_sum[1] / (36.0 * mff[1] (jt.n));
 338     variance += sums[2] * e_sum[2] / (8.0 * mff[2] (jt.n));
 339
 340     jt.stddev = sqrt (variance);
 341
 342     jt.mean = (pow2 (jt.n) - ccsq_sum) / 4.0;
 343
 344     show_jt (nst, &jt, dict_get_weight_format (dict));
 345   }
 346
 347   casereader_destroy (input);
 348   caseproto_unref (proto);
 349 }
 350 \f
 351 static void
 352 show_jt (const struct n_sample_test *nst, const struct jt *jt,
 353          const struct fmt_spec *wfmt)
 354 {
 355   struct pivot_table *table = pivot_table_create (
 356     N_("Jonckheere-Terpstra Test"));
 357   pivot_table_set_weight_format (table, wfmt);
 358
 359   struct pivot_dimension *statistics = pivot_dimension_create (
 360     table, PIVOT_AXIS_COLUMN, N_("Statistics"));
 361   pivot_category_create_leaf_rc (
 362     statistics->root,
 363     pivot_value_new_text_format (N_("Number of levels in %s"),
 364                                  var_to_string (nst->indep_var)),
 365     PIVOT_RC_INTEGER);
 366   pivot_category_create_leaves (
 367     statistics->root,
 368     N_("N"), PIVOT_RC_COUNT,
 369     N_("Observed J-T Statistic"), PIVOT_RC_OTHER,
 370     N_("Mean J-T Statistic"), PIVOT_RC_OTHER,
 371     N_("Std. Deviation of J-T Statistic"), PIVOT_RC_OTHER,
 372     N_("Std. J-T Statistic"), PIVOT_RC_OTHER,
 373     N_("Asymp. Sig. (2-tailed)"), PIVOT_RC_SIGNIFICANCE);
 374
 375   struct pivot_dimension *variables = pivot_dimension_create (
 376     table, PIVOT_AXIS_ROW, N_("Variable"));
 377
 378   for (size_t i = 0; i < nst->n_vars; ++i)
 379     {
 380       int row = pivot_category_create_leaf (
 381         variables->root, pivot_value_new_variable (nst->vars[i]));
 382
 383       double std_jt = (jt[0].obs - jt[0].mean) / jt[0].stddev;
 384       double sig = (2.0 * (std_jt > 0
 385                            ? gsl_cdf_ugaussian_Q (std_jt)
 386                            : gsl_cdf_ugaussian_P (std_jt)));
 387       double entries[] = {
 388         jt[0].levels,
 389         jt[0].n,
 390         jt[0].obs,
 391         jt[0].mean,
 392         jt[0].stddev,
 393         std_jt,
 394         sig,
 395       };
 396       for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
 397         pivot_table_put2 (table, j, row, pivot_value_new_number (entries[j]));
 398     }
 399
 400   pivot_table_submit (table);
 401 }