src/language/stats/ks-one-sample.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/stats/ks-one-sample.h"
  20
  21 #include <gsl/gsl_cdf.h>
  22 #include <math.h>
  23 #include <stdlib.h>
  24
  25
  26 #include "math/sort.h"
  27 #include "data/case.h"
  28 #include "data/casereader.h"
  29 #include "data/dataset.h"
  30 #include "data/dictionary.h"
  31 #include "data/format.h"
  32 #include "data/value-labels.h"
  33 #include "data/variable.h"
  34 #include "language/stats/freq.h"
  35 #include "language/stats/npar.h"
  36 #include "libpspp/array.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/compiler.h"
  40 #include "libpspp/hash-functions.h"
  41 #include "libpspp/message.h"
  42 #include "libpspp/misc.h"
  43 #include "output/tab.h"
  44
  45 #include "gl/xalloc.h"
  46
  47 #include "gettext.h"
  48 #define _(msgid) gettext (msgid)
  49
  50
  51 /* The per test variable statistics */
  52 struct ks
  53 {
  54   double obs_cc;
  55
  56   double test_min ;
  57   double test_max;
  58   double mu;
  59   double sigma;
  60
  61   double diff_pos;
  62   double diff_neg;
  63
  64   double ssq;
  65   double sum;
  66 };
  67
  68 typedef double theoretical (const struct ks *ks, double x);
  69 typedef theoretical *theoreticalfp;
  70
  71 static double
  72 theoretical_uniform (const struct ks *ks, double x)
  73 {
  74   return gsl_cdf_flat_P (x, ks->test_min, ks->test_max);
  75 }
  76
  77 static double
  78 theoretical_normal (const struct ks *ks, double x)
  79 {
  80   return gsl_cdf_gaussian_P (x - ks->mu, ks->sigma);
  81 }
  82
  83 static double
  84 theoretical_poisson (const struct ks *ks, double x)
  85 {
  86   return gsl_cdf_poisson_P (x, ks->mu);
  87 }
  88
  89 static double
  90 theoretical_exponential (const struct ks *ks, double x)
  91 {
  92   return gsl_cdf_exponential_P (x, 1/ks->mu);
  93 }
  94
  95
  96 static const  theoreticalfp theoreticalf[4] =
  97 {
  98   theoretical_normal,
  99   theoretical_uniform,
 100   theoretical_poisson,
 101   theoretical_exponential
 102 };
 103
 104 /*
 105    Return the assymptotic approximation to the significance of Z
 106  */
 107 static double
 108 ks_asymp_sig (double z)
 109 {
 110   if (z < 0.27)
 111     return 1;
 112
 113   if (z >= 3.1)
 114     return 0;
 115
 116   if (z < 1)
 117     {
 118       double q = exp (-1.233701 * pow (z, -2));
 119       return 1 - 2.506628 * (q + pow (q, 9) + pow (q, 25))/ z ;
 120     }
 121   else
 122     {
 123       double q = exp (-2 * z * z);
 124       return 2 * (q - pow (q, 4) + pow (q, 9) - pow (q, 16))/ z ;
 125     }
 126 }
 127
 128 static void show_results (const struct ks *, const struct ks_one_sample_test *,  const struct fmt_spec *);
 129
 130
 131 void
 132 ks_one_sample_execute (const struct dataset *ds,
 133                    struct casereader *input,
 134                    enum mv_class exclude,
 135                    const struct npar_test *test)
 136 {
 137   const struct dictionary *dict = dataset_dict (ds);
 138   const struct ks_one_sample_test *kst = UP_CAST (test, const struct ks_one_sample_test, parent.parent);
 139   const struct one_sample_test *ost = &kst->parent;
 140   struct ccase *c;
 141   const struct variable *wvar = dict_get_weight (dict);
 142   const struct fmt_spec *wfmt = wvar ? var_get_print_format (wvar) : & F_8_0;
 143   bool warn = true;
 144   int v;
 145
 146   struct ks *ks = xcalloc (sizeof *ks, ost->n_vars);
 147
 148   for (v = 0; v < ost->n_vars; ++v)
 149     {
 150       ks[v].obs_cc = 0;
 151       ks[v].test_min = DBL_MAX;
 152       ks[v].test_max = -DBL_MAX;
 153       ks[v].diff_pos = -DBL_MAX;
 154       ks[v].diff_neg = DBL_MAX;
 155       ks[v].sum = 0;
 156       ks[v].ssq = 0;
 157     }
 158
 159   struct casereader *r = casereader_clone (input);
 160
 161   for (; (c = casereader_read (r)) != NULL; case_unref (c))
 162     {
 163       const double weight = dict_get_case_weight (dict, c, &warn);
 164
 165       for (v = 0; v < ost->n_vars; ++v)
 166         {
 167           const struct variable *var = ost->vars[v];
 168           const union value *val = case_data (c, var);
 169
 170           if (var_is_value_missing (var, val, exclude))
 171             continue;
 172
 173           minimize (&ks[v].test_min, val->f);
 174           maximize (&ks[v].test_max, val->f);
 175
 176           ks[v].obs_cc += weight;
 177           ks[v].sum += val->f;
 178           ks[v].ssq += pow2 (val->f);
 179         }
 180     }
 181   casereader_destroy (r);
 182
 183   for (v = 0; v < ost->n_vars; ++v)
 184     {
 185       switch (kst->dist)
 186         {
 187         case KS_UNIFORM:
 188           if (kst->p[0] != SYSMIS)
 189             ks[v].test_min = kst->p[0];
 190
 191           if (kst->p[1] != SYSMIS)
 192             ks[v].test_max = kst->p[1];
 193           break;
 194         case KS_NORMAL:
 195           if (kst->p[0] != SYSMIS)
 196             ks[v].mu = kst->p[0];
 197           else
 198             ks[v].mu = ks[v].sum / ks[v].obs_cc;
 199
 200           if (kst->p[1] != SYSMIS)
 201             ks[v].sigma = kst->p[1];
 202           else
 203             {
 204               ks[v].sigma = ks[v].ssq - pow2 (ks[v].sum) / ks[v].obs_cc;
 205               ks[v].sigma /= ks[v].obs_cc - 1;
 206               ks[v].sigma = sqrt (ks[v].sigma);
 207             }
 208
 209           break;
 210         case KS_POISSON:
 211         case KS_EXPONENTIAL:
 212           if (kst->p[0] != SYSMIS)
 213             ks[v].mu = ks[v].sigma = kst->p[0];
 214           else
 215             ks[v].mu = ks[v].sigma = ks[v].sum / ks[v].obs_cc;
 216           break;
 217         default:
 218           NOT_REACHED ();
 219         }
 220
 221       const struct variable *var = ost->vars[v];
 222       double cc = 0;
 223       double prev_empirical = 0;
 224       r = sort_execute_1var (casereader_clone (input), var);
 225       for (; (c = casereader_read (r)) != NULL; case_unref (c))
 226         {
 227           double theoretical, empirical;
 228           double d, dp;
 229           const double weight = dict_get_case_weight (dict, c, &warn);
 230           const union value *val = case_data (c, var);
 231
 232           if (var_is_value_missing (var, val, exclude))
 233             continue;
 234
 235           cc += weight;
 236
 237           empirical = cc / ks[v].obs_cc;
 238
 239           theoretical = theoreticalf[kst->dist] (&ks[v], val->f);
 240
 241           d = empirical - theoretical;
 242           dp = prev_empirical - theoretical;
 243
 244           if (d > 0)
 245             maximize (&ks[v].diff_pos, d);
 246           else
 247             minimize (&ks[v].diff_neg, d);
 248
 249           if (dp > 0)
 250             maximize (&ks[v].diff_pos, dp);
 251           else
 252             minimize (&ks[v].diff_neg, dp);
 253
 254           prev_empirical = empirical;
 255         }
 256
 257       casereader_destroy (r);
 258     }
 259
 260   show_results (ks, kst, wfmt);
 261
 262   free (ks);
 263   casereader_destroy (input);
 264 }
 265
 266
 267 static void
 268 show_results (const struct ks *ks,
 269               const struct ks_one_sample_test *kst,
 270               const struct fmt_spec *wfmt)
 271 {
 272   int i;
 273   const int row_headers = 1;
 274   const int column_headers = 2;
 275   const int nc = kst->parent.n_vars + column_headers;
 276   const int nr = 8 + row_headers;
 277   struct tab_table *table = tab_create (nc, nr);
 278
 279   tab_headers (table, row_headers, 0, column_headers, 0);
 280
 281   tab_title (table, _("One-Sample Kolmogorov-Smirnov Test"));
 282
 283   /* Box around the table */
 284   tab_box (table, TAL_2, TAL_2, -1, -1,
 285            0,  0, nc - 1, nr - 1 );
 286
 287   tab_hline (table, TAL_2, 0, nc - 1, row_headers);
 288
 289   tab_vline (table, TAL_1, column_headers, 0, nr - 1);
 290
 291   tab_text (table,  0, 1,
 292             TAT_TITLE | TAB_LEFT , _("N"));
 293
 294   switch (kst->dist)
 295     {
 296     case KS_NORMAL:
 297       tab_text (table,  0, 2,
 298                 TAT_TITLE | TAB_LEFT , _("Normal Parameters"));
 299
 300       tab_text (table,  1, 2,
 301                 TAT_TITLE | TAB_LEFT , _("Mean"));
 302       tab_text (table,  1, 3,
 303                 TAT_TITLE | TAB_LEFT , _("Std. Deviation"));
 304       break;
 305     case KS_UNIFORM:
 306       tab_text (table,  0, 2,
 307                 TAT_TITLE | TAB_LEFT , _("Uniform Parameters"));
 308
 309       tab_text (table,  1, 2,
 310                 TAT_TITLE | TAB_LEFT , _("Minimum"));
 311       tab_text (table,  1, 3,
 312                 TAT_TITLE | TAB_LEFT , _("Maximum"));
 313       break;
 314     case KS_POISSON:
 315       tab_text (table,  0, 2,
 316                 TAT_TITLE | TAB_LEFT , _("Poisson Parameters"));
 317
 318       tab_text (table,  1, 2,
 319                 TAT_TITLE | TAB_LEFT , _("Lambda"));
 320       break;
 321     case KS_EXPONENTIAL:
 322       tab_text (table,  0, 2,
 323                 TAT_TITLE | TAB_LEFT , _("Exponential Parameters"));
 324
 325       tab_text (table,  1, 2,
 326                 TAT_TITLE | TAB_LEFT , _("Scale"));
 327       break;
 328
 329     default:
 330       NOT_REACHED ();
 331     }
 332
 333   /* The variable columns */
 334   for (i = 0; i < kst->parent.n_vars; ++i)
 335     {
 336       double abs = 0;
 337       double z = 0;
 338       const int col = 2 + i;
 339       tab_text (table, col, 0,
 340                 TAT_TITLE | TAB_CENTER ,
 341                 var_to_string (kst->parent.vars[i]));
 342
 343       switch (kst->dist)
 344         {
 345         case KS_UNIFORM:
 346           tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
 347           tab_double (table, col, 2, 0, ks[i].test_min, NULL);
 348           tab_double (table, col, 3, 0, ks[i].test_max, NULL);
 349           break;
 350
 351         case KS_NORMAL:
 352           tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
 353           tab_double (table, col, 2, 0, ks[i].mu, NULL);
 354           tab_double (table, col, 3, 0, ks[i].sigma, NULL);
 355           break;
 356
 357         case KS_POISSON:
 358         case KS_EXPONENTIAL:
 359           tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
 360           tab_double (table, col, 2, 0, ks[i].mu, NULL);
 361           break;
 362
 363         default:
 364           NOT_REACHED ();
 365         }
 366
 367       abs = ks[i].diff_pos;
 368       maximize (&abs, -ks[i].diff_neg);
 369
 370       z = sqrt (ks[i].obs_cc) * abs;
 371
 372       tab_double (table, col, 5, 0, ks[i].diff_pos, NULL);
 373       tab_double (table, col, 6, 0, ks[i].diff_neg, NULL);
 374
 375       tab_double (table, col, 4, 0, abs, NULL);
 376
 377       tab_double (table, col, 7, 0, z, NULL);
 378       tab_double (table, col, 8, 0, ks_asymp_sig (z), NULL);
 379     }
 380
 381
 382   tab_text (table,  0, 4,
 383             TAT_TITLE | TAB_LEFT , _("Most Extreme Differences"));
 384
 385   tab_text (table,  1, 4,
 386             TAT_TITLE | TAB_LEFT , _("Absolute"));
 387
 388   tab_text (table,  1, 5,
 389             TAT_TITLE | TAB_LEFT , _("Positive"));
 390
 391   tab_text (table,  1, 6,
 392             TAT_TITLE | TAB_LEFT , _("Negative"));
 393
 394   tab_text (table,  0, 7,
 395             TAT_TITLE | TAB_LEFT , _("Kolmogorov-Smirnov Z"));
 396
 397   tab_text (table,  0, 8,
 398             TAT_TITLE | TAB_LEFT , _("Asymp. Sig. (2-tailed)"));
 399
 400   tab_submit (table);
 401 }