pintos-os.org Git - pspp/blob - src/language/stats/ks-one-sample.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/stats/ks-one-sample.h"
  20
  21 #include <gsl/gsl_cdf.h>
  22 #include <math.h>
  23 #include <stdlib.h>
  24
  25
  26 #include "math/sort.h"
  27 #include "data/case.h"
  28 #include "data/casereader.h"
  29 #include "data/dataset.h"
  30 #include "data/dictionary.h"
  31 #include "data/format.h"
  32 #include "data/value-labels.h"
  33 #include "data/variable.h"
  34 #include "language/stats/freq.h"
  35 #include "language/stats/npar.h"
  36 #include "libpspp/array.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/compiler.h"
  40 #include "libpspp/hash-functions.h"
  41 #include "libpspp/message.h"
  42 #include "libpspp/misc.h"
  43 #include "output/tab.h"
  44
  45 #include "gl/xalloc.h"
  46
  47 #include "gettext.h"
  48 #define _(msgid) gettext (msgid)
  49
  50
  51 /* The per test variable statistics */
  52 struct ks
  53 {
  54   double obs_cc;
  55
  56   double test_min ;
  57   double test_max;
  58   double mu;
  59   double sigma;
  60
  61   double diff_pos;
  62   double diff_neg;
  63
  64   double ssq;
  65   double sum;
  66 };
  67
  68 typedef double theoretical (const struct ks *ks, double x);
  69 typedef theoretical *theoreticalfp;
  70
  71 static double
  72 theoretical_uniform (const struct ks *ks, double x)
  73 {
  74   return gsl_cdf_flat_P (x, ks->test_min, ks->test_max);
  75 }
  76
  77 static double
  78 theoretical_normal (const struct ks *ks, double x)
  79 {
  80   return gsl_cdf_gaussian_P (x - ks->mu, ks->sigma);
  81 }
  82
  83 static double
  84 theoretical_poisson (const struct ks *ks, double x)
  85 {
  86   return gsl_cdf_poisson_P (x, ks->mu);
  87 }
  88
  89 static double
  90 theoretical_exponential (const struct ks *ks, double x)
  91 {
  92   return gsl_cdf_exponential_P (x, 1/ks->mu);
  93 }
  94
  95
  96 static const  theoreticalfp theoreticalf[4] =
  97 {
  98   theoretical_normal,
  99   theoretical_uniform,
 100   theoretical_poisson,
 101   theoretical_exponential
 102 };
 103
 104 /*
 105    Return the assymptotic approximation to the significance of Z
 106  */
 107 static double
 108 ks_asymp_sig (double z)
 109 {
 110   if (z < 0.27)
 111     return 1;
 112
 113   if (z >= 3.1)
 114     return 0;
 115
 116   if (z < 1)
 117     {
 118       double q = exp (-1.233701 * pow (z, -2));
 119       return 1 - 2.506628 * (q + pow (q, 9) + pow (q, 25))/ z ;
 120     }
 121   else
 122     {
 123       double q = exp (-2 * z * z);
 124       return 2 * (q - pow (q, 4) + pow (q, 9) - pow (q, 16))/ z ;
 125     }
 126 }
 127
 128 static void show_results (const struct ks *, const struct ks_one_sample_test *,  const struct fmt_spec *);
 129
 130
 131 void
 132 ks_one_sample_execute (const struct dataset *ds,
 133                        struct casereader *input,
 134                        enum mv_class exclude,
 135                        const struct npar_test *test,
 136                        bool x UNUSED, double y UNUSED)
 137 {
 138   const struct dictionary *dict = dataset_dict (ds);
 139   const struct ks_one_sample_test *kst = UP_CAST (test, const struct ks_one_sample_test, parent.parent);
 140   const struct one_sample_test *ost = &kst->parent;
 141   struct ccase *c;
 142   const struct variable *wvar = dict_get_weight (dict);
 143   const struct fmt_spec *wfmt = wvar ? var_get_print_format (wvar) : & F_8_0;
 144   bool warn = true;
 145   int v;
 146   struct casereader *r = casereader_clone (input);
 147
 148   struct ks *ks = xcalloc (sizeof *ks, ost->n_vars);
 149
 150   for (v = 0; v < ost->n_vars; ++v)
 151     {
 152       ks[v].obs_cc = 0;
 153       ks[v].test_min = DBL_MAX;
 154       ks[v].test_max = -DBL_MAX;
 155       ks[v].diff_pos = -DBL_MAX;
 156       ks[v].diff_neg = DBL_MAX;
 157       ks[v].sum = 0;
 158       ks[v].ssq = 0;
 159     }
 160
 161   for (; (c = casereader_read (r)) != NULL; case_unref (c))
 162     {
 163       const double weight = dict_get_case_weight (dict, c, &warn);
 164
 165       for (v = 0; v < ost->n_vars; ++v)
 166         {
 167           const struct variable *var = ost->vars[v];
 168           const union value *val = case_data (c, var);
 169
 170           if (var_is_value_missing (var, val, exclude))
 171             continue;
 172
 173           minimize (&ks[v].test_min, val->f);
 174           maximize (&ks[v].test_max, val->f);
 175
 176           ks[v].obs_cc += weight;
 177           ks[v].sum += val->f;
 178           ks[v].ssq += pow2 (val->f);
 179         }
 180     }
 181   casereader_destroy (r);
 182
 183   for (v = 0; v < ost->n_vars; ++v)
 184     {
 185       const struct variable *var = ost->vars[v];
 186       double cc = 0;
 187       double prev_empirical = 0;
 188
 189       switch (kst->dist)
 190         {
 191         case KS_UNIFORM:
 192           if (kst->p[0] != SYSMIS)
 193             ks[v].test_min = kst->p[0];
 194
 195           if (kst->p[1] != SYSMIS)
 196             ks[v].test_max = kst->p[1];
 197           break;
 198         case KS_NORMAL:
 199           if (kst->p[0] != SYSMIS)
 200             ks[v].mu = kst->p[0];
 201           else
 202             ks[v].mu = ks[v].sum / ks[v].obs_cc;
 203
 204           if (kst->p[1] != SYSMIS)
 205             ks[v].sigma = kst->p[1];
 206           else
 207             {
 208               ks[v].sigma = ks[v].ssq - pow2 (ks[v].sum) / ks[v].obs_cc;
 209               ks[v].sigma /= ks[v].obs_cc - 1;
 210               ks[v].sigma = sqrt (ks[v].sigma);
 211             }
 212
 213           break;
 214         case KS_POISSON:
 215         case KS_EXPONENTIAL:
 216           if (kst->p[0] != SYSMIS)
 217             ks[v].mu = ks[v].sigma = kst->p[0];
 218           else
 219             ks[v].mu = ks[v].sigma = ks[v].sum / ks[v].obs_cc;
 220           break;
 221         default:
 222           NOT_REACHED ();
 223         }
 224
 225       r = sort_execute_1var (casereader_clone (input), var);
 226       for (; (c = casereader_read (r)) != NULL; case_unref (c))
 227         {
 228           double theoretical, empirical;
 229           double d, dp;
 230           const double weight = dict_get_case_weight (dict, c, &warn);
 231           const union value *val = case_data (c, var);
 232
 233           if (var_is_value_missing (var, val, exclude))
 234             continue;
 235
 236           cc += weight;
 237
 238           empirical = cc / ks[v].obs_cc;
 239
 240           theoretical = theoreticalf[kst->dist] (&ks[v], val->f);
 241
 242           d = empirical - theoretical;
 243           dp = prev_empirical - theoretical;
 244
 245           if (d > 0)
 246             maximize (&ks[v].diff_pos, d);
 247           else
 248             minimize (&ks[v].diff_neg, d);
 249
 250           if (dp > 0)
 251             maximize (&ks[v].diff_pos, dp);
 252           else
 253             minimize (&ks[v].diff_neg, dp);
 254
 255           prev_empirical = empirical;
 256         }
 257
 258       casereader_destroy (r);
 259     }
 260
 261   show_results (ks, kst, wfmt);
 262
 263   free (ks);
 264   casereader_destroy (input);
 265 }
 266
 267
 268 static void
 269 show_results (const struct ks *ks,
 270               const struct ks_one_sample_test *kst,
 271               const struct fmt_spec *wfmt)
 272 {
 273   int i;
 274   const int row_headers = 1;
 275   const int column_headers = 2;
 276   const int nc = kst->parent.n_vars + column_headers;
 277   const int nr = 8 + row_headers;
 278   struct tab_table *table = tab_create (nc, nr);
 279
 280   tab_headers (table, row_headers, 0, column_headers, 0);
 281
 282   tab_title (table, _("One-Sample Kolmogorov-Smirnov Test"));
 283
 284   /* Box around the table */
 285   tab_box (table, TAL_2, TAL_2, -1, -1,
 286            0,  0, nc - 1, nr - 1 );
 287
 288   tab_hline (table, TAL_2, 0, nc - 1, row_headers);
 289
 290   tab_vline (table, TAL_1, column_headers, 0, nr - 1);
 291
 292   tab_text (table,  0, 1,
 293             TAT_TITLE | TAB_LEFT , _("N"));
 294
 295   switch (kst->dist)
 296     {
 297     case KS_NORMAL:
 298       tab_text (table,  0, 2,
 299                 TAT_TITLE | TAB_LEFT , _("Normal Parameters"));
 300
 301       tab_text (table,  1, 2,
 302                 TAT_TITLE | TAB_LEFT , _("Mean"));
 303       tab_text (table,  1, 3,
 304                 TAT_TITLE | TAB_LEFT , _("Std. Deviation"));
 305       break;
 306     case KS_UNIFORM:
 307       tab_text (table,  0, 2,
 308                 TAT_TITLE | TAB_LEFT , _("Uniform Parameters"));
 309
 310       tab_text (table,  1, 2,
 311                 TAT_TITLE | TAB_LEFT , _("Minimum"));
 312       tab_text (table,  1, 3,
 313                 TAT_TITLE | TAB_LEFT , _("Maximum"));
 314       break;
 315     case KS_POISSON:
 316       tab_text (table,  0, 2,
 317                 TAT_TITLE | TAB_LEFT , _("Poisson Parameters"));
 318
 319       tab_text (table,  1, 2,
 320                 TAT_TITLE | TAB_LEFT , _("Lambda"));
 321       break;
 322     case KS_EXPONENTIAL:
 323       tab_text (table,  0, 2,
 324                 TAT_TITLE | TAB_LEFT , _("Exponential Parameters"));
 325
 326       tab_text (table,  1, 2,
 327                 TAT_TITLE | TAB_LEFT , _("Scale"));
 328       break;
 329
 330     default:
 331       NOT_REACHED ();
 332     }
 333
 334   /* The variable columns */
 335   for (i = 0; i < kst->parent.n_vars; ++i)
 336     {
 337       double abs = 0;
 338       double z = 0;
 339       const int col = 2 + i;
 340       tab_text (table, col, 0,
 341                 TAT_TITLE | TAB_CENTER ,
 342                 var_to_string (kst->parent.vars[i]));
 343
 344       switch (kst->dist)
 345         {
 346         case KS_UNIFORM:
 347           tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
 348           tab_double (table, col, 2, 0, ks[i].test_min, NULL);
 349           tab_double (table, col, 3, 0, ks[i].test_max, NULL);
 350           break;
 351
 352         case KS_NORMAL:
 353           tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
 354           tab_double (table, col, 2, 0, ks[i].mu, NULL);
 355           tab_double (table, col, 3, 0, ks[i].sigma, NULL);
 356           break;
 357
 358         case KS_POISSON:
 359         case KS_EXPONENTIAL:
 360           tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
 361           tab_double (table, col, 2, 0, ks[i].mu, NULL);
 362           break;
 363
 364         default:
 365           NOT_REACHED ();
 366         }
 367
 368       abs = ks[i].diff_pos;
 369       maximize (&abs, -ks[i].diff_neg);
 370
 371       z = sqrt (ks[i].obs_cc) * abs;
 372
 373       tab_double (table, col, 5, 0, ks[i].diff_pos, NULL);
 374       tab_double (table, col, 6, 0, ks[i].diff_neg, NULL);
 375
 376       tab_double (table, col, 4, 0, abs, NULL);
 377
 378       tab_double (table, col, 7, 0, z, NULL);
 379       tab_double (table, col, 8, 0, ks_asymp_sig (z), NULL);
 380     }
 381
 382
 383   tab_text (table,  0, 4,
 384             TAT_TITLE | TAB_LEFT , _("Most Extreme Differences"));
 385
 386   tab_text (table,  1, 4,
 387             TAT_TITLE | TAB_LEFT , _("Absolute"));
 388
 389   tab_text (table,  1, 5,
 390             TAT_TITLE | TAB_LEFT , _("Positive"));
 391
 392   tab_text (table,  1, 6,
 393             TAT_TITLE | TAB_LEFT , _("Negative"));
 394
 395   tab_text (table,  0, 7,
 396             TAT_TITLE | TAB_LEFT , _("Kolmogorov-Smirnov Z"));
 397
 398   tab_text (table,  0, 8,
 399             TAT_TITLE | TAB_LEFT , _("Asymp. Sig. (2-tailed)"));
 400
 401   tab_submit (table);
 402 }