1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/stats/ks-one-sample.h"
21 #include <gsl/gsl_cdf.h>
26 #include "math/sort.h"
27 #include "data/case.h"
28 #include "data/casereader.h"
29 #include "data/dataset.h"
30 #include "data/dictionary.h"
31 #include "data/format.h"
32 #include "data/value-labels.h"
33 #include "data/variable.h"
34 #include "language/stats/freq.h"
35 #include "language/stats/npar.h"
36 #include "libpspp/array.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/compiler.h"
40 #include "libpspp/hash-functions.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "output/tab.h"
45 #include "gl/xalloc.h"
48 #define _(msgid) gettext (msgid)
51 /* The per test variable statistics */
68 typedef double theoretical (const struct ks *ks, double x);
69 typedef theoretical *theoreticalfp;
72 theoretical_uniform (const struct ks *ks, double x)
74 return gsl_cdf_flat_P (x, ks->test_min, ks->test_max);
78 theoretical_normal (const struct ks *ks, double x)
80 return gsl_cdf_gaussian_P (x - ks->mu, ks->sigma);
84 theoretical_poisson (const struct ks *ks, double x)
86 return gsl_cdf_poisson_P (x, ks->mu);
90 theoretical_exponential (const struct ks *ks, double x)
92 return gsl_cdf_exponential_P (x, 1/ks->mu);
96 static const theoreticalfp theoreticalf[4] =
101 theoretical_exponential
105 Return the assymptotic approximation to the significance of Z
108 ks_asymp_sig (double z)
118 double q = exp (-1.233701 * pow (z, -2));
119 return 1 - 2.506628 * (q + pow (q, 9) + pow (q, 25))/ z ;
123 double q = exp (-2 * z * z);
124 return 2 * (q - pow (q, 4) + pow (q, 9) - pow (q, 16))/ z ;
128 static void show_results (const struct ks *, const struct ks_one_sample_test *, const struct fmt_spec *);
132 ks_one_sample_execute (const struct dataset *ds,
133 struct casereader *input,
134 enum mv_class exclude,
135 const struct npar_test *test)
137 const struct dictionary *dict = dataset_dict (ds);
138 const struct ks_one_sample_test *kst = UP_CAST (test, const struct ks_one_sample_test, parent.parent);
139 const struct one_sample_test *ost = &kst->parent;
141 const struct variable *wvar = dict_get_weight (dict);
142 const struct fmt_spec *wfmt = wvar ? var_get_print_format (wvar) : & F_8_0;
146 struct ks *ks = xcalloc (sizeof *ks, ost->n_vars);
148 for (v = 0; v < ost->n_vars; ++v)
151 ks[v].test_min = DBL_MAX;
152 ks[v].test_max = -DBL_MAX;
153 ks[v].diff_pos = -DBL_MAX;
154 ks[v].diff_neg = DBL_MAX;
159 struct casereader *r = casereader_clone (input);
161 for (; (c = casereader_read (r)) != NULL; case_unref (c))
163 const double weight = dict_get_case_weight (dict, c, &warn);
165 for (v = 0; v < ost->n_vars; ++v)
167 const struct variable *var = ost->vars[v];
168 const union value *val = case_data (c, var);
170 if (var_is_value_missing (var, val, exclude))
173 minimize (&ks[v].test_min, val->f);
174 maximize (&ks[v].test_max, val->f);
176 ks[v].obs_cc += weight;
178 ks[v].ssq += pow2 (val->f);
181 casereader_destroy (r);
183 for (v = 0; v < ost->n_vars; ++v)
188 if (kst->p[0] != SYSMIS)
189 ks[v].test_min = kst->p[0];
191 if (kst->p[1] != SYSMIS)
192 ks[v].test_max = kst->p[1];
195 if (kst->p[0] != SYSMIS)
196 ks[v].mu = kst->p[0];
198 ks[v].mu = ks[v].sum / ks[v].obs_cc;
200 if (kst->p[1] != SYSMIS)
201 ks[v].sigma = kst->p[1];
204 ks[v].sigma = ks[v].ssq - pow2 (ks[v].sum) / ks[v].obs_cc;
205 ks[v].sigma /= ks[v].obs_cc - 1;
206 ks[v].sigma = sqrt (ks[v].sigma);
212 if (kst->p[0] != SYSMIS)
213 ks[v].mu = ks[v].sigma = kst->p[0];
215 ks[v].mu = ks[v].sigma = ks[v].sum / ks[v].obs_cc;
221 const struct variable *var = ost->vars[v];
223 double prev_empirical = 0;
224 r = sort_execute_1var (casereader_clone (input), var);
225 for (; (c = casereader_read (r)) != NULL; case_unref (c))
227 double theoretical, empirical;
229 const double weight = dict_get_case_weight (dict, c, &warn);
230 const union value *val = case_data (c, var);
232 if (var_is_value_missing (var, val, exclude))
237 empirical = cc / ks[v].obs_cc;
239 theoretical = theoreticalf[kst->dist] (&ks[v], val->f);
241 d = empirical - theoretical;
242 dp = prev_empirical - theoretical;
245 maximize (&ks[v].diff_pos, d);
247 minimize (&ks[v].diff_neg, d);
250 maximize (&ks[v].diff_pos, dp);
252 minimize (&ks[v].diff_neg, dp);
254 prev_empirical = empirical;
257 casereader_destroy (r);
260 show_results (ks, kst, wfmt);
263 casereader_destroy (input);
268 show_results (const struct ks *ks,
269 const struct ks_one_sample_test *kst,
270 const struct fmt_spec *wfmt)
273 const int row_headers = 1;
274 const int column_headers = 2;
275 const int nc = kst->parent.n_vars + column_headers;
276 const int nr = 8 + row_headers;
277 struct tab_table *table = tab_create (nc, nr);
279 tab_headers (table, row_headers, 0, column_headers, 0);
281 tab_title (table, _("One-Sample Kolmogorov-Smirnov Test"));
283 /* Box around the table */
284 tab_box (table, TAL_2, TAL_2, -1, -1,
285 0, 0, nc - 1, nr - 1 );
287 tab_hline (table, TAL_2, 0, nc - 1, row_headers);
289 tab_vline (table, TAL_1, column_headers, 0, nr - 1);
291 tab_text (table, 0, 1,
292 TAT_TITLE | TAB_LEFT , _("N"));
297 tab_text (table, 0, 2,
298 TAT_TITLE | TAB_LEFT , _("Normal Parameters"));
300 tab_text (table, 1, 2,
301 TAT_TITLE | TAB_LEFT , _("Mean"));
302 tab_text (table, 1, 3,
303 TAT_TITLE | TAB_LEFT , _("Std. Deviation"));
306 tab_text (table, 0, 2,
307 TAT_TITLE | TAB_LEFT , _("Uniform Parameters"));
309 tab_text (table, 1, 2,
310 TAT_TITLE | TAB_LEFT , _("Minimum"));
311 tab_text (table, 1, 3,
312 TAT_TITLE | TAB_LEFT , _("Maximum"));
315 tab_text (table, 0, 2,
316 TAT_TITLE | TAB_LEFT , _("Poisson Parameters"));
318 tab_text (table, 1, 2,
319 TAT_TITLE | TAB_LEFT , _("Lambda"));
322 tab_text (table, 0, 2,
323 TAT_TITLE | TAB_LEFT , _("Exponential Parameters"));
325 tab_text (table, 1, 2,
326 TAT_TITLE | TAB_LEFT , _("Scale"));
333 /* The variable columns */
334 for (i = 0; i < kst->parent.n_vars; ++i)
338 const int col = 2 + i;
339 tab_text (table, col, 0,
340 TAT_TITLE | TAB_CENTER ,
341 var_to_string (kst->parent.vars[i]));
346 tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
347 tab_double (table, col, 2, 0, ks[i].test_min, NULL);
348 tab_double (table, col, 3, 0, ks[i].test_max, NULL);
352 tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
353 tab_double (table, col, 2, 0, ks[i].mu, NULL);
354 tab_double (table, col, 3, 0, ks[i].sigma, NULL);
359 tab_double (table, col, 1, 0, ks[i].obs_cc, wfmt);
360 tab_double (table, col, 2, 0, ks[i].mu, NULL);
367 abs = ks[i].diff_pos;
368 maximize (&abs, -ks[i].diff_neg);
370 z = sqrt (ks[i].obs_cc) * abs;
372 tab_double (table, col, 5, 0, ks[i].diff_pos, NULL);
373 tab_double (table, col, 6, 0, ks[i].diff_neg, NULL);
375 tab_double (table, col, 4, 0, abs, NULL);
377 tab_double (table, col, 7, 0, z, NULL);
378 tab_double (table, col, 8, 0, ks_asymp_sig (z), NULL);
382 tab_text (table, 0, 4,
383 TAT_TITLE | TAB_LEFT , _("Most Extreme Differences"));
385 tab_text (table, 1, 4,
386 TAT_TITLE | TAB_LEFT , _("Absolute"));
388 tab_text (table, 1, 5,
389 TAT_TITLE | TAB_LEFT , _("Positive"));
391 tab_text (table, 1, 6,
392 TAT_TITLE | TAB_LEFT , _("Negative"));
394 tab_text (table, 0, 7,
395 TAT_TITLE | TAB_LEFT , _("Kolmogorov-Smirnov Z"));
397 tab_text (table, 0, 8,
398 TAT_TITLE | TAB_LEFT , _("Asymp. Sig. (2-tailed)"));