1 /* Pspp - a program for statistical analysis.
2 Copyright (C) 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "jonckheere-terpstra.h"
22 #include <gsl/gsl_cdf.h>
25 #include "data/casegrouper.h"
26 #include "data/casereader.h"
27 #include "data/casewriter.h"
28 #include "data/dataset.h"
29 #include "data/dictionary.h"
30 #include "data/format.h"
31 #include "data/subcase.h"
32 #include "data/variable.h"
33 #include "libpspp/assertion.h"
34 #include "libpspp/hmap.h"
35 #include "libpspp/message.h"
36 #include "libpspp/misc.h"
37 #include "math/sort.h"
38 #include "output/tab.h"
40 #include "gl/minmax.h"
41 #include "gl/xalloc.h"
43 /* Returns true iff the independent variable lies in the
44 between val1 and val2. Regardless of which is the greater value.
47 include_func_bi (const struct ccase *c, void *aux)
49 const struct n_sample_test *nst = aux;
50 const union value *bigger = NULL;
51 const union value *smaller = NULL;
53 if (0 > value_compare_3way (&nst->val1, &nst->val2, var_get_width (nst->indep_var)))
64 if (0 < value_compare_3way (smaller, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
67 if (0 > value_compare_3way (bigger, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
75 /* The total of the caseweights in the group */
78 /* A casereader containing the group data.
79 This casereader contains just two values:
80 0: The raw value of the data
81 1: The cumulative caseweight
83 struct casereader *reader;
88 u (const struct group_data *grp0, const struct group_data *grp1)
92 struct casereader *r0 = casereader_clone (grp0->reader);
94 double prev_cc0 = 0.0;
95 for (; (c0 = casereader_read (r0)); case_unref (c0))
98 struct casereader *r1 = casereader_clone (grp1->reader);
99 double x0 = case_data_idx (c0, 0)->f;
100 double cc0 = case_data_idx (c0, 1)->f;
101 double w0 = cc0 - prev_cc0;
105 for (; (c1 = casereader_read (r1)); case_unref (c1))
107 double x1 = case_data_idx (c1, 0)->f;
108 double cc1 = case_data_idx (c1, 1)->f;
116 usum += w0 * (grp1->cc - prev_cc1);
123 usum += w0 * ( (grp1->cc - prev_cc1) / 2.0);
125 usum += w0 * (grp1->cc - (prev_cc1 + cc1) / 2.0);
133 casereader_destroy (r1);
136 casereader_destroy (r0);
142 typedef double func_f (double e_l);
145 These 3 functions are used repeatedly in the calculation of the
146 variance of the JT statistic.
147 Having them explicitly defined makes the variance calculation
153 return e * (e - 1) * (2*e + 5);
159 return e * (e - 1) * (e - 2);
168 static func_f *mff[3] =
175 This function does the following:
176 It creates an ordered set of *distinct* values from IR.
177 For each case in that set, it calls f[0..N] passing it the caseweight.
178 It returns the sum of f[j] in result[j].
180 result and f must be allocated prior to calling this function.
183 void variance_calculation (struct casereader *ir, const struct variable *var,
184 const struct dictionary *dict,
185 func_f **f, double *result, size_t n)
188 struct casereader *r = casereader_clone (ir);
190 const struct variable *wv = dict_get_weight (dict);
191 const int w_idx = wv ?
192 var_get_case_index (wv) :
193 caseproto_get_n_widths (casereader_get_proto (r)) ;
195 r = sort_execute_1var (r, var);
197 r = casereader_create_distinct (r, var, dict_get_weight (dict));
199 for (; (c = casereader_read (r)); case_unref (c))
201 double w = case_data_idx (c, w_idx)->f;
203 for (i = 0; i < n; ++i)
204 result[i] += f[i] (w);
207 casereader_destroy (r);
219 static void show_jt (const struct n_sample_test *nst, const struct jt *jt, const struct variable *wv);
223 jonckheere_terpstra_execute (const struct dataset *ds,
224 struct casereader *input,
225 enum mv_class exclude,
226 const struct npar_test *test,
232 const struct dictionary *dict = dataset_dict (ds);
233 const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent);
235 struct caseproto *proto = caseproto_create ();
236 proto = caseproto_add_width (proto, 0);
237 proto = caseproto_add_width (proto, 0);
239 /* If the independent variable is missing, then we ignore the case */
240 input = casereader_create_filter_missing (input,
245 /* Remove cases with invalid weigths */
246 input = casereader_create_filter_weight (input, dict, &warn, NULL);
248 /* Remove all those cases which are outside the range (val1, val2) */
249 input = casereader_create_filter_func (input, include_func_bi, NULL,
250 CONST_CAST (struct n_sample_test *, nst), NULL);
252 /* Sort the data by the independent variable */
253 input = sort_execute_1var (input, nst->indep_var);
255 for (v = 0; v < nst->n_vars ; ++v)
262 double sums[3] = {0,0,0};
263 double e_sum[3] = {0,0,0};
265 struct group_data *grp = NULL;
268 struct casegrouper *grouper;
269 struct casereader *group;
270 struct casereader *vreader= casereader_clone (input);
272 /* Get a few values into e_sum - we'll be needing these later */
273 variance_calculation (vreader, nst->vars[v], dict, mff, e_sum, 3);
276 casegrouper_create_vars (vreader, &nst->indep_var, 1);
281 for (; casegrouper_get_next_group (grouper, &group);
282 casereader_destroy (group) )
284 struct casewriter *writer = autopaging_writer_create (proto);
288 group = sort_execute_1var (group, nst->vars[v]);
289 for (; (c = casereader_read (group)); case_unref (c))
291 struct ccase *c_out = case_create (proto);
292 const union value *x = case_data (c, nst->vars[v]);
294 case_data_rw_idx (c_out, 0)->f = x->f;
296 cc += dict_get_case_weight (dict, c, &warn);
297 case_data_rw_idx (c_out, 1)->f = cc;
298 casewriter_write (writer, c_out);
301 grp = xrealloc (grp, sizeof *grp * (jt.levels + 1));
303 grp[jt.levels].reader = casewriter_make_reader (writer);
304 grp[jt.levels].cc = cc;
308 ccsq_sum += pow2 (cc);
311 casegrouper_destroy (grouper);
313 for (g0 = 0; g0 < jt.levels; ++g0)
316 for (g1 = g0 +1 ; g1 < jt.levels; ++g1)
318 double uu = u (&grp[g0], &grp[g1]);
321 nn += pow2 (grp[g0].cc) * (2 * grp[g0].cc + 3);
323 for (i = 0; i < 3; ++i)
324 sums[i] += mff[i] (grp[g0].cc);
326 casereader_destroy (grp[g0].reader);
331 variance = (mff[0](jt.n) - sums[0] - e_sum[0]) / 72.0;
332 variance += sums[1] * e_sum[1] / (36.0 * mff[1] (jt.n));
333 variance += sums[2] * e_sum[2] / (8.0 * mff[2] (jt.n));
335 jt.stddev = sqrt (variance);
337 jt.mean = (pow2 (jt.n) - ccsq_sum) / 4.0;
339 show_jt (nst, &jt, dict_get_weight (dict));
342 casereader_destroy (input);
343 caseproto_unref (proto);
349 #define _(msgid) gettext (msgid)
352 show_jt (const struct n_sample_test *nst, const struct jt *jt, const struct variable *wv)
355 const int row_headers = 1;
356 const int column_headers = 1;
357 const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : & F_8_0;
359 struct tab_table *table =
360 tab_create (row_headers + 7, column_headers + nst->n_vars);
362 tab_headers (table, row_headers, 0, column_headers, 0);
364 tab_title (table, _("Jonckheere-Terpstra Test"));
366 /* Vertical lines inside the box */
367 tab_box (table, 1, 0, -1, TAL_1,
368 row_headers, 0, tab_nc (table) - 1, tab_nr (table) - 1 );
370 /* Box around the table */
371 tab_box (table, TAL_2, TAL_2, -1, -1,
372 0, 0, tab_nc (table) - 1, tab_nr (table) - 1 );
374 tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers);
375 tab_vline (table, TAL_2, row_headers, 0, tab_nr (table) - 1);
377 tab_text_format (table, 1, 0, TAT_TITLE | TAB_CENTER,
378 _("Number of levels in %s"),
379 var_to_string (nst->indep_var));
380 tab_text (table, 2, 0, TAT_TITLE | TAB_CENTER, _("N"));
381 tab_text (table, 3, 0, TAT_TITLE | TAB_CENTER, _("Observed J-T Statistic"));
382 tab_text (table, 4, 0, TAT_TITLE | TAB_CENTER, _("Mean J-T Statistic"));
383 tab_text (table, 5, 0, TAT_TITLE | TAB_CENTER, _("Std. Deviation of J-T Statistic"));
384 tab_text (table, 6, 0, TAT_TITLE | TAB_CENTER, _("Std. J-T Statistic"));
385 tab_text (table, 7, 0, TAT_TITLE | TAB_CENTER, _("Asymp. Sig. (2-tailed)"));
388 for (i = 0; i < nst->n_vars; ++i)
392 tab_text (table, 0, i + row_headers, TAT_TITLE,
393 var_to_string (nst->vars[i]) );
395 tab_double (table, 1, i + row_headers, TAT_TITLE,
396 jt[0].levels, &F_8_0);
398 tab_double (table, 2, i + row_headers, TAT_TITLE,
401 tab_double (table, 3, i + row_headers, TAT_TITLE,
404 tab_double (table, 4, i + row_headers, TAT_TITLE,
407 tab_double (table, 5, i + row_headers, TAT_TITLE,
410 std_jt = (jt[0].obs - jt[0].mean) / jt[0].stddev;
411 tab_double (table, 6, i + row_headers, TAT_TITLE,
414 tab_double (table, 7, i + row_headers, TAT_TITLE,
415 2.0 * ((std_jt > 0) ? gsl_cdf_ugaussian_Q (std_jt) : gsl_cdf_ugaussian_P (std_jt)), 0);