1 /* Pspp - a program for statistical analysis.
2 Copyright (C) 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "jonckheere-terpstra.h"
22 #include <gsl/gsl_cdf.h>
25 #include "data/casegrouper.h"
26 #include "data/casereader.h"
27 #include "data/casewriter.h"
28 #include "data/dataset.h"
29 #include "data/dictionary.h"
30 #include "data/format.h"
31 #include "data/subcase.h"
32 #include "data/variable.h"
33 #include "libpspp/assertion.h"
34 #include "libpspp/hmap.h"
35 #include "libpspp/message.h"
36 #include "libpspp/misc.h"
37 #include "math/sort.h"
38 #include "output/pivot-table.h"
40 #include "gl/minmax.h"
41 #include "gl/xalloc.h"
44 #define N_(msgid) msgid
45 #define _(msgid) gettext (msgid)
48 /* Returns true iff the independent variable lies in the
49 between val1 and val2. Regardless of which is the greater value.
52 include_func_bi (const struct ccase *c, void *aux)
54 const struct n_sample_test *nst = aux;
55 const union value *bigger = NULL;
56 const union value *smaller = NULL;
58 if (0 > value_compare_3way (&nst->val1, &nst->val2, var_get_width (nst->indep_var)))
69 if (0 < value_compare_3way (smaller, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
72 if (0 > value_compare_3way (bigger, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
80 /* The total of the caseweights in the group */
83 /* A casereader containing the group data.
84 This casereader contains just two values:
85 0: The raw value of the data
86 1: The cumulative caseweight
88 struct casereader *reader;
93 u (const struct group_data *grp0, const struct group_data *grp1)
97 struct casereader *r0 = casereader_clone (grp0->reader);
99 double prev_cc0 = 0.0;
100 for (; (c0 = casereader_read (r0)); case_unref (c0))
103 struct casereader *r1 = casereader_clone (grp1->reader);
104 double x0 = case_num_idx (c0, 0);
105 double cc0 = case_num_idx (c0, 1);
106 double w0 = cc0 - prev_cc0;
110 for (; (c1 = casereader_read (r1)); case_unref (c1))
112 double x1 = case_num_idx (c1, 0);
113 double cc1 = case_num_idx (c1, 1);
121 usum += w0 * (grp1->cc - prev_cc1);
128 usum += w0 * ((grp1->cc - prev_cc1) / 2.0);
130 usum += w0 * (grp1->cc - (prev_cc1 + cc1) / 2.0);
138 casereader_destroy (r1);
141 casereader_destroy (r0);
147 typedef double func_f (double e_l);
150 These 3 functions are used repeatedly in the calculation of the
151 variance of the JT statistic.
152 Having them explicitly defined makes the variance calculation
158 return e * (e - 1) * (2*e + 5);
164 return e * (e - 1) * (e - 2);
173 static func_f *mff[3] =
180 This function does the following:
181 It creates an ordered set of *distinct* values from IR.
182 For each case in that set, it calls f[0..N] passing it the caseweight.
183 It returns the sum of f[j] in result[j].
185 result and f must be allocated prior to calling this function.
188 void variance_calculation (struct casereader *ir, const struct variable *var,
189 const struct dictionary *dict,
190 func_f **f, double *result, size_t n)
193 struct casereader *r = casereader_clone (ir);
195 const struct variable *wv = dict_get_weight (dict);
196 const int w_idx = wv ?
197 var_get_case_index (wv) :
198 caseproto_get_n_widths (casereader_get_proto (r)) ;
200 r = sort_execute_1var (r, var);
202 r = casereader_create_distinct (r, var, dict_get_weight (dict));
204 for (; (c = casereader_read (r)); case_unref (c))
206 double w = case_num_idx (c, w_idx);
208 for (i = 0; i < n; ++i)
209 result[i] += f[i] (w);
212 casereader_destroy (r);
224 static void show_jt (const struct n_sample_test *, const struct jt *,
225 const struct fmt_spec *wfmt);
229 jonckheere_terpstra_execute (const struct dataset *ds,
230 struct casereader *input,
231 enum mv_class exclude,
232 const struct npar_test *test,
238 const struct dictionary *dict = dataset_dict (ds);
239 const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent);
241 struct caseproto *proto = caseproto_create ();
242 proto = caseproto_add_width (proto, 0);
243 proto = caseproto_add_width (proto, 0);
245 /* If the independent variable is missing, then we ignore the case */
246 input = casereader_create_filter_missing (input,
251 /* Remove cases with invalid weigths */
252 input = casereader_create_filter_weight (input, dict, &warn, NULL);
254 /* Remove all those cases which are outside the range (val1, val2) */
255 input = casereader_create_filter_func (input, include_func_bi, NULL,
256 CONST_CAST (struct n_sample_test *, nst), NULL);
258 /* Sort the data by the independent variable */
259 input = sort_execute_1var (input, nst->indep_var);
261 for (v = 0; v < nst->n_vars ; ++v)
268 double sums[3] = {0,0,0};
269 double e_sum[3] = {0,0,0};
271 struct group_data *grp = NULL;
274 struct casegrouper *grouper;
275 struct casereader *group;
276 struct casereader *vreader= casereader_clone (input);
278 /* Get a few values into e_sum - we'll be needing these later */
279 variance_calculation (vreader, nst->vars[v], dict, mff, e_sum, 3);
282 casegrouper_create_vars (vreader, &nst->indep_var, 1);
287 for (; casegrouper_get_next_group (grouper, &group);
288 casereader_destroy (group))
290 struct casewriter *writer = autopaging_writer_create (proto);
294 group = sort_execute_1var (group, nst->vars[v]);
295 for (; (c = casereader_read (group)); case_unref (c))
297 struct ccase *c_out = case_create (proto);
299 *case_num_rw_idx (c_out, 0) = case_num (c, nst->vars[v]);
301 cc += dict_get_case_weight (dict, c, &warn);
302 *case_num_rw_idx (c_out, 1) = cc;
303 casewriter_write (writer, c_out);
306 grp = xrealloc (grp, sizeof *grp * (jt.levels + 1));
308 grp[jt.levels].reader = casewriter_make_reader (writer);
309 grp[jt.levels].cc = cc;
313 ccsq_sum += pow2 (cc);
316 casegrouper_destroy (grouper);
318 for (g0 = 0; g0 < jt.levels; ++g0)
321 for (g1 = g0 +1 ; g1 < jt.levels; ++g1)
323 double uu = u (&grp[g0], &grp[g1]);
326 nn += pow2 (grp[g0].cc) * (2 * grp[g0].cc + 3);
328 for (i = 0; i < 3; ++i)
329 sums[i] += mff[i] (grp[g0].cc);
331 casereader_destroy (grp[g0].reader);
336 variance = (mff[0](jt.n) - sums[0] - e_sum[0]) / 72.0;
337 variance += sums[1] * e_sum[1] / (36.0 * mff[1] (jt.n));
338 variance += sums[2] * e_sum[2] / (8.0 * mff[2] (jt.n));
340 jt.stddev = sqrt (variance);
342 jt.mean = (pow2 (jt.n) - ccsq_sum) / 4.0;
344 show_jt (nst, &jt, dict_get_weight_format (dict));
347 casereader_destroy (input);
348 caseproto_unref (proto);
352 show_jt (const struct n_sample_test *nst, const struct jt *jt,
353 const struct fmt_spec *wfmt)
355 struct pivot_table *table = pivot_table_create (
356 N_("Jonckheere-Terpstra Test"));
357 pivot_table_set_weight_format (table, wfmt);
359 struct pivot_dimension *statistics = pivot_dimension_create (
360 table, PIVOT_AXIS_COLUMN, N_("Statistics"));
361 pivot_category_create_leaf_rc (
363 pivot_value_new_text_format (N_("Number of levels in %s"),
364 var_to_string (nst->indep_var)),
366 pivot_category_create_leaves (
368 N_("N"), PIVOT_RC_COUNT,
369 N_("Observed J-T Statistic"), PIVOT_RC_OTHER,
370 N_("Mean J-T Statistic"), PIVOT_RC_OTHER,
371 N_("Std. Deviation of J-T Statistic"), PIVOT_RC_OTHER,
372 N_("Std. J-T Statistic"), PIVOT_RC_OTHER,
373 N_("Asymp. Sig. (2-tailed)"), PIVOT_RC_SIGNIFICANCE);
375 struct pivot_dimension *variables = pivot_dimension_create (
376 table, PIVOT_AXIS_ROW, N_("Variable"));
378 for (size_t i = 0; i < nst->n_vars; ++i)
380 int row = pivot_category_create_leaf (
381 variables->root, pivot_value_new_variable (nst->vars[i]));
383 double std_jt = (jt[0].obs - jt[0].mean) / jt[0].stddev;
384 double sig = (2.0 * (std_jt > 0
385 ? gsl_cdf_ugaussian_Q (std_jt)
386 : gsl_cdf_ugaussian_P (std_jt)));
396 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
397 pivot_table_put2 (table, j, row, pivot_value_new_number (entries[j]));
400 pivot_table_submit (table);