1 /* Pspp - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2022 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "kruskal-wallis.h"
22 #include <gsl/gsl_cdf.h>
25 #include "data/casereader.h"
26 #include "data/casewriter.h"
27 #include "data/dataset.h"
28 #include "data/dictionary.h"
29 #include "data/format.h"
30 #include "data/subcase.h"
31 #include "data/variable.h"
32 #include "libpspp/assertion.h"
33 #include "libpspp/hmap.h"
34 #include "libpspp/bt.h"
35 #include "libpspp/message.h"
36 #include "libpspp/misc.h"
37 #include "math/sort.h"
38 #include "output/pivot-table.h"
40 #include "gl/minmax.h"
41 #include "gl/xalloc.h"
44 #define N_(msgid) msgid
45 #define _(msgid) gettext (msgid)
47 /* Returns true iff the independent variable lies between nst->val1 and nst->val2 */
49 include_func (const struct ccase *c, void *aux)
51 const struct n_sample_test *nst = aux;
53 const union value *smaller = 0;
54 const union value *larger = 0;
55 int x = value_compare_3way (&nst->val1, &nst->val2, var_get_width (nst->indep_var));
67 if (0 < value_compare_3way (smaller, case_data (c, nst->indep_var),
68 var_get_width (nst->indep_var)))
71 if (0 > value_compare_3way (larger, case_data (c, nst->indep_var),
72 var_get_width (nst->indep_var)))
81 struct hmap_node node;
91 compare_rank_entries_3way (const struct bt_node *a,
92 const struct bt_node *b,
95 const struct variable *var = aux;
96 const struct rank_entry *rea = BT_DATA (a, struct rank_entry, btn);
97 const struct rank_entry *reb = BT_DATA (b, struct rank_entry, btn);
99 return value_compare_3way (&rea->group, &reb->group, var_get_width (var));
103 /* Return the entry with the key GROUP or null if there is no such entry */
104 static struct rank_entry *
105 find_rank_entry (const struct hmap *map, const union value *group, size_t width)
107 struct rank_entry *re = NULL;
108 size_t hash = value_hash (group, width, 0);
110 HMAP_FOR_EACH_WITH_HASH (re, struct rank_entry, node, hash, map)
112 if (0 == value_compare_3way (group, &re->group, width))
119 /* Calculates the adjustment necessary for tie compensation */
121 distinct_callback (double v UNUSED, casenumber t, double w UNUSED, void *aux)
123 double *tiebreaker = aux;
125 *tiebreaker += pow3 (t) - t;
135 static void show_ranks_box (const struct n_sample_test *, const struct kw *);
136 static void show_sig_box (const struct n_sample_test *, const struct kw *);
139 kruskal_wallis_execute (const struct dataset *ds,
140 struct casereader *input,
141 enum mv_class exclude,
142 const struct npar_test *test,
149 const struct dictionary *dict = dataset_dict (ds);
150 const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent);
151 const struct caseproto *proto ;
154 int total_n_groups = 0.0;
156 struct kw *kw = XCALLOC (nst->n_vars, struct kw);
158 /* If the independent variable is missing, then we ignore the case */
159 input = casereader_create_filter_missing (input,
164 input = casereader_create_filter_weight (input, dict, &warn, NULL);
166 /* Remove all those cases which are outside the range (val1, val2) */
167 input = casereader_create_filter_func (input, include_func, NULL,
168 CONST_CAST (struct n_sample_test *, nst), NULL);
170 proto = casereader_get_proto (input);
171 rank_idx = caseproto_get_n_widths (proto);
173 /* Rank cases by the v value */
174 for (i = 0; i < nst->n_vars; ++i)
176 double tiebreaker = 0.0;
178 enum rank_error rerr = 0;
179 struct casereader *rr;
180 struct casereader *r = casereader_clone (input);
182 r = sort_execute_1var (r, nst->vars[i]);
184 /* Ignore missings in the test variable */
185 r = casereader_create_filter_missing (r, &nst->vars[i], 1,
189 rr = casereader_create_append_rank (r,
191 dict_get_weight (dict),
193 distinct_callback, &tiebreaker);
195 hmap_init (&kw[i].map);
196 for (; (c = casereader_read (rr)); case_unref (c))
198 const union value *group = case_data (c, nst->indep_var);
199 const size_t group_var_width = var_get_width (nst->indep_var);
200 struct rank_entry *rank = find_rank_entry (&kw[i].map, group, group_var_width);
204 rank = xzalloc (sizeof *rank);
205 value_clone (&rank->group, group, group_var_width);
207 hmap_insert (&kw[i].map, &rank->node,
208 value_hash (&rank->group, group_var_width, 0));
211 rank->sum_of_ranks += case_num_idx (c, rank_idx);
212 rank->n += dict_get_case_weight (dict, c, &warn);
214 /* If this assertion fires, then either the data wasn't sorted or some other
219 casereader_destroy (rr);
221 /* Calculate the value of h */
223 struct rank_entry *mre;
226 HMAP_FOR_EACH (mre, struct rank_entry, node, &kw[i].map)
228 kw[i].h += pow2 (mre->sum_of_ranks) / mre->n;
233 kw[i].h *= 12 / (n * (n + 1));
234 kw[i].h -= 3 * (n + 1) ;
236 kw[i].h /= 1 - tiebreaker/ (pow3 (n) - n);
240 casereader_destroy (input);
242 show_ranks_box (nst, kw);
243 show_sig_box (nst, kw);
245 /* Cleanup allocated memory */
246 for (i = 0 ; i < nst->n_vars; ++i)
248 struct rank_entry *mre, *next;
249 HMAP_FOR_EACH_SAFE (mre, next, struct rank_entry, node, &kw[i].map)
251 hmap_delete (&kw[i].map, &mre->node);
254 hmap_destroy (&kw[i].map);
261 show_ranks_box (const struct n_sample_test *nst, const struct kw *kw)
263 struct pivot_table *table = pivot_table_create (N_("Ranks"));
265 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
266 N_("N"), PIVOT_RC_INTEGER,
267 N_("Mean Rank"), PIVOT_RC_OTHER);
269 struct pivot_dimension *variables = pivot_dimension_create (
270 table, PIVOT_AXIS_ROW, N_("Variables"));
272 for (size_t i = 0 ; i < nst->n_vars ; ++i)
274 /* Sort the rank entries, by iteratin the hash and putting the entries
275 into a binary tree. */
276 struct bt bt = BT_INITIALIZER(compare_rank_entries_3way, nst->vars[i]);
277 struct rank_entry *re_x;
278 HMAP_FOR_EACH (re_x, struct rank_entry, node, &kw[i].map)
279 bt_insert (&bt, &re_x->btn);
281 /* Report the rank entries in sorted order. */
282 struct pivot_category *group = pivot_category_create_group__ (
283 variables->root, pivot_value_new_variable (nst->vars[i]));
285 const struct rank_entry *re;
286 BT_FOR_EACH (re, struct rank_entry, btn, &bt)
288 struct string str = DS_EMPTY_INITIALIZER;
289 var_append_value_name (nst->indep_var, &re->group, &str);
290 int row = pivot_category_create_leaf (
291 group, pivot_value_new_user_text_nocopy (ds_steal_cstr (&str)));
293 double entries[] = { re->n, re->sum_of_ranks / re->n };
294 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
295 pivot_table_put2 (table, j, row,
296 pivot_value_new_number (entries[j]));
301 int row = pivot_category_create_leaves (group, N_("Total"));
302 pivot_table_put2 (table, 0, row, pivot_value_new_number (tot));
305 pivot_table_submit (table);
309 show_sig_box (const struct n_sample_test *nst, const struct kw *kw)
311 struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
313 pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"),
314 N_("Chi-Square"), PIVOT_RC_OTHER,
315 N_("df"), PIVOT_RC_INTEGER,
316 N_("Asymp. Sig."), PIVOT_RC_SIGNIFICANCE);
318 struct pivot_dimension *variables = pivot_dimension_create (
319 table, PIVOT_AXIS_COLUMN, N_("Variables"));
321 for (size_t i = 0 ; i < nst->n_vars; ++i)
323 int col = pivot_category_create_leaf (
324 variables->root, pivot_value_new_variable (nst->vars[i]));
326 double df = hmap_count (&kw[i].map) - 1;
327 double sig = gsl_cdf_chisq_Q (kw[i].h, df);
328 double entries[] = { kw[i].h, df, sig };
329 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
330 pivot_table_put2 (table, j, col, pivot_value_new_number (entries[j]));
333 pivot_table_submit (table);