1 /* Pspp - a program for statistical analysis.
2 Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "kruskal-wallis.h"
22 #include <gsl/gsl_cdf.h>
25 #include "data/casereader.h"
26 #include "data/casewriter.h"
27 #include "data/dataset.h"
28 #include "data/dictionary.h"
29 #include "data/format.h"
30 #include "data/subcase.h"
31 #include "data/variable.h"
32 #include "libpspp/assertion.h"
33 #include "libpspp/hmap.h"
34 #include "libpspp/bt.h"
35 #include "libpspp/message.h"
36 #include "libpspp/misc.h"
37 #include "math/sort.h"
38 #include "output/pivot-table.h"
40 #include "gl/minmax.h"
41 #include "gl/xalloc.h"
44 #define N_(msgid) msgid
45 #define _(msgid) gettext (msgid)
47 /* Returns true iff the independent variable lies in the range [nst->val1, nst->val2] */
49 include_func (const struct ccase *c, void *aux)
51 const struct n_sample_test *nst = aux;
53 if (0 < value_compare_3way (&nst->val1, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
56 if (0 > value_compare_3way (&nst->val2, case_data (c, nst->indep_var), var_get_width (nst->indep_var)))
65 struct hmap_node node;
75 compare_rank_entries_3way (const struct bt_node *a,
76 const struct bt_node *b,
79 const struct variable *var = aux;
80 const struct rank_entry *rea = BT_DATA (a, struct rank_entry, btn);
81 const struct rank_entry *reb = BT_DATA (b, struct rank_entry, btn);
83 return value_compare_3way (&rea->group, &reb->group, var_get_width (var));
87 /* Return the entry with the key GROUP or null if there is no such entry */
88 static struct rank_entry *
89 find_rank_entry (const struct hmap *map, const union value *group, size_t width)
91 struct rank_entry *re = NULL;
92 size_t hash = value_hash (group, width, 0);
94 HMAP_FOR_EACH_WITH_HASH (re, struct rank_entry, node, hash, map)
96 if (0 == value_compare_3way (group, &re->group, width))
103 /* Calculates the adjustment necessary for tie compensation */
105 distinct_callback (double v UNUSED, casenumber t, double w UNUSED, void *aux)
107 double *tiebreaker = aux;
109 *tiebreaker += pow3 (t) - t;
119 static void show_ranks_box (const struct n_sample_test *, const struct kw *);
120 static void show_sig_box (const struct n_sample_test *, const struct kw *);
123 kruskal_wallis_execute (const struct dataset *ds,
124 struct casereader *input,
125 enum mv_class exclude,
126 const struct npar_test *test,
133 const struct dictionary *dict = dataset_dict (ds);
134 const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent);
135 const struct caseproto *proto ;
138 int total_n_groups = 0.0;
140 struct kw *kw = xcalloc (nst->n_vars, sizeof *kw);
142 /* If the independent variable is missing, then we ignore the case */
143 input = casereader_create_filter_missing (input,
148 input = casereader_create_filter_weight (input, dict, &warn, NULL);
150 /* Remove all those cases which are outside the range (val1, val2) */
151 input = casereader_create_filter_func (input, include_func, NULL,
152 CONST_CAST (struct n_sample_test *, nst), NULL);
154 proto = casereader_get_proto (input);
155 rank_idx = caseproto_get_n_widths (proto);
157 /* Rank cases by the v value */
158 for (i = 0; i < nst->n_vars; ++i)
160 double tiebreaker = 0.0;
162 enum rank_error rerr = 0;
163 struct casereader *rr;
164 struct casereader *r = casereader_clone (input);
166 r = sort_execute_1var (r, nst->vars[i]);
168 /* Ignore missings in the test variable */
169 r = casereader_create_filter_missing (r, &nst->vars[i], 1,
173 rr = casereader_create_append_rank (r,
175 dict_get_weight (dict),
177 distinct_callback, &tiebreaker);
179 hmap_init (&kw[i].map);
180 for (; (c = casereader_read (rr)); case_unref (c))
182 const union value *group = case_data (c, nst->indep_var);
183 const size_t group_var_width = var_get_width (nst->indep_var);
184 struct rank_entry *rank = find_rank_entry (&kw[i].map, group, group_var_width);
188 rank = xzalloc (sizeof *rank);
189 value_clone (&rank->group, group, group_var_width);
191 hmap_insert (&kw[i].map, &rank->node,
192 value_hash (&rank->group, group_var_width, 0));
195 rank->sum_of_ranks += case_data_idx (c, rank_idx)->f;
196 rank->n += dict_get_case_weight (dict, c, &warn);
198 /* If this assertion fires, then either the data wasn't sorted or some other
203 casereader_destroy (rr);
205 /* Calculate the value of h */
207 struct rank_entry *mre;
210 HMAP_FOR_EACH (mre, struct rank_entry, node, &kw[i].map)
212 kw[i].h += pow2 (mre->sum_of_ranks) / mre->n;
217 kw[i].h *= 12 / (n * ( n + 1));
218 kw[i].h -= 3 * (n + 1) ;
220 kw[i].h /= 1 - tiebreaker/ (pow3 (n) - n);
224 casereader_destroy (input);
226 show_ranks_box (nst, kw);
227 show_sig_box (nst, kw);
229 /* Cleanup allocated memory */
230 for (i = 0 ; i < nst->n_vars; ++i)
232 struct rank_entry *mre, *next;
233 HMAP_FOR_EACH_SAFE (mre, next, struct rank_entry, node, &kw[i].map)
235 hmap_delete (&kw[i].map, &mre->node);
238 hmap_destroy (&kw[i].map);
245 show_ranks_box (const struct n_sample_test *nst, const struct kw *kw)
247 struct pivot_table *table = pivot_table_create (N_("Ranks"));
249 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
250 N_("N"), PIVOT_RC_INTEGER,
251 N_("Mean Rank"), PIVOT_RC_OTHER);
253 struct pivot_dimension *variables = pivot_dimension_create (
254 table, PIVOT_AXIS_ROW, N_("Variables"));
256 for (size_t i = 0 ; i < nst->n_vars ; ++i)
258 /* Sort the rank entries, by iteratin the hash and putting the entries
259 into a binary tree. */
260 struct bt bt = BT_INITIALIZER(compare_rank_entries_3way, nst->vars[i]);
261 struct rank_entry *re_x;
262 HMAP_FOR_EACH (re_x, struct rank_entry, node, &kw[i].map)
263 bt_insert (&bt, &re_x->btn);
265 /* Report the rank entries in sorted order. */
266 struct pivot_category *group = pivot_category_create_group__ (
267 variables->root, pivot_value_new_variable (nst->vars[i]));
269 const struct rank_entry *re;
270 BT_FOR_EACH (re, struct rank_entry, btn, &bt)
272 struct string str = DS_EMPTY_INITIALIZER;
273 var_append_value_name (nst->indep_var, &re->group, &str);
274 int row = pivot_category_create_leaf (
275 group, pivot_value_new_user_text_nocopy (ds_steal_cstr (&str)));
277 double entries[] = { re->n, re->sum_of_ranks / re->n };
278 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
279 pivot_table_put2 (table, j, row,
280 pivot_value_new_number (entries[j]));
285 int row = pivot_category_create_leaves (group, N_("Total"));
286 pivot_table_put2 (table, 0, row, pivot_value_new_number (tot));
289 pivot_table_submit (table);
293 show_sig_box (const struct n_sample_test *nst, const struct kw *kw)
295 struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
297 pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"),
298 N_("Chi-Square"), PIVOT_RC_OTHER,
299 N_("df"), PIVOT_RC_INTEGER,
300 N_("Asymp. Sig."), PIVOT_RC_SIGNIFICANCE);
302 struct pivot_dimension *variables = pivot_dimension_create (
303 table, PIVOT_AXIS_COLUMN, N_("Variables"));
305 for (size_t i = 0 ; i < nst->n_vars; ++i)
307 int col = pivot_category_create_leaf (
308 variables->root, pivot_value_new_variable (nst->vars[i]));
310 double df = hmap_count (&kw[i].map) - 1;
311 double sig = gsl_cdf_chisq_Q (kw[i].h, df);
312 double entries[] = { kw[i].h, df, sig };
313 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
314 pivot_table_put2 (table, j, col, pivot_value_new_number (entries[j]));
317 pivot_table_submit (table);