1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/categoricals.h"
20 #include "math/interaction.h"
25 #include "data/case.h"
26 #include "data/value.h"
27 #include "data/variable.h"
28 #include "libpspp/array.h"
29 #include "libpspp/hmap.h"
30 #include "libpspp/pool.h"
31 #include "libpspp/str.h"
32 #include "libpspp/hash-functions.h"
34 #include "gl/xalloc.h"
36 #define CATEGORICALS_DEBUG 0
40 struct hmap_node node; /* Node in hash map. */
41 union value val; /* The value */
42 int index; /* A zero based unique index for this value */
45 static struct value_node *
46 lookup_value (const struct hmap *map, const union value *val,
47 unsigned int hash, int width)
49 struct value_node *vn;
50 HMAP_FOR_EACH_WITH_HASH (vn, struct value_node, node, hash, map)
51 if (value_equal (&vn->val, val, width))
56 /* A variable used in a categoricals object. */
59 struct hmap_node node; /* In struct categorical's 'varmap'. */
60 const struct variable *var; /* The variable. */
61 struct hmap valmap; /* Contains "struct value_node"s. */
62 union value *values; /* Values in valmap, as a sorted array. */
66 compare_value_node_3way (const void *vn1_, const void *vn2_, const void *aux)
68 const struct value_node *const *vn1p = vn1_;
69 const struct value_node *const *vn2p = vn2_;
71 const struct variable_node *vn = aux;
73 return value_compare_3way (&(*vn1p)->val, &(*vn2p)->val,
74 var_get_width (vn->var));
77 static struct variable_node *
78 lookup_variable (const struct hmap *map, const struct variable *var)
80 struct variable_node *vn;
81 HMAP_FOR_EACH_WITH_HASH (vn, struct variable_node, node,
82 hash_pointer (var, 0), map)
88 struct interact_params
90 /* The interaction, and an array with iact->n_vars elements such that
91 varnodes[x] points to the variable_node for iact->vars[x]. */
92 const struct interaction *iact;
93 struct variable_node **varnodes;
95 /* An example of each interaction that appears in the data, like a frequency
96 table for 'iact'. By construction, the number of elements must be less
97 than or equal to 'n_cats'.
99 categoricals_update() updates 'ivmap' case-by-case, then
100 categoricals_done() dumps 'ivmap' into 'ivs' and sorts it. */
101 struct hmap ivmap; /* Contains "struct interaction_value"s. */
102 struct interaction_value **ivs;
107 /* Product of hmap_count(&varnodes[*]->valmap), that is, the maximum number
108 of distinct values of this interaction. */
111 /* Product of degrees of freedom of all the variables. */
116 /* Sum of ivs[*]->cc. */
120 struct interaction_value
122 struct hmap_node node; /* In struct interact_params's ivmap. */
123 struct ccase *ccase; /* A case representative of the interaction. */
124 double cc; /* Total weight of cases for this interaction. */
129 compare_interaction_value_3way (const void *vn1_, const void *vn2_,
132 const struct interaction_value *const *vn1p = vn1_;
133 const struct interaction_value *const *vn2p = vn2_;
135 const struct interact_params *iap = aux;
137 return interaction_case_cmp_3way (iap->iact, (*vn1p)->ccase, (*vn2p)->ccase);
142 /* The weight variable */
143 const struct variable *wv;
145 struct interact_params *iap; /* Interaction parameters. */
148 /* Contains a "struct variable_node" for each variable in 'iap'. */
151 /* A map to enable the lookup of variables indexed by subscript.
152 This map considers only the N - 1 of the N variables.
154 int *df_to_iact; /* 'df_sum' elements. */
157 /* Like the above, but uses all N variables. */
158 int *cat_to_iact; /* 'n_cats_total' elements. */
163 /* Missing values in the factor variables to be excluded */
164 enum mv_class fctr_excl;
171 const struct payload *payload;
176 categoricals_isbalanced (const struct categoricals *cat)
178 for (int i = 0 ; i < cat->n_iap; ++i)
181 const struct interact_params *iap = &cat->iap[i];
184 for (v = 0; v < hmap_count (&iap->ivmap); ++v)
186 const struct interaction_value *iv = iap->ivs[v];
198 categoricals_dump (const struct categoricals *cat)
200 if (CATEGORICALS_DEBUG)
204 printf ("df to interaction map:\n");
205 for (i = 0; i < cat->df_sum; ++i)
206 printf (" %d", cat->df_to_iact[i]);
209 printf ("Category to interaction map:\n");
210 for (i = 0; i < cat->n_cats_total; ++i)
211 printf (" %d", cat->cat_to_iact[i]);
214 printf ("Number of interactions %zu\n", cat->n_iap);
215 for (i = 0 ; i < cat->n_iap; ++i)
219 const struct interact_params *iap = &cat->iap[i];
220 const struct interaction *iact = iap->iact;
222 ds_init_empty (&str);
223 interaction_to_string (iact, &str);
225 printf ("\nInteraction: \"%s\" (number of categories: %d); ", ds_cstr (&str), iap->n_cats);
227 printf ("Base index (df/categories): %d/%d\n", iap->base_df, iap->base_cats);
230 for (v = 0; v < hmap_count (&iap->ivmap); ++v)
233 const struct interaction_value *iv = iap->ivs[v];
235 if (v > 0) printf (" ");
237 for (vv = 0; vv < iact->n_vars; ++vv)
239 const struct variable *var = iact->vars[vv];
240 const union value *val = case_data (iv->ccase, var);
241 struct variable_node *vn = iap->varnodes[vv];
242 const int width = var_get_width (var);
243 unsigned int valhash = value_hash (val, width, 0);
244 struct value_node *valn = lookup_value (&vn->valmap, val, valhash, width);
246 assert (vn->var == var);
248 printf ("%.*g(%d)", DBL_DIG + 1, val->f, valn->index);
249 if (vv < iact->n_vars - 1)
260 categoricals_destroy (struct categoricals *cat)
265 for (int i = 0; i < cat->n_iap; ++i)
267 /* Unref any cases that we reffed. */
268 struct interaction_value *iv;
269 HMAP_FOR_EACH (iv, struct interaction_value, node, &cat->iap[i].ivmap)
271 if (cat->payload && cat->payload->destroy)
272 cat->payload->destroy (cat->aux1, cat->aux2, iv->user_data);
273 case_unref (iv->ccase);
276 free (cat->iap[i].enc_sum);
277 hmap_destroy (&cat->iap[i].ivmap);
281 /* Interate over each variable and delete its value map.
283 The values themselves are part of the pool. */
284 struct variable_node *vn;
285 HMAP_FOR_EACH (vn, struct variable_node, node, &cat->varmap)
286 hmap_destroy (&vn->valmap);
288 hmap_destroy (&cat->varmap);
290 pool_destroy (cat->pool);
295 static struct interaction_value *
296 lookup_case (const struct hmap *map, const struct interaction *iact,
297 const struct ccase *c)
299 size_t hash = interaction_case_hash (iact, c, 0);
300 struct interaction_value *iv;
301 HMAP_FOR_EACH_WITH_HASH (iv, struct interaction_value, node, hash, map)
302 if (interaction_case_equal (iact, c, iv->ccase))
307 /* Returns true iff CAT is sane, that is, if it is complete and has at least
310 categoricals_sane (const struct categoricals *cat)
315 /* Creates and returns a new categoricals object whose variables come from the
316 N_INTER interactions objects in the array starting at INTER. (The INTER
317 objects must outlive the categoricals object because it uses them
320 FCTR_EXCL determines which cases are listwise ignored by
321 categoricals_update(). */
322 struct categoricals *
323 categoricals_create (struct interaction *const *inter, size_t n_inter,
324 const struct variable *wv, enum mv_class fctr_excl)
326 struct categoricals *cat = XZALLOC (struct categoricals);
327 cat->iap = pool_calloc (cat->pool, n_inter, sizeof *cat->iap);
328 cat->n_iap = n_inter;
330 cat->pool = pool_create ();
331 cat->fctr_excl = fctr_excl;
333 hmap_init (&cat->varmap);
334 for (size_t i = 0; i < cat->n_iap; ++i)
336 struct interact_params *iap = &cat->iap[i];
337 hmap_init (&iap->ivmap);
338 iap->iact = inter[i];
340 iap->varnodes = pool_nmalloc (cat->pool, iap->iact->n_vars,
341 sizeof *iap->varnodes);
342 for (size_t v = 0; v < inter[i]->n_vars; ++v)
344 const struct variable *var = inter[i]->vars[v];
345 struct variable_node *vn = lookup_variable (&cat->varmap, var);
348 vn = pool_malloc (cat->pool, sizeof *vn);
351 hmap_init (&vn->valmap);
352 hmap_insert (&cat->varmap, &vn->node, hash_pointer (var, 0));
354 iap->varnodes[v] = vn;
362 categoricals_update (struct categoricals *cat, const struct ccase *c)
366 assert (!cat->df_to_iact);
367 assert (!cat->cat_to_iact);
370 weight = cat->wv ? case_num (c, cat->wv) : 1.0;
371 weight = var_force_valid_weight (cat->wv, weight, NULL);
373 /* Update the frequency table for each variable. */
374 struct variable_node *vn;
375 HMAP_FOR_EACH (vn, struct variable_node, node, &cat->varmap)
377 const int width = var_get_width (vn->var);
378 const union value *val = case_data (c, vn->var);
379 unsigned int hash = value_hash (val, width, 0);
381 struct value_node *valn = lookup_value (&vn->valmap, val, hash, width);
384 valn = pool_malloc (cat->pool, sizeof *valn);
386 value_init_pool (cat->pool, &valn->val, width);
387 value_copy (&valn->val, val, width);
388 hmap_insert (&vn->valmap, &valn->node, hash);
392 /* Update the frequency table for full interactions. */
393 for (int i = 0; i < cat->n_iap; ++i)
395 struct interact_params *iap = &cat->iap[i];
396 const struct interaction *iact = iap->iact;
397 if (interaction_case_is_missing (iact, c, cat->fctr_excl))
400 unsigned int hash = interaction_case_hash (iact, c, 0);
401 struct interaction_value *node = lookup_case (&iap->ivmap, iact, c);
404 node = pool_malloc (cat->pool, sizeof *node);
405 node->ccase = case_ref (c);
408 hmap_insert (&iap->ivmap, &node->node, hash);
411 node->user_data = cat->payload->create (cat->aux1, cat->aux2);
418 cat->payload->update (cat->aux1, cat->aux2, node->user_data, c,
423 /* Return the number of categories (distinct values) for interaction IDX in
426 categoricals_n_count (const struct categoricals *cat, size_t idx)
428 return hmap_count (&cat->iap[idx].ivmap);
431 /* Return the total number of categories across all interactions in CAT. */
433 categoricals_n_total (const struct categoricals *cat)
435 return categoricals_is_complete (cat) ? cat->n_cats_total : 0;
438 /* Returns the number of degrees of freedom for interaction IDX within CAT. */
440 categoricals_df (const struct categoricals *cat, size_t idx)
442 const struct interact_params *iap = &cat->iap[idx];
446 /* Returns the total degrees of freedom for CAT. */
448 categoricals_df_total (const struct categoricals *cat)
450 return cat ? cat->df_sum : 0;
453 /* Returns true iff categoricals_done() has been called for CAT. */
455 categoricals_is_complete (const struct categoricals *cat)
457 return cat->df_to_iact != NULL;
460 /* This function must be called (once) before any call to the *_by_subscript or
461 *_by_category functions, but AFTER any calls to categoricals_update. If this
462 function returns false, then no calls to _by_subscript or *_by_category are
465 categoricals_done (const struct categoricals *cat_)
467 struct categoricals *cat = CONST_CAST (struct categoricals *, cat_);
468 if (!cat || categoricals_is_complete (cat))
471 /* Assign 'index' to each variables' value_nodes, counting up from 0 in
472 ascending order by value. */
473 struct variable_node *vn;
474 HMAP_FOR_EACH (vn, struct variable_node, node, &cat->varmap)
476 size_t n_vals = hmap_count (&vn->valmap);
483 struct value_node **nodes = XCALLOC (n_vals, struct value_node *);
485 struct value_node *valnd;
486 HMAP_FOR_EACH (valnd, struct value_node, node, &vn->valmap)
488 sort (nodes, n_vals, sizeof *nodes, compare_value_node_3way, vn);
489 for (x = 0; x < n_vals; ++x)
494 /* Calculate the degrees of freedom, and the number of categories. */
496 cat->n_cats_total = 0;
497 for (int i = 0 ; i < cat->n_iap; ++i)
499 struct interact_params *iap = &cat->iap[i];
500 const struct interaction *iact = iap->iact;
504 for (int v = 0 ; v < iact->n_vars; ++v)
506 size_t n_vals = hmap_count (&iap->varnodes[v]->valmap);
508 iap->df_prod *= n_vals - 1;
509 iap->n_cats *= n_vals;
512 if (iact->n_vars > 0)
513 cat->df_sum += iap->df_prod;
514 cat->n_cats_total += iap->n_cats;
518 cat->df_to_iact = pool_calloc (cat->pool, cat->df_sum,
519 sizeof *cat->df_to_iact);
521 cat->cat_to_iact = pool_calloc (cat->pool, cat->n_cats_total,
522 sizeof *cat->cat_to_iact);
526 for (int i = 0; i < cat->n_iap; ++i)
528 struct interact_params *iap = &cat->iap[i];
530 iap->base_df = idx_df;
531 iap->base_cats = idx_cat;
533 /* For some purposes (eg CONTRASTS in ONEWAY) the values need to be
535 iap->ivs = pool_nmalloc (cat->pool, hmap_count (&iap->ivmap),
538 struct interaction_value *ivn;
539 HMAP_FOR_EACH (ivn, struct interaction_value, node, &iap->ivmap)
541 sort (iap->ivs, x, sizeof *iap->ivs,
542 compare_interaction_value_3way, iap);
544 /* Populate the variable maps. */
545 if (iap->iact->n_vars)
546 for (int j = 0; j < iap->df_prod; ++j)
547 cat->df_to_iact[idx_df++] = i;
549 for (int j = 0; j < iap->n_cats; ++j)
550 cat->cat_to_iact[idx_cat++] = i;
553 categoricals_dump (cat);
555 /* Tally up the sums for all the encodings */
556 for (int i = 0; i < cat->n_iap; ++i)
558 struct interact_params *iap = &cat->iap[i];
559 const struct interaction *iact = iap->iact;
561 const int df = iact->n_vars ? iap->df_prod : 0;
563 iap->enc_sum = xcalloc (df, sizeof *iap->enc_sum);
565 for (int y = 0; y < hmap_count (&iap->ivmap); ++y)
567 struct interaction_value *iv = iap->ivs[y];
568 for (int x = iap->base_df;
569 x < iap->base_df + df; ++x)
571 const double bin = categoricals_get_effects_code_for_case (
573 iap->enc_sum[x - iap->base_df] += bin * iv->cc;
575 if (cat->payload && cat->payload->calculate)
576 cat->payload->calculate (cat->aux1, cat->aux2, iv->user_data);
584 categoricals_get_var_values (const struct categoricals *cat,
585 const struct variable *var, size_t *np)
587 struct variable_node *vn = lookup_variable (&cat->varmap, var);
588 *np = hmap_count (&vn->valmap);
591 vn->values = pool_nalloc (cat->pool, *np, sizeof *vn->values);
593 struct value_node *valnd;
594 HMAP_FOR_EACH (valnd, struct value_node, node, &vn->valmap)
595 vn->values[valnd->index] = valnd->val;
600 static struct interact_params *
601 df_to_iap (const struct categoricals *cat, int subscript)
603 assert (subscript >= 0);
604 assert (subscript < cat->df_sum);
606 return &cat->iap[cat->df_to_iact[subscript]];
609 static struct interact_params *
610 cat_index_to_iap (const struct categoricals *cat, int cat_index)
612 assert (cat_index >= 0);
613 assert (cat_index < cat->n_cats_total);
615 return &cat->iap[cat->cat_to_iact[cat_index]];
618 /* Return the interaction corresponding to SUBSCRIPT. */
619 const struct interaction *
620 categoricals_get_interaction_by_subscript (const struct categoricals *cat,
623 return df_to_iap (cat, subscript)->iact;
627 categoricals_get_weight_by_subscript (const struct categoricals *cat,
630 return df_to_iap (cat, subscript)->cc;
634 categoricals_get_sum_by_subscript (const struct categoricals *cat,
637 const struct interact_params *iap = df_to_iap (cat, subscript);
638 return iap->enc_sum[subscript - iap->base_df];
642 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
643 for that subscript */
645 categoricals_get_code_for_case (const struct categoricals *cat, int subscript,
646 const struct ccase *c,
649 const struct interaction *iact
650 = categoricals_get_interaction_by_subscript (cat, subscript);
652 const struct interact_params *iap = df_to_iap (cat, subscript);
656 for (int v = 0; v < iact->n_vars; ++v)
658 const struct variable *var = iact->vars[v];
660 const union value *val = case_data (c, var);
661 const int width = var_get_width (var);
663 const unsigned int hash = value_hash (val, width, 0);
664 const struct value_node *valn
665 = lookup_value (&iap->varnodes[v]->valmap, val, hash, width);
667 const int df = hmap_count (&iap->varnodes[v]->valmap) - 1;
668 const int dfpn = dfp * df;
670 if (effects_coding && valn->index == df)
674 /* Translate subscript into an index for the individual variable. */
675 const int index = ((subscript - iap->base_df) % dfpn) / dfp;
676 if (valn->index != index)
686 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
687 for that subscript. */
689 categoricals_get_dummy_code_for_case (const struct categoricals *cat,
690 int subscript, const struct ccase *c)
692 return categoricals_get_code_for_case (cat, subscript, c, false);
695 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
697 Else if it is the last category, return -1.
701 categoricals_get_effects_code_for_case (const struct categoricals *cat,
702 int subscript, const struct ccase *c)
704 return categoricals_get_code_for_case (cat, subscript, c, true);
707 /* Return a case containing the set of values corresponding to
708 the Nth Category of the IACTth interaction */
710 categoricals_get_case_by_category_real (const struct categoricals *cat,
713 const struct interact_params *iap = &cat->iap[iact];
714 return n < hmap_count (&iap->ivmap) ? iap->ivs[n]->ccase : NULL;
717 /* Return a the user data corresponding to the Nth Category of the IACTth
720 categoricals_get_user_data_by_category_real (const struct categoricals *cat,
723 const struct interact_params *iap = &cat->iap[iact];
724 return n < hmap_count (&iap->ivmap) ? iap->ivs[n]->user_data : NULL;
728 categoricals_get_value_index_by_category_real (const struct categoricals *cat,
729 int iact_idx, int cat_idx,
732 const struct interact_params *iap = &cat->iap[iact_idx];
733 const struct interaction_value *ivn = iap->ivs[cat_idx];
734 const struct variable *var = iap->iact->vars[var_idx];
735 const struct variable_node *vn = iap->varnodes[var_idx];
736 const union value *val = case_data (ivn->ccase, var);
737 int width = var_get_width (var);
738 unsigned int hash = value_hash (val, width, 0);
739 return lookup_value (&vn->valmap, val, hash, width)->index;
742 /* Return a case containing the set of values corresponding to CAT_INDEX. */
744 categoricals_get_case_by_category (const struct categoricals *cat,
747 const struct interact_params *iap = cat_index_to_iap (cat, cat_index);
748 const struct interaction_value *vn = iap->ivs[cat_index - iap->base_cats];
753 categoricals_get_user_data_by_category (const struct categoricals *cat,
756 const struct interact_params *iap = cat_index_to_iap (cat, cat_index);
757 const struct interaction_value *iv = iap->ivs[cat_index - iap->base_cats];
758 return iv->user_data;
762 categoricals_set_payload (struct categoricals *cat, const struct payload *p,
763 const void *aux1, void *aux2)