1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/categoricals.h"
20 #include "math/interaction.h"
25 #include "data/case.h"
26 #include "data/value.h"
27 #include "data/variable.h"
28 #include "libpspp/array.h"
29 #include "libpspp/hmap.h"
30 #include "libpspp/pool.h"
31 #include "libpspp/str.h"
32 #include "libpspp/hash-functions.h"
34 #include "gl/xalloc.h"
36 #define CATEGORICALS_DEBUG 0
40 struct hmap_node node; /* Node in hash map. */
41 union value val; /* The value */
42 int index; /* A zero based unique index for this value */
45 static struct value_node *
46 lookup_value (const struct hmap *map, const union value *val,
47 unsigned int hash, int width)
49 struct value_node *vn;
50 HMAP_FOR_EACH_WITH_HASH (vn, struct value_node, node, hash, map)
51 if (value_equal (&vn->val, val, width))
56 /* A variable used in a categoricals object. */
59 struct hmap_node node; /* In struct categorical's 'varmap'. */
60 const struct variable *var; /* The variable. */
61 struct hmap valmap; /* Contains "struct value_node"s. */
65 compare_value_node_3way (const void *vn1_, const void *vn2_, const void *aux)
67 const struct value_node *const *vn1p = vn1_;
68 const struct value_node *const *vn2p = vn2_;
70 const struct variable_node *vn = aux;
72 return value_compare_3way (&(*vn1p)->val, &(*vn2p)->val,
73 var_get_width (vn->var));
76 static struct variable_node *
77 lookup_variable (const struct hmap *map, const struct variable *var, unsigned int hash)
79 struct variable_node *vn;
80 HMAP_FOR_EACH_WITH_HASH (vn, struct variable_node, node, hash, map)
86 struct interact_params
88 /* The interaction, and an array with iact->n_vars elements such that
89 varnodes[x] points to the variable_node for iact->vars[x]. */
90 const struct interaction *iact;
91 struct variable_node **varnodes;
93 /* An example of each interaction that appears in the data, like a frequency
94 table for 'iact'. By construction, the number of elements must be less
95 than or equal to 'n_cats'.
97 categoricals_update() updates 'ivmap' case-by-case, then
98 categoricals_done() dumps 'ivmap' into 'ivs' and sorts it. */
99 struct hmap ivmap; /* Contains "struct interaction_value"s. */
100 struct interaction_value **ivs;
105 /* Product of hmap_count(&varnodes[*]->valmap), that is, the maximum number
106 of distinct values of this interaction. */
109 /* Product of degrees of freedom of all the variables. */
114 /* Sum of ivs[*]->cc. */
118 struct interaction_value
120 struct hmap_node node; /* In struct interact_params's ivmap. */
121 struct ccase *ccase; /* A case representative of the interaction. */
122 double cc; /* Total weight of cases for this interaction. */
127 compare_interaction_value_3way (const void *vn1_, const void *vn2_,
130 const struct interaction_value *const *vn1p = vn1_;
131 const struct interaction_value *const *vn2p = vn2_;
133 const struct interact_params *iap = aux;
135 return interaction_case_cmp_3way (iap->iact, (*vn1p)->ccase, (*vn2p)->ccase);
140 /* The weight variable */
141 const struct variable *wv;
143 struct interact_params *iap; /* Interaction parameters. */
146 /* Contains a "struct variable_node" for each variable in 'iap'. */
149 /* A map to enable the lookup of variables indexed by subscript.
150 This map considers only the N - 1 of the N variables.
152 int *df_to_iact; /* 'df_sum' elements. */
155 /* Like the above, but uses all N variables. */
156 int *cat_to_iact; /* 'n_cats_total' elements. */
161 /* Missing values in the factor variables to be excluded */
162 enum mv_class fctr_excl;
169 const struct payload *payload;
174 categoricals_isbalanced (const struct categoricals *cat)
176 for (int i = 0 ; i < cat->n_iap; ++i)
179 const struct interact_params *iap = &cat->iap[i];
182 for (v = 0; v < hmap_count (&iap->ivmap); ++v)
184 const struct interaction_value *iv = iap->ivs[v];
196 categoricals_dump (const struct categoricals *cat)
198 if (CATEGORICALS_DEBUG)
202 printf ("df to interaction map:\n");
203 for (i = 0; i < cat->df_sum; ++i)
204 printf (" %d", cat->df_to_iact[i]);
207 printf ("Category to interaction map:\n");
208 for (i = 0; i < cat->n_cats_total; ++i)
209 printf (" %d", cat->cat_to_iact[i]);
212 printf ("Number of interactions %zu\n", cat->n_iap);
213 for (i = 0 ; i < cat->n_iap; ++i)
217 const struct interact_params *iap = &cat->iap[i];
218 const struct interaction *iact = iap->iact;
220 ds_init_empty (&str);
221 interaction_to_string (iact, &str);
223 printf ("\nInteraction: \"%s\" (number of categories: %d); ", ds_cstr (&str), iap->n_cats);
225 printf ("Base index (df/categories): %d/%d\n", iap->base_df, iap->base_cats);
228 for (v = 0; v < hmap_count (&iap->ivmap); ++v)
231 const struct interaction_value *iv = iap->ivs[v];
233 if (v > 0) printf (" ");
235 for (vv = 0; vv < iact->n_vars; ++vv)
237 const struct variable *var = iact->vars[vv];
238 const union value *val = case_data (iv->ccase, var);
239 struct variable_node *vn = iap->varnodes[vv];
240 const int width = var_get_width (var);
241 unsigned int valhash = value_hash (val, width, 0);
242 struct value_node *valn = lookup_value (&vn->valmap, val, valhash, width);
244 assert (vn->var == var);
246 printf ("%.*g(%d)", DBL_DIG + 1, val->f, valn->index);
247 if (vv < iact->n_vars - 1)
258 categoricals_destroy (struct categoricals *cat)
263 for (int i = 0; i < cat->n_iap; ++i)
265 /* Unref any cases that we reffed. */
266 struct interaction_value *iv;
267 HMAP_FOR_EACH (iv, struct interaction_value, node, &cat->iap[i].ivmap)
269 if (cat->payload && cat->payload->destroy)
270 cat->payload->destroy (cat->aux1, cat->aux2, iv->user_data);
271 case_unref (iv->ccase);
274 free (cat->iap[i].enc_sum);
275 hmap_destroy (&cat->iap[i].ivmap);
278 /* Interate over each variable and delete its value map.
280 The values themselves are part of the pool. */
281 struct variable_node *vn;
282 HMAP_FOR_EACH (vn, struct variable_node, node, &cat->varmap)
283 hmap_destroy (&vn->valmap);
285 hmap_destroy (&cat->varmap);
287 pool_destroy (cat->pool);
292 static struct interaction_value *
293 lookup_case (const struct hmap *map, const struct interaction *iact,
294 const struct ccase *c)
296 size_t hash = interaction_case_hash (iact, c, 0);
297 struct interaction_value *iv;
298 HMAP_FOR_EACH_WITH_HASH (iv, struct interaction_value, node, hash, map)
299 if (interaction_case_equal (iact, c, iv->ccase))
304 /* Returns true iff CAT is sane, that is, if it is complete and has at least
307 categoricals_sane (const struct categoricals *cat)
312 /* Creates and returns a new categoricals object whose variables come from the
313 N_INTER interactions objects in the array starting at INTER. (The INTER
314 objects must outlive the categoricals object because it uses them
317 FCTR_EXCL determines which cases are listwise ignored by
318 categoricals_update(). */
319 struct categoricals *
320 categoricals_create (struct interaction *const *inter, size_t n_inter,
321 const struct variable *wv, enum mv_class fctr_excl)
323 struct categoricals *cat = xzalloc (sizeof *cat);
324 cat->iap = pool_calloc (cat->pool, n_inter, sizeof *cat->iap);
325 cat->n_iap = n_inter;
327 cat->pool = pool_create ();
328 cat->fctr_excl = fctr_excl;
330 hmap_init (&cat->varmap);
331 for (size_t i = 0; i < cat->n_iap; ++i)
333 struct interact_params *iap = &cat->iap[i];
334 hmap_init (&iap->ivmap);
335 iap->iact = inter[i];
337 iap->varnodes = pool_nmalloc (cat->pool, iap->iact->n_vars,
338 sizeof *iap->varnodes);
339 for (size_t v = 0; v < inter[i]->n_vars; ++v)
341 const struct variable *var = inter[i]->vars[v];
342 unsigned int hash = hash_pointer (var, 0);
343 struct variable_node *vn = lookup_variable (&cat->varmap, var, hash);
346 vn = pool_malloc (cat->pool, sizeof *vn);
348 hmap_init (&vn->valmap);
349 hmap_insert (&cat->varmap, &vn->node, hash);
351 iap->varnodes[v] = vn;
359 categoricals_update (struct categoricals *cat, const struct ccase *c)
363 assert (!cat->df_to_iact);
364 assert (!cat->cat_to_iact);
367 weight = cat->wv ? case_data (c, cat->wv)->f : 1.0;
368 weight = var_force_valid_weight (cat->wv, weight, NULL);
370 /* Update the frequency table for each variable. */
371 struct variable_node *vn;
372 HMAP_FOR_EACH (vn, struct variable_node, node, &cat->varmap)
374 const int width = var_get_width (vn->var);
375 const union value *val = case_data (c, vn->var);
376 unsigned int hash = value_hash (val, width, 0);
378 struct value_node *valn = lookup_value (&vn->valmap, val, hash, width);
381 valn = pool_malloc (cat->pool, sizeof *valn);
383 value_init (&valn->val, width);
384 value_copy (&valn->val, val, width);
385 hmap_insert (&vn->valmap, &valn->node, hash);
389 /* Update the frequency table for full interactions. */
390 for (int i = 0; i < cat->n_iap; ++i)
392 struct interact_params *iap = &cat->iap[i];
393 const struct interaction *iact = iap->iact;
394 if (interaction_case_is_missing (iact, c, cat->fctr_excl))
397 unsigned int hash = interaction_case_hash (iact, c, 0);
398 struct interaction_value *node = lookup_case (&iap->ivmap, iact, c);
401 node = pool_malloc (cat->pool, sizeof *node);
402 node->ccase = case_ref (c);
405 hmap_insert (&iap->ivmap, &node->node, hash);
408 node->user_data = cat->payload->create (cat->aux1, cat->aux2);
415 cat->payload->update (cat->aux1, cat->aux2, node->user_data, c,
420 /* Return the number of categories (distinct values) for interaction IDX in
423 categoricals_n_count (const struct categoricals *cat, size_t idx)
425 return hmap_count (&cat->iap[idx].ivmap);
428 /* Return the total number of categories across all interactions in CAT. */
430 categoricals_n_total (const struct categoricals *cat)
432 return categoricals_is_complete (cat) ? cat->n_cats_total : 0;
435 /* Returns the number of degrees of freedom for interaction IDX within CAT. */
437 categoricals_df (const struct categoricals *cat, size_t idx)
439 const struct interact_params *iap = &cat->iap[idx];
443 /* Returns the total degrees of freedom for CAT. */
445 categoricals_df_total (const struct categoricals *cat)
447 return cat ? cat->df_sum : 0;
450 /* Returns true iff categoricals_done() has been called for CAT. */
452 categoricals_is_complete (const struct categoricals *cat)
454 return cat->df_to_iact != NULL;
457 /* This function must be called (once) before any call to the *_by_subscript or
458 *_by_category functions, but AFTER any calls to categoricals_update. If this
459 function returns false, then no calls to _by_subscript or *_by_category are
462 categoricals_done (const struct categoricals *cat_)
464 struct categoricals *cat = CONST_CAST (struct categoricals *, cat_);
465 if (!cat || categoricals_is_complete (cat))
468 /* Assign 'index' to each variables' value_nodes, counting up from 0 in
469 ascending order by value. */
470 struct variable_node *vn;
471 HMAP_FOR_EACH (vn, struct variable_node, node, &cat->varmap)
473 size_t n_vals = hmap_count (&vn->valmap);
480 struct value_node **nodes = xcalloc (sizeof *nodes, n_vals);
482 struct value_node *valnd;
483 HMAP_FOR_EACH (valnd, struct value_node, node, &vn->valmap)
485 sort (nodes, n_vals, sizeof *nodes, compare_value_node_3way, vn);
486 for (x = 0; x < n_vals; ++x)
491 /* Calculate the degrees of freedom, and the number of categories. */
493 cat->n_cats_total = 0;
494 for (int i = 0 ; i < cat->n_iap; ++i)
496 struct interact_params *iap = &cat->iap[i];
497 const struct interaction *iact = iap->iact;
501 for (int v = 0 ; v < iact->n_vars; ++v)
503 size_t n_vals = hmap_count (&iap->varnodes[v]->valmap);
505 iap->df_prod *= n_vals - 1;
506 iap->n_cats *= n_vals;
509 if (iact->n_vars > 0)
510 cat->df_sum += iap->df_prod;
511 cat->n_cats_total += iap->n_cats;
515 cat->df_to_iact = pool_calloc (cat->pool, cat->df_sum,
516 sizeof *cat->df_to_iact);
518 cat->cat_to_iact = pool_calloc (cat->pool, cat->n_cats_total,
519 sizeof *cat->cat_to_iact);
523 for (int i = 0; i < cat->n_iap; ++i)
525 struct interact_params *iap = &cat->iap[i];
527 iap->base_df = idx_df;
528 iap->base_cats = idx_cat;
530 /* For some purposes (eg CONTRASTS in ONEWAY) the values need to be
532 iap->ivs = pool_nmalloc (cat->pool, hmap_count (&iap->ivmap),
535 struct interaction_value *ivn;
536 HMAP_FOR_EACH (ivn, struct interaction_value, node, &iap->ivmap)
538 sort (iap->ivs, x, sizeof *iap->ivs,
539 compare_interaction_value_3way, iap);
541 /* Populate the variable maps. */
542 if (iap->iact->n_vars)
543 for (int j = 0; j < iap->df_prod; ++j)
544 cat->df_to_iact[idx_df++] = i;
546 for (int j = 0; j < iap->n_cats; ++j)
547 cat->cat_to_iact[idx_cat++] = i;
550 categoricals_dump (cat);
552 /* Tally up the sums for all the encodings */
553 for (int i = 0; i < cat->n_iap; ++i)
555 struct interact_params *iap = &cat->iap[i];
556 const struct interaction *iact = iap->iact;
558 const int df = iact->n_vars ? iap->df_prod : 0;
560 iap->enc_sum = xcalloc (df, sizeof *iap->enc_sum);
562 for (int y = 0; y < hmap_count (&iap->ivmap); ++y)
564 struct interaction_value *iv = iap->ivs[y];
565 for (int x = iap->base_df;
566 x < iap->base_df + df; ++x)
568 const double bin = categoricals_get_effects_code_for_case (
570 iap->enc_sum[x - iap->base_df] += bin * iv->cc;
572 if (cat->payload && cat->payload->calculate)
573 cat->payload->calculate (cat->aux1, cat->aux2, iv->user_data);
580 static struct interact_params *
581 df_to_iap (const struct categoricals *cat, int subscript)
583 assert (subscript >= 0);
584 assert (subscript < cat->df_sum);
586 return &cat->iap[cat->df_to_iact[subscript]];
589 static struct interact_params *
590 cat_index_to_iap (const struct categoricals *cat, int cat_index)
592 assert (cat_index >= 0);
593 assert (cat_index < cat->n_cats_total);
595 return &cat->iap[cat->cat_to_iact[cat_index]];
598 /* Return the interaction corresponding to SUBSCRIPT. */
599 const struct interaction *
600 categoricals_get_interaction_by_subscript (const struct categoricals *cat,
603 return df_to_iap (cat, subscript)->iact;
607 categoricals_get_weight_by_subscript (const struct categoricals *cat,
610 return df_to_iap (cat, subscript)->cc;
614 categoricals_get_sum_by_subscript (const struct categoricals *cat,
617 const struct interact_params *iap = df_to_iap (cat, subscript);
618 return iap->enc_sum[subscript - iap->base_df];
622 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
623 for that subscript */
625 categoricals_get_code_for_case (const struct categoricals *cat, int subscript,
626 const struct ccase *c,
629 const struct interaction *iact
630 = categoricals_get_interaction_by_subscript (cat, subscript);
632 const struct interact_params *iap = df_to_iap (cat, subscript);
636 for (int v = 0; v < iact->n_vars; ++v)
638 const struct variable *var = iact->vars[v];
640 const union value *val = case_data (c, var);
641 const int width = var_get_width (var);
643 const unsigned int hash = value_hash (val, width, 0);
644 const struct value_node *valn
645 = lookup_value (&iap->varnodes[v]->valmap, val, hash, width);
647 const int df = hmap_count (&iap->varnodes[v]->valmap) - 1;
648 const int dfpn = dfp * df;
650 if (effects_coding && valn->index == df)
654 /* Translate subscript into an index for the individual variable. */
655 const int index = ((subscript - iap->base_df) % dfpn) / dfp;
656 if (valn->index != index)
666 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
667 for that subscript. */
669 categoricals_get_dummy_code_for_case (const struct categoricals *cat,
670 int subscript, const struct ccase *c)
672 return categoricals_get_code_for_case (cat, subscript, c, false);
675 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
677 Else if it is the last category, return -1.
681 categoricals_get_effects_code_for_case (const struct categoricals *cat,
682 int subscript, const struct ccase *c)
684 return categoricals_get_code_for_case (cat, subscript, c, true);
687 /* Return a case containing the set of values corresponding to
688 the Nth Category of the IACTth interaction */
690 categoricals_get_case_by_category_real (const struct categoricals *cat,
693 const struct interact_params *iap = &cat->iap[iact];
694 return n < hmap_count (&iap->ivmap) ? iap->ivs[n]->ccase : NULL;
697 /* Return a the user data corresponding to the Nth Category of the IACTth
700 categoricals_get_user_data_by_category_real (const struct categoricals *cat,
703 const struct interact_params *iap = &cat->iap[iact];
704 return n < hmap_count (&iap->ivmap) ? iap->ivs[n]->user_data : NULL;
707 /* Return a case containing the set of values corresponding to CAT_INDEX. */
709 categoricals_get_case_by_category (const struct categoricals *cat,
712 const struct interact_params *iap = cat_index_to_iap (cat, cat_index);
713 const struct interaction_value *vn = iap->ivs[cat_index - iap->base_cats];
718 categoricals_get_user_data_by_category (const struct categoricals *cat,
721 const struct interact_params *iap = cat_index_to_iap (cat, cat_index);
722 const struct interaction_value *iv = iap->ivs[cat_index - iap->base_cats];
723 return iv->user_data;
727 categoricals_set_payload (struct categoricals *cat, const struct payload *p,
728 const void *aux1, void *aux2)