1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/categoricals.h"
20 #include "math/interaction.h"
24 #include "data/case.h"
25 #include "data/value.h"
26 #include "data/variable.h"
27 #include "libpspp/array.h"
28 #include "libpspp/hmap.h"
29 #include "libpspp/pool.h"
30 #include "libpspp/str.h"
31 #include "libpspp/hash-functions.h"
33 #include "gl/xalloc.h"
35 #define CATEGORICALS_DEBUG 0
39 struct hmap_node node; /* Node in hash map. */
41 union value val; /* The value */
43 int index; /* A zero based unique index for this value */
47 struct interaction_value
49 struct hmap_node node; /* Node in hash map */
51 struct ccase *ccase; /* A case (probably the first in the dataset) which matches
54 double cc; /* Total of the weights of cases matching this interaction */
56 void *user_data; /* A pointer to data which the caller can store stuff */
59 static struct value_node *
60 lookup_value (const struct hmap *map, const union value *val, unsigned int hash, int width)
62 struct value_node *vn = NULL;
63 HMAP_FOR_EACH_WITH_HASH (vn, struct value_node, node, hash, map)
65 if (value_equal (&vn->val, val, width))
74 struct hmap_node node; /* Node in hash map. */
75 const struct variable *var; /* The variable */
77 struct hmap valmap; /* A map of value nodes */
78 int n_vals; /* Number of values for this variable */
80 int *indirection; /* An array (of size n_vals) of integers, which serve to
81 permute the index members of the values in valmap.
83 Doing this, means that categories are considered in the order
84 of their values. Mathematically the order is irrelevant.
85 However certain procedures (eg logistic regression) want to report
86 statisitics for particular categories */
91 /* Comparison function to sort value_nodes in ascending order */
93 compare_value_node_3way (const void *vn1_, const void *vn2_, const void *aux)
95 const struct value_node *const *vn1p = vn1_;
96 const struct value_node *const *vn2p = vn2_;
98 const struct variable_node *vn = aux;
101 return value_compare_3way (&(*vn1p)->val, &(*vn2p)->val, var_get_width (vn->var));
106 static struct variable_node *
107 lookup_variable (const struct hmap *map, const struct variable *var, unsigned int hash)
109 struct variable_node *vn = NULL;
110 HMAP_FOR_EACH_WITH_HASH (vn, struct variable_node, node, hash, map)
115 fprintf (stderr, "Warning: Hash table collision\n");
122 struct interact_params
124 /* A map of cases indexed by a interaction_value */
127 struct interaction *iact;
129 int base_subscript_short;
130 int base_subscript_long;
132 /* The number of distinct values of this interaction */
135 /* An array of integers df_n * df_{n-1} * df_{n-2} ...
136 These are the products of the degrees of freedom for the current
137 variable and all preceeding variables */
142 /* A map of interaction_values indexed by subscript */
143 struct interaction_value **reverse_interaction_value_map;
149 /* Comparison function to sort the reverse_value_map in ascending order */
151 compare_interaction_value_3way (const void *vn1_, const void *vn2_, const void *aux)
153 const struct interaction_value *const *vn1p = vn1_;
154 const struct interaction_value *const *vn2p = vn2_;
156 const struct interact_params *iap = aux;
158 return interaction_case_cmp_3way (iap->iact, (*vn1p)->ccase, (*vn2p)->ccase);
163 /* The weight variable */
164 const struct variable *wv;
166 /* An array of interact_params */
167 struct interact_params *iap;
169 /* Map whose members are the union of the variables which comprise IAP */
172 /* The size of IAP. (ie, the number of interactions involved.) */
175 /* The number of categorical variables which contain entries.
176 In the absence of missing values, this will be equal to N_IAP */
181 /* A map to enable the lookup of variables indexed by subscript.
182 This map considers only the N - 1 of the N variables.
184 int *reverse_variable_map_short;
186 /* Like the above, but uses all N variables */
187 int *reverse_variable_map_long;
193 /* Missing values in the dependent varirable to be excluded */
194 enum mv_class dep_excl;
196 /* Missing values in the factor variables to be excluded */
197 enum mv_class fctr_excl;
204 const struct payload *payload;
208 categoricals_dump (const struct categoricals *cat)
210 if (CATEGORICALS_DEBUG)
214 printf ("Reverse Variable Map (short):\n");
215 for (i = 0; i < cat->df_sum; ++i)
217 printf (" %d", cat->reverse_variable_map_short[i]);
221 printf ("Reverse Variable Map (long):\n");
222 for (i = 0; i < cat->n_cats_total; ++i)
224 printf (" %d", cat->reverse_variable_map_long[i]);
228 printf ("Number of interactions %d\n", cat->n_iap);
229 for (i = 0 ; i < cat->n_iap; ++i)
233 const struct interact_params *iap = &cat->iap[i];
234 const struct interaction *iact = iap->iact;
236 ds_init_empty (&str);
237 interaction_to_string (iact, &str);
239 printf ("\nInteraction: \"%s\" (number of categories: %d); ", ds_cstr (&str), iap->n_cats);
241 printf ("Base index (short/long): %d/%d\n", iap->base_subscript_short, iap->base_subscript_long);
244 for (v = 0; v < hmap_count (&iap->ivmap); ++v)
247 const struct interaction_value *iv = iap->reverse_interaction_value_map[v];
249 if (v > 0) printf (" ");
251 for (vv = 0; vv < iact->n_vars; ++vv)
253 const struct variable *var = iact->vars[vv];
254 const union value *val = case_data (iv->ccase, var);
255 unsigned int varhash = hash_pointer (var, 0);
256 struct variable_node *vn = lookup_variable (&cat->varmap, var, varhash);
258 const int width = var_get_width (var);
259 unsigned int valhash = value_hash (val, width, 0);
260 struct value_node *valn = lookup_value (&vn->valmap, val, valhash, width);
262 assert (vn->var == var);
264 printf ("%g(%d)", val->f, valn->index);
265 if (vv < iact->n_vars - 1)
276 categoricals_destroy (struct categoricals *cat)
278 struct variable_node *vn = NULL;
282 for (i = 0; i < cat->n_iap; ++i)
284 struct interaction_value *iv = NULL;
285 /* Interate over each interaction value, and unref any cases that we reffed */
286 HMAP_FOR_EACH (iv, struct interaction_value, node, &cat->iap[i].ivmap)
288 if (cat->payload && cat->payload->destroy)
289 cat->payload->destroy (cat->aux1, cat->aux2, iv->user_data);
290 case_unref (iv->ccase);
293 free (cat->iap[i].enc_sum);
294 free (cat->iap[i].df_prod);
295 hmap_destroy (&cat->iap[i].ivmap);
296 interaction_destroy (cat->iap[i].iact);
299 /* Interate over each variable and delete its value map */
300 HMAP_FOR_EACH (vn, struct variable_node, node, &cat->varmap)
302 hmap_destroy (&vn->valmap);
305 hmap_destroy (&cat->varmap);
307 pool_destroy (cat->pool);
314 static struct interaction_value *
315 lookup_case (const struct hmap *map, const struct interaction *iact, const struct ccase *c)
317 struct interaction_value *iv = NULL;
318 size_t hash = interaction_case_hash (iact, c, 0);
320 HMAP_FOR_EACH_WITH_HASH (iv, struct interaction_value, node, hash, map)
322 if (interaction_case_equal (iact, c, iv->ccase))
325 fprintf (stderr, "Warning: Hash table collision\n");
332 categoricals_sane (const struct categoricals *cat)
337 struct categoricals *
338 categoricals_create (struct interaction *const*inter, size_t n_inter,
339 const struct variable *wv, enum mv_class dep_excl, enum mv_class fctr_excl)
342 struct categoricals *cat = xmalloc (sizeof *cat);
344 cat->n_iap = n_inter;
346 cat->n_cats_total = 0;
348 cat->reverse_variable_map_short = NULL;
349 cat->reverse_variable_map_long = NULL;
350 cat->pool = pool_create ();
351 cat->dep_excl = dep_excl;
352 cat->fctr_excl = fctr_excl;
357 cat->iap = pool_calloc (cat->pool, cat->n_iap, sizeof *cat->iap);
359 hmap_init (&cat->varmap);
360 for (i = 0 ; i < cat->n_iap; ++i)
363 hmap_init (&cat->iap[i].ivmap);
364 cat->iap[i].iact = inter[i];
365 cat->iap[i].cc = 0.0;
366 for (v = 0; v < inter[i]->n_vars; ++v)
368 const struct variable *var = inter[i]->vars[v];
369 unsigned int hash = hash_pointer (var, 0);
370 struct variable_node *vn = lookup_variable (&cat->varmap, var, hash);
373 vn = pool_malloc (cat->pool, sizeof *vn);
376 hmap_init (&vn->valmap);
378 hmap_insert (&cat->varmap, &vn->node, hash);
389 categoricals_update (struct categoricals *cat, const struct ccase *c)
392 struct variable_node *vn = NULL;
398 weight = cat->wv ? case_data (c, cat->wv)->f : 1.0;
400 assert (NULL == cat->reverse_variable_map_short);
401 assert (NULL == cat->reverse_variable_map_long);
403 /* Interate over each variable, and add the value of that variable
404 to the appropriate map, if it's not already present. */
405 HMAP_FOR_EACH (vn, struct variable_node, node, &cat->varmap)
407 const int width = var_get_width (vn->var);
408 const union value *val = case_data (c, vn->var);
409 unsigned int hash = value_hash (val, width, 0);
411 struct value_node *valn = lookup_value (&vn->valmap, val, hash, width);
414 valn = pool_malloc (cat->pool, sizeof *valn);
415 valn->index = vn->n_vals++;
416 value_init (&valn->val, width);
417 value_copy (&valn->val, val, width);
418 hmap_insert (&vn->valmap, &valn->node, hash);
422 for (i = 0 ; i < cat->n_iap; ++i)
424 const struct interaction *iact = cat->iap[i].iact;
427 struct interaction_value *node;
429 if ( interaction_case_is_missing (iact, c, cat->fctr_excl))
432 hash = interaction_case_hash (iact, c, 0);
433 node = lookup_case (&cat->iap[i].ivmap, iact, c);
437 node = pool_malloc (cat->pool, sizeof *node);
438 node->ccase = case_ref (c);
441 hmap_insert (&cat->iap[i].ivmap, &node->node, hash);
445 node->user_data = cat->payload->create (cat->aux1, cat->aux2);
452 cat->iap[i].cc += weight;
456 double weight = cat->wv ? case_data (c, cat->wv)->f : 1.0;
457 cat->payload->update (cat->aux1, cat->aux2, node->user_data, c, weight);
463 /* Return the number of categories (distinct values) for interction N */
465 categoricals_n_count (const struct categoricals *cat, size_t n)
467 return hmap_count (&cat->iap[n].ivmap);
472 categoricals_df (const struct categoricals *cat, size_t n)
474 const struct interact_params *iap = &cat->iap[n];
475 return iap->df_prod[iap->iact->n_vars - 1];
479 /* Return the total number of categories */
481 categoricals_n_total (const struct categoricals *cat)
483 if (!categoricals_is_complete (cat))
486 return cat->n_cats_total;
490 categoricals_df_total (const struct categoricals *cat)
499 categoricals_is_complete (const struct categoricals *cat)
501 return (NULL != cat->reverse_variable_map_short);
505 /* This function must be called *before* any call to categoricals_get_*_by subscript and
506 *after* all calls to categoricals_update */
508 categoricals_done (const struct categoricals *cat_)
510 /* Implementation Note: Whilst this function is O(n) in cat->n_cats_total, in most
511 uses it will be more efficient that using a tree based structure, since it
512 is called only once, and means that subsequent lookups will be O(1).
514 1 call of O(n) + 10^9 calls of O(1) is better than 10^9 calls of O(log n).
516 struct categoricals *cat = CONST_CAST (struct categoricals *, cat_);
526 cat->n_cats_total = 0;
528 /* Calculate the degrees of freedom, and the number of categories */
529 for (i = 0 ; i < cat->n_iap; ++i)
532 const struct interaction *iact = cat->iap[i].iact;
534 cat->iap[i].df_prod = iact->n_vars ? xcalloc (iact->n_vars, sizeof (int)) : NULL;
536 cat->iap[i].n_cats = 1;
538 for (v = 0 ; v < iact->n_vars; ++v)
541 const struct variable *var = iact->vars[v];
543 struct variable_node *vn = lookup_variable (&cat->varmap, var, hash_pointer (var, 0));
545 struct value_node *valnd = NULL;
546 struct value_node **array ;
548 assert (vn->n_vals == hmap_count (&vn->valmap));
556 vn->indirection = pool_calloc (cat->pool, vn->n_vals, sizeof *vn->indirection);
558 /* Sort the VALMAP here */
559 array = xcalloc (sizeof *array, vn->n_vals);
560 HMAP_FOR_EACH (valnd, struct value_node, node, &vn->valmap)
562 /* Note: This loop is probably superfluous, it could be done in the
563 update stage (at the expense of a realloc) */
564 array[valnd->index] = valnd;
567 sort (array, vn->n_vals, sizeof (*array),
568 compare_value_node_3way, vn);
570 for (x = 0; x < vn->n_vals; ++x)
572 struct value_node *vvv = array[x];
573 vn->indirection[vn->n_vals - x - 1] = vvv->index;
577 cat->iap[i].df_prod[v] = df * (vn->n_vals - 1);
578 df = cat->iap[i].df_prod[v];
580 cat->iap[i].n_cats *= vn->n_vals;
584 cat->df_sum += cat->iap[i].df_prod [v - 1];
586 cat->n_cats_total += cat->iap[i].n_cats;
590 cat->reverse_variable_map_short = pool_calloc (cat->pool,
592 sizeof *cat->reverse_variable_map_short);
594 cat->reverse_variable_map_long = pool_calloc (cat->pool,
596 sizeof *cat->reverse_variable_map_long);
598 for (i = 0 ; i < cat->n_iap; ++i)
600 struct interaction_value *ivn = NULL;
603 struct interact_params *iap = &cat->iap[i];
605 iap->base_subscript_short = idx_short;
606 iap->base_subscript_long = idx_long;
608 iap->reverse_interaction_value_map = pool_calloc (cat->pool, iap->n_cats,
609 sizeof *iap->reverse_interaction_value_map);
611 HMAP_FOR_EACH (ivn, struct interaction_value, node, &iap->ivmap)
613 iap->reverse_interaction_value_map[x++] = ivn;
616 assert (x <= iap->n_cats);
618 /* For some purposes (eg CONTRASTS in ONEWAY) the values need to be sorted */
619 sort (iap->reverse_interaction_value_map, x, sizeof (*iap->reverse_interaction_value_map),
620 compare_interaction_value_3way, iap);
622 /* Fill the remaining values with null */
623 for (ii = x ; ii < iap->n_cats; ++ii)
624 iap->reverse_interaction_value_map[ii] = NULL;
626 /* Populate the reverse variable maps. */
629 for (ii = 0; ii < iap->df_prod [iap->iact->n_vars - 1]; ++ii)
630 cat->reverse_variable_map_short[idx_short++] = i;
633 for (ii = 0; ii < iap->n_cats; ++ii)
634 cat->reverse_variable_map_long[idx_long++] = i;
637 assert (cat->n_vars <= cat->n_iap);
639 categoricals_dump (cat);
641 /* Tally up the sums for all the encodings */
642 for (i = 0 ; i < cat->n_iap; ++i)
645 struct interact_params *iap = &cat->iap[i];
646 const struct interaction *iact = iap->iact;
648 const int df = iap->df_prod ? iap->df_prod [iact->n_vars - 1] : 0;
650 iap->enc_sum = xcalloc (df, sizeof (*(iap->enc_sum)));
652 for (y = 0; y < hmap_count (&iap->ivmap); ++y)
654 struct interaction_value *iv = iap->reverse_interaction_value_map[y];
655 for (x = iap->base_subscript_short; x < iap->base_subscript_short + df ;++x)
657 const double bin = categoricals_get_effects_code_for_case (cat, x, iv->ccase);
658 iap->enc_sum [x - iap->base_subscript_short] += bin * iv->cc;
660 if (cat->payload && cat->payload->calculate)
661 cat->payload->calculate (cat->aux1, cat->aux2, iv->user_data);
670 reverse_variable_lookup_short (const struct categoricals *cat, int subscript)
672 assert (cat->reverse_variable_map_short);
673 assert (subscript >= 0);
674 assert (subscript < cat->df_sum);
676 return cat->reverse_variable_map_short[subscript];
680 reverse_variable_lookup_long (const struct categoricals *cat, int subscript)
682 assert (cat->reverse_variable_map_long);
683 assert (subscript >= 0);
684 assert (subscript < cat->n_cats_total);
686 return cat->reverse_variable_map_long[subscript];
690 /* Return the interaction corresponding to SUBSCRIPT */
691 const struct interaction *
692 categoricals_get_interaction_by_subscript (const struct categoricals *cat, int subscript)
694 int index = reverse_variable_lookup_short (cat, subscript);
696 return cat->iap[index].iact;
700 categoricals_get_weight_by_subscript (const struct categoricals *cat, int subscript)
702 int vindex = reverse_variable_lookup_short (cat, subscript);
703 const struct interact_params *vp = &cat->iap[vindex];
709 categoricals_get_sum_by_subscript (const struct categoricals *cat, int subscript)
711 int vindex = reverse_variable_lookup_short (cat, subscript);
712 const struct interact_params *vp = &cat->iap[vindex];
714 return vp->enc_sum[subscript - vp->base_subscript_short];
718 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
719 for that subscript */
721 categoricals_get_code_for_case (const struct categoricals *cat, int subscript,
722 const struct ccase *c,
725 const struct interaction *iact = categoricals_get_interaction_by_subscript (cat, subscript);
727 const int i = reverse_variable_lookup_short (cat, subscript);
729 const int base_index = cat->iap[i].base_subscript_short;
734 const struct interact_params *iap = &cat->iap[i];
737 for (v = 0; v < iact->n_vars; ++v)
739 const struct variable *var = iact->vars[v];
741 const union value *val = case_data (c, var);
742 const int width = var_get_width (var);
743 const struct variable_node *vn = lookup_variable (&cat->varmap, var, hash_pointer (var, 0));
745 const unsigned int hash = value_hash (val, width, 0);
746 const struct value_node *valn = lookup_value (&vn->valmap, val, hash, width);
750 const double df = iap->df_prod[v] / dfp;
752 /* Translate the subscript into an index for the individual variable */
753 const int index = ((subscript - base_index) % iap->df_prod[v] ) / dfp;
754 dfp = iap->df_prod [v];
756 if (effects_coding && vn->indirection [valn->index] == df )
758 else if ( vn->indirection [valn->index] != index )
768 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
769 for that subscript */
771 categoricals_get_dummy_code_for_case (const struct categoricals *cat, int subscript,
772 const struct ccase *c)
774 return categoricals_get_code_for_case (cat, subscript, c, false);
777 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
779 Else if it is the last category, return -1.
783 categoricals_get_effects_code_for_case (const struct categoricals *cat, int subscript,
784 const struct ccase *c)
786 return categoricals_get_code_for_case (cat, subscript, c, true);
791 categoricals_get_n_variables (const struct categoricals *cat)
793 printf ("%s\n", __FUNCTION__);
798 /* Return a case containing the set of values corresponding to
799 the Nth Category of the IACTth interaction */
801 categoricals_get_case_by_category_real (const struct categoricals *cat, int iact, int n)
803 const struct interaction_value *vn;
805 const struct interact_params *vp = &cat->iap[iact];
807 if ( n >= hmap_count (&vp->ivmap))
810 vn = vp->reverse_interaction_value_map [n];
815 /* Return a the user data corresponding to the Nth Category of the IACTth interaction. */
817 categoricals_get_user_data_by_category_real (const struct categoricals *cat, int iact, int n)
819 const struct interact_params *vp = &cat->iap[iact];
820 const struct interaction_value *iv ;
822 if ( n >= hmap_count (&vp->ivmap))
825 iv = vp->reverse_interaction_value_map [n];
827 return iv->user_data;
832 /* Return a case containing the set of values corresponding to SUBSCRIPT */
834 categoricals_get_case_by_category (const struct categoricals *cat, int subscript)
836 int vindex = reverse_variable_lookup_long (cat, subscript);
837 const struct interact_params *vp = &cat->iap[vindex];
838 const struct interaction_value *vn = vp->reverse_interaction_value_map [subscript - vp->base_subscript_long];
844 categoricals_get_user_data_by_category (const struct categoricals *cat, int subscript)
846 int vindex = reverse_variable_lookup_long (cat, subscript);
847 const struct interact_params *vp = &cat->iap[vindex];
849 const struct interaction_value *iv = vp->reverse_interaction_value_map [subscript - vp->base_subscript_long];
850 return iv->user_data;
857 categoricals_set_payload (struct categoricals *cat, const struct payload *p,
858 const void *aux1, void *aux2)