X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fmath%2Fcategoricals.c;h=78944fca44803aabe3da896e8ac23e3f430257b0;hb=334ecf90912ed0f3cb3c9ce0c97f175ab9354382;hp=033b6a18a07c8bdb705c0bfb3ec44f9270f80cb5;hpb=e0c2b55526074967dc11fd25183a464e4752e571;p=pspp diff --git a/src/math/categoricals.c b/src/math/categoricals.c index 033b6a18a0..78944fca44 100644 --- a/src/math/categoricals.c +++ b/src/math/categoricals.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,59 +16,105 @@ #include -#include +#include "math/categoricals.h" -#include "categoricals.h" +#include -#include -#include -#include -#include -#include -#include +#include "data/case.h" +#include "data/value.h" +#include "data/variable.h" +#include "libpspp/array.h" +#include "libpspp/hmap.h" +#include "libpspp/pool.h" +#include "libpspp/str.h" +#include "gl/xalloc.h" struct value_node { struct hmap_node node; /* Node in hash map. */ union value value; /* The value being labeled. */ double cc; /* The total of the weights of cases with this value */ + + void *user_data; /* A pointer to data which the caller can store stuff */ + int subscript; /* A zero based integer, unique within the variable. Can be used as an index into an array */ }; - struct var_params { /* A map indexed by a union values */ struct hmap map; - int base_subscript; + const struct variable *var; + + int base_subscript_short; + int base_subscript_long; /* The number of distinct values of this variable */ int n_cats; /* A map of values indexed by subscript */ const struct value_node **reverse_value_map; + + /* Total of the weights of this variable */ + double cc; }; -struct categoricals +/* Comparison function to sort the reverse_value_map in ascending order */ +static int +compare_value_node (const void *vn1_, const void *vn2_, const void *aux) { - const struct variable **vars; - size_t n_vars; + const struct value_node * const *vn1 = vn1_; + const struct value_node * const *vn2 = vn2_; + const struct var_params *vp = aux; + + return value_compare_3way (&(*vn1)->value, &(*vn2)->value, var_get_width (vp->var)); +} + +struct categoricals +{ + /* The weight variable */ const struct variable *wv; + /* An array of var_params */ struct var_params *vp; - /* A map to enable the lookup of variables indexed by subscript */ - int *reverse_variable_map; + /* The size of VP. (ie, the number of variables involved.) */ + size_t n_vp; + + /* The number of categorical variables which contain entries. + In the absence of missing values, this will be equal to N_VP */ + size_t n_vars; + + /* A map to enable the lookup of variables indexed by subscript. + This map considers only the N - 1 of the N variables. + */ + int *reverse_variable_map_short; + + /* Like the above, but uses all N variables */ + int *reverse_variable_map_long; size_t n_cats_total; struct pool *pool; + + /* Missing values to be excluded */ + enum mv_class exclude; + + /* Function to be called on each update */ + update_func *update; + + /* Function specified by the caller to create user_data */ + user_data_create_func *user_data_create; + + /* Auxilliary data to be passed to update and user_data_create_func*/ + void *aux1; + void *aux2; }; @@ -76,47 +122,76 @@ void categoricals_destroy ( struct categoricals *cat) { int i; - for (i = 0 ; i < cat->n_vars; ++i) - hmap_destroy (&cat->vp[i].map); - - pool_destroy (cat->pool); - free (cat); + if (cat != NULL) + { + for (i = 0 ; i < cat->n_vp; ++i) + hmap_destroy (&cat->vp[i].map); + + pool_destroy (cat->pool); + free (cat); + } } +#if 0 void categoricals_dump (const struct categoricals *cat) { int v; - for (v = 0 ; v < cat->n_vars; ++v) + for (v = 0 ; v < cat->n_vp; ++v) { const struct var_params *vp = &cat->vp[v]; const struct hmap *m = &vp->map; - size_t width = var_get_width (cat->vars[v]); struct hmap_node *node ; int x; - printf ("\n%s (%d):\n", var_get_name (cat->vars[v]), vp->base_subscript); - - assert (vp->reverse_value_map); + printf ("\n%s (%d) CC=%g n_cats=%d:\n", + var_get_name (vp->var), vp->base_subscript_long, vp->cc, vp->n_cats); printf ("Reverse map\n"); for (x = 0 ; x < vp->n_cats; ++x) { + struct string s; const struct value_node *vn = vp->reverse_value_map[x]; - printf ("Value for %d is %s\n", x, value_str (&vn->value, width)); + ds_init_empty (&s); + var_append_value_name (vp->var, &vn->value, &s); + printf ("Value for %d is %s\n", x, ds_cstr(&s)); + ds_destroy (&s); } printf ("\nForward map\n"); for (node = hmap_first (m); node; node = hmap_next (m, node)) { + struct string s; const struct value_node *vn = HMAP_DATA (node, struct value_node, node); + ds_init_empty (&s); + var_append_value_name (vp->var, &vn->value, &s); printf ("Value: %s; Index %d; CC %g\n", - value_str (&vn->value, width), vn->subscript, vn->cc); + ds_cstr (&s), + vn->subscript, vn->cc); + ds_destroy (&s); } } + + assert (cat->n_vars <= cat->n_vp); + + printf ("\n"); + printf ("Number of categorical variables: %d\n", cat->n_vp); + printf ("Number of non-empty categorical variables: %d\n", cat->n_vars); + printf ("Total number of categories: %d\n", cat->n_cats_total); + + printf ("\nReverse variable map (short):\n"); + for (v = 0 ; v < cat->n_cats_total - cat->n_vars; ++v) + printf ("%d ", cat->reverse_variable_map_short[v]); + + printf ("\nReverse variable map (long):\n"); + for (v = 0 ; v < cat->n_cats_total; ++v) + printf ("%d ", cat->reverse_variable_map_long[v]); + + printf ("\n"); } +#endif @@ -139,24 +214,38 @@ lookup_value (const struct hmap *map, const struct variable *var, const union va } - struct categoricals * -categoricals_create (const struct variable **v, size_t n_vars, const struct variable *wv) +categoricals_create (const struct variable *const *v, size_t n_vars, + const struct variable *wv, enum mv_class exclude, + user_data_create_func *udf, + update_func *update, void *aux1, void *aux2 + ) { size_t i; struct categoricals *cat = xmalloc (sizeof *cat); - cat->vars = v; - cat->n_vars = n_vars; + cat->n_vp = n_vars; cat->wv = wv; cat->n_cats_total = 0; - cat->reverse_variable_map = NULL; + cat->n_vars = 0; + cat->reverse_variable_map_short = NULL; + cat->reverse_variable_map_long = NULL; cat->pool = pool_create (); + cat->exclude = exclude; + cat->update = update; + cat->user_data_create = udf; + + cat->aux1 = aux1; + cat->aux2 = aux2; - cat->vp = pool_calloc (cat->pool, n_vars, sizeof *cat->vp); - for (i = 0 ; i < cat->n_vars; ++i) - hmap_init (&cat->vp[i].map); + cat->vp = pool_calloc (cat->pool, cat->n_vp, sizeof *cat->vp); + + for (i = 0 ; i < cat->n_vp; ++i) + { + hmap_init (&cat->vp[i].map); + cat->vp[i].var = v[i]; + } return cat; } @@ -170,15 +259,23 @@ categoricals_update (struct categoricals *cat, const struct ccase *c) const double weight = cat->wv ? case_data (c, cat->wv)->f : 1.0; - assert (NULL == cat->reverse_variable_map); + assert (NULL == cat->reverse_variable_map_short); + assert (NULL == cat->reverse_variable_map_long); - for (i = 0 ; i < cat->n_vars; ++i) + for (i = 0 ; i < cat->n_vp; ++i) { - unsigned int width = var_get_width (cat->vars[i]); - const union value *val = case_data (c, cat->vars[i]); - size_t hash = value_hash (val, width, 0); + const struct variable *var = cat->vp[i].var; + unsigned int width = var_get_width (var); + const union value *val = case_data (c, var); + size_t hash; + struct value_node *node ; + + if ( var_is_value_missing (var, val, cat->exclude)) + continue; + + hash = value_hash (val, width, 0); + node = lookup_value (&cat->vp[i].map, var, val); - struct value_node *node = lookup_value (&cat->vp[i].map, cat->vars[i], val); if ( NULL == node) { node = pool_malloc (cat->pool, sizeof *node); @@ -188,11 +285,22 @@ categoricals_update (struct categoricals *cat, const struct ccase *c) node->cc = 0.0; hmap_insert (&cat->vp[i].map, &node->node, hash); - cat->n_cats_total ++; + cat->n_cats_total++; + + if ( 0 == cat->vp[i].n_cats) + cat->n_vars++; + node->subscript = cat->vp[i].n_cats++ ; + + if (cat->user_data_create) + node->user_data = cat->user_data_create (cat->aux1, cat->aux2); } node->cc += weight; + cat->vp[i].cc += weight; + + if (cat->update) + cat->update (node->user_data, cat->exclude, cat->wv, var, c, cat->aux1, cat->aux2); } } @@ -204,19 +312,6 @@ categoricals_n_count (const struct categoricals *cat, size_t n) } -/* Return the index for value VAL in the Nth variable */ -int -categoricals_index (const struct categoricals *cat, size_t n, const union value *val) -{ - struct value_node *vn = lookup_value (&cat->vp[n].map, cat->vars[n], val); - - if ( vn == NULL) - return -1; - - return vn->subscript; -} - - /* Return the total number of categories */ size_t categoricals_total (const struct categoricals *cat) @@ -225,10 +320,10 @@ categoricals_total (const struct categoricals *cat) } -/* This function must be called *before* any call to categoricals_get_*_by subscript an +/* This function must be called *before* any call to categoricals_get_*_by subscript and *after* all calls to categoricals_update */ void -categoricals_done (struct categoricals *cat) +categoricals_done (const struct categoricals *cat_) { /* Implementation Note: Whilst this function is O(n) in cat->n_cats_total, in most uses it will be more efficient that using a tree based structure, since it @@ -236,11 +331,19 @@ categoricals_done (struct categoricals *cat) 1 call of O(n) + 10^9 calls of O(1) is better than 10^9 calls of O(log n). */ + struct categoricals *cat = CONST_CAST (struct categoricals *, cat_); int v; - int idx = 0; - cat->reverse_variable_map = pool_calloc (cat->pool, cat->n_cats_total, sizeof *cat->reverse_variable_map); + int idx_short = 0; + int idx_long = 0; + cat->reverse_variable_map_short = pool_calloc (cat->pool, + cat->n_cats_total - cat->n_vars, + sizeof *cat->reverse_variable_map_short); + + cat->reverse_variable_map_long = pool_calloc (cat->pool, + cat->n_cats_total, + sizeof *cat->reverse_variable_map_long); - for (v = 0 ; v < cat->n_vars; ++v) + for (v = 0 ; v < cat->n_vp; ++v) { int i; struct var_params *vp = &cat->vp[v]; @@ -249,7 +352,8 @@ categoricals_done (struct categoricals *cat) vp->reverse_value_map = pool_calloc (cat->pool, n_cats_total, sizeof *vp->reverse_value_map); - vp->base_subscript = idx; + vp->base_subscript_short = idx_short; + vp->base_subscript_long = idx_long; for (node = hmap_first (&vp->map); node; node = hmap_next (&vp->map, node)) { @@ -257,9 +361,40 @@ categoricals_done (struct categoricals *cat) vp->reverse_value_map[vn->subscript] = vn; } + /* For some purposes (eg CONTRASTS in ONEWAY) the values need to be sorted */ + sort (vp->reverse_value_map, vp->n_cats, sizeof (const struct value_node *), + compare_value_node, vp); + + /* Populate the reverse variable maps. */ + for (i = 0; i < vp->n_cats - 1; ++i) + cat->reverse_variable_map_short[idx_short++] = v; + for (i = 0; i < vp->n_cats; ++i) - cat->reverse_variable_map[idx++] = v; + cat->reverse_variable_map_long[idx_long++] = v; } + + assert (cat->n_vars <= cat->n_vp); +} + + +static int +reverse_variable_lookup_short (const struct categoricals *cat, int subscript) +{ + assert (cat->reverse_variable_map_short); + assert (subscript >= 0); + assert (subscript < cat->n_cats_total - cat->n_vars); + + return cat->reverse_variable_map_short[subscript]; +} + +static int +reverse_variable_lookup_long (const struct categoricals *cat, int subscript) +{ + assert (cat->reverse_variable_map_long); + assert (subscript >= 0); + assert (subscript < cat->n_cats_total); + + return cat->reverse_variable_map_long[subscript]; } @@ -268,28 +403,43 @@ categoricals_done (struct categoricals *cat) const struct variable * categoricals_get_variable_by_subscript (const struct categoricals *cat, int subscript) { - int index; - - assert (cat->reverse_variable_map); - - index = cat->reverse_variable_map[subscript]; + int index = reverse_variable_lookup_short (cat, subscript); - return cat->vars[index]; + return cat->vp[index].var; } - /* Return the value corresponding to SUBSCRIPT */ -const union value * +static const union value * categoricals_get_value_by_subscript (const struct categoricals *cat, int subscript) { - int vindex = cat->reverse_variable_map[subscript]; + int vindex = reverse_variable_lookup_short (cat, subscript); const struct var_params *vp = &cat->vp[vindex]; - const struct value_node *vn = vp->reverse_value_map [subscript - vp->base_subscript]; + const struct value_node *vn = vp->reverse_value_map [subscript - vp->base_subscript_short]; return &vn->value; } +double +categoricals_get_weight_by_subscript (const struct categoricals *cat, int subscript) +{ + int vindex = reverse_variable_lookup_short (cat, subscript); + const struct var_params *vp = &cat->vp[vindex]; + + return vp->cc; +} + +double +categoricals_get_sum_by_subscript (const struct categoricals *cat, int subscript) +{ + int vindex = reverse_variable_lookup_short (cat, subscript); + const struct var_params *vp = &cat->vp[vindex]; + + const struct value_node *vn = vp->reverse_value_map [subscript - vp->base_subscript_short]; + return vn->cc; +} + + /* Returns unity if the value in case C at SUBSCRIPT is equal to the category for that subscript */ double @@ -303,3 +453,34 @@ categoricals_get_binary_by_subscript (const struct categoricals *cat, int subscr return value_equal (val, categoricals_get_value_by_subscript (cat, subscript), width); } + + +size_t +categoricals_get_n_variables (const struct categoricals *cat) +{ + return cat->n_vars; +} + + + +/* Return the value corresponding to SUBSCRIPT */ +const union value * +categoricals_get_value_by_category (const struct categoricals *cat, int subscript) +{ + int vindex = reverse_variable_lookup_long (cat, subscript); + const struct var_params *vp = &cat->vp[vindex]; + const struct value_node *vn = vp->reverse_value_map [subscript - vp->base_subscript_long]; + + return &vn->value; +} + + +void * +categoricals_get_user_data_by_category (const struct categoricals *cat, int subscript) +{ + int vindex = reverse_variable_lookup_long (cat, subscript); + const struct var_params *vp = &cat->vp[vindex]; + + const struct value_node *vn = vp->reverse_value_map [subscript - vp->base_subscript_long]; + return vn->user_data; +}