From 194d01aaac43a41a174037357f89bc164b5c5213 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sat, 17 Oct 2009 19:26:04 +0200 Subject: [PATCH] New module src/math/categoricals . Created a new object to keep track of categorical variables, and their values. This is similar to src/data/categorys.c but is created on demand, and doesn't use any global state. --- src/language/stats/correlations.c | 7 +- src/math/automake.mk | 2 + src/math/categoricals.c | 158 ++++++++++++++++++++++++++++++ src/math/categoricals.h | 28 ++++++ src/math/covariance.c | 56 ++++++----- src/math/covariance.h | 7 +- 6 files changed, 224 insertions(+), 34 deletions(-) create mode 100644 src/math/categoricals.c create mode 100644 src/math/categoricals.h diff --git a/src/language/stats/correlations.c b/src/language/stats/correlations.c index 277cfea5..2d893d61 100644 --- a/src/language/stats/correlations.c +++ b/src/language/stats/correlations.c @@ -324,9 +324,9 @@ run_corr (struct casereader *r, const struct corr_opts *opts, const struct corr const gsl_matrix *var_matrix, *samples_matrix, *mean_matrix; const gsl_matrix *cov_matrix; gsl_matrix *corr_matrix; - struct covariance *cov = covariance_create (corr->n_vars_total, corr->vars, - opts->wv, opts->exclude, 2); - + struct covariance *cov = covariance_2pass_create (corr->n_vars_total, corr->vars, + 0, NULL, + opts->wv, opts->exclude); struct casereader *rc = casereader_clone (r); for ( ; (c = casereader_read (r) ); case_unref (c)) @@ -343,7 +343,6 @@ run_corr (struct casereader *r, const struct corr_opts *opts, const struct corr casereader_destroy (rc); - samples_matrix = covariance_moments (cov, MOMENT_NONE); var_matrix = covariance_moments (cov, MOMENT_VARIANCE); mean_matrix = covariance_moments (cov, MOMENT_MEAN); diff --git a/src/math/automake.mk b/src/math/automake.mk index 9fc15aaf..053ee440 100644 --- a/src/math/automake.mk +++ b/src/math/automake.mk @@ -13,6 +13,8 @@ src_math_libpspp_math_la_SOURCES = \ src/math/box-whisker.c 
src/math/box-whisker.h \ src/math/coefficient.c \ src/math/coefficient.h \ + src/math/categoricals.h \ + src/math/categoricals.c \ src/math/covariance.c \ src/math/covariance.h \ src/math/covariance-matrix.c \ diff --git a/src/math/categoricals.c b/src/math/categoricals.c new file mode 100644 index 00000000..a1c6f502 --- /dev/null +++ b/src/math/categoricals.c @@ -0,0 +1,158 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include + +#include + +#include "categoricals.h" + +#include +#include +#include +#include +#include + +struct categoricals +{ + const struct variable **vars; + size_t n_vars; + + const struct variable *wv; + + struct hmap *map; + + int *next_index; + + size_t n_cats; +}; + + +struct value_node + { + struct hmap_node node; /* Node in hash map. */ + union value value; /* The value being labeled. */ + double cc; /* The total of the weights of cases with this value */ + int index; /* A zero based integer, unique within the variable. 
+ Can be used as an index into an array */ + }; + + +static struct value_node * +lookup_value (const struct hmap *map, const struct variable *var, const union value *val) +{ + struct value_node *foo; + unsigned int width = var_get_width (var); + size_t hash = value_hash (val, width, 0); + + HMAP_FOR_EACH_WITH_HASH (foo, struct value_node, node, hash, map) + { + if (value_equal (val, &foo->value, width)) + break; + } + + return foo; +} + + +struct categoricals * +categoricals_create (const struct variable **v, size_t n_vars, const struct variable *wv) +{ + size_t i; + struct categoricals *cat = xmalloc (sizeof *cat); + + cat->vars = v; + cat->n_vars = n_vars; + cat->wv = wv; + cat->n_cats = 0; + + cat->map = xmalloc (sizeof *cat->map * n_vars); + cat->next_index = xcalloc (sizeof *cat->next_index, n_vars); + + for (i = 0 ; i < cat->n_vars; ++i) + { + hmap_init (&cat->map[i]); + } + + return cat; +} + + +void +categoricals_update (struct categoricals *cat, const struct ccase *c) +{ + size_t i; + + const double weight = cat->wv ? 
case_data (c, cat->wv)->f : 1.0; + + for (i = 0 ; i < cat->n_vars; ++i) + { + unsigned int width = var_get_width (cat->vars[i]); + const union value *val = case_data (c, cat->vars[i]); + size_t hash = value_hash (val, width, 0); + + struct value_node *node = lookup_value (&cat->map[i], cat->vars[i], val); + + if ( NULL == node) + { + node = xmalloc (sizeof *node); + + value_init (&node->value, width); + value_copy (&node->value, val, width); + node->cc = 0.0; + + hmap_insert (&cat->map[i], &node->node, hash); + cat->n_cats ++; + node->index = cat->next_index[i]++ ; + } + + node->cc += weight; + } +} + +/* Return the number of categories (distinct values) for variable N */ +size_t +categoricals_n_count (const struct categoricals *cat, size_t n) +{ + return hmap_count (&cat->map[n]); +} + + +/* Return the index for value VAL in the Nth variable */ +int +categoricals_index (const struct categoricals *cat, size_t n, const union value *val) +{ + struct value_node *vn = lookup_value (&cat->map[n], cat->vars[n], val); + + if ( vn == NULL) + return -1; + + return vn->index; +} + + +/* Return the total number of categories */ +size_t +categoricals_total (const struct categoricals *cat) +{ + return cat->n_cats; +} + + + + + + diff --git a/src/math/categoricals.h b/src/math/categoricals.h new file mode 100644 index 00000000..af922645 --- /dev/null +++ b/src/math/categoricals.h @@ -0,0 +1,28 @@ +#ifndef _CATEGORICALS__ +#define _CATEGORICALS__ + +#include + +struct categoricals; +struct variable; +struct ccase; + +union value ; + +struct categoricals *categoricals_create (const struct variable **v, size_t n_vars, + const struct variable *wv); + +void categoricals_update (struct categoricals *cat, const struct ccase *c); + + +/* Return the number of categories (distinct values) for variable N */ +size_t categoricals_n_count (const struct categoricals *cat, size_t n); + + +/* Return the total number of categories */ +size_t categoricals_total (const struct categoricals *cat); + 
+/* Return the index for value VAL in the Nth variable, or -1 if VAL is not found */ +int categoricals_index (const struct categoricals *cat, size_t n, const union value *val); + +#endif diff --git a/src/math/covariance.c b/src/math/covariance.c index 1548187b..1d908b3d 100644 --- a/src/math/covariance.c +++ b/src/math/covariance.c @@ -24,6 +24,7 @@ #include #include #include +#include "categoricals.h" #define n_MOMENTS (MOMENT_VARIANCE + 1) @@ -35,8 +36,7 @@ struct covariance const struct variable **vars; /* Categorical variables. */ - size_t n_catvars; - const struct variable **catvars; + struct categoricals *categoricals; /* Array containing number of categories per categorical variable. */ size_t *n_categories; @@ -96,26 +96,22 @@ covariance_moments (const struct covariance *cov, int m) /* Create a covariance struct. */ struct covariance * -covariance_create (size_t n_vars, const struct variable **vars, - const struct variable *weight, enum mv_class exclude, - short passes) +covariance_1pass_create (size_t n_vars, const struct variable **vars, + const struct variable *weight, enum mv_class exclude) { size_t i; struct covariance *cov = xmalloc (sizeof *cov); - assert (passes == 1 || passes == 2); - cov->passes = passes; + + cov->passes = 1; cov->state = 0; cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false; - cov->vars = xmalloc (sizeof *cov->vars * n_vars); + cov->vars = vars; cov->wv = weight; cov->n_vars = n_vars; cov->dim = n_vars; - for (i = 0; i < n_vars; ++i) - cov->vars[i] = vars[i]; - cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS); for (i = 0; i < n_MOMENTS; ++i) @@ -139,31 +135,33 @@ covariance_create (size_t n_vars, const struct variable **vars, struct covariance * covariance_2pass_create (size_t n_vars, const struct variable **vars, size_t n_catvars, const struct variable **catvars, - const struct variable *weight, enum mv_class exclude) + const struct variable *wv, enum mv_class exclude) { size_t i; struct covariance *cov = xmalloc (sizeof *cov); - cov->vars = 
xmalloc (sizeof *cov->vars * n_vars); - cov->catvars = xnmalloc (n_catvars, sizeof (*cov->catvars)); - cov->n_categories = xnmalloc (n_catvars, sizeof (cov->n_categories)); - cov->wv = weight; - cov->n_vars = n_vars; - cov->n_catvars = n_catvars; - - for (i = 0; i < n_vars; ++i) - cov->vars[i] = vars[i]; + cov->passes = 2; + cov->state = 0; + cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false; + + cov->vars = vars; - for (i = 0; i < n_catvars; i++) - { - cov->catvars[i] = catvars[i]; - cov->n_categories[i] = 0; - } + cov->wv = wv; + cov->n_vars = n_vars; + cov->dim = n_vars; cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS); + for (i = 0; i < n_MOMENTS; ++i) + cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars); + cov->exclude = exclude; + cov->n_cm = - 1; + cov->cm = NULL; + + cov->categoricals = categoricals_create (catvars, n_catvars, wv); + return cov; } @@ -226,6 +224,8 @@ covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c) cov->state = 1; } + categoricals_update (cov->categoricals, c); + for (i = 0 ; i < cov->n_vars; ++i) { const union value *val1 = case_data (c, cov->vars[i]); @@ -270,6 +270,10 @@ covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c) assert (cov->state == 1); cov->state = 2; + cov->dim = cov->n_vars + categoricals_total (cov->categoricals); + cov->n_cm = (cov->dim * (cov->dim - 1) ) / 2; + cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm); + /* Divide the means by the number of samples */ for (i = 0; i < cov->n_vars; ++i) { diff --git a/src/math/covariance.h b/src/math/covariance.h index 8855433e..22e38e1e 100644 --- a/src/math/covariance.h +++ b/src/math/covariance.h @@ -27,14 +27,13 @@ struct covariance; struct variable; struct ccase ; -struct covariance * covariance_create (size_t n_vars, const struct variable **vars, - const struct variable *wv, enum mv_class excl, - short passes); +struct covariance * covariance_1pass_create (size_t n_vars, const struct variable 
**vars, + const struct variable *wv, enum mv_class excl); struct covariance * covariance_2pass_create (size_t n_vars, const struct variable **vars, size_t n_catvars, const struct variable **catvars, - const struct variable *weight, enum mv_class excl); + const struct variable *wv, enum mv_class excl); void covariance_accumulate (struct covariance *, const struct ccase *); void covariance_accumulate_pass1 (struct covariance *, const struct ccase *); -- 2.30.2