From ceb0823669b8cb6784fd4f793d28451e33dfd512 Mon Sep 17 00:00:00 2001 From: Jason H Stover Date: Wed, 15 Oct 2008 10:06:50 -0400 Subject: [PATCH] covariance_accumulate: New function for one-pass computation of the covariance matrix. Added hash table to temporarily store the elements that will be in the covariance matrix. The hash consists of the new data structure covariance_accumulator. New functions hash_numeric_alpha, covariance_accumulator_compare, match_nodes, covariance_accumulator_free, covariance_hsh_create, covariance_accumulator_hash, to assist with the hashing. update_product: New function. covariance_accumulator_to_matrix: New function to put hash elements into the covariance matrix. get_center: New function to center a value prior to adding it to the covariance matrix. covariance_matrix_insert: New function to add a value to the covariance matrix. --- src/math/covariance-matrix.c | 396 ++++++++++++++++++++++++++++++++++- src/math/covariance-matrix.h | 13 +- 2 files changed, 404 insertions(+), 5 deletions(-) diff --git a/src/math/covariance-matrix.c b/src/math/covariance-matrix.c index f929a370..817f7ed3 100644 --- a/src/math/covariance-matrix.c +++ b/src/math/covariance-matrix.c @@ -19,16 +19,46 @@ */ #include #include +#include +#include #include #include -#include "covariance-matrix.h" -#include "moments.h" +#include +#include +#include +#include +#include +#include + +/* + Structure used to accumulate the covariance matrix in a single data + pass. Before passing the data, we do not know how many categories + there are in each categorical variable. Therefore we do not know the + size of the covariance matrix. To get around this problem, we + accumulate the elements of the covariance matrix in pointers to + COVARIANC_ACCUMULATOR. These values are then used to populate + the covariance matrix. + */ +struct covariance_accumulator +{ + const struct variable *v1; + const struct variable *v2; + double product; + const union value *val1; + const union value *val2; +}; + +static hsh_hash_func covariance_accumulator_hash; +static unsigned int hash_numeric_alpha (const struct variable *, const struct variable *, + const union value *, size_t); +static hsh_compare_func covariance_accumulator_compare; +static hsh_free_func covariance_accumulator_free; /* The covariances are stored in a DESIGN_MATRIX structure. */ struct design_matrix * -covariance_matrix_create (int n_variables, const struct variable *v_variables[]) +covariance_matrix_create (size_t n_variables, const struct variable *v_variables[]) { return design_matrix_create (n_variables, v_variables, (size_t) n_variables); } @@ -144,3 +174,363 @@ void covariance_pass_two (struct design_matrix *cov, double mean1, double mean2, } } +static unsigned int +covariance_accumulator_hash (const void *h, const void *aux) +{ + struct covariance_accumulator *ca = (struct covariance_accumulator *) h; + size_t *n_vars = (size_t *) aux; + size_t idx_max; + size_t idx_min; + const struct variable *v_min; + const struct variable *v_max; + const union value *val_min; + const union value *val_max; + + /* + Order everything by the variables' indices. This ensures we get the + same key regardless of the order in which the variables are stored + and passed around. + */ + v_min = (var_get_dict_index (ca->v1) < var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2; + v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1; + + val_min = (v_min == ca->v1) ? ca->val1 : ca->val2; + val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1; + + idx_min = var_get_dict_index (v_min); + idx_max = var_get_dict_index (v_max); + + if (var_is_numeric (v_max) && var_is_numeric (v_min)) + { + return (*n_vars * idx_max + idx_min); + } + if (var_is_numeric (v_max) && var_is_alpha (v_min)) + { + return hash_numeric_alpha (v_max, v_min, val_min, *n_vars); + } + if (var_is_alpha (v_max) && var_is_numeric (v_min)) + { + return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars)); + } + if (var_is_alpha (v_max) && var_is_alpha (v_min)) + { + unsigned int tmp; + char *x = xnmalloc (1 + var_get_width (v_max) + var_get_width (v_min), sizeof (*x)); + strncpy (x, val_max->s, var_get_width (v_max)); + strncat (x, val_min->s, var_get_width (v_min)); + tmp = *n_vars * (*n_vars + 1 + idx_max) + + idx_min + + hsh_hash_string (x); + free (x); + return tmp; + } + return -1u; +} + +/* + Make a hash table consisting of struct covariance_accumulators. + This allows the accumulation of the elements of a covariance matrix + in a single data pass. Call covariance_accumulate () for each case + in the data. + */ +struct hsh_table * +covariance_hsh_create (size_t n_vars) +{ + return hsh_create (n_vars * (n_vars + 1) / 2, covariance_accumulator_compare, + covariance_accumulator_hash, covariance_accumulator_free, &n_vars); +} + +static void +covariance_accumulator_free (void *c_, const void *aux UNUSED) +{ + struct covariance_accumulator *c = c_; + assert (c != NULL); + free (c); +} +static int +match_nodes (const struct covariance_accumulator *c, const struct variable *v1, + const struct variable *v2, const union value *val1, + const union value *val2) +{ + if (var_get_dict_index (v1) == var_get_dict_index (c->v1) && + var_get_dict_index (v2) == var_get_dict_index (c->v2)) + { + if (var_is_numeric (v1) && var_is_numeric (v2)) + { + return 0; + } + if (var_is_numeric (v1) && var_is_alpha (v2)) + { + if (compare_values (val2, c->val2, v2)) + { + return 0; + } + } + if (var_is_alpha (v1) && var_is_numeric (v2)) + { + if (compare_values (val1, c->val1, v1)) + { + return 0; + } + } + if (var_is_alpha (v1) && var_is_alpha (v2)) + { + if (compare_values (val1, c->val1, v1)) + { + if (compare_values (val2, c->val2, v2)) + { + return 0; + } + } + } + } + else if (v2 == c->v1 && v1 == c->v2) + { + return -match_nodes (c, v2, v1, val2, val1); + } + return 1; +} + +/* + This function is meant to be used as a comparison function for + a struct hsh_table in src/libpspp/hash.c. +*/ +static int +covariance_accumulator_compare (const void *a1_, const void *a2_, const void *aux UNUSED) +{ + const struct covariance_accumulator *a1 = a1_; + const struct covariance_accumulator *a2 = a2_; + + if (a1 == NULL && a2 == NULL) + return 0; + + if (a1 == NULL || a2 == NULL) + return 1; + + return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2); +} + +static unsigned int +hash_numeric_alpha (const struct variable *v1, const struct variable *v2, + const union value *val, size_t n_vars) +{ + unsigned int result = -1u; + if (var_is_numeric (v1) && var_is_alpha (v2)) + { + result = n_vars * ((n_vars + 1) + var_get_dict_index (v1)) + + var_get_dict_index (v2) + hsh_hash_string (val->s); + } + else if (var_is_alpha (v1) && var_is_numeric (v2)) + { + result = hash_numeric_alpha (v2, v1, val, n_vars); + } + return result; +} + + +static double +update_product (const struct variable *v1, const struct variable *v2, const union value *val1, + const union value *val2) +{ + assert (v1 != NULL); + assert (v2 != NULL); + assert (val1 != NULL); + assert (val2 != NULL); + if (var_is_alpha (v1) && var_is_alpha (v2)) + { + return 1.0; + } + if (var_is_numeric (v1) && var_is_numeric (v2)) + { + return (val1->f * val2->f); + } + if (var_is_numeric (v1) && var_is_alpha (v2)) + { + return (val1->f); + } + if (var_is_numeric (v2) && var_is_alpha (v1)) + { + update_product (v2, v1, val2, val1); + } + return 0.0; +} +/* + Compute the covariance matrix in a single data-pass. + */ +void +covariance_accumulate (struct hsh_table *cov, struct moments1 **m, + const struct ccase *ccase, const struct variable **vars, + size_t n_vars) +{ + size_t i; + size_t j; + const union value *val; + struct covariance_accumulator *ca; + struct covariance_accumulator *entry; + + assert (m != NULL); + + for (i = 0; i < n_vars; ++i) + { + val = case_data (ccase, vars[i]); + if (var_is_alpha (vars[i])) + { + cat_value_update (vars[i], val); + } + else + { + moments1_add (m[i], val->f, 1.0); + } + for (j = i; j < n_vars; j++) + { + ca = xmalloc (sizeof (*ca)); + ca->v1 = vars[i]; + ca->v2 = vars[j]; + ca->val1 = val; + ca->val2 = case_data (ccase, ca->v2); + ca->product = update_product (ca->v1, ca->v2, ca->val1, ca->val2); + entry = hsh_insert (cov, ca); + if (entry != NULL) + { + entry->product += ca->product; + /* + If ENTRY is null, CA was not already in the hash + hable, so we don't free it because it was just inserted. + If ENTRY was not null, CA is already in the hash table. + Unnecessary now, it must be freed here. + */ + free (ca); + } + } + } +} + +static void +covariance_matrix_insert (struct design_matrix *cov, const struct variable *v1, + const struct variable *v2, const union value *val1, + const union value *val2, double product) +{ + size_t row; + size_t col; + size_t i; + const union value *tmp_val; + + assert (cov != NULL); + + row = design_matrix_var_to_column (cov, v1); + if (var_is_alpha (v1)) + { + i = 0; + tmp_val = cat_subscript_to_value (i, v1); + while (!compare_values (tmp_val, val1, v1)) + { + i++; + tmp_val = cat_subscript_to_value (i, v1); + } + row += i; + if (var_is_numeric (v2)) + { + col = design_matrix_var_to_column (cov, v2); + } + else + { + col = design_matrix_var_to_column (cov, v2); + i = 0; + tmp_val = cat_subscript_to_value (i, v1); + while (!compare_values (tmp_val, val1, v1)) + { + i++; + tmp_val = cat_subscript_to_value (i, v1); + } + col += i; + } + } + else + { + if (var_is_numeric (v2)) + { + col = design_matrix_var_to_column (cov, v2); + } + else + { + covariance_matrix_insert (cov, v2, v1, val2, val1, product); + } + } + gsl_matrix_set (cov->m, row, col, product); + gsl_matrix_set (cov->m, col, row, product); +} + +static double +get_center (const struct variable *v, const union value *val, + const struct variable **vars, const struct moments1 **m, size_t n_vars, + size_t ssize) +{ + size_t i = 0; + + while ((var_get_dict_index (vars[i]) != var_get_dict_index(v)) && (i < n_vars)) + { + i++; + } + if (var_is_numeric (v)) + { + double mean; + moments1_calculate (m[i], NULL, &mean, NULL, NULL, NULL); + return mean; + } + else + { + i = cat_value_find (v, val); + return (cat_get_category_count (i, v) / ssize); + } + return 0.0; +} + +/* + Subtract the product of the means. + */ +static double +center_entry (const struct covariance_accumulator *ca, const struct variable **vars, + const struct moments1 **m, size_t n_vars, size_t ssize) +{ + double m1; + double m2; + double result = 0.0; + + m1 = get_center (ca->v1, ca->val1, vars, m, n_vars, ssize); + m2 = get_center (ca->v2, ca->val2, vars, m, n_vars, ssize); + result = ca->product - ssize * m1 * m2; + return result; +} + +/* + The first moments in M should be stored in the order corresponding + to the order of VARS. So, for example, VARS[0] has its moments in + M[0], VARS[1] has its moments in M[1], etc. + */ +struct design_matrix * +covariance_accumulator_to_matrix (struct hsh_table *cov, const struct moments1 **m, + const struct variable **vars, size_t n_vars, size_t ssize) +{ + double tmp; + struct covariance_accumulator *entry; + struct design_matrix *result = NULL; + struct hsh_iterator iter; + + result = covariance_matrix_create (n_vars, vars); + + entry = hsh_first (cov, &iter); + + while (entry != NULL) + { + /* + We compute the centered, un-normalized covariance matrix. + */ + tmp = center_entry (entry, vars, m, n_vars, ssize); + covariance_matrix_insert (result, entry->v1, entry->v2, entry->val1, + entry->val2, tmp); + entry = hsh_next (cov, &iter); + } + + return result; +} + diff --git a/src/math/covariance-matrix.h b/src/math/covariance-matrix.h index 22f979c5..573d3e7e 100644 --- a/src/math/covariance-matrix.h +++ b/src/math/covariance-matrix.h @@ -21,14 +21,23 @@ #ifndef COVARIANCE_MATRIX_H #define COVARIANCE_MATRIX_H -#include "design-matrix.h" +#include + +struct moments1; +struct ccase; +struct hsh_table; struct design_matrix * -covariance_matrix_create (int, const struct variable *[]); +covariance_matrix_create (size_t, const struct variable *[]); void covariance_matrix_destroy (struct design_matrix *); void covariance_pass_two (struct design_matrix *, double, double, double, const struct variable *, const struct variable *, const union value *, const union value *); +void covariance_accumulate (struct hsh_table *, struct moments1 **, + const struct ccase *, const struct variable **, size_t); +struct hsh_table * covariance_hsh_create (size_t); +struct design_matrix * covariance_accumulator_to_matrix (struct hsh_table *, const struct moments1 **, + const struct variable **, size_t, size_t); #endif -- 2.30.2