1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
47 const union value *val1;
48 const union value *val2;
51 static hsh_hash_func covariance_accumulator_hash;
52 static unsigned int hash_numeric_alpha (const struct variable *, const struct variable *,
53 const union value *, size_t);
54 static hsh_compare_func covariance_accumulator_compare;
55 static hsh_free_func covariance_accumulator_free;
58 The covariances are stored in a DESIGN_MATRIX structure.
60 struct design_matrix *
61 covariance_matrix_create (size_t n_variables, const struct variable *v_variables[])
63 return design_matrix_create (n_variables, v_variables, (size_t) n_variables);
66 void covariance_matrix_destroy (struct design_matrix *x)
68 design_matrix_destroy (x);
72 Update the covariance matrix with the new entries, assuming that ROW
73 corresponds to a categorical variable and V2 is numeric.
76 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
78 const struct variable *v2, double x, const union value *val2)
83 assert (var_is_numeric (v2));
85 col = design_matrix_var_to_column (cov, v2);
86 assert (val2 != NULL);
87 tmp = gsl_matrix_get (cov->m, row, col);
88 gsl_matrix_set (cov->m, row, col, (val2->f - mean) * x + tmp);
89 gsl_matrix_set (cov->m, col, row, (val2->f - mean) * x + tmp);
92 column_iterate (struct design_matrix *cov, const struct variable *v,
93 double ssize, double x, const union value *val1, size_t row)
99 const union value *tmp_val;
101 col = design_matrix_var_to_column (cov, v);
102 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
105 y = -1.0 * cat_get_category_count (i, v) / ssize;
106 tmp_val = cat_subscript_to_value (i, v);
107 if (compare_values (tmp_val, val1, v))
111 tmp = gsl_matrix_get (cov->m, row, col);
112 gsl_matrix_set (cov->m, row, col, x * y + tmp);
113 gsl_matrix_set (cov->m, col, row, x * y + tmp);
117 Call this function in the second data pass. The central moments are
118 MEAN1 and MEAN2. Any categorical variables should already have their
119 values summarized in in its OBS_VALS element.
121 void covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
122 double ssize, const struct variable *v1,
123 const struct variable *v2, const union value *val1, const union value *val2)
129 const union value *tmp_val;
131 if (var_is_alpha (v1))
133 row = design_matrix_var_to_column (cov, v1);
134 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
137 x = -1.0 * cat_get_category_count (i, v1) / ssize;
138 tmp_val = cat_subscript_to_value (i, v1);
139 if (compare_values (tmp_val, val1, v1))
143 if (var_is_numeric (v2))
145 covariance_update_categorical_numeric (cov, mean2, row,
150 column_iterate (cov, v1, ssize, x, val1, row);
151 column_iterate (cov, v2, ssize, x, val2, row);
155 else if (var_is_alpha (v2))
158 Reverse the orders of V1, V2, etc. and put ourselves back
159 in the previous IF scope.
161 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
166 Both variables are numeric.
168 row = design_matrix_var_to_column (cov, v1);
169 col = design_matrix_var_to_column (cov, v2);
170 x = (val1->f - mean1) * (val2->f - mean2);
171 x += gsl_matrix_get (cov->m, col, row);
172 gsl_matrix_set (cov->m, row, col, x);
173 gsl_matrix_set (cov->m, col, row, x);
178 covariance_accumulator_hash (const void *h, const void *aux)
180 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
181 size_t *n_vars = (size_t *) aux;
184 const struct variable *v_min;
185 const struct variable *v_max;
186 const union value *val_min;
187 const union value *val_max;
190 Order everything by the variables' indices. This ensures we get the
191 same key regardless of the order in which the variables are stored
194 v_min = (var_get_dict_index (ca->v1) < var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
195 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
197 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
198 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
200 idx_min = var_get_dict_index (v_min);
201 idx_max = var_get_dict_index (v_max);
203 if (var_is_numeric (v_max) && var_is_numeric (v_min))
205 return (*n_vars * idx_max + idx_min);
207 if (var_is_numeric (v_max) && var_is_alpha (v_min))
209 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
211 if (var_is_alpha (v_max) && var_is_numeric (v_min))
213 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
215 if (var_is_alpha (v_max) && var_is_alpha (v_min))
218 char *x = xnmalloc (1 + var_get_width (v_max) + var_get_width (v_min), sizeof (*x));
219 strncpy (x, val_max->s, var_get_width (v_max));
220 strncat (x, val_min->s, var_get_width (v_min));
221 tmp = *n_vars * (*n_vars + 1 + idx_max)
223 + hsh_hash_string (x);
231 Make a hash table consisting of struct covariance_accumulators.
232 This allows the accumulation of the elements of a covariance matrix
233 in a single data pass. Call covariance_accumulate () for each case
237 covariance_hsh_create (size_t n_vars)
239 return hsh_create (n_vars * (n_vars + 1) / 2, covariance_accumulator_compare,
240 covariance_accumulator_hash, covariance_accumulator_free, &n_vars);
244 covariance_accumulator_free (void *c_, const void *aux UNUSED)
246 struct covariance_accumulator *c = c_;
251 match_nodes (const struct covariance_accumulator *c, const struct variable *v1,
252 const struct variable *v2, const union value *val1,
253 const union value *val2)
255 if (var_get_dict_index (v1) == var_get_dict_index (c->v1) &&
256 var_get_dict_index (v2) == var_get_dict_index (c->v2))
258 if (var_is_numeric (v1) && var_is_numeric (v2))
262 if (var_is_numeric (v1) && var_is_alpha (v2))
264 if (compare_values (val2, c->val2, v2))
269 if (var_is_alpha (v1) && var_is_numeric (v2))
271 if (compare_values (val1, c->val1, v1))
276 if (var_is_alpha (v1) && var_is_alpha (v2))
278 if (compare_values (val1, c->val1, v1))
280 if (compare_values (val2, c->val2, v2))
287 else if (v2 == c->v1 && v1 == c->v2)
289 return -match_nodes (c, v2, v1, val2, val1);
295 This function is meant to be used as a comparison function for
296 a struct hsh_table in src/libpspp/hash.c.
299 covariance_accumulator_compare (const void *a1_, const void *a2_, const void *aux UNUSED)
301 const struct covariance_accumulator *a1 = a1_;
302 const struct covariance_accumulator *a2 = a2_;
304 if (a1 == NULL && a2 == NULL)
307 if (a1 == NULL || a2 == NULL)
310 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
314 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
315 const union value *val, size_t n_vars)
317 unsigned int result = -1u;
318 if (var_is_numeric (v1) && var_is_alpha (v2))
320 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
321 + var_get_dict_index (v2) + hsh_hash_string (val->s);
323 else if (var_is_alpha (v1) && var_is_numeric (v2))
325 result = hash_numeric_alpha (v2, v1, val, n_vars);
332 update_product (const struct variable *v1, const struct variable *v2, const union value *val1,
333 const union value *val2)
337 assert (val1 != NULL);
338 assert (val2 != NULL);
339 if (var_is_alpha (v1) && var_is_alpha (v2))
343 if (var_is_numeric (v1) && var_is_numeric (v2))
345 return (val1->f * val2->f);
347 if (var_is_numeric (v1) && var_is_alpha (v2))
351 if (var_is_numeric (v2) && var_is_alpha (v1))
353 update_product (v2, v1, val2, val1);
358 Compute the covariance matrix in a single data-pass.
361 covariance_accumulate (struct hsh_table *cov, struct moments1 **m,
362 const struct ccase *ccase, const struct variable **vars,
367 const union value *val;
368 struct covariance_accumulator *ca;
369 struct covariance_accumulator *entry;
373 for (i = 0; i < n_vars; ++i)
375 val = case_data (ccase, vars[i]);
376 if (var_is_alpha (vars[i]))
378 cat_value_update (vars[i], val);
382 moments1_add (m[i], val->f, 1.0);
384 for (j = i; j < n_vars; j++)
386 ca = xmalloc (sizeof (*ca));
390 ca->val2 = case_data (ccase, ca->v2);
391 ca->product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
392 entry = hsh_insert (cov, ca);
395 entry->product += ca->product;
397 If ENTRY is null, CA was not already in the hash
398 hable, so we don't free it because it was just inserted.
399 If ENTRY was not null, CA is already in the hash table.
400 Unnecessary now, it must be freed here.
409 covariance_matrix_insert (struct design_matrix *cov, const struct variable *v1,
410 const struct variable *v2, const union value *val1,
411 const union value *val2, double product)
416 const union value *tmp_val;
418 assert (cov != NULL);
420 row = design_matrix_var_to_column (cov, v1);
421 if (var_is_alpha (v1))
424 tmp_val = cat_subscript_to_value (i, v1);
425 while (!compare_values (tmp_val, val1, v1))
428 tmp_val = cat_subscript_to_value (i, v1);
431 if (var_is_numeric (v2))
433 col = design_matrix_var_to_column (cov, v2);
437 col = design_matrix_var_to_column (cov, v2);
439 tmp_val = cat_subscript_to_value (i, v1);
440 while (!compare_values (tmp_val, val1, v1))
443 tmp_val = cat_subscript_to_value (i, v1);
450 if (var_is_numeric (v2))
452 col = design_matrix_var_to_column (cov, v2);
456 covariance_matrix_insert (cov, v2, v1, val2, val1, product);
459 gsl_matrix_set (cov->m, row, col, product);
460 gsl_matrix_set (cov->m, col, row, product);
464 get_center (const struct variable *v, const union value *val,
465 const struct variable **vars, const struct moments1 **m, size_t n_vars,
470 while ((var_get_dict_index (vars[i]) != var_get_dict_index(v)) && (i < n_vars))
474 if (var_is_numeric (v))
477 moments1_calculate (m[i], NULL, &mean, NULL, NULL, NULL);
482 i = cat_value_find (v, val);
483 return (cat_get_category_count (i, v) / ssize);
489 Subtract the product of the means.
492 center_entry (const struct covariance_accumulator *ca, const struct variable **vars,
493 const struct moments1 **m, size_t n_vars, size_t ssize)
499 m1 = get_center (ca->v1, ca->val1, vars, m, n_vars, ssize);
500 m2 = get_center (ca->v2, ca->val2, vars, m, n_vars, ssize);
501 result = ca->product - ssize * m1 * m2;
506 The first moments in M should be stored in the order corresponding
507 to the order of VARS. So, for example, VARS[0] has its moments in
508 M[0], VARS[1] has its moments in M[1], etc.
510 struct design_matrix *
511 covariance_accumulator_to_matrix (struct hsh_table *cov, const struct moments1 **m,
512 const struct variable **vars, size_t n_vars, size_t ssize)
515 struct covariance_accumulator *entry;
516 struct design_matrix *result = NULL;
517 struct hsh_iterator iter;
519 result = covariance_matrix_create (n_vars, vars);
521 entry = hsh_first (cov, &iter);
523 while (entry != NULL)
526 We compute the centered, un-normalized covariance matrix.
528 tmp = center_entry (entry, vars, m, n_vars, ssize);
529 covariance_matrix_insert (result, entry->v1, entry->v2, entry->val1,
531 entry = hsh_next (cov, &iter);