1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
62 const struct variable **v_variables;
66 enum mv_class missing_value;
67 void (*accumulate) (struct covariance_matrix *, const struct ccase *);
68 void (*update_moments) (struct covariance_matrix *, size_t, double);
71 static struct hsh_table *covariance_hsh_create (size_t);
72 static hsh_hash_func covariance_accumulator_hash;
73 static unsigned int hash_numeric_alpha (const struct variable *,
74 const struct variable *,
75 const union value *, size_t);
76 static hsh_compare_func covariance_accumulator_compare;
77 static hsh_free_func covariance_accumulator_free;
78 static void update_moments1 (struct covariance_matrix *, size_t, double);
79 static void update_moments2 (struct covariance_matrix *, size_t, double);
80 static struct covariance_accumulator *get_new_covariance_accumulator (const
95 static void covariance_accumulate_listwise (struct covariance_matrix *,
96 const struct ccase *);
97 static void covariance_accumulate_pairwise (struct covariance_matrix *,
98 const struct ccase *);
100 struct covariance_matrix *
101 covariance_matrix_init (size_t n_variables,
102 const struct variable *v_variables[], int n_pass,
103 int missing_handling, enum mv_class missing_value)
106 struct covariance_matrix *result = NULL;
108 result = xmalloc (sizeof (*result));
110 result->ca = covariance_hsh_create (n_variables);
113 result->missing_handling = missing_handling;
114 result->missing_value = missing_value;
115 result->accumulate = (result->missing_handling == LISTWISE) ?
116 covariance_accumulate_listwise : covariance_accumulate_pairwise;
117 if (n_pass == ONE_PASS)
119 result->update_moments = update_moments1;
120 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
121 for (i = 0; i < n_variables; i++)
123 result->m1[i] = moments1_create (MOMENT_MEAN);
128 result->update_moments = update_moments2;
129 result->m = xnmalloc (n_variables, sizeof (*result->m));
130 for (i = 0; i < n_variables; i++)
132 result->m[i] = moments_create (MOMENT_MEAN);
135 result->v_variables = v_variables;
136 result->n_variables = n_variables;
137 result->n_pass = n_pass;
143 The covariances are stored in a DESIGN_MATRIX structure.
145 struct design_matrix *
146 covariance_matrix_create (size_t n_variables,
147 const struct variable *v_variables[])
149 return design_matrix_create (n_variables, v_variables,
150 (size_t) n_variables);
154 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
156 assert (cov->m1 != NULL);
157 moments1_add (cov->m1[i], x, 1.0);
161 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
163 assert (cov->m != NULL);
164 moments_pass_one (cov->m[i], x, 1.0);
168 covariance_matrix_destroy (struct covariance_matrix *cov)
172 assert (cov != NULL);
173 design_matrix_destroy (cov->cov);
174 hsh_destroy (cov->ca);
175 if (cov->n_pass == ONE_PASS)
177 for (i = 0; i < cov->n_variables; i++)
179 moments1_destroy (cov->m1[i]);
185 for (i = 0; i < cov->n_variables; i++)
187 moments_destroy (cov->m[i]);
194 Update the covariance matrix with the new entries, assuming that ROW
195 corresponds to a categorical variable and V2 is numeric.
198 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
200 const struct variable *v2, double x,
201 const union value *val2)
206 assert (var_is_numeric (v2));
208 col = design_matrix_var_to_column (cov, v2);
209 assert (val2 != NULL);
210 tmp = gsl_matrix_get (cov->m, row, col);
211 gsl_matrix_set (cov->m, row, col, (val2->f - mean) * x + tmp);
212 gsl_matrix_set (cov->m, col, row, (val2->f - mean) * x + tmp);
215 column_iterate (struct design_matrix *cov, const struct variable *v,
216 double ssize, double x, const union value *val1, size_t row)
222 const union value *tmp_val;
224 col = design_matrix_var_to_column (cov, v);
225 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
228 y = -1.0 * cat_get_category_count (i, v) / ssize;
229 tmp_val = cat_subscript_to_value (i, v);
230 if (compare_values (tmp_val, val1, v))
234 tmp = gsl_matrix_get (cov->m, row, col);
235 gsl_matrix_set (cov->m, row, col, x * y + tmp);
236 gsl_matrix_set (cov->m, col, row, x * y + tmp);
241 Call this function in the second data pass. The central moments are
242 MEAN1 and MEAN2. Any categorical variables should already have their
243 values summarized in in its OBS_VALS element.
246 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
247 double ssize, const struct variable *v1,
248 const struct variable *v2, const union value *val1,
249 const union value *val2)
255 const union value *tmp_val;
257 if (var_is_alpha (v1))
259 row = design_matrix_var_to_column (cov, v1);
260 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
263 x = -1.0 * cat_get_category_count (i, v1) / ssize;
264 tmp_val = cat_subscript_to_value (i, v1);
265 if (compare_values (tmp_val, val1, v1))
269 if (var_is_numeric (v2))
271 covariance_update_categorical_numeric (cov, mean2, row,
276 column_iterate (cov, v1, ssize, x, val1, row);
277 column_iterate (cov, v2, ssize, x, val2, row);
281 else if (var_is_alpha (v2))
284 Reverse the orders of V1, V2, etc. and put ourselves back
285 in the previous IF scope.
287 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
292 Both variables are numeric.
294 row = design_matrix_var_to_column (cov, v1);
295 col = design_matrix_var_to_column (cov, v2);
296 x = (val1->f - mean1) * (val2->f - mean2);
297 x += gsl_matrix_get (cov->m, col, row);
298 gsl_matrix_set (cov->m, row, col, x);
299 gsl_matrix_set (cov->m, col, row, x);
304 covariance_accumulator_hash (const void *h, const void *aux)
306 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
307 size_t *n_vars = (size_t *) aux;
310 const struct variable *v_min;
311 const struct variable *v_max;
312 const union value *val_min;
313 const union value *val_max;
316 Order everything by the variables' indices. This ensures we get the
317 same key regardless of the order in which the variables are stored
321 (var_get_dict_index (ca->v1) <
322 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
323 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
325 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
326 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
328 idx_min = var_get_dict_index (v_min);
329 idx_max = var_get_dict_index (v_max);
331 if (var_is_numeric (v_max) && var_is_numeric (v_min))
333 return (*n_vars * idx_max + idx_min);
335 if (var_is_numeric (v_max) && var_is_alpha (v_min))
337 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
339 if (var_is_alpha (v_max) && var_is_numeric (v_min))
341 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
343 if (var_is_alpha (v_max) && var_is_alpha (v_min))
347 xnmalloc (1 + var_get_width (v_max) + var_get_width (v_min),
349 strncpy (x, val_max->s, var_get_width (v_max));
350 strncat (x, val_min->s, var_get_width (v_min));
351 tmp = *n_vars * (*n_vars + 1 + idx_max) + idx_min + hsh_hash_string (x);
359 Make a hash table consisting of struct covariance_accumulators.
360 This allows the accumulation of the elements of a covariance matrix
361 in a single data pass. Call covariance_accumulate () for each case
364 static struct hsh_table *
365 covariance_hsh_create (size_t n_vars)
367 return hsh_create (n_vars * n_vars, covariance_accumulator_compare,
368 covariance_accumulator_hash, covariance_accumulator_free,
373 covariance_accumulator_free (void *c_, const void *aux UNUSED)
375 struct covariance_accumulator *c = c_;
381 Hash comparison. Returns 0 for a match, or a non-zero int
382 otherwise. The sign of a non-zero return value *should* indicate the
383 position of C relative to the covariance_accumulator described by
384 the other arguments. But for now, it just returns 1 for any
385 non-match. This should be changed when someone figures out how to
386 compute a sensible sign for the return value.
389 match_nodes (const struct covariance_accumulator *c,
390 const struct variable *v1, const struct variable *v2,
391 const union value *val1, const union value *val2)
393 if (var_get_dict_index (v1) == var_get_dict_index (c->v1))
394 if (var_get_dict_index (v2) == var_get_dict_index (c->v2))
396 if (var_is_numeric (v1) && var_is_numeric (v2))
400 if (var_is_numeric (v1) && var_is_alpha (v2))
402 if (compare_values (val2, c->val2, v2))
407 if (var_is_alpha (v1) && var_is_numeric (v2))
409 if (compare_values (val1, c->val1, v1))
414 if (var_is_alpha (v1) && var_is_alpha (v2))
416 if (compare_values (val1, c->val1, v1))
418 if (compare_values (val2, c->val2, v2))
429 This function is meant to be used as a comparison function for
430 a struct hsh_table in src/libpspp/hash.c.
433 covariance_accumulator_compare (const void *a1_, const void *a2_,
434 const void *aux UNUSED)
436 const struct covariance_accumulator *a1 = a1_;
437 const struct covariance_accumulator *a2 = a2_;
439 if (a1 == NULL && a2 == NULL)
442 if (a1 == NULL || a2 == NULL)
445 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
449 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
450 const union value *val, size_t n_vars)
452 unsigned int result = -1u;
453 if (var_is_numeric (v1) && var_is_alpha (v2))
455 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
456 + var_get_dict_index (v2) + hsh_hash_string (val->s);
458 else if (var_is_alpha (v1) && var_is_numeric (v2))
460 result = hash_numeric_alpha (v2, v1, val, n_vars);
467 update_product (const struct variable *v1, const struct variable *v2,
468 const union value *val1, const union value *val2)
472 assert (val1 != NULL);
473 assert (val2 != NULL);
474 if (var_is_alpha (v1) && var_is_alpha (v2))
478 if (var_is_numeric (v1) && var_is_numeric (v2))
480 return (val1->f * val2->f);
482 if (var_is_numeric (v1) && var_is_alpha (v2))
486 if (var_is_numeric (v2) && var_is_alpha (v1))
488 update_product (v2, v1, val2, val1);
493 update_sum (const struct variable *var, const union value *val)
495 assert (var != NULL);
496 assert (val != NULL);
497 if (var_is_alpha (var))
503 static struct covariance_accumulator *
504 get_new_covariance_accumulator (const struct variable *v1,
505 const struct variable *v2,
506 const union value *val1,
507 const union value *val2)
509 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
511 struct covariance_accumulator *ca;
512 ca = xmalloc (sizeof (*ca));
522 static const struct variable **
523 get_covariance_variables (const struct covariance_matrix *cov)
525 return cov->v_variables;
529 update_hash_entry (struct hsh_table *c,
530 const struct variable *v1,
531 const struct variable *v2,
532 const union value *val1, const union value *val2)
534 struct covariance_accumulator *ca;
535 struct covariance_accumulator *new_entry;
538 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
539 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
540 ca->sum1 = update_sum (ca->v1, ca->val1);
541 ca->sum2 = update_sum (ca->v2, ca->val2);
543 new_entry = hsh_insert (c, ca);
544 if (new_entry != NULL)
546 new_entry->dot_product += ca->dot_product;
547 new_entry->ssize += 1.0;
548 new_entry->sum1 += ca->sum1;
549 new_entry->sum2 += ca->sum2;
551 If DOT_PRODUCT is null, CA was not already in the hash
552 hable, so we don't free it because it was just inserted.
553 If DOT_PRODUCT was not null, CA is already in the hash table.
554 Unnecessary now, it must be freed here.
561 Compute the covariance matrix in a single data-pass. Cases with
562 missing values are dropped pairwise, in other words, only if one of
563 the two values necessary to accumulate the inner product is missing.
565 Do not call this function directly. Call it through the struct
566 covariance_matrix ACCUMULATE member function, for example,
567 cov->accumulate (cov, ccase).
570 covariance_accumulate_pairwise (struct covariance_matrix *cov,
571 const struct ccase *ccase)
575 const union value *val1;
576 const union value *val2;
577 const struct variable **v_variables;
579 assert (cov != NULL);
580 assert (ccase != NULL);
582 v_variables = get_covariance_variables (cov);
583 assert (v_variables != NULL);
585 for (i = 0; i < cov->n_variables; ++i)
587 val1 = case_data (ccase, v_variables[i]);
588 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
590 cat_value_update (v_variables[i], val1);
591 if (var_is_alpha (v_variables[i]))
592 cov->update_moments (cov, i, val1->f);
594 for (j = i; j < cov->n_variables; j++)
596 val2 = case_data (ccase, v_variables[j]);
597 if (!var_is_value_missing
598 (v_variables[j], val2, cov->missing_value))
600 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
603 update_hash_entry (cov->ca, v_variables[j],
604 v_variables[i], val2, val1);
612 Compute the covariance matrix in a single data-pass. Cases with
613 missing values are dropped listwise. In other words, if one of the
614 values for any variable in a case is missing, the entire case is
617 The caller must use a casefilter to remove the cases with missing
618 values before calling covariance_accumulate_listwise. This function
619 assumes that CCASE has already passed through this filter, and
620 contains no missing values.
622 Do not call this function directly. Call it through the struct
623 covariance_matrix ACCUMULATE member function, for example,
624 cov->accumulate (cov, ccase).
627 covariance_accumulate_listwise (struct covariance_matrix *cov,
628 const struct ccase *ccase)
632 const union value *val1;
633 const union value *val2;
634 const struct variable **v_variables;
636 assert (cov != NULL);
637 assert (ccase != NULL);
639 v_variables = get_covariance_variables (cov);
640 assert (v_variables != NULL);
642 for (i = 0; i < cov->n_variables; ++i)
644 val1 = case_data (ccase, v_variables[i]);
645 cat_value_update (v_variables[i], val1);
646 if (var_is_alpha (v_variables[i]))
647 cov->update_moments (cov, i, val1->f);
649 for (j = i; j < cov->n_variables; j++)
651 val2 = case_data (ccase, v_variables[j]);
652 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
655 update_hash_entry (cov->ca, v_variables[j], v_variables[i],
662 Call this function during the data pass. Each case will be added to
663 a hash containing all values of the covariance matrix. After the
664 data have been passed, call covariance_matrix_compute to put the
665 values in the struct covariance_matrix.
668 covariance_matrix_accumulate (struct covariance_matrix *cov,
669 const struct ccase *ccase)
671 cov->accumulate (cov, ccase);
675 covariance_matrix_insert (struct design_matrix *cov,
676 const struct variable *v1,
677 const struct variable *v2, const union value *val1,
678 const union value *val2, double product)
683 const union value *tmp_val;
685 assert (cov != NULL);
687 row = design_matrix_var_to_column (cov, v1);
688 if (var_is_alpha (v1))
691 tmp_val = cat_subscript_to_value (i, v1);
692 while (!compare_values (tmp_val, val1, v1))
695 tmp_val = cat_subscript_to_value (i, v1);
698 if (var_is_numeric (v2))
700 col = design_matrix_var_to_column (cov, v2);
704 col = design_matrix_var_to_column (cov, v2);
706 tmp_val = cat_subscript_to_value (i, v1);
707 while (!compare_values (tmp_val, val1, v1))
710 tmp_val = cat_subscript_to_value (i, v1);
717 if (var_is_numeric (v2))
719 col = design_matrix_var_to_column (cov, v2);
723 covariance_matrix_insert (cov, v2, v1, val2, val1, product);
726 gsl_matrix_set (cov->m, row, col, product);
729 static struct design_matrix *
730 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
733 struct covariance_accumulator *entry;
734 struct design_matrix *result = NULL;
735 struct hsh_iterator iter;
737 result = covariance_matrix_create (cov->n_variables, cov->v_variables);
739 entry = hsh_first (cov->ca, &iter);
741 while (entry != NULL)
744 We compute the centered, un-normalized covariance matrix.
746 tmp = entry->dot_product - entry->sum1 * entry->sum2 / entry->ssize;
747 covariance_matrix_insert (result, entry->v1, entry->v2, entry->val1,
749 entry = hsh_next (cov->ca, &iter);
756 Call this function after passing the data.
759 covariance_matrix_compute (struct covariance_matrix *cov)
761 if (cov->n_pass == ONE_PASS)
763 cov->cov = covariance_accumulator_to_matrix (cov);
767 struct design_matrix *
768 covariance_to_design (const struct covariance_matrix *c)