1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/interaction.h>
30 #include <math/moments.h>
35 Structure used to accumulate the covariance matrix in a single data
36 pass. Before passing the data, we do not know how many categories
37 there are in each categorical variable. Therefore we do not know the
38 size of the covariance matrix. To get around this problem, we
39 accumulate the elements of the covariance matrix in pointers to
40 COVARIANC_ACCUMULATOR. These values are then used to populate
41 the covariance matrix.
43 struct covariance_accumulator
45 const struct variable *v1;
46 const struct variable *v2;
47 const union value *val1;
48 const union value *val2;
57 struct covariance_matrix
59 struct design_matrix *cov;
63 const struct variable **v_variables;
67 enum mv_class missing_value;
68 void (*accumulate) (struct covariance_matrix *, const struct ccase *);
69 void (*update_moments) (struct covariance_matrix *, size_t, double);
72 static struct hsh_table *covariance_hsh_create (size_t);
73 static hsh_hash_func covariance_accumulator_hash;
74 static unsigned int hash_numeric_alpha (const struct variable *,
75 const struct variable *,
76 const union value *, size_t);
77 static hsh_compare_func covariance_accumulator_compare;
78 static hsh_free_func covariance_accumulator_free;
79 static void update_moments1 (struct covariance_matrix *, size_t, double);
80 static void update_moments2 (struct covariance_matrix *, size_t, double);
81 static struct covariance_accumulator *get_new_covariance_accumulator (const
96 static void covariance_accumulate_listwise (struct covariance_matrix *,
97 const struct ccase *);
98 static void covariance_accumulate_pairwise (struct covariance_matrix *,
99 const struct ccase *);
101 struct covariance_matrix *
102 covariance_matrix_init (size_t n_variables,
103 const struct variable *v_variables[], int n_pass,
104 int missing_handling, enum mv_class missing_value)
107 struct covariance_matrix *result = NULL;
109 result = xmalloc (sizeof (*result));
111 result->ca = covariance_hsh_create (n_variables);
114 result->missing_handling = missing_handling;
115 result->missing_value = missing_value;
116 result->accumulate = (result->missing_handling == LISTWISE) ?
117 covariance_accumulate_listwise : covariance_accumulate_pairwise;
118 if (n_pass == ONE_PASS)
120 result->update_moments = update_moments1;
121 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
122 for (i = 0; i < n_variables; i++)
124 result->m1[i] = moments1_create (MOMENT_MEAN);
129 result->update_moments = update_moments2;
130 result->m = xnmalloc (n_variables, sizeof (*result->m));
131 for (i = 0; i < n_variables; i++)
133 result->m[i] = moments_create (MOMENT_MEAN);
136 result->v_variables = v_variables;
137 result->n_variables = n_variables;
138 result->n_pass = n_pass;
144 The covariances are stored in a DESIGN_MATRIX structure.
146 struct design_matrix *
147 covariance_matrix_create (size_t n_variables,
148 const struct variable *v_variables[])
150 return design_matrix_create (n_variables, v_variables,
151 (size_t) n_variables);
155 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
157 assert (cov->m1 != NULL);
158 moments1_add (cov->m1[i], x, 1.0);
162 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
164 assert (cov->m != NULL);
165 moments_pass_one (cov->m[i], x, 1.0);
169 covariance_matrix_destroy (struct covariance_matrix *cov)
173 assert (cov != NULL);
174 design_matrix_destroy (cov->cov);
175 hsh_destroy (cov->ca);
176 if (cov->n_pass == ONE_PASS)
178 for (i = 0; i < cov->n_variables; i++)
180 moments1_destroy (cov->m1[i]);
186 for (i = 0; i < cov->n_variables; i++)
188 moments_destroy (cov->m[i]);
195 Update the covariance matrix with the new entries, assuming that ROW
196 corresponds to a categorical variable and V2 is numeric.
199 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
201 const struct variable *v2, double x,
202 const union value *val2)
207 assert (var_is_numeric (v2));
209 col = design_matrix_var_to_column (cov, v2);
210 assert (val2 != NULL);
211 tmp = gsl_matrix_get (cov->m, row, col);
212 gsl_matrix_set (cov->m, row, col, (val2->f - mean) * x + tmp);
213 gsl_matrix_set (cov->m, col, row, (val2->f - mean) * x + tmp);
216 column_iterate (struct design_matrix *cov, const struct variable *v,
217 double ssize, double x, const union value *val1, size_t row)
223 const union value *tmp_val;
225 col = design_matrix_var_to_column (cov, v);
226 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
229 y = -1.0 * cat_get_category_count (i, v) / ssize;
230 tmp_val = cat_subscript_to_value (i, v);
231 if (compare_values_short (tmp_val, val1, v))
235 tmp = gsl_matrix_get (cov->m, row, col);
236 gsl_matrix_set (cov->m, row, col, x * y + tmp);
237 gsl_matrix_set (cov->m, col, row, x * y + tmp);
242 Call this function in the second data pass. The central moments are
243 MEAN1 and MEAN2. Any categorical variables should already have their
244 values summarized in in its OBS_VALS element.
247 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
248 double ssize, const struct variable *v1,
249 const struct variable *v2, const union value *val1,
250 const union value *val2)
256 const union value *tmp_val;
258 if (var_is_alpha (v1))
260 row = design_matrix_var_to_column (cov, v1);
261 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
264 x = -1.0 * cat_get_category_count (i, v1) / ssize;
265 tmp_val = cat_subscript_to_value (i, v1);
266 if (compare_values_short (tmp_val, val1, v1))
270 if (var_is_numeric (v2))
272 covariance_update_categorical_numeric (cov, mean2, row,
277 column_iterate (cov, v1, ssize, x, val1, row);
278 column_iterate (cov, v2, ssize, x, val2, row);
282 else if (var_is_alpha (v2))
285 Reverse the orders of V1, V2, etc. and put ourselves back
286 in the previous IF scope.
288 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
293 Both variables are numeric.
295 row = design_matrix_var_to_column (cov, v1);
296 col = design_matrix_var_to_column (cov, v2);
297 x = (val1->f - mean1) * (val2->f - mean2);
298 x += gsl_matrix_get (cov->m, col, row);
299 gsl_matrix_set (cov->m, row, col, x);
300 gsl_matrix_set (cov->m, col, row, x);
305 covariance_accumulator_hash (const void *h, const void *aux)
307 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
308 size_t *n_vars = (size_t *) aux;
311 const struct variable *v_min;
312 const struct variable *v_max;
313 const union value *val_min;
314 const union value *val_max;
317 Order everything by the variables' indices. This ensures we get the
318 same key regardless of the order in which the variables are stored
322 (var_get_dict_index (ca->v1) <
323 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
324 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
326 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
327 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
329 idx_min = var_get_dict_index (v_min);
330 idx_max = var_get_dict_index (v_max);
332 if (var_is_numeric (v_max) && var_is_numeric (v_min))
334 return (*n_vars * idx_max + idx_min);
336 if (var_is_numeric (v_max) && var_is_alpha (v_min))
338 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
340 if (var_is_alpha (v_max) && var_is_numeric (v_min))
342 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
344 if (var_is_alpha (v_max) && var_is_alpha (v_min))
348 xnmalloc (1 + var_get_width (v_max) + var_get_width (v_min),
350 strncpy (x, val_max->s, var_get_width (v_max));
351 strncat (x, val_min->s, var_get_width (v_min));
352 tmp = *n_vars * (*n_vars + 1 + idx_max) + idx_min + hsh_hash_string (x);
360 Make a hash table consisting of struct covariance_accumulators.
361 This allows the accumulation of the elements of a covariance matrix
362 in a single data pass. Call covariance_accumulate () for each case
365 static struct hsh_table *
366 covariance_hsh_create (size_t n_vars)
368 return hsh_create (n_vars * n_vars, covariance_accumulator_compare,
369 covariance_accumulator_hash, covariance_accumulator_free,
374 covariance_accumulator_free (void *c_, const void *aux UNUSED)
376 struct covariance_accumulator *c = c_;
382 Hash comparison. Returns 0 for a match, or a non-zero int
383 otherwise. The sign of a non-zero return value *should* indicate the
384 position of C relative to the covariance_accumulator described by
385 the other arguments. But for now, it just returns 1 for any
386 non-match. This should be changed when someone figures out how to
387 compute a sensible sign for the return value.
390 match_nodes (const struct covariance_accumulator *c,
391 const struct variable *v1, const struct variable *v2,
392 const union value *val1, const union value *val2)
394 if (var_get_dict_index (v1) == var_get_dict_index (c->v1))
395 if (var_get_dict_index (v2) == var_get_dict_index (c->v2))
397 if (var_is_numeric (v1) && var_is_numeric (v2))
401 if (var_is_numeric (v1) && var_is_alpha (v2))
403 if (!compare_values_short (val2, c->val2, v2))
408 if (var_is_alpha (v1) && var_is_numeric (v2))
410 if (!compare_values_short (val1, c->val1, v1))
415 if (var_is_alpha (v1) && var_is_alpha (v2))
417 if (compare_values_short (val1, c->val1, v1))
419 if (!compare_values_short (val2, c->val2, v2))
430 This function is meant to be used as a comparison function for
431 a struct hsh_table in src/libpspp/hash.c.
434 covariance_accumulator_compare (const void *a1_, const void *a2_,
435 const void *aux UNUSED)
437 const struct covariance_accumulator *a1 = a1_;
438 const struct covariance_accumulator *a2 = a2_;
440 if (a1 == NULL && a2 == NULL)
443 if (a1 == NULL || a2 == NULL)
446 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
450 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
451 const union value *val, size_t n_vars)
453 unsigned int result = -1u;
454 if (var_is_numeric (v1) && var_is_alpha (v2))
456 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
457 + var_get_dict_index (v2) + hsh_hash_string (val->s);
459 else if (var_is_alpha (v1) && var_is_numeric (v2))
461 result = hash_numeric_alpha (v2, v1, val, n_vars);
468 update_product (const struct variable *v1, const struct variable *v2,
469 const union value *val1, const union value *val2)
473 assert (val1 != NULL);
474 assert (val2 != NULL);
475 if (var_is_alpha (v1) && var_is_alpha (v2))
479 if (var_is_numeric (v1) && var_is_numeric (v2))
481 return (val1->f * val2->f);
483 if (var_is_numeric (v1) && var_is_alpha (v2))
487 if (var_is_numeric (v2) && var_is_alpha (v1))
489 update_product (v2, v1, val2, val1);
494 update_sum (const struct variable *var, const union value *val)
496 assert (var != NULL);
497 assert (val != NULL);
498 if (var_is_alpha (var))
504 static struct covariance_accumulator *
505 get_new_covariance_accumulator (const struct variable *v1,
506 const struct variable *v2,
507 const union value *val1,
508 const union value *val2)
510 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
512 struct covariance_accumulator *ca;
513 ca = xmalloc (sizeof (*ca));
523 static const struct variable **
524 get_covariance_variables (const struct covariance_matrix *cov)
526 return cov->v_variables;
530 update_hash_entry (struct hsh_table *c,
531 const struct variable *v1,
532 const struct variable *v2,
533 const union value *val1, const union value *val2)
535 struct covariance_accumulator *ca;
536 struct covariance_accumulator *new_entry;
539 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
540 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
541 ca->sum1 = update_sum (ca->v1, ca->val1);
542 ca->sum2 = update_sum (ca->v2, ca->val2);
544 new_entry = hsh_insert (c, ca);
545 if (new_entry != NULL)
547 new_entry->dot_product += ca->dot_product;
548 new_entry->ssize += 1.0;
549 new_entry->sum1 += ca->sum1;
550 new_entry->sum2 += ca->sum2;
552 If DOT_PRODUCT is null, CA was not already in the hash
553 hable, so we don't free it because it was just inserted.
554 If DOT_PRODUCT was not null, CA is already in the hash table.
555 Unnecessary now, it must be freed here.
562 Compute the covariance matrix in a single data-pass. Cases with
563 missing values are dropped pairwise, in other words, only if one of
564 the two values necessary to accumulate the inner product is missing.
566 Do not call this function directly. Call it through the struct
567 covariance_matrix ACCUMULATE member function, for example,
568 cov->accumulate (cov, ccase).
571 covariance_accumulate_pairwise (struct covariance_matrix *cov,
572 const struct ccase *ccase)
576 const union value *val1;
577 const union value *val2;
578 const struct variable **v_variables;
580 assert (cov != NULL);
581 assert (ccase != NULL);
583 v_variables = get_covariance_variables (cov);
584 assert (v_variables != NULL);
586 for (i = 0; i < cov->n_variables; ++i)
588 val1 = case_data (ccase, v_variables[i]);
589 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
591 cat_value_update (v_variables[i], val1);
592 if (var_is_numeric (v_variables[i]))
593 cov->update_moments (cov, i, val1->f);
595 for (j = i; j < cov->n_variables; j++)
597 val2 = case_data (ccase, v_variables[j]);
598 if (!var_is_value_missing
599 (v_variables[j], val2, cov->missing_value))
601 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
604 update_hash_entry (cov->ca, v_variables[j],
605 v_variables[i], val2, val1);
613 Compute the covariance matrix in a single data-pass. Cases with
614 missing values are dropped listwise. In other words, if one of the
615 values for any variable in a case is missing, the entire case is
618 The caller must use a casefilter to remove the cases with missing
619 values before calling covariance_accumulate_listwise. This function
620 assumes that CCASE has already passed through this filter, and
621 contains no missing values.
623 Do not call this function directly. Call it through the struct
624 covariance_matrix ACCUMULATE member function, for example,
625 cov->accumulate (cov, ccase).
628 covariance_accumulate_listwise (struct covariance_matrix *cov,
629 const struct ccase *ccase)
633 const union value *val1;
634 const union value *val2;
635 const struct variable **v_variables;
637 assert (cov != NULL);
638 assert (ccase != NULL);
640 v_variables = get_covariance_variables (cov);
641 assert (v_variables != NULL);
643 for (i = 0; i < cov->n_variables; ++i)
645 val1 = case_data (ccase, v_variables[i]);
646 cat_value_update (v_variables[i], val1);
647 if (var_is_numeric (v_variables[i]))
648 cov->update_moments (cov, i, val1->f);
650 for (j = i; j < cov->n_variables; j++)
652 val2 = case_data (ccase, v_variables[j]);
653 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
656 update_hash_entry (cov->ca, v_variables[j], v_variables[i],
663 Call this function during the data pass. Each case will be added to
664 a hash containing all values of the covariance matrix. After the
665 data have been passed, call covariance_matrix_compute to put the
666 values in the struct covariance_matrix.
669 covariance_matrix_accumulate (struct covariance_matrix *cov,
670 const struct ccase *ccase)
672 cov->accumulate (cov, ccase);
676 covariance_matrix_insert (struct design_matrix *cov,
677 const struct variable *v1,
678 const struct variable *v2, const union value *val1,
679 const union value *val2, double product)
684 const union value *tmp_val;
686 assert (cov != NULL);
688 row = design_matrix_var_to_column (cov, v1);
689 if (var_is_alpha (v1))
692 tmp_val = cat_subscript_to_value (i, v1);
693 while (!compare_values_short (tmp_val, val1, v1))
696 tmp_val = cat_subscript_to_value (i, v1);
699 if (var_is_numeric (v2))
701 col = design_matrix_var_to_column (cov, v2);
705 col = design_matrix_var_to_column (cov, v2);
707 tmp_val = cat_subscript_to_value (i, v1);
708 while (!compare_values_short (tmp_val, val1, v1))
711 tmp_val = cat_subscript_to_value (i, v1);
718 if (var_is_numeric (v2))
720 col = design_matrix_var_to_column (cov, v2);
724 covariance_matrix_insert (cov, v2, v1, val2, val1, product);
727 gsl_matrix_set (cov->m, row, col, product);
730 static struct design_matrix *
731 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
734 struct covariance_accumulator *entry;
735 struct design_matrix *result = NULL;
736 struct hsh_iterator iter;
738 result = covariance_matrix_create (cov->n_variables, cov->v_variables);
740 entry = hsh_first (cov->ca, &iter);
742 while (entry != NULL)
745 We compute the centered, un-normalized covariance matrix.
747 tmp = entry->dot_product - entry->sum1 * entry->sum2 / entry->ssize;
748 covariance_matrix_insert (result, entry->v1, entry->v2, entry->val1,
750 entry = hsh_next (cov->ca, &iter);
757 Call this function after passing the data.
760 covariance_matrix_compute (struct covariance_matrix *cov)
762 if (cov->n_pass == ONE_PASS)
764 cov->cov = covariance_accumulator_to_matrix (cov);
768 struct design_matrix *
769 covariance_to_design (const struct covariance_matrix *c)