1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
59 struct design_matrix *ssize;
60 struct design_matrix *sums;
64 const struct variable **v_variables;
68 enum mv_class missing_value;
69 void (*accumulate) (struct covariance_matrix *, const struct ccase *,
70 const struct interaction_variable **, size_t);
71 void (*update_moments) (struct covariance_matrix *, size_t, double);
76 static struct hsh_table *covariance_hsh_create (size_t *);
77 static hsh_hash_func covariance_accumulator_hash;
78 static unsigned int hash_numeric_alpha (const struct variable *,
79 const struct variable *,
80 const union value *, size_t);
81 static hsh_compare_func covariance_accumulator_compare;
82 static hsh_free_func covariance_accumulator_free;
83 static void update_moments1 (struct covariance_matrix *, size_t, double);
84 static void update_moments2 (struct covariance_matrix *, size_t, double);
85 static struct covariance_accumulator *get_new_covariance_accumulator (const
100 static void covariance_accumulate_listwise (struct covariance_matrix *,
101 const struct ccase *,
102 const struct interaction_variable **,
104 static void covariance_accumulate_pairwise (struct covariance_matrix *,
105 const struct ccase *,
106 const struct interaction_variable **,
109 struct covariance_matrix *
110 covariance_matrix_init (size_t n_variables,
111 const struct variable *v_variables[], int n_pass,
112 int missing_handling, enum mv_class missing_value)
115 struct covariance_matrix *result = NULL;
117 result = xmalloc (sizeof (*result));
119 result->n_variables = n_variables;
120 result->ca = covariance_hsh_create (&result->n_variables);
123 result->missing_handling = missing_handling;
124 result->missing_value = missing_value;
125 result->accumulate = (result->missing_handling == LISTWISE) ?
126 covariance_accumulate_listwise : covariance_accumulate_pairwise;
127 if (n_pass == ONE_PASS)
129 result->update_moments = update_moments1;
130 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
131 for (i = 0; i < n_variables; i++)
133 result->m1[i] = moments1_create (MOMENT_MEAN);
138 result->update_moments = update_moments2;
139 result->m = xnmalloc (n_variables, sizeof (*result->m));
140 for (i = 0; i < n_variables; i++)
142 result->m[i] = moments_create (MOMENT_MEAN);
145 result->v_variables = v_variables;
147 result->n_pass = n_pass;
153 The covariances are stored in a DESIGN_MATRIX structure.
155 struct design_matrix *
156 covariance_matrix_create (size_t n_variables,
157 const struct variable *v_variables[])
159 return design_matrix_create (n_variables, v_variables,
160 (size_t) n_variables);
164 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
166 assert (cov->m1 != NULL);
167 moments1_add (cov->m1[i], x, 1.0);
171 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
173 assert (cov->m != NULL);
174 moments_pass_one (cov->m[i], x, 1.0);
178 covariance_matrix_destroy (struct covariance_matrix *cov)
182 assert (cov != NULL);
183 design_matrix_destroy (cov->cov);
184 design_matrix_destroy (cov->ssize);
185 design_matrix_destroy (cov->sums);
186 hsh_destroy (cov->ca);
187 if (cov->n_pass == ONE_PASS)
189 for (i = 0; i < cov->n_variables; i++)
191 moments1_destroy (cov->m1[i]);
197 for (i = 0; i < cov->n_variables; i++)
199 moments_destroy (cov->m[i]);
206 Update the covariance matrix with the new entries, assuming that ROW
207 corresponds to a categorical variable and V2 is numeric.
210 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
212 const struct variable *v2, double x,
213 const union value *val2)
218 assert (var_is_numeric (v2));
220 col = design_matrix_var_to_column (cov, v2);
221 assert (val2 != NULL);
222 tmp = gsl_matrix_get (cov->m, row, col);
223 gsl_matrix_set (cov->m, row, col, (val2->f - mean) * x + tmp);
224 gsl_matrix_set (cov->m, col, row, (val2->f - mean) * x + tmp);
227 column_iterate (struct design_matrix *cov, const struct variable *v,
228 double ssize, double x, const union value *val1, size_t row)
234 const union value *tmp_val;
236 col = design_matrix_var_to_column (cov, v);
237 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
240 y = -1.0 * cat_get_category_count (i, v) / ssize;
241 tmp_val = cat_subscript_to_value (i, v);
242 if (!compare_values_short (tmp_val, val1, v))
246 tmp = gsl_matrix_get (cov->m, row, col);
247 gsl_matrix_set (cov->m, row, col, x * y + tmp);
248 gsl_matrix_set (cov->m, col, row, x * y + tmp);
253 Call this function in the second data pass. The central moments are
254 MEAN1 and MEAN2. Any categorical variables should already have their
255 values summarized in in its OBS_VALS element.
258 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
259 double ssize, const struct variable *v1,
260 const struct variable *v2, const union value *val1,
261 const union value *val2)
267 const union value *tmp_val;
269 if (var_is_alpha (v1))
271 row = design_matrix_var_to_column (cov, v1);
272 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
275 x = -1.0 * cat_get_category_count (i, v1) / ssize;
276 tmp_val = cat_subscript_to_value (i, v1);
277 if (!compare_values_short (tmp_val, val1, v1))
281 if (var_is_numeric (v2))
283 covariance_update_categorical_numeric (cov, mean2, row,
288 column_iterate (cov, v1, ssize, x, val1, row);
289 column_iterate (cov, v2, ssize, x, val2, row);
293 else if (var_is_alpha (v2))
296 Reverse the orders of V1, V2, etc. and put ourselves back
297 in the previous IF scope.
299 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
304 Both variables are numeric.
306 row = design_matrix_var_to_column (cov, v1);
307 col = design_matrix_var_to_column (cov, v2);
308 x = (val1->f - mean1) * (val2->f - mean2);
309 x += gsl_matrix_get (cov->m, col, row);
310 gsl_matrix_set (cov->m, row, col, x);
311 gsl_matrix_set (cov->m, col, row, x);
316 covariance_accumulator_hash (const void *h, const void *aux)
318 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
319 size_t *n_vars = (size_t *) aux;
322 const struct variable *v_min;
323 const struct variable *v_max;
324 const union value *val_min;
325 const union value *val_max;
328 Order everything by the variables' indices. This ensures we get the
329 same key regardless of the order in which the variables are stored
333 (var_get_dict_index (ca->v1) <
334 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
335 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
337 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
338 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
340 idx_min = var_get_dict_index (v_min);
341 idx_max = var_get_dict_index (v_max);
343 if (var_is_numeric (v_max) && var_is_numeric (v_min))
345 return (*n_vars * idx_max + idx_min);
347 if (var_is_numeric (v_max) && var_is_alpha (v_min))
349 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
351 if (var_is_alpha (v_max) && var_is_numeric (v_min))
353 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
355 if (var_is_alpha (v_max) && var_is_alpha (v_min))
357 unsigned tmp = hsh_hash_bytes (val_max, var_get_width (v_max));
358 tmp ^= hsh_hash_bytes (val_min, var_get_width (v_min));
359 tmp += *n_vars * (*n_vars + 1 + idx_max) + idx_min;
366 Make a hash table consisting of struct covariance_accumulators.
367 This allows the accumulation of the elements of a covariance matrix
368 in a single data pass. Call covariance_accumulate () for each case
371 static struct hsh_table *
372 covariance_hsh_create (size_t *n_vars)
374 return hsh_create (*n_vars * *n_vars, covariance_accumulator_compare,
375 covariance_accumulator_hash, covariance_accumulator_free,
380 covariance_accumulator_free (void *c_, const void *aux UNUSED)
382 struct covariance_accumulator *c = c_;
388 Hash comparison. Returns 0 for a match, or a non-zero int
389 otherwise. The sign of a non-zero return value *should* indicate the
390 position of C relative to the covariance_accumulator described by
391 the other arguments. But for now, it just returns 1 for any
392 non-match. This should be changed when someone figures out how to
393 compute a sensible sign for the return value.
396 match_nodes (const struct covariance_accumulator *c,
397 const struct variable *v1, const struct variable *v2,
398 const union value *val1, const union value *val2)
400 if (var_get_dict_index (v1) == var_get_dict_index (c->v1))
401 if (var_get_dict_index (v2) == var_get_dict_index (c->v2))
403 if (var_is_numeric (v1) && var_is_numeric (v2))
407 if (var_is_numeric (v1) && var_is_alpha (v2))
409 if (!compare_values_short (val2, c->val2, v2))
414 if (var_is_alpha (v1) && var_is_numeric (v2))
416 if (!compare_values_short (val1, c->val1, v1))
421 if (var_is_alpha (v1) && var_is_alpha (v2))
423 if (!compare_values_short (val1, c->val1, v1))
425 if (!compare_values_short (val2, c->val2, v2))
436 This function is meant to be used as a comparison function for
437 a struct hsh_table in src/libpspp/hash.c.
440 covariance_accumulator_compare (const void *a1_, const void *a2_,
441 const void *aux UNUSED)
443 const struct covariance_accumulator *a1 = a1_;
444 const struct covariance_accumulator *a2 = a2_;
446 if (a1 == NULL && a2 == NULL)
449 if (a1 == NULL || a2 == NULL)
452 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
456 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
457 const union value *val, size_t n_vars)
459 unsigned int result = -1u;
460 if (var_is_numeric (v1) && var_is_alpha (v2))
462 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
463 + var_get_dict_index (v2) + hsh_hash_string (val->s);
465 else if (var_is_alpha (v1) && var_is_numeric (v2))
467 result = hash_numeric_alpha (v2, v1, val, n_vars);
474 update_product (const struct variable *v1, const struct variable *v2,
475 const union value *val1, const union value *val2)
479 assert (val1 != NULL);
480 assert (val2 != NULL);
481 if (var_is_alpha (v1) && var_is_alpha (v2))
485 if (var_is_numeric (v1) && var_is_numeric (v2))
487 return (val1->f * val2->f);
489 if (var_is_numeric (v1) && var_is_alpha (v2))
493 if (var_is_numeric (v2) && var_is_alpha (v1))
495 update_product (v2, v1, val2, val1);
500 update_sum (const struct variable *var, const union value *val, double weight)
502 assert (var != NULL);
503 assert (val != NULL);
504 if (var_is_alpha (var))
510 static struct covariance_accumulator *
511 get_new_covariance_accumulator (const struct variable *v1,
512 const struct variable *v2,
513 const union value *val1,
514 const union value *val2)
516 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
518 struct covariance_accumulator *ca;
519 ca = xmalloc (sizeof (*ca));
529 static const struct variable **
530 get_covariance_variables (const struct covariance_matrix *cov)
532 return cov->v_variables;
536 update_hash_entry (struct hsh_table *c,
537 const struct variable *v1,
538 const struct variable *v2,
539 const union value *val1, const union value *val2,
540 const struct interaction_value *i_val1,
541 const struct interaction_value *i_val2)
543 struct covariance_accumulator *ca;
544 struct covariance_accumulator *new_entry;
548 iv_f1 = interaction_value_get_nonzero_entry (i_val1);
549 iv_f2 = interaction_value_get_nonzero_entry (i_val2);
550 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
551 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
552 ca->dot_product *= iv_f1 * iv_f2;
553 ca->sum1 = update_sum (ca->v1, ca->val1, iv_f1);
554 ca->sum2 = update_sum (ca->v2, ca->val2, iv_f2);
556 new_entry = hsh_insert (c, ca);
558 if (new_entry != NULL)
560 new_entry->dot_product += ca->dot_product;
561 new_entry->ssize += 1.0;
562 new_entry->sum1 += ca->sum1;
563 new_entry->sum2 += ca->sum2;
565 If DOT_PRODUCT is null, CA was not already in the hash
566 hable, so we don't free it because it was just inserted.
567 If DOT_PRODUCT was not null, CA is already in the hash table.
568 Unnecessary now, it must be freed here.
575 Compute the covariance matrix in a single data-pass. Cases with
576 missing values are dropped pairwise, in other words, only if one of
577 the two values necessary to accumulate the inner product is missing.
579 Do not call this function directly. Call it through the struct
580 covariance_matrix ACCUMULATE member function, for example,
581 cov->accumulate (cov, ccase).
584 covariance_accumulate_pairwise (struct covariance_matrix *cov,
585 const struct ccase *ccase,
586 const struct interaction_variable **i_var,
591 const union value *val1;
592 const union value *val2;
593 const struct variable **v_variables;
594 struct interaction_value *i_val1 = NULL;
595 struct interaction_value *i_val2 = NULL;
597 assert (cov != NULL);
598 assert (ccase != NULL);
600 v_variables = get_covariance_variables (cov);
601 assert (v_variables != NULL);
603 for (i = 0; i < cov->n_variables; ++i)
605 if (is_interaction (v_variables[i], i_var, n_intr))
607 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
608 val1 = interaction_value_get (i_val1);
612 val1 = case_data (ccase, v_variables[i]);
614 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
616 cat_value_update (v_variables[i], val1);
617 if (var_is_numeric (v_variables[i]))
618 cov->update_moments (cov, i, val1->f);
620 for (j = i; j < cov->n_variables; j++)
622 if (is_interaction (v_variables[j], i_var, n_intr))
624 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
625 val2 = interaction_value_get (i_val2);
629 val2 = case_data (ccase, v_variables[j]);
631 if (!var_is_value_missing
632 (v_variables[j], val2, cov->missing_value))
634 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
635 val1, val2, i_val1, i_val2);
637 update_hash_entry (cov->ca, v_variables[j],
638 v_variables[i], val2, val1, i_val2, i_val1);
646 Compute the covariance matrix in a single data-pass. Cases with
647 missing values are dropped listwise. In other words, if one of the
648 values for any variable in a case is missing, the entire case is
651 The caller must use a casefilter to remove the cases with missing
652 values before calling covariance_accumulate_listwise. This function
653 assumes that CCASE has already passed through this filter, and
654 contains no missing values.
656 Do not call this function directly. Call it through the struct
657 covariance_matrix ACCUMULATE member function, for example,
658 cov->accumulate (cov, ccase).
661 covariance_accumulate_listwise (struct covariance_matrix *cov,
662 const struct ccase *ccase,
663 const struct interaction_variable **i_var,
668 const union value *val1;
669 const union value *val2;
670 const struct variable **v_variables;
671 struct interaction_value *i_val1 = NULL;
672 struct interaction_value *i_val2 = NULL;
674 assert (cov != NULL);
675 assert (ccase != NULL);
677 v_variables = get_covariance_variables (cov);
678 assert (v_variables != NULL);
680 for (i = 0; i < cov->n_variables; ++i)
682 if (is_interaction (v_variables[i], i_var, n_intr))
684 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
685 val1 = interaction_value_get (i_val1);
689 val1 = case_data (ccase, v_variables[i]);
691 cat_value_update (v_variables[i], val1);
692 if (var_is_numeric (v_variables[i]))
693 cov->update_moments (cov, i, val1->f);
695 for (j = i; j < cov->n_variables; j++)
697 if (is_interaction (v_variables[j], i_var, n_intr))
699 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
700 val2 = interaction_value_get (i_val2);
704 val2 = case_data (ccase, v_variables[j]);
706 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
707 val1, val2, i_val1, i_val2);
709 update_hash_entry (cov->ca, v_variables[j], v_variables[i],
710 val2, val1, i_val2, i_val1);
716 Call this function during the data pass. Each case will be added to
717 a hash containing all values of the covariance matrix. After the
718 data have been passed, call covariance_matrix_compute to put the
719 values in the struct covariance_matrix.
722 covariance_matrix_accumulate (struct covariance_matrix *cov,
723 const struct ccase *ccase, void **aux, size_t n_intr)
725 cov->accumulate (cov, ccase, (const struct interaction_variable **) aux, n_intr);
728 If VAR is categorical with d categories, its first category should
729 correspond to the origin in d-dimensional Euclidean space.
732 is_origin (const struct variable *var, const union value *val)
734 if (cat_value_find (var, val) == 0)
742 Return the subscript of the column of the design matrix
743 corresponding to VAL. If VAR is categorical with d categories, its
744 first category should correspond to the origin in d-dimensional
745 Euclidean space, so there is no subscript for this value.
748 get_exact_subscript (const struct design_matrix *dm, const struct variable *var,
749 const union value *val)
753 if (is_origin (var, val))
758 result = design_matrix_var_to_column (dm, var);
759 if (var_is_alpha (var))
761 result += cat_value_find (var, val) - 1;
767 covariance_matrix_insert (struct design_matrix *cov,
768 const struct variable *v1,
769 const struct variable *v2, const union value *val1,
770 const union value *val2, double product)
775 assert (cov != NULL);
777 row = get_exact_subscript (cov, v1, val1);
778 col = get_exact_subscript (cov, v2, val2);
779 if (row != -1u && col != -1u)
781 gsl_matrix_set (cov->m, row, col, product);
787 is_covariance_contributor (const struct covariance_accumulator *ca, const struct design_matrix *dm,
791 const struct variable *v1;
792 const struct variable *v2;
795 v1 = design_matrix_col_to_var (dm, i);
796 if (var_get_dict_index (v1) == var_get_dict_index(ca->v1))
798 v2 = design_matrix_col_to_var (dm, j);
799 if (var_get_dict_index (v2) == var_get_dict_index (ca->v2))
801 k = get_exact_subscript (dm, v1, ca->val1);
804 k = get_exact_subscript (dm, v2, ca->val2);
815 get_sum (const struct covariance_matrix *cov, size_t i)
818 const struct variable *var;
819 const union value *val = NULL;
820 struct covariance_accumulator ca;
821 struct covariance_accumulator *c;
823 assert ( cov != NULL);
824 var = design_matrix_col_to_var (cov->cov, i);
827 if (var_is_alpha (var))
829 k = design_matrix_var_to_column (cov->cov, var);
831 val = cat_subscript_to_value (i, var);
837 c = (struct covariance_accumulator *) hsh_find (cov->ca, &ca);
846 update_ssize (struct design_matrix *dm, size_t i, size_t j, struct covariance_accumulator *ca)
848 struct variable *var;
850 var = design_matrix_col_to_var (dm, i);
851 if (var_get_dict_index (ca->v1) == var_get_dict_index (var))
853 var = design_matrix_col_to_var (dm, j);
854 if (var_get_dict_index (ca->v2) == var_get_dict_index (var))
856 tmp = gsl_matrix_get (dm->m, i, j);
858 gsl_matrix_set (dm->m, i, j, tmp);
863 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
870 struct covariance_accumulator *entry;
871 struct hsh_iterator iter;
873 cov->cov = covariance_matrix_create (cov->n_variables, cov->v_variables);
874 cov->ssize = covariance_matrix_create (cov->n_variables, cov->v_variables);
875 cov->sums = covariance_matrix_create (cov->n_variables, cov->v_variables);
876 for (i = 0; i < design_matrix_get_n_cols (cov->cov); i++)
878 sum_i = get_sum (cov, i);
879 for (j = i; j < design_matrix_get_n_cols (cov->cov); j++)
881 sum_j = get_sum (cov, j);
882 entry = hsh_first (cov->ca, &iter);
883 gsl_matrix_set (cov->sums->m, i, j, sum_i);
884 while (entry != NULL)
886 update_ssize (cov->ssize, i, j, entry);
888 We compute the centered, un-normalized covariance matrix.
890 if (is_covariance_contributor (entry, cov->cov, i, j))
892 covariance_matrix_insert (cov->cov, entry->v1, entry->v2, entry->val1,
893 entry->val2, entry->dot_product);
895 entry = hsh_next (cov->ca, &iter);
897 tmp = gsl_matrix_get (cov->cov->m, i, j);
898 tmp -= sum_i * sum_j / gsl_matrix_get (cov->ssize->m, i, j);
899 gsl_matrix_set (cov->cov->m, i, j, tmp);
906 Call this function after passing the data.
909 covariance_matrix_compute (struct covariance_matrix *cov)
911 if (cov->n_pass == ONE_PASS)
913 covariance_accumulator_to_matrix (cov);
917 struct design_matrix *
918 covariance_to_design (const struct covariance_matrix *c)