1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
59 struct design_matrix *ssize;
63 const struct variable **v_variables;
67 enum mv_class missing_value;
68 void (*accumulate) (struct covariance_matrix *, const struct ccase *,
69 const struct interaction_variable **, size_t);
70 void (*update_moments) (struct covariance_matrix *, size_t, double);
75 static struct hsh_table *covariance_hsh_create (size_t *);
76 static hsh_hash_func covariance_accumulator_hash;
77 static unsigned int hash_numeric_alpha (const struct variable *,
78 const struct variable *,
79 const union value *, size_t);
80 static hsh_compare_func covariance_accumulator_compare;
81 static hsh_free_func covariance_accumulator_free;
82 static void update_moments1 (struct covariance_matrix *, size_t, double);
83 static void update_moments2 (struct covariance_matrix *, size_t, double);
84 static struct covariance_accumulator *get_new_covariance_accumulator (const
99 static void covariance_accumulate_listwise (struct covariance_matrix *,
100 const struct ccase *,
101 const struct interaction_variable **,
103 static void covariance_accumulate_pairwise (struct covariance_matrix *,
104 const struct ccase *,
105 const struct interaction_variable **,
108 struct covariance_matrix *
109 covariance_matrix_init (size_t n_variables,
110 const struct variable *v_variables[], int n_pass,
111 int missing_handling, enum mv_class missing_value)
114 struct covariance_matrix *result = NULL;
116 result = xmalloc (sizeof (*result));
118 result->n_variables = n_variables;
119 result->ca = covariance_hsh_create (&result->n_variables);
122 result->missing_handling = missing_handling;
123 result->missing_value = missing_value;
124 result->accumulate = (result->missing_handling == LISTWISE) ?
125 covariance_accumulate_listwise : covariance_accumulate_pairwise;
126 if (n_pass == ONE_PASS)
128 result->update_moments = update_moments1;
129 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
130 for (i = 0; i < n_variables; i++)
132 result->m1[i] = moments1_create (MOMENT_MEAN);
137 result->update_moments = update_moments2;
138 result->m = xnmalloc (n_variables, sizeof (*result->m));
139 for (i = 0; i < n_variables; i++)
141 result->m[i] = moments_create (MOMENT_MEAN);
144 result->v_variables = v_variables;
146 result->n_pass = n_pass;
151 get_n_rows (size_t n_variables, size_t *v_variables[])
155 for (i = 0; i < n_variables; i++)
157 if (var_is_numeric (v_variables[i]))
161 else if (var_is_alpha (v_variables[i]))
163 size_t n_categories = cat_get_n_categories (v_variables[i]);
164 result += n_categories - 1;
170 The covariances are stored in a DESIGN_MATRIX structure.
172 struct design_matrix *
173 covariance_matrix_create (size_t n_variables,
174 const struct variable *v_variables[])
176 size_t n_rows = get_n_rows (n_variables, v_variables);
177 return design_matrix_create (n_variables, v_variables, n_rows);
181 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
183 assert (cov->m1 != NULL);
184 moments1_add (cov->m1[i], x, 1.0);
188 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
190 assert (cov->m != NULL);
191 moments_pass_one (cov->m[i], x, 1.0);
195 covariance_matrix_destroy (struct covariance_matrix *cov)
199 assert (cov != NULL);
200 design_matrix_destroy (cov->cov);
201 design_matrix_destroy (cov->ssize);
202 hsh_destroy (cov->ca);
203 if (cov->n_pass == ONE_PASS)
205 for (i = 0; i < cov->n_variables; i++)
207 moments1_destroy (cov->m1[i]);
213 for (i = 0; i < cov->n_variables; i++)
215 moments_destroy (cov->m[i]);
222 Update the covariance matrix with the new entries, assuming that ROW
223 corresponds to a categorical variable and V2 is numeric.
226 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
228 const struct variable *v2, double x,
229 const union value *val2)
234 assert (var_is_numeric (v2));
236 col = design_matrix_var_to_column (cov, v2);
237 assert (val2 != NULL);
238 tmp = design_matrix_get_element (cov, row, col);
239 design_matrix_set_element (cov, row, col, (val2->f - mean) * x + tmp);
240 design_matrix_set_element (cov, col, row, (val2->f - mean) * x + tmp);
243 column_iterate (struct design_matrix *cov, const struct variable *v,
244 double ssize, double x, const union value *val1, size_t row)
250 const union value *tmp_val;
252 col = design_matrix_var_to_column (cov, v);
253 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
256 y = -1.0 * cat_get_category_count (i, v) / ssize;
257 tmp_val = cat_subscript_to_value (i, v);
258 if (!compare_values_short (tmp_val, val1, v))
262 tmp = design_matrix_get_element (cov, row, col);
263 design_matrix_set_element (cov, row, col, x * y + tmp);
264 design_matrix_set_element (cov, col, row, x * y + tmp);
269 Call this function in the second data pass. The central moments are
270 MEAN1 and MEAN2. Any categorical variables should already have their
271 values summarized in in its OBS_VALS element.
274 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
275 double ssize, const struct variable *v1,
276 const struct variable *v2, const union value *val1,
277 const union value *val2)
283 const union value *tmp_val;
285 if (var_is_alpha (v1))
287 row = design_matrix_var_to_column (cov, v1);
288 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
291 x = -1.0 * cat_get_category_count (i, v1) / ssize;
292 tmp_val = cat_subscript_to_value (i, v1);
293 if (!compare_values_short (tmp_val, val1, v1))
297 if (var_is_numeric (v2))
299 covariance_update_categorical_numeric (cov, mean2, row,
304 column_iterate (cov, v1, ssize, x, val1, row);
305 column_iterate (cov, v2, ssize, x, val2, row);
309 else if (var_is_alpha (v2))
312 Reverse the orders of V1, V2, etc. and put ourselves back
313 in the previous IF scope.
315 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
320 Both variables are numeric.
322 row = design_matrix_var_to_column (cov, v1);
323 col = design_matrix_var_to_column (cov, v2);
324 x = (val1->f - mean1) * (val2->f - mean2);
325 x += design_matrix_get_element (cov, col, row);
326 design_matrix_set_element (cov, row, col, x);
327 design_matrix_set_element (cov, col, row, x);
332 covariance_accumulator_hash (const void *h, const void *aux)
334 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
335 size_t *n_vars = (size_t *) aux;
338 const struct variable *v_min;
339 const struct variable *v_max;
340 const union value *val_min;
341 const union value *val_max;
344 Order everything by the variables' indices. This ensures we get the
345 same key regardless of the order in which the variables are stored
349 (var_get_dict_index (ca->v1) <
350 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
351 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
353 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
354 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
356 idx_min = var_get_dict_index (v_min);
357 idx_max = var_get_dict_index (v_max);
359 if (var_is_numeric (v_max) && var_is_numeric (v_min))
361 return (*n_vars * idx_max + idx_min);
363 if (var_is_numeric (v_max) && var_is_alpha (v_min))
365 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
367 if (var_is_alpha (v_max) && var_is_numeric (v_min))
369 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
371 if (var_is_alpha (v_max) && var_is_alpha (v_min))
373 unsigned tmp = hash_bytes (val_max, var_get_width (v_max), 0);
374 tmp ^= hash_bytes (val_min, var_get_width (v_min), 0);
375 tmp += *n_vars * (*n_vars + 1 + idx_max) + idx_min;
382 Make a hash table consisting of struct covariance_accumulators.
383 This allows the accumulation of the elements of a covariance matrix
384 in a single data pass. Call covariance_accumulate () for each case
387 static struct hsh_table *
388 covariance_hsh_create (size_t *n_vars)
390 return hsh_create (*n_vars * *n_vars, covariance_accumulator_compare,
391 covariance_accumulator_hash, covariance_accumulator_free,
396 covariance_accumulator_free (void *c_, const void *aux UNUSED)
398 struct covariance_accumulator *c = c_;
404 ordered_match_nodes (const struct covariance_accumulator *c, const struct variable *v1,
405 const struct variable *v2, const union value *val1, const union value *val2)
410 result = var_get_dict_index (v1) ^ var_get_dict_index (c->v1);
411 m = var_get_dict_index (v2) ^ var_get_dict_index (c->v2);
413 if (var_is_alpha (v1))
415 result |= compare_values_short (val1, c->val1, v1);
416 if (var_is_alpha (v2))
418 result |= compare_values_short (val2, c->val2, v2);
421 else if (var_is_alpha (v2))
423 result |= compare_values_short (val2, c->val2, v2);
429 Hash comparison. Returns 0 for a match, or a non-zero int
430 otherwise. The sign of a non-zero return value *should* indicate the
431 position of C relative to the covariance_accumulator described by
432 the other arguments. But for now, it just returns 1 for any
433 non-match. This should be changed when someone figures out how to
434 compute a sensible sign for the return value.
437 match_nodes (const struct covariance_accumulator *c,
438 const struct variable *v1, const struct variable *v2,
439 const union value *val1, const union value *val2)
444 n = ordered_match_nodes (c, v1, v2, val1, val2);
445 m = ordered_match_nodes (c, v2, v1, val2, val1);
450 This function is meant to be used as a comparison function for
451 a struct hsh_table in src/libpspp/hash.c.
454 covariance_accumulator_compare (const void *a1_, const void *a2_,
455 const void *aux UNUSED)
457 const struct covariance_accumulator *a1 = a1_;
458 const struct covariance_accumulator *a2 = a2_;
460 if (a1 == NULL && a2 == NULL)
463 if (a1 == NULL || a2 == NULL)
466 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
470 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
471 const union value *val, size_t n_vars)
473 unsigned int result = -1u;
474 if (var_is_numeric (v1) && var_is_alpha (v2))
476 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
477 + var_get_dict_index (v2) + hash_value_short (val, v2);
479 else if (var_is_alpha (v1) && var_is_numeric (v2))
481 result = hash_numeric_alpha (v2, v1, val, n_vars);
488 update_product (const struct variable *v1, const struct variable *v2,
489 const union value *val1, const union value *val2)
493 assert (val1 != NULL);
494 assert (val2 != NULL);
495 if (var_is_alpha (v1) && var_is_alpha (v2))
499 if (var_is_numeric (v1) && var_is_numeric (v2))
501 return (val1->f * val2->f);
503 if (var_is_numeric (v1) && var_is_alpha (v2))
507 if (var_is_numeric (v2) && var_is_alpha (v1))
517 update_sum (const struct variable *var, const union value *val, double weight)
519 assert (var != NULL);
520 assert (val != NULL);
521 if (var_is_alpha (var))
527 static struct covariance_accumulator *
528 get_new_covariance_accumulator (const struct variable *v1,
529 const struct variable *v2,
530 const union value *val1,
531 const union value *val2)
533 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
535 struct covariance_accumulator *ca;
536 ca = xmalloc (sizeof (*ca));
546 static const struct variable **
547 get_covariance_variables (const struct covariance_matrix *cov)
549 return cov->v_variables;
553 update_hash_entry (struct hsh_table *c,
554 const struct variable *v1,
555 const struct variable *v2,
556 const union value *val1, const union value *val2,
557 const struct interaction_value *i_val1,
558 const struct interaction_value *i_val2)
560 struct covariance_accumulator *ca;
561 struct covariance_accumulator *new_entry;
565 iv_f1 = interaction_value_get_nonzero_entry (i_val1);
566 iv_f2 = interaction_value_get_nonzero_entry (i_val2);
567 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
568 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
569 ca->dot_product *= iv_f1 * iv_f2;
570 ca->sum1 = update_sum (ca->v1, ca->val1, iv_f1);
571 ca->sum2 = update_sum (ca->v2, ca->val2, iv_f2);
573 new_entry = hsh_insert (c, ca);
575 if (new_entry != NULL)
577 new_entry->dot_product += ca->dot_product;
578 new_entry->ssize += 1.0;
579 new_entry->sum1 += ca->sum1;
580 new_entry->sum2 += ca->sum2;
582 If DOT_PRODUCT is null, CA was not already in the hash
583 hable, so we don't free it because it was just inserted.
584 If DOT_PRODUCT was not null, CA is already in the hash table.
585 Unnecessary now, it must be freed here.
592 Compute the covariance matrix in a single data-pass. Cases with
593 missing values are dropped pairwise, in other words, only if one of
594 the two values necessary to accumulate the inner product is missing.
596 Do not call this function directly. Call it through the struct
597 covariance_matrix ACCUMULATE member function, for example,
598 cov->accumulate (cov, ccase).
601 covariance_accumulate_pairwise (struct covariance_matrix *cov,
602 const struct ccase *ccase,
603 const struct interaction_variable **i_var,
608 const union value *val1;
609 const union value *val2;
610 const struct variable **v_variables;
611 struct interaction_value *i_val1 = NULL;
612 struct interaction_value *i_val2 = NULL;
614 assert (cov != NULL);
615 assert (ccase != NULL);
617 v_variables = get_covariance_variables (cov);
618 assert (v_variables != NULL);
620 for (i = 0; i < cov->n_variables; ++i)
622 if (is_interaction (v_variables[i], i_var, n_intr))
624 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
625 val1 = interaction_value_get (i_val1);
629 val1 = case_data (ccase, v_variables[i]);
631 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
633 cat_value_update (v_variables[i], val1);
634 if (var_is_numeric (v_variables[i]))
635 cov->update_moments (cov, i, val1->f);
637 for (j = i; j < cov->n_variables; j++)
639 if (is_interaction (v_variables[j], i_var, n_intr))
641 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
642 val2 = interaction_value_get (i_val2);
646 val2 = case_data (ccase, v_variables[j]);
648 if (!var_is_value_missing
649 (v_variables[j], val2, cov->missing_value))
651 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
652 val1, val2, i_val1, i_val2);
660 Compute the covariance matrix in a single data-pass. Cases with
661 missing values are dropped listwise. In other words, if one of the
662 values for any variable in a case is missing, the entire case is
665 The caller must use a casefilter to remove the cases with missing
666 values before calling covariance_accumulate_listwise. This function
667 assumes that CCASE has already passed through this filter, and
668 contains no missing values.
670 Do not call this function directly. Call it through the struct
671 covariance_matrix ACCUMULATE member function, for example,
672 cov->accumulate (cov, ccase).
675 covariance_accumulate_listwise (struct covariance_matrix *cov,
676 const struct ccase *ccase,
677 const struct interaction_variable **i_var,
682 const union value *val1;
683 const union value *val2;
684 const struct variable **v_variables;
685 struct interaction_value *i_val1 = NULL;
686 struct interaction_value *i_val2 = NULL;
688 assert (cov != NULL);
689 assert (ccase != NULL);
691 v_variables = get_covariance_variables (cov);
692 assert (v_variables != NULL);
694 for (i = 0; i < cov->n_variables; ++i)
696 if (is_interaction (v_variables[i], i_var, n_intr))
698 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
699 val1 = interaction_value_get (i_val1);
703 val1 = case_data (ccase, v_variables[i]);
705 cat_value_update (v_variables[i], val1);
706 if (var_is_numeric (v_variables[i]))
707 cov->update_moments (cov, i, val1->f);
709 for (j = i; j < cov->n_variables; j++)
711 if (is_interaction (v_variables[j], i_var, n_intr))
713 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
714 val2 = interaction_value_get (i_val2);
718 val2 = case_data (ccase, v_variables[j]);
720 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
721 val1, val2, i_val1, i_val2);
727 Call this function during the data pass. Each case will be added to
728 a hash containing all values of the covariance matrix. After the
729 data have been passed, call covariance_matrix_compute to put the
730 values in the struct covariance_matrix.
733 covariance_matrix_accumulate (struct covariance_matrix *cov,
734 const struct ccase *ccase, void **aux, size_t n_intr)
736 cov->accumulate (cov, ccase, (const struct interaction_variable **) aux, n_intr);
739 If VAR is categorical with d categories, its first category should
740 correspond to the origin in d-dimensional Euclidean space.
743 is_origin (const struct variable *var, const union value *val)
745 if (var_is_numeric (var))
749 if (cat_value_find (var, val) == 0)
757 Return the subscript of the column of the design matrix
758 corresponding to VAL. If VAR is categorical with d categories, its
759 first category should correspond to the origin in d-dimensional
760 Euclidean space, so there is no subscript for this value.
763 get_exact_subscript (const struct design_matrix *dm, const struct variable *var,
764 const union value *val)
768 result = design_matrix_var_to_column (dm, var);
769 if (var_is_alpha (var))
771 if (is_origin (var, val))
775 result += cat_value_find (var, val) - 1;
781 Return the value corresponding to subscript TARGET. If that value corresponds
782 to the origin, return NULL.
784 static const union value *
785 get_value_from_subscript (const struct design_matrix *dm, size_t target)
787 const union value *result = NULL;
788 const struct variable *var;
791 var = design_matrix_col_to_var (dm, target);
792 if (var_is_numeric (var))
796 for (i = 0; i < cat_get_n_categories (var); i++)
798 result = cat_subscript_to_value (i, var);
799 if (get_exact_subscript (dm, var, result) == target)
808 is_covariance_contributor (const struct covariance_accumulator *ca, const struct design_matrix *dm,
812 const struct variable *v1;
813 const struct variable *v2;
816 v1 = design_matrix_col_to_var (dm, i);
817 v2 = design_matrix_col_to_var (dm, j);
818 if (var_get_dict_index (v1) == var_get_dict_index(ca->v1))
820 if (var_get_dict_index (v2) == var_get_dict_index (ca->v2))
822 k = get_exact_subscript (dm, v1, ca->val1);
825 k = get_exact_subscript (dm, v2, ca->val2);
833 else if (var_get_dict_index (v1) == var_get_dict_index (ca->v2))
835 if (var_get_dict_index (v2) == var_get_dict_index (ca->v1))
837 k = get_exact_subscript (dm, v1, ca->val2);
840 k = get_exact_subscript (dm, v2, ca->val1);
852 get_sum (const struct covariance_matrix *cov, size_t i)
857 const struct variable *var;
858 const union value *val = NULL;
860 assert ( cov != NULL);
861 var = design_matrix_col_to_var (cov->cov, i);
864 if (var_is_alpha (var))
866 val = get_value_from_subscript (cov->cov, i);
867 k = cat_value_find (var, val);
868 return cat_get_category_count (k, var);
873 while (var_get_dict_index (cov->v_variables[k]) != var_get_dict_index (var))
877 moments1_calculate (cov->m1[k], &n, &mean, NULL, NULL, NULL);
885 update_ssize (struct design_matrix *dm, size_t i, size_t j, struct covariance_accumulator *ca)
887 struct variable *var;
889 var = design_matrix_col_to_var (dm, i);
890 if (var_get_dict_index (ca->v1) == var_get_dict_index (var))
892 var = design_matrix_col_to_var (dm, j);
893 if (var_get_dict_index (ca->v2) == var_get_dict_index (var))
895 tmp = design_matrix_get_element (dm, i, j);
897 design_matrix_set_element (dm, i, j, tmp);
902 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
909 struct covariance_accumulator *entry;
910 struct hsh_iterator iter;
912 cov->cov = covariance_matrix_create (cov->n_variables, cov->v_variables);
913 cov->ssize = covariance_matrix_create (cov->n_variables, cov->v_variables);
914 entry = hsh_first (cov->ca, &iter);
915 while (entry != NULL)
917 entry = hsh_next (cov->ca, &iter);
920 for (i = 0; i < design_matrix_get_n_cols (cov->cov); i++)
922 sum_i = get_sum (cov, i);
923 for (j = i; j < design_matrix_get_n_cols (cov->cov); j++)
925 sum_j = get_sum (cov, j);
926 entry = hsh_first (cov->ca, &iter);
927 while (entry != NULL)
929 update_ssize (cov->ssize, i, j, entry);
931 We compute the centered, un-normalized covariance matrix.
933 if (is_covariance_contributor (entry, cov->cov, i, j))
935 design_matrix_set_element (cov->cov, i, j, entry->dot_product);
937 entry = hsh_next (cov->ca, &iter);
939 tmp = design_matrix_get_element (cov->cov, i, j);
940 tmp -= sum_i * sum_j / design_matrix_get_element (cov->ssize, i, j);
941 design_matrix_set_element (cov->cov, i, j, tmp);
942 design_matrix_set_element (cov->cov, j, i, tmp);
949 Call this function after passing the data.
952 covariance_matrix_compute (struct covariance_matrix *cov)
954 if (cov->n_pass == ONE_PASS)
956 covariance_accumulator_to_matrix (cov);
960 struct design_matrix *
961 covariance_to_design (const struct covariance_matrix *c)
970 covariance_matrix_get_n_rows (const struct covariance_matrix *c)
972 return design_matrix_get_n_rows (c->cov);
976 covariance_matrix_get_element (const struct covariance_matrix *c, size_t row, size_t col)
978 return (design_matrix_get_element (c->cov, row, col));