1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
59 struct design_matrix *ssize;
63 const struct variable **v_variables;
67 enum mv_class missing_value;
68 void (*accumulate) (struct covariance_matrix *, const struct ccase *,
69 const struct interaction_variable **, size_t);
70 void (*update_moments) (struct covariance_matrix *, size_t, double);
75 static struct hsh_table *covariance_hsh_create (size_t *);
76 static hsh_hash_func covariance_accumulator_hash;
77 static unsigned int hash_numeric_alpha (const struct variable *,
78 const struct variable *,
79 const union value *, size_t);
80 static hsh_compare_func covariance_accumulator_compare;
81 static hsh_free_func covariance_accumulator_free;
82 static void update_moments1 (struct covariance_matrix *, size_t, double);
83 static void update_moments2 (struct covariance_matrix *, size_t, double);
84 static struct covariance_accumulator *get_new_covariance_accumulator (const
99 static void covariance_accumulate_listwise (struct covariance_matrix *,
100 const struct ccase *,
101 const struct interaction_variable **,
103 static void covariance_accumulate_pairwise (struct covariance_matrix *,
104 const struct ccase *,
105 const struct interaction_variable **,
108 struct covariance_matrix *
109 covariance_matrix_init (size_t n_variables,
110 const struct variable *v_variables[], int n_pass,
111 int missing_handling, enum mv_class missing_value)
114 struct covariance_matrix *result = NULL;
116 result = xmalloc (sizeof (*result));
118 result->n_variables = n_variables;
119 result->ca = covariance_hsh_create (&result->n_variables);
122 result->missing_handling = missing_handling;
123 result->missing_value = missing_value;
124 result->accumulate = (result->missing_handling == LISTWISE) ?
125 covariance_accumulate_listwise : covariance_accumulate_pairwise;
126 if (n_pass == ONE_PASS)
128 result->update_moments = update_moments1;
129 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
130 for (i = 0; i < n_variables; i++)
132 result->m1[i] = moments1_create (MOMENT_MEAN);
137 result->update_moments = update_moments2;
138 result->m = xnmalloc (n_variables, sizeof (*result->m));
139 for (i = 0; i < n_variables; i++)
141 result->m[i] = moments_create (MOMENT_MEAN);
144 result->v_variables = v_variables;
146 result->n_pass = n_pass;
151 get_n_rows (size_t n_variables, const struct variable *v_variables[])
155 for (i = 0; i < n_variables; i++)
157 if (var_is_numeric (v_variables[i]))
161 else if (var_is_alpha (v_variables[i]))
163 size_t n_categories = cat_get_n_categories (v_variables[i]);
164 result += n_categories - 1;
170 The covariances are stored in a DESIGN_MATRIX structure.
172 struct design_matrix *
173 covariance_matrix_create (size_t n_variables,
174 const struct variable *v_variables[])
176 size_t n_rows = get_n_rows (n_variables, v_variables);
177 return design_matrix_create (n_variables, v_variables, n_rows);
181 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
183 assert (cov->m1 != NULL);
184 moments1_add (cov->m1[i], x, 1.0);
188 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
190 assert (cov->m != NULL);
191 moments_pass_one (cov->m[i], x, 1.0);
195 covariance_matrix_destroy (struct covariance_matrix *cov)
199 assert (cov != NULL);
200 design_matrix_destroy (cov->cov);
201 design_matrix_destroy (cov->ssize);
202 hsh_destroy (cov->ca);
203 if (cov->n_pass == ONE_PASS)
205 for (i = 0; i < cov->n_variables; i++)
207 moments1_destroy (cov->m1[i]);
213 for (i = 0; i < cov->n_variables; i++)
215 moments_destroy (cov->m[i]);
222 Update the covariance matrix with the new entries, assuming that ROW
223 corresponds to a categorical variable and V2 is numeric.
226 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
228 const struct variable *v2, double x,
229 const union value *val2)
234 assert (var_is_numeric (v2));
236 col = design_matrix_var_to_column (cov, v2);
237 assert (val2 != NULL);
238 tmp = design_matrix_get_element (cov, row, col);
239 design_matrix_set_element (cov, row, col, (val2->f - mean) * x + tmp);
240 design_matrix_set_element (cov, col, row, (val2->f - mean) * x + tmp);
243 column_iterate (struct design_matrix *cov, const struct variable *v,
244 double ssize, double x, const union value *val1, size_t row)
246 int width = var_get_width (v);
251 const union value *tmp_val;
253 col = design_matrix_var_to_column (cov, v);
254 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
257 y = -1.0 * cat_get_category_count (i, v) / ssize;
258 tmp_val = cat_subscript_to_value (i, v);
259 if (!value_equal (tmp_val, val1, width))
263 tmp = design_matrix_get_element (cov, row, col);
264 design_matrix_set_element (cov, row, col, x * y + tmp);
265 design_matrix_set_element (cov, col, row, x * y + tmp);
270 Call this function in the second data pass. The central moments are
271 MEAN1 and MEAN2. Any categorical variables should already have their
272 values summarized in in its OBS_VALS element.
275 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
276 double ssize, const struct variable *v1,
277 const struct variable *v2, const union value *val1,
278 const union value *val2)
284 const union value *tmp_val;
286 if (var_is_alpha (v1))
288 row = design_matrix_var_to_column (cov, v1);
289 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
292 x = -1.0 * cat_get_category_count (i, v1) / ssize;
293 tmp_val = cat_subscript_to_value (i, v1);
294 if (!value_equal (tmp_val, val1, var_get_width (v1)))
298 if (var_is_numeric (v2))
300 covariance_update_categorical_numeric (cov, mean2, row,
305 column_iterate (cov, v1, ssize, x, val1, row);
306 column_iterate (cov, v2, ssize, x, val2, row);
310 else if (var_is_alpha (v2))
313 Reverse the orders of V1, V2, etc. and put ourselves back
314 in the previous IF scope.
316 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
321 Both variables are numeric.
323 row = design_matrix_var_to_column (cov, v1);
324 col = design_matrix_var_to_column (cov, v2);
325 x = (val1->f - mean1) * (val2->f - mean2);
326 x += design_matrix_get_element (cov, col, row);
327 design_matrix_set_element (cov, row, col, x);
328 design_matrix_set_element (cov, col, row, x);
333 covariance_accumulator_hash (const void *h, const void *aux)
335 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
336 size_t *n_vars = (size_t *) aux;
339 const struct variable *v_min;
340 const struct variable *v_max;
341 const union value *val_min;
342 const union value *val_max;
345 Order everything by the variables' indices. This ensures we get the
346 same key regardless of the order in which the variables are stored
350 (var_get_dict_index (ca->v1) <
351 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
352 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
354 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
355 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
357 idx_min = var_get_dict_index (v_min);
358 idx_max = var_get_dict_index (v_max);
360 if (var_is_numeric (v_max) && var_is_numeric (v_min))
362 return (*n_vars * idx_max + idx_min);
364 if (var_is_numeric (v_max) && var_is_alpha (v_min))
366 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
368 if (var_is_alpha (v_max) && var_is_numeric (v_min))
370 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
372 if (var_is_alpha (v_max) && var_is_alpha (v_min))
374 unsigned hash = value_hash (val_max, var_get_width (v_max), 0);
375 hash = value_hash (val_min, var_get_width (v_min), hash);
376 return hash_int (*n_vars * (*n_vars + 1 + idx_max) + idx_min, hash);
382 Make a hash table consisting of struct covariance_accumulators.
383 This allows the accumulation of the elements of a covariance matrix
384 in a single data pass. Call covariance_accumulate () for each case
387 static struct hsh_table *
388 covariance_hsh_create (size_t *n_vars)
390 return hsh_create (*n_vars * *n_vars, covariance_accumulator_compare,
391 covariance_accumulator_hash, covariance_accumulator_free,
396 covariance_accumulator_free (void *c_, const void *aux UNUSED)
398 struct covariance_accumulator *c = c_;
404 ordered_match_nodes (const struct covariance_accumulator *c, const struct variable *v1,
405 const struct variable *v2, const union value *val1, const union value *val2)
410 result = var_get_dict_index (v1) ^ var_get_dict_index (c->v1);
411 m = var_get_dict_index (v2) ^ var_get_dict_index (c->v2);
413 if (var_is_alpha (v1))
415 result |= value_compare_3way (val1, c->val1, var_get_width (v1));
416 if (var_is_alpha (v2))
418 result |= value_compare_3way (val2, c->val2, var_get_width (v2));
421 else if (var_is_alpha (v2))
423 result |= value_compare_3way (val2, c->val2, var_get_width (v2));
429 Hash comparison. Returns 0 for a match, or a non-zero int
430 otherwise. The sign of a non-zero return value *should* indicate the
431 position of C relative to the covariance_accumulator described by
432 the other arguments. But for now, it just returns 1 for any
433 non-match. This should be changed when someone figures out how to
434 compute a sensible sign for the return value.
437 match_nodes (const struct covariance_accumulator *c,
438 const struct variable *v1, const struct variable *v2,
439 const union value *val1, const union value *val2)
444 n = ordered_match_nodes (c, v1, v2, val1, val2);
445 m = ordered_match_nodes (c, v2, v1, val2, val1);
450 This function is meant to be used as a comparison function for
451 a struct hsh_table in src/libpspp/hash.c.
454 covariance_accumulator_compare (const void *a1_, const void *a2_,
455 const void *aux UNUSED)
457 const struct covariance_accumulator *a1 = a1_;
458 const struct covariance_accumulator *a2 = a2_;
460 if (a1 == NULL && a2 == NULL)
463 if (a1 == NULL || a2 == NULL)
466 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
470 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
471 const union value *val, size_t n_vars)
473 unsigned int result = -1u;
474 if (var_is_numeric (v1) && var_is_alpha (v2))
476 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
477 + var_get_dict_index (v2) + value_hash (val, var_get_width (v2), 0);
479 else if (var_is_alpha (v1) && var_is_numeric (v2))
481 result = hash_numeric_alpha (v2, v1, val, n_vars);
488 update_product (const struct variable *v1, const struct variable *v2,
489 const union value *val1, const union value *val2)
493 assert (val1 != NULL);
494 assert (val2 != NULL);
495 if (var_is_alpha (v1) && var_is_alpha (v2))
499 if (var_is_numeric (v1) && var_is_numeric (v2))
501 return (val1->f * val2->f);
503 if (var_is_numeric (v1) && var_is_alpha (v2))
507 if (var_is_numeric (v2) && var_is_alpha (v1))
517 update_sum (const struct variable *var, const union value *val, double weight)
519 assert (var != NULL);
520 assert (val != NULL);
521 if (var_is_alpha (var))
527 static struct covariance_accumulator *
528 get_new_covariance_accumulator (const struct variable *v1,
529 const struct variable *v2,
530 const union value *val1,
531 const union value *val2)
533 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
535 struct covariance_accumulator *ca;
536 ca = xmalloc (sizeof (*ca));
546 static const struct variable **
547 get_covariance_variables (const struct covariance_matrix *cov)
549 return cov->v_variables;
553 update_hash_entry (struct hsh_table *c,
554 const struct variable *v1,
555 const struct variable *v2,
556 const union value *val1, const union value *val2,
557 const struct interaction_value *i_val1,
558 const struct interaction_value *i_val2)
560 struct covariance_accumulator *ca;
561 struct covariance_accumulator *new_entry;
565 iv_f1 = interaction_value_get_nonzero_entry (i_val1);
566 iv_f2 = interaction_value_get_nonzero_entry (i_val2);
567 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
568 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
569 ca->dot_product *= iv_f1 * iv_f2;
570 ca->sum1 = update_sum (ca->v1, ca->val1, iv_f1);
571 ca->sum2 = update_sum (ca->v2, ca->val2, iv_f2);
573 new_entry = hsh_insert (c, ca);
575 if (new_entry != NULL)
577 new_entry->dot_product += ca->dot_product;
578 new_entry->ssize += 1.0;
579 new_entry->sum1 += ca->sum1;
580 new_entry->sum2 += ca->sum2;
582 If DOT_PRODUCT is null, CA was not already in the hash
583 hable, so we don't free it because it was just inserted.
584 If DOT_PRODUCT was not null, CA is already in the hash table.
585 Unnecessary now, it must be freed here.
592 Compute the covariance matrix in a single data-pass. Cases with
593 missing values are dropped pairwise, in other words, only if one of
594 the two values necessary to accumulate the inner product is missing.
596 Do not call this function directly. Call it through the struct
597 covariance_matrix ACCUMULATE member function, for example,
598 cov->accumulate (cov, ccase).
601 covariance_accumulate_pairwise (struct covariance_matrix *cov,
602 const struct ccase *ccase,
603 const struct interaction_variable **i_var,
608 const union value *val1;
609 const union value *val2;
610 const struct variable **v_variables;
611 struct interaction_value *i_val1 = NULL;
612 struct interaction_value *i_val2 = NULL;
614 assert (cov != NULL);
615 assert (ccase != NULL);
617 v_variables = get_covariance_variables (cov);
618 assert (v_variables != NULL);
620 for (i = 0; i < cov->n_variables; ++i)
622 if (is_interaction (v_variables[i], i_var, n_intr))
624 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
625 val1 = interaction_value_get (i_val1);
629 val1 = case_data (ccase, v_variables[i]);
631 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
633 cat_value_update (v_variables[i], val1);
634 if (var_is_numeric (v_variables[i]))
635 cov->update_moments (cov, i, val1->f);
637 for (j = i; j < cov->n_variables; j++)
639 if (is_interaction (v_variables[j], i_var, n_intr))
641 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
642 val2 = interaction_value_get (i_val2);
646 val2 = case_data (ccase, v_variables[j]);
648 if (!var_is_value_missing
649 (v_variables[j], val2, cov->missing_value))
651 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
652 val1, val2, i_val1, i_val2);
660 Compute the covariance matrix in a single data-pass. Cases with
661 missing values are dropped listwise. In other words, if one of the
662 values for any variable in a case is missing, the entire case is
665 The caller must use a casefilter to remove the cases with missing
666 values before calling covariance_accumulate_listwise. This function
667 assumes that CCASE has already passed through this filter, and
668 contains no missing values.
670 Do not call this function directly. Call it through the struct
671 covariance_matrix ACCUMULATE member function, for example,
672 cov->accumulate (cov, ccase).
675 covariance_accumulate_listwise (struct covariance_matrix *cov,
676 const struct ccase *ccase,
677 const struct interaction_variable **i_var,
682 const union value *val1;
683 const union value *val2;
684 const struct variable **v_variables;
685 struct interaction_value *i_val1 = NULL;
686 struct interaction_value *i_val2 = NULL;
688 assert (cov != NULL);
689 assert (ccase != NULL);
691 v_variables = get_covariance_variables (cov);
692 assert (v_variables != NULL);
694 for (i = 0; i < cov->n_variables; ++i)
696 if (is_interaction (v_variables[i], i_var, n_intr))
698 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
699 val1 = interaction_value_get (i_val1);
703 val1 = case_data (ccase, v_variables[i]);
705 cat_value_update (v_variables[i], val1);
706 if (var_is_numeric (v_variables[i]))
707 cov->update_moments (cov, i, val1->f);
709 for (j = i; j < cov->n_variables; j++)
711 if (is_interaction (v_variables[j], i_var, n_intr))
713 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
714 val2 = interaction_value_get (i_val2);
718 val2 = case_data (ccase, v_variables[j]);
720 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
721 val1, val2, i_val1, i_val2);
727 Call this function during the data pass. Each case will be added to
728 a hash containing all values of the covariance matrix. After the
729 data have been passed, call covariance_matrix_compute to put the
730 values in the struct covariance_matrix.
733 covariance_matrix_accumulate (struct covariance_matrix *cov,
734 const struct ccase *ccase, void **aux, size_t n_intr)
736 cov->accumulate (cov, ccase, (const struct interaction_variable **) aux, n_intr);
740 Return the value corresponding to subscript TARGET. If that value corresponds
741 to the origin, return NULL.
743 static const union value *
744 get_value_from_subscript (const struct design_matrix *dm, size_t target)
746 const union value *result = NULL;
747 const struct variable *var;
750 var = design_matrix_col_to_var (dm, target);
751 if (var_is_numeric (var))
755 for (i = 0; i < cat_get_n_categories (var); i++)
757 result = cat_subscript_to_value (i, var);
758 if (dm_get_exact_subscript (dm, var, result) == target)
767 is_covariance_contributor (const struct covariance_accumulator *ca, const struct design_matrix *dm,
771 const struct variable *v1;
772 const struct variable *v2;
775 v1 = design_matrix_col_to_var (dm, i);
776 v2 = design_matrix_col_to_var (dm, j);
777 if (var_get_dict_index (v1) == var_get_dict_index(ca->v1))
779 if (var_get_dict_index (v2) == var_get_dict_index (ca->v2))
781 k = dm_get_exact_subscript (dm, v1, ca->val1);
784 k = dm_get_exact_subscript (dm, v2, ca->val2);
792 else if (var_get_dict_index (v1) == var_get_dict_index (ca->v2))
794 if (var_get_dict_index (v2) == var_get_dict_index (ca->v1))
796 k = dm_get_exact_subscript (dm, v1, ca->val2);
799 k = dm_get_exact_subscript (dm, v2, ca->val1);
811 get_sum (const struct covariance_matrix *cov, size_t i)
816 const struct variable *var;
817 const union value *val = NULL;
819 assert ( cov != NULL);
820 var = design_matrix_col_to_var (cov->cov, i);
823 if (var_is_alpha (var))
825 val = get_value_from_subscript (cov->cov, i);
826 k = cat_value_find (var, val);
827 return cat_get_category_count (k, var);
832 while (var_get_dict_index (cov->v_variables[k]) != var_get_dict_index (var))
836 moments1_calculate (cov->m1[k], &n, &mean, NULL, NULL, NULL);
844 update_ssize (struct design_matrix *dm, size_t i, size_t j, struct covariance_accumulator *ca)
846 const struct variable *var;
848 var = design_matrix_col_to_var (dm, i);
849 if (var_get_dict_index (ca->v1) == var_get_dict_index (var))
851 var = design_matrix_col_to_var (dm, j);
852 if (var_get_dict_index (ca->v2) == var_get_dict_index (var))
854 tmp = design_matrix_get_element (dm, i, j);
856 design_matrix_set_element (dm, i, j, tmp);
861 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
868 struct covariance_accumulator *entry;
869 struct hsh_iterator iter;
871 cov->cov = covariance_matrix_create (cov->n_variables, cov->v_variables);
872 cov->ssize = covariance_matrix_create (cov->n_variables, cov->v_variables);
873 entry = hsh_first (cov->ca, &iter);
874 while (entry != NULL)
876 entry = hsh_next (cov->ca, &iter);
879 for (i = 0; i < design_matrix_get_n_cols (cov->cov); i++)
881 sum_i = get_sum (cov, i);
882 for (j = i; j < design_matrix_get_n_cols (cov->cov); j++)
884 sum_j = get_sum (cov, j);
885 entry = hsh_first (cov->ca, &iter);
886 while (entry != NULL)
888 update_ssize (cov->ssize, i, j, entry);
890 We compute the centered, un-normalized covariance matrix.
892 if (is_covariance_contributor (entry, cov->cov, i, j))
894 design_matrix_set_element (cov->cov, i, j, entry->dot_product);
896 entry = hsh_next (cov->ca, &iter);
898 tmp = design_matrix_get_element (cov->cov, i, j);
899 tmp -= sum_i * sum_j / design_matrix_get_element (cov->ssize, i, j);
900 design_matrix_set_element (cov->cov, i, j, tmp);
901 design_matrix_set_element (cov->cov, j, i, tmp);
908 Call this function after passing the data.
911 covariance_matrix_compute (struct covariance_matrix *cov)
913 if (cov->n_pass == ONE_PASS)
915 covariance_accumulator_to_matrix (cov);
919 struct design_matrix *
920 covariance_to_design (const struct covariance_matrix *c)
929 covariance_matrix_get_n_rows (const struct covariance_matrix *c)
931 return design_matrix_get_n_rows (c->cov);
935 covariance_matrix_get_element (const struct covariance_matrix *c, size_t row, size_t col)
937 return (design_matrix_get_element (c->cov, row, col));