1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
59 struct design_matrix *ssize;
60 struct design_matrix *sums;
64 const struct variable **v_variables;
68 enum mv_class missing_value;
69 void (*accumulate) (struct covariance_matrix *, const struct ccase *,
70 const struct interaction_variable **, size_t);
71 void (*update_moments) (struct covariance_matrix *, size_t, double);
76 static struct hsh_table *covariance_hsh_create (size_t *);
77 static hsh_hash_func covariance_accumulator_hash;
78 static unsigned int hash_numeric_alpha (const struct variable *,
79 const struct variable *,
80 const union value *, size_t);
81 static hsh_compare_func covariance_accumulator_compare;
82 static hsh_free_func covariance_accumulator_free;
83 static void update_moments1 (struct covariance_matrix *, size_t, double);
84 static void update_moments2 (struct covariance_matrix *, size_t, double);
85 static struct covariance_accumulator *get_new_covariance_accumulator (const
100 static void covariance_accumulate_listwise (struct covariance_matrix *,
101 const struct ccase *,
102 const struct interaction_variable **,
104 static void covariance_accumulate_pairwise (struct covariance_matrix *,
105 const struct ccase *,
106 const struct interaction_variable **,
109 struct covariance_matrix *
110 covariance_matrix_init (size_t n_variables,
111 const struct variable *v_variables[], int n_pass,
112 int missing_handling, enum mv_class missing_value)
115 struct covariance_matrix *result = NULL;
117 result = xmalloc (sizeof (*result));
119 result->n_variables = n_variables;
120 result->ca = covariance_hsh_create (&result->n_variables);
123 result->missing_handling = missing_handling;
124 result->missing_value = missing_value;
125 result->accumulate = (result->missing_handling == LISTWISE) ?
126 covariance_accumulate_listwise : covariance_accumulate_pairwise;
127 if (n_pass == ONE_PASS)
129 result->update_moments = update_moments1;
130 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
131 for (i = 0; i < n_variables; i++)
133 result->m1[i] = moments1_create (MOMENT_MEAN);
138 result->update_moments = update_moments2;
139 result->m = xnmalloc (n_variables, sizeof (*result->m));
140 for (i = 0; i < n_variables; i++)
142 result->m[i] = moments_create (MOMENT_MEAN);
145 result->v_variables = v_variables;
147 result->n_pass = n_pass;
152 get_n_rows (size_t n_variables, size_t *v_variables[])
156 for (i = 0; i < n_variables; i++)
158 if (var_is_numeric (v_variables[i]))
162 else if (var_is_alpha (v_variables[i]))
164 size_t n_categories = cat_get_n_categories (v_variables[i]);
165 result += n_categories - 1;
171 The covariances are stored in a DESIGN_MATRIX structure.
173 struct design_matrix *
174 covariance_matrix_create (size_t n_variables,
175 const struct variable *v_variables[])
177 size_t n_rows = get_n_rows (n_variables, v_variables);
178 return design_matrix_create (n_variables, v_variables, n_rows);
182 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
184 assert (cov->m1 != NULL);
185 moments1_add (cov->m1[i], x, 1.0);
189 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
191 assert (cov->m != NULL);
192 moments_pass_one (cov->m[i], x, 1.0);
196 covariance_matrix_destroy (struct covariance_matrix *cov)
200 assert (cov != NULL);
201 design_matrix_destroy (cov->cov);
202 design_matrix_destroy (cov->ssize);
203 design_matrix_destroy (cov->sums);
204 hsh_destroy (cov->ca);
205 if (cov->n_pass == ONE_PASS)
207 for (i = 0; i < cov->n_variables; i++)
209 moments1_destroy (cov->m1[i]);
215 for (i = 0; i < cov->n_variables; i++)
217 moments_destroy (cov->m[i]);
224 Update the covariance matrix with the new entries, assuming that ROW
225 corresponds to a categorical variable and V2 is numeric.
228 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
230 const struct variable *v2, double x,
231 const union value *val2)
236 assert (var_is_numeric (v2));
238 col = design_matrix_var_to_column (cov, v2);
239 assert (val2 != NULL);
240 tmp = design_matrix_get_element (cov, row, col);
241 design_matrix_set_element (cov, row, col, (val2->f - mean) * x + tmp);
242 design_matrix_set_element (cov, col, row, (val2->f - mean) * x + tmp);
245 column_iterate (struct design_matrix *cov, const struct variable *v,
246 double ssize, double x, const union value *val1, size_t row)
252 const union value *tmp_val;
254 col = design_matrix_var_to_column (cov, v);
255 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
258 y = -1.0 * cat_get_category_count (i, v) / ssize;
259 tmp_val = cat_subscript_to_value (i, v);
260 if (!compare_values_short (tmp_val, val1, v))
264 tmp = design_matrix_get_element (cov, row, col);
265 design_matrix_set_element (cov, row, col, x * y + tmp);
266 design_matrix_set_element (cov, col, row, x * y + tmp);
271 Call this function in the second data pass. The central moments are
272 MEAN1 and MEAN2. Any categorical variables should already have their
273 values summarized in in its OBS_VALS element.
276 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
277 double ssize, const struct variable *v1,
278 const struct variable *v2, const union value *val1,
279 const union value *val2)
285 const union value *tmp_val;
287 if (var_is_alpha (v1))
289 row = design_matrix_var_to_column (cov, v1);
290 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
293 x = -1.0 * cat_get_category_count (i, v1) / ssize;
294 tmp_val = cat_subscript_to_value (i, v1);
295 if (!compare_values_short (tmp_val, val1, v1))
299 if (var_is_numeric (v2))
301 covariance_update_categorical_numeric (cov, mean2, row,
306 column_iterate (cov, v1, ssize, x, val1, row);
307 column_iterate (cov, v2, ssize, x, val2, row);
311 else if (var_is_alpha (v2))
314 Reverse the orders of V1, V2, etc. and put ourselves back
315 in the previous IF scope.
317 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
322 Both variables are numeric.
324 row = design_matrix_var_to_column (cov, v1);
325 col = design_matrix_var_to_column (cov, v2);
326 x = (val1->f - mean1) * (val2->f - mean2);
327 x += design_matrix_get_element (cov, col, row);
328 design_matrix_set_element (cov, row, col, x);
329 design_matrix_set_element (cov, col, row, x);
334 covariance_accumulator_hash (const void *h, const void *aux)
336 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
337 size_t *n_vars = (size_t *) aux;
340 const struct variable *v_min;
341 const struct variable *v_max;
342 const union value *val_min;
343 const union value *val_max;
346 Order everything by the variables' indices. This ensures we get the
347 same key regardless of the order in which the variables are stored
351 (var_get_dict_index (ca->v1) <
352 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
353 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
355 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
356 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
358 idx_min = var_get_dict_index (v_min);
359 idx_max = var_get_dict_index (v_max);
361 if (var_is_numeric (v_max) && var_is_numeric (v_min))
363 return (*n_vars * idx_max + idx_min);
365 if (var_is_numeric (v_max) && var_is_alpha (v_min))
367 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
369 if (var_is_alpha (v_max) && var_is_numeric (v_min))
371 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
373 if (var_is_alpha (v_max) && var_is_alpha (v_min))
375 unsigned tmp = hsh_hash_bytes (val_max, var_get_width (v_max));
376 tmp ^= hsh_hash_bytes (val_min, var_get_width (v_min));
377 tmp += *n_vars * (*n_vars + 1 + idx_max) + idx_min;
384 Make a hash table consisting of struct covariance_accumulators.
385 This allows the accumulation of the elements of a covariance matrix
386 in a single data pass. Call covariance_accumulate () for each case
389 static struct hsh_table *
390 covariance_hsh_create (size_t *n_vars)
392 return hsh_create (*n_vars * *n_vars, covariance_accumulator_compare,
393 covariance_accumulator_hash, covariance_accumulator_free,
398 covariance_accumulator_free (void *c_, const void *aux UNUSED)
400 struct covariance_accumulator *c = c_;
406 Hash comparison. Returns 0 for a match, or a non-zero int
407 otherwise. The sign of a non-zero return value *should* indicate the
408 position of C relative to the covariance_accumulator described by
409 the other arguments. But for now, it just returns 1 for any
410 non-match. This should be changed when someone figures out how to
411 compute a sensible sign for the return value.
414 match_nodes (const struct covariance_accumulator *c,
415 const struct variable *v1, const struct variable *v2,
416 const union value *val1, const union value *val2)
418 if (var_get_dict_index (v1) == var_get_dict_index (c->v1))
419 if (var_get_dict_index (v2) == var_get_dict_index (c->v2))
421 if (var_is_numeric (v1) && var_is_numeric (v2))
425 if (var_is_numeric (v1) && var_is_alpha (v2))
427 if (!compare_values_short (val2, c->val2, v2))
432 if (var_is_alpha (v1) && var_is_numeric (v2))
434 if (!compare_values_short (val1, c->val1, v1))
439 if (var_is_alpha (v1) && var_is_alpha (v2))
441 if (!compare_values_short (val1, c->val1, v1))
443 if (!compare_values_short (val2, c->val2, v2))
454 This function is meant to be used as a comparison function for
455 a struct hsh_table in src/libpspp/hash.c.
458 covariance_accumulator_compare (const void *a1_, const void *a2_,
459 const void *aux UNUSED)
461 const struct covariance_accumulator *a1 = a1_;
462 const struct covariance_accumulator *a2 = a2_;
464 if (a1 == NULL && a2 == NULL)
467 if (a1 == NULL || a2 == NULL)
470 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
474 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
475 const union value *val, size_t n_vars)
477 unsigned int result = -1u;
478 if (var_is_numeric (v1) && var_is_alpha (v2))
480 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
481 + var_get_dict_index (v2) + hsh_hash_string (val->s);
483 else if (var_is_alpha (v1) && var_is_numeric (v2))
485 result = hash_numeric_alpha (v2, v1, val, n_vars);
492 update_product (const struct variable *v1, const struct variable *v2,
493 const union value *val1, const union value *val2)
497 assert (val1 != NULL);
498 assert (val2 != NULL);
499 if (var_is_alpha (v1) && var_is_alpha (v2))
503 if (var_is_numeric (v1) && var_is_numeric (v2))
505 return (val1->f * val2->f);
507 if (var_is_numeric (v1) && var_is_alpha (v2))
511 if (var_is_numeric (v2) && var_is_alpha (v1))
513 update_product (v2, v1, val2, val1);
518 update_sum (const struct variable *var, const union value *val, double weight)
520 assert (var != NULL);
521 assert (val != NULL);
522 if (var_is_alpha (var))
528 static struct covariance_accumulator *
529 get_new_covariance_accumulator (const struct variable *v1,
530 const struct variable *v2,
531 const union value *val1,
532 const union value *val2)
534 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
536 struct covariance_accumulator *ca;
537 ca = xmalloc (sizeof (*ca));
547 static const struct variable **
548 get_covariance_variables (const struct covariance_matrix *cov)
550 return cov->v_variables;
554 update_hash_entry (struct hsh_table *c,
555 const struct variable *v1,
556 const struct variable *v2,
557 const union value *val1, const union value *val2,
558 const struct interaction_value *i_val1,
559 const struct interaction_value *i_val2)
561 struct covariance_accumulator *ca;
562 struct covariance_accumulator *new_entry;
566 iv_f1 = interaction_value_get_nonzero_entry (i_val1);
567 iv_f2 = interaction_value_get_nonzero_entry (i_val2);
568 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
569 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
570 ca->dot_product *= iv_f1 * iv_f2;
571 ca->sum1 = update_sum (ca->v1, ca->val1, iv_f1);
572 ca->sum2 = update_sum (ca->v2, ca->val2, iv_f2);
574 new_entry = hsh_insert (c, ca);
576 if (new_entry != NULL)
578 new_entry->dot_product += ca->dot_product;
579 new_entry->ssize += 1.0;
580 new_entry->sum1 += ca->sum1;
581 new_entry->sum2 += ca->sum2;
583 If DOT_PRODUCT is null, CA was not already in the hash
584 hable, so we don't free it because it was just inserted.
585 If DOT_PRODUCT was not null, CA is already in the hash table.
586 Unnecessary now, it must be freed here.
593 Compute the covariance matrix in a single data-pass. Cases with
594 missing values are dropped pairwise, in other words, only if one of
595 the two values necessary to accumulate the inner product is missing.
597 Do not call this function directly. Call it through the struct
598 covariance_matrix ACCUMULATE member function, for example,
599 cov->accumulate (cov, ccase).
602 covariance_accumulate_pairwise (struct covariance_matrix *cov,
603 const struct ccase *ccase,
604 const struct interaction_variable **i_var,
609 const union value *val1;
610 const union value *val2;
611 const struct variable **v_variables;
612 struct interaction_value *i_val1 = NULL;
613 struct interaction_value *i_val2 = NULL;
615 assert (cov != NULL);
616 assert (ccase != NULL);
618 v_variables = get_covariance_variables (cov);
619 assert (v_variables != NULL);
621 for (i = 0; i < cov->n_variables; ++i)
623 if (is_interaction (v_variables[i], i_var, n_intr))
625 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
626 val1 = interaction_value_get (i_val1);
630 val1 = case_data (ccase, v_variables[i]);
632 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
634 cat_value_update (v_variables[i], val1);
635 if (var_is_numeric (v_variables[i]))
636 cov->update_moments (cov, i, val1->f);
638 for (j = i; j < cov->n_variables; j++)
640 if (is_interaction (v_variables[j], i_var, n_intr))
642 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
643 val2 = interaction_value_get (i_val2);
647 val2 = case_data (ccase, v_variables[j]);
649 if (!var_is_value_missing
650 (v_variables[j], val2, cov->missing_value))
652 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
653 val1, val2, i_val1, i_val2);
655 update_hash_entry (cov->ca, v_variables[j],
656 v_variables[i], val2, val1, i_val2, i_val1);
664 Compute the covariance matrix in a single data-pass. Cases with
665 missing values are dropped listwise. In other words, if one of the
666 values for any variable in a case is missing, the entire case is
669 The caller must use a casefilter to remove the cases with missing
670 values before calling covariance_accumulate_listwise. This function
671 assumes that CCASE has already passed through this filter, and
672 contains no missing values.
674 Do not call this function directly. Call it through the struct
675 covariance_matrix ACCUMULATE member function, for example,
676 cov->accumulate (cov, ccase).
679 covariance_accumulate_listwise (struct covariance_matrix *cov,
680 const struct ccase *ccase,
681 const struct interaction_variable **i_var,
686 const union value *val1;
687 const union value *val2;
688 const struct variable **v_variables;
689 struct interaction_value *i_val1 = NULL;
690 struct interaction_value *i_val2 = NULL;
692 assert (cov != NULL);
693 assert (ccase != NULL);
695 v_variables = get_covariance_variables (cov);
696 assert (v_variables != NULL);
698 for (i = 0; i < cov->n_variables; ++i)
700 if (is_interaction (v_variables[i], i_var, n_intr))
702 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
703 val1 = interaction_value_get (i_val1);
707 val1 = case_data (ccase, v_variables[i]);
709 cat_value_update (v_variables[i], val1);
710 if (var_is_numeric (v_variables[i]))
711 cov->update_moments (cov, i, val1->f);
713 for (j = i; j < cov->n_variables; j++)
715 if (is_interaction (v_variables[j], i_var, n_intr))
717 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
718 val2 = interaction_value_get (i_val2);
722 val2 = case_data (ccase, v_variables[j]);
724 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
725 val1, val2, i_val1, i_val2);
727 update_hash_entry (cov->ca, v_variables[j], v_variables[i],
728 val2, val1, i_val2, i_val1);
734 Call this function during the data pass. Each case will be added to
735 a hash containing all values of the covariance matrix. After the
736 data have been passed, call covariance_matrix_compute to put the
737 values in the struct covariance_matrix.
740 covariance_matrix_accumulate (struct covariance_matrix *cov,
741 const struct ccase *ccase, void **aux, size_t n_intr)
743 cov->accumulate (cov, ccase, (const struct interaction_variable **) aux, n_intr);
746 If VAR is categorical with d categories, its first category should
747 correspond to the origin in d-dimensional Euclidean space.
750 is_origin (const struct variable *var, const union value *val)
752 if (cat_value_find (var, val) == 0)
760 Return the subscript of the column of the design matrix
761 corresponding to VAL. If VAR is categorical with d categories, its
762 first category should correspond to the origin in d-dimensional
763 Euclidean space, so there is no subscript for this value.
766 get_exact_subscript (const struct design_matrix *dm, const struct variable *var,
767 const union value *val)
771 if (is_origin (var, val))
776 result = design_matrix_var_to_column (dm, var);
777 if (var_is_alpha (var))
779 result += cat_value_find (var, val) - 1;
785 covariance_matrix_insert (struct design_matrix *cov,
786 const struct variable *v1,
787 const struct variable *v2, const union value *val1,
788 const union value *val2, double product)
793 assert (cov != NULL);
795 row = get_exact_subscript (cov, v1, val1);
796 col = get_exact_subscript (cov, v2, val2);
797 if (row != -1u && col != -1u)
799 design_matrix_set_element (cov, row, col, product);
805 is_covariance_contributor (const struct covariance_accumulator *ca, const struct design_matrix *dm,
809 const struct variable *v1;
810 const struct variable *v2;
813 v1 = design_matrix_col_to_var (dm, i);
814 if (var_get_dict_index (v1) == var_get_dict_index(ca->v1))
816 v2 = design_matrix_col_to_var (dm, j);
817 if (var_get_dict_index (v2) == var_get_dict_index (ca->v2))
819 k = get_exact_subscript (dm, v1, ca->val1);
822 k = get_exact_subscript (dm, v2, ca->val2);
833 get_sum (const struct covariance_matrix *cov, size_t i)
836 const struct variable *var;
837 const union value *val = NULL;
838 struct covariance_accumulator ca;
839 struct covariance_accumulator *c;
841 assert ( cov != NULL);
842 var = design_matrix_col_to_var (cov->cov, i);
845 if (var_is_alpha (var))
847 k = design_matrix_var_to_column (cov->cov, var);
849 val = cat_subscript_to_value (i, var);
855 c = (struct covariance_accumulator *) hsh_find (cov->ca, &ca);
864 update_ssize (struct design_matrix *dm, size_t i, size_t j, struct covariance_accumulator *ca)
866 struct variable *var;
868 var = design_matrix_col_to_var (dm, i);
869 if (var_get_dict_index (ca->v1) == var_get_dict_index (var))
871 var = design_matrix_col_to_var (dm, j);
872 if (var_get_dict_index (ca->v2) == var_get_dict_index (var))
874 tmp = design_matrix_get_element (dm, i, j);
876 design_matrix_set_element (dm, i, j, tmp);
881 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
888 struct covariance_accumulator *entry;
889 struct hsh_iterator iter;
891 cov->cov = covariance_matrix_create (cov->n_variables, cov->v_variables);
892 cov->ssize = covariance_matrix_create (cov->n_variables, cov->v_variables);
893 cov->sums = covariance_matrix_create (cov->n_variables, cov->v_variables);
894 for (i = 0; i < design_matrix_get_n_cols (cov->cov); i++)
896 sum_i = get_sum (cov, i);
897 for (j = i; j < design_matrix_get_n_cols (cov->cov); j++)
899 sum_j = get_sum (cov, j);
900 entry = hsh_first (cov->ca, &iter);
901 design_matrix_set_element (cov->sums, i, j, sum_i);
902 while (entry != NULL)
904 update_ssize (cov->ssize, i, j, entry);
906 We compute the centered, un-normalized covariance matrix.
908 if (is_covariance_contributor (entry, cov->cov, i, j))
910 covariance_matrix_insert (cov->cov, entry->v1, entry->v2, entry->val1,
911 entry->val2, entry->dot_product);
913 entry = hsh_next (cov->ca, &iter);
915 tmp = design_matrix_get_element (cov->cov, i, j);
916 tmp -= sum_i * sum_j / design_matrix_get_element (cov->ssize, i, j);
917 design_matrix_set_element (cov->cov, i, j, tmp);
924 Call this function after passing the data.
927 covariance_matrix_compute (struct covariance_matrix *cov)
929 if (cov->n_pass == ONE_PASS)
931 covariance_accumulator_to_matrix (cov);
935 struct design_matrix *
936 covariance_to_design (const struct covariance_matrix *c)
946 covariance_matrix_get_element (const struct covariance_matrix *c, size_t row, size_t col)
948 return (design_matrix_get_element (c->cov, row, col));