1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
62 const struct variable **v_variables;
66 enum mv_class missing_value;
67 void (*accumulate) (struct covariance_matrix *, const struct ccase *,
68 const struct interaction_variable **, size_t);
69 void (*update_moments) (struct covariance_matrix *, size_t, double);
74 static struct hsh_table *covariance_hsh_create (size_t *);
75 static hsh_hash_func covariance_accumulator_hash;
76 static unsigned int hash_numeric_alpha (const struct variable *,
77 const struct variable *,
78 const union value *, size_t);
79 static hsh_compare_func covariance_accumulator_compare;
80 static hsh_free_func covariance_accumulator_free;
81 static void update_moments1 (struct covariance_matrix *, size_t, double);
82 static void update_moments2 (struct covariance_matrix *, size_t, double);
83 static struct covariance_accumulator *get_new_covariance_accumulator (const
98 static void covariance_accumulate_listwise (struct covariance_matrix *,
100 const struct interaction_variable **,
102 static void covariance_accumulate_pairwise (struct covariance_matrix *,
103 const struct ccase *,
104 const struct interaction_variable **,
107 struct covariance_matrix *
108 covariance_matrix_init (size_t n_variables,
109 const struct variable *v_variables[], int n_pass,
110 int missing_handling, enum mv_class missing_value)
113 struct covariance_matrix *result = NULL;
115 result = xmalloc (sizeof (*result));
117 result->n_variables = n_variables;
118 result->ca = covariance_hsh_create (&result->n_variables);
121 result->missing_handling = missing_handling;
122 result->missing_value = missing_value;
123 result->accumulate = (result->missing_handling == LISTWISE) ?
124 covariance_accumulate_listwise : covariance_accumulate_pairwise;
125 if (n_pass == ONE_PASS)
127 result->update_moments = update_moments1;
128 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
129 for (i = 0; i < n_variables; i++)
131 result->m1[i] = moments1_create (MOMENT_MEAN);
136 result->update_moments = update_moments2;
137 result->m = xnmalloc (n_variables, sizeof (*result->m));
138 for (i = 0; i < n_variables; i++)
140 result->m[i] = moments_create (MOMENT_MEAN);
143 result->v_variables = v_variables;
145 result->n_pass = n_pass;
151 The covariances are stored in a DESIGN_MATRIX structure.
153 struct design_matrix *
154 covariance_matrix_create (size_t n_variables,
155 const struct variable *v_variables[])
157 return design_matrix_create (n_variables, v_variables,
158 (size_t) n_variables);
162 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
164 assert (cov->m1 != NULL);
165 moments1_add (cov->m1[i], x, 1.0);
169 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
171 assert (cov->m != NULL);
172 moments_pass_one (cov->m[i], x, 1.0);
176 covariance_matrix_destroy (struct covariance_matrix *cov)
180 assert (cov != NULL);
181 design_matrix_destroy (cov->cov);
182 hsh_destroy (cov->ca);
183 if (cov->n_pass == ONE_PASS)
185 for (i = 0; i < cov->n_variables; i++)
187 moments1_destroy (cov->m1[i]);
193 for (i = 0; i < cov->n_variables; i++)
195 moments_destroy (cov->m[i]);
202 Update the covariance matrix with the new entries, assuming that ROW
203 corresponds to a categorical variable and V2 is numeric.
206 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
208 const struct variable *v2, double x,
209 const union value *val2)
214 assert (var_is_numeric (v2));
216 col = design_matrix_var_to_column (cov, v2);
217 assert (val2 != NULL);
218 tmp = gsl_matrix_get (cov->m, row, col);
219 gsl_matrix_set (cov->m, row, col, (val2->f - mean) * x + tmp);
220 gsl_matrix_set (cov->m, col, row, (val2->f - mean) * x + tmp);
223 column_iterate (struct design_matrix *cov, const struct variable *v,
224 double ssize, double x, const union value *val1, size_t row)
230 const union value *tmp_val;
232 col = design_matrix_var_to_column (cov, v);
233 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
236 y = -1.0 * cat_get_category_count (i, v) / ssize;
237 tmp_val = cat_subscript_to_value (i, v);
238 if (!compare_values_short (tmp_val, val1, v))
242 tmp = gsl_matrix_get (cov->m, row, col);
243 gsl_matrix_set (cov->m, row, col, x * y + tmp);
244 gsl_matrix_set (cov->m, col, row, x * y + tmp);
249 Call this function in the second data pass. The central moments are
250 MEAN1 and MEAN2. Any categorical variables should already have their
251 values summarized in in its OBS_VALS element.
254 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
255 double ssize, const struct variable *v1,
256 const struct variable *v2, const union value *val1,
257 const union value *val2)
263 const union value *tmp_val;
265 if (var_is_alpha (v1))
267 row = design_matrix_var_to_column (cov, v1);
268 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
271 x = -1.0 * cat_get_category_count (i, v1) / ssize;
272 tmp_val = cat_subscript_to_value (i, v1);
273 if (!compare_values_short (tmp_val, val1, v1))
277 if (var_is_numeric (v2))
279 covariance_update_categorical_numeric (cov, mean2, row,
284 column_iterate (cov, v1, ssize, x, val1, row);
285 column_iterate (cov, v2, ssize, x, val2, row);
289 else if (var_is_alpha (v2))
292 Reverse the orders of V1, V2, etc. and put ourselves back
293 in the previous IF scope.
295 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
300 Both variables are numeric.
302 row = design_matrix_var_to_column (cov, v1);
303 col = design_matrix_var_to_column (cov, v2);
304 x = (val1->f - mean1) * (val2->f - mean2);
305 x += gsl_matrix_get (cov->m, col, row);
306 gsl_matrix_set (cov->m, row, col, x);
307 gsl_matrix_set (cov->m, col, row, x);
312 covariance_accumulator_hash (const void *h, const void *aux)
314 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
315 size_t *n_vars = (size_t *) aux;
318 const struct variable *v_min;
319 const struct variable *v_max;
320 const union value *val_min;
321 const union value *val_max;
324 Order everything by the variables' indices. This ensures we get the
325 same key regardless of the order in which the variables are stored
329 (var_get_dict_index (ca->v1) <
330 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
331 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
333 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
334 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
336 idx_min = var_get_dict_index (v_min);
337 idx_max = var_get_dict_index (v_max);
339 if (var_is_numeric (v_max) && var_is_numeric (v_min))
341 return (*n_vars * idx_max + idx_min);
343 if (var_is_numeric (v_max) && var_is_alpha (v_min))
345 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
347 if (var_is_alpha (v_max) && var_is_numeric (v_min))
349 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
351 if (var_is_alpha (v_max) && var_is_alpha (v_min))
353 unsigned tmp = hsh_hash_bytes (val_max, var_get_width (v_max));
354 tmp ^= hsh_hash_bytes (val_min, var_get_width (v_min));
355 tmp += *n_vars * (*n_vars + 1 + idx_max) + idx_min;
362 Make a hash table consisting of struct covariance_accumulators.
363 This allows the accumulation of the elements of a covariance matrix
364 in a single data pass. Call covariance_accumulate () for each case
367 static struct hsh_table *
368 covariance_hsh_create (size_t *n_vars)
370 return hsh_create (*n_vars * *n_vars, covariance_accumulator_compare,
371 covariance_accumulator_hash, covariance_accumulator_free,
376 covariance_accumulator_free (void *c_, const void *aux UNUSED)
378 struct covariance_accumulator *c = c_;
384 Hash comparison. Returns 0 for a match, or a non-zero int
385 otherwise. The sign of a non-zero return value *should* indicate the
386 position of C relative to the covariance_accumulator described by
387 the other arguments. But for now, it just returns 1 for any
388 non-match. This should be changed when someone figures out how to
389 compute a sensible sign for the return value.
392 match_nodes (const struct covariance_accumulator *c,
393 const struct variable *v1, const struct variable *v2,
394 const union value *val1, const union value *val2)
396 if (var_get_dict_index (v1) == var_get_dict_index (c->v1))
397 if (var_get_dict_index (v2) == var_get_dict_index (c->v2))
399 if (var_is_numeric (v1) && var_is_numeric (v2))
403 if (var_is_numeric (v1) && var_is_alpha (v2))
405 if (!compare_values_short (val2, c->val2, v2))
410 if (var_is_alpha (v1) && var_is_numeric (v2))
412 if (!compare_values_short (val1, c->val1, v1))
417 if (var_is_alpha (v1) && var_is_alpha (v2))
419 if (!compare_values_short (val1, c->val1, v1))
421 if (!compare_values_short (val2, c->val2, v2))
432 This function is meant to be used as a comparison function for
433 a struct hsh_table in src/libpspp/hash.c.
436 covariance_accumulator_compare (const void *a1_, const void *a2_,
437 const void *aux UNUSED)
439 const struct covariance_accumulator *a1 = a1_;
440 const struct covariance_accumulator *a2 = a2_;
442 if (a1 == NULL && a2 == NULL)
445 if (a1 == NULL || a2 == NULL)
448 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
452 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
453 const union value *val, size_t n_vars)
455 unsigned int result = -1u;
456 if (var_is_numeric (v1) && var_is_alpha (v2))
458 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
459 + var_get_dict_index (v2) + hsh_hash_string (val->s);
461 else if (var_is_alpha (v1) && var_is_numeric (v2))
463 result = hash_numeric_alpha (v2, v1, val, n_vars);
470 update_product (const struct variable *v1, const struct variable *v2,
471 const union value *val1, const union value *val2)
475 assert (val1 != NULL);
476 assert (val2 != NULL);
477 if (var_is_alpha (v1) && var_is_alpha (v2))
481 if (var_is_numeric (v1) && var_is_numeric (v2))
483 return (val1->f * val2->f);
485 if (var_is_numeric (v1) && var_is_alpha (v2))
489 if (var_is_numeric (v2) && var_is_alpha (v1))
491 update_product (v2, v1, val2, val1);
496 update_sum (const struct variable *var, const union value *val, double weight)
498 assert (var != NULL);
499 assert (val != NULL);
500 if (var_is_alpha (var))
506 static struct covariance_accumulator *
507 get_new_covariance_accumulator (const struct variable *v1,
508 const struct variable *v2,
509 const union value *val1,
510 const union value *val2)
512 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
514 struct covariance_accumulator *ca;
515 ca = xmalloc (sizeof (*ca));
525 static const struct variable **
526 get_covariance_variables (const struct covariance_matrix *cov)
528 return cov->v_variables;
532 update_hash_entry (struct hsh_table *c,
533 const struct variable *v1,
534 const struct variable *v2,
535 const union value *val1, const union value *val2,
536 const struct interaction_value *i_val1,
537 const struct interaction_value *i_val2)
539 struct covariance_accumulator *ca;
540 struct covariance_accumulator *new_entry;
544 iv_f1 = interaction_value_get_nonzero_entry (i_val1);
545 iv_f2 = interaction_value_get_nonzero_entry (i_val2);
546 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
547 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
548 ca->dot_product *= iv_f1 * iv_f2;
549 ca->sum1 = update_sum (ca->v1, ca->val1, iv_f1);
550 ca->sum2 = update_sum (ca->v2, ca->val2, iv_f2);
552 new_entry = hsh_insert (c, ca);
554 if (new_entry != NULL)
556 new_entry->dot_product += ca->dot_product;
557 new_entry->ssize += 1.0;
558 new_entry->sum1 += ca->sum1;
559 new_entry->sum2 += ca->sum2;
561 If DOT_PRODUCT is null, CA was not already in the hash
562 hable, so we don't free it because it was just inserted.
563 If DOT_PRODUCT was not null, CA is already in the hash table.
564 Unnecessary now, it must be freed here.
571 Compute the covariance matrix in a single data-pass. Cases with
572 missing values are dropped pairwise, in other words, only if one of
573 the two values necessary to accumulate the inner product is missing.
575 Do not call this function directly. Call it through the struct
576 covariance_matrix ACCUMULATE member function, for example,
577 cov->accumulate (cov, ccase).
580 covariance_accumulate_pairwise (struct covariance_matrix *cov,
581 const struct ccase *ccase,
582 const struct interaction_variable **i_var,
587 const union value *val1;
588 const union value *val2;
589 const struct variable **v_variables;
590 struct interaction_value *i_val1 = NULL;
591 struct interaction_value *i_val2 = NULL;
593 assert (cov != NULL);
594 assert (ccase != NULL);
596 v_variables = get_covariance_variables (cov);
597 assert (v_variables != NULL);
599 for (i = 0; i < cov->n_variables; ++i)
601 if (is_interaction (v_variables[i], i_var, n_intr))
603 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
604 val1 = interaction_value_get (i_val1);
608 val1 = case_data (ccase, v_variables[i]);
610 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
612 cat_value_update (v_variables[i], val1);
613 if (var_is_numeric (v_variables[i]))
614 cov->update_moments (cov, i, val1->f);
616 for (j = i; j < cov->n_variables; j++)
618 if (is_interaction (v_variables[j], i_var, n_intr))
620 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
621 val2 = interaction_value_get (i_val2);
625 val2 = case_data (ccase, v_variables[j]);
627 if (!var_is_value_missing
628 (v_variables[j], val2, cov->missing_value))
630 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
631 val1, val2, i_val1, i_val2);
633 update_hash_entry (cov->ca, v_variables[j],
634 v_variables[i], val2, val1, i_val2, i_val1);
642 Compute the covariance matrix in a single data-pass. Cases with
643 missing values are dropped listwise. In other words, if one of the
644 values for any variable in a case is missing, the entire case is
647 The caller must use a casefilter to remove the cases with missing
648 values before calling covariance_accumulate_listwise. This function
649 assumes that CCASE has already passed through this filter, and
650 contains no missing values.
652 Do not call this function directly. Call it through the struct
653 covariance_matrix ACCUMULATE member function, for example,
654 cov->accumulate (cov, ccase).
657 covariance_accumulate_listwise (struct covariance_matrix *cov,
658 const struct ccase *ccase,
659 const struct interaction_variable **i_var,
664 const union value *val1;
665 const union value *val2;
666 const struct variable **v_variables;
667 struct interaction_value *i_val1 = NULL;
668 struct interaction_value *i_val2 = NULL;
670 assert (cov != NULL);
671 assert (ccase != NULL);
673 v_variables = get_covariance_variables (cov);
674 assert (v_variables != NULL);
676 for (i = 0; i < cov->n_variables; ++i)
678 if (is_interaction (v_variables[i], i_var, n_intr))
680 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
681 val1 = interaction_value_get (i_val1);
685 val1 = case_data (ccase, v_variables[i]);
687 cat_value_update (v_variables[i], val1);
688 if (var_is_numeric (v_variables[i]))
689 cov->update_moments (cov, i, val1->f);
691 for (j = i; j < cov->n_variables; j++)
693 if (is_interaction (v_variables[j], i_var, n_intr))
695 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
696 val2 = interaction_value_get (i_val2);
700 val2 = case_data (ccase, v_variables[j]);
702 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
703 val1, val2, i_val1, i_val2);
705 update_hash_entry (cov->ca, v_variables[j], v_variables[i],
706 val2, val1, i_val2, i_val1);
712 Call this function during the data pass. Each case will be added to
713 a hash containing all values of the covariance matrix. After the
714 data have been passed, call covariance_matrix_compute to put the
715 values in the struct covariance_matrix.
718 covariance_matrix_accumulate (struct covariance_matrix *cov,
719 const struct ccase *ccase, void **aux, size_t n_intr)
721 cov->accumulate (cov, ccase, (const struct interaction_variable **) aux, n_intr);
724 If VAR is categorical with d categories, its first category should
725 correspond to the origin in d-dimensional Euclidean space.
728 is_origin (const struct variable *var, const union value *val)
730 if (cat_value_find (var, val) == 0)
738 Return the subscript of the column of the design matrix
739 corresponding to VAL. If VAR is categorical with d categories, its
740 first category should correspond to the origin in d-dimensional
741 Euclidean space, so there is no subscript for this value.
744 get_exact_subscript (const struct design_matrix *dm, const struct variable *var,
745 const union value *val)
749 if (is_origin (var, val))
754 result = design_matrix_var_to_column (dm, var);
755 if (var_is_alpha (var))
757 result += cat_value_find (var, val) - 1;
763 covariance_matrix_insert (struct design_matrix *cov,
764 const struct variable *v1,
765 const struct variable *v2, const union value *val1,
766 const union value *val2, double product)
771 const union value *tmp_val;
773 assert (cov != NULL);
775 row = get_exact_subscript (cov, v1, val1);
776 col = get_exact_subscript (cov, v2, val2);
777 if (row != -1u && col != -1u)
779 gsl_matrix_set (cov->m, row, col, product);
785 is_covariance_contributor (const struct covariance_accumulator *ca, const struct design_matrix *dm,
789 const struct variable *v1;
790 const struct variable *v2;
793 v1 = design_matrix_col_to_var (dm, i);
794 if (var_get_dict_index (v1) == var_get_dict_index(ca->v1))
796 v2 = design_matrix_col_to_var (dm, j);
797 if (var_get_dict_index (v2) == var_get_dict_index (ca->v2))
799 k = get_exact_subscript (dm, v1, ca->val1);
802 k = get_exact_subscript (dm, v2, ca->val2);
813 static struct design_matrix *
814 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
819 struct covariance_accumulator *entry;
820 struct design_matrix *result = NULL;
821 struct hsh_iterator iter;
823 result = covariance_matrix_create (cov->n_variables, cov->v_variables);
825 for (i = 0; i < design_matrix_get_n_cols (result); i++)
827 for (j = i; j < design_matrix_get_n_cols (result); j++)
829 entry = hsh_first (cov->ca, &iter);
831 while (entry != NULL)
834 We compute the centered, un-normalized covariance matrix.
836 if (is_covariance_contributor (entry, result, i, j))
838 tmp = entry->dot_product - entry->sum1 * entry->sum2 / entry->ssize;
839 covariance_matrix_insert (result, entry->v1, entry->v2, entry->val1,
842 entry = hsh_next (cov->ca, &iter);
851 Call this function after passing the data.
854 covariance_matrix_compute (struct covariance_matrix *cov)
856 if (cov->n_pass == ONE_PASS)
858 cov->cov = covariance_accumulator_to_matrix (cov);
862 struct design_matrix *
863 covariance_to_design (const struct covariance_matrix *c)