1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
62 const struct variable **v_variables;
66 enum mv_class missing_value;
67 void (*accumulate) (struct covariance_matrix *, const struct ccase *,
68 const struct interaction_variable **, size_t);
69 void (*update_moments) (struct covariance_matrix *, size_t, double);
74 static struct hsh_table *covariance_hsh_create (size_t *);
75 static hsh_hash_func covariance_accumulator_hash;
76 static unsigned int hash_numeric_alpha (const struct variable *,
77 const struct variable *,
78 const union value *, size_t);
79 static hsh_compare_func covariance_accumulator_compare;
80 static hsh_free_func covariance_accumulator_free;
81 static void update_moments1 (struct covariance_matrix *, size_t, double);
82 static void update_moments2 (struct covariance_matrix *, size_t, double);
83 static struct covariance_accumulator *get_new_covariance_accumulator (const
98 static void covariance_accumulate_listwise (struct covariance_matrix *,
100 const struct interaction_variable **,
102 static void covariance_accumulate_pairwise (struct covariance_matrix *,
103 const struct ccase *,
104 const struct interaction_variable **,
107 struct covariance_matrix *
108 covariance_matrix_init (size_t n_variables,
109 const struct variable *v_variables[], int n_pass,
110 int missing_handling, enum mv_class missing_value)
113 struct covariance_matrix *result = NULL;
115 result = xmalloc (sizeof (*result));
117 result->n_variables = n_variables;
118 result->ca = covariance_hsh_create (&result->n_variables);
121 result->missing_handling = missing_handling;
122 result->missing_value = missing_value;
123 result->accumulate = (result->missing_handling == LISTWISE) ?
124 covariance_accumulate_listwise : covariance_accumulate_pairwise;
125 if (n_pass == ONE_PASS)
127 result->update_moments = update_moments1;
128 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
129 for (i = 0; i < n_variables; i++)
131 result->m1[i] = moments1_create (MOMENT_MEAN);
136 result->update_moments = update_moments2;
137 result->m = xnmalloc (n_variables, sizeof (*result->m));
138 for (i = 0; i < n_variables; i++)
140 result->m[i] = moments_create (MOMENT_MEAN);
143 result->v_variables = v_variables;
145 result->n_pass = n_pass;
151 The covariances are stored in a DESIGN_MATRIX structure.
153 struct design_matrix *
154 covariance_matrix_create (size_t n_variables,
155 const struct variable *v_variables[])
157 return design_matrix_create (n_variables, v_variables,
158 (size_t) n_variables);
162 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
164 assert (cov->m1 != NULL);
165 moments1_add (cov->m1[i], x, 1.0);
169 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
171 assert (cov->m != NULL);
172 moments_pass_one (cov->m[i], x, 1.0);
176 covariance_matrix_destroy (struct covariance_matrix *cov)
180 assert (cov != NULL);
181 design_matrix_destroy (cov->cov);
182 hsh_destroy (cov->ca);
183 if (cov->n_pass == ONE_PASS)
185 for (i = 0; i < cov->n_variables; i++)
187 moments1_destroy (cov->m1[i]);
193 for (i = 0; i < cov->n_variables; i++)
195 moments_destroy (cov->m[i]);
202 Update the covariance matrix with the new entries, assuming that ROW
203 corresponds to a categorical variable and V2 is numeric.
206 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
208 const struct variable *v2, double x,
209 const union value *val2)
214 assert (var_is_numeric (v2));
216 col = design_matrix_var_to_column (cov, v2);
217 assert (val2 != NULL);
218 tmp = gsl_matrix_get (cov->m, row, col);
219 gsl_matrix_set (cov->m, row, col, (val2->f - mean) * x + tmp);
220 gsl_matrix_set (cov->m, col, row, (val2->f - mean) * x + tmp);
223 column_iterate (struct design_matrix *cov, const struct variable *v,
224 double ssize, double x, const union value *val1, size_t row)
230 const union value *tmp_val;
232 col = design_matrix_var_to_column (cov, v);
233 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
236 y = -1.0 * cat_get_category_count (i, v) / ssize;
237 tmp_val = cat_subscript_to_value (i, v);
238 if (!compare_values_short (tmp_val, val1, v))
242 tmp = gsl_matrix_get (cov->m, row, col);
243 gsl_matrix_set (cov->m, row, col, x * y + tmp);
244 gsl_matrix_set (cov->m, col, row, x * y + tmp);
249 Call this function in the second data pass. The central moments are
250 MEAN1 and MEAN2. Any categorical variables should already have their
251 values summarized in in its OBS_VALS element.
254 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
255 double ssize, const struct variable *v1,
256 const struct variable *v2, const union value *val1,
257 const union value *val2)
263 const union value *tmp_val;
265 if (var_is_alpha (v1))
267 row = design_matrix_var_to_column (cov, v1);
268 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
271 x = -1.0 * cat_get_category_count (i, v1) / ssize;
272 tmp_val = cat_subscript_to_value (i, v1);
273 if (!compare_values_short (tmp_val, val1, v1))
277 if (var_is_numeric (v2))
279 covariance_update_categorical_numeric (cov, mean2, row,
284 column_iterate (cov, v1, ssize, x, val1, row);
285 column_iterate (cov, v2, ssize, x, val2, row);
289 else if (var_is_alpha (v2))
292 Reverse the orders of V1, V2, etc. and put ourselves back
293 in the previous IF scope.
295 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
300 Both variables are numeric.
302 row = design_matrix_var_to_column (cov, v1);
303 col = design_matrix_var_to_column (cov, v2);
304 x = (val1->f - mean1) * (val2->f - mean2);
305 x += gsl_matrix_get (cov->m, col, row);
306 gsl_matrix_set (cov->m, row, col, x);
307 gsl_matrix_set (cov->m, col, row, x);
312 covariance_accumulator_hash (const void *h, const void *aux)
314 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
315 size_t *n_vars = (size_t *) aux;
318 const struct variable *v_min;
319 const struct variable *v_max;
320 const union value *val_min;
321 const union value *val_max;
324 Order everything by the variables' indices. This ensures we get the
325 same key regardless of the order in which the variables are stored
329 (var_get_dict_index (ca->v1) <
330 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
331 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
333 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
334 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
336 idx_min = var_get_dict_index (v_min);
337 idx_max = var_get_dict_index (v_max);
339 if (var_is_numeric (v_max) && var_is_numeric (v_min))
341 return (*n_vars * idx_max + idx_min);
343 if (var_is_numeric (v_max) && var_is_alpha (v_min))
345 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
347 if (var_is_alpha (v_max) && var_is_numeric (v_min))
349 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
351 if (var_is_alpha (v_max) && var_is_alpha (v_min))
353 unsigned tmp = hsh_hash_bytes (val_max, var_get_width (v_max));
354 tmp ^= hsh_hash_bytes (val_min, var_get_width (v_min));
355 tmp += *n_vars * (*n_vars + 1 + idx_max) + idx_min;
362 Make a hash table consisting of struct covariance_accumulators.
363 This allows the accumulation of the elements of a covariance matrix
364 in a single data pass. Call covariance_accumulate () for each case
367 static struct hsh_table *
368 covariance_hsh_create (size_t *n_vars)
370 return hsh_create (*n_vars * *n_vars, covariance_accumulator_compare,
371 covariance_accumulator_hash, covariance_accumulator_free,
376 covariance_accumulator_free (void *c_, const void *aux UNUSED)
378 struct covariance_accumulator *c = c_;
384 Hash comparison. Returns 0 for a match, or a non-zero int
385 otherwise. The sign of a non-zero return value *should* indicate the
386 position of C relative to the covariance_accumulator described by
387 the other arguments. But for now, it just returns 1 for any
388 non-match. This should be changed when someone figures out how to
389 compute a sensible sign for the return value.
392 match_nodes (const struct covariance_accumulator *c,
393 const struct variable *v1, const struct variable *v2,
394 const union value *val1, const union value *val2)
396 if (var_get_dict_index (v1) == var_get_dict_index (c->v1))
397 if (var_get_dict_index (v2) == var_get_dict_index (c->v2))
399 if (var_is_numeric (v1) && var_is_numeric (v2))
403 if (var_is_numeric (v1) && var_is_alpha (v2))
405 if (!compare_values_short (val2, c->val2, v2))
410 if (var_is_alpha (v1) && var_is_numeric (v2))
412 if (!compare_values_short (val1, c->val1, v1))
417 if (var_is_alpha (v1) && var_is_alpha (v2))
419 if (!compare_values_short (val1, c->val1, v1))
421 if (!compare_values_short (val2, c->val2, v2))
432 This function is meant to be used as a comparison function for
433 a struct hsh_table in src/libpspp/hash.c.
436 covariance_accumulator_compare (const void *a1_, const void *a2_,
437 const void *aux UNUSED)
439 const struct covariance_accumulator *a1 = a1_;
440 const struct covariance_accumulator *a2 = a2_;
442 if (a1 == NULL && a2 == NULL)
445 if (a1 == NULL || a2 == NULL)
448 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
452 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
453 const union value *val, size_t n_vars)
455 unsigned int result = -1u;
456 if (var_is_numeric (v1) && var_is_alpha (v2))
458 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
459 + var_get_dict_index (v2) + hsh_hash_string (val->s);
461 else if (var_is_alpha (v1) && var_is_numeric (v2))
463 result = hash_numeric_alpha (v2, v1, val, n_vars);
470 update_product (const struct variable *v1, const struct variable *v2,
471 const union value *val1, const union value *val2)
475 assert (val1 != NULL);
476 assert (val2 != NULL);
477 if (var_is_alpha (v1) && var_is_alpha (v2))
481 if (var_is_numeric (v1) && var_is_numeric (v2))
483 return (val1->f * val2->f);
485 if (var_is_numeric (v1) && var_is_alpha (v2))
489 if (var_is_numeric (v2) && var_is_alpha (v1))
491 update_product (v2, v1, val2, val1);
496 update_sum (const struct variable *var, const union value *val, double weight)
498 assert (var != NULL);
499 assert (val != NULL);
500 if (var_is_alpha (var))
506 static struct covariance_accumulator *
507 get_new_covariance_accumulator (const struct variable *v1,
508 const struct variable *v2,
509 const union value *val1,
510 const union value *val2)
512 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
514 struct covariance_accumulator *ca;
515 ca = xmalloc (sizeof (*ca));
525 static const struct variable **
526 get_covariance_variables (const struct covariance_matrix *cov)
528 return cov->v_variables;
533 update_hash_entry (struct hsh_table *c,
534 const struct variable *v1,
535 const struct variable *v2,
536 const union value *val1, const union value *val2,
537 const struct interaction_value *i_val1,
538 const struct interaction_value *i_val2)
540 struct covariance_accumulator *ca;
541 struct covariance_accumulator *new_entry;
545 iv_f1 = interaction_value_get_nonzero_entry (i_val1);
546 iv_f2 = interaction_value_get_nonzero_entry (i_val2);
547 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
548 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
549 ca->dot_product *= iv_f1 * iv_f2;
550 ca->sum1 = update_sum (ca->v1, ca->val1, iv_f1);
551 ca->sum2 = update_sum (ca->v2, ca->val2, iv_f2);
553 new_entry = hsh_insert (c, ca);
555 if (new_entry != NULL)
557 new_entry->dot_product += ca->dot_product;
558 new_entry->ssize += 1.0;
559 new_entry->sum1 += ca->sum1;
560 new_entry->sum2 += ca->sum2;
562 If DOT_PRODUCT is null, CA was not already in the hash
563 hable, so we don't free it because it was just inserted.
564 If DOT_PRODUCT was not null, CA is already in the hash table.
565 Unnecessary now, it must be freed here.
572 Compute the covariance matrix in a single data-pass. Cases with
573 missing values are dropped pairwise, in other words, only if one of
574 the two values necessary to accumulate the inner product is missing.
576 Do not call this function directly. Call it through the struct
577 covariance_matrix ACCUMULATE member function, for example,
578 cov->accumulate (cov, ccase).
581 covariance_accumulate_pairwise (struct covariance_matrix *cov,
582 const struct ccase *ccase,
583 const struct interaction_variable **i_var,
588 const union value *val1;
589 const union value *val2;
590 const struct variable **v_variables;
591 struct interaction_value *i_val1 = NULL;
592 struct interaction_value *i_val2 = NULL;
594 assert (cov != NULL);
595 assert (ccase != NULL);
597 v_variables = get_covariance_variables (cov);
598 assert (v_variables != NULL);
600 for (i = 0; i < cov->n_variables; ++i)
602 if (is_interaction (v_variables[i], i_var, n_intr))
604 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
605 val1 = interaction_value_get (i_val1);
609 val1 = case_data (ccase, v_variables[i]);
611 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
613 cat_value_update (v_variables[i], val1);
614 if (var_is_numeric (v_variables[i]))
615 cov->update_moments (cov, i, val1->f);
617 for (j = i; j < cov->n_variables; j++)
619 if (is_interaction (v_variables[j], i_var, n_intr))
621 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
622 val2 = interaction_value_get (i_val2);
626 val2 = case_data (ccase, v_variables[j]);
628 if (!var_is_value_missing
629 (v_variables[j], val2, cov->missing_value))
631 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
632 val1, val2, i_val1, i_val2);
634 update_hash_entry (cov->ca, v_variables[j],
635 v_variables[i], val2, val1, i_val2, i_val1);
643 Compute the covariance matrix in a single data-pass. Cases with
644 missing values are dropped listwise. In other words, if one of the
645 values for any variable in a case is missing, the entire case is
648 The caller must use a casefilter to remove the cases with missing
649 values before calling covariance_accumulate_listwise. This function
650 assumes that CCASE has already passed through this filter, and
651 contains no missing values.
653 Do not call this function directly. Call it through the struct
654 covariance_matrix ACCUMULATE member function, for example,
655 cov->accumulate (cov, ccase).
658 covariance_accumulate_listwise (struct covariance_matrix *cov,
659 const struct ccase *ccase,
660 const struct interaction_variable **i_var,
665 const union value *val1;
666 const union value *val2;
667 const struct variable **v_variables;
668 struct interaction_value *i_val1 = NULL;
669 struct interaction_value *i_val2 = NULL;
671 assert (cov != NULL);
672 assert (ccase != NULL);
674 v_variables = get_covariance_variables (cov);
675 assert (v_variables != NULL);
677 for (i = 0; i < cov->n_variables; ++i)
679 if (is_interaction (v_variables[i], i_var, n_intr))
681 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
682 val1 = interaction_value_get (i_val1);
686 val1 = case_data (ccase, v_variables[i]);
688 cat_value_update (v_variables[i], val1);
689 if (var_is_numeric (v_variables[i]))
690 cov->update_moments (cov, i, val1->f);
692 for (j = i; j < cov->n_variables; j++)
694 if (is_interaction (v_variables[j], i_var, n_intr))
696 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
697 val2 = interaction_value_get (i_val2);
701 val2 = case_data (ccase, v_variables[j]);
703 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
704 val1, val2, i_val1, i_val2);
706 update_hash_entry (cov->ca, v_variables[j], v_variables[i],
707 val2, val1, i_val2, i_val1);
713 Call this function during the data pass. Each case will be added to
714 a hash containing all values of the covariance matrix. After the
715 data have been passed, call covariance_matrix_compute to put the
716 values in the struct covariance_matrix.
719 covariance_matrix_accumulate (struct covariance_matrix *cov,
720 const struct ccase *ccase, void **aux, size_t n_intr)
722 cov->accumulate (cov, ccase, (const struct interaction_variable **) aux, n_intr);
726 covariance_matrix_insert (struct design_matrix *cov,
727 const struct variable *v1,
728 const struct variable *v2, const union value *val1,
729 const union value *val2, double product)
734 const union value *tmp_val;
736 assert (cov != NULL);
738 row = design_matrix_var_to_column (cov, v1);
739 if (var_is_alpha (v1))
742 tmp_val = cat_subscript_to_value (i, v1);
743 while (compare_values_short (tmp_val, val1, v1))
746 tmp_val = cat_subscript_to_value (i, v1);
749 if (var_is_numeric (v2))
751 col = design_matrix_var_to_column (cov, v2);
755 col = design_matrix_var_to_column (cov, v2);
757 tmp_val = cat_subscript_to_value (i, v1);
758 while (compare_values_short (tmp_val, val1, v1))
761 tmp_val = cat_subscript_to_value (i, v1);
768 if (var_is_numeric (v2))
770 col = design_matrix_var_to_column (cov, v2);
774 covariance_matrix_insert (cov, v2, v1, val2, val1, product);
778 gsl_matrix_set (cov->m, row, col, product);
781 static struct design_matrix *
782 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
785 struct covariance_accumulator *entry;
786 struct design_matrix *result = NULL;
787 struct hsh_iterator iter;
789 result = covariance_matrix_create (cov->n_variables, cov->v_variables);
791 entry = hsh_first (cov->ca, &iter);
793 while (entry != NULL)
796 We compute the centered, un-normalized covariance matrix.
798 tmp = entry->dot_product - entry->sum1 * entry->sum2 / entry->ssize;
799 covariance_matrix_insert (result, entry->v1, entry->v2, entry->val1,
801 entry = hsh_next (cov->ca, &iter);
808 Call this function after passing the data.
811 covariance_matrix_compute (struct covariance_matrix *cov)
813 if (cov->n_pass == ONE_PASS)
815 cov->cov = covariance_accumulator_to_matrix (cov);
819 struct design_matrix *
820 covariance_to_design (const struct covariance_matrix *c)