1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
62 const struct variable **v_variables;
66 enum mv_class missing_value;
67 void (*accumulate) (struct covariance_matrix *, const struct ccase *,
68 const struct interaction_variable **, size_t);
69 void (*update_moments) (struct covariance_matrix *, size_t, double);
74 static struct hsh_table *covariance_hsh_create (size_t *);
75 static hsh_hash_func covariance_accumulator_hash;
76 static unsigned int hash_numeric_alpha (const struct variable *,
77 const struct variable *,
78 const union value *, size_t);
79 static hsh_compare_func covariance_accumulator_compare;
80 static hsh_free_func covariance_accumulator_free;
81 static void update_moments1 (struct covariance_matrix *, size_t, double);
82 static void update_moments2 (struct covariance_matrix *, size_t, double);
83 static struct covariance_accumulator *get_new_covariance_accumulator (const
98 static void covariance_accumulate_listwise (struct covariance_matrix *,
100 const struct interaction_variable **,
102 static void covariance_accumulate_pairwise (struct covariance_matrix *,
103 const struct ccase *,
104 const struct interaction_variable **,
107 struct covariance_matrix *
108 covariance_matrix_init (size_t n_variables,
109 const struct variable *v_variables[], int n_pass,
110 int missing_handling, enum mv_class missing_value)
113 struct covariance_matrix *result = NULL;
115 result = xmalloc (sizeof (*result));
117 result->n_variables = n_variables;
118 result->ca = covariance_hsh_create (&result->n_variables);
121 result->missing_handling = missing_handling;
122 result->missing_value = missing_value;
123 result->accumulate = (result->missing_handling == LISTWISE) ?
124 covariance_accumulate_listwise : covariance_accumulate_pairwise;
125 if (n_pass == ONE_PASS)
127 result->update_moments = update_moments1;
128 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
129 for (i = 0; i < n_variables; i++)
131 result->m1[i] = moments1_create (MOMENT_MEAN);
136 result->update_moments = update_moments2;
137 result->m = xnmalloc (n_variables, sizeof (*result->m));
138 for (i = 0; i < n_variables; i++)
140 result->m[i] = moments_create (MOMENT_MEAN);
143 result->v_variables = v_variables;
145 result->n_pass = n_pass;
151 The covariances are stored in a DESIGN_MATRIX structure.
153 struct design_matrix *
154 covariance_matrix_create (size_t n_variables,
155 const struct variable *v_variables[])
157 return design_matrix_create (n_variables, v_variables,
158 (size_t) n_variables);
162 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
164 assert (cov->m1 != NULL);
165 moments1_add (cov->m1[i], x, 1.0);
169 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
171 assert (cov->m != NULL);
172 moments_pass_one (cov->m[i], x, 1.0);
176 covariance_matrix_destroy (struct covariance_matrix *cov)
180 assert (cov != NULL);
181 design_matrix_destroy (cov->cov);
182 hsh_destroy (cov->ca);
183 if (cov->n_pass == ONE_PASS)
185 for (i = 0; i < cov->n_variables; i++)
187 moments1_destroy (cov->m1[i]);
193 for (i = 0; i < cov->n_variables; i++)
195 moments_destroy (cov->m[i]);
202 Update the covariance matrix with the new entries, assuming that ROW
203 corresponds to a categorical variable and V2 is numeric.
206 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
208 const struct variable *v2, double x,
209 const union value *val2)
214 assert (var_is_numeric (v2));
216 col = design_matrix_var_to_column (cov, v2);
217 assert (val2 != NULL);
218 tmp = gsl_matrix_get (cov->m, row, col);
219 gsl_matrix_set (cov->m, row, col, (val2->f - mean) * x + tmp);
220 gsl_matrix_set (cov->m, col, row, (val2->f - mean) * x + tmp);
223 column_iterate (struct design_matrix *cov, const struct variable *v,
224 double ssize, double x, const union value *val1, size_t row)
230 const union value *tmp_val;
232 col = design_matrix_var_to_column (cov, v);
233 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
236 y = -1.0 * cat_get_category_count (i, v) / ssize;
237 tmp_val = cat_subscript_to_value (i, v);
238 if (!compare_values_short (tmp_val, val1, v))
242 tmp = gsl_matrix_get (cov->m, row, col);
243 gsl_matrix_set (cov->m, row, col, x * y + tmp);
244 gsl_matrix_set (cov->m, col, row, x * y + tmp);
249 Call this function in the second data pass. The central moments are
250 MEAN1 and MEAN2. Any categorical variables should already have their
251 values summarized in in its OBS_VALS element.
254 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
255 double ssize, const struct variable *v1,
256 const struct variable *v2, const union value *val1,
257 const union value *val2)
263 const union value *tmp_val;
265 if (var_is_alpha (v1))
267 row = design_matrix_var_to_column (cov, v1);
268 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
271 x = -1.0 * cat_get_category_count (i, v1) / ssize;
272 tmp_val = cat_subscript_to_value (i, v1);
273 if (!compare_values_short (tmp_val, val1, v1))
277 if (var_is_numeric (v2))
279 covariance_update_categorical_numeric (cov, mean2, row,
284 column_iterate (cov, v1, ssize, x, val1, row);
285 column_iterate (cov, v2, ssize, x, val2, row);
289 else if (var_is_alpha (v2))
292 Reverse the orders of V1, V2, etc. and put ourselves back
293 in the previous IF scope.
295 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
300 Both variables are numeric.
302 row = design_matrix_var_to_column (cov, v1);
303 col = design_matrix_var_to_column (cov, v2);
304 x = (val1->f - mean1) * (val2->f - mean2);
305 x += gsl_matrix_get (cov->m, col, row);
306 gsl_matrix_set (cov->m, row, col, x);
307 gsl_matrix_set (cov->m, col, row, x);
312 covariance_accumulator_hash (const void *h, const void *aux)
314 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
315 size_t *n_vars = (size_t *) aux;
318 const struct variable *v_min;
319 const struct variable *v_max;
320 const union value *val_min;
321 const union value *val_max;
324 Order everything by the variables' indices. This ensures we get the
325 same key regardless of the order in which the variables are stored
329 (var_get_dict_index (ca->v1) <
330 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
331 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
333 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
334 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
336 idx_min = var_get_dict_index (v_min);
337 idx_max = var_get_dict_index (v_max);
339 if (var_is_numeric (v_max) && var_is_numeric (v_min))
341 return (*n_vars * idx_max + idx_min);
343 if (var_is_numeric (v_max) && var_is_alpha (v_min))
345 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
347 if (var_is_alpha (v_max) && var_is_numeric (v_min))
349 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
351 if (var_is_alpha (v_max) && var_is_alpha (v_min))
355 xnmalloc (1 + var_get_width (v_max) + var_get_width (v_min),
357 strncpy (x, val_max->s, var_get_width (v_max));
358 strncat (x, val_min->s, var_get_width (v_min));
359 tmp = *n_vars * (*n_vars + 1 + idx_max) + idx_min + hsh_hash_string (x);
367 Make a hash table consisting of struct covariance_accumulators.
368 This allows the accumulation of the elements of a covariance matrix
369 in a single data pass. Call covariance_accumulate () for each case
372 static struct hsh_table *
373 covariance_hsh_create (size_t *n_vars)
375 return hsh_create (*n_vars * *n_vars, covariance_accumulator_compare,
376 covariance_accumulator_hash, covariance_accumulator_free,
381 covariance_accumulator_free (void *c_, const void *aux UNUSED)
383 struct covariance_accumulator *c = c_;
389 Hash comparison. Returns 0 for a match, or a non-zero int
390 otherwise. The sign of a non-zero return value *should* indicate the
391 position of C relative to the covariance_accumulator described by
392 the other arguments. But for now, it just returns 1 for any
393 non-match. This should be changed when someone figures out how to
394 compute a sensible sign for the return value.
397 match_nodes (const struct covariance_accumulator *c,
398 const struct variable *v1, const struct variable *v2,
399 const union value *val1, const union value *val2)
401 if (var_get_dict_index (v1) == var_get_dict_index (c->v1))
402 if (var_get_dict_index (v2) == var_get_dict_index (c->v2))
404 if (var_is_numeric (v1) && var_is_numeric (v2))
408 if (var_is_numeric (v1) && var_is_alpha (v2))
410 if (!compare_values_short (val2, c->val2, v2))
415 if (var_is_alpha (v1) && var_is_numeric (v2))
417 if (!compare_values_short (val1, c->val1, v1))
422 if (var_is_alpha (v1) && var_is_alpha (v2))
424 if (!compare_values_short (val1, c->val1, v1))
426 if (!compare_values_short (val2, c->val2, v2))
437 This function is meant to be used as a comparison function for
438 a struct hsh_table in src/libpspp/hash.c.
441 covariance_accumulator_compare (const void *a1_, const void *a2_,
442 const void *aux UNUSED)
444 const struct covariance_accumulator *a1 = a1_;
445 const struct covariance_accumulator *a2 = a2_;
447 if (a1 == NULL && a2 == NULL)
450 if (a1 == NULL || a2 == NULL)
453 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
457 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
458 const union value *val, size_t n_vars)
460 unsigned int result = -1u;
461 if (var_is_numeric (v1) && var_is_alpha (v2))
463 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
464 + var_get_dict_index (v2) + hsh_hash_string (val->s);
466 else if (var_is_alpha (v1) && var_is_numeric (v2))
468 result = hash_numeric_alpha (v2, v1, val, n_vars);
475 update_product (const struct variable *v1, const struct variable *v2,
476 const union value *val1, const union value *val2)
480 assert (val1 != NULL);
481 assert (val2 != NULL);
482 if (var_is_alpha (v1) && var_is_alpha (v2))
486 if (var_is_numeric (v1) && var_is_numeric (v2))
488 return (val1->f * val2->f);
490 if (var_is_numeric (v1) && var_is_alpha (v2))
494 if (var_is_numeric (v2) && var_is_alpha (v1))
496 update_product (v2, v1, val2, val1);
501 update_sum (const struct variable *var, const union value *val, double weight)
503 assert (var != NULL);
504 assert (val != NULL);
505 if (var_is_alpha (var))
511 static struct covariance_accumulator *
512 get_new_covariance_accumulator (const struct variable *v1,
513 const struct variable *v2,
514 const union value *val1,
515 const union value *val2)
517 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
519 struct covariance_accumulator *ca;
520 ca = xmalloc (sizeof (*ca));
530 static const struct variable **
531 get_covariance_variables (const struct covariance_matrix *cov)
533 return cov->v_variables;
538 update_hash_entry (struct hsh_table *c,
539 const struct variable *v1,
540 const struct variable *v2,
541 const union value *val1, const union value *val2,
542 const struct interaction_value *i_val1,
543 const struct interaction_value *i_val2)
545 struct covariance_accumulator *ca;
546 struct covariance_accumulator *new_entry;
550 iv_f1 = interaction_value_get_nonzero_entry (i_val1);
551 iv_f2 = interaction_value_get_nonzero_entry (i_val2);
552 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
553 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
554 ca->dot_product *= iv_f1 * iv_f2;
555 ca->sum1 = update_sum (ca->v1, ca->val1, iv_f1);
556 ca->sum2 = update_sum (ca->v2, ca->val2, iv_f2);
558 new_entry = hsh_insert (c, ca);
560 if (new_entry != NULL)
562 new_entry->dot_product += ca->dot_product;
563 new_entry->ssize += 1.0;
564 new_entry->sum1 += ca->sum1;
565 new_entry->sum2 += ca->sum2;
567 If DOT_PRODUCT is null, CA was not already in the hash
568 hable, so we don't free it because it was just inserted.
569 If DOT_PRODUCT was not null, CA is already in the hash table.
570 Unnecessary now, it must be freed here.
577 Compute the covariance matrix in a single data-pass. Cases with
578 missing values are dropped pairwise, in other words, only if one of
579 the two values necessary to accumulate the inner product is missing.
581 Do not call this function directly. Call it through the struct
582 covariance_matrix ACCUMULATE member function, for example,
583 cov->accumulate (cov, ccase).
586 covariance_accumulate_pairwise (struct covariance_matrix *cov,
587 const struct ccase *ccase,
588 const struct interaction_variable **i_var,
593 const union value *val1;
594 const union value *val2;
595 const struct variable **v_variables;
596 struct interaction_value *i_val1 = NULL;
597 struct interaction_value *i_val2 = NULL;
599 assert (cov != NULL);
600 assert (ccase != NULL);
602 v_variables = get_covariance_variables (cov);
603 assert (v_variables != NULL);
605 for (i = 0; i < cov->n_variables; ++i)
607 if (is_interaction (v_variables[i], i_var, n_intr))
609 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
610 val1 = interaction_value_get (i_val1);
614 val1 = case_data (ccase, v_variables[i]);
616 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
618 cat_value_update (v_variables[i], val1);
619 if (var_is_numeric (v_variables[i]))
620 cov->update_moments (cov, i, val1->f);
622 for (j = i; j < cov->n_variables; j++)
624 if (is_interaction (v_variables[j], i_var, n_intr))
626 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
627 val2 = interaction_value_get (i_val2);
631 val2 = case_data (ccase, v_variables[j]);
633 if (!var_is_value_missing
634 (v_variables[j], val2, cov->missing_value))
636 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
637 val1, val2, i_val1, i_val2);
639 update_hash_entry (cov->ca, v_variables[j],
640 v_variables[i], val2, val1, i_val2, i_val1);
648 Compute the covariance matrix in a single data-pass. Cases with
649 missing values are dropped listwise. In other words, if one of the
650 values for any variable in a case is missing, the entire case is
653 The caller must use a casefilter to remove the cases with missing
654 values before calling covariance_accumulate_listwise. This function
655 assumes that CCASE has already passed through this filter, and
656 contains no missing values.
658 Do not call this function directly. Call it through the struct
659 covariance_matrix ACCUMULATE member function, for example,
660 cov->accumulate (cov, ccase).
663 covariance_accumulate_listwise (struct covariance_matrix *cov,
664 const struct ccase *ccase,
665 const struct interaction_variable **i_var,
670 const union value *val1;
671 const union value *val2;
672 const struct variable **v_variables;
673 struct interaction_value *i_val1 = NULL;
674 struct interaction_value *i_val2 = NULL;
676 assert (cov != NULL);
677 assert (ccase != NULL);
679 v_variables = get_covariance_variables (cov);
680 assert (v_variables != NULL);
682 for (i = 0; i < cov->n_variables; ++i)
684 if (is_interaction (v_variables[i], i_var, n_intr))
686 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
687 val1 = interaction_value_get (i_val1);
691 val1 = case_data (ccase, v_variables[i]);
693 cat_value_update (v_variables[i], val1);
694 if (var_is_numeric (v_variables[i]))
695 cov->update_moments (cov, i, val1->f);
697 for (j = i; j < cov->n_variables; j++)
699 if (is_interaction (v_variables[j], i_var, n_intr))
701 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
702 val2 = interaction_value_get (i_val2);
706 val2 = case_data (ccase, v_variables[j]);
708 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
709 val1, val2, i_val1, i_val2);
711 update_hash_entry (cov->ca, v_variables[j], v_variables[i],
712 val2, val1, i_val2, i_val1);
718 Call this function during the data pass. Each case will be added to
719 a hash containing all values of the covariance matrix. After the
720 data have been passed, call covariance_matrix_compute to put the
721 values in the struct covariance_matrix.
724 covariance_matrix_accumulate (struct covariance_matrix *cov,
725 const struct ccase *ccase, void **aux, size_t n_intr)
727 cov->accumulate (cov, ccase, (const struct interaction_variable **) aux, n_intr);
731 covariance_matrix_insert (struct design_matrix *cov,
732 const struct variable *v1,
733 const struct variable *v2, const union value *val1,
734 const union value *val2, double product)
739 const union value *tmp_val;
741 assert (cov != NULL);
743 row = design_matrix_var_to_column (cov, v1);
744 if (var_is_alpha (v1))
747 tmp_val = cat_subscript_to_value (i, v1);
748 while (compare_values_short (tmp_val, val1, v1))
751 tmp_val = cat_subscript_to_value (i, v1);
754 if (var_is_numeric (v2))
756 col = design_matrix_var_to_column (cov, v2);
760 col = design_matrix_var_to_column (cov, v2);
762 tmp_val = cat_subscript_to_value (i, v1);
763 while (compare_values_short (tmp_val, val1, v1))
766 tmp_val = cat_subscript_to_value (i, v1);
773 if (var_is_numeric (v2))
775 col = design_matrix_var_to_column (cov, v2);
779 covariance_matrix_insert (cov, v2, v1, val2, val1, product);
783 gsl_matrix_set (cov->m, row, col, product);
786 static struct design_matrix *
787 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
790 struct covariance_accumulator *entry;
791 struct design_matrix *result = NULL;
792 struct hsh_iterator iter;
794 result = covariance_matrix_create (cov->n_variables, cov->v_variables);
796 entry = hsh_first (cov->ca, &iter);
798 while (entry != NULL)
801 We compute the centered, un-normalized covariance matrix.
803 tmp = entry->dot_product - entry->sum1 * entry->sum2 / entry->ssize;
804 covariance_matrix_insert (result, entry->v1, entry->v2, entry->val1,
806 entry = hsh_next (cov->ca, &iter);
813 Call this function after passing the data.
816 covariance_matrix_compute (struct covariance_matrix *cov)
818 if (cov->n_pass == ONE_PASS)
820 cov->cov = covariance_accumulator_to_matrix (cov);
824 struct design_matrix *
825 covariance_to_design (const struct covariance_matrix *c)