1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 Create and update the values in the covariance matrix.
22 #include <data/case.h>
23 #include <data/category.h>
24 #include <data/variable.h>
25 #include <data/value.h>
26 #include <libpspp/hash.h>
27 #include <libpspp/hash-functions.h>
28 #include <math/covariance-matrix.h>
29 #include <math/moments.h>
34 Structure used to accumulate the covariance matrix in a single data
35 pass. Before passing the data, we do not know how many categories
36 there are in each categorical variable. Therefore we do not know the
37 size of the covariance matrix. To get around this problem, we
38 accumulate the elements of the covariance matrix in pointers to
39 COVARIANC_ACCUMULATOR. These values are then used to populate
40 the covariance matrix.
42 struct covariance_accumulator
44 const struct variable *v1;
45 const struct variable *v2;
46 const union value *val1;
47 const union value *val2;
56 struct covariance_matrix
58 struct design_matrix *cov;
59 struct design_matrix *ssize;
60 struct design_matrix *sums;
64 const struct variable **v_variables;
68 enum mv_class missing_value;
69 void (*accumulate) (struct covariance_matrix *, const struct ccase *,
70 const struct interaction_variable **, size_t);
71 void (*update_moments) (struct covariance_matrix *, size_t, double);
76 static struct hsh_table *covariance_hsh_create (size_t *);
77 static hsh_hash_func covariance_accumulator_hash;
78 static unsigned int hash_numeric_alpha (const struct variable *,
79 const struct variable *,
80 const union value *, size_t);
81 static hsh_compare_func covariance_accumulator_compare;
82 static hsh_free_func covariance_accumulator_free;
83 static void update_moments1 (struct covariance_matrix *, size_t, double);
84 static void update_moments2 (struct covariance_matrix *, size_t, double);
85 static struct covariance_accumulator *get_new_covariance_accumulator (const
100 static void covariance_accumulate_listwise (struct covariance_matrix *,
101 const struct ccase *,
102 const struct interaction_variable **,
104 static void covariance_accumulate_pairwise (struct covariance_matrix *,
105 const struct ccase *,
106 const struct interaction_variable **,
109 struct covariance_matrix *
110 covariance_matrix_init (size_t n_variables,
111 const struct variable *v_variables[], int n_pass,
112 int missing_handling, enum mv_class missing_value)
115 struct covariance_matrix *result = NULL;
117 result = xmalloc (sizeof (*result));
119 result->n_variables = n_variables;
120 result->ca = covariance_hsh_create (&result->n_variables);
123 result->missing_handling = missing_handling;
124 result->missing_value = missing_value;
125 result->accumulate = (result->missing_handling == LISTWISE) ?
126 covariance_accumulate_listwise : covariance_accumulate_pairwise;
127 if (n_pass == ONE_PASS)
129 result->update_moments = update_moments1;
130 result->m1 = xnmalloc (n_variables, sizeof (*result->m1));
131 for (i = 0; i < n_variables; i++)
133 result->m1[i] = moments1_create (MOMENT_MEAN);
138 result->update_moments = update_moments2;
139 result->m = xnmalloc (n_variables, sizeof (*result->m));
140 for (i = 0; i < n_variables; i++)
142 result->m[i] = moments_create (MOMENT_MEAN);
145 result->v_variables = v_variables;
147 result->n_pass = n_pass;
152 get_n_rows (size_t n_variables, size_t *v_variables[])
156 for (i = 0; i < n_variables; i++)
158 if (var_is_numeric (v_variables[i]))
162 else if (var_is_alpha (v_variables[i]))
164 size_t n_categories = cat_get_n_categories (v_variables[i]);
165 result += n_categories - 1;
171 The covariances are stored in a DESIGN_MATRIX structure.
173 struct design_matrix *
174 covariance_matrix_create (size_t n_variables,
175 const struct variable *v_variables[])
177 size_t n_rows = get_n_rows (n_variables, v_variables);
178 return design_matrix_create (n_variables, v_variables, n_rows);
182 update_moments1 (struct covariance_matrix *cov, size_t i, double x)
184 assert (cov->m1 != NULL);
185 moments1_add (cov->m1[i], x, 1.0);
189 update_moments2 (struct covariance_matrix *cov, size_t i, double x)
191 assert (cov->m != NULL);
192 moments_pass_one (cov->m[i], x, 1.0);
196 covariance_matrix_destroy (struct covariance_matrix *cov)
200 assert (cov != NULL);
201 design_matrix_destroy (cov->cov);
202 design_matrix_destroy (cov->ssize);
203 design_matrix_destroy (cov->sums);
204 hsh_destroy (cov->ca);
205 if (cov->n_pass == ONE_PASS)
207 for (i = 0; i < cov->n_variables; i++)
209 moments1_destroy (cov->m1[i]);
215 for (i = 0; i < cov->n_variables; i++)
217 moments_destroy (cov->m[i]);
224 Update the covariance matrix with the new entries, assuming that ROW
225 corresponds to a categorical variable and V2 is numeric.
228 covariance_update_categorical_numeric (struct design_matrix *cov, double mean,
230 const struct variable *v2, double x,
231 const union value *val2)
236 assert (var_is_numeric (v2));
238 col = design_matrix_var_to_column (cov, v2);
239 assert (val2 != NULL);
240 tmp = design_matrix_get_element (cov, row, col);
241 design_matrix_set_element (cov, row, col, (val2->f - mean) * x + tmp);
242 design_matrix_set_element (cov, col, row, (val2->f - mean) * x + tmp);
245 column_iterate (struct design_matrix *cov, const struct variable *v,
246 double ssize, double x, const union value *val1, size_t row)
252 const union value *tmp_val;
254 col = design_matrix_var_to_column (cov, v);
255 for (i = 0; i < cat_get_n_categories (v) - 1; i++)
258 y = -1.0 * cat_get_category_count (i, v) / ssize;
259 tmp_val = cat_subscript_to_value (i, v);
260 if (!compare_values_short (tmp_val, val1, v))
264 tmp = design_matrix_get_element (cov, row, col);
265 design_matrix_set_element (cov, row, col, x * y + tmp);
266 design_matrix_set_element (cov, col, row, x * y + tmp);
271 Call this function in the second data pass. The central moments are
272 MEAN1 and MEAN2. Any categorical variables should already have their
273 values summarized in in its OBS_VALS element.
276 covariance_pass_two (struct design_matrix *cov, double mean1, double mean2,
277 double ssize, const struct variable *v1,
278 const struct variable *v2, const union value *val1,
279 const union value *val2)
285 const union value *tmp_val;
287 if (var_is_alpha (v1))
289 row = design_matrix_var_to_column (cov, v1);
290 for (i = 0; i < cat_get_n_categories (v1) - 1; i++)
293 x = -1.0 * cat_get_category_count (i, v1) / ssize;
294 tmp_val = cat_subscript_to_value (i, v1);
295 if (!compare_values_short (tmp_val, val1, v1))
299 if (var_is_numeric (v2))
301 covariance_update_categorical_numeric (cov, mean2, row,
306 column_iterate (cov, v1, ssize, x, val1, row);
307 column_iterate (cov, v2, ssize, x, val2, row);
311 else if (var_is_alpha (v2))
314 Reverse the orders of V1, V2, etc. and put ourselves back
315 in the previous IF scope.
317 covariance_pass_two (cov, mean2, mean1, ssize, v2, v1, val2, val1);
322 Both variables are numeric.
324 row = design_matrix_var_to_column (cov, v1);
325 col = design_matrix_var_to_column (cov, v2);
326 x = (val1->f - mean1) * (val2->f - mean2);
327 x += design_matrix_get_element (cov, col, row);
328 design_matrix_set_element (cov, row, col, x);
329 design_matrix_set_element (cov, col, row, x);
334 covariance_accumulator_hash (const void *h, const void *aux)
336 struct covariance_accumulator *ca = (struct covariance_accumulator *) h;
337 size_t *n_vars = (size_t *) aux;
340 const struct variable *v_min;
341 const struct variable *v_max;
342 const union value *val_min;
343 const union value *val_max;
346 Order everything by the variables' indices. This ensures we get the
347 same key regardless of the order in which the variables are stored
351 (var_get_dict_index (ca->v1) <
352 var_get_dict_index (ca->v2)) ? ca->v1 : ca->v2;
353 v_max = (ca->v1 == v_min) ? ca->v2 : ca->v1;
355 val_min = (v_min == ca->v1) ? ca->val1 : ca->val2;
356 val_max = (ca->val1 == val_min) ? ca->val2 : ca->val1;
358 idx_min = var_get_dict_index (v_min);
359 idx_max = var_get_dict_index (v_max);
361 if (var_is_numeric (v_max) && var_is_numeric (v_min))
363 return (*n_vars * idx_max + idx_min);
365 if (var_is_numeric (v_max) && var_is_alpha (v_min))
367 return hash_numeric_alpha (v_max, v_min, val_min, *n_vars);
369 if (var_is_alpha (v_max) && var_is_numeric (v_min))
371 return (hash_numeric_alpha (v_min, v_max, val_max, *n_vars));
373 if (var_is_alpha (v_max) && var_is_alpha (v_min))
375 unsigned hash = hash_bytes (val_max, var_get_width (v_max), 0);
376 hash = hash_bytes (val_min, var_get_width (v_min), hash);
377 return hash_int (*n_vars * (*n_vars + 1 + idx_max) + idx_min, hash);
383 Make a hash table consisting of struct covariance_accumulators.
384 This allows the accumulation of the elements of a covariance matrix
385 in a single data pass. Call covariance_accumulate () for each case
388 static struct hsh_table *
389 covariance_hsh_create (size_t *n_vars)
391 return hsh_create (*n_vars * *n_vars, covariance_accumulator_compare,
392 covariance_accumulator_hash, covariance_accumulator_free,
397 covariance_accumulator_free (void *c_, const void *aux UNUSED)
399 struct covariance_accumulator *c = c_;
405 Hash comparison. Returns 0 for a match, or a non-zero int
406 otherwise. The sign of a non-zero return value *should* indicate the
407 position of C relative to the covariance_accumulator described by
408 the other arguments. But for now, it just returns 1 for any
409 non-match. This should be changed when someone figures out how to
410 compute a sensible sign for the return value.
413 match_nodes (const struct covariance_accumulator *c,
414 const struct variable *v1, const struct variable *v2,
415 const union value *val1, const union value *val2)
417 if (var_get_dict_index (v1) == var_get_dict_index (c->v1))
418 if (var_get_dict_index (v2) == var_get_dict_index (c->v2))
420 if (var_is_numeric (v1) && var_is_numeric (v2))
424 if (var_is_numeric (v1) && var_is_alpha (v2))
426 if (!compare_values_short (val2, c->val2, v2))
431 if (var_is_alpha (v1) && var_is_numeric (v2))
433 if (!compare_values_short (val1, c->val1, v1))
438 if (var_is_alpha (v1) && var_is_alpha (v2))
440 if (!compare_values_short (val1, c->val1, v1))
442 if (!compare_values_short (val2, c->val2, v2))
453 This function is meant to be used as a comparison function for
454 a struct hsh_table in src/libpspp/hash.c.
457 covariance_accumulator_compare (const void *a1_, const void *a2_,
458 const void *aux UNUSED)
460 const struct covariance_accumulator *a1 = a1_;
461 const struct covariance_accumulator *a2 = a2_;
463 if (a1 == NULL && a2 == NULL)
466 if (a1 == NULL || a2 == NULL)
469 return match_nodes (a1, a2->v1, a2->v2, a2->val1, a2->val2);
473 hash_numeric_alpha (const struct variable *v1, const struct variable *v2,
474 const union value *val, size_t n_vars)
476 unsigned int result = -1u;
477 if (var_is_numeric (v1) && var_is_alpha (v2))
479 result = n_vars * ((n_vars + 1) + var_get_dict_index (v1))
480 + var_get_dict_index (v2) + hash_string (val->s, 0);
482 else if (var_is_alpha (v1) && var_is_numeric (v2))
484 result = hash_numeric_alpha (v2, v1, val, n_vars);
491 update_product (const struct variable *v1, const struct variable *v2,
492 const union value *val1, const union value *val2)
496 assert (val1 != NULL);
497 assert (val2 != NULL);
498 if (var_is_alpha (v1) && var_is_alpha (v2))
502 if (var_is_numeric (v1) && var_is_numeric (v2))
504 return (val1->f * val2->f);
506 if (var_is_numeric (v1) && var_is_alpha (v2))
510 if (var_is_numeric (v2) && var_is_alpha (v1))
512 update_product (v2, v1, val2, val1);
517 update_sum (const struct variable *var, const union value *val, double weight)
519 assert (var != NULL);
520 assert (val != NULL);
521 if (var_is_alpha (var))
527 static struct covariance_accumulator *
528 get_new_covariance_accumulator (const struct variable *v1,
529 const struct variable *v2,
530 const union value *val1,
531 const union value *val2)
533 if ((v1 != NULL) && (v2 != NULL) && (val1 != NULL) && (val2 != NULL))
535 struct covariance_accumulator *ca;
536 ca = xmalloc (sizeof (*ca));
546 static const struct variable **
547 get_covariance_variables (const struct covariance_matrix *cov)
549 return cov->v_variables;
553 update_hash_entry (struct hsh_table *c,
554 const struct variable *v1,
555 const struct variable *v2,
556 const union value *val1, const union value *val2,
557 const struct interaction_value *i_val1,
558 const struct interaction_value *i_val2)
560 struct covariance_accumulator *ca;
561 struct covariance_accumulator *new_entry;
565 iv_f1 = interaction_value_get_nonzero_entry (i_val1);
566 iv_f2 = interaction_value_get_nonzero_entry (i_val2);
567 ca = get_new_covariance_accumulator (v1, v2, val1, val2);
568 ca->dot_product = update_product (ca->v1, ca->v2, ca->val1, ca->val2);
569 ca->dot_product *= iv_f1 * iv_f2;
570 ca->sum1 = update_sum (ca->v1, ca->val1, iv_f1);
571 ca->sum2 = update_sum (ca->v2, ca->val2, iv_f2);
573 new_entry = hsh_insert (c, ca);
575 if (new_entry != NULL)
577 new_entry->dot_product += ca->dot_product;
578 new_entry->ssize += 1.0;
579 new_entry->sum1 += ca->sum1;
580 new_entry->sum2 += ca->sum2;
582 If DOT_PRODUCT is null, CA was not already in the hash
583 hable, so we don't free it because it was just inserted.
584 If DOT_PRODUCT was not null, CA is already in the hash table.
585 Unnecessary now, it must be freed here.
592 Compute the covariance matrix in a single data-pass. Cases with
593 missing values are dropped pairwise, in other words, only if one of
594 the two values necessary to accumulate the inner product is missing.
596 Do not call this function directly. Call it through the struct
597 covariance_matrix ACCUMULATE member function, for example,
598 cov->accumulate (cov, ccase).
601 covariance_accumulate_pairwise (struct covariance_matrix *cov,
602 const struct ccase *ccase,
603 const struct interaction_variable **i_var,
608 const union value *val1;
609 const union value *val2;
610 const struct variable **v_variables;
611 struct interaction_value *i_val1 = NULL;
612 struct interaction_value *i_val2 = NULL;
614 assert (cov != NULL);
615 assert (ccase != NULL);
617 v_variables = get_covariance_variables (cov);
618 assert (v_variables != NULL);
620 for (i = 0; i < cov->n_variables; ++i)
622 if (is_interaction (v_variables[i], i_var, n_intr))
624 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
625 val1 = interaction_value_get (i_val1);
629 val1 = case_data (ccase, v_variables[i]);
631 if (!var_is_value_missing (v_variables[i], val1, cov->missing_value))
633 cat_value_update (v_variables[i], val1);
634 if (var_is_numeric (v_variables[i]))
635 cov->update_moments (cov, i, val1->f);
637 for (j = i; j < cov->n_variables; j++)
639 if (is_interaction (v_variables[j], i_var, n_intr))
641 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
642 val2 = interaction_value_get (i_val2);
646 val2 = case_data (ccase, v_variables[j]);
648 if (!var_is_value_missing
649 (v_variables[j], val2, cov->missing_value))
651 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
652 val1, val2, i_val1, i_val2);
654 update_hash_entry (cov->ca, v_variables[j],
655 v_variables[i], val2, val1, i_val2, i_val1);
663 Compute the covariance matrix in a single data-pass. Cases with
664 missing values are dropped listwise. In other words, if one of the
665 values for any variable in a case is missing, the entire case is
668 The caller must use a casefilter to remove the cases with missing
669 values before calling covariance_accumulate_listwise. This function
670 assumes that CCASE has already passed through this filter, and
671 contains no missing values.
673 Do not call this function directly. Call it through the struct
674 covariance_matrix ACCUMULATE member function, for example,
675 cov->accumulate (cov, ccase).
678 covariance_accumulate_listwise (struct covariance_matrix *cov,
679 const struct ccase *ccase,
680 const struct interaction_variable **i_var,
685 const union value *val1;
686 const union value *val2;
687 const struct variable **v_variables;
688 struct interaction_value *i_val1 = NULL;
689 struct interaction_value *i_val2 = NULL;
691 assert (cov != NULL);
692 assert (ccase != NULL);
694 v_variables = get_covariance_variables (cov);
695 assert (v_variables != NULL);
697 for (i = 0; i < cov->n_variables; ++i)
699 if (is_interaction (v_variables[i], i_var, n_intr))
701 i_val1 = interaction_case_data (ccase, v_variables[i], i_var, n_intr);
702 val1 = interaction_value_get (i_val1);
706 val1 = case_data (ccase, v_variables[i]);
708 cat_value_update (v_variables[i], val1);
709 if (var_is_numeric (v_variables[i]))
710 cov->update_moments (cov, i, val1->f);
712 for (j = i; j < cov->n_variables; j++)
714 if (is_interaction (v_variables[j], i_var, n_intr))
716 i_val2 = interaction_case_data (ccase, v_variables[j], i_var, n_intr);
717 val2 = interaction_value_get (i_val2);
721 val2 = case_data (ccase, v_variables[j]);
723 update_hash_entry (cov->ca, v_variables[i], v_variables[j],
724 val1, val2, i_val1, i_val2);
726 update_hash_entry (cov->ca, v_variables[j], v_variables[i],
727 val2, val1, i_val2, i_val1);
733 Call this function during the data pass. Each case will be added to
734 a hash containing all values of the covariance matrix. After the
735 data have been passed, call covariance_matrix_compute to put the
736 values in the struct covariance_matrix.
739 covariance_matrix_accumulate (struct covariance_matrix *cov,
740 const struct ccase *ccase, void **aux, size_t n_intr)
742 cov->accumulate (cov, ccase, (const struct interaction_variable **) aux, n_intr);
745 If VAR is categorical with d categories, its first category should
746 correspond to the origin in d-dimensional Euclidean space.
749 is_origin (const struct variable *var, const union value *val)
751 if (cat_value_find (var, val) == 0)
759 Return the subscript of the column of the design matrix
760 corresponding to VAL. If VAR is categorical with d categories, its
761 first category should correspond to the origin in d-dimensional
762 Euclidean space, so there is no subscript for this value.
765 get_exact_subscript (const struct design_matrix *dm, const struct variable *var,
766 const union value *val)
770 if (is_origin (var, val))
775 result = design_matrix_var_to_column (dm, var);
776 if (var_is_alpha (var))
778 result += cat_value_find (var, val) - 1;
784 covariance_matrix_insert (struct design_matrix *cov,
785 const struct variable *v1,
786 const struct variable *v2, const union value *val1,
787 const union value *val2, double product)
792 assert (cov != NULL);
794 row = get_exact_subscript (cov, v1, val1);
795 col = get_exact_subscript (cov, v2, val2);
796 if (row != -1u && col != -1u)
798 design_matrix_set_element (cov, row, col, product);
804 is_covariance_contributor (const struct covariance_accumulator *ca, const struct design_matrix *dm,
808 const struct variable *v1;
809 const struct variable *v2;
812 v1 = design_matrix_col_to_var (dm, i);
813 if (var_get_dict_index (v1) == var_get_dict_index(ca->v1))
815 v2 = design_matrix_col_to_var (dm, j);
816 if (var_get_dict_index (v2) == var_get_dict_index (ca->v2))
818 k = get_exact_subscript (dm, v1, ca->val1);
821 k = get_exact_subscript (dm, v2, ca->val2);
832 get_sum (const struct covariance_matrix *cov, size_t i)
835 const struct variable *var;
836 const union value *val = NULL;
837 struct covariance_accumulator ca;
838 struct covariance_accumulator *c;
840 assert ( cov != NULL);
841 var = design_matrix_col_to_var (cov->cov, i);
844 if (var_is_alpha (var))
846 k = design_matrix_var_to_column (cov->cov, var);
848 val = cat_subscript_to_value (i, var);
854 c = (struct covariance_accumulator *) hsh_find (cov->ca, &ca);
863 update_ssize (struct design_matrix *dm, size_t i, size_t j, struct covariance_accumulator *ca)
865 struct variable *var;
867 var = design_matrix_col_to_var (dm, i);
868 if (var_get_dict_index (ca->v1) == var_get_dict_index (var))
870 var = design_matrix_col_to_var (dm, j);
871 if (var_get_dict_index (ca->v2) == var_get_dict_index (var))
873 tmp = design_matrix_get_element (dm, i, j);
875 design_matrix_set_element (dm, i, j, tmp);
880 covariance_accumulator_to_matrix (struct covariance_matrix *cov)
887 struct covariance_accumulator *entry;
888 struct hsh_iterator iter;
890 cov->cov = covariance_matrix_create (cov->n_variables, cov->v_variables);
891 cov->ssize = covariance_matrix_create (cov->n_variables, cov->v_variables);
892 cov->sums = covariance_matrix_create (cov->n_variables, cov->v_variables);
893 for (i = 0; i < design_matrix_get_n_cols (cov->cov); i++)
895 sum_i = get_sum (cov, i);
896 for (j = i; j < design_matrix_get_n_cols (cov->cov); j++)
898 sum_j = get_sum (cov, j);
899 entry = hsh_first (cov->ca, &iter);
900 design_matrix_set_element (cov->sums, i, j, sum_i);
901 while (entry != NULL)
903 update_ssize (cov->ssize, i, j, entry);
905 We compute the centered, un-normalized covariance matrix.
907 if (is_covariance_contributor (entry, cov->cov, i, j))
909 covariance_matrix_insert (cov->cov, entry->v1, entry->v2, entry->val1,
910 entry->val2, entry->dot_product);
912 entry = hsh_next (cov->ca, &iter);
914 tmp = design_matrix_get_element (cov->cov, i, j);
915 tmp -= sum_i * sum_j / design_matrix_get_element (cov->ssize, i, j);
916 design_matrix_set_element (cov->cov, i, j, tmp);
923 Call this function after passing the data.
926 covariance_matrix_compute (struct covariance_matrix *cov)
928 if (cov->n_pass == ONE_PASS)
930 covariance_accumulator_to_matrix (cov);
934 struct design_matrix *
935 covariance_to_design (const struct covariance_matrix *c)
945 covariance_matrix_get_element (const struct covariance_matrix *c, size_t row, size_t col)
947 return (design_matrix_get_element (c->cov, row, col));