1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/covariance.h"
21 #include <gsl/gsl_matrix.h>
23 #include "data/case.h"
24 #include "data/variable.h"
25 #include "libpspp/assertion.h"
26 #include "libpspp/misc.h"
27 #include "math/categoricals.h"
28 #include "math/moments.h"
30 #include "gl/xalloc.h"
32 #define n_MOMENTS (MOMENT_VARIANCE + 1)
35 /* Create a new matrix of NEW_SIZE x NEW_SIZE and copy the elements of
36 matrix IN into it. IN must be a square matrix, and in normal usage
37 it will be smaller than NEW_SIZE.
38 IN is destroyed by this function. The return value must be destroyed
39 when no longer required.
42 resize_matrix (gsl_matrix *in, size_t new_size)
46 gsl_matrix *out = NULL;
48 assert (in->size1 == in->size2);
50 if (new_size <= in->size1)
53 out = gsl_matrix_calloc (new_size, new_size);
55 for (i = 0; i < in->size1; ++i)
57 for (j = 0; j < in->size2; ++j)
59 double x = gsl_matrix_get (in, i, j);
61 gsl_matrix_set (out, i, j, x);
72 /* The variables for which the covariance matrix is to be calculated. */
74 const struct variable *const *vars;
76 /* Categorical variables. */
77 struct categoricals *categoricals;
79 /* Array containing number of categories per categorical variable. */
82 /* Dimension of the covariance matrix. */
85 /* The weight variable (or NULL if none) */
86 const struct variable *wv;
88 /* A set of matrices containing the 0th, 1st and 2nd moments */
91 /* The class of missing values to exclude */
92 enum mv_class exclude;
94 /* An array of doubles representing the covariance matrix.
95 Only the top triangle is included, and no diagonals */
99 /* 1 for single pass algorithm;
100 2 for double pass algorithm
105 0 : No pass has been made
106 1 : First pass has been started
107 2 : Second pass has been
109 IE: How many passes have been (partially) made. */
112 /* Flags indicating that the first case has been seen */
113 bool pass_one_first_case_seen;
114 bool pass_two_first_case_seen;
119 /* Return a matrix containing the M th moments.
120 The matrix is of size NxN where N is the number of variables.
121 Each row represents the moments of a variable.
122 In the absence of missing values, the columns of this matrix will
123 be identical. If missing values are involved, then element (i,j)
124 is the moment of the i th variable, when paired with the j th variable.
127 covariance_moments (const struct covariance *cov, int m)
129 return cov->moments[m];
134 /* Create a covariance struct.
137 covariance_1pass_create (size_t n_vars, const struct variable *const *vars,
138 const struct variable *weight, enum mv_class exclude)
141 struct covariance *cov = xzalloc (sizeof *cov);
145 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
150 cov->n_vars = n_vars;
153 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
155 for (i = 0; i < n_MOMENTS; ++i)
156 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
158 cov->exclude = exclude;
160 cov->n_cm = (n_vars * (n_vars - 1) ) / 2;
163 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
164 cov->categoricals = NULL;
170 Create a covariance struct for a two-pass algorithm. If categorical
171 variables are involed, the dimension cannot be know until after the
172 first data pass, so the actual covariances will not be allocated
176 covariance_2pass_create (size_t n_vars, const struct variable *const *vars,
177 struct categoricals *cats,
178 const struct variable *wv, enum mv_class exclude)
181 struct covariance *cov = xmalloc (sizeof *cov);
185 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
190 cov->n_vars = n_vars;
193 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
195 for (i = 0; i < n_MOMENTS; ++i)
196 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
198 cov->exclude = exclude;
203 cov->categoricals = cats;
208 /* Return an integer, which can be used to index
209 into COV->cm, to obtain the I, J th element
210 of the covariance matrix. If COV->cm does not
211 contain that element, then a negative value
215 cm_idx (const struct covariance *cov, int i, int j)
218 const int n2j = cov->dim - 2 - j;
219 const int nj = cov->dim - 2 ;
222 assert (j < cov->dim);
227 if (j >= cov->dim - 1)
234 as -= n2j * (n2j + 1) ;
242 Returns true iff the variable corresponding to the Ith element of the covariance matrix
243 has a missing value for case C
246 is_missing (const struct covariance *cov, int i, const struct ccase *c)
248 const struct variable *var = i < cov->n_vars ?
250 categoricals_get_variable_by_subscript (cov->categoricals, i - cov->n_vars);
252 const union value *val = case_data (c, var);
254 return var_is_value_missing (var, val, cov->exclude);
259 get_val (const struct covariance *cov, int i, const struct ccase *c)
261 if ( i < cov->n_vars)
263 const struct variable *var = cov->vars[i];
265 const union value *val = case_data (c, var);
270 return categoricals_get_binary_by_subscript (cov->categoricals, i - cov->n_vars, c);
275 dump_matrix (const gsl_matrix *m)
279 for (i = 0 ; i < m->size1; ++i)
281 for (j = 0 ; j < m->size2; ++j)
282 printf ("%02f ", gsl_matrix_get (m, i, j));
288 /* Call this function for every case in the data set */
290 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
293 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
295 assert (cov->passes == 2);
296 if (!cov->pass_one_first_case_seen)
298 assert (cov->state == 0);
302 if (cov->categoricals)
303 categoricals_update (cov->categoricals, c);
305 for (i = 0 ; i < cov->dim; ++i)
307 double v1 = get_val (cov, i, c);
309 if ( is_missing (cov, i, c))
312 for (j = 0 ; j < cov->dim; ++j)
316 if ( is_missing (cov, j, c))
319 for (m = 0 ; m <= MOMENT_MEAN; ++m)
321 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
329 cov->pass_one_first_case_seen = true;
333 /* Call this function for every case in the data set */
335 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
338 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
340 assert (cov->passes == 2);
341 assert (cov->state >= 1);
343 if (! cov->pass_two_first_case_seen)
346 assert (cov->state == 1);
349 cov->dim = cov->n_vars;
351 if (cov->categoricals)
352 cov->dim += categoricals_total (cov->categoricals)
353 - categoricals_get_n_variables (cov->categoricals);
355 cov->n_cm = (cov->dim * (cov->dim - 1) ) / 2;
356 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
358 /* Grow the moment matrices so that they're large enough to accommodate the
359 categorical elements */
360 for (i = 0; i < n_MOMENTS; ++i)
362 cov->moments[i] = resize_matrix (cov->moments[i], cov->dim);
365 if (cov->categoricals)
366 categoricals_done (cov->categoricals);
368 /* Populate the moments matrices with the categorical value elements */
369 for (i = cov->n_vars; i < cov->dim; ++i)
371 for (j = 0 ; j < cov->dim ; ++j) /* FIXME: This is WRONG !!! */
373 double w = categoricals_get_weight_by_subscript (cov->categoricals, i - cov->n_vars);
375 gsl_matrix_set (cov->moments[MOMENT_NONE], i, j, w);
377 w = categoricals_get_sum_by_subscript (cov->categoricals, i - cov->n_vars);
379 gsl_matrix_set (cov->moments[MOMENT_MEAN], i, j, w);
383 /* FIXME: This is WRONG!! It must be fixed to properly handle missing values. For
384 now it assumes there are none */
385 for (m = 0 ; m < n_MOMENTS; ++m)
387 for (i = 0 ; i < cov->dim ; ++i)
389 double x = gsl_matrix_get (cov->moments[m], i, cov->n_vars -1);
390 for (j = cov->n_vars; j < cov->dim; ++j)
392 gsl_matrix_set (cov->moments[m], i, j, x);
397 /* Divide the means by the number of samples */
398 for (i = 0; i < cov->dim; ++i)
400 for (j = 0; j < cov->dim; ++j)
402 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
403 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
408 for (i = 0 ; i < cov->dim; ++i)
410 double v1 = get_val (cov, i, c);
412 if ( is_missing (cov, i, c))
415 for (j = 0 ; j < cov->dim; ++j)
419 double v2 = get_val (cov, j, c);
421 const double s = pow2 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
423 if ( is_missing (cov, j, c))
427 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
432 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
434 (v2 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
438 idx = cm_idx (cov, i, j);
446 cov->pass_two_first_case_seen = true;
450 /* Call this function for every case in the data set.
451 After all cases have been passed, call covariance_calculate
454 covariance_accumulate (struct covariance *cov, const struct ccase *c)
457 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
459 assert (cov->passes == 1);
461 if ( !cov->pass_one_first_case_seen)
463 assert ( cov->state == 0);
467 for (i = 0 ; i < cov->dim; ++i)
469 const union value *val1 = case_data (c, cov->vars[i]);
471 if ( is_missing (cov, i, c))
474 for (j = 0 ; j < cov->dim; ++j)
478 const union value *val2 = case_data (c, cov->vars[j]);
480 if ( is_missing (cov, j, c))
483 idx = cm_idx (cov, i, j);
486 cov->cm [idx] += val1->f * val2->f * weight;
489 for (m = 0 ; m < n_MOMENTS; ++m)
491 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
499 cov->pass_one_first_case_seen = true;
504 Allocate and return a gsl_matrix containing the covariances of the
508 cm_to_gsl (struct covariance *cov)
511 gsl_matrix *m = gsl_matrix_calloc (cov->dim, cov->dim);
513 /* Copy the non-diagonal elements from cov->cm */
514 for ( j = 0 ; j < cov->dim - 1; ++j)
516 for (i = j+1 ; i < cov->dim; ++i)
518 double x = cov->cm [cm_idx (cov, i, j)];
519 gsl_matrix_set (m, i, j, x);
520 gsl_matrix_set (m, j, i, x);
524 /* Copy the diagonal elements from cov->moments[2] */
525 for (j = 0 ; j < cov->dim ; ++j)
527 double sigma = gsl_matrix_get (cov->moments[2], j, j);
528 gsl_matrix_set (m, j, j, sigma);
536 covariance_calculate_double_pass (struct covariance *cov)
539 for (i = 0 ; i < cov->dim; ++i)
541 for (j = 0 ; j < cov->dim; ++j)
544 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
545 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
547 idx = cm_idx (cov, i, j);
551 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
556 return cm_to_gsl (cov);
560 covariance_calculate_single_pass (struct covariance *cov)
565 for (m = 0; m < n_MOMENTS; ++m)
567 /* Divide the moments by the number of samples */
570 for (i = 0 ; i < cov->dim; ++i)
572 for (j = 0 ; j < cov->dim; ++j)
574 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
575 *x /= gsl_matrix_get (cov->moments[0], i, j);
577 if ( m == MOMENT_VARIANCE)
578 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
584 /* Centre the moments */
585 for ( j = 0 ; j < cov->dim - 1; ++j)
587 for (i = j + 1 ; i < cov->dim; ++i)
589 double *x = &cov->cm [cm_idx (cov, i, j)];
591 *x /= gsl_matrix_get (cov->moments[0], i, j);
594 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
596 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
600 return cm_to_gsl (cov);
604 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
605 caller owns the returned matrix and must free it when it is no longer
608 Call this function only after all data have been accumulated. */
610 covariance_calculate (struct covariance *cov)
612 if ( cov->state <= 0 )
618 return covariance_calculate_single_pass (cov);
621 return covariance_calculate_double_pass (cov);
629 Covariance computed without dividing by the sample size.
632 covariance_calculate_double_pass_unnormalized (struct covariance *cov)
635 for (i = 0 ; i < cov->dim; ++i)
637 for (j = 0 ; j < cov->dim; ++j)
640 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
642 idx = cm_idx (cov, i, j);
650 return cm_to_gsl (cov);
654 covariance_calculate_single_pass_unnormalized (struct covariance *cov)
658 for (i = 0 ; i < cov->dim; ++i)
660 for (j = 0 ; j < cov->dim; ++j)
662 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
663 *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
664 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
667 for ( j = 0 ; j < cov->dim - 1; ++j)
669 for (i = j + 1 ; i < cov->dim; ++i)
671 double *x = &cov->cm [cm_idx (cov, i, j)];
674 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
676 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i)
677 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
681 return cm_to_gsl (cov);
685 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
686 caller owns the returned matrix and must free it when it is no longer
689 Call this function only after all data have been accumulated. */
691 covariance_calculate_unnormalized (struct covariance *cov)
693 if ( cov->state <= 0 )
699 return covariance_calculate_single_pass_unnormalized (cov);
702 return covariance_calculate_double_pass_unnormalized (cov);
709 /* Function to access the categoricals used by COV
710 The return value is owned by the COV
712 const struct categoricals *
713 covariance_get_categoricals (const struct covariance *cov)
715 return cov->categoricals;
719 /* Destroy the COV object */
721 covariance_destroy (struct covariance *cov)
725 categoricals_destroy (cov->categoricals);
727 for (i = 0; i < n_MOMENTS; ++i)
728 gsl_matrix_free (cov->moments[i]);
736 covariance_dim (const struct covariance * cov)
742 Returns an array of variables corresponding to rows of the covariance matrix.
743 In other words, element i of the array is the variable corresponding to
744 row (and column) i of the covariance matrix.
747 covariance_get_var_indices (const struct covariance *cov, const struct variable **vars)
750 for (i = 0; i < cov->n_vars; i++)
752 vars[i] = cov->vars[i];
754 for (i = cov->n_vars; i < cov->dim; i++)
756 vars[i] = categoricals_get_variable_by_subscript (cov->categoricals, i - cov->n_vars);
762 covariance_get_interaction_indices (const struct covariance *cov, const struct interaction **iacts)
765 for (i = 0; i < cov->n_vars; i++)
767 iacts[i] = cov->vars[i];
769 for (i = cov->n_vars; i < cov->dim; i++)
771 iacts[i] = categoricals_get_interaction_by_subscript (cov->categoricals, i - cov->n_vars);