1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/covariance.h"
21 #include <gsl/gsl_matrix.h>
23 #include "data/case.h"
24 #include "data/variable.h"
25 #include "libpspp/assertion.h"
26 #include "libpspp/misc.h"
27 #include "math/categoricals.h"
28 #include "math/interaction.h"
29 #include "math/moments.h"
31 #include "gl/xalloc.h"
33 #define n_MOMENTS (MOMENT_VARIANCE + 1)
36 /* Create a new matrix of NEW_SIZE x NEW_SIZE and copy the elements of
37 matrix IN into it. IN must be a square matrix, and in normal usage
38 it will be smaller than NEW_SIZE.
39 IN is destroyed by this function. The return value must be destroyed
40 when no longer required.
43 resize_matrix (gsl_matrix *in, size_t new_size)
47 gsl_matrix *out = NULL;
49 assert (in->size1 == in->size2);
51 if (new_size <= in->size1)
54 out = gsl_matrix_calloc (new_size, new_size);
56 for (i = 0; i < in->size1; ++i)
58 for (j = 0; j < in->size2; ++j)
60 double x = gsl_matrix_get (in, i, j);
62 gsl_matrix_set (out, i, j, x);
73 /* The variables for which the covariance matrix is to be calculated. */
75 const struct variable *const *vars;
77 /* Categorical variables. */
78 struct categoricals *categoricals;
80 /* Array containing number of categories per categorical variable. */
83 /* Dimension of the covariance matrix. */
86 /* The weight variable (or NULL if none) */
87 const struct variable *wv;
89 /* A set of matrices containing the 0th, 1st and 2nd moments */
92 /* The class of missing values to exclude */
93 enum mv_class exclude;
95 /* An array of doubles representing the covariance matrix.
96 Only the top triangle is included, and no diagonals */
100 /* 1 for single pass algorithm;
101 2 for double pass algorithm
106 0 : No pass has been made
107 1 : First pass has been started
108 2 : Second pass has been
110 IE: How many passes have been (partially) made. */
113 /* Flags indicating that the first case has been seen */
114 bool pass_one_first_case_seen;
115 bool pass_two_first_case_seen;
120 /* Return a matrix containing the M th moments.
121 The matrix is of size NxN where N is the number of variables.
122 Each row represents the moments of a variable.
123 In the absence of missing values, the columns of this matrix will
124 be identical. If missing values are involved, then element (i,j)
125 is the moment of the i th variable, when paired with the j th variable.
128 covariance_moments (const struct covariance *cov, int m)
130 return cov->moments[m];
135 /* Create a covariance struct.
138 covariance_1pass_create (size_t n_vars, const struct variable *const *vars,
139 const struct variable *weight, enum mv_class exclude)
142 struct covariance *cov = xzalloc (sizeof *cov);
146 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
151 cov->n_vars = n_vars;
154 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
156 for (i = 0; i < n_MOMENTS; ++i)
157 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
159 cov->exclude = exclude;
161 cov->n_cm = (n_vars * (n_vars - 1) ) / 2;
164 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
165 cov->categoricals = NULL;
171 Create a covariance struct for a two-pass algorithm. If categorical
172 variables are involed, the dimension cannot be know until after the
173 first data pass, so the actual covariances will not be allocated
177 covariance_2pass_create (size_t n_vars, const struct variable *const *vars,
178 struct categoricals *cats,
179 const struct variable *wv, enum mv_class exclude)
182 struct covariance *cov = xmalloc (sizeof *cov);
186 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
191 cov->n_vars = n_vars;
194 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
196 for (i = 0; i < n_MOMENTS; ++i)
197 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
199 cov->exclude = exclude;
204 cov->categoricals = cats;
209 /* Return an integer, which can be used to index
210 into COV->cm, to obtain the I, J th element
211 of the covariance matrix. If COV->cm does not
212 contain that element, then a negative value
216 cm_idx (const struct covariance *cov, int i, int j)
219 const int n2j = cov->dim - 2 - j;
220 const int nj = cov->dim - 2 ;
223 assert (j < cov->dim);
228 if (j >= cov->dim - 1)
235 as -= n2j * (n2j + 1) ;
243 Returns true iff the variable corresponding to the Ith element of the covariance matrix
244 has a missing value for case C
247 is_missing (const struct covariance *cov, int i, const struct ccase *c)
249 const struct variable *var = i < cov->n_vars ?
251 categoricals_get_interaction_by_subscript (cov->categoricals, i - cov->n_vars)->vars[0];
253 const union value *val = case_data (c, var);
255 return var_is_value_missing (var, val, cov->exclude);
260 get_val (const struct covariance *cov, int i, const struct ccase *c)
262 if ( i < cov->n_vars)
264 const struct variable *var = cov->vars[i];
266 const union value *val = case_data (c, var);
271 return categoricals_get_binary_by_subscript (cov->categoricals, i - cov->n_vars, c);
276 dump_matrix (const gsl_matrix *m)
280 for (i = 0 ; i < m->size1; ++i)
282 for (j = 0 ; j < m->size2; ++j)
283 printf ("%02f ", gsl_matrix_get (m, i, j));
289 /* Call this function for every case in the data set */
291 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
294 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
296 assert (cov->passes == 2);
297 if (!cov->pass_one_first_case_seen)
299 assert (cov->state == 0);
303 if (cov->categoricals)
304 categoricals_update (cov->categoricals, c);
306 for (i = 0 ; i < cov->dim; ++i)
308 double v1 = get_val (cov, i, c);
310 if ( is_missing (cov, i, c))
313 for (j = 0 ; j < cov->dim; ++j)
317 if ( is_missing (cov, j, c))
320 for (m = 0 ; m <= MOMENT_MEAN; ++m)
322 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
330 cov->pass_one_first_case_seen = true;
334 /* Call this function for every case in the data set */
336 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
339 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
341 assert (cov->passes == 2);
342 assert (cov->state >= 1);
344 if (! cov->pass_two_first_case_seen)
347 assert (cov->state == 1);
350 cov->dim = cov->n_vars;
352 if (cov->categoricals)
353 cov->dim += categoricals_total (cov->categoricals)
354 - categoricals_get_n_variables (cov->categoricals);
356 cov->n_cm = (cov->dim * (cov->dim - 1) ) / 2;
357 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
359 /* Grow the moment matrices so that they're large enough to accommodate the
360 categorical elements */
361 for (i = 0; i < n_MOMENTS; ++i)
363 cov->moments[i] = resize_matrix (cov->moments[i], cov->dim);
366 if (cov->categoricals)
367 categoricals_done (cov->categoricals);
369 /* Populate the moments matrices with the categorical value elements */
370 for (i = cov->n_vars; i < cov->dim; ++i)
372 for (j = 0 ; j < cov->dim ; ++j) /* FIXME: This is WRONG !!! */
374 double w = categoricals_get_weight_by_subscript (cov->categoricals, i - cov->n_vars);
376 gsl_matrix_set (cov->moments[MOMENT_NONE], i, j, w);
378 w = categoricals_get_sum_by_subscript (cov->categoricals, i - cov->n_vars);
380 gsl_matrix_set (cov->moments[MOMENT_MEAN], i, j, w);
384 /* FIXME: This is WRONG!! It must be fixed to properly handle missing values. For
385 now it assumes there are none */
386 for (m = 0 ; m < n_MOMENTS; ++m)
388 for (i = 0 ; i < cov->dim ; ++i)
390 double x = gsl_matrix_get (cov->moments[m], i, cov->n_vars -1);
391 for (j = cov->n_vars; j < cov->dim; ++j)
393 gsl_matrix_set (cov->moments[m], i, j, x);
398 /* Divide the means by the number of samples */
399 for (i = 0; i < cov->dim; ++i)
401 for (j = 0; j < cov->dim; ++j)
403 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
404 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
409 for (i = 0 ; i < cov->dim; ++i)
411 double v1 = get_val (cov, i, c);
413 if ( is_missing (cov, i, c))
416 for (j = 0 ; j < cov->dim; ++j)
420 double v2 = get_val (cov, j, c);
422 const double s = pow2 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
424 if ( is_missing (cov, j, c))
428 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
433 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
435 (v2 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
439 idx = cm_idx (cov, i, j);
447 cov->pass_two_first_case_seen = true;
451 /* Call this function for every case in the data set.
452 After all cases have been passed, call covariance_calculate
455 covariance_accumulate (struct covariance *cov, const struct ccase *c)
458 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
460 assert (cov->passes == 1);
462 if ( !cov->pass_one_first_case_seen)
464 assert ( cov->state == 0);
468 for (i = 0 ; i < cov->dim; ++i)
470 const union value *val1 = case_data (c, cov->vars[i]);
472 if ( is_missing (cov, i, c))
475 for (j = 0 ; j < cov->dim; ++j)
479 const union value *val2 = case_data (c, cov->vars[j]);
481 if ( is_missing (cov, j, c))
484 idx = cm_idx (cov, i, j);
487 cov->cm [idx] += val1->f * val2->f * weight;
490 for (m = 0 ; m < n_MOMENTS; ++m)
492 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
500 cov->pass_one_first_case_seen = true;
505 Allocate and return a gsl_matrix containing the covariances of the
509 cm_to_gsl (struct covariance *cov)
512 gsl_matrix *m = gsl_matrix_calloc (cov->dim, cov->dim);
514 /* Copy the non-diagonal elements from cov->cm */
515 for ( j = 0 ; j < cov->dim - 1; ++j)
517 for (i = j+1 ; i < cov->dim; ++i)
519 double x = cov->cm [cm_idx (cov, i, j)];
520 gsl_matrix_set (m, i, j, x);
521 gsl_matrix_set (m, j, i, x);
525 /* Copy the diagonal elements from cov->moments[2] */
526 for (j = 0 ; j < cov->dim ; ++j)
528 double sigma = gsl_matrix_get (cov->moments[2], j, j);
529 gsl_matrix_set (m, j, j, sigma);
537 covariance_calculate_double_pass (struct covariance *cov)
540 for (i = 0 ; i < cov->dim; ++i)
542 for (j = 0 ; j < cov->dim; ++j)
545 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
546 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
548 idx = cm_idx (cov, i, j);
552 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
557 return cm_to_gsl (cov);
561 covariance_calculate_single_pass (struct covariance *cov)
566 for (m = 0; m < n_MOMENTS; ++m)
568 /* Divide the moments by the number of samples */
571 for (i = 0 ; i < cov->dim; ++i)
573 for (j = 0 ; j < cov->dim; ++j)
575 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
576 *x /= gsl_matrix_get (cov->moments[0], i, j);
578 if ( m == MOMENT_VARIANCE)
579 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
585 /* Centre the moments */
586 for ( j = 0 ; j < cov->dim - 1; ++j)
588 for (i = j + 1 ; i < cov->dim; ++i)
590 double *x = &cov->cm [cm_idx (cov, i, j)];
592 *x /= gsl_matrix_get (cov->moments[0], i, j);
595 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
597 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
601 return cm_to_gsl (cov);
605 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
606 caller owns the returned matrix and must free it when it is no longer
609 Call this function only after all data have been accumulated. */
611 covariance_calculate (struct covariance *cov)
613 if ( cov->state <= 0 )
619 return covariance_calculate_single_pass (cov);
622 return covariance_calculate_double_pass (cov);
630 Covariance computed without dividing by the sample size.
633 covariance_calculate_double_pass_unnormalized (struct covariance *cov)
636 for (i = 0 ; i < cov->dim; ++i)
638 for (j = 0 ; j < cov->dim; ++j)
641 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
643 idx = cm_idx (cov, i, j);
651 return cm_to_gsl (cov);
655 covariance_calculate_single_pass_unnormalized (struct covariance *cov)
659 for (i = 0 ; i < cov->dim; ++i)
661 for (j = 0 ; j < cov->dim; ++j)
663 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
664 *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
665 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
668 for ( j = 0 ; j < cov->dim - 1; ++j)
670 for (i = j + 1 ; i < cov->dim; ++i)
672 double *x = &cov->cm [cm_idx (cov, i, j)];
675 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
677 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i)
678 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
682 return cm_to_gsl (cov);
686 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
687 caller owns the returned matrix and must free it when it is no longer
690 Call this function only after all data have been accumulated. */
692 covariance_calculate_unnormalized (struct covariance *cov)
694 if ( cov->state <= 0 )
700 return covariance_calculate_single_pass_unnormalized (cov);
703 return covariance_calculate_double_pass_unnormalized (cov);
710 /* Function to access the categoricals used by COV
711 The return value is owned by the COV
713 const struct categoricals *
714 covariance_get_categoricals (const struct covariance *cov)
716 return cov->categoricals;
720 /* Destroy the COV object */
722 covariance_destroy (struct covariance *cov)
726 categoricals_destroy (cov->categoricals);
728 for (i = 0; i < n_MOMENTS; ++i)
729 gsl_matrix_free (cov->moments[i]);
737 covariance_dim (const struct covariance * cov)