1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <libpspp/assertion.h>
20 #include "covariance.h"
21 #include <gl/xalloc.h>
23 #include <gsl/gsl_matrix.h>
24 #include <data/case.h>
25 #include <data/variable.h>
26 #include <libpspp/misc.h>
27 #include "categoricals.h"
29 #define n_MOMENTS (MOMENT_VARIANCE + 1)
32 /* Create a new matrix of NEW_SIZE x NEW_SIZE and copy the elements of
33 matrix IN into it. IN must be a square matrix, and in normal usage
34 it will be smaller than NEW_SIZE.
35 IN is destroyed by this function. The return value must be destroyed
36 when no longer required.
39 resize_matrix (gsl_matrix *in, size_t new_size)
43 gsl_matrix *out = NULL;
45 assert (in->size1 == in->size2);
47 if (new_size <= in->size1)
50 out = gsl_matrix_calloc (new_size, new_size);
52 for (i = 0; i < in->size1; ++i)
54 for (j = 0; j < in->size2; ++j)
56 double x = gsl_matrix_get (in, i, j);
58 gsl_matrix_set (out, i, j, x);
69 /* The variables for which the covariance matrix is to be calculated. */
71 const struct variable *const *vars;
73 /* Categorical variables. */
74 struct categoricals *categoricals;
76 /* Array containing number of categories per categorical variable. */
79 /* Dimension of the covariance matrix. */
82 /* The weight variable (or NULL if none) */
83 const struct variable *wv;
85 /* A set of matrices containing the 0th, 1st and 2nd moments */
88 /* The class of missing values to exclude */
89 enum mv_class exclude;
91 /* An array of doubles representing the covariance matrix.
92 Only the top triangle is included, and no diagonals */
96 /* 1 for single pass algorithm;
97 2 for double pass algorithm
102 0 : No pass has been made
103 1 : First pass has been started
104 2 : Second pass has been
106 IE: How many passes have been (partially) made. */
109 /* Flags indicating that the first case has been seen */
110 bool pass_one_first_case_seen;
111 bool pass_two_first_case_seen;
116 /* Return a matrix containing the M th moments.
117 The matrix is of size NxN where N is the number of variables.
118 Each row represents the moments of a variable.
119 In the absence of missing values, the columns of this matrix will
120 be identical. If missing values are involved, then element (i,j)
121 is the moment of the i th variable, when paired with the j th variable.
124 covariance_moments (const struct covariance *cov, int m)
126 return cov->moments[m];
131 /* Create a covariance struct.
134 covariance_1pass_create (size_t n_vars, const struct variable *const *vars,
135 const struct variable *weight, enum mv_class exclude)
138 struct covariance *cov = xzalloc (sizeof *cov);
142 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
147 cov->n_vars = n_vars;
150 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
152 for (i = 0; i < n_MOMENTS; ++i)
153 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
155 cov->exclude = exclude;
157 cov->n_cm = (n_vars * (n_vars - 1) ) / 2;
160 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
161 cov->categoricals = NULL;
167 Create a covariance struct for a two-pass algorithm. If categorical
168 variables are involed, the dimension cannot be know until after the
169 first data pass, so the actual covariances will not be allocated
173 covariance_2pass_create (size_t n_vars, const struct variable *const *vars,
174 struct categoricals *cats,
175 const struct variable *wv, enum mv_class exclude)
178 struct covariance *cov = xmalloc (sizeof *cov);
182 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
187 cov->n_vars = n_vars;
190 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
192 for (i = 0; i < n_MOMENTS; ++i)
193 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
195 cov->exclude = exclude;
200 cov->categoricals = cats;
205 /* Return an integer, which can be used to index
206 into COV->cm, to obtain the I, J th element
207 of the covariance matrix. If COV->cm does not
208 contain that element, then a negative value
212 cm_idx (const struct covariance *cov, int i, int j)
215 const int n2j = cov->dim - 2 - j;
216 const int nj = cov->dim - 2 ;
219 assert (j < cov->dim);
224 if (j >= cov->dim - 1)
231 as -= n2j * (n2j + 1) ;
239 Returns true iff the variable corresponding to the Ith element of the covariance matrix
240 has a missing value for case C
243 is_missing (const struct covariance *cov, int i, const struct ccase *c)
245 const struct variable *var = i < cov->n_vars ?
247 categoricals_get_variable_by_subscript (cov->categoricals, i - cov->n_vars);
249 const union value *val = case_data (c, var);
251 return var_is_value_missing (var, val, cov->exclude);
256 get_val (const struct covariance *cov, int i, const struct ccase *c)
258 if ( i < cov->n_vars)
260 const struct variable *var = cov->vars[i];
262 const union value *val = case_data (c, var);
267 return categoricals_get_binary_by_subscript (cov->categoricals, i - cov->n_vars, c);
272 dump_matrix (const gsl_matrix *m)
276 for (i = 0 ; i < m->size1; ++i)
278 for (j = 0 ; j < m->size2; ++j)
279 printf ("%02f ", gsl_matrix_get (m, i, j));
285 /* Call this function for every case in the data set */
287 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
290 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
292 assert (cov->passes == 2);
293 if (!cov->pass_one_first_case_seen)
295 assert (cov->state == 0);
299 if (cov->categoricals)
300 categoricals_update (cov->categoricals, c);
302 for (i = 0 ; i < cov->dim; ++i)
304 double v1 = get_val (cov, i, c);
306 if ( is_missing (cov, i, c))
309 for (j = 0 ; j < cov->dim; ++j)
313 if ( is_missing (cov, j, c))
316 for (m = 0 ; m <= MOMENT_MEAN; ++m)
318 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
326 cov->pass_one_first_case_seen = true;
330 /* Call this function for every case in the data set */
332 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
335 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
337 assert (cov->passes == 2);
338 assert (cov->state >= 1);
340 if (! cov->pass_two_first_case_seen)
343 assert (cov->state == 1);
346 cov->dim = cov->n_vars;
348 if (cov->categoricals)
349 cov->dim += categoricals_total (cov->categoricals)
350 - categoricals_get_n_variables (cov->categoricals);
352 cov->n_cm = (cov->dim * (cov->dim - 1) ) / 2;
353 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
355 /* Grow the moment matrices so that they're large enough to accommodate the
356 categorical elements */
357 for (i = 0; i < n_MOMENTS; ++i)
359 cov->moments[i] = resize_matrix (cov->moments[i], cov->dim);
362 if (cov->categoricals)
363 categoricals_done (cov->categoricals);
365 /* Populate the moments matrices with the categorical value elements */
366 for (i = cov->n_vars; i < cov->dim; ++i)
368 for (j = 0 ; j < cov->dim ; ++j) /* FIXME: This is WRONG !!! */
370 double w = categoricals_get_weight_by_subscript (cov->categoricals, i - cov->n_vars);
372 gsl_matrix_set (cov->moments[MOMENT_NONE], i, j, w);
374 w = categoricals_get_sum_by_subscript (cov->categoricals, i - cov->n_vars);
376 gsl_matrix_set (cov->moments[MOMENT_MEAN], i, j, w);
380 /* FIXME: This is WRONG!! It must be fixed to properly handle missing values. For
381 now it assumes there are none */
382 for (m = 0 ; m < n_MOMENTS; ++m)
384 for (i = 0 ; i < cov->dim ; ++i)
386 double x = gsl_matrix_get (cov->moments[m], i, cov->n_vars -1);
387 for (j = cov->n_vars; j < cov->dim; ++j)
389 gsl_matrix_set (cov->moments[m], i, j, x);
394 /* Divide the means by the number of samples */
395 for (i = 0; i < cov->dim; ++i)
397 for (j = 0; j < cov->dim; ++j)
399 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
400 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
405 for (i = 0 ; i < cov->dim; ++i)
407 double v1 = get_val (cov, i, c);
409 if ( is_missing (cov, i, c))
412 for (j = 0 ; j < cov->dim; ++j)
416 double v2 = get_val (cov, j, c);
418 const double s = pow2 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
420 if ( is_missing (cov, j, c))
424 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
429 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
431 (v2 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
435 idx = cm_idx (cov, i, j);
443 cov->pass_two_first_case_seen = true;
447 /* Call this function for every case in the data set.
448 After all cases have been passed, call covariance_calculate
451 covariance_accumulate (struct covariance *cov, const struct ccase *c)
454 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
456 assert (cov->passes == 1);
458 if ( !cov->pass_one_first_case_seen)
460 assert ( cov->state == 0);
464 for (i = 0 ; i < cov->dim; ++i)
466 const union value *val1 = case_data (c, cov->vars[i]);
468 if ( is_missing (cov, i, c))
471 for (j = 0 ; j < cov->dim; ++j)
475 const union value *val2 = case_data (c, cov->vars[j]);
477 if ( is_missing (cov, j, c))
480 idx = cm_idx (cov, i, j);
483 cov->cm [idx] += val1->f * val2->f * weight;
486 for (m = 0 ; m < n_MOMENTS; ++m)
488 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
496 cov->pass_one_first_case_seen = true;
501 Allocate and return a gsl_matrix containing the covariances of the
505 cm_to_gsl (struct covariance *cov)
508 gsl_matrix *m = gsl_matrix_calloc (cov->dim, cov->dim);
510 /* Copy the non-diagonal elements from cov->cm */
511 for ( j = 0 ; j < cov->dim - 1; ++j)
513 for (i = j+1 ; i < cov->dim; ++i)
515 double x = cov->cm [cm_idx (cov, i, j)];
516 gsl_matrix_set (m, i, j, x);
517 gsl_matrix_set (m, j, i, x);
521 /* Copy the diagonal elements from cov->moments[2] */
522 for (j = 0 ; j < cov->dim ; ++j)
524 double sigma = gsl_matrix_get (cov->moments[2], j, j);
525 gsl_matrix_set (m, j, j, sigma);
532 static const gsl_matrix *
533 covariance_calculate_double_pass (struct covariance *cov)
536 for (i = 0 ; i < cov->dim; ++i)
538 for (j = 0 ; j < cov->dim; ++j)
541 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
542 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
544 idx = cm_idx (cov, i, j);
548 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
553 return cm_to_gsl (cov);
556 static const gsl_matrix *
557 covariance_calculate_single_pass (struct covariance *cov)
562 for (m = 0; m < n_MOMENTS; ++m)
564 /* Divide the moments by the number of samples */
567 for (i = 0 ; i < cov->dim; ++i)
569 for (j = 0 ; j < cov->dim; ++j)
571 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
572 *x /= gsl_matrix_get (cov->moments[0], i, j);
574 if ( m == MOMENT_VARIANCE)
575 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
581 /* Centre the moments */
582 for ( j = 0 ; j < cov->dim - 1; ++j)
584 for (i = j + 1 ; i < cov->dim; ++i)
586 double *x = &cov->cm [cm_idx (cov, i, j)];
588 *x /= gsl_matrix_get (cov->moments[0], i, j);
591 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
593 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
597 return cm_to_gsl (cov);
602 Return a pointer to gsl_matrix containing the pairwise covariances.
603 The matrix remains owned by the COV object, and must not be freed.
604 Call this function only after all data have been accumulated.
607 covariance_calculate (struct covariance *cov)
609 if ( cov->state <= 0 )
615 return covariance_calculate_single_pass (cov);
618 return covariance_calculate_double_pass (cov);
626 Covariance computed without dividing by the sample size.
628 static const gsl_matrix *
629 covariance_calculate_double_pass_unnormalized (struct covariance *cov)
632 for (i = 0 ; i < cov->dim; ++i)
634 for (j = 0 ; j < cov->dim; ++j)
637 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
639 idx = cm_idx (cov, i, j);
647 return cm_to_gsl (cov);
650 static const gsl_matrix *
651 covariance_calculate_single_pass_unnormalized (struct covariance *cov)
655 for (i = 0 ; i < cov->dim; ++i)
657 for (j = 0 ; j < cov->dim; ++j)
659 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
660 *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
661 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
664 for ( j = 0 ; j < cov->dim - 1; ++j)
666 for (i = j + 1 ; i < cov->dim; ++i)
668 double *x = &cov->cm [cm_idx (cov, i, j)];
671 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
673 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i)
674 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
678 return cm_to_gsl (cov);
683 Return a pointer to gsl_matrix containing the pairwise covariances.
684 The matrix remains owned by the COV object, and must not be freed.
685 Call this function only after all data have been accumulated.
688 covariance_calculate_unnormalized (struct covariance *cov)
690 if ( cov->state <= 0 )
696 return covariance_calculate_single_pass_unnormalized (cov);
699 return covariance_calculate_double_pass_unnormalized (cov);
706 /* Function to access the categoricals used by COV
707 The return value is owned by the COV
709 const struct categoricals *
710 covariance_get_categoricals (const struct covariance *cov)
712 return cov->categoricals;
716 /* Destroy the COV object */
718 covariance_destroy (struct covariance *cov)
722 categoricals_destroy (cov->categoricals);
724 for (i = 0; i < n_MOMENTS; ++i)
725 gsl_matrix_free (cov->moments[i]);