1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <libpspp/assertion.h>
20 #include "covariance.h"
21 #include <gl/xalloc.h>
23 #include <gsl/gsl_matrix.h>
24 #include <data/case.h>
25 #include <data/variable.h>
26 #include <libpspp/misc.h>
27 #include "categoricals.h"
29 #define n_MOMENTS (MOMENT_VARIANCE + 1)
32 /* Create a new matrix of NEW_SIZE x NEW_SIZE and copy the elements of
33 matrix IN into it. IN must be a square matrix, and in normal usage
34 it will be smaller than NEW_SIZE.
35 IN is destroyed by this function. The return value must be destroyed
36 when no longer required.
39 resize_matrix (gsl_matrix *in, size_t new_size)
43 gsl_matrix *out = NULL;
45 assert (in->size1 == in->size2);
47 if (new_size <= in->size1)
50 out = gsl_matrix_calloc (new_size, new_size);
52 for (i = 0; i < in->size1; ++i)
54 for (j = 0; j < in->size2; ++j)
56 double x = gsl_matrix_get (in, i, j);
58 gsl_matrix_set (out, i, j, x);
69 /* The variables for which the covariance matrix is to be calculated. */
71 const struct variable **vars;
73 /* Categorical variables. */
74 struct categoricals *categoricals;
76 /* Array containing number of categories per categorical variable. */
79 /* Dimension of the covariance matrix. */
82 /* The weight variable (or NULL if none) */
83 const struct variable *wv;
85 /* A set of matrices containing the 0th, 1st and 2nd moments */
88 /* The class of missing values to exclude */
89 enum mv_class exclude;
91 /* An array of doubles representing the covariance matrix.
92 Only the top triangle is included, and no diagonals */
96 /* 1 for single pass algorithm;
97 2 for double pass algorithm
102 0 : No pass has been made
103 1 : First pass has been started
104 2 : Second pass has been
106 IE: How many passes have been (partially) made. */
109 /* Flags indicating that the first case has been seen */
110 bool pass_one_first_case_seen;
111 bool pass_two_first_case_seen;
116 /* Return a matrix containing the M th moments.
117 The matrix is of size NxN where N is the number of variables.
118 Each row represents the moments of a variable.
119 In the absence of missing values, the columns of this matrix will
120 be identical. If missing values are involved, then element (i,j)
121 is the moment of the i th variable, when paired with the j th variable.
124 covariance_moments (const struct covariance *cov, int m)
126 return cov->moments[m];
131 /* Create a covariance struct.
134 covariance_1pass_create (size_t n_vars, const struct variable **vars,
135 const struct variable *weight, enum mv_class exclude)
138 struct covariance *cov = xmalloc (sizeof *cov);
142 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
147 cov->n_vars = n_vars;
150 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
152 for (i = 0; i < n_MOMENTS; ++i)
153 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
155 cov->exclude = exclude;
157 cov->n_cm = (n_vars * (n_vars - 1) ) / 2;
159 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
160 cov->categoricals = NULL;
166 Create a covariance struct for a two-pass algorithm. If categorical
167 variables are involed, the dimension cannot be know until after the
168 first data pass, so the actual covariances will not be allocated
172 covariance_2pass_create (size_t n_vars, const struct variable **vars,
173 size_t n_catvars, const struct variable **catvars,
174 const struct variable *wv, enum mv_class exclude)
177 struct covariance *cov = xmalloc (sizeof *cov);
181 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
186 cov->n_vars = n_vars;
189 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
191 for (i = 0; i < n_MOMENTS; ++i)
192 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
194 cov->exclude = exclude;
199 cov->categoricals = categoricals_create (catvars, n_catvars, wv, exclude);
204 /* Return an integer, which can be used to index
205 into COV->cm, to obtain the I, J th element
206 of the covariance matrix. If COV->cm does not
207 contain that element, then a negative value
211 cm_idx (const struct covariance *cov, int i, int j)
214 const int n2j = cov->dim - 2 - j;
215 const int nj = cov->dim - 2 ;
218 assert (j < cov->dim);
223 if (j >= cov->dim - 1)
230 as -= n2j * (n2j + 1) ;
238 Returns true iff the variable corresponding to the Ith element of the covariance matrix
239 has a missing value for case C
242 is_missing (const struct covariance *cov, int i, const struct ccase *c)
244 const struct variable *var = i < cov->n_vars ?
246 categoricals_get_variable_by_subscript (cov->categoricals, i - cov->n_vars);
248 const union value *val = case_data (c, var);
250 return var_is_value_missing (var, val, cov->exclude);
255 get_val (const struct covariance *cov, int i, const struct ccase *c)
257 if ( i < cov->n_vars)
259 const struct variable *var = cov->vars[i];
261 const union value *val = case_data (c, var);
266 return categoricals_get_binary_by_subscript (cov->categoricals, i - cov->n_vars, c);
270 dump_matrix (const gsl_matrix *m)
274 for (i = 0 ; i < m->size1; ++i)
276 for (j = 0 ; j < m->size2; ++j)
277 printf ("%02f ", gsl_matrix_get (m, i, j));
282 /* Call this function for every case in the data set */
284 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
287 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
289 assert (cov->passes == 2);
290 if (!cov->pass_one_first_case_seen)
292 assert (cov->state == 0);
296 categoricals_update (cov->categoricals, c);
298 for (i = 0 ; i < cov->dim; ++i)
300 double v1 = get_val (cov, i, c);
302 if ( is_missing (cov, i, c))
305 for (j = 0 ; j < cov->dim; ++j)
309 if ( is_missing (cov, j, c))
312 for (m = 0 ; m <= MOMENT_MEAN; ++m)
314 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
322 cov->pass_one_first_case_seen = true;
326 /* Call this function for every case in the data set */
328 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
331 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
333 assert (cov->passes == 2);
334 assert (cov->state >= 1);
336 if (! cov->pass_two_first_case_seen)
339 assert (cov->state == 1);
342 cov->dim = cov->n_vars +
343 categoricals_total (cov->categoricals) - categoricals_get_n_variables (cov->categoricals);
345 cov->n_cm = (cov->dim * (cov->dim - 1) ) / 2;
346 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
348 /* Grow the moment matrices so that they're large enough to accommodate the
349 categorical elements */
350 for (i = 0; i < n_MOMENTS; ++i)
352 cov->moments[i] = resize_matrix (cov->moments[i], cov->dim);
355 categoricals_done (cov->categoricals);
357 /* Populate the moments matrices with the categorical value elements */
358 for (i = cov->n_vars; i < cov->dim; ++i)
360 for (j = 0 ; j < cov->dim ; ++j) /* FIXME: This is WRONG !!! */
362 double w = categoricals_get_weight_by_subscript (cov->categoricals, i - cov->n_vars);
364 gsl_matrix_set (cov->moments[MOMENT_NONE], i, j, w);
366 w = categoricals_get_sum_by_subscript (cov->categoricals, i - cov->n_vars);
368 gsl_matrix_set (cov->moments[MOMENT_MEAN], i, j, w);
372 /* FIXME: This is WRONG!! It must be fixed to properly handle missing values. For
373 now it assumes there are none */
374 for (m = 0 ; m < n_MOMENTS; ++m)
376 for (i = 0 ; i < cov->dim ; ++i)
378 double x = gsl_matrix_get (cov->moments[m], i, cov->n_vars -1);
379 for (j = cov->n_vars; j < cov->dim; ++j)
381 gsl_matrix_set (cov->moments[m], i, j, x);
386 /* Divide the means by the number of samples */
387 for (i = 0; i < cov->dim; ++i)
389 for (j = 0; j < cov->dim; ++j)
391 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
392 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
397 for (i = 0 ; i < cov->dim; ++i)
399 double v1 = get_val (cov, i, c);
401 if ( is_missing (cov, i, c))
404 for (j = 0 ; j < cov->dim; ++j)
408 double v2 = get_val (cov, j, c);
410 const double s = pow2 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
412 if ( is_missing (cov, j, c))
416 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
421 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
423 (v2 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
427 idx = cm_idx (cov, i, j);
435 cov->pass_two_first_case_seen = true;
439 /* Call this function for every case in the data set.
440 After all cases have been passed, call covariance_calculate
443 covariance_accumulate (struct covariance *cov, const struct ccase *c)
446 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
448 assert (cov->passes == 1);
450 if ( !cov->pass_one_first_case_seen)
452 assert ( cov->state == 0);
456 for (i = 0 ; i < cov->dim; ++i)
458 const union value *val1 = case_data (c, cov->vars[i]);
460 if ( is_missing (cov, i, c))
463 for (j = 0 ; j < cov->dim; ++j)
467 const union value *val2 = case_data (c, cov->vars[j]);
469 if ( is_missing (cov, j, c))
472 idx = cm_idx (cov, i, j);
475 cov->cm [idx] += val1->f * val2->f * weight;
478 for (m = 0 ; m < n_MOMENTS; ++m)
480 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
488 cov->pass_one_first_case_seen = true;
493 Allocate and return a gsl_matrix containing the covariances of the
497 cm_to_gsl (struct covariance *cov)
500 gsl_matrix *m = gsl_matrix_calloc (cov->dim, cov->dim);
502 /* Copy the non-diagonal elements from cov->cm */
503 for ( j = 0 ; j < cov->dim - 1; ++j)
505 for (i = j+1 ; i < cov->dim; ++i)
507 double x = cov->cm [cm_idx (cov, i, j)];
508 gsl_matrix_set (m, i, j, x);
509 gsl_matrix_set (m, j, i, x);
513 /* Copy the diagonal elements from cov->moments[2] */
514 for (j = 0 ; j < cov->dim ; ++j)
516 double sigma = gsl_matrix_get (cov->moments[2], j, j);
517 gsl_matrix_set (m, j, j, sigma);
524 static const gsl_matrix *
525 covariance_calculate_double_pass (struct covariance *cov)
528 for (i = 0 ; i < cov->dim; ++i)
530 for (j = 0 ; j < cov->dim; ++j)
533 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
534 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
536 idx = cm_idx (cov, i, j);
540 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
545 return cm_to_gsl (cov);
548 static const gsl_matrix *
549 covariance_calculate_single_pass (struct covariance *cov)
554 for (m = 0; m < n_MOMENTS; ++m)
556 /* Divide the moments by the number of samples */
559 for (i = 0 ; i < cov->dim; ++i)
561 for (j = 0 ; j < cov->dim; ++j)
563 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
564 *x /= gsl_matrix_get (cov->moments[0], i, j);
566 if ( m == MOMENT_VARIANCE)
567 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
573 /* Centre the moments */
574 for ( j = 0 ; j < cov->dim - 1; ++j)
576 for (i = j + 1 ; i < cov->dim; ++i)
578 double *x = &cov->cm [cm_idx (cov, i, j)];
580 *x /= gsl_matrix_get (cov->moments[0], i, j);
583 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
585 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
589 return cm_to_gsl (cov);
594 Return a pointer to gsl_matrix containing the pairwise covariances.
595 The matrix remains owned by the COV object, and must not be freed.
596 Call this function only after all data have been accumulated.
599 covariance_calculate (struct covariance *cov)
601 assert ( cov->state > 0 );
606 return covariance_calculate_single_pass (cov);
609 return covariance_calculate_double_pass (cov);
617 Covariance computed without dividing by the sample size.
619 static const gsl_matrix *
620 covariance_calculate_double_pass_unnormalized (struct covariance *cov)
623 for (i = 0 ; i < cov->dim; ++i)
625 for (j = 0 ; j < cov->dim; ++j)
628 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
630 idx = cm_idx (cov, i, j);
638 return cm_to_gsl (cov);
641 static const gsl_matrix *
642 covariance_calculate_single_pass_unnormalized (struct covariance *cov)
647 for (i = 0 ; i < cov->dim; ++i)
649 for (j = 0 ; j < cov->dim; ++j)
651 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
652 *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
653 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
656 for ( j = 0 ; j < cov->dim - 1; ++j)
658 for (i = j + 1 ; i < cov->dim; ++i)
660 double *x = &cov->cm [cm_idx (cov, i, j)];
663 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
665 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i)
666 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
670 return cm_to_gsl (cov);
675 Return a pointer to gsl_matrix containing the pairwise covariances.
676 The matrix remains owned by the COV object, and must not be freed.
677 Call this function only after all data have been accumulated.
680 covariance_calculate_unnormalized (struct covariance *cov)
682 assert ( cov->state > 0 );
687 return covariance_calculate_single_pass_unnormalized (cov);
690 return covariance_calculate_double_pass_unnormalized (cov);
699 /* Destroy the COV object */
701 covariance_destroy (struct covariance *cov)
705 categoricals_destroy (cov->categoricals);
707 for (i = 0; i < n_MOMENTS; ++i)
708 gsl_matrix_free (cov->moments[i]);