1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <libpspp/assertion.h>
20 #include "covariance.h"
21 #include <gl/xalloc.h>
23 #include <gsl/gsl_matrix.h>
24 #include <data/case.h>
25 #include <data/variable.h>
26 #include <libpspp/misc.h>
27 #include "categoricals.h"
29 #define n_MOMENTS (MOMENT_VARIANCE + 1)
32 /* Create a new matrix of NEW_SIZE x NEW_SIZE and copy the elements of
33 matrix IN into it. IN must be a square matrix, and in normal usage
34 it will be smaller than NEW_SIZE.
35 IN is destroyed by this function. The return value must be destroyed
36 when no longer required.
39 resize_matrix (gsl_matrix *in, size_t new_size)
43 gsl_matrix *out = NULL;
45 assert (in->size1 == in->size2);
47 if (new_size <= in->size1)
50 out = gsl_matrix_calloc (new_size, new_size);
52 for (i = 0; i < in->size1; ++i)
54 for (j = 0; j < in->size2; ++j)
56 double x = gsl_matrix_get (in, i, j);
58 gsl_matrix_set (out, i, j, x);
69 /* The variables for which the covariance matrix is to be calculated. */
71 const struct variable **vars;
73 /* Categorical variables. */
74 struct categoricals *categoricals;
76 /* Array containing number of categories per categorical variable. */
79 /* Dimension of the covariance matrix. */
82 /* The weight variable (or NULL if none) */
83 const struct variable *wv;
85 /* A set of matrices containing the 0th, 1st and 2nd moments */
88 /* The class of missing values to exclude */
89 enum mv_class exclude;
91 /* An array of doubles representing the covariance matrix.
92 Only the top triangle is included, and no diagonals */
96 /* 1 for single pass algorithm;
97 2 for double pass algorithm
102 0 : No pass has been made
103 1 : First pass has been started
104 2 : Second pass has been
106 IE: How many passes have been (partially) made. */
109 /* Flags indicating that the first case has been seen */
110 bool pass_one_first_case_seen;
111 bool pass_two_first_case_seen;
116 /* Return a matrix containing the M th moments.
117 The matrix is of size NxN where N is the number of variables.
118 Each row represents the moments of a variable.
119 In the absence of missing values, the columns of this matrix will
120 be identical. If missing values are involved, then element (i,j)
121 is the moment of the i th variable, when paired with the j th variable.
124 covariance_moments (const struct covariance *cov, int m)
126 return cov->moments[m];
131 /* Create a covariance struct.
134 covariance_1pass_create (size_t n_vars, const struct variable **vars,
135 const struct variable *weight, enum mv_class exclude)
138 struct covariance *cov = xmalloc (sizeof *cov);
142 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
147 cov->n_vars = n_vars;
150 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
152 for (i = 0; i < n_MOMENTS; ++i)
153 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
155 cov->exclude = exclude;
157 cov->n_cm = (n_vars * (n_vars - 1) ) / 2;
159 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
165 Create a covariance struct for a two-pass algorithm. If categorical
166 variables are involed, the dimension cannot be know until after the
167 first data pass, so the actual covariances will not be allocated
171 covariance_2pass_create (size_t n_vars, const struct variable **vars,
172 size_t n_catvars, const struct variable **catvars,
173 const struct variable *wv, enum mv_class exclude)
176 struct covariance *cov = xmalloc (sizeof *cov);
180 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
185 cov->n_vars = n_vars;
188 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
190 for (i = 0; i < n_MOMENTS; ++i)
191 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
193 cov->exclude = exclude;
198 cov->categoricals = categoricals_create (catvars, n_catvars, wv, exclude);
203 /* Return an integer, which can be used to index
204 into COV->cm, to obtain the I, J th element
205 of the covariance matrix. If COV->cm does not
206 contain that element, then a negative value
210 cm_idx (const struct covariance *cov, int i, int j)
213 const int n2j = cov->dim - 2 - j;
214 const int nj = cov->dim - 2 ;
217 assert (j < cov->dim);
222 if (j >= cov->dim - 1)
229 as -= n2j * (n2j + 1) ;
237 Returns true iff the variable corresponding to the Ith element of the covariance matrix
238 has a missing value for case C
241 is_missing (const struct covariance *cov, int i, const struct ccase *c)
243 const struct variable *var = i < cov->n_vars ?
245 categoricals_get_variable_by_subscript (cov->categoricals, i - cov->n_vars);
247 const union value *val = case_data (c, var);
249 return var_is_value_missing (var, val, cov->exclude);
254 get_val (const struct covariance *cov, int i, const struct ccase *c)
256 if ( i < cov->n_vars)
258 const struct variable *var = cov->vars[i];
260 const union value *val = case_data (c, var);
265 return categoricals_get_binary_by_subscript (cov->categoricals, i - cov->n_vars, c);
269 dump_matrix (const gsl_matrix *m)
273 for (i = 0 ; i < m->size1; ++i)
275 for (j = 0 ; j < m->size2; ++j)
276 printf ("%02f ", gsl_matrix_get (m, i, j));
281 /* Call this function for every case in the data set */
283 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
286 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
288 assert (cov->passes == 2);
289 if (!cov->pass_one_first_case_seen)
291 assert (cov->state == 0);
295 categoricals_update (cov->categoricals, c);
297 for (i = 0 ; i < cov->dim; ++i)
299 double v1 = get_val (cov, i, c);
301 if ( is_missing (cov, i, c))
304 for (j = 0 ; j < cov->dim; ++j)
308 if ( is_missing (cov, j, c))
311 for (m = 0 ; m <= MOMENT_MEAN; ++m)
313 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
321 cov->pass_one_first_case_seen = true;
325 /* Call this function for every case in the data set */
327 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
330 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
332 assert (cov->passes == 2);
333 assert (cov->state >= 1);
335 if (! cov->pass_two_first_case_seen)
338 assert (cov->state == 1);
341 cov->dim = cov->n_vars +
342 categoricals_total (cov->categoricals) - categoricals_get_n_variables (cov->categoricals);
344 cov->n_cm = (cov->dim * (cov->dim - 1) ) / 2;
345 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
347 /* Grow the moment matrices so that they're large enough to accommodate the
348 categorical elements */
349 for (i = 0; i < n_MOMENTS; ++i)
351 cov->moments[i] = resize_matrix (cov->moments[i], cov->dim);
354 categoricals_done (cov->categoricals);
356 /* Populate the moments matrices with the categorical value elements */
357 for (i = cov->n_vars; i < cov->dim; ++i)
359 for (j = 0 ; j < cov->dim ; ++j) /* FIXME: This is WRONG !!! */
361 double w = categoricals_get_weight_by_subscript (cov->categoricals, i - cov->n_vars);
363 gsl_matrix_set (cov->moments[MOMENT_NONE], i, j, w);
365 w = categoricals_get_sum_by_subscript (cov->categoricals, i - cov->n_vars);
367 gsl_matrix_set (cov->moments[MOMENT_MEAN], i, j, w);
371 /* FIXME: This is WRONG!! It must be fixed to properly handle missing values. For
372 now it assumes there are none */
373 for (m = 0 ; m < n_MOMENTS; ++m)
375 for (i = 0 ; i < cov->dim ; ++i)
377 double x = gsl_matrix_get (cov->moments[m], i, cov->n_vars -1);
378 for (j = cov->n_vars; j < cov->dim; ++j)
380 gsl_matrix_set (cov->moments[m], i, j, x);
385 /* Divide the means by the number of samples */
386 for (i = 0; i < cov->dim; ++i)
388 for (j = 0; j < cov->dim; ++j)
390 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
391 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
396 for (i = 0 ; i < cov->dim; ++i)
398 double v1 = get_val (cov, i, c);
400 if ( is_missing (cov, i, c))
403 for (j = 0 ; j < cov->dim; ++j)
407 double v2 = get_val (cov, j, c);
409 const double s = pow2 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
411 if ( is_missing (cov, j, c))
415 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
420 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
422 (v2 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
426 idx = cm_idx (cov, i, j);
434 cov->pass_two_first_case_seen = true;
438 /* Call this function for every case in the data set.
439 After all cases have been passed, call covariance_calculate
442 covariance_accumulate (struct covariance *cov, const struct ccase *c)
445 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
447 assert (cov->passes == 1);
449 if ( !cov->pass_one_first_case_seen)
451 assert ( cov->state == 0);
455 for (i = 0 ; i < cov->dim; ++i)
457 const union value *val1 = case_data (c, cov->vars[i]);
459 if ( is_missing (cov, i, c))
462 for (j = 0 ; j < cov->dim; ++j)
466 const union value *val2 = case_data (c, cov->vars[j]);
468 if ( is_missing (cov, j, c))
471 idx = cm_idx (cov, i, j);
474 cov->cm [idx] += val1->f * val2->f * weight;
477 for (m = 0 ; m < n_MOMENTS; ++m)
479 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
487 cov->pass_one_first_case_seen = true;
492 Allocate and return a gsl_matrix containing the covariances of the
496 cm_to_gsl (struct covariance *cov)
499 gsl_matrix *m = gsl_matrix_calloc (cov->dim, cov->dim);
501 /* Copy the non-diagonal elements from cov->cm */
502 for ( j = 0 ; j < cov->dim - 1; ++j)
504 for (i = j+1 ; i < cov->dim; ++i)
506 double x = cov->cm [cm_idx (cov, i, j)];
507 gsl_matrix_set (m, i, j, x);
508 gsl_matrix_set (m, j, i, x);
512 /* Copy the diagonal elements from cov->moments[2] */
513 for (j = 0 ; j < cov->dim ; ++j)
515 double sigma = gsl_matrix_get (cov->moments[2], j, j);
516 gsl_matrix_set (m, j, j, sigma);
523 static const gsl_matrix *
524 covariance_calculate_double_pass (struct covariance *cov)
527 for (i = 0 ; i < cov->dim; ++i)
529 for (j = 0 ; j < cov->dim; ++j)
532 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
533 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
535 idx = cm_idx (cov, i, j);
539 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
544 return cm_to_gsl (cov);
547 static const gsl_matrix *
548 covariance_calculate_single_pass (struct covariance *cov)
553 for (m = 0; m < n_MOMENTS; ++m)
555 /* Divide the moments by the number of samples */
558 for (i = 0 ; i < cov->dim; ++i)
560 for (j = 0 ; j < cov->dim; ++j)
562 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
563 *x /= gsl_matrix_get (cov->moments[0], i, j);
565 if ( m == MOMENT_VARIANCE)
566 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
572 /* Centre the moments */
573 for ( j = 0 ; j < cov->dim - 1; ++j)
575 for (i = j + 1 ; i < cov->dim; ++i)
577 double *x = &cov->cm [cm_idx (cov, i, j)];
579 *x /= gsl_matrix_get (cov->moments[0], i, j);
582 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
584 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
588 return cm_to_gsl (cov);
594 Return a pointer to gsl_matrix containing the pairwise covariances.
595 The matrix remains owned by the COV object, and must not be freed.
596 Call this function only after all data have been accumulated.
599 covariance_calculate (struct covariance *cov)
601 assert ( cov->state > 0 );
606 return covariance_calculate_single_pass (cov);
609 return covariance_calculate_double_pass (cov);
619 /* Destroy the COV object */
621 covariance_destroy (struct covariance *cov)
625 categoricals_destroy (cov->categoricals);
627 for (i = 0; i < n_MOMENTS; ++i)
628 gsl_matrix_free (cov->moments[i]);