1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <libpspp/assertion.h>
20 #include "covariance.h"
21 #include <gl/xalloc.h>
23 #include <gsl/gsl_matrix.h>
24 #include <data/case.h>
25 #include <data/variable.h>
26 #include <libpspp/misc.h>
27 #include "categoricals.h"
29 #define n_MOMENTS (MOMENT_VARIANCE + 1)
34 /* The variables for which the covariance matrix is to be calculated. */
36 const struct variable **vars;
38 /* Categorical variables. */
39 struct categoricals *categoricals;
41 /* Array containing number of categories per categorical variable. */
44 /* Dimension of the covariance matrix. */
47 /* The weight variable (or NULL if none) */
48 const struct variable *wv;
50 /* A set of matrices containing the 0th, 1st and 2nd moments */
53 /* The class of missing values to exclude */
54 enum mv_class exclude;
56 /* An array of doubles representing the covariance matrix.
57 Only the top triangle is included, and no diagonals */
61 /* 1 for single pass algorithm;
62 2 for double pass algorithm
67 0 : No pass has been made
68 1 : First pass has been started
69 2 : Second pass has been
71 IE: How many passes have been (partially) made. */
74 /* Flags indicating that the first case has been seen */
75 bool pass_one_first_case_seen;
76 bool pass_two_first_case_seen;
81 /* Return a matrix containing the M th moments.
82 The matrix is of size NxN where N is the number of variables.
83 Each row represents the moments of a variable.
84 In the absence of missing values, the columns of this matrix will
85 be identical. If missing values are involved, then element (i,j)
86 is the moment of the i th variable, when paired with the j th variable.
89 covariance_moments (const struct covariance *cov, int m)
91 return cov->moments[m];
96 /* Create a covariance struct.
99 covariance_1pass_create (size_t n_vars, const struct variable **vars,
100 const struct variable *weight, enum mv_class exclude)
103 struct covariance *cov = xmalloc (sizeof *cov);
107 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
112 cov->n_vars = n_vars;
115 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
117 for (i = 0; i < n_MOMENTS; ++i)
118 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
120 cov->exclude = exclude;
122 cov->n_cm = (n_vars * (n_vars - 1) ) / 2;
124 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
130 Create a covariance struct for a two-pass algorithm. If categorical
131 variables are involed, the dimension cannot be know until after the
132 first data pass, so the actual covariances will not be allocated
136 covariance_2pass_create (size_t n_vars, const struct variable **vars,
137 size_t n_catvars, const struct variable **catvars,
138 const struct variable *wv, enum mv_class exclude)
141 struct covariance *cov = xmalloc (sizeof *cov);
145 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
150 cov->n_vars = n_vars;
153 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
155 for (i = 0; i < n_MOMENTS; ++i)
156 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
158 cov->exclude = exclude;
163 cov->categoricals = categoricals_create (catvars, n_catvars, wv);
168 /* Return an integer, which can be used to index
169 into COV->cm, to obtain the I, J th element
170 of the covariance matrix. If COV->cm does not
171 contain that element, then a negative value
175 cm_idx (const struct covariance *cov, int i, int j)
178 const int n2j = cov->n_vars - 2 - j;
179 const int nj = cov->n_vars - 2 ;
182 assert (j < cov->n_vars);
187 if (j >= cov->n_vars - 1)
194 as -= n2j * (n2j + 1) ;
201 dump_matrix (const gsl_matrix *m)
205 for (i = 0 ; i < m->size1; ++i)
207 for (j = 0 ; j < m->size2; ++j)
208 printf ("%02f ", gsl_matrix_get (m, i, j));
213 /* Call this function for every case in the data set */
215 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
218 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
220 assert (cov->passes == 2);
221 if (!cov->pass_one_first_case_seen)
223 assert (cov->state == 0);
227 categoricals_update (cov->categoricals, c);
229 for (i = 0 ; i < cov->n_vars; ++i)
231 const union value *val1 = case_data (c, cov->vars[i]);
233 if ( var_is_value_missing (cov->vars[i], val1, cov->exclude))
236 for (j = 0 ; j < cov->n_vars; ++j)
239 const union value *val2 = case_data (c, cov->vars[j]);
241 if ( var_is_value_missing (cov->vars[j], val2, cov->exclude))
244 for (m = 0 ; m <= MOMENT_MEAN; ++m)
246 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
254 cov->pass_one_first_case_seen = true;
258 /* Call this function for every case in the data set */
260 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
263 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
265 assert (cov->passes == 2);
266 assert (cov->state >= 1);
268 if (! cov->pass_two_first_case_seen)
270 assert (cov->state == 1);
273 cov->dim = cov->n_vars + categoricals_total (cov->categoricals);
274 cov->n_cm = (cov->dim * (cov->dim - 1) ) / 2;
275 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
277 /* Divide the means by the number of samples */
278 for (i = 0; i < cov->n_vars; ++i)
280 for (j = 0; j < cov->n_vars; ++j)
282 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
283 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
288 for (i = 0 ; i < cov->n_vars; ++i)
290 const union value *val1 = case_data (c, cov->vars[i]);
292 if ( var_is_value_missing (cov->vars[i], val1, cov->exclude))
295 for (j = 0 ; j < cov->n_vars; ++j)
299 const union value *val2 = case_data (c, cov->vars[j]);
301 const double s = pow2 (val1->f - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
303 if ( var_is_value_missing (cov->vars[j], val2, cov->exclude))
307 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
312 (val1->f - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
314 (val2->f - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
318 idx = cm_idx (cov, i, j);
327 cov->pass_two_first_case_seen = true;
331 /* Call this function for every case in the data set.
332 After all cases have been passed, call covariance_calculate
335 covariance_accumulate (struct covariance *cov, const struct ccase *c)
338 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
340 assert (cov->passes == 1);
342 if ( !cov->pass_one_first_case_seen)
344 assert ( cov->state == 0);
348 for (i = 0 ; i < cov->n_vars; ++i)
350 const union value *val1 = case_data (c, cov->vars[i]);
352 if ( var_is_value_missing (cov->vars[i], val1, cov->exclude))
355 for (j = 0 ; j < cov->n_vars; ++j)
359 const union value *val2 = case_data (c, cov->vars[j]);
361 if ( var_is_value_missing (cov->vars[j], val2, cov->exclude))
364 idx = cm_idx (cov, i, j);
367 cov->cm [idx] += val1->f * val2->f * weight;
370 for (m = 0 ; m < n_MOMENTS; ++m)
372 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
380 cov->pass_one_first_case_seen = true;
385 Allocate and return a gsl_matrix containing the covariances of the
389 cm_to_gsl (struct covariance *cov)
392 gsl_matrix *m = gsl_matrix_calloc (cov->n_vars, cov->n_vars);
394 /* Copy the non-diagonal elements from cov->cm */
395 for ( j = 0 ; j < cov->n_vars - 1; ++j)
397 for (i = j+1 ; i < cov->n_vars; ++i)
399 double x = cov->cm [cm_idx (cov, i, j)];
400 gsl_matrix_set (m, i, j, x);
401 gsl_matrix_set (m, j, i, x);
405 /* Copy the diagonal elements from cov->moments[2] */
406 for (j = 0 ; j < cov->n_vars ; ++j)
408 double sigma = gsl_matrix_get (cov->moments[2], j, j);
409 gsl_matrix_set (m, j, j, sigma);
416 static const gsl_matrix *
417 covariance_calculate_double_pass (struct covariance *cov)
420 for (i = 0 ; i < cov->n_vars; ++i)
422 for (j = 0 ; j < cov->n_vars; ++j)
425 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
426 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
428 idx = cm_idx (cov, i, j);
432 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
437 return cm_to_gsl (cov);
440 static const gsl_matrix *
441 covariance_calculate_single_pass (struct covariance *cov)
446 for (m = 0; m < n_MOMENTS; ++m)
448 /* Divide the moments by the number of samples */
451 for (i = 0 ; i < cov->n_vars; ++i)
453 for (j = 0 ; j < cov->n_vars; ++j)
455 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
456 *x /= gsl_matrix_get (cov->moments[0], i, j);
458 if ( m == MOMENT_VARIANCE)
459 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
465 /* Centre the moments */
466 for ( j = 0 ; j < cov->n_vars - 1; ++j)
468 for (i = j + 1 ; i < cov->n_vars; ++i)
470 double *x = &cov->cm [cm_idx (cov, i, j)];
472 *x /= gsl_matrix_get (cov->moments[0], i, j);
475 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
477 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
481 return cm_to_gsl (cov);
487 Return a pointer to gsl_matrix containing the pairwise covariances.
488 The matrix remains owned by the COV object, and must not be freed.
489 Call this function only after all data have been accumulated.
492 covariance_calculate (struct covariance *cov)
494 assert ( cov->state > 0 );
499 return covariance_calculate_single_pass (cov);
502 return covariance_calculate_double_pass (cov);
512 /* Destroy the COV object */
514 covariance_destroy (struct covariance *cov)
519 for (i = 0; i < n_MOMENTS; ++i)
520 gsl_matrix_free (cov->moments[i]);