1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <libpspp/assertion.h>
20 #include "covariance.h"
21 #include <gl/xalloc.h>
23 #include <gsl/gsl_matrix.h>
24 #include <data/case.h>
25 #include <data/variable.h>
26 #include <libpspp/misc.h>
28 #define n_MOMENTS (MOMENT_VARIANCE + 1)
33 /* The variables for which the covariance matrix is to be calculated. */
35 const struct variable **vars;
37 /* Categorical variables. */
39 const struct variable **catvars;
41 /* Array containing number of categories per categorical variable. */
44 /* Dimension of the covariance matrix. */
47 /* The weight variable (or NULL if none) */
48 const struct variable *wv;
50 /* A set of matrices containing the 0th, 1st and 2nd moments */
53 /* The class of missing values to exclude */
54 enum mv_class exclude;
56 /* An array of doubles representing the covariance matrix.
57 Only the top triangle is included, and no diagonals */
61 /* 1 for single pass algorithm;
62 2 for double pass algorithm
67 0 : No pass has been made
68 1 : First pass has been started
69 2 : Second pass has been
71 IE: How many passes have been (partially) made. */
74 /* Flags indicating that the first case has been seen */
75 bool pass_one_first_case_seen;
76 bool pass_two_first_case_seen;
81 /* Return a matrix containing the M th moments.
82 The matrix is of size NxN where N is the number of variables.
83 Each row represents the moments of a variable.
84 In the absence of missing values, the columns of this matrix will
85 be identical. If missing values are involved, then element (i,j)
86 is the moment of the i th variable, when paired with the j th variable.
89 covariance_moments (const struct covariance *cov, int m)
91 return cov->moments[m];
96 /* Create a covariance struct.
99 covariance_create (size_t n_vars, const struct variable **vars,
100 const struct variable *weight, enum mv_class exclude,
104 struct covariance *cov = xmalloc (sizeof *cov);
105 assert (passes == 1 || passes == 2);
106 cov->passes = passes;
108 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
110 cov->vars = xmalloc (sizeof *cov->vars * n_vars);
113 cov->n_vars = n_vars;
116 for (i = 0; i < n_vars; ++i)
117 cov->vars[i] = vars[i];
119 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
121 for (i = 0; i < n_MOMENTS; ++i)
122 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
124 cov->exclude = exclude;
126 cov->n_cm = (n_vars * (n_vars - 1) ) / 2;
128 cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
134 Create a covariance struct for a two-pass algorithm. If categorical
135 variables are involed, the dimension cannot be know until after the
136 first data pass, so the actual covariances will not be allocated
140 covariance_2pass_create (size_t n_vars, const struct variable **vars,
141 size_t n_catvars, const struct variable **catvars,
142 const struct variable *weight, enum mv_class exclude)
145 struct covariance *cov = xmalloc (sizeof *cov);
146 cov->vars = xmalloc (sizeof *cov->vars * n_vars);
147 cov->catvars = xnmalloc (n_catvars, sizeof (*cov->catvars));
148 cov->n_categories = xnmalloc (n_catvars, sizeof (cov->n_categories));
151 cov->n_vars = n_vars;
152 cov->n_catvars = n_catvars;
154 for (i = 0; i < n_vars; ++i)
155 cov->vars[i] = vars[i];
157 for (i = 0; i < n_catvars; i++)
159 cov->catvars[i] = catvars[i];
160 cov->n_categories[i] = 0;
163 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
165 cov->exclude = exclude;
170 /* Return an integer, which can be used to index
171 into COV->cm, to obtain the I, J th element
172 of the covariance matrix. If COV->cm does not
173 contain that element, then a negative value
177 cm_idx (const struct covariance *cov, int i, int j)
180 const int n2j = cov->n_vars - 2 - j;
181 const int nj = cov->n_vars - 2 ;
184 assert (j < cov->n_vars);
189 if (j >= cov->n_vars - 1)
196 as -= n2j * (n2j + 1) ;
203 dump_matrix (const gsl_matrix *m)
207 for (i = 0 ; i < m->size1; ++i)
209 for (j = 0 ; j < m->size2; ++j)
210 printf ("%02f ", gsl_matrix_get (m, i, j));
215 /* Call this function for every case in the data set */
217 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
220 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
222 assert (cov->passes == 2);
223 if (!cov->pass_one_first_case_seen)
225 assert (cov->state == 0);
229 for (i = 0 ; i < cov->n_vars; ++i)
231 const union value *val1 = case_data (c, cov->vars[i]);
233 if ( var_is_value_missing (cov->vars[i], val1, cov->exclude))
236 for (j = 0 ; j < cov->n_vars; ++j)
239 const union value *val2 = case_data (c, cov->vars[j]);
241 if ( var_is_value_missing (cov->vars[j], val2, cov->exclude))
244 for (m = 0 ; m <= MOMENT_MEAN; ++m)
246 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
254 cov->pass_one_first_case_seen = true;
258 /* Call this function for every case in the data set */
260 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
263 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
265 assert (cov->passes == 2);
266 assert (cov->state >= 1);
268 if (! cov->pass_two_first_case_seen)
270 assert (cov->state == 1);
273 /* Divide the means by the number of samples */
274 for (i = 0; i < cov->n_vars; ++i)
276 for (j = 0; j < cov->n_vars; ++j)
278 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
279 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
284 for (i = 0 ; i < cov->n_vars; ++i)
286 const union value *val1 = case_data (c, cov->vars[i]);
288 if ( var_is_value_missing (cov->vars[i], val1, cov->exclude))
291 for (j = 0 ; j < cov->n_vars; ++j)
295 const union value *val2 = case_data (c, cov->vars[j]);
297 const double s = pow2 (val1->f - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
299 if ( var_is_value_missing (cov->vars[j], val2, cov->exclude))
303 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
308 (val1->f - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
310 (val2->f - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
314 idx = cm_idx (cov, i, j);
323 cov->pass_two_first_case_seen = true;
327 /* Call this function for every case in the data set.
328 After all cases have been passed, call covariance_calculate
331 covariance_accumulate (struct covariance *cov, const struct ccase *c)
334 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
336 assert (cov->passes == 1);
338 if ( !cov->pass_one_first_case_seen)
340 assert ( cov->state == 0);
344 for (i = 0 ; i < cov->n_vars; ++i)
346 const union value *val1 = case_data (c, cov->vars[i]);
348 if ( var_is_value_missing (cov->vars[i], val1, cov->exclude))
351 for (j = 0 ; j < cov->n_vars; ++j)
355 const union value *val2 = case_data (c, cov->vars[j]);
357 if ( var_is_value_missing (cov->vars[j], val2, cov->exclude))
360 idx = cm_idx (cov, i, j);
363 cov->cm [idx] += val1->f * val2->f * weight;
366 for (m = 0 ; m < n_MOMENTS; ++m)
368 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
376 cov->pass_one_first_case_seen = true;
381 Allocate and return a gsl_matrix containing the covariances of the
385 cm_to_gsl (struct covariance *cov)
388 gsl_matrix *m = gsl_matrix_calloc (cov->n_vars, cov->n_vars);
390 /* Copy the non-diagonal elements from cov->cm */
391 for ( j = 0 ; j < cov->n_vars - 1; ++j)
393 for (i = j+1 ; i < cov->n_vars; ++i)
395 double x = cov->cm [cm_idx (cov, i, j)];
396 gsl_matrix_set (m, i, j, x);
397 gsl_matrix_set (m, j, i, x);
401 /* Copy the diagonal elements from cov->moments[2] */
402 for (j = 0 ; j < cov->n_vars ; ++j)
404 double sigma = gsl_matrix_get (cov->moments[2], j, j);
405 gsl_matrix_set (m, j, j, sigma);
412 static const gsl_matrix *
413 covariance_calculate_double_pass (struct covariance *cov)
416 for (i = 0 ; i < cov->n_vars; ++i)
418 for (j = 0 ; j < cov->n_vars; ++j)
421 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
422 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
424 idx = cm_idx (cov, i, j);
428 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
433 return cm_to_gsl (cov);
436 static const gsl_matrix *
437 covariance_calculate_single_pass (struct covariance *cov)
442 for (m = 0; m < n_MOMENTS; ++m)
444 /* Divide the moments by the number of samples */
447 for (i = 0 ; i < cov->n_vars; ++i)
449 for (j = 0 ; j < cov->n_vars; ++j)
451 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
452 *x /= gsl_matrix_get (cov->moments[0], i, j);
454 if ( m == MOMENT_VARIANCE)
455 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
461 /* Centre the moments */
462 for ( j = 0 ; j < cov->n_vars - 1; ++j)
464 for (i = j + 1 ; i < cov->n_vars; ++i)
466 double *x = &cov->cm [cm_idx (cov, i, j)];
468 *x /= gsl_matrix_get (cov->moments[0], i, j);
471 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
473 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
477 return cm_to_gsl (cov);
483 Return a pointer to gsl_matrix containing the pairwise covariances.
484 The matrix remains owned by the COV object, and must not be freed.
485 Call this function only after all data have been accumulated.
488 covariance_calculate (struct covariance *cov)
490 assert ( cov->state > 0 );
495 return covariance_calculate_single_pass (cov);
498 return covariance_calculate_double_pass (cov);
508 /* Destroy the COV object */
510 covariance_destroy (struct covariance *cov)
515 for (i = 0; i < n_MOMENTS; ++i)
516 gsl_matrix_free (cov->moments[i]);