1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/covariance.h"
21 #include <gsl/gsl_matrix.h>
23 #include "data/case.h"
24 #include "data/variable.h"
25 #include "libpspp/assertion.h"
26 #include "libpspp/misc.h"
27 #include "math/categoricals.h"
28 #include "math/interaction.h"
29 #include "math/moments.h"
31 #include "gl/xalloc.h"
33 #define n_MOMENTS (MOMENT_VARIANCE + 1)
36 /* Create a new matrix of NEW_SIZE x NEW_SIZE and copy the elements of
37 matrix IN into it. IN must be a square matrix, and in normal usage
38 it will be smaller than NEW_SIZE.
39 IN is destroyed by this function. The return value must be destroyed
40 when no longer required.
43 resize_matrix (gsl_matrix *in, size_t new_size)
47 gsl_matrix *out = NULL;
49 assert (in->size1 == in->size2);
51 if (new_size <= in->size1)
54 out = gsl_matrix_calloc (new_size, new_size);
56 for (i = 0; i < in->size1; ++i)
58 for (j = 0; j < in->size2; ++j)
60 double x = gsl_matrix_get (in, i, j);
62 gsl_matrix_set (out, i, j, x);
73 /* True if the covariances are centerered. (ie Real covariances) */
76 /* The variables for which the covariance matrix is to be calculated. */
78 const struct variable *const *vars;
80 /* Categorical variables. */
81 struct categoricals *categoricals;
83 /* Array containing number of categories per categorical variable. */
86 /* Dimension of the covariance matrix. */
89 /* The weight variable (or NULL if none) */
90 const struct variable *wv;
92 /* A set of matrices containing the 0th, 1st and 2nd moments */
95 /* The class of missing values to exclude */
96 enum mv_class exclude;
98 /* An array of doubles representing the covariance matrix.
99 Only the top triangle is included, and no diagonals */
103 /* 1 for single pass algorithm;
104 2 for double pass algorithm
109 0 : No pass has been made
110 1 : First pass has been started
111 2 : Second pass has been
113 IE: How many passes have been (partially) made. */
116 /* Flags indicating that the first case has been seen */
117 bool pass_one_first_case_seen;
118 bool pass_two_first_case_seen;
120 gsl_matrix *unnormalised;
125 /* Return a matrix containing the M th moments.
126 The matrix is of size NxN where N is the number of variables.
127 Each row represents the moments of a variable.
128 In the absence of missing values, the columns of this matrix will
129 be identical. If missing values are involved, then element (i,j)
130 is the moment of the i th variable, when paired with the j th variable.
133 covariance_moments (const struct covariance *cov, int m)
135 return cov->moments[m];
140 /* Create a covariance struct.
143 covariance_1pass_create (size_t n_vars, const struct variable *const *vars,
144 const struct variable *weight, enum mv_class exclude,
148 struct covariance *cov = XZALLOC (struct covariance);
150 cov->centered = centered;
153 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
158 cov->n_vars = n_vars;
161 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
163 for (i = 0; i < n_MOMENTS; ++i)
164 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
166 cov->exclude = exclude;
168 cov->n_cm = (n_vars * (n_vars - 1)) / 2;
171 cov->cm = xcalloc (cov->n_cm, sizeof *cov->cm);
172 cov->categoricals = NULL;
178 Create a covariance struct for a two-pass algorithm. If categorical
179 variables are involed, the dimension cannot be know until after the
180 first data pass, so the actual covariances will not be allocated
184 covariance_2pass_create (size_t n_vars, const struct variable *const *vars,
185 struct categoricals *cats,
186 const struct variable *wv, enum mv_class exclude,
190 struct covariance *cov = xmalloc (sizeof *cov);
192 cov->centered = centered;
195 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
200 cov->n_vars = n_vars;
203 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
205 for (i = 0; i < n_MOMENTS; ++i)
206 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
208 cov->exclude = exclude;
213 cov->categoricals = cats;
214 cov->unnormalised = NULL;
219 /* Return an integer, which can be used to index
220 into COV->cm, to obtain the I, J th element
221 of the covariance matrix. If COV->cm does not
222 contain that element, then a negative value
226 cm_idx (const struct covariance *cov, int i, int j)
229 const int n2j = cov->dim - 2 - j;
230 const int nj = cov->dim - 2 ;
233 assert (j < cov->dim);
238 if (j >= cov->dim - 1)
245 as -= n2j * (n2j + 1) ;
253 Returns true iff the variable corresponding to the Ith element of the covariance matrix
254 has a missing value for case C
257 is_missing (const struct covariance *cov, int i, const struct ccase *c)
259 const struct variable *var = i < cov->n_vars ?
261 categoricals_get_interaction_by_subscript (cov->categoricals, i - cov->n_vars)->vars[0];
263 const union value *val = case_data (c, var);
265 return (var_is_value_missing (var, val) & cov->exclude) != 0;
270 get_val (const struct covariance *cov, int i, const struct ccase *c)
274 const struct variable *var = cov->vars[i];
276 const union value *val = case_data (c, var);
281 return categoricals_get_effects_code_for_case (cov->categoricals, i - cov->n_vars, c);
286 dump_matrix (const gsl_matrix *m)
290 for (i = 0 ; i < m->size1; ++i)
292 for (j = 0 ; j < m->size2; ++j)
293 printf ("%02f ", gsl_matrix_get (m, i, j));
299 /* Call this function for every case in the data set */
301 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
304 const double weight = cov->wv ? case_num (c, cov->wv) : 1.0;
306 assert (cov->passes == 2);
307 if (!cov->pass_one_first_case_seen)
309 assert (cov->state == 0);
313 if (cov->categoricals)
314 categoricals_update (cov->categoricals, c);
316 for (i = 0 ; i < cov->dim; ++i)
318 double v1 = get_val (cov, i, c);
320 if (is_missing (cov, i, c))
323 for (j = 0 ; j < cov->dim; ++j)
327 if (is_missing (cov, j, c))
330 for (m = 0 ; m <= MOMENT_MEAN; ++m)
332 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
340 cov->pass_one_first_case_seen = true;
344 /* Call this function for every case in the data set */
346 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
349 const double weight = cov->wv ? case_num (c, cov->wv) : 1.0;
351 assert (cov->passes == 2);
352 assert (cov->state >= 1);
354 if (! cov->pass_two_first_case_seen)
357 assert (cov->state == 1);
360 if (cov->categoricals)
361 categoricals_done (cov->categoricals);
363 cov->dim = cov->n_vars;
365 if (cov->categoricals)
366 cov->dim += categoricals_df_total (cov->categoricals);
368 cov->n_cm = (cov->dim * (cov->dim - 1)) / 2;
369 cov->cm = xcalloc (cov->n_cm, sizeof *cov->cm);
371 /* Grow the moment matrices so that they're large enough to accommodate the
372 categorical elements */
373 for (i = 0; i < n_MOMENTS; ++i)
375 cov->moments[i] = resize_matrix (cov->moments[i], cov->dim);
378 /* Populate the moments matrices with the categorical value elements */
379 for (i = cov->n_vars; i < cov->dim; ++i)
381 for (j = 0 ; j < cov->dim ; ++j) /* FIXME: This is WRONG !!! */
383 double w = categoricals_get_weight_by_subscript (cov->categoricals, i - cov->n_vars);
385 gsl_matrix_set (cov->moments[MOMENT_NONE], i, j, w);
387 w = categoricals_get_sum_by_subscript (cov->categoricals, i - cov->n_vars);
389 gsl_matrix_set (cov->moments[MOMENT_MEAN], i, j, w);
393 /* FIXME: This is WRONG!! It must be fixed to properly handle missing values. For
394 now it assumes there are none */
395 for (m = 0 ; m < n_MOMENTS; ++m)
397 for (i = 0 ; i < cov->dim ; ++i)
399 double x = gsl_matrix_get (cov->moments[m], i, cov->n_vars -1);
400 for (j = cov->n_vars; j < cov->dim; ++j)
402 gsl_matrix_set (cov->moments[m], i, j, x);
407 /* Divide the means by the number of samples */
408 for (i = 0; i < cov->dim; ++i)
410 for (j = 0; j < cov->dim; ++j)
412 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
413 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
418 for (i = 0 ; i < cov->dim; ++i)
420 double v1 = get_val (cov, i, c);
422 if (is_missing (cov, i, c))
425 for (j = 0 ; j < cov->dim; ++j)
429 double v2 = get_val (cov, j, c);
431 const double s = pow2 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
433 if (is_missing (cov, j, c))
437 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
442 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
444 (v2 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
448 idx = cm_idx (cov, i, j);
456 cov->pass_two_first_case_seen = true;
460 /* Call this function for every case in the data set.
461 After all cases have been passed, call covariance_calculate
464 covariance_accumulate (struct covariance *cov, const struct ccase *c)
467 const double weight = cov->wv ? case_num (c, cov->wv) : 1.0;
469 assert (cov->passes == 1);
471 if (!cov->pass_one_first_case_seen)
473 assert (cov->state == 0);
477 for (i = 0 ; i < cov->dim; ++i)
479 const union value *val1 = case_data (c, cov->vars[i]);
481 if (is_missing (cov, i, c))
484 for (j = 0 ; j < cov->dim; ++j)
488 const union value *val2 = case_data (c, cov->vars[j]);
490 if (is_missing (cov, j, c))
493 idx = cm_idx (cov, i, j);
496 cov->cm [idx] += val1->f * val2->f * weight;
499 for (m = 0 ; m < n_MOMENTS; ++m)
501 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
509 cov->pass_one_first_case_seen = true;
514 Allocate and return a gsl_matrix containing the covariances of the
518 cm_to_gsl (struct covariance *cov)
521 gsl_matrix *m = gsl_matrix_calloc (cov->dim, cov->dim);
523 /* Copy the non-diagonal elements from cov->cm */
524 for (j = 0 ; j < cov->dim - 1; ++j)
526 for (i = j+1 ; i < cov->dim; ++i)
528 double x = cov->cm [cm_idx (cov, i, j)];
529 gsl_matrix_set (m, i, j, x);
530 gsl_matrix_set (m, j, i, x);
534 /* Copy the diagonal elements from cov->moments[2] */
535 for (j = 0 ; j < cov->dim ; ++j)
537 double sigma = gsl_matrix_get (cov->moments[2], j, j);
538 gsl_matrix_set (m, j, j, sigma);
546 covariance_calculate_double_pass (struct covariance *cov)
549 for (i = 0 ; i < cov->dim; ++i)
551 for (j = 0 ; j < cov->dim; ++j)
554 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
555 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
557 idx = cm_idx (cov, i, j);
561 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
566 return cm_to_gsl (cov);
570 covariance_calculate_single_pass (struct covariance *cov)
575 for (m = 0; m < n_MOMENTS; ++m)
577 /* Divide the moments by the number of samples */
580 for (i = 0 ; i < cov->dim; ++i)
582 for (j = 0 ; j < cov->dim; ++j)
584 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
585 *x /= gsl_matrix_get (cov->moments[0], i, j);
587 if (m == MOMENT_VARIANCE)
588 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
596 /* Centre the moments */
597 for (j = 0 ; j < cov->dim - 1; ++j)
599 for (i = j + 1 ; i < cov->dim; ++i)
601 double *x = &cov->cm [cm_idx (cov, i, j)];
603 *x /= gsl_matrix_get (cov->moments[0], i, j);
606 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
608 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
613 return cm_to_gsl (cov);
617 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
618 caller owns the returned matrix and must free it when it is no longer
621 Call this function only after all data have been accumulated. */
623 covariance_calculate (struct covariance *cov)
631 return covariance_calculate_single_pass (cov);
634 return covariance_calculate_double_pass (cov);
642 Covariance computed without dividing by the sample size.
645 covariance_calculate_double_pass_unnormalized (struct covariance *cov)
647 return cm_to_gsl (cov);
651 covariance_calculate_single_pass_unnormalized (struct covariance *cov)
657 for (i = 0 ; i < cov->dim; ++i)
659 for (j = 0 ; j < cov->dim; ++j)
661 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
662 *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
663 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
667 for (j = 0 ; j < cov->dim - 1; ++j)
669 for (i = j + 1 ; i < cov->dim; ++i)
671 double *x = &cov->cm [cm_idx (cov, i, j)];
674 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
676 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i)
677 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
682 return cm_to_gsl (cov);
686 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
687 returned matrix is owned by the structure, and must not be freed.
689 Call this function only after all data have been accumulated. */
691 covariance_calculate_unnormalized (struct covariance *cov)
696 if (cov->unnormalised != NULL)
697 return cov->unnormalised;
702 cov->unnormalised = covariance_calculate_single_pass_unnormalized (cov);
705 cov->unnormalised = covariance_calculate_double_pass_unnormalized (cov);
711 return cov->unnormalised;
714 /* Function to access the categoricals used by COV
715 The return value is owned by the COV
717 const struct categoricals *
718 covariance_get_categoricals (const struct covariance *cov)
720 return cov->categoricals;
724 /* Destroy the COV object */
726 covariance_destroy (struct covariance *cov)
730 categoricals_destroy (cov->categoricals);
732 for (i = 0; i < n_MOMENTS; ++i)
733 gsl_matrix_free (cov->moments[i]);
735 gsl_matrix_free (cov->unnormalised);
742 covariance_dim (const struct covariance * cov)
750 Routines to assist debugging.
751 The following are not thoroughly tested and in certain respects
752 unreliable. They should only be
753 used for aids to development. Not as user accessible code.
756 #include "libpspp/str.h"
757 #include "output/pivot-table.h"
758 #include "data/format.h"
761 /* Create a table which can be populated with the encodings for
762 the covariance matrix COV */
764 covariance_dump_enc_header (const struct covariance *cov)
766 struct pivot_table *table = pivot_table_create ("Covariance Encoding");
768 struct pivot_dimension *factors = pivot_dimension_create (
769 table, PIVOT_AXIS_COLUMN, "Factor");
770 for (size_t i = 0 ; i < cov->n_vars; ++i)
771 pivot_category_create_leaf (factors->root,
772 pivot_value_new_variable (cov->vars[i]));
773 for (size_t i = 0, n = 0; i < cov->dim - cov->n_vars; n++)
775 const struct interaction *iact =
776 categoricals_get_interaction_by_subscript (cov->categoricals, i);
778 struct string str = DS_EMPTY_INITIALIZER;
779 interaction_to_string (iact, &str);
780 struct pivot_category *group = pivot_category_create_group__ (
782 pivot_value_new_user_text_nocopy (ds_steal_cstr (&str)));
784 int df = categoricals_df (cov->categoricals, n);
785 for (int j = 0; j < df; j++)
786 pivot_category_create_leaf_rc (group, pivot_value_new_integer (j),
792 struct pivot_dimension *matrix = pivot_dimension_create (
793 table, PIVOT_AXIS_ROW, "Matrix", "Matrix");
794 matrix->hide_all_labels = true;
801 Append table T, which should have been returned by covariance_dump_enc_header
802 with an entry corresponding to case C for the covariance matrix COV
805 covariance_dump_enc (const struct covariance *cov, const struct ccase *c,
806 struct pivot_table *table)
808 int row = pivot_category_create_leaf (
809 table->dimensions[1]->root,
810 pivot_value_new_integer (table->dimensions[1]->n_leaves));
812 for (int i = 0 ; i < cov->dim; ++i)
814 table, i, row, pivot_value_new_number (get_val (cov, i, c)));