From 400eaca1f378c40dced767bdc14a395dff220b8d Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sat, 3 Oct 2009 21:52:01 +0200 Subject: [PATCH] First working version of CORRELATIONS. This commit includes a new module, src/math/covariance.[ch], which may eventually replace src/math/covariance-matrix.[ch] --- src/language/stats/correlations.c | 77 +++++---- src/math/automake.mk | 2 + src/math/covariance.c | 267 ++++++++++++++++++++++++++++++ src/math/covariance.h | 41 +++++ tests/automake.mk | 2 + tests/command/correlation.sh | 153 +++++++++++++++++ 6 files changed, 514 insertions(+), 28 deletions(-) create mode 100644 src/math/covariance.c create mode 100644 src/math/covariance.h create mode 100755 tests/command/correlation.sh diff --git a/src/language/stats/correlations.c b/src/language/stats/correlations.c index 4899fd03..92572897 100644 --- a/src/language/stats/correlations.c +++ b/src/language/stats/correlations.c @@ -16,6 +16,7 @@ #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include #include "xalloc.h" @@ -42,29 +44,6 @@ #define _(msgid) gettext (msgid) #define N_(msgid) msgid -/* Returns the correlation matrix corresponding to the covariance -matrix COV. The return value must be freed with gsl_matrix_free -when no longer required. -*/ -static gsl_matrix * -covariance_to_correlation (const gsl_matrix *cov) -{ - size_t r, c; - gsl_matrix *corr = gsl_matrix_alloc (cov->size1, cov->size2); - - for (r = 0 ; r < cov->size1; ++r) - { - for (c = 0 ; c < cov->size2 ; ++c) - { - double x = gsl_matrix_get (cov, r, c); - x /= sqrt (gsl_matrix_get (cov, r, r) - * gsl_matrix_get (cov, c, c) ); - gsl_matrix_set (corr, r, c, x); - } - } - - return corr; -} static double significance_of_correlation (double rho, double w) @@ -205,28 +184,70 @@ output_correlation (const struct corr *corr, const struct corr_opts *opts, tab_submit (t); } + +static gsl_matrix * +correlation_from_covariance (const gsl_matrix *cv, const gsl_matrix *v) +{ + size_t i, j; + gsl_matrix *corr = gsl_matrix_calloc (cv->size1, cv->size2); + + for (i = 0 ; i < cv->size1; ++i) + { + for (j = 0 ; j < cv->size2; ++j) + { + double rho = gsl_matrix_get (cv, i, j); + + rho /= sqrt (gsl_matrix_get (v, i, j)) + * + sqrt (gsl_matrix_get (v, j, i)); + + gsl_matrix_set (corr, i, j, rho); + } + } + + return corr; +} + + + + static void run_corr (struct casereader *r, const struct corr_opts *opts, const struct corr *corr) { struct ccase *c; - const struct design_matrix *cov_matrix; + const gsl_matrix *var_matrix; const gsl_matrix *samples_matrix; + const gsl_matrix *cov_matrix; + gsl_matrix *corr_matrix; + struct covariance *cov = covariance_create (corr->n_vars_total, corr->vars, + opts->wv, opts->exclude); for ( ; (c = casereader_read (r) ); case_unref (c)) { - + covariance_accumulate (cov, c); } + cov_matrix = covariance_calculate (cov); + + samples_matrix = covariance_moments (cov, MOMENT_NONE); + var_matrix = covariance_moments (cov, MOMENT_VARIANCE); + + corr_matrix = correlation_from_covariance (cov_matrix, var_matrix); output_correlation (corr, opts, - covariance_to_correlation (cov_matrix->m), + corr_matrix, samples_matrix ); + + covariance_destroy (cov); + gsl_matrix_free (corr_matrix); } int cmd_correlation (struct lexer *lexer, struct dataset *ds) { + int i; int n_all_vars = 0; /* Total number of variables involved in this command */ + const struct variable **all_vars ; const struct dictionary *dict = dataset_dict (ds); bool ok = true; @@ -336,8 +357,7 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) } - const struct variable **all_vars = xmalloc (sizeof (*all_vars) * n_all_vars); - int i; + all_vars = xmalloc (sizeof (*all_vars) * n_all_vars); { /* FIXME: Using a hash here would make more sense */ @@ -365,6 +385,7 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) r = casereader_create_filter_missing (r, all_vars, n_all_vars, opts.exclude, NULL, NULL); + run_corr (r, &opts, &corr[i]); casereader_destroy (r); } diff --git a/src/math/automake.mk b/src/math/automake.mk index cadfb900..9fc15aaf 100644 --- a/src/math/automake.mk +++ b/src/math/automake.mk @@ -13,6 +13,8 @@ src_math_libpspp_math_la_SOURCES = \ src/math/box-whisker.c src/math/box-whisker.h \ src/math/coefficient.c \ src/math/coefficient.h \ + src/math/covariance.c \ + src/math/covariance.h \ src/math/covariance-matrix.c \ src/math/covariance-matrix.h \ src/math/design-matrix.c src/math/design-matrix.h \ diff --git a/src/math/covariance.c b/src/math/covariance.c new file mode 100644 index 00000000..b2d43427 --- /dev/null +++ b/src/math/covariance.c @@ -0,0 +1,267 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "covariance.h" +#include +#include "moments.h" +#include +#include +#include +#include + +#define n_MOMENTS (MOMENT_VARIANCE + 1) + + +struct covariance +{ + /* The variables for which the covariance matrix is to be calculated */ + size_t n_vars; + const struct variable **vars; + + /* The weight variable (or NULL if none) */ + const struct variable *wv; + + /* A set of matrices containing the 0th, 1st and 2nd moments */ + gsl_matrix **moments; + + /* The class of missing values to exclude */ + enum mv_class exclude; + + /* An array of doubles representing the covariance matrix. + Only the top triangle is included, and no diagonals */ + double *cm; + int n_cm; +}; + + + +/* Return a matrix containing the M th moments. + The matrix is of size NxN where N is the number of variables. + Each row represents the moments of a variable. + In the absence of missing values, the columns of this matrix will + be identical. If missing values are involved, then element (i,j) + is the moment of the i th variable, when paired with the j th variable. + */ +const gsl_matrix * +covariance_moments (const struct covariance *cov, int m) +{ + return cov->moments[m]; +} + + + +/* Create a covariance struct */ +struct covariance * +covariance_create (size_t n_vars, const struct variable **vars, + const struct variable *weight, enum mv_class exclude) +{ + size_t i; + struct covariance *cov = xmalloc (sizeof *cov); + cov->vars = xmalloc (sizeof *cov->vars * n_vars); + + cov->wv = weight; + cov->n_vars = n_vars; + + for (i = 0; i < n_vars; ++i) + cov->vars[i] = vars[i]; + + cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS); + + for (i = 0; i < n_MOMENTS; ++i) + cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars); + + cov->exclude = exclude; + + cov->n_cm = (n_vars * (n_vars - 1) ) / 2; + + cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm); + + return cov; +} + +/* Return an integer, which can be used to index + into COV->cm, to obtain the I, J th element + of the covariance matrix. If COV->cm does not + contain that element, then a negative value + will be returned. +*/ +static int +cm_idx (const struct covariance *cov, int i, int j) +{ + int as; + const int n2j = cov->n_vars - 2 - j; + const int nj = cov->n_vars - 2 ; + + assert (i >= 0); + assert (j < cov->n_vars); + + if ( i == 0) + return -1; + + if (j >= cov->n_vars - 1) + return -1; + + if ( i <= j) + return -1 ; + + as = nj * (nj + 1) ; + as -= n2j * (n2j + 1) ; + as /= 2; + + return i - 1 + as; +} + + +/* Call this function for every case in the data set */ +void +covariance_accumulate (struct covariance *cov, const struct ccase *c) +{ + size_t i, j, m; + const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0; + + for (i = 0 ; i < cov->n_vars; ++i) + { + const union value *val1 = case_data (c, cov->vars[i]); + + if ( var_is_value_missing (cov->vars[i], val1, cov->exclude)) + continue; + + for (j = 0 ; j < cov->n_vars; ++j) + { + double pwr = 1.0; + int idx; + const union value *val2 = case_data (c, cov->vars[j]); + + if ( var_is_value_missing (cov->vars[j], val2, cov->exclude)) + continue; + + idx = cm_idx (cov, i, j); + if (idx >= 0) + { + cov->cm [idx] += val1->f * val2->f; + } + + for (m = 0 ; m < n_MOMENTS; ++m) + { + double *x = gsl_matrix_ptr (cov->moments[m], i, j); + + *x += pwr * weight; + pwr *= val1->f; + } + } + } +} + + +/* + Allocate and return a gsl_matrix containing the covariances of the + data. +*/ +static gsl_matrix * +cm_to_gsl (struct covariance *cov) +{ + int i, j; + gsl_matrix *m = gsl_matrix_calloc (cov->n_vars, cov->n_vars); + + /* Copy the non-diagonal elements from cov->cm */ + for ( j = 0 ; j < cov->n_vars - 1; ++j) + { + for (i = j+1 ; i < cov->n_vars; ++i) + { + double x = cov->cm [cm_idx (cov, i, j)]; + gsl_matrix_set (m, i, j, x); + gsl_matrix_set (m, j, i, x); + } + } + + /* Copy the diagonal elements from cov->moments[2] */ + for (j = 0 ; j < cov->n_vars ; ++j) + { + double sigma = gsl_matrix_get (cov->moments[2], j, j); + gsl_matrix_set (m, j, j, sigma); + } + + return m; +} + + + +/* + Return a pointer to gsl_matrix containing the pairwise covariances. + The matrix remains owned by the COV object, and must not be freed. + Call this function only after all data have been accumulated. +*/ +const gsl_matrix * +covariance_calculate (struct covariance *cov) +{ + size_t i, j; + size_t m; + + for (m = 0; m < n_MOMENTS; ++m) + { + /* Divide the moments by the number of samples */ + if ( m > 0) + { + for (i = 0 ; i < cov->n_vars; ++i) + { + for (j = 0 ; j < cov->n_vars; ++j) + { + double *x = gsl_matrix_ptr (cov->moments[m], i, j); + *x /= gsl_matrix_get (cov->moments[0], i, j); + + if ( m == MOMENT_VARIANCE) + *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j)); + } + } + } + } + + /* Centre the moments */ + for ( j = 0 ; j < cov->n_vars - 1; ++j) + { + for (i = j + 1 ; i < cov->n_vars; ++i) + { + double *x = &cov->cm [cm_idx (cov, i, j)]; + + *x /= gsl_matrix_get (cov->moments[0], i, j); + + *x -= + gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j) + * + gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i); + } + } + + return cm_to_gsl (cov); +} + + +/* Destroy the COV object */ +void +covariance_destroy (struct covariance *cov) +{ + size_t i; + free (cov->vars); + + for (i = 0; i < n_MOMENTS; ++i) + gsl_matrix_free (cov->moments[i]); + + free (cov->moments); + free (cov->cm); + free (cov); +} diff --git a/src/math/covariance.h b/src/math/covariance.h new file mode 100644 index 00000000..8b8de88e --- /dev/null +++ b/src/math/covariance.h @@ -0,0 +1,41 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + + +#ifndef COVARIANCE_H +#define COVARIANCE_H + +#include + +#include +#include + +struct covariance; +struct variable; +struct ccase ; + +struct covariance * covariance_create (size_t n_vars, const struct variable **vars, + const struct variable *wv, enum mv_class excl); + +void covariance_accumulate (struct covariance *, const struct ccase *); + +const gsl_matrix * covariance_calculate (struct covariance *cov); + +void covariance_destroy (struct covariance *cov); + +const gsl_matrix *covariance_moments (const struct covariance *cov, int m); + +#endif diff --git a/tests/automake.mk b/tests/automake.mk index 6493a845..7fb70931 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -16,6 +16,7 @@ dist_TESTS = \ tests/command/beg-data.sh \ tests/command/bignum.sh \ tests/command/count.sh \ + tests/command/correlation.sh \ tests/command/data-list.sh \ tests/command/do-if.sh \ tests/command/do-repeat.sh \ @@ -337,6 +338,7 @@ tests_dissect_sysfile_SOURCES = \ src/libpspp/float-format.c \ tests/dissect-sysfile.c tests_dissect_sysfile_LDADD = gl/libgl.la @LIBINTL@ +tests_dissect_sysfile_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=\"$(bindir)\" EXTRA_DIST += \ $(dist_TESTS) \ diff --git a/tests/command/correlation.sh b/tests/command/correlation.sh new file mode 100755 index 00000000..bdc74939 --- /dev/null +++ b/tests/command/correlation.sh @@ -0,0 +1,153 @@ +#!/bin/sh + +# This program tests the CORRELATIONS command + +TEMPDIR=/tmp/pspp-tst-$$ +TESTFILE=$TEMPDIR/`basename $0`.sps + +# ensure that top_srcdir and top_builddir are absolute +if [ -z "$top_srcdir" ] ; then top_srcdir=. ; fi +if [ -z "$top_builddir" ] ; then top_builddir=. ; fi +top_srcdir=`cd $top_srcdir; pwd` +top_builddir=`cd $top_builddir; pwd` + +PSPP=$top_builddir/src/ui/terminal/pspp + +STAT_CONFIG_PATH=$top_srcdir/config +export STAT_CONFIG_PATH + +LANG=C +export LANG + + +cleanup() +{ + if [ x"$PSPP_TEST_NO_CLEANUP" != x ] ; then + echo "NOT cleaning $TEMPDIR" + return ; + fi + rm -rf $TEMPDIR +} + + +fail() +{ + echo $activity + echo FAILED + cleanup; + exit 1; +} + + +no_result() +{ + echo $activity + echo NO RESULT; + cleanup; + exit 2; +} + +pass() +{ + cleanup; + exit 0; +} + +mkdir -p $TEMPDIR + +cd $TEMPDIR + +activity="create program" +cat << EOF > $TESTFILE +set format = F11.3. +data list notable list /foo * bar * wiz * bang *. +begin data. +1 0 3 1 +3 9 -50 5 +3 4 3 203 +4 -9 0 -4 +98 78 104 2 +3 50 -49 200 +. 4 4 4 +5 3 0 . +end data. + +correlations + variables = foo bar wiz bang + /print nosig + /missing = listwise + . + +correlations + variables = bar wiz + /print nosig + /missing = listwise + . + +correlations + variables = foo bar wiz bang + /print nosig + /missing = pairwise + . +EOF +if [ $? -ne 0 ] ; then no_result ; fi + + +activity="run program" +$SUPERVISOR $PSPP --testing-mode -o raw-ascii $TESTFILE +if [ $? -ne 0 ] ; then no_result ; fi + +activity="compare results" +perl -pi -e 's/^\s*$//g' $TEMPDIR/pspp.list +cp $TEMPDIR/pspp.list /tmp +diff -b $TEMPDIR/pspp.list - << EOF +1.1 CORRELATIONS. Correlations +#========================#=====#=====#=====#=====# +# #foo |bar |wiz |bang # +#----+-------------------#-----+-----+-----+-----# +#foo |Pearson Correlation#1.000| .802| .890|-.308# +# |Sig. (2-tailed) # | .055| .017| .553# +#----+-------------------#-----+-----+-----+-----# +#bar |Pearson Correlation# .802|1.000| .519| .118# +# |Sig. (2-tailed) # .055| | .291| .824# +#----+-------------------#-----+-----+-----+-----# +#wiz |Pearson Correlation# .890| .519|1.000|-.344# +# |Sig. (2-tailed) # .017| .291| | .505# +#----+-------------------#-----+-----+-----+-----# +#bang|Pearson Correlation#-.308| .118|-.344|1.000# +# |Sig. (2-tailed) # .553| .824| .505| # +#====#===================#=====#=====#=====#=====# +2.1 CORRELATIONS. Correlations +#=======================#=====#=====# +# #bar |wiz # +#---+-------------------#-----+-----# +#bar|Pearson Correlation#1.000| .497# +# |Sig. (2-tailed) # | .210# +#---+-------------------#-----+-----# +#wiz|Pearson Correlation# .497|1.000# +# |Sig. (2-tailed) # .210| # +#===#===================#=====#=====# +3.1 CORRELATIONS. Correlations +#========================#=====#=====#=====#=====# +# #foo |bar |wiz |bang # +#----+-------------------#-----+-----+-----+-----# +#foo |Pearson Correlation#1.000| .805| .883|-.308# +# |Sig. (2-tailed) # | .029| .008| .553# +# |N # 7| 7| 7| 6# +#----+-------------------#-----+-----+-----+-----# +#bar |Pearson Correlation# .805|1.000| .497| .164# +# |Sig. (2-tailed) # .029| | .210| .725# +# |N # 7| 8| 8| 7# +#----+-------------------#-----+-----+-----+-----# +#wiz |Pearson Correlation# .883| .497|1.000|-.337# +# |Sig. (2-tailed) # .008| .210| | .460# +# |N # 7| 8| 8| 7# +#----+-------------------#-----+-----+-----+-----# +#bang|Pearson Correlation#-.308| .164|-.337|1.000# +# |Sig. (2-tailed) # .553| .725| .460| # +# |N # 6| 7| 7| 7# +#====#===================#=====#=====#=====#=====# +EOF +if [ $? -ne 0 ] ; then fail ; fi + +pass; -- 2.30.2