From 4687a500973dfdd7293c89b14248dab5ba76df90 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sat, 3 Oct 2009 08:42:44 +0200 Subject: [PATCH] Initial framework for CORRELATIONS command --- src/language/command.def | 3 +- src/language/stats/.gitignore | 1 - src/language/stats/automake.mk | 1 + src/language/stats/correlations.c | 387 ++++++++++++++++++++++++++++++ 4 files changed, 390 insertions(+), 2 deletions(-) create mode 100644 src/language/stats/correlations.c diff --git a/src/language/command.def b/src/language/command.def index fa1bb1e8..bc180ed7 100644 --- a/src/language/command.def +++ b/src/language/command.def @@ -100,6 +100,7 @@ DEF_CMD (S_DATA, 0, "AUTORECODE", cmd_autorecode) DEF_CMD (S_DATA, F_KEEP_FINAL_TOKEN, "BEGIN DATA", cmd_begin_data) DEF_CMD (S_DATA, 0, "COUNT", cmd_count) DEF_CMD (S_DATA, 0, "CROSSTABS", cmd_crosstabs) +DEF_CMD (S_DATA, 0, "CORRELATIONS", cmd_correlation) DEF_CMD (S_DATA, 0, "DELETE VARIABLES", cmd_delete_variables) DEF_CMD (S_DATA, 0, "DESCRIPTIVES", cmd_descriptives) DEF_CMD (S_DATA, 0, "EXAMINE", cmd_examine) @@ -113,6 +114,7 @@ DEF_CMD (S_DATA, 0, "MEANS", cmd_means) DEF_CMD (S_DATA, 0, "MODIFY VARS", cmd_modify_vars) DEF_CMD (S_DATA, 0, "NPAR TESTS", cmd_npar_tests) DEF_CMD (S_DATA, 0, "ONEWAY", cmd_oneway) +DEF_CMD (S_DATA, 0, "PEARSON CORRELATIONS", cmd_correlation) DEF_CMD (S_DATA, 0, "RANK", cmd_rank) DEF_CMD (S_DATA, 0, "REGRESSION", cmd_regression) DEF_CMD (S_DATA, 0, "RELIABILITY", cmd_reliability) @@ -216,7 +218,6 @@ UNIMPL_CMD ("ORTHOPLAN", "Orthogonal effects design") UNIMPL_CMD ("OVERALS", "Nonlinear canonical correlation") UNIMPL_CMD ("PACF", "Partial autocorrelation") UNIMPL_CMD ("PARTIAL CORR", "Partial correlation") -UNIMPL_CMD ("PEARSON CORRELATIONS", "Correlation coefficients") UNIMPL_CMD ("PLANCARDS", "Conjoint analysis planning") UNIMPL_CMD ("PLUM", "Estimate ordinal regression models") UNIMPL_CMD ("POINT", "Marker in keyed file") diff --git a/src/language/stats/.gitignore b/src/language/stats/.gitignore index 6df3f96b..84bcaba8 100644 --- a/src/language/stats/.gitignore +++ b/src/language/stats/.gitignore @@ -1,4 +1,3 @@ -correlations.c crosstabs.c examine.c frequencies.c diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 8eb52cf0..05e8bcf9 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -22,6 +22,7 @@ language_stats_sources = \ src/language/stats/binomial.h \ src/language/stats/chisquare.c \ src/language/stats/chisquare.h \ + src/language/stats/correlations.c \ src/language/stats/descriptives.c \ src/language/stats/npar.h \ src/language/stats/sort-cases.c \ diff --git a/src/language/stats/correlations.c b/src/language/stats/correlations.c new file mode 100644 index 00000000..4899fd03 --- /dev/null +++ b/src/language/stats/correlations.c @@ -0,0 +1,387 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "xalloc.h" +#include "minmax.h" +#include +#include + +#include "gettext.h" +#define _(msgid) gettext (msgid) +#define N_(msgid) msgid + +/* Returns the correlation matrix corresponding to the covariance +matrix COV. The return value must be freed with gsl_matrix_free +when no longer required. +*/ +static gsl_matrix * +covariance_to_correlation (const gsl_matrix *cov) +{ + size_t r, c; + gsl_matrix *corr = gsl_matrix_alloc (cov->size1, cov->size2); + + for (r = 0 ; r < cov->size1; ++r) + { + for (c = 0 ; c < cov->size2 ; ++c) + { + double x = gsl_matrix_get (cov, r, c); + x /= sqrt (gsl_matrix_get (cov, r, r) + * gsl_matrix_get (cov, c, c) ); + gsl_matrix_set (corr, r, c, x); + } + } + + return corr; +} + +static double +significance_of_correlation (double rho, double w) +{ + double t = w - 2; + t /= 1 - MIN (1, pow2 (rho)); + t = sqrt (t); + t *= rho; + + if (t > 0) + return gsl_cdf_tdist_Q (t, w - 2); + else + return gsl_cdf_tdist_P (t, w - 2); +} + + +struct corr +{ + size_t n_vars_total; + size_t n_vars1; + + const struct variable **vars; +}; + + +/* Handling of missing values. */ +enum corr_missing_type + { + CORR_PAIRWISE, /* Handle missing values on a per-variable-pair basis. */ + CORR_LISTWISE /* Discard entire case if any variable is missing. */ + }; + +struct corr_opts +{ + enum corr_missing_type missing_type; + enum mv_class exclude; /* Classes of missing values to exclude. */ + + bool sig; /* Flag significant values or not */ + int tails; /* Report significance with how many tails ? */ + + const struct variable *wv; /* The weight variable (if any) */ +}; + + +static void +output_correlation (const struct corr *corr, const struct corr_opts *opts, + const gsl_matrix *cm, const gsl_matrix *samples) +{ + int r, c; + struct tab_table *t; + int matrix_cols; + int nr = corr->n_vars1; + int nc = matrix_cols = corr->n_vars_total > corr->n_vars1 ? + corr->n_vars_total - corr->n_vars1 : corr->n_vars1; + + const struct fmt_spec *wfmt = opts->wv ? var_get_print_format (opts->wv) : & F_8_0; + + const int heading_columns = 2; + const int heading_rows = 1; + + const int rows_per_variable = opts->missing_type == CORR_LISTWISE ? 2 : 3; + + /* Two header columns */ + nc += heading_columns; + + /* Three data per variable */ + nr *= rows_per_variable; + + /* One header row */ + nr += heading_rows; + + t = tab_create (nc, nr, 0); + tab_title (t, _("Correlations")); + tab_dim (t, tab_natural_dimensions, NULL); + + tab_headers (t, heading_columns, 0, heading_rows, 0); + + /* Outline the box */ + tab_box (t, + TAL_2, TAL_2, + -1, -1, + 0, 0, + nc - 1, nr - 1); + + /* Vertical lines */ + tab_box (t, + -1, -1, + -1, TAL_1, + heading_columns, 0, + nc - 1, nr - 1); + + tab_vline (t, TAL_2, heading_columns, 0, nr - 1); + tab_vline (t, TAL_1, 1, heading_rows, nr - 1); + + for (r = 0 ; r < corr->n_vars1 ; ++r) + { + tab_text (t, 0, 1 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, + var_to_string (corr->vars[r])); + + tab_text (t, 1, 1 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("Pearson Correlation")); + tab_text (t, 1, 2 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, + (opts->tails == 2) ? _("Sig. (2-tailed)") : _("Sig. (1-tailed)")); + if ( opts->missing_type != CORR_LISTWISE ) + tab_text (t, 1, 3 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("N")); + tab_hline (t, TAL_1, 0, nc - 1, r * rows_per_variable + 1); + } + + for (c = 0 ; c < matrix_cols ; ++c) + { + const struct variable *v = corr->n_vars_total > corr->n_vars1 ? corr->vars[corr->n_vars_total - corr->n_vars1 + c] : corr->vars[c]; + tab_text (t, heading_columns + c, 0, TAB_LEFT | TAT_TITLE, var_to_string (v)); + } + + for (r = 0 ; r < corr->n_vars1 ; ++r) + { + const int row = r * rows_per_variable + heading_rows; + for (c = 0 ; c < matrix_cols ; ++c) + { + unsigned char flags = 0; + int col_index = corr->n_vars_total - corr->n_vars1 + c; + double pearson = gsl_matrix_get (cm, r, col_index); + double w = gsl_matrix_get (samples, r, col_index); + double sig = opts->tails * significance_of_correlation (pearson, w); + + if ( opts->missing_type != CORR_LISTWISE ) + tab_double (t, c + heading_columns, row + 2, 0, w, wfmt); + + if ( c != r) + tab_double (t, c + heading_columns, row + 1, 0, sig, NULL); + + if ( opts->sig && c != r && sig < 0.05) + flags = TAB_EMPH; + + tab_double (t, c + heading_columns, row, flags, pearson, NULL); + } + } + + tab_submit (t); +} + +static void +run_corr (struct casereader *r, const struct corr_opts *opts, const struct corr *corr) +{ + struct ccase *c; + const struct design_matrix *cov_matrix; + const gsl_matrix *samples_matrix; + + for ( ; (c = casereader_read (r) ); case_unref (c)) + { + + } + + + output_correlation (corr, opts, + covariance_to_correlation (cov_matrix->m), + samples_matrix ); +} + +int +cmd_correlation (struct lexer *lexer, struct dataset *ds) +{ + int n_all_vars = 0; /* Total number of variables involved in this command */ + const struct dictionary *dict = dataset_dict (ds); + bool ok = true; + + struct casegrouper *grouper; + struct casereader *group; + + struct corr *corr = NULL; + size_t n_corrs = 0; + + struct corr_opts opts; + opts.missing_type = CORR_PAIRWISE; + opts.wv = dict_get_weight (dict); + opts.tails = 2; + opts.sig = false; + opts.exclude = MV_ANY; + + /* Parse CORRELATIONS. */ + while (lex_token (lexer) != '.') + { + lex_match (lexer, '/'); + if (lex_match_id (lexer, "MISSING")) + { + lex_match (lexer, '='); + while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + { + if (lex_match_id (lexer, "PAIRWISE")) + opts.missing_type = CORR_PAIRWISE; + else if (lex_match_id (lexer, "LISTWISE")) + opts.missing_type = CORR_LISTWISE; + + else if (lex_match_id (lexer, "INCLUDE")) + opts.exclude = MV_SYSTEM; + else if (lex_match_id (lexer, "EXCLUDE")) + opts.exclude = MV_ANY; + else + { + lex_error (lexer, NULL); + goto error; + } + lex_match (lexer, ','); + } + } + else if (lex_match_id (lexer, "PRINT")) + { + lex_match (lexer, '='); + while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + { + if ( lex_match_id (lexer, "TWOTAIL")) + opts.tails = 2; + else if (lex_match_id (lexer, "ONETAIL")) + opts.tails = 1; + else if (lex_match_id (lexer, "SIG")) + opts.sig = false; + else if (lex_match_id (lexer, "NOSIG")) + opts.sig = true; + else + { + lex_error (lexer, NULL); + goto error; + } + + lex_match (lexer, ','); + } + } + else + { + if (lex_match_id (lexer, "VARIABLES")) + { + lex_match (lexer, '='); + } + + corr = xrealloc (corr, sizeof (*corr) * (n_corrs + 1)); + corr[n_corrs].n_vars_total = corr[n_corrs].n_vars1 = 0; + + if ( ! parse_variables_const (lexer, dict, &corr[n_corrs].vars, + &corr[n_corrs].n_vars_total, + PV_NUMERIC)) + { + ok = false; + break; + } + + + corr[n_corrs].n_vars1 = corr[n_corrs].n_vars_total; + + if ( lex_match (lexer, T_WITH)) + { + if ( ! parse_variables_const (lexer, dict, + &corr[n_corrs].vars, &corr[n_corrs].n_vars_total, + PV_NUMERIC | PV_APPEND)) + { + ok = false; + break; + } + } + + n_all_vars += corr[n_corrs].n_vars_total; + + n_corrs++; + } + } + + if (n_corrs == 0) + { + msg (SE, _("No variables specified.")); + goto error; + } + + + const struct variable **all_vars = xmalloc (sizeof (*all_vars) * n_all_vars); + int i; + + { + /* FIXME: Using a hash here would make more sense */ + const struct variable **vv = all_vars; + + for (i = 0 ; i < n_corrs; ++i) + { + int v; + const struct corr *c = &corr[i]; + for (v = 0 ; v < c->n_vars_total; ++v) + *vv++ = c->vars[v]; + } + } + + grouper = casegrouper_create_splits (proc_open (ds), dict); + + while (casegrouper_get_next_group (grouper, &group)) + { + for (i = 0 ; i < n_corrs; ++i) + { + /* FIXME: No need to iterate the data multiple times */ + struct casereader *r = casereader_clone (group); + + if ( opts.missing_type == CORR_LISTWISE) + r = casereader_create_filter_missing (r, all_vars, n_all_vars, + opts.exclude, NULL, NULL); + + run_corr (r, &opts, &corr[i]); + casereader_destroy (r); + } + casereader_destroy (group); + } + + ok = casegrouper_destroy (grouper); + ok = proc_commit (ds) && ok; + + free (all_vars); + + + /* Done. */ + free (corr); + return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE; + + error: + free (corr); + return CMD_FAILURE; +} -- 2.30.2