X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fcorrelations.c;h=b33baf5758372a4a50d869cda1cecd9a4224a0a6;hb=1e9875fc6bd073f970e28de5ef49e82717705858;hp=4899fd039b8959efde0baa06edea851305f2cfac;hpb=4687a500973dfdd7293c89b14248dab5ba76df90;p=pspp diff --git a/src/language/stats/correlations.c b/src/language/stats/correlations.c index 4899fd039b..b33baf5758 100644 --- a/src/language/stats/correlations.c +++ b/src/language/stats/correlations.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,70 +16,35 @@ #include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include -#include "xalloc.h" -#include "minmax.h" -#include -#include + +#include "data/casegrouper.h" +#include "data/casereader.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/variable.h" +#include "language/command.h" +#include "language/dictionary/split-file.h" +#include "language/lexer/lexer.h" +#include "language/lexer/variable-parser.h" +#include "libpspp/assertion.h" +#include "libpspp/message.h" +#include "libpspp/misc.h" +#include "math/correlation.h" +#include "math/covariance.h" +#include "math/moments.h" +#include "output/pivot-table.h" + +#include "gl/xalloc.h" +#include "gl/minmax.h" #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid -/* Returns the correlation matrix corresponding to the covariance -matrix COV. The return value must be freed with gsl_matrix_free -when no longer required. -*/ -static gsl_matrix * -covariance_to_correlation (const gsl_matrix *cov) -{ - size_t r, c; - gsl_matrix *corr = gsl_matrix_alloc (cov->size1, cov->size2); - - for (r = 0 ; r < cov->size1; ++r) - { - for (c = 0 ; c < cov->size2 ; ++c) - { - double x = gsl_matrix_get (cov, r, c); - x /= sqrt (gsl_matrix_get (cov, r, r) - * gsl_matrix_get (cov, c, c) ); - gsl_matrix_set (corr, r, c, x); - } - } - - return corr; -} - -static double -significance_of_correlation (double rho, double w) -{ - double t = w - 2; - t /= 1 - MIN (1, pow2 (rho)); - t = sqrt (t); - t *= rho; - - if (t > 0) - return gsl_cdf_tdist_Q (t, w - 2); - else - return gsl_cdf_tdist_P (t, w - 2); -} - struct corr { @@ -97,6 +62,13 @@ enum corr_missing_type CORR_LISTWISE /* Discard entire case if any variable is missing. */ }; +enum stats_opts + { + STATS_DESCRIPTIVES = 0x01, + STATS_XPROD = 0x02, + STATS_ALL = STATS_XPROD | STATS_DESCRIPTIVES + }; + struct corr_opts { enum corr_missing_type missing_type; @@ -104,129 +76,189 @@ struct corr_opts bool sig; /* Flag significant values or not */ int tails; /* Report significance with how many tails ? */ + enum stats_opts statistics; const struct variable *wv; /* The weight variable (if any) */ }; static void -output_correlation (const struct corr *corr, const struct corr_opts *opts, - const gsl_matrix *cm, const gsl_matrix *samples) +output_descriptives (const struct corr *corr, const struct corr_opts *opts, + const gsl_matrix *means, + const gsl_matrix *vars, const gsl_matrix *ns) { - int r, c; - struct tab_table *t; - int matrix_cols; - int nr = corr->n_vars1; - int nc = matrix_cols = corr->n_vars_total > corr->n_vars1 ? - corr->n_vars_total - corr->n_vars1 : corr->n_vars1; + struct pivot_table *table = pivot_table_create ( + N_("Descriptive Statistics")); + pivot_table_set_weight_var (table, opts->wv); - const struct fmt_spec *wfmt = opts->wv ? var_get_print_format (opts->wv) : & F_8_0; + pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), + N_("Mean"), PIVOT_RC_OTHER, + N_("Std. Deviation"), PIVOT_RC_OTHER, + N_("N"), PIVOT_RC_COUNT); - const int heading_columns = 2; - const int heading_rows = 1; + struct pivot_dimension *variables = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Variable")); - const int rows_per_variable = opts->missing_type == CORR_LISTWISE ? 2 : 3; - - /* Two header columns */ - nc += heading_columns; + for (size_t r = 0 ; r < corr->n_vars_total ; ++r) + { + const struct variable *v = corr->vars[r]; + + int row = pivot_category_create_leaf (variables->root, + pivot_value_new_variable (v)); + + double mean = gsl_matrix_get (means, r, 0); + /* Here we want to display the non-biased estimator */ + double n = gsl_matrix_get (ns, r, 0); + double stddev = sqrt (gsl_matrix_get (vars, r, 0) * n / (n - 1)); + double entries[] = { mean, stddev, n }; + for (size_t i = 0; i < sizeof entries / sizeof *entries; i++) + pivot_table_put2 (table, i, row, pivot_value_new_number (entries[i])); + } - /* Three data per variable */ - nr *= rows_per_variable; + pivot_table_submit (table); +} - /* One header row */ - nr += heading_rows; +static void +output_correlation (const struct corr *corr, const struct corr_opts *opts, + const gsl_matrix *cm, const gsl_matrix *samples, + const gsl_matrix *cv) +{ + struct pivot_table *table = pivot_table_create (N_("Correlations")); + pivot_table_set_weight_var (table, opts->wv); - t = tab_create (nc, nr, 0); - tab_title (t, _("Correlations")); - tab_dim (t, tab_natural_dimensions, NULL); + /* Column variable dimension. */ + struct pivot_dimension *columns = pivot_dimension_create ( + table, PIVOT_AXIS_COLUMN, N_("Variables")); - tab_headers (t, heading_columns, 0, heading_rows, 0); + int matrix_cols = (corr->n_vars_total > corr->n_vars1 + ? corr->n_vars_total - corr->n_vars1 + : corr->n_vars1); + for (int c = 0; c < matrix_cols; c++) + { + const struct variable *v = corr->n_vars_total > corr->n_vars1 ? + corr->vars[corr->n_vars1 + c] : corr->vars[c]; + pivot_category_create_leaf (columns->root, pivot_value_new_variable (v)); + } - /* Outline the box */ - tab_box (t, - TAL_2, TAL_2, - -1, -1, - 0, 0, - nc - 1, nr - 1); + /* Statistics dimension. */ + struct pivot_dimension *statistics = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Statistics"), + N_("Pearson Correlation"), PIVOT_RC_CORRELATION, + opts->tails == 2 ? N_("Sig. (2-tailed)") : N_("Sig. (1-tailed)"), + PIVOT_RC_SIGNIFICANCE); - /* Vertical lines */ - tab_box (t, - -1, -1, - -1, TAL_1, - heading_columns, 0, - nc - 1, nr - 1); + if (opts->statistics & STATS_XPROD) + pivot_category_create_leaves (statistics->root, N_("Cross-products"), + N_("Covariance")); - tab_vline (t, TAL_2, heading_columns, 0, nr - 1); - tab_vline (t, TAL_1, 1, heading_rows, nr - 1); + if (opts->missing_type != CORR_LISTWISE) + pivot_category_create_leaves (statistics->root, N_("N"), PIVOT_RC_COUNT); - for (r = 0 ; r < corr->n_vars1 ; ++r) - { - tab_text (t, 0, 1 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, - var_to_string (corr->vars[r])); - - tab_text (t, 1, 1 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("Pearson Correlation")); - tab_text (t, 1, 2 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, - (opts->tails == 2) ? _("Sig. (2-tailed)") : _("Sig. (1-tailed)")); - if ( opts->missing_type != CORR_LISTWISE ) - tab_text (t, 1, 3 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("N")); - tab_hline (t, TAL_1, 0, nc - 1, r * rows_per_variable + 1); - } + /* Row variable dimension. */ + struct pivot_dimension *rows = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Variables")); + for (size_t r = 0; r < corr->n_vars1; r++) + pivot_category_create_leaf (rows->root, + pivot_value_new_variable (corr->vars[r])); - for (c = 0 ; c < matrix_cols ; ++c) - { - const struct variable *v = corr->n_vars_total > corr->n_vars1 ? corr->vars[corr->n_vars_total - corr->n_vars1 + c] : corr->vars[c]; - tab_text (t, heading_columns + c, 0, TAB_LEFT | TAT_TITLE, var_to_string (v)); - } + struct pivot_footnote *sig_footnote = pivot_table_create_footnote ( + table, pivot_value_new_text (N_("Significant at .05 level"))); - for (r = 0 ; r < corr->n_vars1 ; ++r) - { - const int row = r * rows_per_variable + heading_rows; - for (c = 0 ; c < matrix_cols ; ++c) - { - unsigned char flags = 0; - int col_index = corr->n_vars_total - corr->n_vars1 + c; - double pearson = gsl_matrix_get (cm, r, col_index); - double w = gsl_matrix_get (samples, r, col_index); - double sig = opts->tails * significance_of_correlation (pearson, w); - - if ( opts->missing_type != CORR_LISTWISE ) - tab_double (t, c + heading_columns, row + 2, 0, w, wfmt); - - if ( c != r) - tab_double (t, c + heading_columns, row + 1, 0, sig, NULL); - - if ( opts->sig && c != r && sig < 0.05) - flags = TAB_EMPH; - - tab_double (t, c + heading_columns, row, flags, pearson, NULL); - } - } + for (int r = 0; r < corr->n_vars1; r++) + for (int c = 0; c < matrix_cols; c++) + { + const int col_index = (corr->n_vars_total > corr->n_vars1 + ? corr->n_vars1 + c + : c); + double pearson = gsl_matrix_get (cm, r, col_index); + double w = gsl_matrix_get (samples, r, col_index); + double sig = opts->tails * significance_of_correlation (pearson, w); + + double entries[5]; + int n = 0; + entries[n++] = pearson; + entries[n++] = col_index != r ? sig : SYSMIS; + if (opts->statistics & STATS_XPROD) + { + double cov = gsl_matrix_get (cv, r, col_index); + const double xprod_dev = cov * w; + cov *= w / (w - 1.0); + + entries[n++] = xprod_dev; + entries[n++] = cov; + } + if (opts->missing_type != CORR_LISTWISE) + entries[n++] = w; + + for (int i = 0; i < n; i++) + if (entries[i] != SYSMIS) + { + struct pivot_value *v = pivot_value_new_number (entries[i]); + if (!i && opts->sig && col_index != r && sig < 0.05) + pivot_value_add_footnote (v, sig_footnote); + pivot_table_put3 (table, c, i, r, v); + } + } - tab_submit (t); + pivot_table_submit (table); } + static void run_corr (struct casereader *r, const struct corr_opts *opts, const struct corr *corr) { struct ccase *c; - const struct design_matrix *cov_matrix; - const gsl_matrix *samples_matrix; + const gsl_matrix *var_matrix, *samples_matrix, *mean_matrix; + gsl_matrix *cov_matrix = NULL; + gsl_matrix *corr_matrix = NULL; + struct covariance *cov = covariance_2pass_create (corr->n_vars_total, corr->vars, + NULL, + opts->wv, opts->exclude, + true); + + struct casereader *rc = casereader_clone (r); + for (; (c = casereader_read (r)); case_unref (c)) + { + covariance_accumulate_pass1 (cov, c); + } - for ( ; (c = casereader_read (r) ); case_unref (c)) + for (; (c = casereader_read (rc)); case_unref (c)) { + covariance_accumulate_pass2 (cov, c); + } + casereader_destroy (rc); + cov_matrix = covariance_calculate (cov); + if (! cov_matrix) + { + msg (SE, _("The data for the chosen variables are all missing or empty.")); + goto error; } + samples_matrix = covariance_moments (cov, MOMENT_NONE); + var_matrix = covariance_moments (cov, MOMENT_VARIANCE); + mean_matrix = covariance_moments (cov, MOMENT_MEAN); - output_correlation (corr, opts, - covariance_to_correlation (cov_matrix->m), - samples_matrix ); + corr_matrix = correlation_from_covariance (cov_matrix, var_matrix); + + if (opts->statistics & STATS_DESCRIPTIVES) + output_descriptives (corr, opts, mean_matrix, var_matrix, samples_matrix); + + output_correlation (corr, opts, corr_matrix, + samples_matrix, cov_matrix); + + error: + covariance_destroy (cov); + gsl_matrix_free (corr_matrix); + gsl_matrix_free (cov_matrix); } int cmd_correlation (struct lexer *lexer, struct dataset *ds) { + int i; int n_all_vars = 0; /* Total number of variables involved in this command */ + const struct variable **all_vars ; const struct dictionary *dict = dataset_dict (ds); bool ok = true; @@ -242,15 +274,16 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) opts.tails = 2; opts.sig = false; opts.exclude = MV_ANY; + opts.statistics = 0; /* Parse CORRELATIONS. */ - while (lex_token (lexer) != '.') + while (lex_token (lexer) != T_ENDCMD) { - lex_match (lexer, '/'); + lex_match (lexer, T_SLASH); if (lex_match_id (lexer, "MISSING")) { - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "PAIRWISE")) opts.missing_type = CORR_PAIRWISE; @@ -266,15 +299,15 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) lex_error (lexer, NULL); goto error; } - lex_match (lexer, ','); + lex_match (lexer, T_COMMA); } } else if (lex_match_id (lexer, "PRINT")) { - lex_match (lexer, '='); - while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { - if ( lex_match_id (lexer, "TWOTAIL")) + if (lex_match_id (lexer, "TWOTAIL")) opts.tails = 2; else if (lex_match_id (lexer, "ONETAIL")) opts.tails = 1; @@ -288,20 +321,43 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) goto error; } - lex_match (lexer, ','); + lex_match (lexer, T_COMMA); + } + } + else if (lex_match_id (lexer, "STATISTICS")) + { + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "DESCRIPTIVES")) + opts.statistics = STATS_DESCRIPTIVES; + else if (lex_match_id (lexer, "XPROD")) + opts.statistics = STATS_XPROD; + else if (lex_token (lexer) == T_ALL) + { + opts.statistics = STATS_ALL; + lex_get (lexer); + } + else + { + lex_error (lexer, NULL); + goto error; + } + + lex_match (lexer, T_COMMA); } } else { if (lex_match_id (lexer, "VARIABLES")) { - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); } corr = xrealloc (corr, sizeof (*corr) * (n_corrs + 1)); corr[n_corrs].n_vars_total = corr[n_corrs].n_vars1 = 0; - - if ( ! parse_variables_const (lexer, dict, &corr[n_corrs].vars, + + if (! parse_variables_const (lexer, dict, &corr[n_corrs].vars, &corr[n_corrs].n_vars_total, PV_NUMERIC)) { @@ -312,9 +368,9 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) corr[n_corrs].n_vars1 = corr[n_corrs].n_vars_total; - if ( lex_match (lexer, T_WITH)) + if (lex_match (lexer, T_WITH)) { - if ( ! parse_variables_const (lexer, dict, + if (! parse_variables_const (lexer, dict, &corr[n_corrs].vars, &corr[n_corrs].n_vars_total, PV_NUMERIC | PV_APPEND)) { @@ -336,8 +392,7 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) } - const struct variable **all_vars = xmalloc (sizeof (*all_vars) * n_all_vars); - int i; + all_vars = xmalloc (sizeof (*all_vars) * n_all_vars); { /* FIXME: Using a hash here would make more sense */ @@ -361,10 +416,11 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) /* FIXME: No need to iterate the data multiple times */ struct casereader *r = casereader_clone (group); - if ( opts.missing_type == CORR_LISTWISE) + if (opts.missing_type == CORR_LISTWISE) r = casereader_create_filter_missing (r, all_vars, n_all_vars, opts.exclude, NULL, NULL); + run_corr (r, &opts, &corr[i]); casereader_destroy (r); } @@ -378,10 +434,14 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) /* Done. */ + free (corr->vars); free (corr); + return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE; error: + if (corr) + free (corr->vars); free (corr); return CMD_FAILURE; }