X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fcorrelations.c;h=6af8ba32ffd54cc32f8e84baa8f77ee0192eee1d;hb=a258e53c63a08b0ec48aea8f03808eb651729424;hp=4899fd039b8959efde0baa06edea851305f2cfac;hpb=4687a500973dfdd7293c89b14248dab5ba76df90;p=pspp-builds.git diff --git a/src/language/stats/correlations.c b/src/language/stats/correlations.c index 4899fd03..6af8ba32 100644 --- a/src/language/stats/correlations.c +++ b/src/language/stats/correlations.c @@ -16,7 +16,9 @@ #include -#include +#include +#include +#include #include #include #include @@ -27,10 +29,10 @@ #include #include #include -#include -#include +#include #include #include +#include #include #include "xalloc.h" @@ -42,44 +44,6 @@ #define _(msgid) gettext (msgid) #define N_(msgid) msgid -/* Returns the correlation matrix corresponding to the covariance -matrix COV. The return value must be freed with gsl_matrix_free -when no longer required. -*/ -static gsl_matrix * -covariance_to_correlation (const gsl_matrix *cov) -{ - size_t r, c; - gsl_matrix *corr = gsl_matrix_alloc (cov->size1, cov->size2); - - for (r = 0 ; r < cov->size1; ++r) - { - for (c = 0 ; c < cov->size2 ; ++c) - { - double x = gsl_matrix_get (cov, r, c); - x /= sqrt (gsl_matrix_get (cov, r, r) - * gsl_matrix_get (cov, c, c) ); - gsl_matrix_set (corr, r, c, x); - } - } - - return corr; -} - -static double -significance_of_correlation (double rho, double w) -{ - double t = w - 2; - t /= 1 - MIN (1, pow2 (rho)); - t = sqrt (t); - t *= rho; - - if (t > 0) - return gsl_cdf_tdist_Q (t, w - 2); - else - return gsl_cdf_tdist_P (t, w - 2); -} - struct corr { @@ -97,6 +61,13 @@ enum corr_missing_type CORR_LISTWISE /* Discard entire case if any variable is missing. */ }; +enum stats_opts + { + STATS_DESCRIPTIVES = 0x01, + STATS_XPROD = 0x02, + STATS_ALL = STATS_XPROD | STATS_DESCRIPTIVES + }; + struct corr_opts { enum corr_missing_type missing_type; @@ -104,14 +75,90 @@ struct corr_opts bool sig; /* Flag significant values or not */ int tails; /* Report significance with how many tails ? */ + enum stats_opts statistics; const struct variable *wv; /* The weight variable (if any) */ }; +static void +output_descriptives (const struct corr *corr, const gsl_matrix *means, + const gsl_matrix *vars, const gsl_matrix *ns) +{ + const int nr = corr->n_vars_total + 1; + const int nc = 4; + int c, r; + + const int heading_columns = 1; + const int heading_rows = 1; + + struct tab_table *t = tab_create (nc, nr); + tab_title (t, _("Descriptive Statistics")); + + tab_headers (t, heading_columns, 0, heading_rows, 0); + + /* Outline the box */ + tab_box (t, + TAL_2, TAL_2, + -1, -1, + 0, 0, + nc - 1, nr - 1); + + /* Vertical lines */ + tab_box (t, + -1, -1, + -1, TAL_1, + heading_columns, 0, + nc - 1, nr - 1); + + tab_vline (t, TAL_2, heading_columns, 0, nr - 1); + tab_hline (t, TAL_1, 0, nc - 1, heading_rows); + + tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Mean")); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("N")); + + for (r = 0 ; r < corr->n_vars_total ; ++r) + { + const struct variable *v = corr->vars[r]; + tab_text (t, 0, r + heading_rows, TAB_LEFT | TAT_TITLE, var_to_string (v)); + + for (c = 1 ; c < nc ; ++c) + { + double x ; + double n; + switch (c) + { + case 1: + x = gsl_matrix_get (means, r, 0); + break; + case 2: + x = gsl_matrix_get (vars, r, 0); + + /* Here we want to display the non-biased estimator */ + n = gsl_matrix_get (ns, r, 0); + x *= n / (n -1); + + x = sqrt (x); + break; + case 3: + x = gsl_matrix_get (ns, r, 0); + break; + default: + NOT_REACHED (); + }; + + tab_double (t, c, r + heading_rows, 0, x, NULL); + } + } + + tab_submit (t); +} + static void output_correlation (const struct corr *corr, const struct corr_opts *opts, - const gsl_matrix *cm, const gsl_matrix *samples) + const gsl_matrix *cm, const gsl_matrix *samples, + const gsl_matrix *cv) { int r, c; struct tab_table *t; @@ -125,7 +172,10 @@ output_correlation (const struct corr *corr, const struct corr_opts *opts, const int heading_columns = 2; const int heading_rows = 1; - const int rows_per_variable = opts->missing_type == CORR_LISTWISE ? 2 : 3; + int rows_per_variable = opts->missing_type == CORR_LISTWISE ? 2 : 3; + + if (opts->statistics & STATS_XPROD) + rows_per_variable += 2; /* Two header columns */ nc += heading_columns; @@ -136,9 +186,8 @@ output_correlation (const struct corr *corr, const struct corr_opts *opts, /* One header row */ nr += heading_rows; - t = tab_create (nc, nr, 0); + t = tab_create (nc, nr); tab_title (t, _("Correlations")); - tab_dim (t, tab_natural_dimensions, NULL); tab_headers (t, heading_columns, 0, heading_rows, 0); @@ -167,8 +216,16 @@ output_correlation (const struct corr *corr, const struct corr_opts *opts, tab_text (t, 1, 1 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("Pearson Correlation")); tab_text (t, 1, 2 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, (opts->tails == 2) ? _("Sig. (2-tailed)") : _("Sig. (1-tailed)")); + + if (opts->statistics & STATS_XPROD) + { + tab_text (t, 1, 3 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("Cross-products")); + tab_text (t, 1, 4 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("Covariance")); + } + if ( opts->missing_type != CORR_LISTWISE ) - tab_text (t, 1, 3 + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("N")); + tab_text (t, 1, rows_per_variable + r * rows_per_variable, TAB_LEFT | TAT_TITLE, _("N")); + tab_hline (t, TAL_1, 0, nc - 1, r * rows_per_variable + 1); } @@ -184,13 +241,13 @@ output_correlation (const struct corr *corr, const struct corr_opts *opts, for (c = 0 ; c < matrix_cols ; ++c) { unsigned char flags = 0; - int col_index = corr->n_vars_total - corr->n_vars1 + c; + const int col_index = corr->n_vars_total - corr->n_vars1 + c; double pearson = gsl_matrix_get (cm, r, col_index); double w = gsl_matrix_get (samples, r, col_index); double sig = opts->tails * significance_of_correlation (pearson, w); if ( opts->missing_type != CORR_LISTWISE ) - tab_double (t, c + heading_columns, row + 2, 0, w, wfmt); + tab_double (t, c + heading_columns, row + rows_per_variable - 1, 0, w, wfmt); if ( c != r) tab_double (t, c + heading_columns, row + 1, 0, sig, NULL); @@ -199,34 +256,73 @@ output_correlation (const struct corr *corr, const struct corr_opts *opts, flags = TAB_EMPH; tab_double (t, c + heading_columns, row, flags, pearson, NULL); + + if (opts->statistics & STATS_XPROD) + { + double cov = gsl_matrix_get (cv, r, col_index); + const double xprod_dev = cov * w; + cov *= w / (w - 1.0); + + tab_double (t, c + heading_columns, row + 2, 0, xprod_dev, NULL); + tab_double (t, c + heading_columns, row + 3, 0, cov, NULL); + } } } tab_submit (t); } + static void run_corr (struct casereader *r, const struct corr_opts *opts, const struct corr *corr) { struct ccase *c; - const struct design_matrix *cov_matrix; - const gsl_matrix *samples_matrix; - + const gsl_matrix *var_matrix, *samples_matrix, *mean_matrix; + const gsl_matrix *cov_matrix; + gsl_matrix *corr_matrix; + struct covariance *cov = covariance_2pass_create (corr->n_vars_total, corr->vars, + NULL, + opts->wv, opts->exclude); + + struct casereader *rc = casereader_clone (r); for ( ; (c = casereader_read (r) ); case_unref (c)) { + covariance_accumulate_pass1 (cov, c); + } + for ( ; (c = casereader_read (rc) ); case_unref (c)) + { + covariance_accumulate_pass2 (cov, c); } + cov_matrix = covariance_calculate (cov); + + casereader_destroy (rc); + + samples_matrix = covariance_moments (cov, MOMENT_NONE); + var_matrix = covariance_moments (cov, MOMENT_VARIANCE); + mean_matrix = covariance_moments (cov, MOMENT_MEAN); + + corr_matrix = correlation_from_covariance (cov_matrix, var_matrix); + + if ( opts->statistics & STATS_DESCRIPTIVES) + output_descriptives (corr, mean_matrix, var_matrix, samples_matrix); output_correlation (corr, opts, - covariance_to_correlation (cov_matrix->m), - samples_matrix ); + corr_matrix, + samples_matrix, + cov_matrix); + + covariance_destroy (cov); + gsl_matrix_free (corr_matrix); } int cmd_correlation (struct lexer *lexer, struct dataset *ds) { + int i; int n_all_vars = 0; /* Total number of variables involved in this command */ + const struct variable **all_vars ; const struct dictionary *dict = dataset_dict (ds); bool ok = true; @@ -242,6 +338,7 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) opts.tails = 2; opts.sig = false; opts.exclude = MV_ANY; + opts.statistics = 0; /* Parse CORRELATIONS. */ while (lex_token (lexer) != '.') @@ -288,6 +385,29 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) goto error; } + lex_match (lexer, ','); + } + } + else if (lex_match_id (lexer, "STATISTICS")) + { + lex_match (lexer, '='); + while (lex_token (lexer) != '.' && lex_token (lexer) != '/') + { + if ( lex_match_id (lexer, "DESCRIPTIVES")) + opts.statistics = STATS_DESCRIPTIVES; + else if (lex_match_id (lexer, "XPROD")) + opts.statistics = STATS_XPROD; + else if (lex_token (lexer) == T_ALL) + { + opts.statistics = STATS_ALL; + lex_get (lexer); + } + else + { + lex_error (lexer, NULL); + goto error; + } + lex_match (lexer, ','); } } @@ -336,8 +456,7 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) } - const struct variable **all_vars = xmalloc (sizeof (*all_vars) * n_all_vars); - int i; + all_vars = xmalloc (sizeof (*all_vars) * n_all_vars); { /* FIXME: Using a hash here would make more sense */ @@ -365,6 +484,7 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) r = casereader_create_filter_missing (r, all_vars, n_all_vars, opts.exclude, NULL, NULL); + run_corr (r, &opts, &corr[i]); casereader_destroy (r); } @@ -378,10 +498,13 @@ cmd_correlation (struct lexer *lexer, struct dataset *ds) /* Done. */ + free (corr->vars); free (corr); + return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE; error: + free (corr->vars); free (corr); return CMD_FAILURE; }