From b9295616cd1beacf0c05eb2b52e9145e9e42fc0a Mon Sep 17 00:00:00 2001 From: John Darrington Date: Fri, 13 Apr 2012 18:55:36 +0200 Subject: [PATCH 1/1] Replace regression.q with regression.c Reviewed-by: Jason Stover --- src/language/command.def | 2 +- src/language/stats/.gitignore | 1 - src/language/stats/automake.mk | 4 +- .../stats/{regression.q => regression.c} | 1209 ++++++++--------- tests/output/ascii.at | 9 +- 5 files changed, 558 insertions(+), 667 deletions(-) rename src/language/stats/{regression.q => regression.c} (63%) diff --git a/src/language/command.def b/src/language/command.def index f8ee29af61..05b18d6c61 100644 --- a/src/language/command.def +++ b/src/language/command.def @@ -128,7 +128,7 @@ DEF_CMD (S_DATA, 0, "ONEWAY", cmd_oneway) DEF_CMD (S_DATA, 0, "PEARSON CORRELATIONS", cmd_correlation) DEF_CMD (S_DATA, 0, "QUICK CLUSTER", cmd_quick_cluster) DEF_CMD (S_DATA, 0, "RANK", cmd_rank) -DEF_CMD (S_DATA, 0, "REGRESSION", cmd_regression) +DEF_CMD (S_DATA, 0, "REGRESSION", cmd_regression) DEF_CMD (S_DATA, 0, "RELIABILITY", cmd_reliability) DEF_CMD (S_DATA, 0, "RENAME VARIABLES", cmd_rename_variables) DEF_CMD (S_DATA, 0, "ROC", cmd_roc) diff --git a/src/language/stats/.gitignore b/src/language/stats/.gitignore index d550b0d129..10f6baf208 100644 --- a/src/language/stats/.gitignore +++ b/src/language/stats/.gitignore @@ -1,3 +1,2 @@ crosstabs.c frequencies.c -regression.c diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 93a8e2053d..8e89d014b9 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -4,8 +4,7 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/language/stats src_language_stats_built_sources = \ src/language/stats/crosstabs.c \ - src/language/stats/frequencies.c \ - src/language/stats/regression.c + src/language/stats/frequencies.c language_stats_sources = \ src/language/stats/aggregate.c \ @@ -50,6 +49,7 @@ language_stats_sources = \ src/language/stats/reliability.c \ src/language/stats/roc.c \ src/language/stats/roc.h \ + src/language/stats/regression.c \ src/language/stats/runs.h \ src/language/stats/runs.c \ src/language/stats/sign.c \ diff --git a/src/language/stats/regression.q b/src/language/stats/regression.c similarity index 63% rename from src/language/stats/regression.q rename to src/language/stats/regression.c index ca1f67da0e..0a2e2a8cf6 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2005, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,514 +16,82 @@ #include +#include + #include #include -#include -#include -#include -#include "data/case.h" +#include + +#include "language/command.h" +#include "language/lexer/lexer.h" +#include "language/lexer/value-parser.h" +#include "language/lexer/variable-parser.h" + + #include "data/casegrouper.h" #include "data/casereader.h" -#include "data/dataset.h" #include "data/dictionary.h" -#include "data/missing-values.h" -#include "data/transformations.h" -#include "data/value-labels.h" -#include "data/variable.h" -#include "language/command.h" -#include "language/data-io/file-handle.h" -#include "language/dictionary/split-file.h" -#include "language/lexer/lexer.h" -#include "libpspp/compiler.h" -#include "libpspp/message.h" -#include "libpspp/taint.h" + #include "math/covariance.h" #include "math/linreg.h" #include "math/moments.h" -#include "output/tab.h" -#include "gl/intprops.h" -#include "gl/xalloc.h" +#include "libpspp/message.h" +#include "libpspp/taint.h" + +#include "output/tab.h" #include "gettext.h" #define _(msgid) gettext (msgid) +#define N_(msgid) msgid -#define REG_LARGE_DATA 1000 - -/* (headers) */ - -/* (specification) - "REGRESSION" (regression_): - *variables=custom; - +statistics[st_]=r, - coeff, - anova, - outs, - zpp, - label, - sha, - ci, - bcov, - ses, - xtx, - collin, - tol, - selection, - f, - defaults, - all; - ^dependent=varlist; - +save[sv_]=resid,pred; - +method=enter. -*/ -/* (declarations) */ -/* (functions) */ -static struct cmd_regression cmd; - -/* - Moments for each of the variables used. - */ -struct moments_var -{ - struct moments1 *m; - const struct variable *v; -}; - -/* - Transformations for saving predicted values - and residuals, etc. - */ -struct reg_trns -{ - int n_trns; /* Number of transformations. */ - int trns_id; /* Which trns is this one? */ - linreg *c; /* Linear model for this trns. */ -}; -/* - Variables used (both explanatory and response). - */ -static const struct variable **v_variables; - -/* - Number of variables. - */ -static size_t n_variables; - -static bool run_regression (struct casereader *, struct cmd_regression *, - struct dataset *, linreg **); - -/* - STATISTICS subcommand output functions. - */ -static void reg_stats_r (linreg *, void *); -static void reg_stats_coeff (linreg *, void *); -static void reg_stats_anova (linreg *, void *); -static void reg_stats_outs (linreg *, void *); -static void reg_stats_zpp (linreg *, void *); -static void reg_stats_label (linreg *, void *); -static void reg_stats_sha (linreg *, void *); -static void reg_stats_ci (linreg *, void *); -static void reg_stats_f (linreg *, void *); -static void reg_stats_bcov (linreg *, void *); -static void reg_stats_ses (linreg *, void *); -static void reg_stats_xtx (linreg *, void *); -static void reg_stats_collin (linreg *, void *); -static void reg_stats_tol (linreg *, void *); -static void reg_stats_selection (linreg *, void *); -static void statistics_keyword_output (void (*)(linreg *, void *), - int, linreg *, void *); - -static void -reg_stats_r (linreg *c, void *aux UNUSED) -{ - struct tab_table *t; - int n_rows = 2; - int n_cols = 5; - double rsq; - double adjrsq; - double std_error; - - assert (c != NULL); - rsq = linreg_ssreg (c) / linreg_sst (c); - adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c)); - std_error = sqrt (linreg_mse (c)); - t = tab_create (n_cols, n_rows); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - - tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R")); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate")); - tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL); - tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL); - tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL); - tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL); - tab_title (t, _("Model Summary")); - tab_submit (t); -} -/* - Table showing estimated regression coefficients. - */ -static void -reg_stats_coeff (linreg * c, void *aux_) -{ - size_t j; - int n_cols = 7; - int n_rows; - int this_row; - double t_stat; - double pval; - double std_err; - double beta; - const char *label; +#include - const struct variable *v; - struct tab_table *t; - gsl_matrix *cov = aux_; - - assert (c != NULL); - n_rows = linreg_n_coeffs (c) + 3; - - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); - std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); - tab_double (t, 3, 1, 0, std_err, NULL); - tab_double (t, 4, 1, 0, 0.0, NULL); - t_stat = linreg_intercept (c) / std_err; - tab_double (t, 5, 1, 0, t_stat, NULL); - pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, 1, 0, pval, NULL); - for (j = 0; j < linreg_n_coeffs (c); j++) - { - struct string tstr; - ds_init_empty (&tstr); - this_row = j + 2; - - v = linreg_indep_var (c, j); - label = var_to_string (v); - /* Do not overwrite the variable's name. */ - ds_put_cstr (&tstr, label); - tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); - /* - Regression coefficients. - */ - tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL); - /* - Standard error of the coefficients. - */ - std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); - tab_double (t, 3, this_row, 0, std_err, NULL); - /* - Standardized coefficient, i.e., regression coefficient - if all variables had unit variance. - */ - beta = sqrt (gsl_matrix_get (cov, j, j)); - beta *= linreg_coeff (c, j) / - sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); - tab_double (t, 4, this_row, 0, beta, NULL); - - /* - Test statistic for H0: coefficient is 0. - */ - t_stat = linreg_coeff (c, j) / std_err; - tab_double (t, 5, this_row, 0, t_stat, NULL); - /* - P values for the test statistic above. - */ - pval = - 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, this_row, 0, pval, NULL); - ds_destroy (&tstr); - } - tab_title (t, _("Coefficients")); - tab_submit (t); -} +#define REG_LARGE_DATA 1000 -/* - Display the ANOVA table. - */ -static void -reg_stats_anova (linreg * c, void *aux UNUSED) +struct regression { - int n_cols = 7; - int n_rows = 4; - const double msm = linreg_ssreg (c) / linreg_dfmodel (c); - const double mse = linreg_mse (c); - const double F = msm / mse; - const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe); - - struct tab_table *t; - - assert (c != NULL); - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); + struct dataset *ds; - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression")); - tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual")); - tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total")); - - /* Sums of Squares */ - tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL); - tab_double (t, 2, 3, 0, linreg_sst (c), NULL); - tab_double (t, 2, 2, 0, linreg_sse (c), NULL); - - - /* Degrees of freedom */ - tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm); - tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe); - tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft); - - /* Mean Squares */ - tab_double (t, 4, 1, TAB_RIGHT, msm, NULL); - tab_double (t, 4, 2, TAB_RIGHT, mse, NULL); - - tab_double (t, 5, 1, 0, F, NULL); + const struct variable **vars; + size_t n_vars; - tab_double (t, 6, 1, 0, pval, NULL); + const struct variable **dep_vars; + size_t n_dep_vars; - tab_title (t, _("ANOVA")); - tab_submit (t); -} - -static void -reg_stats_outs (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} + bool r; + bool coeff; + bool anova; + bool bcov; -static void -reg_stats_zpp (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_label (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} + bool resid; + bool pred; -static void -reg_stats_sha (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_ci (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_f (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_bcov (linreg * c, void *aux UNUSED) -{ - int n_cols; - int n_rows; - int i; - int k; - int row; - int col; - const char *label; - struct tab_table *t; + linreg **models; +}; - assert (c != NULL); - n_cols = c->n_indeps + 1 + 2; - n_rows = 2 * (c->n_indeps + 1); - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model")); - tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances")); - for (i = 0; i < linreg_n_coeffs (c); i++) - { - const struct variable *v = linreg_indep_var (c, i); - label = var_to_string (v); - tab_text (t, 2, i, TAB_CENTER, label); - tab_text (t, i + 2, 0, TAB_CENTER, label); - for (k = 1; k < linreg_n_coeffs (c); k++) - { - col = (i <= k) ? k : i; - row = (i <= k) ? i : k; - tab_double (t, k + 2, i, TAB_CENTER, - gsl_matrix_get (c->cov, row, col), NULL); - } - } - tab_title (t, _("Coefficient Correlations")); - tab_submit (t); -} -static void -reg_stats_ses (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_xtx (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_collin (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_tol (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_selection (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -statistics_keyword_output (void (*function) (linreg *, void *), - int keyword, linreg * c, void *aux) -{ - if (keyword) - { - (*function) (c, aux); - } -} +static void run_regression (const struct regression *cmd, struct casereader *input); -static void -subcommand_statistics (int *keywords, linreg * c, void *aux) -{ - /* - The order here must match the order in which the STATISTICS - keywords appear in the specification section above. - */ - enum - { r, - coeff, - anova, - outs, - zpp, - label, - sha, - ci, - bcov, - ses, - xtx, - collin, - tol, - selection, - f, - defaults, - all - }; - int i; - int d = 1; - if (keywords[all]) - { - /* - Set everything but F. - */ - for (i = 0; i < f; i++) - { - keywords[i] = 1; - } - } - else - { - for (i = 0; i < all; i++) - { - if (keywords[i]) - { - d = 0; - } - } - /* - Default output: ANOVA table, parameter estimates, - and statistics for variables not entered into model, - if appropriate. - */ - if (keywords[defaults] | d) - { - keywords[anova] = 1; - keywords[outs] = 1; - keywords[coeff] = 1; - keywords[r] = 1; - } - } - statistics_keyword_output (reg_stats_r, keywords[r], c, aux); - statistics_keyword_output (reg_stats_anova, keywords[anova], c, aux); - statistics_keyword_output (reg_stats_coeff, keywords[coeff], c, aux); - statistics_keyword_output (reg_stats_outs, keywords[outs], c, aux); - statistics_keyword_output (reg_stats_zpp, keywords[zpp], c, aux); - statistics_keyword_output (reg_stats_label, keywords[label], c, aux); - statistics_keyword_output (reg_stats_sha, keywords[sha], c, aux); - statistics_keyword_output (reg_stats_ci, keywords[ci], c, aux); - statistics_keyword_output (reg_stats_f, keywords[f], c, aux); - statistics_keyword_output (reg_stats_bcov, keywords[bcov], c, aux); - statistics_keyword_output (reg_stats_ses, keywords[ses], c, aux); - statistics_keyword_output (reg_stats_xtx, keywords[xtx], c, aux); - statistics_keyword_output (reg_stats_collin, keywords[collin], c, aux); - statistics_keyword_output (reg_stats_tol, keywords[tol], c, aux); - statistics_keyword_output (reg_stats_selection, keywords[selection], c, aux); -} /* - Free the transformation. Free its linear model if this - transformation is the last one. - */ -static bool -regression_trns_free (void *t_) + Transformations for saving predicted values + and residuals, etc. +*/ +struct reg_trns { - bool result = true; - struct reg_trns *t = t_; - - if (t->trns_id == t->n_trns) - { - result = linreg_free (t->c); - } - free (t); - - return result; -} + int n_trns; /* Number of transformations. */ + int trns_id; /* Which trns is this one? */ + linreg *c; /* Linear model for this trns. */ +}; /* Gets the predicted values. - */ +*/ static int regression_trns_pred_proc (void *t_, struct ccase **c, casenumber case_idx UNUSED) @@ -562,7 +130,7 @@ regression_trns_pred_proc (void *t_, struct ccase **c, /* Gets the residuals. - */ +*/ static int regression_trns_resid_proc (void *t_, struct ccase **c, casenumber case_idx UNUSED) @@ -604,6 +172,7 @@ regression_trns_resid_proc (void *t_, struct ccase **c, return TRNS_CONTINUE; } + static char * reg_get_name (const struct dictionary *dict, const char *prefix) { @@ -620,6 +189,25 @@ reg_get_name (const struct dictionary *dict, const char *prefix) } } +/* + Free the transformation. Free its linear model if this + transformation is the last one. +*/ +static bool +regression_trns_free (void *t_) +{ + bool result = true; + struct reg_trns *t = t_; + + if (t->trns_id == t->n_trns) + { + result = linreg_free (t->c); + } + free (t); + + return result; +} + static void reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, linreg * c, struct variable **v, int n_trns) @@ -643,156 +231,275 @@ reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, add_transformation (ds, f, regression_trns_free, t); trns_index++; } + static void -subcommand_save (struct dataset *ds, int save, linreg ** models) +subcommand_save (const struct regression *cmd) { linreg **lc; int n_trns = 0; - int i; - if (save) + if ( cmd->resid ) n_trns++; + if ( cmd->pred ) n_trns++; + + n_trns *= cmd->n_dep_vars; + + for (lc = cmd->models; lc < cmd->models + cmd->n_dep_vars; lc++) + { + if (*lc != NULL) + { + if ((*lc)->depvar != NULL) + { + if (cmd->resid) + { + reg_save_var (cmd->ds, "RES", regression_trns_resid_proc, *lc, + &(*lc)->resid, n_trns); + } + if (cmd->pred) + { + reg_save_var (cmd->ds, "PRED", regression_trns_pred_proc, *lc, + &(*lc)->pred, n_trns); + } + } + } + } +} + +int +cmd_regression (struct lexer *lexer, struct dataset *ds) +{ + struct regression regression; + const struct dictionary *dict = dataset_dict (ds); + + memset (®ression, 0, sizeof (struct regression)); + + regression.anova = true; + regression.coeff = true; + regression.r = true; + + regression.pred = false; + regression.resid = false; + + regression.ds = ds; + + /* Accept an optional, completely pointless "/VARIABLES=" */ + lex_match (lexer, T_SLASH); + if (lex_match_id (lexer, "VARIABLES")) { - /* Count the number of transformations we will need. */ - for (i = 0; i < REGRESSION_SV_count; i++) + if (! lex_force_match (lexer, T_EQUALS) ) + goto error; + } + + if (!parse_variables_const (lexer, dict, + ®ression.vars, ®ression.n_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto error; + + + while (lex_token (lexer) != T_ENDCMD) + { + lex_match (lexer, T_SLASH); + + if (lex_match_id (lexer, "DEPENDENT")) + { + if (! lex_force_match (lexer, T_EQUALS) ) + goto error; + + if (!parse_variables_const (lexer, dict, + ®ression.dep_vars, ®ression.n_dep_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto error; + } + else if (lex_match_id (lexer, "METHOD")) { - if (cmd.a_save[i]) - { - n_trns++; - } - } - n_trns *= cmd.n_dependent; + lex_match (lexer, T_EQUALS); + + if (!lex_force_match_id (lexer, "ENTER")) + { + goto error; + } + } + else if (lex_match_id (lexer, "STATISTICS")) + { + lex_match (lexer, T_EQUALS); - for (lc = models; lc < models + cmd.n_dependent; lc++) + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match (lexer, T_ALL)) + { + } + else if (lex_match_id (lexer, "DEFAULTS")) + { + } + else if (lex_match_id (lexer, "R")) + { + } + else if (lex_match_id (lexer, "COEFF")) + { + } + else if (lex_match_id (lexer, "ANOVA")) + { + } + else if (lex_match_id (lexer, "BCOV")) + { + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + } + else if (lex_match_id (lexer, "SAVE")) { - if (*lc != NULL) + lex_match (lexer, T_EQUALS); + + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) { - if ((*lc)->depvar != NULL) - { - if (cmd.a_save[REGRESSION_SV_RESID]) - { - reg_save_var (ds, "RES", regression_trns_resid_proc, *lc, - &(*lc)->resid, n_trns); - } - if (cmd.a_save[REGRESSION_SV_PRED]) - { - reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc, - &(*lc)->pred, n_trns); - } - } - } - } + if (lex_match_id (lexer, "PRED")) + { + regression.pred = true; + } + else if (lex_match_id (lexer, "RESID")) + { + regression.resid = true; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + } + else + { + lex_error (lexer, NULL); + goto error; + } } - else + + if (!regression.vars) { - for (lc = models; lc < models + cmd.n_dependent; lc++) + dict_get_vars (dict, ®ression.vars, ®ression.n_vars, 0); + } + + + regression.models = xcalloc (regression.n_dep_vars, sizeof *regression.models); + + { + struct casegrouper *grouper; + struct casereader *group; + bool ok; + + grouper = casegrouper_create_splits (proc_open (ds), dict); + while (casegrouper_get_next_group (grouper, &group)) + run_regression (®ression, group); + ok = casegrouper_destroy (grouper); + ok = proc_commit (ds) && ok; + } + + if (regression.pred || regression.resid ) + subcommand_save (®ression); + + + return CMD_SUCCESS; + + error: + return CMD_FAILURE; +} + + +static size_t +get_n_all_vars (const struct regression *cmd) +{ + size_t result = cmd->n_vars; + size_t i; + size_t j; + + result += cmd->n_dep_vars; + for (i = 0; i < cmd->n_dep_vars; i++) + { + for (j = 0; j < cmd->n_vars; j++) { - if (*lc != NULL) + if (cmd->vars[j] == cmd->dep_vars[i]) { - linreg_free (*lc); + result--; } } } + return result; } -int -cmd_regression (struct lexer *lexer, struct dataset *ds) +static void +fill_all_vars (const struct variable **vars, const struct regression *cmd) { - struct casegrouper *grouper; - struct casereader *group; - linreg **models; - bool ok; size_t i; - - if (!parse_regression (lexer, ds, &cmd, NULL)) + size_t j; + bool absent; + + for (i = 0; i < cmd->n_vars; i++) { - return CMD_FAILURE; + vars[i] = cmd->vars[i]; } - - models = xnmalloc (cmd.n_dependent, sizeof *models); - for (i = 0; i < cmd.n_dependent; i++) + for (i = 0; i < cmd->n_dep_vars; i++) { - models[i] = NULL; + absent = true; + for (j = 0; j < cmd->n_vars; j++) + { + if (cmd->dep_vars[i] == cmd->vars[j]) + { + absent = false; + break; + } + } + if (absent) + { + vars[i + cmd->n_vars] = cmd->dep_vars[i]; + } } - - /* Data pass. */ - grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); - while (casegrouper_get_next_group (grouper, &group)) - run_regression (group, &cmd, ds, models); - ok = casegrouper_destroy (grouper); - ok = proc_commit (ds) && ok; - - subcommand_save (ds, cmd.sbc_save, models); - free (v_variables); - free (models); - free_regression (&cmd); - - return ok ? CMD_SUCCESS : CMD_FAILURE; } /* Is variable k the dependent variable? - */ +*/ static bool -is_depvar (size_t k, const struct variable *v) +is_depvar (const struct regression *cmd, size_t k, const struct variable *v) { - return v == v_variables[k]; + return v == cmd->vars[k]; } -/* Parser for the variables sub command */ -static int -regression_custom_variables (struct lexer *lexer, struct dataset *ds, - struct cmd_regression *cmd UNUSED, - void *aux UNUSED) -{ - const struct dictionary *dict = dataset_dict (ds); - - lex_match (lexer, T_EQUALS); - - if ((lex_token (lexer) != T_ID - || dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL) - && lex_token (lexer) != T_ALL) - return 2; - - - if (!parse_variables_const - (lexer, dict, &v_variables, &n_variables, PV_NONE)) - { - free (v_variables); - return 0; - } - assert (n_variables); - - return 1; -} /* Identify the explanatory variables in v_variables. Returns the number of independent variables. */ static int -identify_indep_vars (const struct variable **indep_vars, +identify_indep_vars (const struct regression *cmd, + const struct variable **indep_vars, const struct variable *depvar) { int n_indep_vars = 0; int i; - for (i = 0; i < n_variables; i++) - if (!is_depvar (i, depvar)) - indep_vars[n_indep_vars++] = v_variables[i]; - if ((n_indep_vars < 1) && is_depvar (0, depvar)) + for (i = 0; i < cmd->n_vars; i++) + if (!is_depvar (cmd, i, depvar)) + indep_vars[n_indep_vars++] = cmd->vars[i]; + if ((n_indep_vars < 1) && is_depvar (cmd, 0, depvar)) { /* There is only one independent variable, and it is the same as the dependent variable. Print a warning and continue. - */ + */ msg (SE, gettext ("The dependent variable is equal to the independent variable." "The least squares line is therefore Y=X." "Standard errors and related statistics may be meaningless.")); n_indep_vars = 1; - indep_vars[0] = v_variables[0]; + indep_vars[0] = cmd->vars[0]; } return n_indep_vars; } + static double fill_covariance (gsl_matrix *cov, struct covariance *all_cov, const struct variable **vars, @@ -863,62 +570,37 @@ fill_covariance (gsl_matrix *cov, struct covariance *all_cov, gsl_matrix_free (cm); return result; } -static size_t -get_n_all_vars (struct cmd_regression *cmd) -{ - size_t result = n_variables; - size_t i; - size_t j; - result += cmd->n_dependent; - for (i = 0; i < cmd->n_dependent; i++) - { - for (j = 0; j < n_variables; j++) - { - if (v_variables[j] == cmd->v_dependent[i]) - { - result--; - } - } - } - return result; -} -static void -fill_all_vars (const struct variable **vars, struct cmd_regression *cmd) -{ - size_t i; - size_t j; - bool absent; - - for (i = 0; i < n_variables; i++) - { - vars[i] = v_variables[i]; - } - for (i = 0; i < cmd->n_dependent; i++) - { - absent = true; - for (j = 0; j < n_variables; j++) - { - if (cmd->v_dependent[i] == v_variables[j]) - { - absent = false; - break; - } - } - if (absent) - { - vars[i + n_variables] = cmd->v_dependent[i]; - } - } + +/* + STATISTICS subcommand output functions. +*/ +static void reg_stats_r (linreg *, void *); +static void reg_stats_coeff (linreg *, void *); +static void reg_stats_anova (linreg *, void *); +static void reg_stats_bcov (linreg *, void *); + +static void statistics_keyword_output (void (*)(linreg *, void *), + bool, linreg *, void *); + + + +static void +subcommand_statistics (const struct regression *cmd , linreg * c, void *aux) +{ + statistics_keyword_output (reg_stats_r, cmd->r, c, aux); + statistics_keyword_output (reg_stats_anova, cmd->anova, c, aux); + statistics_keyword_output (reg_stats_coeff, cmd->coeff, c, aux); + statistics_keyword_output (reg_stats_bcov, cmd->bcov, c, aux); } -static bool -run_regression (struct casereader *input, struct cmd_regression *cmd, - struct dataset *ds, linreg **models) + + +static void +run_regression (const struct regression *cmd, struct casereader *input) { size_t i; int n_indep = 0; int k; - double n_data; double *means; struct ccase *c; struct covariance *cov; @@ -926,41 +608,17 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, const struct variable **all_vars; const struct variable *dep_var; struct casereader *reader; - const struct dictionary *dict; size_t n_all_vars; - assert (models != NULL); - - for (i = 0; i < n_variables; i++) - { - if (!var_is_numeric (v_variables[i])) - { - msg (SE, _("REGRESSION requires numeric variables.")); - return false; - } - } + linreg **models = cmd->models; - c = casereader_peek (input, 0); - if (c == NULL) - { - casereader_destroy (input); - return true; - } - output_split_file_values (ds, c); - case_unref (c); - - dict = dataset_dict (ds); - if (!v_variables) - { - dict_get_vars (dict, &v_variables, &n_variables, 0); - } n_all_vars = get_n_all_vars (cmd); all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); fill_all_vars (all_vars, cmd); - vars = xnmalloc (n_variables, sizeof (*vars)); + vars = xnmalloc (cmd->n_vars, sizeof (*vars)); means = xnmalloc (n_all_vars, sizeof (*means)); cov = covariance_1pass_create (n_all_vars, all_vars, - dict_get_weight (dict), MV_ANY); + dict_get_weight (dataset_dict (cmd->ds)), MV_ANY); reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, all_vars, n_all_vars, @@ -971,12 +629,14 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, { covariance_accumulate (cov, c); } - - for (k = 0; k < cmd->n_dependent; k++) + + for (k = 0; k < cmd->n_dep_vars; k++) { + double n_data; + gsl_matrix *this_cm; - dep_var = cmd->v_dependent[k]; - n_indep = identify_indep_vars (vars, dep_var); + dep_var = cmd->dep_vars[k]; + n_indep = identify_indep_vars (cmd, vars, dep_var); this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); n_data = fill_covariance (this_cm, cov, vars, n_indep, @@ -996,7 +656,7 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, { models[k]->method = LINREG_QR; } - + if (n_data > 0) { /* @@ -1006,13 +666,13 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, if (!taint_has_tainted_successor (casereader_get_taint (input))) { - subcommand_statistics (cmd->a_statistics, models[k], this_cm); + subcommand_statistics (cmd, models[k], this_cm); } } else { msg (SE, - gettext ("No valid data found. This command was skipped.")); + _("No valid data found. This command was skipped.")); linreg_free (models[k]); models[k] = NULL; } @@ -1025,12 +685,241 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, free (means); casereader_destroy (input); covariance_destroy (cov); - - return true; +} + + + + + +static void +reg_stats_r (linreg *c, void *aux UNUSED) +{ + struct tab_table *t; + int n_rows = 2; + int n_cols = 5; + double rsq; + double adjrsq; + double std_error; + + assert (c != NULL); + rsq = linreg_ssreg (c) / linreg_sst (c); + adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c)); + std_error = sqrt (linreg_mse (c)); + t = tab_create (n_cols, n_rows); + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_0, 1, 0, 0); + + tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R")); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate")); + tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL); + tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL); + tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL); + tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL); + tab_title (t, _("Model Summary")); + tab_submit (t); +} + +/* + Table showing estimated regression coefficients. +*/ +static void +reg_stats_coeff (linreg * c, void *aux_) +{ + size_t j; + int n_cols = 7; + int n_rows; + int this_row; + double t_stat; + double pval; + double std_err; + double beta; + const char *label; + + const struct variable *v; + struct tab_table *t; + gsl_matrix *cov = aux_; + + assert (c != NULL); + n_rows = linreg_n_coeffs (c) + 3; + + t = tab_create (n_cols, n_rows); + tab_headers (t, 2, 0, 1, 0); + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_0, 1, 0, 0); + + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta")); + tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); + tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); + tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); + tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); + std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); + tab_double (t, 3, 1, 0, std_err, NULL); + tab_double (t, 4, 1, 0, 0.0, NULL); + t_stat = linreg_intercept (c) / std_err; + tab_double (t, 5, 1, 0, t_stat, NULL); + pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); + tab_double (t, 6, 1, 0, pval, NULL); + for (j = 0; j < linreg_n_coeffs (c); j++) + { + struct string tstr; + ds_init_empty (&tstr); + this_row = j + 2; + + v = linreg_indep_var (c, j); + label = var_to_string (v); + /* Do not overwrite the variable's name. */ + ds_put_cstr (&tstr, label); + tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); + /* + Regression coefficients. + */ + tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL); + /* + Standard error of the coefficients. + */ + std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); + tab_double (t, 3, this_row, 0, std_err, NULL); + /* + Standardized coefficient, i.e., regression coefficient + if all variables had unit variance. + */ + beta = sqrt (gsl_matrix_get (cov, j, j)); + beta *= linreg_coeff (c, j) / + sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); + tab_double (t, 4, this_row, 0, beta, NULL); + + /* + Test statistic for H0: coefficient is 0. + */ + t_stat = linreg_coeff (c, j) / std_err; + tab_double (t, 5, this_row, 0, t_stat, NULL); + /* + P values for the test statistic above. + */ + pval = + 2 * gsl_cdf_tdist_Q (fabs (t_stat), + (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); + tab_double (t, 6, this_row, 0, pval, NULL); + ds_destroy (&tstr); + } + tab_title (t, _("Coefficients")); + tab_submit (t); } /* - Local Variables: - mode: c - End: + Display the ANOVA table. */ +static void +reg_stats_anova (linreg * c, void *aux UNUSED) +{ + int n_cols = 7; + int n_rows = 4; + const double msm = linreg_ssreg (c) / linreg_dfmodel (c); + const double mse = linreg_mse (c); + const double F = msm / mse; + const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe); + + struct tab_table *t; + + assert (c != NULL); + t = tab_create (n_cols, n_rows); + tab_headers (t, 2, 0, 1, 0); + + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_0, 1, 0, 0); + + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square")); + tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F")); + tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); + + tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression")); + tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual")); + tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total")); + + /* Sums of Squares */ + tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL); + tab_double (t, 2, 3, 0, linreg_sst (c), NULL); + tab_double (t, 2, 2, 0, linreg_sse (c), NULL); + + + /* Degrees of freedom */ + tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm); + tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe); + tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft); + + /* Mean Squares */ + tab_double (t, 4, 1, TAB_RIGHT, msm, NULL); + tab_double (t, 4, 2, TAB_RIGHT, mse, NULL); + + tab_double (t, 5, 1, 0, F, NULL); + + tab_double (t, 6, 1, 0, pval, NULL); + + tab_title (t, _("ANOVA")); + tab_submit (t); +} + + +static void +reg_stats_bcov (linreg * c, void *aux UNUSED) +{ + int n_cols; + int n_rows; + int i; + int k; + int row; + int col; + const char *label; + struct tab_table *t; + + assert (c != NULL); + n_cols = c->n_indeps + 1 + 2; + n_rows = 2 * (c->n_indeps + 1); + t = tab_create (n_cols, n_rows); + tab_headers (t, 2, 0, 1, 0); + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_0, 1, 0, 0); + tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model")); + tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances")); + for (i = 0; i < linreg_n_coeffs (c); i++) + { + const struct variable *v = linreg_indep_var (c, i); + label = var_to_string (v); + tab_text (t, 2, i, TAB_CENTER, label); + tab_text (t, i + 2, 0, TAB_CENTER, label); + for (k = 1; k < linreg_n_coeffs (c); k++) + { + col = (i <= k) ? k : i; + row = (i <= k) ? i : k; + tab_double (t, k + 2, i, TAB_CENTER, + gsl_matrix_get (c->cov, row, col), NULL); + } + } + tab_title (t, _("Coefficient Correlations")); + tab_submit (t); +} + +static void +statistics_keyword_output (void (*function) (linreg *, void *), + bool keyword, linreg * c, void *aux) +{ + if (keyword) + { + (*function) (c, aux); + } +} diff --git a/tests/output/ascii.at b/tests/output/ascii.at index df1263eb01..07b440efb5 100644 --- a/tests/output/ascii.at +++ b/tests/output/ascii.at @@ -555,7 +555,7 @@ REGRESSION /DEPENDENT= x y /STATISTICS=COEFF R ANOVA. ]) -AT_CHECK([pspp ascii.sps], [1], [dnl +AT_CHECK([pspp ascii.sps], [0], [dnl SET PRINTBACK=ON. DATA LIST LIST /x * y * a (a23). @@ -577,9 +577,12 @@ END DATA. REGRESSION /VARIABLES= a + +ascii.sps:11: warning: REGRESSION: a is not a numeric variable. It will not be +included in the variable list. + /DEPENDENT= x y /STATISTICS=COEFF R ANOVA. - -ascii.sps:12: error: REGRESSION: REGRESSION requires numeric variables. ]) + AT_CLEANUP -- 2.30.2