From: John Darrington Date: Fri, 13 Apr 2012 16:55:36 +0000 (+0200) Subject: Replace regression.q with regression.c X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=b9295616cd1beacf0c05eb2b52e9145e9e42fc0a Replace regression.q with regression.c Reviewed-by: Jason Stover --- diff --git a/src/language/command.def b/src/language/command.def index f8ee29af61..05b18d6c61 100644 --- a/src/language/command.def +++ b/src/language/command.def @@ -128,7 +128,7 @@ DEF_CMD (S_DATA, 0, "ONEWAY", cmd_oneway) DEF_CMD (S_DATA, 0, "PEARSON CORRELATIONS", cmd_correlation) DEF_CMD (S_DATA, 0, "QUICK CLUSTER", cmd_quick_cluster) DEF_CMD (S_DATA, 0, "RANK", cmd_rank) -DEF_CMD (S_DATA, 0, "REGRESSION", cmd_regression) +DEF_CMD (S_DATA, 0, "REGRESSION", cmd_regression) DEF_CMD (S_DATA, 0, "RELIABILITY", cmd_reliability) DEF_CMD (S_DATA, 0, "RENAME VARIABLES", cmd_rename_variables) DEF_CMD (S_DATA, 0, "ROC", cmd_roc) diff --git a/src/language/stats/.gitignore b/src/language/stats/.gitignore index d550b0d129..10f6baf208 100644 --- a/src/language/stats/.gitignore +++ b/src/language/stats/.gitignore @@ -1,3 +1,2 @@ crosstabs.c frequencies.c -regression.c diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 93a8e2053d..8e89d014b9 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -4,8 +4,7 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/language/stats src_language_stats_built_sources = \ src/language/stats/crosstabs.c \ - src/language/stats/frequencies.c \ - src/language/stats/regression.c + src/language/stats/frequencies.c language_stats_sources = \ src/language/stats/aggregate.c \ @@ -50,6 +49,7 @@ language_stats_sources = \ src/language/stats/reliability.c \ src/language/stats/roc.c \ src/language/stats/roc.h \ + src/language/stats/regression.c \ src/language/stats/runs.h \ src/language/stats/runs.c \ src/language/stats/sign.c \ diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c new file mode 100644 index 0000000000..0a2e2a8cf6 --- /dev/null +++ b/src/language/stats/regression.c @@ -0,0 +1,925 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2005, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include + +#include +#include + +#include + +#include "language/command.h" +#include "language/lexer/lexer.h" +#include "language/lexer/value-parser.h" +#include "language/lexer/variable-parser.h" + + +#include "data/casegrouper.h" +#include "data/casereader.h" +#include "data/dictionary.h" + +#include "math/covariance.h" +#include "math/linreg.h" +#include "math/moments.h" + +#include "libpspp/message.h" +#include "libpspp/taint.h" + +#include "output/tab.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) +#define N_(msgid) msgid + + +#include + +#define REG_LARGE_DATA 1000 + +struct regression +{ + struct dataset *ds; + + const struct variable **vars; + size_t n_vars; + + const struct variable **dep_vars; + size_t n_dep_vars; + + bool r; + bool coeff; + bool anova; + bool bcov; + + + bool resid; + bool pred; + + linreg **models; +}; + + +static void run_regression (const struct regression *cmd, struct casereader *input); + + + +/* + Transformations for saving predicted values + and residuals, etc. +*/ +struct reg_trns +{ + int n_trns; /* Number of transformations. */ + int trns_id; /* Which trns is this one? */ + linreg *c; /* Linear model for this trns. */ +}; + +/* + Gets the predicted values. +*/ +static int +regression_trns_pred_proc (void *t_, struct ccase **c, + casenumber case_idx UNUSED) +{ + size_t i; + size_t n_vals; + struct reg_trns *trns = t_; + linreg *model; + union value *output = NULL; + const union value *tmp; + double *vals; + const struct variable **vars = NULL; + + assert (trns != NULL); + model = trns->c; + assert (model != NULL); + assert (model->depvar != NULL); + assert (model->pred != NULL); + + vars = linreg_get_vars (model); + n_vals = linreg_n_coeffs (model); + vals = xnmalloc (n_vals, sizeof (*vals)); + *c = case_unshare (*c); + + output = case_data_rw (*c, model->pred); + + for (i = 0; i < n_vals; i++) + { + tmp = case_data (*c, vars[i]); + vals[i] = tmp->f; + } + output->f = linreg_predict (model, vals, n_vals); + free (vals); + return TRNS_CONTINUE; +} + +/* + Gets the residuals. +*/ +static int +regression_trns_resid_proc (void *t_, struct ccase **c, + casenumber case_idx UNUSED) +{ + size_t i; + size_t n_vals; + struct reg_trns *trns = t_; + linreg *model; + union value *output = NULL; + const union value *tmp; + double *vals = NULL; + double obs; + const struct variable **vars = NULL; + + assert (trns != NULL); + model = trns->c; + assert (model != NULL); + assert (model->depvar != NULL); + assert (model->resid != NULL); + + vars = linreg_get_vars (model); + n_vals = linreg_n_coeffs (model); + + vals = xnmalloc (n_vals, sizeof (*vals)); + *c = case_unshare (*c); + output = case_data_rw (*c, model->resid); + assert (output != NULL); + + for (i = 0; i < n_vals; i++) + { + tmp = case_data (*c, vars[i]); + vals[i] = tmp->f; + } + tmp = case_data (*c, model->depvar); + obs = tmp->f; + output->f = linreg_residual (model, obs, vals, n_vals); + free (vals); + + return TRNS_CONTINUE; +} + + +static char * +reg_get_name (const struct dictionary *dict, const char *prefix) +{ + char *name; + int i; + + /* XXX handle too-long prefixes */ + name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1); + for (i = 1; ; i++) + { + sprintf (name, "%s%d", prefix, i); + if (dict_lookup_var (dict, name) == NULL) + return name; + } +} + +/* + Free the transformation. Free its linear model if this + transformation is the last one. +*/ +static bool +regression_trns_free (void *t_) +{ + bool result = true; + struct reg_trns *t = t_; + + if (t->trns_id == t->n_trns) + { + result = linreg_free (t->c); + } + free (t); + + return result; +} + +static void +reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, + linreg * c, struct variable **v, int n_trns) +{ + struct dictionary *dict = dataset_dict (ds); + static int trns_index = 1; + char *name; + struct variable *new_var; + struct reg_trns *t = NULL; + + t = xmalloc (sizeof (*t)); + t->trns_id = trns_index; + t->n_trns = n_trns; + t->c = c; + + name = reg_get_name (dict, prefix); + new_var = dict_create_var_assert (dict, name, 0); + free (name); + + *v = new_var; + add_transformation (ds, f, regression_trns_free, t); + trns_index++; +} + +static void +subcommand_save (const struct regression *cmd) +{ + linreg **lc; + int n_trns = 0; + + if ( cmd->resid ) n_trns++; + if ( cmd->pred ) n_trns++; + + n_trns *= cmd->n_dep_vars; + + for (lc = cmd->models; lc < cmd->models + cmd->n_dep_vars; lc++) + { + if (*lc != NULL) + { + if ((*lc)->depvar != NULL) + { + if (cmd->resid) + { + reg_save_var (cmd->ds, "RES", regression_trns_resid_proc, *lc, + &(*lc)->resid, n_trns); + } + if (cmd->pred) + { + reg_save_var (cmd->ds, "PRED", regression_trns_pred_proc, *lc, + &(*lc)->pred, n_trns); + } + } + } + } +} + +int +cmd_regression (struct lexer *lexer, struct dataset *ds) +{ + struct regression regression; + const struct dictionary *dict = dataset_dict (ds); + + memset (®ression, 0, sizeof (struct regression)); + + regression.anova = true; + regression.coeff = true; + regression.r = true; + + regression.pred = false; + regression.resid = false; + + regression.ds = ds; + + /* Accept an optional, completely pointless "/VARIABLES=" */ + lex_match (lexer, T_SLASH); + if (lex_match_id (lexer, "VARIABLES")) + { + if (! lex_force_match (lexer, T_EQUALS) ) + goto error; + } + + if (!parse_variables_const (lexer, dict, + ®ression.vars, ®ression.n_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto error; + + + while (lex_token (lexer) != T_ENDCMD) + { + lex_match (lexer, T_SLASH); + + if (lex_match_id (lexer, "DEPENDENT")) + { + if (! lex_force_match (lexer, T_EQUALS) ) + goto error; + + if (!parse_variables_const (lexer, dict, + ®ression.dep_vars, ®ression.n_dep_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto error; + } + else if (lex_match_id (lexer, "METHOD")) + { + lex_match (lexer, T_EQUALS); + + if (!lex_force_match_id (lexer, "ENTER")) + { + goto error; + } + } + else if (lex_match_id (lexer, "STATISTICS")) + { + lex_match (lexer, T_EQUALS); + + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match (lexer, T_ALL)) + { + } + else if (lex_match_id (lexer, "DEFAULTS")) + { + } + else if (lex_match_id (lexer, "R")) + { + } + else if (lex_match_id (lexer, "COEFF")) + { + } + else if (lex_match_id (lexer, "ANOVA")) + { + } + else if (lex_match_id (lexer, "BCOV")) + { + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + } + else if (lex_match_id (lexer, "SAVE")) + { + lex_match (lexer, T_EQUALS); + + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "PRED")) + { + regression.pred = true; + } + else if (lex_match_id (lexer, "RESID")) + { + regression.resid = true; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + + if (!regression.vars) + { + dict_get_vars (dict, ®ression.vars, ®ression.n_vars, 0); + } + + + regression.models = xcalloc (regression.n_dep_vars, sizeof *regression.models); + + { + struct casegrouper *grouper; + struct casereader *group; + bool ok; + + grouper = casegrouper_create_splits (proc_open (ds), dict); + while (casegrouper_get_next_group (grouper, &group)) + run_regression (®ression, group); + ok = casegrouper_destroy (grouper); + ok = proc_commit (ds) && ok; + } + + if (regression.pred || regression.resid ) + subcommand_save (®ression); + + + return CMD_SUCCESS; + + error: + return CMD_FAILURE; +} + + +static size_t +get_n_all_vars (const struct regression *cmd) +{ + size_t result = cmd->n_vars; + size_t i; + size_t j; + + result += cmd->n_dep_vars; + for (i = 0; i < cmd->n_dep_vars; i++) + { + for (j = 0; j < cmd->n_vars; j++) + { + if (cmd->vars[j] == cmd->dep_vars[i]) + { + result--; + } + } + } + return result; +} + +static void +fill_all_vars (const struct variable **vars, const struct regression *cmd) +{ + size_t i; + size_t j; + bool absent; + + for (i = 0; i < cmd->n_vars; i++) + { + vars[i] = cmd->vars[i]; + } + for (i = 0; i < cmd->n_dep_vars; i++) + { + absent = true; + for (j = 0; j < cmd->n_vars; j++) + { + if (cmd->dep_vars[i] == cmd->vars[j]) + { + absent = false; + break; + } + } + if (absent) + { + vars[i + cmd->n_vars] = cmd->dep_vars[i]; + } + } +} + +/* + Is variable k the dependent variable? +*/ +static bool +is_depvar (const struct regression *cmd, size_t k, const struct variable *v) +{ + return v == cmd->vars[k]; +} + + +/* Identify the explanatory variables in v_variables. Returns + the number of independent variables. */ +static int +identify_indep_vars (const struct regression *cmd, + const struct variable **indep_vars, + const struct variable *depvar) +{ + int n_indep_vars = 0; + int i; + + for (i = 0; i < cmd->n_vars; i++) + if (!is_depvar (cmd, i, depvar)) + indep_vars[n_indep_vars++] = cmd->vars[i]; + if ((n_indep_vars < 1) && is_depvar (cmd, 0, depvar)) + { + /* + There is only one independent variable, and it is the same + as the dependent variable. Print a warning and continue. + */ + msg (SE, + gettext ("The dependent variable is equal to the independent variable." + "The least squares line is therefore Y=X." + "Standard errors and related statistics may be meaningless.")); + n_indep_vars = 1; + indep_vars[0] = cmd->vars[0]; + } + return n_indep_vars; +} + + +static double +fill_covariance (gsl_matrix *cov, struct covariance *all_cov, + const struct variable **vars, + size_t n_vars, const struct variable *dep_var, + const struct variable **all_vars, size_t n_all_vars, + double *means) +{ + size_t i; + size_t j; + size_t dep_subscript; + size_t *rows; + const gsl_matrix *ssizes; + const gsl_matrix *mean_matrix; + const gsl_matrix *ssize_matrix; + double result = 0.0; + + gsl_matrix *cm = covariance_calculate_unnormalized (all_cov); + + if ( cm == NULL) + return 0; + + rows = xnmalloc (cov->size1 - 1, sizeof (*rows)); + + for (i = 0; i < n_all_vars; i++) + { + for (j = 0; j < n_vars; j++) + { + if (vars[j] == all_vars[i]) + { + rows[j] = i; + } + } + if (all_vars[i] == dep_var) + { + dep_subscript = i; + } + } + mean_matrix = covariance_moments (all_cov, MOMENT_MEAN); + ssize_matrix = covariance_moments (all_cov, MOMENT_NONE); + for (i = 0; i < cov->size1 - 1; i++) + { + means[i] = gsl_matrix_get (mean_matrix, rows[i], 0) + / gsl_matrix_get (ssize_matrix, rows[i], 0); + for (j = 0; j < cov->size2 - 1; j++) + { + gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j])); + gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i])); + } + } + means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0) + / gsl_matrix_get (ssize_matrix, dep_subscript, 0); + ssizes = covariance_moments (all_cov, MOMENT_NONE); + result = gsl_matrix_get (ssizes, dep_subscript, rows[0]); + for (i = 0; i < cov->size1 - 1; i++) + { + gsl_matrix_set (cov, i, cov->size1 - 1, + gsl_matrix_get (cm, rows[i], dep_subscript)); + gsl_matrix_set (cov, cov->size1 - 1, i, + gsl_matrix_get (cm, rows[i], dep_subscript)); + if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript)) + { + result = gsl_matrix_get (ssizes, rows[i], dep_subscript); + } + } + gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, + gsl_matrix_get (cm, dep_subscript, dep_subscript)); + free (rows); + gsl_matrix_free (cm); + return result; +} + + +/* + STATISTICS subcommand output functions. +*/ +static void reg_stats_r (linreg *, void *); +static void reg_stats_coeff (linreg *, void *); +static void reg_stats_anova (linreg *, void *); +static void reg_stats_bcov (linreg *, void *); + +static void statistics_keyword_output (void (*)(linreg *, void *), + bool, linreg *, void *); + + + +static void +subcommand_statistics (const struct regression *cmd , linreg * c, void *aux) +{ + statistics_keyword_output (reg_stats_r, cmd->r, c, aux); + statistics_keyword_output (reg_stats_anova, cmd->anova, c, aux); + statistics_keyword_output (reg_stats_coeff, cmd->coeff, c, aux); + statistics_keyword_output (reg_stats_bcov, cmd->bcov, c, aux); +} + + +static void +run_regression (const struct regression *cmd, struct casereader *input) +{ + size_t i; + int n_indep = 0; + int k; + double *means; + struct ccase *c; + struct covariance *cov; + const struct variable **vars; + const struct variable **all_vars; + const struct variable *dep_var; + struct casereader *reader; + size_t n_all_vars; + + linreg **models = cmd->models; + + n_all_vars = get_n_all_vars (cmd); + all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); + fill_all_vars (all_vars, cmd); + vars = xnmalloc (cmd->n_vars, sizeof (*vars)); + means = xnmalloc (n_all_vars, sizeof (*means)); + cov = covariance_1pass_create (n_all_vars, all_vars, + dict_get_weight (dataset_dict (cmd->ds)), MV_ANY); + + reader = casereader_clone (input); + reader = casereader_create_filter_missing (reader, all_vars, n_all_vars, + MV_ANY, NULL, NULL); + + + for (; (c = casereader_read (reader)) != NULL; case_unref (c)) + { + covariance_accumulate (cov, c); + } + + for (k = 0; k < cmd->n_dep_vars; k++) + { + double n_data; + + gsl_matrix *this_cm; + dep_var = cmd->dep_vars[k]; + n_indep = identify_indep_vars (cmd, vars, dep_var); + + this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); + n_data = fill_covariance (this_cm, cov, vars, n_indep, + dep_var, all_vars, n_all_vars, means); + models[k] = linreg_alloc (dep_var, (const struct variable **) vars, + n_data, n_indep); + models[k]->depvar = dep_var; + for (i = 0; i < n_indep; i++) + { + linreg_set_indep_variable_mean (models[k], i, means[i]); + } + linreg_set_depvar_mean (models[k], means[i]); + /* + For large data sets, use QR decomposition. + */ + if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) + { + models[k]->method = LINREG_QR; + } + + if (n_data > 0) + { + /* + Find the least-squares estimates and other statistics. + */ + linreg_fit (this_cm, models[k]); + + if (!taint_has_tainted_successor (casereader_get_taint (input))) + { + subcommand_statistics (cmd, models[k], this_cm); + } + } + else + { + msg (SE, + _("No valid data found. This command was skipped.")); + linreg_free (models[k]); + models[k] = NULL; + } + gsl_matrix_free (this_cm); + } + + casereader_destroy (reader); + free (vars); + free (all_vars); + free (means); + casereader_destroy (input); + covariance_destroy (cov); +} + + + + + +static void +reg_stats_r (linreg *c, void *aux UNUSED) +{ + struct tab_table *t; + int n_rows = 2; + int n_cols = 5; + double rsq; + double adjrsq; + double std_error; + + assert (c != NULL); + rsq = linreg_ssreg (c) / linreg_sst (c); + adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c)); + std_error = sqrt (linreg_mse (c)); + t = tab_create (n_cols, n_rows); + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_0, 1, 0, 0); + + tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R")); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate")); + tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL); + tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL); + tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL); + tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL); + tab_title (t, _("Model Summary")); + tab_submit (t); +} + +/* + Table showing estimated regression coefficients. +*/ +static void +reg_stats_coeff (linreg * c, void *aux_) +{ + size_t j; + int n_cols = 7; + int n_rows; + int this_row; + double t_stat; + double pval; + double std_err; + double beta; + const char *label; + + const struct variable *v; + struct tab_table *t; + gsl_matrix *cov = aux_; + + assert (c != NULL); + n_rows = linreg_n_coeffs (c) + 3; + + t = tab_create (n_cols, n_rows); + tab_headers (t, 2, 0, 1, 0); + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_0, 1, 0, 0); + + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta")); + tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); + tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); + tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); + tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); + std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); + tab_double (t, 3, 1, 0, std_err, NULL); + tab_double (t, 4, 1, 0, 0.0, NULL); + t_stat = linreg_intercept (c) / std_err; + tab_double (t, 5, 1, 0, t_stat, NULL); + pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); + tab_double (t, 6, 1, 0, pval, NULL); + for (j = 0; j < linreg_n_coeffs (c); j++) + { + struct string tstr; + ds_init_empty (&tstr); + this_row = j + 2; + + v = linreg_indep_var (c, j); + label = var_to_string (v); + /* Do not overwrite the variable's name. */ + ds_put_cstr (&tstr, label); + tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); + /* + Regression coefficients. + */ + tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL); + /* + Standard error of the coefficients. + */ + std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); + tab_double (t, 3, this_row, 0, std_err, NULL); + /* + Standardized coefficient, i.e., regression coefficient + if all variables had unit variance. + */ + beta = sqrt (gsl_matrix_get (cov, j, j)); + beta *= linreg_coeff (c, j) / + sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); + tab_double (t, 4, this_row, 0, beta, NULL); + + /* + Test statistic for H0: coefficient is 0. + */ + t_stat = linreg_coeff (c, j) / std_err; + tab_double (t, 5, this_row, 0, t_stat, NULL); + /* + P values for the test statistic above. + */ + pval = + 2 * gsl_cdf_tdist_Q (fabs (t_stat), + (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); + tab_double (t, 6, this_row, 0, pval, NULL); + ds_destroy (&tstr); + } + tab_title (t, _("Coefficients")); + tab_submit (t); +} + +/* + Display the ANOVA table. +*/ +static void +reg_stats_anova (linreg * c, void *aux UNUSED) +{ + int n_cols = 7; + int n_rows = 4; + const double msm = linreg_ssreg (c) / linreg_dfmodel (c); + const double mse = linreg_mse (c); + const double F = msm / mse; + const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe); + + struct tab_table *t; + + assert (c != NULL); + t = tab_create (n_cols, n_rows); + tab_headers (t, 2, 0, 1, 0); + + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_0, 1, 0, 0); + + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square")); + tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F")); + tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); + + tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression")); + tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual")); + tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total")); + + /* Sums of Squares */ + tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL); + tab_double (t, 2, 3, 0, linreg_sst (c), NULL); + tab_double (t, 2, 2, 0, linreg_sse (c), NULL); + + + /* Degrees of freedom */ + tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm); + tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe); + tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft); + + /* Mean Squares */ + tab_double (t, 4, 1, TAB_RIGHT, msm, NULL); + tab_double (t, 4, 2, TAB_RIGHT, mse, NULL); + + tab_double (t, 5, 1, 0, F, NULL); + + tab_double (t, 6, 1, 0, pval, NULL); + + tab_title (t, _("ANOVA")); + tab_submit (t); +} + + +static void +reg_stats_bcov (linreg * c, void *aux UNUSED) +{ + int n_cols; + int n_rows; + int i; + int k; + int row; + int col; + const char *label; + struct tab_table *t; + + assert (c != NULL); + n_cols = c->n_indeps + 1 + 2; + n_rows = 2 * (c->n_indeps + 1); + t = tab_create (n_cols, n_rows); + tab_headers (t, 2, 0, 1, 0); + tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_0, 1, 0, 0); + tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model")); + tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances")); + for (i = 0; i < linreg_n_coeffs (c); i++) + { + const struct variable *v = linreg_indep_var (c, i); + label = var_to_string (v); + tab_text (t, 2, i, TAB_CENTER, label); + tab_text (t, i + 2, 0, TAB_CENTER, label); + for (k = 1; k < linreg_n_coeffs (c); k++) + { + col = (i <= k) ? k : i; + row = (i <= k) ? i : k; + tab_double (t, k + 2, i, TAB_CENTER, + gsl_matrix_get (c->cov, row, col), NULL); + } + } + tab_title (t, _("Coefficient Correlations")); + tab_submit (t); +} + +static void +statistics_keyword_output (void (*function) (linreg *, void *), + bool keyword, linreg * c, void *aux) +{ + if (keyword) + { + (*function) (c, aux); + } +} diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q deleted file mode 100644 index ca1f67da0e..0000000000 --- a/src/language/stats/regression.q +++ /dev/null @@ -1,1036 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009, 2010, 2011 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#include - -#include -#include -#include -#include -#include - -#include "data/case.h" -#include "data/casegrouper.h" -#include "data/casereader.h" -#include "data/dataset.h" -#include "data/dictionary.h" -#include "data/missing-values.h" -#include "data/transformations.h" -#include "data/value-labels.h" -#include "data/variable.h" -#include "language/command.h" -#include "language/data-io/file-handle.h" -#include "language/dictionary/split-file.h" -#include "language/lexer/lexer.h" -#include "libpspp/compiler.h" -#include "libpspp/message.h" -#include "libpspp/taint.h" -#include "math/covariance.h" -#include "math/linreg.h" -#include "math/moments.h" -#include "output/tab.h" - -#include "gl/intprops.h" -#include "gl/xalloc.h" - -#include "gettext.h" -#define _(msgid) gettext (msgid) - -#define REG_LARGE_DATA 1000 - -/* (headers) */ - -/* (specification) - "REGRESSION" (regression_): - *variables=custom; - +statistics[st_]=r, - coeff, - anova, - outs, - zpp, - label, - sha, - ci, - bcov, - ses, - xtx, - collin, - tol, - selection, - f, - defaults, - all; - ^dependent=varlist; - +save[sv_]=resid,pred; - +method=enter. -*/ -/* (declarations) */ -/* (functions) */ -static struct cmd_regression cmd; - -/* - Moments for each of the variables used. - */ -struct moments_var -{ - struct moments1 *m; - const struct variable *v; -}; - -/* - Transformations for saving predicted values - and residuals, etc. - */ -struct reg_trns -{ - int n_trns; /* Number of transformations. */ - int trns_id; /* Which trns is this one? */ - linreg *c; /* Linear model for this trns. */ -}; -/* - Variables used (both explanatory and response). - */ -static const struct variable **v_variables; - -/* - Number of variables. - */ -static size_t n_variables; - -static bool run_regression (struct casereader *, struct cmd_regression *, - struct dataset *, linreg **); - -/* - STATISTICS subcommand output functions. - */ -static void reg_stats_r (linreg *, void *); -static void reg_stats_coeff (linreg *, void *); -static void reg_stats_anova (linreg *, void *); -static void reg_stats_outs (linreg *, void *); -static void reg_stats_zpp (linreg *, void *); -static void reg_stats_label (linreg *, void *); -static void reg_stats_sha (linreg *, void *); -static void reg_stats_ci (linreg *, void *); -static void reg_stats_f (linreg *, void *); -static void reg_stats_bcov (linreg *, void *); -static void reg_stats_ses (linreg *, void *); -static void reg_stats_xtx (linreg *, void *); -static void reg_stats_collin (linreg *, void *); -static void reg_stats_tol (linreg *, void *); -static void reg_stats_selection (linreg *, void *); -static void statistics_keyword_output (void (*)(linreg *, void *), - int, linreg *, void *); - -static void -reg_stats_r (linreg *c, void *aux UNUSED) -{ - struct tab_table *t; - int n_rows = 2; - int n_cols = 5; - double rsq; - double adjrsq; - double std_error; - - assert (c != NULL); - rsq = linreg_ssreg (c) / linreg_sst (c); - adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c)); - std_error = sqrt (linreg_mse (c)); - t = tab_create (n_cols, n_rows); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - - tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R")); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate")); - tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL); - tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL); - tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL); - tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL); - tab_title (t, _("Model Summary")); - tab_submit (t); -} - -/* - Table showing estimated regression coefficients. - */ -static void -reg_stats_coeff (linreg * c, void *aux_) -{ - size_t j; - int n_cols = 7; - int n_rows; - int this_row; - double t_stat; - double pval; - double std_err; - double beta; - const char *label; - - const struct variable *v; - struct tab_table *t; - gsl_matrix *cov = aux_; - - assert (c != NULL); - n_rows = linreg_n_coeffs (c) + 3; - - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); - std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); - tab_double (t, 3, 1, 0, std_err, NULL); - tab_double (t, 4, 1, 0, 0.0, NULL); - t_stat = linreg_intercept (c) / std_err; - tab_double (t, 5, 1, 0, t_stat, NULL); - pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, 1, 0, pval, NULL); - for (j = 0; j < linreg_n_coeffs (c); j++) - { - struct string tstr; - ds_init_empty (&tstr); - this_row = j + 2; - - v = linreg_indep_var (c, j); - label = var_to_string (v); - /* Do not overwrite the variable's name. */ - ds_put_cstr (&tstr, label); - tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); - /* - Regression coefficients. - */ - tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL); - /* - Standard error of the coefficients. - */ - std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); - tab_double (t, 3, this_row, 0, std_err, NULL); - /* - Standardized coefficient, i.e., regression coefficient - if all variables had unit variance. - */ - beta = sqrt (gsl_matrix_get (cov, j, j)); - beta *= linreg_coeff (c, j) / - sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); - tab_double (t, 4, this_row, 0, beta, NULL); - - /* - Test statistic for H0: coefficient is 0. - */ - t_stat = linreg_coeff (c, j) / std_err; - tab_double (t, 5, this_row, 0, t_stat, NULL); - /* - P values for the test statistic above. - */ - pval = - 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, this_row, 0, pval, NULL); - ds_destroy (&tstr); - } - tab_title (t, _("Coefficients")); - tab_submit (t); -} - -/* - Display the ANOVA table. - */ -static void -reg_stats_anova (linreg * c, void *aux UNUSED) -{ - int n_cols = 7; - int n_rows = 4; - const double msm = linreg_ssreg (c) / linreg_dfmodel (c); - const double mse = linreg_mse (c); - const double F = msm / mse; - const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe); - - struct tab_table *t; - - assert (c != NULL); - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); - - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression")); - tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual")); - tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total")); - - /* Sums of Squares */ - tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL); - tab_double (t, 2, 3, 0, linreg_sst (c), NULL); - tab_double (t, 2, 2, 0, linreg_sse (c), NULL); - - - /* Degrees of freedom */ - tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm); - tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe); - tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft); - - /* Mean Squares */ - tab_double (t, 4, 1, TAB_RIGHT, msm, NULL); - tab_double (t, 4, 2, TAB_RIGHT, mse, NULL); - - tab_double (t, 5, 1, 0, F, NULL); - - tab_double (t, 6, 1, 0, pval, NULL); - - tab_title (t, _("ANOVA")); - tab_submit (t); -} - -static void -reg_stats_outs (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} - -static void -reg_stats_zpp (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} - -static void -reg_stats_label (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} - -static void -reg_stats_sha (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_ci (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_f (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_bcov (linreg * c, void *aux UNUSED) -{ - int n_cols; - int n_rows; - int i; - int k; - int row; - int col; - const char *label; - struct tab_table *t; - - assert (c != NULL); - n_cols = c->n_indeps + 1 + 2; - n_rows = 2 * (c->n_indeps + 1); - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model")); - tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances")); - for (i = 0; i < linreg_n_coeffs (c); i++) - { - const struct variable *v = linreg_indep_var (c, i); - label = var_to_string (v); - tab_text (t, 2, i, TAB_CENTER, label); - tab_text (t, i + 2, 0, TAB_CENTER, label); - for (k = 1; k < linreg_n_coeffs (c); k++) - { - col = (i <= k) ? k : i; - row = (i <= k) ? i : k; - tab_double (t, k + 2, i, TAB_CENTER, - gsl_matrix_get (c->cov, row, col), NULL); - } - } - tab_title (t, _("Coefficient Correlations")); - tab_submit (t); -} -static void -reg_stats_ses (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_xtx (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_collin (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_tol (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} -static void -reg_stats_selection (linreg * c, void *aux UNUSED) -{ - assert (c != NULL); -} - -static void -statistics_keyword_output (void (*function) (linreg *, void *), - int keyword, linreg * c, void *aux) -{ - if (keyword) - { - (*function) (c, aux); - } -} - -static void -subcommand_statistics (int *keywords, linreg * c, void *aux) -{ - /* - The order here must match the order in which the STATISTICS - keywords appear in the specification section above. - */ - enum - { r, - coeff, - anova, - outs, - zpp, - label, - sha, - ci, - bcov, - ses, - xtx, - collin, - tol, - selection, - f, - defaults, - all - }; - int i; - int d = 1; - - if (keywords[all]) - { - /* - Set everything but F. - */ - for (i = 0; i < f; i++) - { - keywords[i] = 1; - } - } - else - { - for (i = 0; i < all; i++) - { - if (keywords[i]) - { - d = 0; - } - } - /* - Default output: ANOVA table, parameter estimates, - and statistics for variables not entered into model, - if appropriate. - */ - if (keywords[defaults] | d) - { - keywords[anova] = 1; - keywords[outs] = 1; - keywords[coeff] = 1; - keywords[r] = 1; - } - } - statistics_keyword_output (reg_stats_r, keywords[r], c, aux); - statistics_keyword_output (reg_stats_anova, keywords[anova], c, aux); - statistics_keyword_output (reg_stats_coeff, keywords[coeff], c, aux); - statistics_keyword_output (reg_stats_outs, keywords[outs], c, aux); - statistics_keyword_output (reg_stats_zpp, keywords[zpp], c, aux); - statistics_keyword_output (reg_stats_label, keywords[label], c, aux); - statistics_keyword_output (reg_stats_sha, keywords[sha], c, aux); - statistics_keyword_output (reg_stats_ci, keywords[ci], c, aux); - statistics_keyword_output (reg_stats_f, keywords[f], c, aux); - statistics_keyword_output (reg_stats_bcov, keywords[bcov], c, aux); - statistics_keyword_output (reg_stats_ses, keywords[ses], c, aux); - statistics_keyword_output (reg_stats_xtx, keywords[xtx], c, aux); - statistics_keyword_output (reg_stats_collin, keywords[collin], c, aux); - statistics_keyword_output (reg_stats_tol, keywords[tol], c, aux); - statistics_keyword_output (reg_stats_selection, keywords[selection], c, aux); -} - -/* - Free the transformation. Free its linear model if this - transformation is the last one. - */ -static bool -regression_trns_free (void *t_) -{ - bool result = true; - struct reg_trns *t = t_; - - if (t->trns_id == t->n_trns) - { - result = linreg_free (t->c); - } - free (t); - - return result; -} - -/* - Gets the predicted values. - */ -static int -regression_trns_pred_proc (void *t_, struct ccase **c, - casenumber case_idx UNUSED) -{ - size_t i; - size_t n_vals; - struct reg_trns *trns = t_; - linreg *model; - union value *output = NULL; - const union value *tmp; - double *vals; - const struct variable **vars = NULL; - - assert (trns != NULL); - model = trns->c; - assert (model != NULL); - assert (model->depvar != NULL); - assert (model->pred != NULL); - - vars = linreg_get_vars (model); - n_vals = linreg_n_coeffs (model); - vals = xnmalloc (n_vals, sizeof (*vals)); - *c = case_unshare (*c); - - output = case_data_rw (*c, model->pred); - - for (i = 0; i < n_vals; i++) - { - tmp = case_data (*c, vars[i]); - vals[i] = tmp->f; - } - output->f = linreg_predict (model, vals, n_vals); - free (vals); - return TRNS_CONTINUE; -} - -/* - Gets the residuals. - */ -static int -regression_trns_resid_proc (void *t_, struct ccase **c, - casenumber case_idx UNUSED) -{ - size_t i; - size_t n_vals; - struct reg_trns *trns = t_; - linreg *model; - union value *output = NULL; - const union value *tmp; - double *vals = NULL; - double obs; - const struct variable **vars = NULL; - - assert (trns != NULL); - model = trns->c; - assert (model != NULL); - assert (model->depvar != NULL); - assert (model->resid != NULL); - - vars = linreg_get_vars (model); - n_vals = linreg_n_coeffs (model); - - vals = xnmalloc (n_vals, sizeof (*vals)); - *c = case_unshare (*c); - output = case_data_rw (*c, model->resid); - assert (output != NULL); - - for (i = 0; i < n_vals; i++) - { - tmp = case_data (*c, vars[i]); - vals[i] = tmp->f; - } - tmp = case_data (*c, model->depvar); - obs = tmp->f; - output->f = linreg_residual (model, obs, vals, n_vals); - free (vals); - - return TRNS_CONTINUE; -} - -static char * -reg_get_name (const struct dictionary *dict, const char *prefix) -{ - char *name; - int i; - - /* XXX handle too-long prefixes */ - name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1); - for (i = 1; ; i++) - { - sprintf (name, "%s%d", prefix, i); - if (dict_lookup_var (dict, name) == NULL) - return name; - } -} - -static void -reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, - linreg * c, struct variable **v, int n_trns) -{ - struct dictionary *dict = dataset_dict (ds); - static int trns_index = 1; - char *name; - struct variable *new_var; - struct reg_trns *t = NULL; - - t = xmalloc (sizeof (*t)); - t->trns_id = trns_index; - t->n_trns = n_trns; - t->c = c; - - name = reg_get_name (dict, prefix); - new_var = dict_create_var_assert (dict, name, 0); - free (name); - - *v = new_var; - add_transformation (ds, f, regression_trns_free, t); - trns_index++; -} -static void -subcommand_save (struct dataset *ds, int save, linreg ** models) -{ - linreg **lc; - int n_trns = 0; - int i; - - if (save) - { - /* Count the number of transformations we will need. */ - for (i = 0; i < REGRESSION_SV_count; i++) - { - if (cmd.a_save[i]) - { - n_trns++; - } - } - n_trns *= cmd.n_dependent; - - for (lc = models; lc < models + cmd.n_dependent; lc++) - { - if (*lc != NULL) - { - if ((*lc)->depvar != NULL) - { - if (cmd.a_save[REGRESSION_SV_RESID]) - { - reg_save_var (ds, "RES", regression_trns_resid_proc, *lc, - &(*lc)->resid, n_trns); - } - if (cmd.a_save[REGRESSION_SV_PRED]) - { - reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc, - &(*lc)->pred, n_trns); - } - } - } - } - } - else - { - for (lc = models; lc < models + cmd.n_dependent; lc++) - { - if (*lc != NULL) - { - linreg_free (*lc); - } - } - } -} - -int -cmd_regression (struct lexer *lexer, struct dataset *ds) -{ - struct casegrouper *grouper; - struct casereader *group; - linreg **models; - bool ok; - size_t i; - - if (!parse_regression (lexer, ds, &cmd, NULL)) - { - return CMD_FAILURE; - } - - models = xnmalloc (cmd.n_dependent, sizeof *models); - for (i = 0; i < cmd.n_dependent; i++) - { - models[i] = NULL; - } - - /* Data pass. */ - grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); - while (casegrouper_get_next_group (grouper, &group)) - run_regression (group, &cmd, ds, models); - ok = casegrouper_destroy (grouper); - ok = proc_commit (ds) && ok; - - subcommand_save (ds, cmd.sbc_save, models); - free (v_variables); - free (models); - free_regression (&cmd); - - return ok ? CMD_SUCCESS : CMD_FAILURE; -} - -/* - Is variable k the dependent variable? - */ -static bool -is_depvar (size_t k, const struct variable *v) -{ - return v == v_variables[k]; -} - -/* Parser for the variables sub command */ -static int -regression_custom_variables (struct lexer *lexer, struct dataset *ds, - struct cmd_regression *cmd UNUSED, - void *aux UNUSED) -{ - const struct dictionary *dict = dataset_dict (ds); - - lex_match (lexer, T_EQUALS); - - if ((lex_token (lexer) != T_ID - || dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL) - && lex_token (lexer) != T_ALL) - return 2; - - - if (!parse_variables_const - (lexer, dict, &v_variables, &n_variables, PV_NONE)) - { - free (v_variables); - return 0; - } - assert (n_variables); - - return 1; -} - -/* Identify the explanatory variables in v_variables. Returns - the number of independent variables. */ -static int -identify_indep_vars (const struct variable **indep_vars, - const struct variable *depvar) -{ - int n_indep_vars = 0; - int i; - - for (i = 0; i < n_variables; i++) - if (!is_depvar (i, depvar)) - indep_vars[n_indep_vars++] = v_variables[i]; - if ((n_indep_vars < 1) && is_depvar (0, depvar)) - { - /* - There is only one independent variable, and it is the same - as the dependent variable. Print a warning and continue. - */ - msg (SE, - gettext ("The dependent variable is equal to the independent variable." - "The least squares line is therefore Y=X." - "Standard errors and related statistics may be meaningless.")); - n_indep_vars = 1; - indep_vars[0] = v_variables[0]; - } - return n_indep_vars; -} - -static double -fill_covariance (gsl_matrix *cov, struct covariance *all_cov, - const struct variable **vars, - size_t n_vars, const struct variable *dep_var, - const struct variable **all_vars, size_t n_all_vars, - double *means) -{ - size_t i; - size_t j; - size_t dep_subscript; - size_t *rows; - const gsl_matrix *ssizes; - const gsl_matrix *mean_matrix; - const gsl_matrix *ssize_matrix; - double result = 0.0; - - gsl_matrix *cm = covariance_calculate_unnormalized (all_cov); - - if ( cm == NULL) - return 0; - - rows = xnmalloc (cov->size1 - 1, sizeof (*rows)); - - for (i = 0; i < n_all_vars; i++) - { - for (j = 0; j < n_vars; j++) - { - if (vars[j] == all_vars[i]) - { - rows[j] = i; - } - } - if (all_vars[i] == dep_var) - { - dep_subscript = i; - } - } - mean_matrix = covariance_moments (all_cov, MOMENT_MEAN); - ssize_matrix = covariance_moments (all_cov, MOMENT_NONE); - for (i = 0; i < cov->size1 - 1; i++) - { - means[i] = gsl_matrix_get (mean_matrix, rows[i], 0) - / gsl_matrix_get (ssize_matrix, rows[i], 0); - for (j = 0; j < cov->size2 - 1; j++) - { - gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j])); - gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i])); - } - } - means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0) - / gsl_matrix_get (ssize_matrix, dep_subscript, 0); - ssizes = covariance_moments (all_cov, MOMENT_NONE); - result = gsl_matrix_get (ssizes, dep_subscript, rows[0]); - for (i = 0; i < cov->size1 - 1; i++) - { - gsl_matrix_set (cov, i, cov->size1 - 1, - gsl_matrix_get (cm, rows[i], dep_subscript)); - gsl_matrix_set (cov, cov->size1 - 1, i, - gsl_matrix_get (cm, rows[i], dep_subscript)); - if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript)) - { - result = gsl_matrix_get (ssizes, rows[i], dep_subscript); - } - } - gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, - gsl_matrix_get (cm, dep_subscript, dep_subscript)); - free (rows); - gsl_matrix_free (cm); - return result; -} -static size_t -get_n_all_vars (struct cmd_regression *cmd) -{ - size_t result = n_variables; - size_t i; - size_t j; - - result += cmd->n_dependent; - for (i = 0; i < cmd->n_dependent; i++) - { - for (j = 0; j < n_variables; j++) - { - if (v_variables[j] == cmd->v_dependent[i]) - { - result--; - } - } - } - return result; -} -static void -fill_all_vars (const struct variable **vars, struct cmd_regression *cmd) -{ - size_t i; - size_t j; - bool absent; - - for (i = 0; i < n_variables; i++) - { - vars[i] = v_variables[i]; - } - for (i = 0; i < cmd->n_dependent; i++) - { - absent = true; - for (j = 0; j < n_variables; j++) - { - if (cmd->v_dependent[i] == v_variables[j]) - { - absent = false; - break; - } - } - if (absent) - { - vars[i + n_variables] = cmd->v_dependent[i]; - } - } -} -static bool -run_regression (struct casereader *input, struct cmd_regression *cmd, - struct dataset *ds, linreg **models) -{ - size_t i; - int n_indep = 0; - int k; - double n_data; - double *means; - struct ccase *c; - struct covariance *cov; - const struct variable **vars; - const struct variable **all_vars; - const struct variable *dep_var; - struct casereader *reader; - const struct dictionary *dict; - size_t n_all_vars; - - assert (models != NULL); - - for (i = 0; i < n_variables; i++) - { - if (!var_is_numeric (v_variables[i])) - { - msg (SE, _("REGRESSION requires numeric variables.")); - return false; - } - } - - c = casereader_peek (input, 0); - if (c == NULL) - { - casereader_destroy (input); - return true; - } - output_split_file_values (ds, c); - case_unref (c); - - dict = dataset_dict (ds); - if (!v_variables) - { - dict_get_vars (dict, &v_variables, &n_variables, 0); - } - n_all_vars = get_n_all_vars (cmd); - all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); - fill_all_vars (all_vars, cmd); - vars = xnmalloc (n_variables, sizeof (*vars)); - means = xnmalloc (n_all_vars, sizeof (*means)); - cov = covariance_1pass_create (n_all_vars, all_vars, - dict_get_weight (dict), MV_ANY); - - reader = casereader_clone (input); - reader = casereader_create_filter_missing (reader, all_vars, n_all_vars, - MV_ANY, NULL, NULL); - - - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) - { - covariance_accumulate (cov, c); - } - - for (k = 0; k < cmd->n_dependent; k++) - { - gsl_matrix *this_cm; - dep_var = cmd->v_dependent[k]; - n_indep = identify_indep_vars (vars, dep_var); - - this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); - n_data = fill_covariance (this_cm, cov, vars, n_indep, - dep_var, all_vars, n_all_vars, means); - models[k] = linreg_alloc (dep_var, (const struct variable **) vars, - n_data, n_indep); - models[k]->depvar = dep_var; - for (i = 0; i < n_indep; i++) - { - linreg_set_indep_variable_mean (models[k], i, means[i]); - } - linreg_set_depvar_mean (models[k], means[i]); - /* - For large data sets, use QR decomposition. - */ - if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) - { - models[k]->method = LINREG_QR; - } - - if (n_data > 0) - { - /* - Find the least-squares estimates and other statistics. - */ - linreg_fit (this_cm, models[k]); - - if (!taint_has_tainted_successor (casereader_get_taint (input))) - { - subcommand_statistics (cmd->a_statistics, models[k], this_cm); - } - } - else - { - msg (SE, - gettext ("No valid data found. This command was skipped.")); - linreg_free (models[k]); - models[k] = NULL; - } - gsl_matrix_free (this_cm); - } - - casereader_destroy (reader); - free (vars); - free (all_vars); - free (means); - casereader_destroy (input); - covariance_destroy (cov); - - return true; -} - -/* - Local Variables: - mode: c - End: -*/ diff --git a/tests/output/ascii.at b/tests/output/ascii.at index df1263eb01..07b440efb5 100644 --- a/tests/output/ascii.at +++ b/tests/output/ascii.at @@ -555,7 +555,7 @@ REGRESSION /DEPENDENT= x y /STATISTICS=COEFF R ANOVA. ]) -AT_CHECK([pspp ascii.sps], [1], [dnl +AT_CHECK([pspp ascii.sps], [0], [dnl SET PRINTBACK=ON. DATA LIST LIST /x * y * a (a23). @@ -577,9 +577,12 @@ END DATA. REGRESSION /VARIABLES= a + +ascii.sps:11: warning: REGRESSION: a is not a numeric variable. It will not be +included in the variable list. + /DEPENDENT= x y /STATISTICS=COEFF R ANOVA. - -ascii.sps:12: error: REGRESSION: REGRESSION requires numeric variables. ]) + AT_CLEANUP