X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.c;h=0ef739a4e04e1d31a7a53b0089322ea9b8a7ef61;hb=bea96fcb5c8cf6ffb179a830ecb39f52ff2c5db8;hp=de3194d76dc6d7b49dc05e2e3e957ad1c31dd807;hpb=4443108e98b43b196f3f07217c6ec8bd96581367;p=pspp diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c index de3194d76d..0ef739a4e0 100644 --- a/src/language/stats/regression.c +++ b/src/language/stats/regression.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. + Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ #include +#include #include #include @@ -52,6 +53,17 @@ #define REG_LARGE_DATA 1000 +#define STATS_R 1 +#define STATS_COEFF 2 +#define STATS_ANOVA 4 +#define STATS_OUTS 8 +#define STATS_CI 16 +#define STATS_BCOV 32 + +#define STATS_DEFAULT (STATS_R | STATS_COEFF | STATS_ANOVA | STATS_OUTS) + + + struct regression { struct dataset *ds; @@ -62,25 +74,15 @@ struct regression const struct variable **dep_vars; size_t n_dep_vars; - bool r; - bool coeff; - bool anova; - bool bcov; - + unsigned int stats; + double ci; bool resid; bool pred; }; -struct per_split_ws -{ - linreg **models; -}; - struct regression_workspace { - struct per_split_ws *psw; - /* The new variables which will be introduced by /SAVE */ const struct variable **predvars; const struct variable **residvars; @@ -99,7 +101,6 @@ struct regression_workspace }; static void run_regression (const struct regression *cmd, - struct per_split_ws *psw, struct regression_workspace *ws, struct casereader *input); @@ -190,20 +191,15 @@ save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED) int cmd_regression (struct lexer *lexer, struct dataset *ds) { - 
int i; - int n_splits = 0; struct regression_workspace workspace; struct regression regression; const struct dictionary *dict = dataset_dict (ds); bool save; - workspace.psw = NULL; memset (&regression, 0, sizeof (struct regression)); - regression.anova = true; - regression.coeff = true; - regression.r = true; - + regression.ci = 0.95; + regression.stats = STATS_DEFAULT; regression.pred = false; regression.resid = false; @@ -232,6 +228,9 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) if (!lex_force_match (lexer, T_EQUALS)) goto error; + free (regression.dep_vars); + regression.n_dep_vars = 0; + if (!parse_variables_const (lexer, dict, &regression.dep_vars, &regression.n_dep_vars, @@ -256,21 +255,38 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) { if (lex_match (lexer, T_ALL)) { + regression.stats = ~0; } else if (lex_match_id (lexer, "DEFAULTS")) { + regression.stats |= STATS_DEFAULT; } else if (lex_match_id (lexer, "R")) { + regression.stats |= STATS_R; } else if (lex_match_id (lexer, "COEFF")) { + regression.stats |= STATS_COEFF; } else if (lex_match_id (lexer, "ANOVA")) { + regression.stats |= STATS_ANOVA; } else if (lex_match_id (lexer, "BCOV")) { + regression.stats |= STATS_BCOV; + } + else if (lex_match_id (lexer, "CI")) + { + regression.stats |= STATS_CI; + + if (lex_match (lexer, T_LPAREN)) + { + regression.ci = lex_number (lexer) / 100.0; + lex_get (lexer); + lex_force_match (lexer, T_RPAREN); + } } else { @@ -359,7 +375,6 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) } - n_splits = 0; { struct casegrouper *grouper; struct casereader *group; @@ -370,9 +385,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) while (casegrouper_get_next_group (grouper, &group)) { - workspace.psw = xrealloc (workspace.psw, ++n_splits * sizeof (*workspace.psw)); - - run_regression (&regression, &workspace.psw[n_splits - 1], + run_regression (&regression, &workspace, group); @@ -394,17 +407,6 @@ cmd_regression (struct lexer *lexer, struct
dataset *ds) add_transformation (ds, save_trans_func, save_trans_free, save_trans_data); } - for (i = 0; i < n_splits; ++i) - { - int k; - - for (k = 0; k < regression.n_dep_vars; ++k) - linreg_unref (workspace.psw[i].models[k]); - - free (workspace.psw[i].models); - } - free (workspace.psw); - free (regression.vars); free (regression.dep_vars); @@ -443,17 +445,17 @@ get_n_all_vars (const struct regression *cmd) static void fill_all_vars (const struct variable **vars, const struct regression *cmd) { + size_t x = 0; size_t i; - size_t j; - bool absent; - for (i = 0; i < cmd->n_vars; i++) { vars[i] = cmd->vars[i]; } + for (i = 0; i < cmd->n_dep_vars; i++) { - absent = true; + size_t j; + bool absent = true; for (j = 0; j < cmd->n_vars; j++) { if (cmd->dep_vars[i] == cmd->vars[j]) @@ -464,7 +466,7 @@ fill_all_vars (const struct variable **vars, const struct regression *cmd) } if (absent) { - vars[i + cmd->n_vars] = cmd->dep_vars[i]; + vars[cmd->n_vars + x++] = cmd->dep_vars[i]; } } } @@ -500,8 +502,8 @@ identify_indep_vars (const struct regression *cmd, */ msg (SW, gettext - ("The dependent variable is equal to the independent variable." - "The least squares line is therefore Y=X." + ("The dependent variable is equal to the independent variable. " + "The least squares line is therefore Y=X. " "Standard errors and related statistics may be meaningless.")); n_indep_vars = 1; indep_vars[0] = cmd->vars[0]; @@ -586,7 +588,7 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, STATISTICS subcommand output functions. 
*/ static void reg_stats_r (const linreg *, const struct variable *); -static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *); +static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *, const struct regression *); static void reg_stats_anova (const linreg *, const struct variable *); static void reg_stats_bcov (const linreg *, const struct variable *); @@ -595,27 +597,27 @@ static void subcommand_statistics (const struct regression *cmd, const linreg * c, const gsl_matrix * cm, const struct variable *var) { - if (cmd->r) + if (cmd->stats & STATS_R) reg_stats_r (c, var); - if (cmd->anova) + if (cmd->stats & STATS_ANOVA) reg_stats_anova (c, var); - if (cmd->coeff) - reg_stats_coeff (c, cm, var); + if (cmd->stats & STATS_COEFF) + reg_stats_coeff (c, cm, var, cmd); - if (cmd->bcov) + if (cmd->stats & STATS_BCOV) reg_stats_bcov (c, var); } static void run_regression (const struct regression *cmd, - struct per_split_ws *psw, struct regression_workspace *ws, struct casereader *input) { size_t i; + linreg **models; int k; struct ccase *c; @@ -646,29 +648,28 @@ run_regression (const struct regression *cmd, casereader_destroy (r); } - psw->models = xcalloc (cmd->n_dep_vars, sizeof (*psw->models)); + models = xcalloc (cmd->n_dep_vars, sizeof (*models)); for (k = 0; k < cmd->n_dep_vars; k++) { - const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars)); const struct variable *dep_var = cmd->dep_vars[k]; int n_indep = identify_indep_vars (cmd, vars, dep_var); gsl_matrix *this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); double n_data = fill_covariance (this_cm, cov, vars, n_indep, dep_var, all_vars, n_all_vars, means); - psw->models[k] = linreg_alloc (dep_var, vars, n_data, n_indep); - psw->models[k]->depvar = dep_var; + models[k] = linreg_alloc (dep_var, vars, n_data, n_indep); + models[k]->depvar = dep_var; for (i = 0; i < n_indep; i++) { - linreg_set_indep_variable_mean (psw->models[k], i, 
means[i]); + linreg_set_indep_variable_mean (models[k], i, means[i]); } - linreg_set_depvar_mean (psw->models[k], means[i]); + linreg_set_depvar_mean (models[k], means[i]); /* For large data sets, use QR decomposition. */ if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) { - psw->models[k]->method = LINREG_QR; + models[k]->method = LINREG_QR; } if (n_data > 0) @@ -676,11 +677,11 @@ run_regression (const struct regression *cmd, /* Find the least-squares estimates and other statistics. */ - linreg_fit (this_cm, psw->models[k]); + linreg_fit (this_cm, models[k]); if (!taint_has_tainted_successor (casereader_get_taint (input))) { - subcommand_statistics (cmd, psw->models[k], this_cm, dep_var); + subcommand_statistics (cmd, models[k], this_cm, dep_var); } } else @@ -713,14 +714,14 @@ run_regression (const struct regression *cmd, if (cmd->pred) { - double pred = linreg_predict (psw->models[k], vals, n_indep); + double pred = linreg_predict (models[k], vals, n_indep); case_data_rw_idx (outc, k * ws->extras + ws->pred_idx)->f = pred; } if (cmd->resid) { - double obs = case_data (c, psw->models[k]->depvar)->f; - double res = linreg_residual (psw->models[k], obs, vals, n_indep); + double obs = case_data (c, models[k]->depvar)->f; + double res = linreg_residual (models[k], obs, vals, n_indep); case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res; } free (vals); @@ -733,15 +734,19 @@ run_regression (const struct regression *cmd, casereader_destroy (reader); + for (k = 0; k < cmd->n_dep_vars; k++) + { + linreg_unref (models[k]); + } + free (models); free (all_vars); free (means); casereader_destroy (input); covariance_destroy (cov); } - - + static void @@ -782,10 +787,11 @@ reg_stats_r (const linreg * c, const struct variable *var) Table showing estimated regression coefficients. 
*/ static void -reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var) +reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var, const struct regression *cmd) { size_t j; int n_cols = 7; + const int heading_rows = 2; int n_rows; int this_row; double t_stat; @@ -797,43 +803,68 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable const struct variable *v; struct tab_table *t; + const double df = linreg_n_obs (c) - linreg_n_coeffs (c) - 1; + double q = (1 - cmd->ci) / 2.0; /* 2-tailed test */ + double tval = gsl_cdf_tdist_Qinv (q, df); + assert (c != NULL); - n_rows = linreg_n_coeffs (c) + 3; + n_rows = linreg_n_coeffs (c) + heading_rows + 1; + + if (cmd->stats & STATS_CI) + n_cols += 2; t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows); tab_vline (t, TAL_2, 2, 0, n_rows - 1); tab_vline (t, TAL_0, 1, 0, 0); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); + + tab_hline (t, TAL_1, 2, 4, 1); + tab_joint_text (t, 2, 0, 3, 0, TAB_CENTER | TAT_TITLE, _("Unstandardized Coefficients")); + tab_text (t, 2, 1, TAB_CENTER | TAT_TITLE, _("B")); + tab_text (t, 3, 1, TAB_CENTER | TAT_TITLE, _("Std. 
Error")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Standardized Coefficients")); + tab_text (t, 4, 1, TAB_CENTER | TAT_TITLE, _("Beta")); + tab_text (t, 5, 1, TAB_CENTER | TAT_TITLE, _("t")); + tab_text (t, 6, 1, TAB_CENTER | TAT_TITLE, _("Sig.")); + tab_text (t, 1, heading_rows, TAB_LEFT | TAT_TITLE, _("(Constant)")); + tab_double (t, 2, heading_rows, 0, linreg_intercept (c), NULL); std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); - tab_double (t, 3, 1, 0, std_err, NULL); - tab_double (t, 4, 1, 0, 0.0, NULL); + + if (cmd->stats & STATS_CI) + { + double lower = linreg_intercept (c) - tval * std_err ; + double upper = linreg_intercept (c) + tval * std_err ; + tab_double (t, 7, heading_rows, 0, lower, NULL); + tab_double (t, 8, heading_rows, 0, upper, NULL); + + tab_joint_text_format (t, 7, 0, 8, 0, TAB_CENTER | TAT_TITLE, _("%g%% Confidence Interval for B"), cmd->ci * 100); + tab_hline (t, TAL_1, 7, 8, 1); + tab_text (t, 7, 1, TAB_CENTER | TAT_TITLE, _("Lower Bound")); + tab_text (t, 8, 1, TAB_CENTER | TAT_TITLE, _("Upper Bound")); + } + tab_double (t, 3, heading_rows, 0, std_err, NULL); + tab_double (t, 4, heading_rows, 0, 0.0, NULL); t_stat = linreg_intercept (c) / std_err; - tab_double (t, 5, 1, 0, t_stat, NULL); + tab_double (t, 5, heading_rows, 0, t_stat, NULL); pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, 1, 0, pval, NULL); + tab_double (t, 6, heading_rows, 0, pval, NULL); + for (j = 0; j < linreg_n_coeffs (c); j++) { struct string tstr; ds_init_empty (&tstr); - this_row = j + 2; + this_row = j + heading_rows + 1; v = linreg_indep_var (c, j); label = var_to_string (v); /* Do not overwrite the variable's name. */ ds_put_cstr (&tstr, label); - tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); + tab_text (t, 1, this_row, TAB_LEFT, ds_cstr (&tstr)); /* Regression coefficients. 
*/ @@ -860,12 +891,18 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable /* P values for the test statistic above. */ - pval = - 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (linreg_n_obs (c) - - linreg_n_coeffs (c) - 1)); + pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), df); tab_double (t, 6, this_row, 0, pval, NULL); ds_destroy (&tstr); + + if (cmd->stats & STATS_CI) + { + double lower = linreg_coeff (c, j) - tval * std_err ; + double upper = linreg_coeff (c, j) + tval * std_err ; + + tab_double (t, 7, this_row, 0, lower, NULL); + tab_double (t, 8, this_row, 0, upper, NULL); + } } tab_title (t, _("Coefficients (%s)"), var_to_string (var)); tab_submit (t); @@ -900,7 +937,7 @@ reg_stats_anova (const linreg * c, const struct variable *var) tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df")); tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square")); tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); + tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Sig.")); tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression")); tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual")); @@ -913,9 +950,9 @@ reg_stats_anova (const linreg * c, const struct variable *var) /* Degrees of freedom */ - tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm); - tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe); - tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft); + tab_text_format (t, 3, 1, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dfm); + tab_text_format (t, 3, 2, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dfe); + tab_text_format (t, 3, 3, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dft); /* Mean Squares */ tab_double (t, 4, 1, TAB_RIGHT, msm, NULL);