From: John Darrington Date: Mon, 13 Jan 2014 09:32:59 +0000 (+0100) Subject: REGRESSION: Added calculation of the coefficients' confidence interval. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5f86425ef29d236118667470f2a461818d7e55b2;p=pspp REGRESSION: Added calculation of the coefficients' confidence interval. --- diff --git a/NEWS b/NEWS index 9c0fc96c6a..50e1332bb5 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,11 @@ See the end for copying conditions. Please send PSPP bug reports to bug-gnu-pspp@gnu.org. +Changes from 0.8.2 to 0.8.3: + + * REGRESSION now recognises /STATISTICS=CI(x) which causes confidence + intervals for the coefficients to be printed. + Changes from 0.8.1 to 0.8.2: * Charts are now rendered with colours from the Tango palette instead diff --git a/doc/regression.texi b/doc/regression.texi index 218a52405d..9ff569d4d6 100644 --- a/doc/regression.texi +++ b/doc/regression.texi @@ -45,7 +45,7 @@ linear model. REGRESSION /VARIABLES=@var{var_list} /DEPENDENT=@var{var_list} - /STATISTICS=@{ALL, DEFAULTS, R, COEFF, ANOVA, BCOV@} + /STATISTICS=@{ALL, DEFAULTS, R, COEFF, ANOVA, BCOV, CI[@var{conf}]@} /SAVE=@{PRED, RESID@} @end display @@ -61,7 +61,8 @@ are treated as explanatory variables in the linear model. All other subcommands are optional: -The @subcmd{STATISTICS} subcommand specifies the statistics to be displayed: +The @subcmd{STATISTICS} subcommand specifies additional statistics to be displayed. +The following keywords are accepted: @table @subcmd @item ALL @@ -71,10 +72,16 @@ The ratio of the sums of squares due to the model to the total sums of squares for the dependent variable. @item COEFF A table containing the estimated model coefficients and their standard errors. +@item CI (@var{conf}) +This item is only relevant if COEFF has also been selected. It specifies that the +confidence interval for the coefficients should be printed. The optional value @var{conf}, +which must be in parentheses, is the desired confidence level expressed as a percentage. @item ANOVA Analysis of variance table for the model. @item BCOV The covariance matrix for the estimated model coefficients. +@item DEFAULT +The same as if R, COEFF, and ANOVA had been selected. @end table The @subcmd{SAVE} subcommand causes @pspp{} to save the residuals or predicted diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c index 322122866a..07188663fe 100644 --- a/src/language/stats/regression.c +++ b/src/language/stats/regression.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. + Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -74,6 +74,7 @@ struct regression size_t n_dep_vars; unsigned int stats; + double ci; bool resid; bool pred; @@ -196,6 +197,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) memset (®ression, 0, sizeof (struct regression)); + regression.ci = 0.95; regression.stats = STATS_DEFAULT; regression.pred = false; regression.resid = false; @@ -280,7 +282,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) if (lex_match (lexer, T_LPAREN)) { - lex_number (lexer); + regression.ci = lex_number (lexer) / 100.0; lex_get (lexer); lex_force_match (lexer, T_RPAREN); } @@ -585,7 +587,7 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, STATISTICS subcommand output functions. */ static void reg_stats_r (const linreg *, const struct variable *); -static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *); +static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *, const struct regression *); static void reg_stats_anova (const linreg *, const struct variable *); static void reg_stats_bcov (const linreg *, const struct variable *); @@ -601,7 +603,7 @@ subcommand_statistics (const struct regression *cmd, const linreg * c, const gsl reg_stats_anova (c, var); if (cmd->stats & STATS_COEFF) - reg_stats_coeff (c, cm, var); + reg_stats_coeff (c, cm, var, cmd); if (cmd->stats & STATS_BCOV) reg_stats_bcov (c, var); @@ -784,10 +786,11 @@ reg_stats_r (const linreg * c, const struct variable *var) Table showing estimated regression coefficients. */ static void -reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var) +reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var, const struct regression *cmd) { size_t j; int n_cols = 7; + const int heading_rows = 2; int n_rows; int this_row; double t_stat; @@ -799,43 +802,68 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable const struct variable *v; struct tab_table *t; + const double df = linreg_n_obs (c) - linreg_n_coeffs (c) - 1; + double q = (1 - cmd->ci) / 2.0; /* 2-tailed test */ + double tval = gsl_cdf_tdist_Qinv (q, df); + assert (c != NULL); - n_rows = linreg_n_coeffs (c) + 3; + n_rows = linreg_n_coeffs (c) + heading_rows + 1; + + if (cmd->stats & STATS_CI) + n_cols += 2; t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows); tab_vline (t, TAL_2, 2, 0, n_rows - 1); tab_vline (t, TAL_0, 1, 0, 0); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Sig.")); - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); + + tab_hline (t, TAL_1, 2, 4, 1); + tab_joint_text (t, 2, 0, 3, 0, TAB_CENTER | TAT_TITLE, _("Unstandardized Coefficients")); + tab_text (t, 2, 1, TAB_CENTER | TAT_TITLE, _("B")); + tab_text (t, 3, 1, TAB_CENTER | TAT_TITLE, _("Std. Error")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Standardized Coefficients")); + tab_text (t, 4, 1, TAB_CENTER | TAT_TITLE, _("Beta")); + tab_text (t, 5, 1, TAB_CENTER | TAT_TITLE, _("t")); + tab_text (t, 6, 1, TAB_CENTER | TAT_TITLE, _("Sig.")); + tab_text (t, 1, heading_rows, TAB_LEFT | TAT_TITLE, _("(Constant)")); + tab_double (t, 2, heading_rows, 0, linreg_intercept (c), NULL); std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); - tab_double (t, 3, 1, 0, std_err, NULL); - tab_double (t, 4, 1, 0, 0.0, NULL); + + if (cmd->stats & STATS_CI) + { + double lower = linreg_intercept (c) - tval * std_err ; + double upper = linreg_intercept (c) + tval * std_err ; + tab_double (t, 7, heading_rows, 0, lower, NULL); + tab_double (t, 8, heading_rows, 0, upper, NULL); + + tab_joint_text_format (t, 7, 0, 8, 0, TAB_CENTER | TAT_TITLE, _("%g%% Confidence Interval for B"), cmd->ci * 100); + tab_hline (t, TAL_1, 7, 8, 1); + tab_text (t, 7, 1, TAB_CENTER | TAT_TITLE, _("Lower Bound")); + tab_text (t, 8, 1, TAB_CENTER | TAT_TITLE, _("Upper Bound")); + } + tab_double (t, 3, heading_rows, 0, std_err, NULL); + tab_double (t, 4, heading_rows, 0, 0.0, NULL); t_stat = linreg_intercept (c) / std_err; - tab_double (t, 5, 1, 0, t_stat, NULL); + tab_double (t, 5, heading_rows, 0, t_stat, NULL); pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, 1, 0, pval, NULL); + tab_double (t, 6, heading_rows, 0, pval, NULL); + for (j = 0; j < linreg_n_coeffs (c); j++) { struct string tstr; ds_init_empty (&tstr); - this_row = j + 2; + this_row = j + heading_rows + 1; v = linreg_indep_var (c, j); label = var_to_string (v); /* Do not overwrite the variable's name. */ ds_put_cstr (&tstr, label); - tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); + tab_text (t, 1, this_row, TAB_LEFT, ds_cstr (&tstr)); /* Regression coefficients. */ @@ -862,12 +890,18 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable /* P values for the test statistic above. */ - pval = - 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (linreg_n_obs (c) - - linreg_n_coeffs (c) - 1)); + pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), df); tab_double (t, 6, this_row, 0, pval, NULL); ds_destroy (&tstr); + + if (cmd->stats & STATS_CI) + { + double lower = linreg_coeff (c, j) - tval * std_err ; + double upper = linreg_coeff (c, j) + tval * std_err ; + + tab_double (t, 7, this_row, 0, lower, NULL); + tab_double (t, 8, this_row, 0, upper, NULL); + } } tab_title (t, _("Coefficients (%s)"), var_to_string (var)); tab_submit (t); diff --git a/tests/language/stats/regression.at b/tests/language/stats/regression.at index d401953f12..60d077d29b 100644 --- a/tests/language/stats/regression.at +++ b/tests/language/stats/regression.at @@ -32,11 +32,11 @@ Table: ANOVA (v2) ,Total,215.256,9,,, Table: Coefficients (v2) +,,Unstandardized Coefficients,,Standardized Coefficients,, ,,B,Std. Error,Beta,t,Sig. ,(Constant),2.191,2.357,.000,.930,.380 ,v0,1.813,1.053,.171,1.722,.129 ,v1,-3.427,.332,-1.026,-10.334,.000 -,,,,,, Table: Data List v0,v1,v2,RES1,PRED1 @@ -1688,6 +1688,7 @@ end data regression /variables=v0 v1 /statistics defaults /dependent=v0 /method=enter. ]) AT_CHECK([pspp -o pspp.csv regression.sps]) + AT_CHECK([cat pspp.csv], [0], [dnl Table: Reading free-form data from INLINE. Variable,Format @@ -1705,10 +1706,10 @@ Table: ANOVA (v0) ,Total,98673.63,1499,,, Table: Coefficients (v0) +,,Unstandardized Coefficients,,Standardized Coefficients,, ,,B,Std. Error,Beta,t,Sig. ,(Constant),1.24,.42,.00,2.95,.00 ,v1,1.37,.72,.05,1.89,.06 -,,,,,, ]) AT_CLEANUP @@ -1811,3 +1812,246 @@ regression /variables=v0 v1 AT_CHECK([pspp ss.sps], [1], [ignore]) AT_CLEANUP + + +dnl The following example comes from +dnl http://www.ats.ucla.edu/stat/spss/output/reg_spss%28long%29.htm +AT_SETUP([LINEAR REGRESSION coefficient confidence interval]) + +AT_DATA([conf.sps], [dnl +set format = F22.3. + +data list notable list /math female socst read science * +begin data. + 41.00 .00 57.00 57.00 47.00 + 53.00 1.00 61.00 68.00 63.00 + 54.00 .00 31.00 44.00 58.00 + 47.00 .00 56.00 63.00 53.00 + 57.00 .00 61.00 47.00 53.00 + 51.00 .00 61.00 44.00 63.00 + 42.00 .00 61.00 50.00 53.00 + 45.00 .00 36.00 34.00 39.00 + 54.00 .00 51.00 63.00 58.00 + 52.00 .00 51.00 57.00 50.00 + 51.00 .00 61.00 60.00 53.00 + 51.00 .00 61.00 57.00 63.00 + 71.00 .00 71.00 73.00 61.00 + 57.00 .00 46.00 54.00 55.00 + 50.00 .00 56.00 45.00 31.00 + 43.00 .00 56.00 42.00 50.00 + 51.00 .00 56.00 47.00 50.00 + 60.00 .00 56.00 57.00 58.00 + 62.00 .00 61.00 68.00 55.00 + 57.00 .00 46.00 55.00 53.00 + 35.00 .00 41.00 63.00 66.00 + 75.00 .00 66.00 63.00 72.00 + 45.00 .00 56.00 50.00 55.00 + 57.00 .00 61.00 60.00 61.00 + 45.00 .00 46.00 37.00 39.00 + 46.00 .00 31.00 34.00 39.00 + 66.00 .00 66.00 65.00 61.00 + 57.00 .00 46.00 47.00 58.00 + 49.00 .00 46.00 44.00 39.00 + 49.00 .00 41.00 52.00 55.00 + 57.00 .00 51.00 42.00 47.00 + 64.00 .00 61.00 76.00 64.00 + 63.00 .00 71.00 65.00 66.00 + 57.00 .00 31.00 42.00 72.00 + 50.00 .00 61.00 52.00 61.00 + 58.00 .00 66.00 60.00 61.00 + 75.00 .00 66.00 68.00 66.00 + 68.00 .00 66.00 65.00 66.00 + 44.00 .00 36.00 47.00 36.00 + 40.00 .00 51.00 39.00 39.00 + 41.00 .00 51.00 47.00 42.00 + 62.00 .00 51.00 55.00 58.00 + 57.00 .00 51.00 52.00 55.00 + 43.00 .00 41.00 42.00 50.00 + 48.00 .00 66.00 65.00 63.00 + 63.00 .00 46.00 55.00 69.00 + 39.00 .00 47.00 50.00 49.00 + 70.00 .00 51.00 65.00 63.00 + 63.00 .00 46.00 47.00 53.00 + 59.00 .00 51.00 57.00 47.00 + 61.00 .00 56.00 53.00 57.00 + 38.00 .00 41.00 39.00 47.00 + 61.00 .00 46.00 44.00 50.00 + 49.00 .00 71.00 63.00 55.00 + 73.00 .00 66.00 73.00 69.00 + 44.00 .00 42.00 39.00 26.00 + 42.00 .00 32.00 37.00 33.00 + 39.00 .00 46.00 42.00 56.00 + 55.00 .00 41.00 63.00 58.00 + 52.00 .00 51.00 48.00 44.00 + 45.00 .00 61.00 50.00 58.00 + 61.00 .00 66.00 47.00 69.00 + 39.00 .00 46.00 44.00 34.00 + 41.00 .00 36.00 34.00 36.00 + 50.00 .00 61.00 50.00 36.00 + 40.00 .00 26.00 44.00 50.00 + 60.00 .00 66.00 60.00 55.00 + 47.00 .00 26.00 47.00 42.00 + 59.00 .00 44.00 63.00 65.00 + 49.00 .00 36.00 50.00 44.00 + 46.00 .00 51.00 44.00 39.00 + 58.00 .00 61.00 60.00 58.00 + 71.00 .00 66.00 73.00 63.00 + 58.00 .00 66.00 68.00 74.00 + 46.00 .00 51.00 55.00 58.00 + 43.00 .00 31.00 47.00 45.00 + 54.00 .00 61.00 55.00 49.00 + 56.00 .00 66.00 68.00 63.00 + 46.00 .00 46.00 31.00 39.00 + 54.00 .00 56.00 47.00 42.00 + 57.00 .00 56.00 63.00 55.00 + 54.00 .00 36.00 36.00 61.00 + 71.00 .00 56.00 68.00 66.00 + 48.00 .00 56.00 63.00 63.00 + 40.00 .00 41.00 55.00 44.00 + 64.00 .00 66.00 55.00 63.00 + 51.00 .00 56.00 52.00 53.00 + 39.00 .00 56.00 34.00 42.00 + 40.00 .00 31.00 50.00 34.00 + 61.00 .00 56.00 55.00 61.00 + 66.00 .00 46.00 52.00 47.00 + 49.00 .00 46.00 63.00 66.00 + 65.00 1.00 61.00 68.00 69.00 + 52.00 1.00 48.00 39.00 44.00 + 46.00 1.00 51.00 44.00 47.00 + 61.00 1.00 51.00 50.00 63.00 + 72.00 1.00 56.00 71.00 66.00 + 71.00 1.00 71.00 63.00 69.00 + 40.00 1.00 41.00 34.00 39.00 + 69.00 1.00 61.00 63.00 61.00 + 64.00 1.00 66.00 68.00 69.00 + 56.00 1.00 61.00 47.00 66.00 + 49.00 1.00 41.00 47.00 33.00 + 54.00 1.00 51.00 63.00 50.00 + 53.00 1.00 51.00 52.00 61.00 + 66.00 1.00 56.00 55.00 42.00 + 67.00 1.00 56.00 60.00 50.00 + 40.00 1.00 33.00 35.00 51.00 + 46.00 1.00 56.00 47.00 50.00 + 69.00 1.00 71.00 71.00 58.00 + 40.00 1.00 56.00 57.00 61.00 + 41.00 1.00 51.00 44.00 39.00 + 57.00 1.00 66.00 65.00 46.00 + 58.00 1.00 56.00 68.00 59.00 + 57.00 1.00 66.00 73.00 55.00 + 37.00 1.00 41.00 36.00 42.00 + 55.00 1.00 46.00 43.00 55.00 + 62.00 1.00 66.00 73.00 58.00 + 64.00 1.00 56.00 52.00 58.00 + 40.00 1.00 51.00 41.00 39.00 + 50.00 1.00 51.00 60.00 50.00 + 46.00 1.00 56.00 50.00 50.00 + 53.00 1.00 56.00 50.00 39.00 + 52.00 1.00 46.00 47.00 48.00 + 45.00 1.00 46.00 47.00 34.00 + 56.00 1.00 61.00 55.00 58.00 + 45.00 1.00 56.00 50.00 44.00 + 54.00 1.00 41.00 39.00 50.00 + 56.00 1.00 46.00 50.00 47.00 + 41.00 1.00 26.00 34.00 29.00 + 54.00 1.00 56.00 57.00 50.00 + 72.00 1.00 56.00 57.00 54.00 + 56.00 1.00 51.00 68.00 50.00 + 47.00 1.00 46.00 42.00 47.00 + 49.00 1.00 66.00 61.00 44.00 + 60.00 1.00 66.00 76.00 67.00 + 54.00 1.00 46.00 47.00 58.00 + 55.00 1.00 56.00 46.00 44.00 + 33.00 1.00 41.00 39.00 42.00 + 49.00 1.00 61.00 52.00 44.00 + 43.00 1.00 51.00 28.00 44.00 + 50.00 1.00 52.00 42.00 50.00 + 52.00 1.00 51.00 47.00 39.00 + 48.00 1.00 41.00 47.00 44.00 + 58.00 1.00 66.00 52.00 53.00 + 43.00 1.00 61.00 47.00 48.00 + 41.00 1.00 31.00 50.00 55.00 + 43.00 1.00 51.00 44.00 44.00 + 46.00 1.00 41.00 47.00 40.00 + 44.00 1.00 41.00 45.00 34.00 + 43.00 1.00 46.00 47.00 42.00 + 61.00 1.00 56.00 65.00 58.00 + 40.00 1.00 51.00 43.00 50.00 + 49.00 1.00 61.00 47.00 53.00 + 56.00 1.00 66.00 57.00 58.00 + 61.00 1.00 71.00 68.00 55.00 + 50.00 1.00 61.00 52.00 54.00 + 51.00 1.00 61.00 42.00 47.00 + 42.00 1.00 41.00 42.00 42.00 + 67.00 1.00 66.00 66.00 61.00 + 53.00 1.00 61.00 47.00 53.00 + 50.00 1.00 58.00 57.00 51.00 + 51.00 1.00 31.00 47.00 63.00 + 72.00 1.00 61.00 57.00 61.00 + 48.00 1.00 61.00 52.00 55.00 + 40.00 1.00 31.00 44.00 40.00 + 53.00 1.00 61.00 50.00 61.00 + 39.00 1.00 36.00 39.00 47.00 + 63.00 1.00 41.00 57.00 55.00 + 51.00 1.00 37.00 57.00 53.00 + 45.00 1.00 43.00 42.00 50.00 + 39.00 1.00 61.00 47.00 47.00 + 42.00 1.00 39.00 42.00 31.00 + 62.00 1.00 51.00 60.00 61.00 + 44.00 1.00 51.00 44.00 35.00 + 65.00 1.00 66.00 63.00 54.00 + 63.00 1.00 71.00 65.00 55.00 + 54.00 1.00 41.00 39.00 53.00 + 45.00 1.00 36.00 50.00 58.00 + 60.00 1.00 51.00 52.00 56.00 + 49.00 1.00 51.00 60.00 50.00 + 48.00 1.00 51.00 44.00 39.00 + 57.00 1.00 61.00 52.00 63.00 + 55.00 1.00 61.00 55.00 50.00 + 66.00 1.00 56.00 50.00 66.00 + 64.00 1.00 71.00 65.00 58.00 + 55.00 1.00 51.00 52.00 53.00 + 42.00 1.00 36.00 47.00 42.00 + 56.00 1.00 61.00 63.00 55.00 + 53.00 1.00 66.00 50.00 53.00 + 41.00 1.00 41.00 42.00 42.00 + 42.00 1.00 41.00 36.00 50.00 + 53.00 1.00 56.00 50.00 55.00 + 42.00 1.00 51.00 41.00 34.00 + 60.00 1.00 56.00 47.00 50.00 + 52.00 1.00 56.00 55.00 42.00 + 38.00 1.00 46.00 42.00 36.00 + 57.00 1.00 52.00 57.00 55.00 + 58.00 1.00 61.00 55.00 58.00 + 65.00 1.00 61.00 63.00 53.00 +end data. + +regression + /variables = math female socst read + /statistics = coeff r anova ci (95) + /dependent = science + /method = enter +]) + +AT_CHECK([pspp -O format=csv conf.sps], [0], [dnl +Table: Model Summary (science) +,R,R Square,Adjusted R Square,Std. Error of the Estimate +,.699,.489,.479,7.148 + +Table: ANOVA (science) +,,Sum of Squares,df,Mean Square,F,Sig. +,Regression,9543.721,4,2385.930,46.695,.000 +,Residual,9963.779,195,51.096,, +,Total,19507.500,199,,, + +Table: Coefficients (science) +,,Unstandardized Coefficients,,Standardized Coefficients,,,95% Confidence Interval for B, +,,B,Std. Error,Beta,t,Sig.,Lower Bound,Upper Bound +,(Constant),12.325,3.194,.000,3.859,.000,6.027,18.624 +,math,.389,.074,.368,5.252,.000,.243,.535 +,female,-2.010,1.023,-.101,-1.965,.051,-4.027,.007 +,socst,.050,.062,.054,.801,.424,-.073,.173 +,read,.335,.073,.347,4.607,.000,.192,.479 +]) + + +AT_CLEANUP