From edd5c738dfef01c90d02e06a33b93fc9d38320b8 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Fri, 12 May 2017 07:42:55 +0200 Subject: [PATCH] REGRESSION: Implement /ORIGIN subcommand. --- NEWS | 3 ++ doc/regression.texi | 8 ++++ src/language/stats/correlations.c | 3 +- src/language/stats/factor.c | 2 +- src/language/stats/glm.c | 2 +- src/language/stats/oneway.c | 2 +- src/language/stats/regression.c | 55 +++++++++++++-------- src/math/covariance.c | 68 +++++++++++++++----------- src/math/covariance.h | 4 +- src/math/linreg.c | 77 +++++++++++++++++------------- src/math/linreg.h | 4 +- tests/language/stats/regression.at | 57 ++++++++++++++++++++++ 12 files changed, 198 insertions(+), 87 deletions(-) diff --git a/NEWS b/NEWS index 0390e74f33..9a83b21c9d 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,9 @@ Please send PSPP bug reports to bug-gnu-pspp@gnu.org. Changes from 0.10.2 to 0.10.4: + * The REGRESSION command now has a /ORIGIN subcommand to perform + regression through the origin. + * The FACTOR command can now analyse matrix files prepared with MATRIX DATA. * The MATRIX DATA command has been added. diff --git a/doc/regression.texi b/doc/regression.texi index de639352d0..9a159014d0 100644 --- a/doc/regression.texi +++ b/doc/regression.texi @@ -46,6 +46,7 @@ REGRESSION /VARIABLES=@var{var_list} /DEPENDENT=@var{var_list} /STATISTICS=@{ALL, DEFAULTS, R, COEFF, ANOVA, BCOV, CI[@var{conf}]@} + @{ /ORIGIN | /NOORIGIN @} /SAVE=@{PRED, RESID@} @end display @@ -86,6 +87,13 @@ This is what you get if the /STATISTICS command is not specified, or if it is specified without any parameters. @end table +The @subcmd{ORIGIN} and @subcmd{NOORIGIN} subcommands are mutually +exclusive. @subcmd{ORIGIN} indicates that the regression should be +performed through the origin. You should use this option if, and +only if you have reason to believe that the regression does indeed +pass through the origin --- that is to say, the value @math{b_0} above, +is zero. The default is @subcmd{NOORIGIN}. + The @subcmd{SAVE} subcommand causes @pspp{} to save the residuals or predicted values from the fitted model to the active dataset. @pspp{} will store the residuals in a variable diff --git a/src/language/stats/correlations.c b/src/language/stats/correlations.c index e2d158dacb..207ec77528 100644 --- a/src/language/stats/correlations.c +++ b/src/language/stats/correlations.c @@ -290,7 +290,8 @@ run_corr (struct casereader *r, const struct corr_opts *opts, const struct corr gsl_matrix *corr_matrix = NULL; struct covariance *cov = covariance_2pass_create (corr->n_vars_total, corr->vars, NULL, - opts->wv, opts->exclude); + opts->wv, opts->exclude, + true); struct casereader *rc = casereader_clone (r); for ( ; (c = casereader_read (r) ); case_unref (c)) diff --git a/src/language/stats/factor.c b/src/language/stats/factor.c index e83a6497b5..c1ce93aaec 100644 --- a/src/language/stats/factor.c +++ b/src/language/stats/factor.c @@ -2197,7 +2197,7 @@ do_factor (const struct cmd_factor *factor, struct casereader *r) struct idata *idata = idata_alloc (factor->n_vars); idata->cvm = covariance_1pass_create (factor->n_vars, factor->vars, - factor->wv, factor->exclude); + factor->wv, factor->exclude, true); for ( ; (c = casereader_read (r) ); case_unref (c)) { diff --git a/src/language/stats/glm.c b/src/language/stats/glm.c index 2edc8e568e..74e918b886 100644 --- a/src/language/stats/glm.c +++ b/src/language/stats/glm.c @@ -601,7 +601,7 @@ run_glm (struct glm_spec *cmd, struct casereader *input, cmd->wv, cmd->exclude, MV_ANY); cov = covariance_2pass_create (cmd->n_dep_vars, cmd->dep_vars, - ws.cats, cmd->wv, cmd->exclude); + ws.cats, cmd->wv, cmd->exclude, true); c = casereader_peek (input, 0); diff --git a/src/language/stats/oneway.c b/src/language/stats/oneway.c index 858d825319..cbc00e0fdd 100644 --- a/src/language/stats/oneway.c +++ b/src/language/stats/oneway.c @@ -728,7 +728,7 @@ run_oneway (const struct oneway_spec *cmd, ws.vws[v].cov = covariance_2pass_create (1, &cmd->vars[v], ws.vws[v].cat, - cmd->wv, cmd->exclude); + cmd->wv, cmd->exclude, true); ws.vws[v].nl = levene_create (var_get_width (cmd->indep_var), NULL); } diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c index d7608927da..b2ba15d63a 100644 --- a/src/language/stats/regression.c +++ b/src/language/stats/regression.c @@ -1,5 +1,6 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014, 2016 Free Software Foundation, Inc. + Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014, + 2016, 2017 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -79,6 +80,8 @@ struct regression bool resid; bool pred; + + bool origin; }; struct regression_workspace @@ -204,6 +207,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) regression.resid = false; regression.ds = ds; + regression.origin = false; bool variables_seen = false; bool method_seen = false; @@ -246,6 +250,14 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) PV_NO_DUPLICATE | PV_NUMERIC)) goto error; } + else if (lex_match_id (lexer, "ORIGIN")) + { + regression.origin = true; + } + else if (lex_match_id (lexer, "NOORIGIN")) + { + regression.origin = false; + } else if (lex_match_id (lexer, "METHOD")) { method_seen = true; @@ -660,7 +672,7 @@ run_regression (const struct regression *cmd, fill_all_vars (all_vars, cmd); cov = covariance_1pass_create (n_all_vars, all_vars, dict_get_weight (dataset_dict (cmd->ds)), - MV_ANY); + MV_ANY, cmd->origin == false); reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, all_vars, n_all_vars, @@ -686,7 +698,7 @@ run_regression (const struct regression *cmd, gsl_matrix *this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); double n_data = fill_covariance (this_cm, cov, vars, n_indep, dep_var, all_vars, n_all_vars, means); - models[k] = linreg_alloc (dep_var, vars, n_data, n_indep); + models[k] = linreg_alloc (dep_var, vars, n_data, n_indep, cmd->origin); models[k]->depvar = dep_var; for (i = 0; i < n_indep; i++) { @@ -822,8 +834,7 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable int n_cols = 7; const int heading_rows = 2; int n_rows; - int this_row; - double t_stat; + int this_row = heading_rows; double pval; double std_err; double beta; @@ -858,8 +869,7 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable tab_text (t, 4, 1, TAB_CENTER | TAT_TITLE, _("Beta")); tab_text (t, 5, 1, TAB_CENTER | TAT_TITLE, _("t")); tab_text (t, 6, 1, TAB_CENTER | TAT_TITLE, _("Sig.")); - tab_text (t, 1, heading_rows, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_double (t, 2, heading_rows, 0, linreg_intercept (c), NULL, RC_OTHER); + std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); if (cmd->stats & STATS_CI) @@ -874,20 +884,27 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable tab_text (t, 7, 1, TAB_CENTER | TAT_TITLE, _("Lower Bound")); tab_text (t, 8, 1, TAB_CENTER | TAT_TITLE, _("Upper Bound")); } - tab_double (t, 3, heading_rows, 0, std_err, NULL, RC_OTHER); - tab_double (t, 4, heading_rows, 0, 0.0, NULL, RC_OTHER); - t_stat = linreg_intercept (c) / std_err; - tab_double (t, 5, heading_rows, 0, t_stat, NULL, RC_OTHER); - pval = - 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, heading_rows, 0, pval, NULL, RC_PVALUE); - - for (j = 0; j < linreg_n_coeffs (c); j++) + + if (!cmd->origin) + { + tab_text (t, 1, this_row, TAB_LEFT | TAT_TITLE, _("(Constant)")); + tab_double (t, 2, this_row, 0, linreg_intercept (c), NULL, RC_OTHER); + tab_double (t, 3, this_row, 0, std_err, NULL, RC_OTHER); + tab_double (t, 4, this_row, 0, 0.0, NULL, RC_OTHER); + double t_stat = linreg_intercept (c) / std_err; + tab_double (t, 5, this_row, 0, t_stat, NULL, RC_OTHER); + + double pval = + 2 * gsl_cdf_tdist_Q (fabs (t_stat), + (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); + tab_double (t, 6, this_row, 0, pval, NULL, RC_PVALUE); + this_row++; + } + + for (j = 0; j < linreg_n_coeffs (c); j++, this_row++) { struct string tstr; ds_init_empty (&tstr); - this_row = j + heading_rows + 1; v = linreg_indep_var (c, j); label = var_to_string (v); @@ -915,7 +932,7 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable /* Test statistic for H0: coefficient is 0. */ - t_stat = linreg_coeff (c, j) / std_err; + double t_stat = linreg_coeff (c, j) / std_err; tab_double (t, 5, this_row, 0, t_stat, NULL, RC_OTHER); /* P values for the test statistic above. diff --git a/src/math/covariance.c b/src/math/covariance.c index a8c71dc0b3..66b44c12c1 100644 --- a/src/math/covariance.c +++ b/src/math/covariance.c @@ -70,6 +70,9 @@ resize_matrix (gsl_matrix *in, size_t new_size) struct covariance { + /* True if the covariances are centerered. (ie Real covariances) */ + bool centered; + /* The variables for which the covariance matrix is to be calculated. */ size_t n_vars; const struct variable *const *vars; @@ -138,11 +141,13 @@ covariance_moments (const struct covariance *cov, int m) */ struct covariance * covariance_1pass_create (size_t n_vars, const struct variable *const *vars, - const struct variable *weight, enum mv_class exclude) + const struct variable *weight, enum mv_class exclude, + bool centered) { size_t i; struct covariance *cov = xzalloc (sizeof *cov); + cov->centered = centered; cov->passes = 1; cov->state = 0; cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false; @@ -178,11 +183,13 @@ covariance_1pass_create (size_t n_vars, const struct variable *const *vars, struct covariance * covariance_2pass_create (size_t n_vars, const struct variable *const *vars, struct categoricals *cats, - const struct variable *wv, enum mv_class exclude) + const struct variable *wv, enum mv_class exclude, + bool centered) { size_t i; struct covariance *cov = xmalloc (sizeof *cov); + cov->centered = centered; cov->passes = 2; cov->state = 0; cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false; @@ -584,19 +591,22 @@ covariance_calculate_single_pass (struct covariance *cov) } } - /* Centre the moments */ - for ( j = 0 ; j < cov->dim - 1; ++j) + if (cov->centered) { - for (i = j + 1 ; i < cov->dim; ++i) + /* Centre the moments */ + for ( j = 0 ; j < cov->dim - 1; ++j) { - double *x = &cov->cm [cm_idx (cov, i, j)]; + for (i = j + 1 ; i < cov->dim; ++i) + { + double *x = &cov->cm [cm_idx (cov, i, j)]; - *x /= gsl_matrix_get (cov->moments[0], i, j); + *x /= gsl_matrix_get (cov->moments[0], i, j); - *x -= - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j) - * - gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i); + *x -= + gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j) + * + gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i); + } } } @@ -642,29 +652,33 @@ covariance_calculate_single_pass_unnormalized (struct covariance *cov) { size_t i, j; - for (i = 0 ; i < cov->dim; ++i) + if (cov->centered) { - for (j = 0 ; j < cov->dim; ++j) + for (i = 0 ; i < cov->dim; ++i) { - double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j); - *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) - / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j); + for (j = 0 ; j < cov->dim; ++j) + { + double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j); + *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) + / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j); + } } - } - for ( j = 0 ; j < cov->dim - 1; ++j) - { - for (i = j + 1 ; i < cov->dim; ++i) + + for ( j = 0 ; j < cov->dim - 1; ++j) { - double *x = &cov->cm [cm_idx (cov, i, j)]; + for (i = j + 1 ; i < cov->dim; ++i) + { + double *x = &cov->cm [cm_idx (cov, i, j)]; - *x -= - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j) - * - gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i) - / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j); + *x -= + gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j) + * + gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i) + / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j); + } } } - + return cm_to_gsl (cov); } diff --git a/src/math/covariance.h b/src/math/covariance.h index a52cfced31..18f1137a79 100644 --- a/src/math/covariance.h +++ b/src/math/covariance.h @@ -28,12 +28,12 @@ struct ccase ; struct categoricals; struct covariance * covariance_1pass_create (size_t n_vars, const struct variable *const *vars, - const struct variable *wv, enum mv_class excl); + const struct variable *wv, enum mv_class excl, bool centered); struct covariance * covariance_2pass_create (size_t n_vars, const struct variable *const *vars, struct categoricals *cats, - const struct variable *wv, enum mv_class excl); + const struct variable *wv, enum mv_class excl, bool centered); void covariance_accumulate (struct covariance *, const struct ccase *); void covariance_accumulate_pass1 (struct covariance *, const struct ccase *); diff --git a/src/math/linreg.c b/src/math/linreg.c index 98816243e7..4a943e8a31 100644 --- a/src/math/linreg.c +++ b/src/math/linreg.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2005, 2010, 2011, 2017 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -70,7 +70,7 @@ linreg_get_vars (const linreg *c) */ linreg * linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, - double n, size_t p) + double n, size_t p, bool origin) { linreg *c; size_t i; @@ -91,7 +91,10 @@ linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, c->n_coeffs = p; c->coeff = xnmalloc (p, sizeof (*c->coeff)); c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1); - c->dft = n - 1; + c->dft = n; + if (!origin) + c->dft--; + c->dfm = p; c->dfe = c->dft - c->dfm; c->intercept = 0.0; @@ -103,6 +106,8 @@ linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, c->refcnt = 1; + c->origin = origin; + return c; } @@ -130,9 +135,6 @@ linreg_unref (linreg *c) static void post_sweep_computations (linreg *l, gsl_matrix *sw) { - gsl_matrix *xm; - gsl_matrix_view xtx; - gsl_matrix_view xmxtx; double m; size_t i; size_t j; @@ -168,37 +170,44 @@ post_sweep_computations (linreg *l, gsl_matrix *sw) double tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j); gsl_matrix_set (l->cov, i + 1, j + 1, tmp); } - /* - Get the covariances related to the intercept. - */ - xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps); - xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps); - xm = gsl_matrix_calloc (1, l->n_indeps); - for (i = 0; i < xm->size2; i++) - { - gsl_matrix_set (xm, 0, i, - linreg_get_indep_variable_mean (l, i)); - } - rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse, - &xtx.matrix, xm, 0.0, &xmxtx.matrix); - gsl_matrix_free (xm); - if (rc == GSL_SUCCESS) + + if (! l->origin) { - double tmp = l->mse / l->n_obs; - for (i = 1; i < 1 + l->n_indeps; i++) + gsl_matrix *xm; + gsl_matrix_view xtx; + gsl_matrix_view xmxtx; + /* + Get the covariances related to the intercept. + */ + xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps); + xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps); + xm = gsl_matrix_calloc (1, l->n_indeps); + for (i = 0; i < xm->size2; i++) { - tmp -= gsl_matrix_get (l->cov, 0, i) - * linreg_get_indep_variable_mean (l, i - 1); + gsl_matrix_set (xm, 0, i, + linreg_get_indep_variable_mean (l, i)); + } + rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse, + &xtx.matrix, xm, 0.0, &xmxtx.matrix); + gsl_matrix_free (xm); + if (rc == GSL_SUCCESS) + { + double tmp = l->mse / l->n_obs; + for (i = 1; i < 1 + l->n_indeps; i++) + { + tmp -= gsl_matrix_get (l->cov, 0, i) + * linreg_get_indep_variable_mean (l, i - 1); + } + gsl_matrix_set (l->cov, 0, 0, tmp); + + l->intercept = m; + } + else + { + fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n", + __FILE__, __LINE__, gsl_strerror (rc)); + exit (rc); } - gsl_matrix_set (l->cov, 0, 0, tmp); - - l->intercept = m; - } - else - { - fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n", - __FILE__, __LINE__, gsl_strerror (rc)); - exit (rc); } } diff --git a/src/math/linreg.h b/src/math/linreg.h index 49e54c13c7..39cda89f29 100644 --- a/src/math/linreg.h +++ b/src/math/linreg.h @@ -140,6 +140,8 @@ struct linreg_struct int dependent_column; /* Column containing the dependent variable. Defaults to last column. */ int refcnt; + + bool origin; }; typedef struct linreg_struct linreg; @@ -147,7 +149,7 @@ typedef struct linreg_struct linreg; linreg *linreg_alloc (const struct variable *, const struct variable **, - double, size_t); + double, size_t, bool); void linreg_unref (linreg *); void linreg_ref (linreg *); diff --git a/tests/language/stats/regression.at b/tests/language/stats/regression.at index e520bb1138..d2e99344ef 100644 --- a/tests/language/stats/regression.at +++ b/tests/language/stats/regression.at @@ -2261,3 +2261,60 @@ Table: Coefficients (value) ]) AT_CLEANUP + + + +AT_SETUP([LINEAR REGRESSION /ORIGIN]) +AT_DATA([regression-origin.sps], [dnl +SET FORMAT=F10.3. + +DATA LIST notable LIST /number * value *. +BEGIN DATA + 16 7.25 + 0 .00 + 1 .10 + 9 27.9 + 0 .00 + 7 3.65 + 14 16.8 + 24 9.15 + 0 .00 + 24 19.0 + 7 4.05 + 12 7.90 + 6 .75 + 11 1.40 + 0 .00 + 3 2.30 + 12 7.60 + 11 6.80 + 16 8.65 +END DATA. + +REGRESSION + /STATISTICS COEFF R ANOVA + /DEPENDENT value + /ORIGIN + /METHOD=ENTER number. +]) + + +AT_CHECK([pspp -O format=csv regression-origin.sps], [0], [dnl +Table: Model Summary (value) +,R,R Square,Adjusted R Square,Std. Error of the Estimate +,.802,.643,.622,6.032 + +Table: ANOVA (value) +,,Sum of Squares,df,Mean Square,F,Sig. +,Regression,1181.726,1,1181.726,32.475,.000 +,Residual,654.989,18,36.388,, +,Total,1836.715,19,,, + +Table: Coefficients (value) +,,Unstandardized Coefficients,,Standardized Coefficients,, +,,B,Std. Error,Beta,t,Sig. +,number,.672,.118,.802,5.699,.000 +,,,,,, +]) + +AT_CLEANUP -- 2.30.2