From: John Darrington Date: Fri, 12 May 2017 06:36:56 +0000 (+0200) Subject: src/math/linreg.c: Encapsulate this object better. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=91dc3fe3d4ae4e0f9dee5adb11dd0d7c4b78c515;p=pspp src/math/linreg.c: Encapsulate this object better. --- diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c index b2ba15d63a..5650ca4592 100644 --- a/src/language/stats/regression.c +++ b/src/language/stats/regression.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -52,8 +53,6 @@ #include -#define REG_LARGE_DATA 1000 - #define STATS_R 1 #define STATS_COEFF 2 #define STATS_ANOVA 4 @@ -552,7 +551,6 @@ identify_indep_vars (const struct regression *cmd, return n_indep_vars; } - static double fill_covariance (gsl_matrix * cov, struct covariance *all_cov, const struct variable **vars, @@ -628,14 +626,14 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, /* STATISTICS subcommand output functions. */ -static void reg_stats_r (const linreg *, const struct variable *); -static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *, const struct regression *); -static void reg_stats_anova (const linreg *, const struct variable *); -static void reg_stats_bcov (const linreg *, const struct variable *); +static void reg_stats_r (const struct linreg *, const struct variable *); +static void reg_stats_coeff (const struct linreg *, const gsl_matrix *, const struct variable *, const struct regression *); +static void reg_stats_anova (const struct linreg *, const struct variable *); +static void reg_stats_bcov (const struct linreg *, const struct variable *); static void -subcommand_statistics (const struct regression *cmd, const linreg * c, const gsl_matrix * cm, +subcommand_statistics (const struct regression *cmd, const struct linreg * c, const gsl_matrix * cm, const struct variable *var) { if (cmd->stats & STATS_R) @@ -658,7 +656,7 @@ run_regression (const struct regression *cmd, struct casereader *input) { size_t i; - linreg **models; + struct linreg **models; int k; struct ccase *c; @@ -699,20 +697,11 @@ run_regression (const struct regression *cmd, double n_data = fill_covariance (this_cm, cov, vars, n_indep, dep_var, all_vars, n_all_vars, means); models[k] = linreg_alloc (dep_var, vars, n_data, n_indep, cmd->origin); - models[k]->depvar = dep_var; for (i = 0; i < n_indep; i++) { linreg_set_indep_variable_mean (models[k], i, means[i]); } linreg_set_depvar_mean (models[k], means[i]); - /* - For large data sets, use QR decomposition. - */ - if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) - { - models[k]->method = LINREG_QR; - } - if (n_data > 0) { /* @@ -761,7 +750,7 @@ run_regression (const struct regression *cmd, if (cmd->resid) { - double obs = case_data (c, models[k]->depvar)->f; + double obs = case_data (c, linreg_dep_var (models[k]))->f; double res = linreg_residual (models[k], obs, vals, n_indep); case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res; } @@ -791,7 +780,7 @@ run_regression (const struct regression *cmd, static void -reg_stats_r (const linreg * c, const struct variable *var) +reg_stats_r (const struct linreg * c, const struct variable *var) { struct tab_table *t; int n_rows = 2; @@ -828,7 +817,7 @@ reg_stats_r (const linreg * c, const struct variable *var) Table showing estimated regression coefficients. */ static void -reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var, const struct regression *cmd) +reg_stats_coeff (const struct linreg * c, const gsl_matrix *cov, const struct variable *var, const struct regression *cmd) { size_t j; int n_cols = 7; @@ -958,14 +947,15 @@ reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable Display the ANOVA table. */ static void -reg_stats_anova (const linreg * c, const struct variable *var) +reg_stats_anova (const struct linreg * c, const struct variable *var) { int n_cols = 7; int n_rows = 4; const double msm = linreg_ssreg (c) / linreg_dfmodel (c); const double mse = linreg_mse (c); const double F = msm / mse; - const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe); + const double pval = gsl_cdf_fdist_Q (F, linreg_dfmodel (c), + linreg_dferror (c)); struct tab_table *t; @@ -996,9 +986,9 @@ reg_stats_anova (const linreg * c, const struct variable *var) /* Degrees of freedom */ - tab_text_format (t, 3, 1, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dfm); - tab_text_format (t, 3, 2, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dfe); - tab_text_format (t, 3, 3, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dft); + tab_text_format (t, 3, 1, TAB_RIGHT, "%.*g", DBL_DIG + 1, linreg_dfmodel (c)); + tab_text_format (t, 3, 2, TAB_RIGHT, "%.*g", DBL_DIG + 1, linreg_dferror (c)); + tab_text_format (t, 3, 3, TAB_RIGHT, "%.*g", DBL_DIG + 1, linreg_dftotal (c)); /* Mean Squares */ tab_double (t, 4, 1, TAB_RIGHT, msm, NULL, RC_OTHER); @@ -1014,7 +1004,7 @@ reg_stats_anova (const linreg * c, const struct variable *var) static void -reg_stats_bcov (const linreg * c, const struct variable *var) +reg_stats_bcov (const struct linreg * c, const struct variable *var) { int n_cols; int n_rows; @@ -1026,8 +1016,8 @@ reg_stats_bcov (const linreg * c, const struct variable *var) struct tab_table *t; assert (c != NULL); - n_cols = c->n_indeps + 1 + 2; - n_rows = 2 * (c->n_indeps + 1); + n_cols = linreg_n_indeps (c) + 1 + 2; + n_rows = 2 * (linreg_n_indeps (c) + 1); t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); @@ -1047,7 +1037,7 @@ reg_stats_bcov (const linreg * c, const struct variable *var) col = (i <= k) ? k : i; row = (i <= k) ? i : k; tab_double (t, k + 2, i, TAB_CENTER, - gsl_matrix_get (c->cov, row, col), NULL, RC_OTHER); + gsl_matrix_get (linreg_cov (c), row, col), NULL, RC_OTHER); } } tab_title (t, _("Coefficient Correlations (%s)"), var_to_string (var)); diff --git a/src/math/linreg.c b/src/math/linreg.c index 4a943e8a31..67c1424a88 100644 --- a/src/math/linreg.c +++ b/src/math/linreg.c @@ -57,9 +57,63 @@ Springer. 1998. ISBN 0-387-98542-5. */ +struct linreg +{ + double n_obs; /* Number of observations. */ + int n_indeps; /* Number of independent variables. */ + int n_coeffs; /* The intercept is not considered a + coefficient here. */ + + /* + Pointers to the variables. + */ + const struct variable *depvar; + const struct variable **indep_vars; + + double *coeff; + double intercept; + /* + Means and standard deviations of the variables. + If these pointers are null when pspp_linreg() is + called, pspp_linreg() will compute their values. + + Entry i of indep_means is the mean of independent + variable i, whose observations are stored in the ith + column of the design matrix. + */ + double depvar_mean; + gsl_vector *indep_means; + gsl_vector *indep_std; + + /* + Sums of squares. + */ + double ssm; /* Sums of squares for the overall model. */ + double sst; /* Sum of squares total. */ + double sse; /* Sum of squares error. */ + double mse; /* Mean squared error. This is just sse / + dfe, but since it is the best unbiased + estimate of the population variance, it + has its own entry here. */ + /* + Covariance matrix of the parameter estimates. + */ + gsl_matrix *cov; + /* + Degrees of freedom. + */ + double dft; + double dfe; + double dfm; + + int dependent_column; /* Column containing the dependent variable. Defaults to last column. */ + int refcnt; + + bool origin; +}; const struct variable ** -linreg_get_vars (const linreg *c) +linreg_get_vars (const struct linreg *c) { return c->indep_vars; } @@ -68,11 +122,11 @@ linreg_get_vars (const linreg *c) Allocate a linreg and return a pointer to it. n is the number of cases, p is the number of independent variables. */ -linreg * +struct linreg * linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, double n, size_t p, bool origin) { - linreg *c; + struct linreg *c; size_t i; c = xmalloc (sizeof (*c)); @@ -102,7 +156,6 @@ linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, /* Default settings. */ - c->method = LINREG_SWEEP; c->refcnt = 1; @@ -113,13 +166,13 @@ linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, void -linreg_ref (linreg *c) +linreg_ref (struct linreg *c) { c->refcnt++; } void -linreg_unref (linreg *c) +linreg_unref (struct linreg *c) { if (--c->refcnt == 0) { @@ -133,7 +186,7 @@ linreg_unref (linreg *c) } static void -post_sweep_computations (linreg *l, gsl_matrix *sw) +post_sweep_computations (struct linreg *l, gsl_matrix *sw) { double m; size_t i; @@ -217,7 +270,7 @@ post_sweep_computations (linreg *l, gsl_matrix *sw) order of the coefficients in the linreg struct. */ double -linreg_predict (const linreg *c, const double *vals, size_t n_vals) +linreg_predict (const struct linreg *c, const double *vals, size_t n_vals) { size_t j; double result; @@ -243,7 +296,7 @@ linreg_predict (const linreg *c, const double *vals, size_t n_vals) } double -linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals) +linreg_residual (const struct linreg *c, double obs, const double *vals, size_t n_vals) { if (vals == NULL || c == NULL) { @@ -255,20 +308,20 @@ linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals) /* Mean of the independent variable. */ -double linreg_get_indep_variable_mean (const linreg *c, size_t j) +double linreg_get_indep_variable_mean (const struct linreg *c, size_t j) { assert (c != NULL); return gsl_vector_get (c->indep_means, j); } -void linreg_set_indep_variable_mean (linreg *c, size_t j, double m) +void linreg_set_indep_variable_mean (struct linreg *c, size_t j, double m) { assert (c != NULL); gsl_vector_set (c->indep_means, j, m); } static void -linreg_fit_qr (const gsl_matrix *cov, linreg *l) +linreg_fit_qr (const gsl_matrix *cov, struct linreg *l) { double intcpt_coef = 0.0; double intercept_variance = 0.0; @@ -357,6 +410,8 @@ linreg_fit_qr (const gsl_matrix *cov, linreg *l) gsl_vector_free (params); } +#define REG_LARGE_DATA 1000 + /* Estimate the model parameters from the covariance matrix. This function assumes the covariance entries corresponding to the @@ -364,99 +419,128 @@ linreg_fit_qr (const gsl_matrix *cov, linreg *l) matrix. */ void -linreg_fit (const gsl_matrix *cov, linreg *l) +linreg_fit (const gsl_matrix *cov, struct linreg *l) { assert (l != NULL); assert (cov != NULL); l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1); - if (l->method == LINREG_SWEEP) + + if ((l->n_obs * l->n_obs > l->n_indeps) && ( l->n_obs > REG_LARGE_DATA)) + { + /* + For large data sets, use QR decomposition. + */ + linreg_fit_qr (cov, l); + } + else { - gsl_matrix *params; - params = gsl_matrix_calloc (cov->size1, cov->size2); + gsl_matrix *params = gsl_matrix_calloc (cov->size1, cov->size2); gsl_matrix_memcpy (params, cov); reg_sweep (params, l->dependent_column); post_sweep_computations (l, params); gsl_matrix_free (params); } - else if (l->method == LINREG_QR) - { - linreg_fit_qr (cov, l); - } } -double linreg_mse (const linreg *c) +double linreg_mse (const struct linreg *c) { assert (c != NULL); return (c->sse / c->dfe); } -double linreg_intercept (const linreg *c) +double linreg_intercept (const struct linreg *c) { return c->intercept; } const gsl_matrix * -linreg_cov (const linreg *c) +linreg_cov (const struct linreg *c) { return c->cov; } double -linreg_coeff (const linreg *c, size_t i) +linreg_coeff (const struct linreg *c, size_t i) { return (c->coeff[i]); } const struct variable * -linreg_indep_var (const linreg *c, size_t i) +linreg_indep_var (const struct linreg *c, size_t i) { return (c->indep_vars[i]); } +int +linreg_n_indeps (const struct linreg *c) +{ + return c->n_indeps; +} + + +const struct variable * +linreg_dep_var (const struct linreg *c) +{ + return c->depvar; +} + + size_t -linreg_n_coeffs (const linreg *c) +linreg_n_coeffs (const struct linreg *c) { return c->n_coeffs; } double -linreg_n_obs (const linreg *c) +linreg_n_obs (const struct linreg *c) { return c->n_obs; } double -linreg_sse (const linreg *c) +linreg_sse (const struct linreg *c) { return c->sse; } double -linreg_ssreg (const linreg *c) +linreg_ssreg (const struct linreg *c) { return (c->sst - c->sse); } -double linreg_sst (const linreg *c) +double linreg_sst (const struct linreg *c) { return c->sst; } double -linreg_dfmodel ( const linreg *c) +linreg_dfmodel ( const struct linreg *c) { return c->dfm; } +double +linreg_dferror ( const struct linreg *c) +{ + return c->dfe; +} + +double +linreg_dftotal ( const struct linreg *c) +{ + return c->dft; +} + void -linreg_set_depvar_mean (linreg *c, double x) +linreg_set_depvar_mean (struct linreg *c, double x) { c->depvar_mean = x; } double -linreg_get_depvar_mean (const linreg *c) +linreg_get_depvar_mean (const struct linreg *c) { return c->depvar_mean; } diff --git a/src/math/linreg.h b/src/math/linreg.h index 39cda89f29..88da7f6227 100644 --- a/src/math/linreg.h +++ b/src/math/linreg.h @@ -17,50 +17,9 @@ #ifndef LINREG_H #define LINREG_H -#include #include -#include #include -enum -{ - LINREG_CONDITIONAL_INVERSE, - LINREG_QR, - LINREG_SWEEP, -}; - - - -/* - Options describing what special values should be computed. - */ -struct pspp_linreg_opts_struct -{ - int get_depvar_mean_std; - int *get_indep_mean_std; /* Array of booleans - dictating which - independent variables need - their means and standard - deviations computed within - pspp_linreg. This array - MUST be of length - n_indeps. If element i is - 1, pspp_linreg will - compute the mean and - variance of indpendent - variable i. If element i - is 0, it will not compute - the mean and standard - deviation, and assume the - values are stored. - cache->indep_mean[i] is - the mean and - cache->indep_std[i] is the - sample standard deviation. */ -}; -typedef struct pspp_linreg_opts_struct pspp_linreg_opts; - - /* Find the least-squares estimate of b for the linear model: @@ -87,103 +46,48 @@ typedef struct pspp_linreg_opts_struct pspp_linreg_opts; Springer. 1998. ISBN 0-387-98542-5. */ +struct variable; + +struct linreg *linreg_alloc (const struct variable *, const struct variable **, + double, size_t, bool); + +void linreg_unref (struct linreg *); +void linreg_ref (struct linreg *); -struct linreg_struct -{ - double n_obs; /* Number of observations. */ - int n_indeps; /* Number of independent variables. */ - int n_coeffs; /* The intercept is not considered a - coefficient here. */ - - /* - Pointers to the variables. - */ - const struct variable *depvar; - const struct variable **indep_vars; - - double *coeff; - double intercept; - int method; /* Method to use to estimate parameters. */ - /* - Means and standard deviations of the variables. - If these pointers are null when pspp_linreg() is - called, pspp_linreg() will compute their values. - - Entry i of indep_means is the mean of independent - variable i, whose observations are stored in the ith - column of the design matrix. - */ - double depvar_mean; - gsl_vector *indep_means; - gsl_vector *indep_std; - - /* - Sums of squares. - */ - double ssm; /* Sums of squares for the overall model. */ - double sst; /* Sum of squares total. */ - double sse; /* Sum of squares error. */ - double mse; /* Mean squared error. This is just sse / - dfe, but since it is the best unbiased - estimate of the population variance, it - has its own entry here. */ - /* - Covariance matrix of the parameter estimates. - */ - gsl_matrix *cov; - /* - Degrees of freedom. - */ - double dft; - double dfe; - double dfm; - - int dependent_column; /* Column containing the dependent variable. Defaults to last column. */ - int refcnt; - - bool origin; -}; - -typedef struct linreg_struct linreg; - - - -linreg *linreg_alloc (const struct variable *, const struct variable **, - double, size_t, bool); - -void linreg_unref (linreg *); -void linreg_ref (linreg *); +int linreg_n_indeps (const struct linreg *c); /* - Fit the linear model via least squares. All pointers passed to pspp_linreg - are assumed to be allocated to the correct size and initialized to the - values as indicated by opts. - */ -void linreg_fit (const gsl_matrix *, linreg *); + Fit the linear model via least squares. +*/ +void linreg_fit (const gsl_matrix *, struct linreg *); -double linreg_predict (const linreg *, const double *, size_t); -double linreg_residual (const linreg *, double, const double *, size_t); -const struct variable ** linreg_get_vars (const linreg *); +double linreg_predict (const struct linreg *, const double *, size_t); +double linreg_residual (const struct linreg *, double, const double *, size_t); +const struct variable ** linreg_get_vars (const struct linreg *); /* Mean of the independent variable. */ -double linreg_get_indep_variable_mean (const linreg *, size_t); -void linreg_set_indep_variable_mean (linreg *, size_t, double); - -double linreg_mse (const linreg *); - -double linreg_intercept (const linreg *); - -const gsl_matrix * linreg_cov (const linreg *); -double linreg_coeff (const linreg *, size_t); -const struct variable * linreg_indep_var (const linreg *, size_t); -size_t linreg_n_coeffs (const linreg *); -double linreg_n_obs (const linreg *); -double linreg_sse (const linreg *); -double linreg_ssreg (const linreg *); -double linreg_dfmodel (const linreg *); -double linreg_sst (const linreg *); -void linreg_set_depvar_mean (linreg *, double); -double linreg_get_depvar_mean (const linreg *); +double linreg_get_indep_variable_mean (const struct linreg *, size_t); +void linreg_set_indep_variable_mean (struct linreg *, size_t, double); + +double linreg_mse (const struct linreg *); + +double linreg_intercept (const struct linreg *); + +const gsl_matrix * linreg_cov (const struct linreg *); +double linreg_coeff (const struct linreg *, size_t); +const struct variable * linreg_indep_var (const struct linreg *, size_t); +const struct variable * linreg_dep_var (const struct linreg *); +size_t linreg_n_coeffs (const struct linreg *); +double linreg_n_obs (const struct linreg *); +double linreg_sse (const struct linreg *); +double linreg_ssreg (const struct linreg *); +double linreg_dfmodel (const struct linreg *); +double linreg_dferror (const struct linreg *); +double linreg_dftotal (const struct linreg *); +double linreg_sst (const struct linreg *); +void linreg_set_depvar_mean (struct linreg *, double); +double linreg_get_depvar_mean (const struct linreg *); + #endif