From 3aa2d5510442a459946a2e9bada801ac86e07fd4 Mon Sep 17 00:00:00 2001 From: Jason H Stover Date: Fri, 15 Jan 2010 15:17:54 -0500 Subject: [PATCH] Rewrote linreg.[ch], regression.q and glm.q to use the new covariance.[ch]. Dropped covariance-matrix.[ch], design-matrix.[ch] and coefficient.[ch]. --- src/language/stats/glm.q | 19 +- src/language/stats/regression.q | 432 +++++++++---------- src/math/automake.mk | 5 - src/math/linreg.c | 731 +++++++++----------------------- src/math/linreg.h | 82 ++-- 5 files changed, 429 insertions(+), 840 deletions(-) diff --git a/src/language/stats/glm.q b/src/language/stats/glm.q index b4eddc7b..2363571c 100644 --- a/src/language/stats/glm.q +++ b/src/language/stats/glm.q @@ -252,26 +252,13 @@ glm_custom_dependent (struct lexer *lexer, struct dataset *ds, return 1; } -/* - COV is the covariance matrix for variables included in the - model. That means the dependent variable is in there, too. - */ -static void -coeff_init (pspp_linreg_cache * c, const struct design_matrix *cov) -{ - c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff)); - c->n_coeffs = cov->m->size2 - 1; - pspp_coeff_init (c->coeff, cov); -} - - -static pspp_linreg_cache * +static linreg * fit_model (const struct covariance *cov, const struct variable *dep_var, const struct variable ** indep_vars, size_t n_data, size_t n_indep) { - pspp_linreg_cache *result = NULL; + linreg *result = NULL; return result; } @@ -285,7 +272,7 @@ run_glm (struct casereader *input, const struct variable **numerics = NULL; const struct variable **categoricals = NULL; int n_indep = 0; - pspp_linreg_cache *model = NULL; + linreg *model = NULL; pspp_linreg_opts lopts; struct ccase *c; size_t i; diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index 41a23f3e..6e97fd10 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -21,7 +21,6 @@ #include #include #include - #include #include #include @@ -39,8 +38,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -99,7 +97,7 @@ struct reg_trns { int n_trns; /* Number of transformations. */ int trns_id; /* Which trns is this one? */ - pspp_linreg_cache *c; /* Linear model for this trns. */ + linreg *c; /* Linear model for this trns. */ }; /* Variables used (both explanatory and response). @@ -112,31 +110,31 @@ static const struct variable **v_variables; static size_t n_variables; static bool run_regression (struct casereader *, struct cmd_regression *, - struct dataset *, pspp_linreg_cache **); + struct dataset *, linreg **); /* STATISTICS subcommand output functions. */ -static void reg_stats_r (pspp_linreg_cache *); -static void reg_stats_coeff (pspp_linreg_cache *); -static void reg_stats_anova (pspp_linreg_cache *); -static void reg_stats_outs (pspp_linreg_cache *); -static void reg_stats_zpp (pspp_linreg_cache *); -static void reg_stats_label (pspp_linreg_cache *); -static void reg_stats_sha (pspp_linreg_cache *); -static void reg_stats_ci (pspp_linreg_cache *); -static void reg_stats_f (pspp_linreg_cache *); -static void reg_stats_bcov (pspp_linreg_cache *); -static void reg_stats_ses (pspp_linreg_cache *); -static void reg_stats_xtx (pspp_linreg_cache *); -static void reg_stats_collin (pspp_linreg_cache *); -static void reg_stats_tol (pspp_linreg_cache *); -static void reg_stats_selection (pspp_linreg_cache *); -static void statistics_keyword_output (void (*)(pspp_linreg_cache *), - int, pspp_linreg_cache *); +static void reg_stats_r (linreg *); +static void reg_stats_coeff (linreg *); +static void reg_stats_anova (linreg *); +static void reg_stats_outs (linreg *); +static void reg_stats_zpp (linreg *); +static void reg_stats_label (linreg *); +static void reg_stats_sha (linreg *); +static void reg_stats_ci (linreg *); +static void reg_stats_f (linreg *); +static void reg_stats_bcov (linreg *); +static void reg_stats_ses (linreg *); +static void reg_stats_xtx (linreg *); +static void reg_stats_collin (linreg *); +static void reg_stats_tol (linreg *); +static void reg_stats_selection (linreg *); +static void statistics_keyword_output (void (*)(linreg *), + int, linreg *); static void -reg_stats_r (pspp_linreg_cache * c) +reg_stats_r (linreg * c) { struct tab_table *t; int n_rows = 2; @@ -148,7 +146,7 @@ reg_stats_r (pspp_linreg_cache * c) assert (c != NULL); rsq = c->ssm / c->sst; adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps); - std_error = sqrt (pspp_linreg_mse (c)); + std_error = sqrt (linreg_mse (c)); t = tab_create (n_cols, n_rows, 0); tab_dim (t, tab_natural_dimensions, NULL); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); @@ -172,7 +170,7 @@ reg_stats_r (pspp_linreg_cache * c) Table showing estimated regression coefficients. */ static void -reg_stats_coeff (pspp_linreg_cache * c) +reg_stats_coeff (linreg * c) { size_t j; int n_cols = 7; @@ -185,7 +183,6 @@ reg_stats_coeff (pspp_linreg_cache * c) const char *label; const struct variable *v; - const union value *val; struct tab_table *t; assert (c != NULL); @@ -205,66 +202,53 @@ reg_stats_coeff (pspp_linreg_cache * c) tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_double (t, 2, 1, 0, c->intercept, NULL); - std_err = sqrt (gsl_matrix_get (c->cov, 0, 0)); + tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); + std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); tab_double (t, 3, 1, 0, std_err, NULL); tab_double (t, 4, 1, 0, 0.0, NULL); - t_stat = c->intercept / std_err; + t_stat = linreg_intercept (c) / std_err; tab_double (t, 5, 1, 0, t_stat, NULL); pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0); tab_double (t, 6, 1, 0, pval, NULL); - for (j = 0; j < c->n_coeffs; j++) + for (j = 0; j < linreg_n_coeffs (c); j++) { struct string tstr; ds_init_empty (&tstr); this_row = j + 2; - v = pspp_coeff_get_var (c->coeff[j], 0); + v = linreg_indep_var (c, j); label = var_to_string (v); /* Do not overwrite the variable's name. */ ds_put_cstr (&tstr, label); - if (var_is_alpha (v)) - { - /* - Append the value associated with this coefficient. - This makes sense only if we us the usual binary encoding - for that value. - */ - - val = pspp_coeff_get_value (c->coeff[j], v); - - var_append_value_name (v, val, &tstr); - } - tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); /* Regression coefficients. */ - tab_double (t, 2, this_row, 0, c->coeff[j]->estimate, NULL); + tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL); /* Standard error of the coefficients. */ - std_err = sqrt (gsl_matrix_get (c->cov, j + 1, j + 1)); + std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); tab_double (t, 3, this_row, 0, std_err, NULL); /* Standardized coefficient, i.e., regression coefficient if all variables had unit variance. */ - beta = pspp_coeff_get_sd (c->coeff[j]); - beta *= c->coeff[j]->estimate / c->depvar_std; + beta = sqrt (gsl_matrix_get (linreg_cov (c), j, j)); + beta *= linreg_coeff (c, j) / c->depvar_std; tab_double (t, 4, this_row, 0, beta, NULL); /* Test statistic for H0: coefficient is 0. */ - t_stat = c->coeff[j]->estimate / std_err; + t_stat = linreg_coeff (c, j) / std_err; tab_double (t, 5, this_row, 0, t_stat, NULL); /* P values for the test statistic above. */ pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (c->n_obs - c->n_coeffs)); + (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); tab_double (t, 6, this_row, 0, pval, NULL); ds_destroy (&tstr); } @@ -276,12 +260,12 @@ reg_stats_coeff (pspp_linreg_cache * c) Display the ANOVA table. */ static void -reg_stats_anova (pspp_linreg_cache * c) +reg_stats_anova (linreg * c) { int n_cols = 7; int n_rows = 4; - const double msm = c->ssm / c->dfm; - const double mse = pspp_linreg_mse (c); + const double msm = linreg_ssreg (c) / linreg_dfmodel (c); + const double mse = linreg_mse (c); const double F = msm / mse; const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe); @@ -332,40 +316,40 @@ reg_stats_anova (pspp_linreg_cache * c) } static void -reg_stats_outs (pspp_linreg_cache * c) +reg_stats_outs (linreg * c) { assert (c != NULL); } static void -reg_stats_zpp (pspp_linreg_cache * c) +reg_stats_zpp (linreg * c) { assert (c != NULL); } static void -reg_stats_label (pspp_linreg_cache * c) +reg_stats_label (linreg * c) { assert (c != NULL); } static void -reg_stats_sha (pspp_linreg_cache * c) +reg_stats_sha (linreg * c) { assert (c != NULL); } static void -reg_stats_ci (pspp_linreg_cache * c) +reg_stats_ci (linreg * c) { assert (c != NULL); } static void -reg_stats_f (pspp_linreg_cache * c) +reg_stats_f (linreg * c) { assert (c != NULL); } static void -reg_stats_bcov (pspp_linreg_cache * c) +reg_stats_bcov (linreg * c) { int n_cols; int n_rows; @@ -388,13 +372,13 @@ reg_stats_bcov (pspp_linreg_cache * c) tab_vline (t, TAL_0, 1, 0, 0); tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model")); tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances")); - for (i = 0; i < c->n_coeffs; i++) + for (i = 0; i < linreg_n_coeffs (c); i++) { - const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0); + const struct variable *v = linreg_indep_var (c, i); label = var_to_string (v); tab_text (t, 2, i, TAB_CENTER, label); tab_text (t, i + 2, 0, TAB_CENTER, label); - for (k = 1; k < c->n_coeffs; k++) + for (k = 1; k < linreg_n_coeffs (c); k++) { col = (i <= k) ? k : i; row = (i <= k) ? i : k; @@ -406,34 +390,34 @@ reg_stats_bcov (pspp_linreg_cache * c) tab_submit (t); } static void -reg_stats_ses (pspp_linreg_cache * c) +reg_stats_ses (linreg * c) { assert (c != NULL); } static void -reg_stats_xtx (pspp_linreg_cache * c) +reg_stats_xtx (linreg * c) { assert (c != NULL); } static void -reg_stats_collin (pspp_linreg_cache * c) +reg_stats_collin (linreg * c) { assert (c != NULL); } static void -reg_stats_tol (pspp_linreg_cache * c) +reg_stats_tol (linreg * c) { assert (c != NULL); } static void -reg_stats_selection (pspp_linreg_cache * c) +reg_stats_selection (linreg * c) { assert (c != NULL); } static void -statistics_keyword_output (void (*function) (pspp_linreg_cache *), - int keyword, pspp_linreg_cache * c) +statistics_keyword_output (void (*function) (linreg *), + int keyword, linreg * c) { if (keyword) { @@ -442,7 +426,7 @@ statistics_keyword_output (void (*function) (pspp_linreg_cache *), } static void -subcommand_statistics (int *keywords, pspp_linreg_cache * c) +subcommand_statistics (int *keywords, linreg * c) { /* The order here must match the order in which the STATISTICS @@ -531,7 +515,7 @@ regression_trns_free (void *t_) if (t->trns_id == t->n_trns) { - result = pspp_linreg_cache_free (t->c); + result = linreg_free (t->c); } free (t); @@ -548,9 +532,10 @@ regression_trns_pred_proc (void *t_, struct ccase **c, size_t i; size_t n_vals; struct reg_trns *trns = t_; - pspp_linreg_cache *model; + linreg *model; union value *output = NULL; - const union value **vals = NULL; + const union value *tmp; + double *vals; const struct variable **vars = NULL; assert (trns != NULL); @@ -559,21 +544,20 @@ regression_trns_pred_proc (void *t_, struct ccase **c, assert (model->depvar != NULL); assert (model->pred != NULL); - vars = xnmalloc (model->n_coeffs, sizeof (*vars)); - n_vals = (*model->get_vars) (model, vars); - + vars = linreg_get_vars (model); + n_vals = linreg_n_coeffs (model); vals = xnmalloc (n_vals, sizeof (*vals)); *c = case_unshare (*c); + output = case_data_rw (*c, model->pred); for (i = 0; i < n_vals; i++) { - vals[i] = case_data (*c, vars[i]); + tmp = case_data (*c, vars[i]); + vals[i] = tmp->f; } - output->f = (*model->predict) ((const struct variable **) vars, - vals, model, n_vals); + output->f = linreg_predict (model, vals, n_vals); free (vals); - free (vars); return TRNS_CONTINUE; } @@ -587,10 +571,11 @@ regression_trns_resid_proc (void *t_, struct ccase **c, size_t i; size_t n_vals; struct reg_trns *trns = t_; - pspp_linreg_cache *model; + linreg *model; union value *output = NULL; - const union value **vals = NULL; - const union value *obs = NULL; + const union value *tmp; + double *vals = NULL; + double obs; const struct variable **vars = NULL; assert (trns != NULL); @@ -599,8 +584,8 @@ regression_trns_resid_proc (void *t_, struct ccase **c, assert (model->depvar != NULL); assert (model->resid != NULL); - vars = xnmalloc (model->n_coeffs, sizeof (*vars)); - n_vals = (*model->get_vars) (model, vars); + vars = linreg_get_vars (model); + n_vals = linreg_n_coeffs (model); vals = xnmalloc (n_vals, sizeof (*vals)); *c = case_unshare (*c); @@ -609,13 +594,14 @@ regression_trns_resid_proc (void *t_, struct ccase **c, for (i = 0; i < n_vals; i++) { - vals[i] = case_data (*c, vars[i]); + tmp = case_data (*c, vars[i]); + vals[i] = tmp->f; } - obs = case_data (*c, model->depvar); - output->f = (*model->residual) ((const struct variable **) vars, - vals, obs, model, n_vals); + tmp = case_data (*c, model->depvar); + obs = tmp->f; + output->f = linreg_residual (model, obs, vals, n_vals); free (vals); - free (vars); + return TRNS_CONTINUE; } @@ -647,7 +633,7 @@ reg_get_name (const struct dictionary *dict, char name[VAR_NAME_LEN], static void reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, - pspp_linreg_cache * c, struct variable **v, int n_trns) + linreg * c, struct variable **v, int n_trns) { struct dictionary *dict = dataset_dict (ds); static int trns_index = 1; @@ -667,9 +653,9 @@ reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, trns_index++; } static void -subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models) +subcommand_save (struct dataset *ds, int save, linreg ** models) { - pspp_linreg_cache **lc; + linreg **lc; int n_trns = 0; int i; @@ -713,7 +699,7 @@ subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models) { if (*lc != NULL) { - pspp_linreg_cache_free (*lc); + linreg_free (*lc); } } } @@ -724,7 +710,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) { struct casegrouper *grouper; struct casereader *group; - pspp_linreg_cache **models; + linreg **models; bool ok; size_t i; @@ -817,72 +803,94 @@ identify_indep_vars (const struct variable **indep_vars, } return n_indep_vars; } - -/* Encode categorical variables. - Returns number of valid cases. */ -static int -prepare_categories (struct casereader *input, - const struct variable **vars, size_t n_vars, - struct moments_var *mom) +static double +fill_covariance (gsl_matrix *cov, struct covariance *all_cov, + const struct variable **vars, + size_t n_vars, const struct variable *dep_var, + const struct variable **all_vars, size_t n_all_vars) { - int n_data; - struct ccase *c; size_t i; - - assert (vars != NULL); - assert (mom != NULL); - - for (i = 0; i < n_vars; i++) - if (var_is_alpha (vars[i])) - cat_stored_values_create (vars[i]); - - n_data = 0; - for (; (c = casereader_read (input)) != NULL; case_unref (c)) + size_t j; + size_t k = 0; + size_t dep_subscript; + size_t *rows; + const gsl_matrix *ssizes; + const gsl_matrix *cm; + double result = 0.0; + + cm = covariance_calculate (all_cov); + rows = xnmalloc (cov->size1 - 1, sizeof (*rows)); + + for (i = 0; i < n_all_vars; i++) { - /* - The second condition ensures the program will run even if - there is only one variable to act as both explanatory and - response. - */ - for (i = 0; i < n_vars; i++) + for (j = k; j < n_vars; j++) { - const union value *val = case_data (c, vars[i]); - if (var_is_alpha (vars[i])) - cat_value_update (vars[i], val); - else - moments1_add (mom[i].m, val->f, 1.0); + if (vars[j] == all_vars[i]) + { + if (vars[j] != dep_var) + { + rows[j] = i; + } + else + { + dep_subscript = i; + } + k++; + break; + } } - n_data++; } - casereader_destroy (input); - - return n_data; -} - -static void -coeff_init (pspp_linreg_cache * c, struct design_matrix *dm) -{ - c->coeff = xnmalloc (dm->m->size2, sizeof (*c->coeff)); - pspp_coeff_init (c->coeff, dm); + for (i = 0; i < cov->size1 - 1; i++) + { + for (j = 0; j < cov->size2 - 1; j++) + { + gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j])); + gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i])); + } + } + ssizes = covariance_moments (all_cov, MOMENT_NONE); + result = gsl_matrix_get (ssizes, dep_subscript, rows[0]); + for (i = 0; i < cov->size1 - 1; i++) + { + gsl_matrix_set (cov, i, cov->size1 - 1, + gsl_matrix_get (cm, rows[i], dep_subscript)); + gsl_matrix_set (cov, cov->size1 - 1, i, + gsl_matrix_get (cm, rows[i], dep_subscript)); + if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript)) + { + result = gsl_matrix_get (ssizes, rows[i], dep_subscript); + } + } + free (rows); } static bool run_regression (struct casereader *input, struct cmd_regression *cmd, - struct dataset *ds, pspp_linreg_cache **models) + struct dataset *ds, linreg **models) { size_t i; int n_indep = 0; int k; + double n_data; struct ccase *c; - const struct variable **indep_vars; - struct design_matrix *X; - struct moments_var *mom; - gsl_vector *Y; - - pspp_linreg_opts lopts; + struct covariance *cov; + const struct variable **vars; + const struct variable *dep_var; + struct casereader *reader; + const struct dictionary *dict; + gsl_matrix *this_cm; assert (models != NULL); + for (i = 0; i < n_variables; i++) + { + if (!var_is_numeric (v_variables[i])) + { + msg (SE, _("REGRESSION requires numeric variables.")); + return false; + } + } + c = casereader_peek (input, 0); if (c == NULL) { @@ -892,125 +900,67 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, output_split_file_values (ds, c); case_unref (c); + dict = dataset_dict (ds); if (!v_variables) { - dict_get_vars (dataset_dict (ds), &v_variables, &n_variables, 0); + dict_get_vars (dict, &v_variables, &n_variables, 0); } - - for (i = 0; i < cmd->n_dependent; i++) + vars = xnmalloc (n_variables, sizeof (*vars)); + cov = covariance_1pass_create (n_variables, v_variables, + dict_get_weight (dict), MV_ANY); + + reader = casereader_clone (input); + reader = casereader_create_filter_missing (reader, v_variables, n_variables, + MV_ANY, NULL, NULL); + for (; (c = casereader_read (reader)) != NULL; case_unref (c)) { - if (!var_is_numeric (cmd->v_dependent[i])) - { - msg (SE, _("Dependent variable must be numeric.")); - return false; - } + covariance_accumulate (cov, c); } - - mom = xnmalloc (n_variables, sizeof (*mom)); - for (i = 0; i < n_variables; i++) - { - (mom + i)->m = moments1_create (MOMENT_VARIANCE); - (mom + i)->v = v_variables[i]; - } - lopts.get_depvar_mean_std = 1; - - - indep_vars = xnmalloc (n_variables, sizeof *indep_vars); - + for (k = 0; k < cmd->n_dependent; k++) { - const struct variable *dep_var; - struct casereader *reader; - casenumber row; - struct ccase *c; - size_t n_data; /* Number of valid cases. */ - dep_var = cmd->v_dependent[k]; - n_indep = identify_indep_vars (indep_vars, dep_var); - reader = casereader_clone (input); - reader = casereader_create_filter_missing (reader, indep_vars, n_indep, - MV_ANY, NULL, NULL); - reader = casereader_create_filter_missing (reader, &dep_var, 1, - MV_ANY, NULL, NULL); - n_data = prepare_categories (casereader_clone (reader), - indep_vars, n_indep, mom); - - if ((n_data > 0) && (n_indep > 0)) + n_indep = identify_indep_vars (vars, dep_var); + + this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); + n_data = fill_covariance (this_cm, cov, vars, n_indep, + dep_var, v_variables, n_variables); + models[k] = linreg_alloc (dep_var, (const struct variable **) vars, + n_data, n_indep); + models[k]->depvar = dep_var; + + /* + For large data sets, use QR decomposition. + */ + if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) + { + models[k]->method = LINREG_QR; + } + + if (n_data > 0) { - Y = gsl_vector_alloc (n_data); - X = - design_matrix_create (n_indep, - (const struct variable **) indep_vars, - n_data); - lopts.get_indep_mean_std = xnmalloc (X->m->size2, sizeof (int)); - for (i = 0; i < X->m->size2; i++) - { - lopts.get_indep_mean_std[i] = 1; - } - models[k] = pspp_linreg_cache_alloc (dep_var, (const struct variable **) indep_vars, - X->m->size1, n_indep); - models[k]->depvar = dep_var; - /* - For large data sets, use QR decomposition. - */ - if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) - { - models[k]->method = PSPP_LINREG_QR; - } - - /* - The second pass fills the design matrix. - */ - reader = casereader_create_counter (reader, &row, -1); - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) - { - for (i = 0; i < n_indep; ++i) - { - const struct variable *v = indep_vars[i]; - const union value *val = case_data (c, v); - if (var_is_alpha (v)) - design_matrix_set_categorical (X, row, v, val); - else - design_matrix_set_numeric (X, row, v, val); - } - gsl_vector_set (Y, row, case_num (c, dep_var)); - } - /* - Now that we know the number of coefficients, allocate space - and store pointers to the variables that correspond to the - coefficients. - */ - coeff_init (models[k], X); - /* - Find the least-squares estimates and other statistics. - */ - pspp_linreg ((const gsl_vector *) Y, X, &lopts, models[k]); - + Find the least-squares estimates and other statistics. + */ + linreg_fit (this_cm, models[k]); + if (!taint_has_tainted_successor (casereader_get_taint (input))) { subcommand_statistics (cmd->a_statistics, models[k]); } - - gsl_vector_free (Y); - design_matrix_destroy (X); - free (lopts.get_indep_mean_std); } else { msg (SE, gettext ("No valid data found. This command was skipped.")); } - casereader_destroy (reader); - } - for (i = 0; i < n_variables; i++) - { - moments1_destroy ((mom + i)->m); } - free (mom); - free (indep_vars); - casereader_destroy (input); + casereader_destroy (reader); + free (vars); + casereader_destroy (input); + covariance_destroy (cov); + return true; } diff --git a/src/math/automake.mk b/src/math/automake.mk index 053ee440..2dc398a7 100644 --- a/src/math/automake.mk +++ b/src/math/automake.mk @@ -11,15 +11,10 @@ src_math_libpspp_math_la_SOURCES = \ src/math/chart-geometry.c \ src/math/chart-geometry.h \ src/math/box-whisker.c src/math/box-whisker.h \ - src/math/coefficient.c \ - src/math/coefficient.h \ src/math/categoricals.h \ src/math/categoricals.c \ src/math/covariance.c \ src/math/covariance.h \ - src/math/covariance-matrix.c \ - src/math/covariance-matrix.h \ - src/math/design-matrix.c src/math/design-matrix.h \ src/math/extrema.c src/math/extrema.h \ src/math/group.c src/math/group.h \ src/math/group-proc.h \ diff --git a/src/math/linreg.c b/src/math/linreg.c index c03af467..b6d1c16c 100644 --- a/src/math/linreg.c +++ b/src/math/linreg.c @@ -19,12 +19,10 @@ #include #include #include +#include #include #include -#include #include -#include -#include #include #include #include @@ -57,93 +55,23 @@ */ -/* - Get the mean and standard deviation of a vector - of doubles via a form of the Kalman filter as - described on page 32 of [3]. - */ -static int -linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp) -{ - size_t i; - double j = 0.0; - double d; - double tmp; - double mean; - double variance; - - mean = gsl_vector_get (&v.vector, 0); - variance = 0; - for (i = 1; i < v.vector.size; i++) - { - j = (double) i + 1.0; - tmp = gsl_vector_get (&v.vector, i); - d = (tmp - mean) / j; - mean += d; - variance += j * (j - 1.0) * d * d; - } - *mp = mean; - *sp = sqrt (variance / (j - 1.0)); - *ssp = variance; - - return GSL_SUCCESS; -} - -/* - Set V to contain an array of pointers to the variables - used in the model. V must be at least C->N_COEFFS in length. - The return value is the number of distinct variables found. - */ -int -pspp_linreg_get_vars (const void *c_, const struct variable **v) +const struct variable ** +linreg_get_vars (const linreg *c) { - const pspp_linreg_cache *c = c_; - const struct variable *tmp; - int i; - int j; - int result = 0; - - /* - Make sure the caller doesn't try to sneak a variable - into V that is not in the model. - */ - for (i = 0; i < c->n_coeffs; i++) - { - v[i] = NULL; - } - for (j = 0; j < c->n_coeffs; j++) - { - tmp = pspp_coeff_get_var (c->coeff[j], 0); - assert (tmp != NULL); - /* Repeated variables are likely to bunch together, at the end - of the array. */ - i = result - 1; - while (i >= 0 && v[i] != tmp) - { - i--; - } - if (i < 0 && result < c->n_coeffs) - { - v[result] = tmp; - result++; - } - } - return result; + return c->indep_vars; } /* - Allocate a pspp_linreg_cache and return a pointer - to it. n is the number of cases, p is the number of - independent variables. + Allocate a linreg and return a pointer to it. n is the number of + cases, p is the number of independent variables. */ -pspp_linreg_cache * -pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable **indep_vars, - size_t n, size_t p) +linreg * +linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, + double n, size_t p) { - size_t i; - pspp_linreg_cache *c; + linreg *c; - c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache)); + c = (linreg *) malloc (sizeof (linreg)); c->depvar = depvar; c->indep_vars = indep_vars; c->indep_means = gsl_vector_alloc (p); @@ -156,30 +84,17 @@ pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable ** */ c->n_obs = n; c->n_indeps = p; - c->n_coeffs = 0; - for (i = 0; i < p; i++) - { - if (var_is_numeric (indep_vars[i])) - { - c->n_coeffs++; - } - else - { - c->n_coeffs += cat_get_n_categories (indep_vars[i]) - 1; - } - } - + c->n_coeffs = p; + c->coeff = xnmalloc (p, sizeof (*c->coeff)); c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1); + c->dft = n - 1; + c->dfm = p; + c->dfe = c->dft - c->dfm; + c->intercept = 0.0; /* Default settings. */ - c->method = PSPP_LINREG_SWEEP; - c->predict = pspp_linreg_predict; - c->residual = pspp_linreg_residual; /* The procedure to compute my - residuals. */ - c->get_vars = pspp_linreg_get_vars; /* The procedure that returns - pointers to model - variables. */ + c->method = LINREG_SWEEP; c->resid = NULL; /* The variable storing my residuals. */ c->pred = NULL; /* The variable storing my predicted values. */ @@ -187,40 +102,23 @@ pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable ** } bool -pspp_linreg_cache_free (void *m) +linreg_free (void *m) { - int i; - - pspp_linreg_cache *c = m; + linreg *c = m; if (c != NULL) { gsl_vector_free (c->indep_means); gsl_vector_free (c->indep_std); - gsl_vector_free (c->ss_indeps); gsl_matrix_free (c->cov); gsl_vector_free (c->ssx); - for (i = 0; i < c->n_coeffs; i++) - { - pspp_coeff_free (c->coeff[i]); - } free (c->coeff); free (c); } return true; } -static void -cache_init (pspp_linreg_cache *cache) -{ - assert (cache != NULL); - cache->dft = cache->n_obs - 1; - cache->dfm = cache->n_indeps; - cache->dfe = cache->dft - cache->dfm; - cache->intercept = 0.0; -} static void -post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *dm, - gsl_matrix *sw) +post_sweep_computations (linreg *l, gsl_matrix *sw) { gsl_matrix *xm; gsl_matrix_view xtx; @@ -232,19 +130,19 @@ post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *d int rc; assert (sw != NULL); - assert (cache != NULL); + assert (l != NULL); - cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps); - cache->mse = cache->sse / cache->dfe; + l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps); + l->mse = l->sse / l->dfe; /* Get the intercept. */ - m = cache->depvar_mean; - for (i = 0; i < cache->n_indeps; i++) + m = l->depvar_mean; + for (i = 0; i < l->n_indeps; i++) { - tmp = gsl_matrix_get (sw, i, cache->n_indeps); - cache->coeff[i]->estimate = tmp; - m -= tmp * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i)); + tmp = gsl_matrix_get (sw, i, l->n_indeps); + l->coeff[i] = tmp; + m -= tmp * linreg_get_indep_variable_mean (l, i); } /* Get the covariance matrix of the parameter estimates. @@ -255,37 +153,37 @@ post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *d The loops below do not compute the entries related to the estimated intercept. */ - for (i = 0; i < cache->n_indeps; i++) - for (j = i; j < cache->n_indeps; j++) + for (i = 0; i < l->n_indeps; i++) + for (j = i; j < l->n_indeps; j++) { - tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j); - gsl_matrix_set (cache->cov, i + 1, j + 1, tmp); + tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j); + gsl_matrix_set (l->cov, i + 1, j + 1, tmp); } /* Get the covariances related to the intercept. */ - xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps); - xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps); - xm = gsl_matrix_calloc (1, cache->n_indeps); + xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps); + xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps); + xm = gsl_matrix_calloc (1, l->n_indeps); for (i = 0; i < xm->size2; i++) { gsl_matrix_set (xm, 0, i, - pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i))); + linreg_get_indep_variable_mean (l, i)); } - rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse, + rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse, &xtx.matrix, xm, 0.0, &xmxtx.matrix); gsl_matrix_free (xm); if (rc == GSL_SUCCESS) { - tmp = cache->mse / cache->n_obs; - for (i = 1; i < 1 + cache->n_indeps; i++) + tmp = l->mse / l->n_obs; + for (i = 1; i < 1 + l->n_indeps; i++) { - tmp -= gsl_matrix_get (cache->cov, 0, i) - * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i - 1)); + tmp -= gsl_matrix_get (l->cov, 0, i) + * linreg_get_indep_variable_mean (l, i - 1); } - gsl_matrix_set (cache->cov, 0, 0, tmp); + gsl_matrix_set (l->cov, 0, 0, tmp); - cache->intercept = m; + l->intercept = m; } else { @@ -294,246 +192,20 @@ post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *d exit (rc); } } - -/* - Fit the linear model via least squares. All pointers passed to pspp_linreg - are assumed to be allocated to the correct size and initialized to the - values as indicated by opts. - */ -int -pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm, - const pspp_linreg_opts * opts, pspp_linreg_cache * cache) -{ - int rc; - gsl_matrix *design = NULL; - gsl_matrix_view xtx; - gsl_vector_view xty; - gsl_vector_view xi; - gsl_vector_view xj; - gsl_vector *param_estimates; - struct pspp_coeff *coef; - const struct variable *v; - const union value *val; - - size_t i; - size_t j; - double tmp; - double m; - double s; - double ss; - - if (cache == NULL) - { - return GSL_EFAULT; - } - if (opts->get_depvar_mean_std) - { - linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size), - &m, &s, &ss); - cache->depvar_mean = m; - cache->depvar_std = s; - cache->sst = ss; - } - cache_init (cache); - cache->n_coeffs = dm->m->size2; - for (i = 0; i < dm->m->size2; i++) - { - if (opts->get_indep_mean_std[i]) - { - linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss); - v = design_matrix_col_to_var (dm, i); - val = NULL; - if (var_is_alpha (v)) - { - j = i - design_matrix_var_to_column (dm, v); - val = cat_subscript_to_value (j, v); - } - coef = pspp_linreg_get_coeff (cache, v, val); - pspp_coeff_set_mean (coef, m); - pspp_coeff_set_sd (coef, s); - gsl_vector_set (cache->ssx, i, ss); - - } - } - - if (cache->method == PSPP_LINREG_SWEEP) - { - gsl_matrix *sw; - /* - Subtract the means to improve the condition of the design - matrix. This requires copying dm->m and Y. We do not divide by the - standard deviations of the independent variables here since doing - so would cause a miscalculation of the residual sums of - squares. Dividing by the standard deviation is done GSL's linear - regression functions, so if the design matrix has a poor - condition, use QR decomposition. - - The design matrix here does not include a column for the intercept - (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column, - so design is allocated here when sweeping, or below if using QR. - */ - design = gsl_matrix_alloc (dm->m->size1, dm->m->size2); - for (i = 0; i < dm->m->size2; i++) - { - v = design_matrix_col_to_var (dm, i); - m = pspp_linreg_get_indep_variable_mean (cache, v); - for (j = 0; j < dm->m->size1; j++) - { - tmp = (gsl_matrix_get (dm->m, j, i) - m); - gsl_matrix_set (design, j, i, tmp); - } - } - sw = gsl_matrix_calloc (cache->n_coeffs + 1, cache->n_coeffs + 1); - xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_coeffs, cache->n_coeffs); - - for (i = 0; i < xtx.matrix.size1; i++) - { - tmp = gsl_vector_get (cache->ssx, i); - gsl_matrix_set (&(xtx.matrix), i, i, tmp); - xi = gsl_matrix_column (design, i); - for (j = (i + 1); j < xtx.matrix.size2; j++) - { - xj = gsl_matrix_column (design, j); - gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp); - gsl_matrix_set (&(xtx.matrix), i, j, tmp); - } - } - - gsl_matrix_set (sw, cache->n_coeffs, cache->n_coeffs, cache->sst); - xty = gsl_matrix_column (sw, cache->n_coeffs); - /* - This loop starts at 1, with i=0 outside the loop, so we can get - the model sum of squares due to the first independent variable. - */ - xi = gsl_matrix_column (design, 0); - gsl_blas_ddot (&(xi.vector), Y, &tmp); - gsl_vector_set (&(xty.vector), 0, tmp); - tmp *= tmp / gsl_vector_get (cache->ssx, 0); - gsl_vector_set (cache->ss_indeps, 0, tmp); - for (i = 1; i < cache->n_coeffs; i++) - { - xi = gsl_matrix_column (design, i); - gsl_blas_ddot (&(xi.vector), Y, &tmp); - gsl_vector_set (&(xty.vector), i, tmp); - } - - /* - Sweep on the matrix sw, which contains XtX, XtY and YtY. - */ - reg_sweep (sw); - post_sweep_computations (cache, dm, sw); - gsl_matrix_free (sw); - } - else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE) - { - /* - Use the SVD of X^T X to find a conditional inverse of X^TX. If - the SVD is X^T X = U D V^T, then set the conditional inverse - to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry - (i, i) has value sigma_i, then entry (i, i) of D^- is 1 / - sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal - equations by setting the estimated parameter vector to - (X^TX)^c X^T Y. - */ - } - else - { - gsl_multifit_linear_workspace *wk; - /* - Use QR decomposition via GSL. - */ - - param_estimates = gsl_vector_alloc (1 + dm->m->size2); - design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2); - - for (j = 0; j < dm->m->size1; j++) - { - gsl_matrix_set (design, j, 0, 1.0); - for (i = 0; i < dm->m->size2; i++) - { - tmp = gsl_matrix_get (dm->m, j, i); - gsl_matrix_set (design, j, i + 1, tmp); - } - } - - wk = gsl_multifit_linear_alloc (design->size1, design->size2); - rc = gsl_multifit_linear (design, Y, param_estimates, - cache->cov, &(cache->sse), wk); - for (i = 0; i < cache->n_coeffs; i++) - { - cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1); - } - cache->intercept = gsl_vector_get (param_estimates, 0); - if (rc == GSL_SUCCESS) - { - gsl_multifit_linear_free (wk); - gsl_vector_free (param_estimates); - } - else - { - fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n", - __FILE__, __LINE__, rc); - } - } - - - cache->ssm = cache->sst - cache->sse; - /* - Get the remaining sums of squares for the independent - variables. - */ - m = 0; - for (i = 1; i < cache->n_indeps; i++) - { - j = i - 1; - m += gsl_vector_get (cache->ss_indeps, j); - tmp = cache->ssm - m; - gsl_vector_set (cache->ss_indeps, i, tmp); - } - - gsl_matrix_free (design); - return GSL_SUCCESS; -} /* - Is the coefficient COEF contained in the list of coefficients - COEF_LIST? - */ -static int -has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef, - size_t n) -{ - size_t i = 0; - - while (i < n) - { - if (coef_list[i] == coef) - { - return 1; - } - i++; - } - return 0; -} -/* - Predict the value of the dependent variable with the - new set of predictors. PREDICTORS must point to a list - of variables, each of whose values are stored in VALS, - in the same order. + Predict the value of the dependent variable with the new set of + predictors. VALS are assumed to be in the order corresponding to the + order of the coefficients in the linreg struct. */ double -pspp_linreg_predict (const struct variable **predictors, - const union value **vals, const void *c_, int n_vals) +linreg_predict (const linreg *c, const double *vals, size_t n_vals) { - const pspp_linreg_cache *c = c_; - int j; - size_t next_coef = 0; - const struct pspp_coeff **coef_list; - const struct pspp_coeff *coe; + size_t j; double result; - double tmp; - if (predictors == NULL || vals == NULL || c == NULL) + assert (n_vals = c->n_coeffs); + if (vals == NULL || c == NULL) { return GSL_NAN; } @@ -542,206 +214,213 @@ pspp_linreg_predict (const struct variable **predictors, /* The stupid model: just guess the mean. */ return c->depvar_mean; } - coef_list = xnmalloc (c->n_coeffs, sizeof (*coef_list)); result = c->intercept; - /* - The loops guard against the possibility that the caller passed us - inadequate information, such as too few or too many values, or - a redundant list of variable names. - */ for (j = 0; j < n_vals; j++) { - coe = pspp_linreg_get_coeff (c, predictors[j], vals[j]); - if (!has_coefficient (coef_list, coe, next_coef)) - { - tmp = pspp_coeff_get_est (coe); - if (var_is_numeric (predictors[j])) - { - tmp *= vals[j]->f; - } - result += tmp; - coef_list[next_coef++] = coe; - } + result += linreg_coeff (c, j) * vals[j]; } - free (coef_list); return result; } double -pspp_linreg_residual (const struct variable **predictors, - const union value **vals, - const union value *obs, const void *c, int n_vals) +linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals) { - double pred; - double result; - - if (predictors == NULL || vals == NULL || c == NULL || obs == NULL) + if (vals == NULL || c == NULL) { return GSL_NAN; } - pred = pspp_linreg_predict (predictors, vals, c, n_vals); - - result = isnan (pred) ? GSL_NAN : (obs->f - pred); - return result; + return (obs - linreg_predict (c, vals, n_vals)); } -/* - Which coefficient is associated with V? The VAL argument is relevant - only to categorical variables. - */ -struct pspp_coeff * -pspp_linreg_get_coeff (const pspp_linreg_cache * c, - const struct variable *v, const union value *val) -{ - if (c == NULL) - { - return NULL; - } - if (c->coeff == NULL || c->n_indeps == 0 || v == NULL) - { - return NULL; - } - return pspp_coeff_var_to_coeff (v, c->coeff, c->n_coeffs, val); -} -/* - Return the standard deviation of the independent variable. - */ -double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v) +double linreg_get_indep_variable_sd (linreg *c, size_t j) { - if (var_is_numeric (v)) - { - const struct pspp_coeff *coef; - coef = pspp_linreg_get_coeff (c, v, NULL); - return pspp_coeff_get_sd (coef); - } - return GSL_NAN; + assert (c != NULL); + return gsl_vector_get (c->indep_std, j); } -void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v, - double s) +void linreg_set_indep_variable_sd (linreg *c, size_t j, double s) { - if (var_is_numeric (v)) - { - struct pspp_coeff *coef; - coef = pspp_linreg_get_coeff (c, v, NULL); - pspp_coeff_set_sd (coef, s); - } + assert (c != NULL); + gsl_vector_set (c->indep_std, j, s); } /* Mean of the independent variable. */ -double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v) +double linreg_get_indep_variable_mean (linreg *c, size_t j) { - if (var_is_numeric (v)) - { - struct pspp_coeff *coef; - coef = pspp_linreg_get_coeff (c, v, NULL); - return pspp_coeff_get_mean (coef); - } - return 0.0; + assert (c != NULL); + return gsl_vector_get (c->indep_means, j); } -void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v, - double m) +void linreg_set_indep_variable_mean (linreg *c, size_t j, double m) { - if (var_is_numeric (v)) - { - struct pspp_coeff *coef; - coef = pspp_linreg_get_coeff (c, v, NULL); - pspp_coeff_set_mean (coef, m); - } + assert (c != NULL); + gsl_vector_set (c->indep_means, j, m); } -/* - Make sure the dependent variable is at the last column, and that - only variables in the model are in the covariance matrix. - */ -static struct design_matrix * -rearrange_covariance_matrix (const struct covariance_matrix *cm, pspp_linreg_cache *c) +static void +linreg_fit_qr (const gsl_matrix *cov, linreg *l) { - const struct variable **model_vars; - struct design_matrix *cov; - struct design_matrix *result; - size_t *permutation; + gsl_matrix *xtx; + gsl_matrix *q; + gsl_matrix *r; + gsl_vector *xty; + gsl_vector *tau; + gsl_vector *params; + double tmp = 0.0; size_t i; size_t j; - size_t k; - size_t n_coeffs = 0; - assert (cm != NULL); - cov = covariance_to_design (cm); - assert (cov != NULL); - assert (c != NULL); - assert (cov->m->size1 > 0); - assert (cov->m->size2 == cov->m->size1); - model_vars = xnmalloc (1 + c->n_indeps, sizeof (*model_vars)); + xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1); + xty = gsl_vector_alloc (cov->size1 - 1); + tau = gsl_vector_alloc (cov->size1 - 1); + params = gsl_vector_alloc (cov->size1 - 1); - /* - Put the model variables in the right order in MODEL_VARS. - Count the number of coefficients. - */ - for (i = 0; i < c->n_indeps; i++) + for (i = 0; i < xtx->size1; i++) { - model_vars[i] = c->indep_vars[i]; + gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i)); + for (j = 0; j < xtx->size2; j++) + { + gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j)); + } + } + gsl_linalg_QR_decomp (xtx, tau); + q = gsl_matrix_alloc (xtx->size1, xtx->size2); + r = gsl_matrix_alloc (xtx->size1, xtx->size2); + gsl_linalg_QR_unpack (xtx, tau, q, r); + gsl_linalg_QR_solve (xtx, tau, xty, params); + for (i = 0; i < params->size; i++) + { + l->coeff[i] = gsl_vector_get (params, i); + } + + l->intercept = l->depvar_mean; + for (i = 0; i < l->n_indeps; i++) + { + l->intercept -= l->coeff[i] * linreg_get_indep_variable_mean (l, i); } - model_vars[i] = c->depvar; - result = covariance_matrix_create (1 + c->n_indeps, model_vars); - permutation = xnmalloc (design_matrix_get_n_cols (result), sizeof (*permutation)); - for (j = 0; j < cov->m->size2; j++) + l->sse = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1); + for (i = 0; i < l->n_indeps; i++) + { + tmp += gsl_vector_get (xty, i) * l->coeff[i]; + } + l->sse -= 2.0 * tmp; + for (i = 0; i < xtx->size1; i++) { - k = 0; - while (k < result->m->size2) + tmp = 0.0; + for (j = i; j < xtx->size2; j++) { - if (design_matrix_col_to_var (cov, j) == design_matrix_col_to_var (result, k)) - { - permutation[k] = j; - } - k++; + tmp += gsl_matrix_get (xtx, i, j) * l->coeff[j]; } + l->sse += tmp * tmp; } - for (i = 0; i < result->m->size1; i++) - for (j = 0; j < result->m->size2; j++) - { - gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, permutation[i], permutation[j])); - } - free (permutation); - free (model_vars); - return result; + +#if 0 + p = l->hat->size1 - 1; + for (i = 0; i < l->cov->size1 - 1; i++) + { + gsl_matrix_set (l->cov, p - i, p - i, 1.0 / gsl_matrix_get (r, p - i, p - i)); + for (j = 0; j < i; j++) + { + tmp = -1.0 * gsl_matrix_get (r, p - i, p - j); + tmp /= gsl_matrix_get (r, p - i, p - i) * gsl_matrix_get (r, p - j, p - j); + gsl_matrix_set (l->cov, p - i, p - j, tmp); + } + } +#endif + gsl_matrix_transpose_memcpy (l->cov, q); + gsl_blas_dtrsm (CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 1.0, + r, l->cov); + + gsl_matrix_free (q); + gsl_matrix_free (r); + gsl_vector_free (xty); + gsl_vector_free (tau); + gsl_matrix_free (xtx); + gsl_vector_free (params); } + /* - Estimate the model parameters from the covariance matrix only. This - method uses less memory than PSPP_LINREG, which requires the entire - data set to be stored in memory. - - The function assumes FULL_COV may contain columns corresponding to - variables that are not in the model. It fixes this in - REARRANG_COVARIANCE_MATRIX. This allows the caller to compute a - large covariance matrix once before, then pass it to this without - having to alter it. The problem is that this means the caller must - set CACHE->N_COEFFS. + Estimate the model parameters from the covariance matrix. This + function assumes the covariance entries corresponding to the + dependent variable are in the final row and column of the covariance + matrix. */ void -pspp_linreg_with_cov (const struct covariance_matrix *full_cov, - pspp_linreg_cache * cache) +linreg_fit (const gsl_matrix *cov, linreg *l) { - struct design_matrix *cov; + gsl_matrix *params; + assert (l != NULL); + assert (cov != NULL); - assert (full_cov != NULL); - assert (cache != NULL); + params = gsl_matrix_calloc (cov->size1, cov->size2); + gsl_matrix_memcpy (params, cov); - cov = rearrange_covariance_matrix (full_cov, cache); - cache_init (cache); - reg_sweep (cov->m); - post_sweep_computations (cache, cov, cov->m); - design_matrix_destroy (cov); + if (l->method == LINREG_SWEEP) + { + reg_sweep (params); + post_sweep_computations (l, params); + } + else if (l->method == LINREG_QR) + { + linreg_fit_qr (params, l); + } + gsl_matrix_free (params); } -double pspp_linreg_mse (const pspp_linreg_cache *c) +double linreg_mse (const linreg *c) { assert (c != NULL); return (c->sse / c->dfe); } + +double linreg_intercept (const linreg *c) +{ + return c->intercept; +} + +gsl_matrix * +linreg_cov (const linreg *c) +{ + return c->cov; +} + +double +linreg_coeff (const linreg *c, size_t i) +{ + return (c->coeff[i]); +} + +const struct variable * +linreg_indep_var (const linreg *c, size_t i) +{ + return (c->indep_vars[i]); +} + +size_t +linreg_n_coeffs (const linreg *c) +{ + return c->n_coeffs; +} + +size_t +linreg_n_obs (const linreg *c) +{ + return c->n_obs; +} + +double +linreg_ssreg (const linreg *c) +{ + return c->ssm; +} + +double +linreg_dfmodel ( const linreg *c) +{ + return c->dfm; +} diff --git a/src/math/linreg.h b/src/math/linreg.h index f9d8c9b3..f6c7e2e8 100644 --- a/src/math/linreg.h +++ b/src/math/linreg.h @@ -20,14 +20,12 @@ #include #include #include -#include -#include enum { - PSPP_LINREG_CONDITIONAL_INVERSE, - PSPP_LINREG_QR, - PSPP_LINREG_SWEEP, + LINREG_CONDITIONAL_INVERSE, + LINREG_QR, + LINREG_SWEEP, }; @@ -89,9 +87,9 @@ typedef struct pspp_linreg_opts_struct pspp_linreg_opts; */ -struct pspp_linreg_cache_struct +struct linreg_struct { - int n_obs; /* Number of observations. */ + double n_obs; /* Number of observations. */ int n_indeps; /* Number of independent variables. */ int n_coeffs; /* The intercept is not considered a coefficient here. */ @@ -102,8 +100,7 @@ struct pspp_linreg_cache_struct const struct variable *depvar; const struct variable **indep_vars; - gsl_vector *residuals; - struct pspp_coeff **coeff; + double *coeff; double intercept; int method; /* Method to use to estimate parameters. */ /* @@ -154,73 +151,54 @@ struct pspp_linreg_cache_struct */ gsl_matrix *hat; - double (*predict) (const struct variable **, const union value **, - const void *, int); - double (*residual) (const struct variable **, - const union value **, - const union value *, const void *, int); - /* - Returns pointers to the variables used in the model. - */ - int (*get_vars) (const void *, const struct variable **); struct variable *resid; struct variable *pred; }; -typedef struct pspp_linreg_cache_struct pspp_linreg_cache; +typedef struct linreg_struct linreg; -/* - Allocate a pspp_linreg_cache and return a pointer - to it. n is the number of cases, p is the number of - independent variables. - */ -pspp_linreg_cache *pspp_linreg_cache_alloc (const struct variable *, const struct variable **, - size_t, size_t); +linreg *linreg_alloc (const struct variable *, const struct variable **, + double, size_t); -bool pspp_linreg_cache_free (void *); +bool linreg_free (void *); /* Fit the linear model via least squares. All pointers passed to pspp_linreg are assumed to be allocated to the correct size and initialized to the values as indicated by opts. */ -int -pspp_linreg (const gsl_vector *, const struct design_matrix *, - const pspp_linreg_opts *, pspp_linreg_cache *); +void +linreg_fit (const gsl_matrix *, linreg *); double -pspp_linreg_predict (const struct variable **, const union value **, - const void *, int); +linreg_predict (const linreg *, const double *, size_t); double -pspp_linreg_residual (const struct variable **, const union value **, - const union value *, const void *, int); -/* - All variables used in the model. - */ -int pspp_linreg_get_vars (const void *, const struct variable **); +linreg_residual (const linreg *, double, const double *, size_t); +const struct variable ** linreg_get_vars (const linreg *); -struct pspp_coeff *pspp_linreg_get_coeff (const pspp_linreg_cache - *, - const struct variable - *, - const union value *); /* Return or set the standard deviation of the independent variable. */ -double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *, const struct variable *); -void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *, const struct variable *, double); +double linreg_get_indep_variable_sd (linreg *, size_t); +void linreg_set_indep_variable_sd (linreg *, size_t, double); /* Mean of the independent variable. */ -double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *, const struct variable *); -void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *, const struct variable *, double); +double linreg_get_indep_variable_mean (linreg *, size_t); +void linreg_set_indep_variable_mean (linreg *, size_t, double); -/* - Regression using only the covariance matrix. - */ -void pspp_linreg_with_cov (const struct covariance_matrix *, pspp_linreg_cache *); -double pspp_linreg_mse (const pspp_linreg_cache *); +double linreg_mse (const linreg *); + +double linreg_intercept (const linreg *); + +gsl_matrix * linreg_cov (const linreg *); +double linreg_coeff (const linreg *, size_t); +const struct variable * linreg_indep_var (const linreg *, size_t); +size_t linreg_n_coeffs (const linreg *); +size_t linreg_n_obs (const linreg *); +double linreg_ssreg (const linreg *); +double linreg_dfmodel (const linreg *); #endif -- 2.30.2