X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.c;h=b4448dd9e58ead03cb61cf3d5e5b3c97201b1edf;hb=1a73ef6db75f41bf209b9aff8d42c0cb579338ad;hp=264a7c1ade150652131ad2b0fe2c0d96a7fb3bf7;hpb=c671071609d53f1b16f5a01718ddd2be4387ba7b;p=pspp diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c index 264a7c1ade..b4448dd9e5 100644 --- a/src/language/stats/regression.c +++ b/src/language/stats/regression.c @@ -1,6 +1,6 @@ /* PSPP - a program for statistical analysis. Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014, - 2016, 2017 Free Software Foundation, Inc. + 2016, 2017, 2019 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -60,6 +60,7 @@ #define STATS_OUTS 8 #define STATS_CI 16 #define STATS_BCOV 32 +#define STATS_TOL 64 #define STATS_DEFAULT (STATS_R | STATS_COEFF | STATS_ANOVA | STATS_OUTS) @@ -158,7 +159,7 @@ save_trans_free (void *aux) return true; } -static int +static enum trns_result save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED) { struct save_trans_data *save_trans_data = aux; @@ -174,14 +175,14 @@ save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED) { if (ws->pred_idx != -1) { - double pred = case_data_idx (in, ws->extras * k + ws->pred_idx)->f; - case_data_rw (*c, ws->predvars[k])->f = pred; + double pred = case_num_idx (in, ws->extras * k + ws->pred_idx); + *case_num_rw (*c, ws->predvars[k]) = pred; } if (ws->res_idx != -1) { - double resid = case_data_idx (in, ws->extras * k + ws->res_idx)->f; - case_data_rw (*c, ws->residvars[k])->f = resid; + double resid = case_num_idx (in, ws->extras * k + ws->res_idx); + *case_num_rw (*c, ws->residvars[k]) = resid; } } case_unref (in); @@ -190,7 +191,6 @@ save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED) return TRNS_CONTINUE; } - int cmd_regression (struct lexer *lexer, struct dataset *ds) { @@ -308,6 +308,10 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) { statistics |= STATS_BCOV; } + else if (lex_match_id (lexer, "TOL")) + { + statistics |= STATS_TOL; + } else if (lex_match_id (lexer, "CI")) { statistics |= STATS_CI; @@ -445,7 +449,12 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) memcpy (save_trans_data->ws, &workspace, sizeof (workspace)); save_trans_data->n_dep_vars = regression.n_dep_vars; - add_transformation (ds, save_trans_func, save_trans_free, save_trans_data); + static const struct trns_class trns_class = { + .name = "REGRESSION", + .execute = save_trans_func, + .destroy = save_trans_free, + }; + add_transformation (ds, &trns_class, save_trans_data); } @@ -512,6 +521,24 @@ fill_all_vars (const struct variable **vars, const struct regression *cmd) } } + +/* Fill the array VARS, with all the predictor variables from CMD, except + variable X */ +static void +fill_predictor_x (const struct variable **vars, const struct variable *x, const struct regression *cmd) +{ + size_t i; + size_t n = 0; + + for (i = 0; i < cmd->n_vars; i++) + { + if (cmd->vars[i] == x) + continue; + + vars[n++] = cmd->vars[i]; + } +} + /* Is variable k the dependent variable? */ @@ -626,50 +653,69 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, +struct model_container +{ + struct linreg **models; +}; + /* STATISTICS subcommand output functions. */ static void reg_stats_r (const struct linreg *, const struct variable *); -static void reg_stats_coeff (const struct linreg *, const gsl_matrix *, const struct variable *, const struct regression *); +static void reg_stats_coeff (const struct regression *, const struct linreg *, + const struct model_container *, const gsl_matrix *, + const struct variable *); static void reg_stats_anova (const struct linreg *, const struct variable *); static void reg_stats_bcov (const struct linreg *, const struct variable *); -static void -subcommand_statistics (const struct regression *cmd, const struct linreg * c, const gsl_matrix * cm, - const struct variable *var) -{ - if (cmd->stats & STATS_R) - reg_stats_r (c, var); - - if (cmd->stats & STATS_ANOVA) - reg_stats_anova (c, var); - - if (cmd->stats & STATS_COEFF) - reg_stats_coeff (c, cm, var, cmd); - - if (cmd->stats & STATS_BCOV) - reg_stats_bcov (c, var); -} - - -static void -run_regression (const struct regression *cmd, - struct regression_workspace *ws, - struct casereader *input) +static struct linreg ** +run_regression_get_models (const struct regression *cmd, + struct casereader *input, + bool output) { size_t i; - struct linreg **models; + struct model_container *model_container = XCALLOC (cmd->n_vars, struct model_container); - int k; struct ccase *c; struct covariance *cov; struct casereader *reader; + + if (cmd->stats & STATS_TOL) + { + for (i = 0; i < cmd->n_vars; i++) + { + struct regression subreg; + subreg.origin = cmd->origin; + subreg.ds = cmd->ds; + subreg.n_vars = cmd->n_vars - 1; + subreg.n_dep_vars = 1; + subreg.vars = xmalloc (sizeof (*subreg.vars) * cmd->n_vars - 1); + subreg.dep_vars = xmalloc (sizeof (*subreg.dep_vars)); + fill_predictor_x (subreg.vars, cmd->vars[i], cmd); + subreg.dep_vars[0] = cmd->vars[i]; + subreg.stats = STATS_R; + subreg.ci = 0; + subreg.resid = false; + subreg.pred = false; + + model_container[i].models = + run_regression_get_models (&subreg, input, false); + free (subreg.vars); + free (subreg.dep_vars); + } + } + size_t n_all_vars = get_n_all_vars (cmd); const struct variable **all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); - double *means = xnmalloc (n_all_vars, sizeof (*means)); - + /* In the (rather pointless) case where the dependent variable is + the independent variable, n_all_vars == 1. + However this would result in a buffer overflow so we must + over-allocate the space required in this malloc call. + See bug #58599 */ + double *means = xnmalloc (n_all_vars <= 1 ? 2 : n_all_vars, + sizeof (*means)); fill_all_vars (all_vars, cmd); cov = covariance_1pass_create (n_all_vars, all_vars, dict_get_weight (dataset_dict (cmd->ds)), @@ -678,9 +724,7 @@ run_regression (const struct regression *cmd, reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, all_vars, n_all_vars, MV_ANY, NULL, NULL); - - - { +{ struct casereader *r = casereader_clone (reader); for (; (c = casereader_read (r)) != NULL; case_unref (c)) @@ -690,14 +734,15 @@ run_regression (const struct regression *cmd, casereader_destroy (r); } - models = xcalloc (cmd->n_dep_vars, sizeof (*models)); - for (k = 0; k < cmd->n_dep_vars; k++) + struct linreg **models = XCALLOC (cmd->n_dep_vars, struct linreg*); + + for (int k = 0; k < cmd->n_dep_vars; k++) { const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars)); const struct variable *dep_var = cmd->dep_vars[k]; int n_indep = identify_indep_vars (cmd, vars, dep_var); - gsl_matrix *this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); - double n_data = fill_covariance (this_cm, cov, vars, n_indep, + gsl_matrix *cov_matrix = gsl_matrix_alloc (n_indep + 1, n_indep + 1); + double n_data = fill_covariance (cov_matrix, cov, vars, n_indep, dep_var, all_vars, n_all_vars, means); models[k] = linreg_alloc (dep_var, vars, n_data, n_indep, cmd->origin); for (i = 0; i < n_indep; i++) @@ -707,39 +752,76 @@ run_regression (const struct regression *cmd, linreg_set_depvar_mean (models[k], means[i]); if (n_data > 0) { - /* - Find the least-squares estimates and other statistics. - */ - linreg_fit (this_cm, models[k]); + linreg_fit (cov_matrix, models[k]); - if (!taint_has_tainted_successor (casereader_get_taint (input))) + if (output && !taint_has_tainted_successor (casereader_get_taint (input))) { - subcommand_statistics (cmd, models[k], this_cm, dep_var); - } + /* + Find the least-squares estimates and other statistics. + */ + if (cmd->stats & STATS_R) + reg_stats_r (models[k], dep_var); + + if (cmd->stats & STATS_ANOVA) + reg_stats_anova (models[k], dep_var); + + if (cmd->stats & STATS_COEFF) + reg_stats_coeff (cmd, models[k], + model_container, + cov_matrix, dep_var); + + if (cmd->stats & STATS_BCOV) + reg_stats_bcov (models[k], dep_var); + } } else { msg (SE, _("No valid data found. This command was skipped.")); } - gsl_matrix_free (this_cm); free (vars); + gsl_matrix_free (cov_matrix); + } + + casereader_destroy (reader); + + for (int i = 0; i < cmd->n_vars; i++) + { + if (model_container[i].models) + { + linreg_unref (model_container[i].models[0]); + } + free (model_container[i].models); } + free (model_container); + + free (all_vars); + free (means); + covariance_destroy (cov); + return models; +} +static void +run_regression (const struct regression *cmd, + struct regression_workspace *ws, + struct casereader *input) +{ + struct linreg **models = run_regression_get_models (cmd, input, true); if (ws->extras > 0) { - struct casereader *r = casereader_clone (reader); + struct ccase *c; + struct casereader *r = casereader_clone (input); for (; (c = casereader_read (r)) != NULL; case_unref (c)) { struct ccase *outc = case_create (casewriter_get_proto (ws->writer)); - for (k = 0; k < cmd->n_dep_vars; k++) + for (int k = 0; k < cmd->n_dep_vars; k++) { const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars)); const struct variable *dep_var = cmd->dep_vars[k]; int n_indep = identify_indep_vars (cmd, vars, dep_var); double *vals = xnmalloc (n_indep, sizeof (*vals)); - for (i = 0; i < n_indep; i++) + for (int i = 0; i < n_indep; i++) { const union value *tmp = case_data (c, vars[i]); vals[i] = tmp->f; @@ -748,14 +830,14 @@ run_regression (const struct regression *cmd, if (cmd->pred) { double pred = linreg_predict (models[k], vals, n_indep); - case_data_rw_idx (outc, k * ws->extras + ws->pred_idx)->f = pred; + *case_num_rw_idx (outc, k * ws->extras + ws->pred_idx) = pred; } if (cmd->resid) { - double obs = case_data (c, linreg_dep_var (models[k]))->f; + double obs = case_num (c, linreg_dep_var (models[k])); double res = linreg_residual (models[k], obs, vals, n_indep); - case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res; + *case_num_rw_idx (outc, k * ws->extras + ws->res_idx) = res; } free (vals); free (vars); @@ -765,18 +847,13 @@ run_regression (const struct regression *cmd, casereader_destroy (r); } - casereader_destroy (reader); - - for (k = 0; k < cmd->n_dep_vars; k++) + for (int k = 0; k < cmd->n_dep_vars; k++) { linreg_unref (models[k]); } - free (models); - free (all_vars); - free (means); + free (models); casereader_destroy (input); - covariance_destroy (cov); } @@ -787,7 +864,8 @@ reg_stats_r (const struct linreg * c, const struct variable *var) { struct pivot_table *table = pivot_table_create__ ( pivot_value_new_text_format (N_("Model Summary (%s)"), - var_to_string (var))); + var_to_string (var)), + "Model Summary"); pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), N_("R"), N_("R Square"), N_("Adjusted R Square"), @@ -812,11 +890,13 @@ reg_stats_r (const struct linreg * c, const struct variable *var) Table showing estimated regression coefficients. */ static void -reg_stats_coeff (const struct linreg * c, const gsl_matrix *cov, const struct variable *var, const struct regression *cmd) +reg_stats_coeff (const struct regression *cmd, const struct linreg *c, + const struct model_container *mc, const gsl_matrix *cov, + const struct variable *var) { struct pivot_table *table = pivot_table_create__ ( - pivot_value_new_text_format (N_("Coefficients (%s)"), - var_to_string (var))); + pivot_value_new_text_format (N_("Coefficients (%s)"), var_to_string (var)), + "Coefficients"); struct pivot_dimension *statistics = pivot_dimension_create ( table, PIVOT_AXIS_COLUMN, N_("Statistics")); @@ -837,6 +917,12 @@ reg_stats_coeff (const struct linreg * c, const gsl_matrix *cov, const struct va N_("Upper Bound")); } + if (cmd->stats & STATS_TOL) + pivot_category_create_group (statistics->root, + N_("Collinearity Statistics"), + N_("Tolerance"), N_("VIF")); + + struct pivot_dimension *variables = pivot_dimension_create ( table, PIVOT_AXIS_ROW, N_("Variables")); @@ -851,19 +937,31 @@ reg_stats_coeff (const struct linreg * c, const gsl_matrix *cov, const struct va double std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); double t_stat = linreg_intercept (c) / std_err; - double entries[] = { + double base_entries[] = { linreg_intercept (c), std_err, 0.0, t_stat, 2.0 * gsl_cdf_tdist_Q (fabs (t_stat), linreg_n_obs (c) - linreg_n_coeffs (c)), - linreg_intercept (c) - tval * std_err, - linreg_intercept (c) + tval * std_err, }; - for (size_t i = 0; i < sizeof entries / sizeof *entries; i++) - pivot_table_put2 (table, i, var_idx, - pivot_value_new_number (entries[i])); + + size_t col = 0; + for (size_t i = 0; i < sizeof base_entries / sizeof *base_entries; i++) + pivot_table_put2 (table, col++, var_idx, + pivot_value_new_number (base_entries[i])); + + if (cmd->stats & STATS_CI) + { + double interval_entries[] = { + linreg_intercept (c) - tval * std_err, + linreg_intercept (c) + tval * std_err, + }; + + for (size_t i = 0; i < sizeof interval_entries / sizeof *interval_entries; i++) + pivot_table_put2 (table, col++, var_idx, + pivot_value_new_number (interval_entries[i])); + } } for (size_t j = 0; j < linreg_n_coeffs (c); j++) @@ -874,19 +972,42 @@ reg_stats_coeff (const struct linreg * c, const gsl_matrix *cov, const struct va double std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); double t_stat = linreg_coeff (c, j) / std_err; - double entries[] = { + double base_entries[] = { linreg_coeff (c, j), sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)), (sqrt (gsl_matrix_get (cov, j, j)) * linreg_coeff (c, j) / sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1))), t_stat, - 2 * gsl_cdf_tdist_Q (fabs (t_stat), df), - linreg_coeff (c, j) - tval * std_err, - linreg_coeff (c, j) + tval * std_err, + 2 * gsl_cdf_tdist_Q (fabs (t_stat), df) }; - for (size_t i = 0; i < sizeof entries / sizeof *entries; i++) - pivot_table_put2 (table, i, var_idx, - pivot_value_new_number (entries[i])); + + size_t col = 0; + for (size_t i = 0; i < sizeof base_entries / sizeof *base_entries; i++) + pivot_table_put2 (table, col++, var_idx, + pivot_value_new_number (base_entries[i])); + + if (cmd->stats & STATS_CI) + { + double interval_entries[] = { + linreg_coeff (c, j) - tval * std_err, + linreg_coeff (c, j) + tval * std_err, + }; + + + for (size_t i = 0; i < sizeof interval_entries / sizeof *interval_entries; i++) + pivot_table_put2 (table, col++, var_idx, + pivot_value_new_number (interval_entries[i])); + } + + if (cmd->stats & STATS_TOL) + { + { + struct linreg *m = mc[j].models[0]; + double rsq = linreg_ssreg (m) / linreg_sst (m); + pivot_table_put2 (table, col++, var_idx, pivot_value_new_number (1.0 - rsq)); + pivot_table_put2 (table, col++, var_idx, pivot_value_new_number (1.0 / (1.0 - rsq))); + } + } } pivot_table_submit (table); @@ -899,7 +1020,8 @@ static void reg_stats_anova (const struct linreg * c, const struct variable *var) { struct pivot_table *table = pivot_table_create__ ( - pivot_value_new_text_format (N_("ANOVA (%s)"), var_to_string (var))); + pivot_value_new_text_format (N_("ANOVA (%s)"), var_to_string (var)), + "ANOVA"); pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), N_("Sum of Squares"), PIVOT_RC_OTHER, @@ -954,7 +1076,8 @@ reg_stats_bcov (const struct linreg * c, const struct variable *var) { struct pivot_table *table = pivot_table_create__ ( pivot_value_new_text_format (N_("Coefficient Correlations (%s)"), - var_to_string (var))); + var_to_string (var)), + "Coefficient Correlations"); for (size_t i = 0; i < 2; i++) { @@ -978,4 +1101,3 @@ reg_stats_bcov (const struct linreg * c, const struct variable *var) pivot_table_submit (table); } -