X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.q;h=d923a35b00770ed3e09c04f9cc5cffcf8834c98b;hb=ce16a4a594e7ddfc277afc4abb7faaeb1a03d233;hp=690b6809aacda290c88d4175c177113d60c6396f;hpb=92c09e564002d356d20fc1e2e131027ef89f6748;p=pspp-builds.git diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index 690b6809..d923a35b 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -1,20 +1,18 @@ -/* PSPP - linear regression. +/* PSPP - a program for statistical analysis. Copyright (C) 2005 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . */ #include @@ -94,9 +92,6 @@ struct moments_var const struct variable *v; }; -/* Linear regression models. */ -static pspp_linreg_cache **models = NULL; - /* Transformations for saving predicted values and residuals, etc. @@ -119,14 +114,14 @@ static size_t n_variables; /* File where the model will be saved if the EXPORT subcommand - is given. + is given. */ static struct file_handle *model_file; static bool run_regression (struct casereader *, struct cmd_regression *, - struct dataset *); + struct dataset *, pspp_linreg_cache **); -/* +/* STATISTICS subcommand output functions. */ static void reg_stats_r (pspp_linreg_cache *); @@ -454,8 +449,8 @@ statistics_keyword_output (void (*function) (pspp_linreg_cache *), static void subcommand_statistics (int *keywords, pspp_linreg_cache * c) { - /* - The order here must match the order in which the STATISTICS + /* + The order here must match the order in which the STATISTICS keywords appear in the specification section above. */ enum @@ -628,7 +623,7 @@ regression_trns_resid_proc (void *t_, struct ccase *c, return TRNS_CONTINUE; } -/* +/* Returns false if NAME is a duplicate of any existing variable name. */ static bool @@ -950,6 +945,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) { struct casegrouper *grouper; struct casereader *group; + pspp_linreg_cache **models; bool ok; size_t i; @@ -965,7 +961,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) /* Data pass. */ grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); while (casegrouper_get_next_group (grouper, &group)) - run_regression (group, &cmd, ds); + run_regression (group, &cmd, ds, models); ok = casegrouper_destroy (grouper); ok = proc_commit (ds) && ok; @@ -1014,7 +1010,8 @@ regression_custom_variables (struct lexer *lexer, struct dataset *ds, /* Identify the explanatory variables in v_variables. Returns the number of independent variables. */ static int -identify_indep_vars (struct variable **indep_vars, struct variable *depvar) +identify_indep_vars (const struct variable **indep_vars, + const struct variable *depvar) { int n_indep_vars = 0; int i; @@ -1022,7 +1019,19 @@ identify_indep_vars (struct variable **indep_vars, struct variable *depvar) for (i = 0; i < n_variables; i++) if (!is_depvar (i, depvar)) indep_vars[n_indep_vars++] = v_variables[i]; - + if ((n_indep_vars < 2) && is_depvar (0, depvar)) + { + /* + There is only one independent variable, and it is the same + as the dependent variable. Print a warning and continue. + */ + msg (SE, + gettext ("The dependent variable is equal to the independent variable." + "The least squares line is therefore Y=X." + "Standard errors and related statistics may be meaningless.")); + n_indep_vars = 1; + indep_vars[0] = v_variables[0]; + } return n_indep_vars; } @@ -1030,35 +1039,38 @@ identify_indep_vars (struct variable **indep_vars, struct variable *depvar) Returns number of valid cases. */ static int prepare_categories (struct casereader *input, - struct variable **vars, size_t n_vars, - struct moments_var *mom) + const struct variable **vars, size_t n_vars, + struct moments_var *mom) { int n_data; struct ccase c; size_t i; + assert (vars != NULL); + assert (mom != NULL); + for (i = 0; i < n_vars; i++) if (var_is_alpha (vars[i])) cat_stored_values_create (vars[i]); n_data = 0; - for (; casereader_read (input, &c); case_destroy (&c)) + for (; casereader_read (input, &c); case_destroy (&c)) { /* - The second condition ensures the program will run even if - there is only one variable to act as both explanatory and - response. + The second condition ensures the program will run even if + there is only one variable to act as both explanatory and + response. */ for (i = 0; i < n_vars; i++) - { - const union value *val = case_data (&c, vars[i]); - if (var_is_alpha (vars[i])) - cat_value_update (vars[i], val); - else - moments1_add (mom[i].m, val->f, 1.0); - } - n_data++; - } + { + const union value *val = case_data (&c, vars[i]); + if (var_is_alpha (vars[i])) + cat_value_update (vars[i], val); + else + moments1_add (mom[i].m, val->f, 1.0); + } + n_data++; + } casereader_destroy (input); return n_data; @@ -1108,7 +1120,7 @@ compute_moments (pspp_linreg_cache * c, struct moments_var *mom, static bool run_regression (struct casereader *input, struct cmd_regression *cmd, - struct dataset *ds) + struct dataset *ds, pspp_linreg_cache **models) { size_t i; int n_indep = 0; @@ -1124,7 +1136,10 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, assert (models != NULL); if (!casereader_peek (input, 0, &c)) - return true; + { + casereader_destroy (input); + return true; + } output_split_file_values (ds, &c); case_destroy (&c); @@ -1156,22 +1171,21 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, for (k = 0; k < cmd->n_dependent; k++) { - struct variable *dep_var; + const struct variable *dep_var; struct casereader *reader; casenumber row; struct ccase c; size_t n_data; /* Number of valid cases. */ - + dep_var = cmd->v_dependent[k]; n_indep = identify_indep_vars (indep_vars, dep_var); - reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, indep_vars, n_indep, - MV_ANY, NULL); + MV_ANY, NULL); reader = casereader_create_filter_missing (reader, &dep_var, 1, - MV_ANY, NULL); - n_data = prepare_categories (casereader_clone (reader), - indep_vars, n_indep, mom); + MV_ANY, NULL); + n_data = prepare_categories (casereader_clone (reader), + indep_vars, n_indep, mom); if ((n_data > 0) && (n_indep > 0)) { @@ -1185,35 +1199,32 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, lopts.get_indep_mean_std[i] = 1; } models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2); - models[k]->indep_means = gsl_vector_alloc (X->m->size2); - models[k]->indep_std = gsl_vector_alloc (X->m->size2); - models[k]->depvar = dep_var; - /* + models[k]->depvar = dep_var; + /* For large data sets, use QR decomposition. */ if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) { - models[k]->method = PSPP_LINREG_SVD; + models[k]->method = PSPP_LINREG_QR; } /* - The second pass fills the design matrix. - */ - reader = casereader_create_counter (reader, &row, -1); - for (; casereader_read (reader, &c); case_destroy (&c)) - { - for (i = 0; i < n_indep; ++i) - { - struct variable *v = indep_vars[i]; - const union value *val = case_data (&c, v); - if (var_is_alpha (v)) - design_matrix_set_categorical (X, row, v, val); - else - design_matrix_set_numeric (X, row, v, val); - } - gsl_vector_set (Y, row, case_num (&c, dep_var)); - } - casereader_destroy (reader); + The second pass fills the design matrix. + */ + reader = casereader_create_counter (reader, &row, -1); + for (; casereader_read (reader, &c); case_destroy (&c)) + { + for (i = 0; i < n_indep; ++i) + { + const struct variable *v = indep_vars[i]; + const union value *val = case_data (&c, v); + if (var_is_alpha (v)) + design_matrix_set_categorical (X, row, v, val); + else + design_matrix_set_numeric (X, row, v, val); + } + gsl_vector_set (Y, row, case_num (&c, dep_var)); + } /* Now that we know the number of coefficients, allocate space and store pointers to the variables that correspond to the @@ -1221,26 +1232,33 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, */ coeff_init (models[k], X); - /* + /* Find the least-squares estimates and other statistics. */ pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]); compute_moments (models[k], mom, X, n_variables); - if (!taint_has_tainted_successor (casereader_get_taint (input))) - { - subcommand_statistics (cmd->a_statistics, models[k]); - subcommand_export (cmd->sbc_export, models[k]); - } + if (!taint_has_tainted_successor (casereader_get_taint (input))) + { + subcommand_statistics (cmd->a_statistics, models[k]); + subcommand_export (cmd->sbc_export, models[k]); + } gsl_vector_free (Y); design_matrix_destroy (X); } else { - msg (SE, gettext ("No valid data found. This command was skipped.")); + msg (SE, + gettext ("No valid data found. This command was skipped.")); } + casereader_destroy (reader); + } + for (i = 0; i < n_variables; i++) + { + moments1_destroy ((mom + i)->m); } + free (mom); free (indep_vars); free (lopts.get_indep_mean_std); casereader_destroy (input); @@ -1249,7 +1267,7 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, } /* - Local Variables: + Local Variables: mode: c End: */