X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.q;h=463d18f9ad16dc26eef74dfd015a4989cd8aa8fc;hb=a4ae68f966bc574326d429119878e733069ced14;hp=d5536e80b58ce78d650f48ee0598a007fae0f498;hpb=814e5d005b8108f9fe1c78b60afce20eeccccd6c;p=pspp-builds.git diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index d5536e80..463d18f9 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -18,32 +18,37 @@ 02110-1301, USA. */ #include -#include + #include -#include #include +#include #include -#include +#include + +#include "regression-export.h" #include #include -#include #include -#include -#include -#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include #include -#include "gettext.h" #include -#include +#include +#include +#include +#include #include -#include -#include "regression-export.h" +#include #include -#include -#include -#include "procedure.h" + +#include "gettext.h" #define REG_LARGE_DATA 1000 @@ -71,13 +76,26 @@ all; export=custom; ^dependent=varlist; - save=residuals; + save[sv_]=resid,pred; method=enter. */ /* (declarations) */ /* (functions) */ static struct cmd_regression cmd; +/* Linear regression models. */ +pspp_linreg_cache **models = NULL; + +/* + Transformations for saving predicted values + and residuals, etc. + */ +struct reg_trns +{ + int n_trns; /* Number of transformations. */ + int trns_id; /* Which trns is this one? */ + pspp_linreg_cache *c; /* Linear model for this trns. */ +}; /* Variables used (both explanatory and response). */ @@ -99,7 +117,8 @@ struct file_handle *model_file; */ int pspp_reg_rc = CMD_SUCCESS; -static bool run_regression (const struct casefile *, void *); +static bool run_regression (const struct ccase *, + const struct casefile *, void *); /* STATISTICS subcommand output functions. @@ -501,43 +520,186 @@ subcommand_statistics (int *keywords, pspp_linreg_cache * c) statistics_keyword_output (reg_stats_tol, keywords[tol], c); statistics_keyword_output (reg_stats_selection, keywords[selection], c); } -static void -subcommand_save (int save, pspp_linreg_cache *lc, const struct casefile *cf, int *is_missing) +/* + Free the transformation. Free its linear model if this + transformation is the last one. + */ +static +bool regression_trns_free (void *t_) { - int i; - int case_num; - double residual; + bool result = true; + struct reg_trns *t = t_; + + if (t->trns_id == t->n_trns) + { + result = pspp_linreg_cache_free (t->c); + } + free (t); + + return result; +} +/* + Gets the predicted values. + */ +static int +regression_trns_pred_proc (void *t_, struct ccase *c, int case_idx UNUSED) +{ + size_t i; + size_t n_vals; + struct reg_trns *trns = t_; + pspp_linreg_cache *model; + union value *output = NULL; + const union value **vals = NULL; + struct variable **vars = NULL; + + assert (trns!= NULL); + model = trns->c; + assert (model != NULL); + assert (model->depvar != NULL); + assert (model->pred != NULL); + + vars = xnmalloc (model->n_coeffs, sizeof (*vars)); + n_vals = (*model->get_vars) (model, vars); + + vals = xnmalloc (n_vals, sizeof (*vals)); + output = case_data_rw (c, model->pred->fv); + assert (output != NULL); + + for (i = 0; i < n_vals; i++) + { + vals[i] = case_data (c, vars[i]->fv); + } + output->f = (*model->predict) ((const struct variable **) vars, + vals, model, n_vals); + free (vals); + free (vars); + return TRNS_CONTINUE; +} +/* + Gets the residuals. + */ +static int +regression_trns_resid_proc (void *t_, struct ccase *c, int case_idx UNUSED) +{ + size_t i; + size_t n_vals; + struct reg_trns *trns = t_; + pspp_linreg_cache *model; + union value *output = NULL; const union value **vals = NULL; const union value *obs = NULL; - struct casereader *r; - struct ccase c; + struct variable **vars = NULL; + + assert (trns!= NULL); + model = trns->c; + assert (model != NULL); + assert (model->depvar != NULL); + assert (model->resid != NULL); + + vars = xnmalloc (model->n_coeffs, sizeof (*vars)); + n_vals = (*model->get_vars) (model, vars); + + vals = xnmalloc (n_vals, sizeof (*vals)); + output = case_data_rw (c, model->resid->fv); + assert (output != NULL); + + for (i = 0; i < n_vals; i++) + { + vals[i] = case_data (c, vars[i]->fv); + } + obs = case_data (c, model->depvar->fv); + output->f = (*model->residual) ((const struct variable **) vars, + vals, obs, model, n_vals); + free (vals); + free (vars); + return TRNS_CONTINUE; +} +/* + Returns 0 if NAME is a duplicate of any existing variable name. +*/ +static int +try_name (char *name) +{ + if (dict_lookup_var (default_dict, name) != NULL) + return 0; - assert (lc != NULL); - assert (lc->depvar != NULL); - assert (is_missing != NULL); + return 1; +} +static +void reg_get_name (char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN]) +{ + int i = 1; + + snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i); + while (!try_name(name)) + { + i++; + snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i); + } +} +static void +reg_save_var (const char *prefix, trns_proc_func *f, + pspp_linreg_cache *c, struct variable **v, + int n_trns) +{ + static int trns_index = 1; + char name[LONG_NAME_LEN]; + struct variable *new_var; + struct reg_trns *t = NULL; + + t = xmalloc (sizeof (*t)); + t->trns_id = trns_index; + t->n_trns = n_trns; + t->c = c; + reg_get_name (name, prefix); + new_var = dict_create_var (default_dict, name, 0); + assert (new_var != NULL); + *v = new_var; + add_transformation (f, regression_trns_free, t); + trns_index++; +} +static void +subcommand_save (int save, pspp_linreg_cache **models) +{ + pspp_linreg_cache **lc; + int n_trns = 0; + int i; + + assert (models != NULL); if (save) { - vals = xnmalloc (n_variables, sizeof (*vals)); - for (r = casefile_get_reader (cf); casereader_read (r, &c); - case_destroy (&c)) + /* Count the number of transformations we will need. */ + for (i = 0; i < REGRESSION_SV_count; i++) { - case_num = casereader_cnum (r) - 1; - if (!is_missing[case_num]) + if (cmd.a_save[i]) { - for (i = 0; i < n_variables; ++i) - { - vals[i] = case_data (&c, v_variables[i]->fv); - if (v_variables[i]->index == lc->depvar->index) - { - obs = vals[i]; - } - } - residual = (*lc->residual) ((const struct variable **) v_variables, - (const union value **) vals, obs, lc, n_variables); + n_trns++; + } + } + n_trns *= cmd.n_dependent; + + for (lc = models; lc < models + cmd.n_dependent; lc++) + { + assert (*lc != NULL); + assert ((*lc)->depvar != NULL); + if (cmd.a_save[REGRESSION_SV_RESID]) + { + reg_save_var ("RES", regression_trns_resid_proc, *lc, &(*lc)->resid, n_trns); + } + if (cmd.a_save[REGRESSION_SV_PRED]) + { + reg_save_var ("PRED", regression_trns_pred_proc, *lc, &(*lc)->pred, n_trns); } } - free (vals); + } + else + { + for (lc = models; lc < models + cmd.n_dependent; lc++) + { + assert (*lc != NULL); + pspp_linreg_cache_free (*lc); + } } } static int @@ -671,7 +833,7 @@ subcommand_export (int export, pspp_linreg_cache * c) { assert (c != NULL); assert (model_file != NULL); - fp = fopen (fh_get_filename (model_file), "w"); + fp = fopen (fh_get_file_name (model_file), "w"); assert (fp != NULL); fprintf (fp, "%s", reg_preamble); reg_print_getvar (fp, c); @@ -767,11 +929,13 @@ cmd_regression (void) { if (!parse_regression (&cmd)) return CMD_FAILURE; + + models = xnmalloc (cmd.n_dependent, sizeof *models); if (!multipass_procedure_with_splits (run_regression, &cmd)) return CMD_CASCADING_FAILURE; - + subcommand_save (cmd.sbc_save, models); free (v_variables); - + free (models); return pspp_reg_rc; } @@ -908,7 +1072,8 @@ int prepare_data (int n_data, int is_missing_case[], return n_data; } static bool -run_regression (const struct casefile *cf, void *cmd_ UNUSED) +run_regression (const struct ccase *first, + const struct casefile *cf, void *cmd_ UNUSED) { size_t i; size_t n_data = 0; /* Number of valide cases. */ @@ -927,9 +1092,13 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) struct variable **indep_vars; struct design_matrix *X; gsl_vector *Y; - pspp_linreg_cache *lcache; + pspp_linreg_opts lopts; + assert (models != NULL); + + output_split_file_values (first); + if (!v_variables) { dict_get_vars (default_dict, &v_variables, &n_variables, @@ -952,7 +1121,6 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) lopts.get_depvar_mean_std = 1; - for (k = 0; k < cmd.n_dependent; k++) { n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]); @@ -976,20 +1144,20 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) { lopts.get_indep_mean_std[i] = 1; } - lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2); - lcache->indep_means = gsl_vector_alloc (X->m->size2); - lcache->indep_std = gsl_vector_alloc (X->m->size2); - lcache->depvar = (const struct variable *) cmd.v_dependent[k]; + models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2); + models[k]->indep_means = gsl_vector_alloc (X->m->size2); + models[k]->indep_std = gsl_vector_alloc (X->m->size2); + models[k]->depvar = (const struct variable *) cmd.v_dependent[k]; /* For large data sets, use QR decomposition. */ if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) { - lcache->method = PSPP_LINREG_SVD; + models[k]->method = PSPP_LINREG_SVD; } /* - The second pass creates the design matrix. + The second pass fills the design matrix. */ row = 0; for (r = casefile_get_reader (cf); casereader_read (r, &c); @@ -1035,19 +1203,18 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) and store pointers to the variables that correspond to the coefficients. */ - pspp_linreg_coeff_init (lcache, X); + pspp_linreg_coeff_init (models[k], X); /* Find the least-squares estimates and other statistics. */ - pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache); - subcommand_statistics (cmd.a_statistics, lcache); - subcommand_save (cmd.sbc_save, lcache, cf, is_missing_case); - subcommand_export (cmd.sbc_export, lcache); + pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]); + subcommand_statistics (cmd.a_statistics, models[k]); + subcommand_export (cmd.sbc_export, models[k]); + gsl_vector_free (Y); design_matrix_destroy (X); free (indep_vars); - pspp_linreg_cache_free (lcache); free (lopts.get_indep_mean_std); casereader_destroy (r); }