X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.q;h=b879cae90871658313ac47b76b270474ec6119a3;hb=8b42eb7dfd30afe72c307f91c298abe811befc60;hp=68db454d40edfbc38316884903f893227fd9bc0b;hpb=5647b8fa947088764d27a5a76188c8d059e5c8af;p=pspp-builds.git diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index 68db454d..b879cae9 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009 Free Software Foundation, Inc. + Copyright (C) 2005, 2009, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,11 +21,9 @@ #include #include #include - #include #include #include -#include #include #include #include @@ -39,13 +37,13 @@ #include #include #include -#include -#include +#include #include #include -#include +#include -#include "xalloc.h" +#include "gl/intprops.h" +#include "gl/xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -99,7 +97,7 @@ struct reg_trns { int n_trns; /* Number of transformations. */ int trns_id; /* Which trns is this one? */ - pspp_linreg_cache *c; /* Linear model for this trns. */ + linreg *c; /* Linear model for this trns. */ }; /* Variables used (both explanatory and response). @@ -112,31 +110,31 @@ static const struct variable **v_variables; static size_t n_variables; static bool run_regression (struct casereader *, struct cmd_regression *, - struct dataset *, pspp_linreg_cache **); + struct dataset *, linreg **); /* STATISTICS subcommand output functions. */ -static void reg_stats_r (pspp_linreg_cache *); -static void reg_stats_coeff (pspp_linreg_cache *); -static void reg_stats_anova (pspp_linreg_cache *); -static void reg_stats_outs (pspp_linreg_cache *); -static void reg_stats_zpp (pspp_linreg_cache *); -static void reg_stats_label (pspp_linreg_cache *); -static void reg_stats_sha (pspp_linreg_cache *); -static void reg_stats_ci (pspp_linreg_cache *); -static void reg_stats_f (pspp_linreg_cache *); -static void reg_stats_bcov (pspp_linreg_cache *); -static void reg_stats_ses (pspp_linreg_cache *); -static void reg_stats_xtx (pspp_linreg_cache *); -static void reg_stats_collin (pspp_linreg_cache *); -static void reg_stats_tol (pspp_linreg_cache *); -static void reg_stats_selection (pspp_linreg_cache *); -static void statistics_keyword_output (void (*)(pspp_linreg_cache *), - int, pspp_linreg_cache *); +static void reg_stats_r (linreg *, void *); +static void reg_stats_coeff (linreg *, void *); +static void reg_stats_anova (linreg *, void *); +static void reg_stats_outs (linreg *, void *); +static void reg_stats_zpp (linreg *, void *); +static void reg_stats_label (linreg *, void *); +static void reg_stats_sha (linreg *, void *); +static void reg_stats_ci (linreg *, void *); +static void reg_stats_f (linreg *, void *); +static void reg_stats_bcov (linreg *, void *); +static void reg_stats_ses (linreg *, void *); +static void reg_stats_xtx (linreg *, void *); +static void reg_stats_collin (linreg *, void *); +static void reg_stats_tol (linreg *, void *); +static void reg_stats_selection (linreg *, void *); +static void statistics_keyword_output (void (*)(linreg *, void *), + int, linreg *, void *); static void -reg_stats_r (pspp_linreg_cache * c) +reg_stats_r (linreg *c, void *aux UNUSED) { struct tab_table *t; int n_rows = 2; @@ -146,11 +144,10 @@ reg_stats_r (pspp_linreg_cache * c) double std_error; assert (c != NULL); - rsq = c->ssm / c->sst; - adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps); - std_error = sqrt (pspp_linreg_mse (c)); - t = tab_create (n_cols, n_rows, 0); - tab_dim (t, tab_natural_dimensions); + rsq = linreg_ssreg (c) / linreg_sst (c); + adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c)); + std_error = sqrt (linreg_mse (c)); + t = tab_create (n_cols, n_rows); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); tab_hline (t, TAL_2, 0, n_cols - 1, 1); tab_vline (t, TAL_2, 2, 0, n_rows - 1); @@ -160,10 +157,10 @@ reg_stats_r (pspp_linreg_cache * c) tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square")); tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square")); tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate")); - tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2); - tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2); - tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2); - tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2); + tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL); + tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL); + tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL); + tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL); tab_title (t, _("Model Summary")); tab_submit (t); } @@ -172,7 +169,7 @@ reg_stats_r (pspp_linreg_cache * c) Table showing estimated regression coefficients. */ static void -reg_stats_coeff (pspp_linreg_cache * c) +reg_stats_coeff (linreg * c, void *aux_) { size_t j; int n_cols = 7; @@ -185,15 +182,14 @@ reg_stats_coeff (pspp_linreg_cache * c) const char *label; const struct variable *v; - const union value *val; struct tab_table *t; + gsl_matrix *cov = aux_; assert (c != NULL); - n_rows = c->n_coeffs + 3; + n_rows = linreg_n_coeffs (c) + 3; - t = tab_create (n_cols, n_rows, 0); + t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); - tab_dim (t, tab_natural_dimensions); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); tab_hline (t, TAL_2, 0, n_cols - 1, 1); tab_vline (t, TAL_2, 2, 0, n_rows - 1); @@ -205,67 +201,55 @@ reg_stats_coeff (pspp_linreg_cache * c) tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_float (t, 2, 1, 0, c->intercept, 10, 2); - std_err = sqrt (gsl_matrix_get (c->cov, 0, 0)); - tab_float (t, 3, 1, 0, std_err, 10, 2); - tab_float (t, 4, 1, 0, 0.0, 10, 2); - t_stat = c->intercept / std_err; - tab_float (t, 5, 1, 0, t_stat, 10, 2); - pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0); - tab_float (t, 6, 1, 0, pval, 10, 2); - for (j = 0; j < c->n_coeffs; j++) + tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); + std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); + tab_double (t, 3, 1, 0, std_err, NULL); + tab_double (t, 4, 1, 0, 0.0, NULL); + t_stat = linreg_intercept (c) / std_err; + tab_double (t, 5, 1, 0, t_stat, NULL); + pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); + tab_double (t, 6, 1, 0, pval, NULL); + for (j = 0; j < linreg_n_coeffs (c); j++) { struct string tstr; ds_init_empty (&tstr); this_row = j + 2; - v = pspp_coeff_get_var (c->coeff[j], 0); + v = linreg_indep_var (c, j); label = var_to_string (v); /* Do not overwrite the variable's name. */ ds_put_cstr (&tstr, label); - if (var_is_alpha (v)) - { - /* - Append the value associated with this coefficient. - This makes sense only if we us the usual binary encoding - for that value. - */ - - val = pspp_coeff_get_value (c->coeff[j], v); - - var_append_value_name (v, val, &tstr); - } - tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); /* Regression coefficients. */ - tab_float (t, 2, this_row, 0, c->coeff[j]->estimate, 10, 2); + tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL); /* Standard error of the coefficients. */ - std_err = sqrt (gsl_matrix_get (c->cov, j + 1, j + 1)); - tab_float (t, 3, this_row, 0, std_err, 10, 2); + std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); + tab_double (t, 3, this_row, 0, std_err, NULL); /* Standardized coefficient, i.e., regression coefficient if all variables had unit variance. */ - beta = pspp_coeff_get_sd (c->coeff[j]); - beta *= c->coeff[j]->estimate / c->depvar_std; - tab_float (t, 4, this_row, 0, beta, 10, 2); + beta = sqrt (gsl_matrix_get (cov, j, j)); + beta *= linreg_coeff (c, j) / + sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); + tab_double (t, 4, this_row, 0, beta, NULL); /* Test statistic for H0: coefficient is 0. */ - t_stat = c->coeff[j]->estimate / std_err; - tab_float (t, 5, this_row, 0, t_stat, 10, 2); + t_stat = linreg_coeff (c, j) / std_err; + tab_double (t, 5, this_row, 0, t_stat, NULL); /* P values for the test statistic above. */ pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (c->n_obs - c->n_coeffs)); - tab_float (t, 6, this_row, 0, pval, 10, 2); + (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); + tab_double (t, 6, this_row, 0, pval, NULL); ds_destroy (&tstr); } tab_title (t, _("Coefficients")); @@ -276,21 +260,20 @@ reg_stats_coeff (pspp_linreg_cache * c) Display the ANOVA table. */ static void -reg_stats_anova (pspp_linreg_cache * c) +reg_stats_anova (linreg * c, void *aux UNUSED) { int n_cols = 7; int n_rows = 4; - const double msm = c->ssm / c->dfm; - const double mse = pspp_linreg_mse (c); + const double msm = linreg_ssreg (c) / linreg_dfmodel (c); + const double mse = linreg_mse (c); const double F = msm / mse; const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe); struct tab_table *t; assert (c != NULL); - t = tab_create (n_cols, n_rows, 0); + t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); - tab_dim (t, tab_natural_dimensions); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); @@ -309,63 +292,63 @@ reg_stats_anova (pspp_linreg_cache * c) tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total")); /* Sums of Squares */ - tab_float (t, 2, 1, 0, c->ssm, 10, 2); - tab_float (t, 2, 3, 0, c->sst, 10, 2); - tab_float (t, 2, 2, 0, c->sse, 10, 2); + tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL); + tab_double (t, 2, 3, 0, linreg_sst (c), NULL); + tab_double (t, 2, 2, 0, linreg_sse (c), NULL); /* Degrees of freedom */ - tab_text (t, 3, 1, TAB_RIGHT | TAT_PRINTF, "%g", c->dfm); - tab_text (t, 3, 2, TAB_RIGHT | TAT_PRINTF, "%g", c->dfe); - tab_text (t, 3, 3, TAB_RIGHT | TAT_PRINTF, "%g", c->dft); + tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm); + tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe); + tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft); /* Mean Squares */ - tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3); - tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3); + tab_double (t, 4, 1, TAB_RIGHT, msm, NULL); + tab_double (t, 4, 2, TAB_RIGHT, mse, NULL); - tab_float (t, 5, 1, 0, F, 8, 3); + tab_double (t, 5, 1, 0, F, NULL); - tab_float (t, 6, 1, 0, pval, 8, 3); + tab_double (t, 6, 1, 0, pval, NULL); tab_title (t, _("ANOVA")); tab_submit (t); } static void -reg_stats_outs (pspp_linreg_cache * c) +reg_stats_outs (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_zpp (pspp_linreg_cache * c) +reg_stats_zpp (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_label (pspp_linreg_cache * c) +reg_stats_label (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_sha (pspp_linreg_cache * c) +reg_stats_sha (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_ci (pspp_linreg_cache * c) +reg_stats_ci (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_f (pspp_linreg_cache * c) +reg_stats_f (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_bcov (pspp_linreg_cache * c) +reg_stats_bcov (linreg * c, void *aux UNUSED) { int n_cols; int n_rows; @@ -379,70 +362,69 @@ reg_stats_bcov (pspp_linreg_cache * c) assert (c != NULL); n_cols = c->n_indeps + 1 + 2; n_rows = 2 * (c->n_indeps + 1); - t = tab_create (n_cols, n_rows, 0); + t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); - tab_dim (t, tab_natural_dimensions); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); tab_hline (t, TAL_2, 0, n_cols - 1, 1); tab_vline (t, TAL_2, 2, 0, n_rows - 1); tab_vline (t, TAL_0, 1, 0, 0); tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model")); tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances")); - for (i = 0; i < c->n_coeffs; i++) + for (i = 0; i < linreg_n_coeffs (c); i++) { - const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0); + const struct variable *v = linreg_indep_var (c, i); label = var_to_string (v); tab_text (t, 2, i, TAB_CENTER, label); tab_text (t, i + 2, 0, TAB_CENTER, label); - for (k = 1; k < c->n_coeffs; k++) + for (k = 1; k < linreg_n_coeffs (c); k++) { col = (i <= k) ? k : i; row = (i <= k) ? i : k; - tab_float (t, k + 2, i, TAB_CENTER, - gsl_matrix_get (c->cov, row, col), 8, 3); + tab_double (t, k + 2, i, TAB_CENTER, + gsl_matrix_get (c->cov, row, col), NULL); } } tab_title (t, _("Coefficient Correlations")); tab_submit (t); } static void -reg_stats_ses (pspp_linreg_cache * c) +reg_stats_ses (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_xtx (pspp_linreg_cache * c) +reg_stats_xtx (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_collin (pspp_linreg_cache * c) +reg_stats_collin (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_tol (pspp_linreg_cache * c) +reg_stats_tol (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_selection (pspp_linreg_cache * c) +reg_stats_selection (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -statistics_keyword_output (void (*function) (pspp_linreg_cache *), - int keyword, pspp_linreg_cache * c) +statistics_keyword_output (void (*function) (linreg *, void *), + int keyword, linreg * c, void *aux) { if (keyword) { - (*function) (c); + (*function) (c, aux); } } static void -subcommand_statistics (int *keywords, pspp_linreg_cache * c) +subcommand_statistics (int *keywords, linreg * c, void *aux) { /* The order here must match the order in which the STATISTICS @@ -502,21 +484,21 @@ subcommand_statistics (int *keywords, pspp_linreg_cache * c) keywords[r] = 1; } } - statistics_keyword_output (reg_stats_r, keywords[r], c); - statistics_keyword_output (reg_stats_anova, keywords[anova], c); - statistics_keyword_output (reg_stats_coeff, keywords[coeff], c); - statistics_keyword_output (reg_stats_outs, keywords[outs], c); - statistics_keyword_output (reg_stats_zpp, keywords[zpp], c); - statistics_keyword_output (reg_stats_label, keywords[label], c); - statistics_keyword_output (reg_stats_sha, keywords[sha], c); - statistics_keyword_output (reg_stats_ci, keywords[ci], c); - statistics_keyword_output (reg_stats_f, keywords[f], c); - statistics_keyword_output (reg_stats_bcov, keywords[bcov], c); - statistics_keyword_output (reg_stats_ses, keywords[ses], c); - statistics_keyword_output (reg_stats_xtx, keywords[xtx], c); - statistics_keyword_output (reg_stats_collin, keywords[collin], c); - statistics_keyword_output (reg_stats_tol, keywords[tol], c); - statistics_keyword_output (reg_stats_selection, keywords[selection], c); + statistics_keyword_output (reg_stats_r, keywords[r], c, aux); + statistics_keyword_output (reg_stats_anova, keywords[anova], c, aux); + statistics_keyword_output (reg_stats_coeff, keywords[coeff], c, aux); + statistics_keyword_output (reg_stats_outs, keywords[outs], c, aux); + statistics_keyword_output (reg_stats_zpp, keywords[zpp], c, aux); + statistics_keyword_output (reg_stats_label, keywords[label], c, aux); + statistics_keyword_output (reg_stats_sha, keywords[sha], c, aux); + statistics_keyword_output (reg_stats_ci, keywords[ci], c, aux); + statistics_keyword_output (reg_stats_f, keywords[f], c, aux); + statistics_keyword_output (reg_stats_bcov, keywords[bcov], c, aux); + statistics_keyword_output (reg_stats_ses, keywords[ses], c, aux); + statistics_keyword_output (reg_stats_xtx, keywords[xtx], c, aux); + statistics_keyword_output (reg_stats_collin, keywords[collin], c, aux); + statistics_keyword_output (reg_stats_tol, keywords[tol], c, aux); + statistics_keyword_output (reg_stats_selection, keywords[selection], c, aux); } /* @@ -531,7 +513,7 @@ regression_trns_free (void *t_) if (t->trns_id == t->n_trns) { - result = pspp_linreg_cache_free (t->c); + result = linreg_free (t->c); } free (t); @@ -548,9 +530,10 @@ regression_trns_pred_proc (void *t_, struct ccase **c, size_t i; size_t n_vals; struct reg_trns *trns = t_; - pspp_linreg_cache *model; + linreg *model; union value *output = NULL; - const union value **vals = NULL; + const union value *tmp; + double *vals; const struct variable **vars = NULL; assert (trns != NULL); @@ -559,21 +542,20 @@ regression_trns_pred_proc (void *t_, struct ccase **c, assert (model->depvar != NULL); assert (model->pred != NULL); - vars = xnmalloc (model->n_coeffs, sizeof (*vars)); - n_vals = (*model->get_vars) (model, vars); - + vars = linreg_get_vars (model); + n_vals = linreg_n_coeffs (model); vals = xnmalloc (n_vals, sizeof (*vals)); *c = case_unshare (*c); + output = case_data_rw (*c, model->pred); for (i = 0; i < n_vals; i++) { - vals[i] = case_data (*c, vars[i]); + tmp = case_data (*c, vars[i]); + vals[i] = tmp->f; } - output->f = (*model->predict) ((const struct variable **) vars, - vals, model, n_vals); + output->f = linreg_predict (model, vals, n_vals); free (vals); - free (vars); return TRNS_CONTINUE; } @@ -587,10 +569,11 @@ regression_trns_resid_proc (void *t_, struct ccase **c, size_t i; size_t n_vals; struct reg_trns *trns = t_; - pspp_linreg_cache *model; + linreg *model; union value *output = NULL; - const union value **vals = NULL; - const union value *obs = NULL; + const union value *tmp; + double *vals = NULL; + double obs; const struct variable **vars = NULL; assert (trns != NULL); @@ -599,8 +582,8 @@ regression_trns_resid_proc (void *t_, struct ccase **c, assert (model->depvar != NULL); assert (model->resid != NULL); - vars = xnmalloc (model->n_coeffs, sizeof (*vars)); - n_vals = (*model->get_vars) (model, vars); + vars = linreg_get_vars (model); + n_vals = linreg_n_coeffs (model); vals = xnmalloc (n_vals, sizeof (*vals)); *c = case_unshare (*c); @@ -609,49 +592,40 @@ regression_trns_resid_proc (void *t_, struct ccase **c, for (i = 0; i < n_vals; i++) { - vals[i] = case_data (*c, vars[i]); + tmp = case_data (*c, vars[i]); + vals[i] = tmp->f; } - obs = case_data (*c, model->depvar); - output->f = (*model->residual) ((const struct variable **) vars, - vals, obs, model, n_vals); + tmp = case_data (*c, model->depvar); + obs = tmp->f; + output->f = linreg_residual (model, obs, vals, n_vals); free (vals); - free (vars); - return TRNS_CONTINUE; -} -/* - Returns false if NAME is a duplicate of any existing variable name. -*/ -static bool -try_name (const struct dictionary *dict, const char *name) -{ - if (dict_lookup_var (dict, name) != NULL) - return false; - - return true; + return TRNS_CONTINUE; } -static void -reg_get_name (const struct dictionary *dict, char name[VAR_NAME_LEN], - const char prefix[VAR_NAME_LEN]) +static char * +reg_get_name (const struct dictionary *dict, const char *prefix) { - int i = 1; + char *name; + int i; - snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i); - while (!try_name (dict, name)) + /* XXX handle too-long prefixes */ + name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1); + for (i = 1; ; i++) { - i++; - snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i); + sprintf (name, "%s%d", prefix, i); + if (dict_lookup_var (dict, name) == NULL) + return name; } } static void reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, - pspp_linreg_cache * c, struct variable **v, int n_trns) + linreg * c, struct variable **v, int n_trns) { struct dictionary *dict = dataset_dict (ds); static int trns_index = 1; - char name[VAR_NAME_LEN]; + char *name; struct variable *new_var; struct reg_trns *t = NULL; @@ -659,22 +633,22 @@ reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, t->trns_id = trns_index; t->n_trns = n_trns; t->c = c; - reg_get_name (dict, name, prefix); - new_var = dict_create_var (dict, name, 0); - assert (new_var != NULL); + + name = reg_get_name (dict, prefix); + new_var = dict_create_var_assert (dict, name, 0); + free (name); + *v = new_var; add_transformation (ds, f, regression_trns_free, t); trns_index++; } static void -subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models) +subcommand_save (struct dataset *ds, int save, linreg ** models) { - pspp_linreg_cache **lc; + linreg **lc; int n_trns = 0; int i; - assert (models != NULL); - if (save) { /* Count the number of transformations we will need. */ @@ -713,7 +687,7 @@ subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models) { if (*lc != NULL) { - pspp_linreg_cache_free (*lc); + linreg_free (*lc); } } } @@ -724,7 +698,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) { struct casegrouper *grouper; struct casereader *group; - pspp_linreg_cache **models; + linreg **models; bool ok; size_t i; @@ -771,10 +745,10 @@ regression_custom_variables (struct lexer *lexer, struct dataset *ds, { const struct dictionary *dict = dataset_dict (ds); - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if ((lex_token (lexer) != T_ID - || dict_lookup_var (dict, lex_tokid (lexer)) == NULL) + || dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL) && lex_token (lexer) != T_ALL) return 2; @@ -817,72 +791,150 @@ identify_indep_vars (const struct variable **indep_vars, } return n_indep_vars; } - -/* Encode categorical variables. - Returns number of valid cases. */ -static int -prepare_categories (struct casereader *input, - const struct variable **vars, size_t n_vars, - struct moments_var *mom) +static double +fill_covariance (gsl_matrix *cov, struct covariance *all_cov, + const struct variable **vars, + size_t n_vars, const struct variable *dep_var, + const struct variable **all_vars, size_t n_all_vars, + double *means) { - int n_data; - struct ccase *c; size_t i; - - assert (vars != NULL); - assert (mom != NULL); - - for (i = 0; i < n_vars; i++) - if (var_is_alpha (vars[i])) - cat_stored_values_create (vars[i]); - - n_data = 0; - for (; (c = casereader_read (input)) != NULL; case_unref (c)) + size_t j; + size_t dep_subscript; + size_t *rows; + const gsl_matrix *ssizes; + gsl_matrix *cm; + const gsl_matrix *mean_matrix; + const gsl_matrix *ssize_matrix; + double result = 0.0; + + cm = covariance_calculate_unnormalized (all_cov); + rows = xnmalloc (cov->size1 - 1, sizeof (*rows)); + + for (i = 0; i < n_all_vars; i++) { - /* - The second condition ensures the program will run even if - there is only one variable to act as both explanatory and - response. - */ - for (i = 0; i < n_vars; i++) + for (j = 0; j < n_vars; j++) + { + if (vars[j] == all_vars[i]) + { + rows[j] = i; + } + } + if (all_vars[i] == dep_var) { - const union value *val = case_data (c, vars[i]); - if (var_is_alpha (vars[i])) - cat_value_update (vars[i], val); - else - moments1_add (mom[i].m, val->f, 1.0); + dep_subscript = i; } - n_data++; } - casereader_destroy (input); - - return n_data; + mean_matrix = covariance_moments (all_cov, MOMENT_MEAN); + ssize_matrix = covariance_moments (all_cov, MOMENT_NONE); + for (i = 0; i < cov->size1 - 1; i++) + { + means[i] = gsl_matrix_get (mean_matrix, rows[i], 0) + / gsl_matrix_get (ssize_matrix, rows[i], 0); + for (j = 0; j < cov->size2 - 1; j++) + { + gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j])); + gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i])); + } + } + means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0) + / gsl_matrix_get (ssize_matrix, dep_subscript, 0); + ssizes = covariance_moments (all_cov, MOMENT_NONE); + result = gsl_matrix_get (ssizes, dep_subscript, rows[0]); + for (i = 0; i < cov->size1 - 1; i++) + { + gsl_matrix_set (cov, i, cov->size1 - 1, + gsl_matrix_get (cm, rows[i], dep_subscript)); + gsl_matrix_set (cov, cov->size1 - 1, i, + gsl_matrix_get (cm, rows[i], dep_subscript)); + if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript)) + { + result = gsl_matrix_get (ssizes, rows[i], dep_subscript); + } + } + gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, + gsl_matrix_get (cm, dep_subscript, dep_subscript)); + free (rows); + gsl_matrix_free (cm); + return result; } +static size_t +get_n_all_vars (struct cmd_regression *cmd) +{ + size_t result = n_variables; + size_t i; + size_t j; + result += cmd->n_dependent; + for (i = 0; i < cmd->n_dependent; i++) + { + for (j = 0; j < n_variables; j++) + { + if (v_variables[j] == cmd->v_dependent[i]) + { + result--; + } + } + } + return result; +} static void -coeff_init (pspp_linreg_cache * c, struct design_matrix *dm) +fill_all_vars (const struct variable **vars, struct cmd_regression *cmd) { - c->coeff = xnmalloc (dm->m->size2, sizeof (*c->coeff)); - pspp_coeff_init (c->coeff, dm); + size_t i; + size_t j; + bool absent; + + for (i = 0; i < n_variables; i++) + { + vars[i] = v_variables[i]; + } + for (i = 0; i < cmd->n_dependent; i++) + { + absent = true; + for (j = 0; j < n_variables; j++) + { + if (cmd->v_dependent[i] == v_variables[j]) + { + absent = false; + break; + } + } + if (absent) + { + vars[i + n_variables] = cmd->v_dependent[i]; + } + } } - static bool run_regression (struct casereader *input, struct cmd_regression *cmd, - struct dataset *ds, pspp_linreg_cache **models) + struct dataset *ds, linreg **models) { size_t i; int n_indep = 0; int k; + double n_data; + double *means; struct ccase *c; - const struct variable **indep_vars; - struct design_matrix *X; - struct moments_var *mom; - gsl_vector *Y; - - pspp_linreg_opts lopts; + struct covariance *cov; + const struct variable **vars; + const struct variable **all_vars; + const struct variable *dep_var; + struct casereader *reader; + const struct dictionary *dict; + size_t n_all_vars; assert (models != NULL); + for (i = 0; i < n_variables; i++) + { + if (!var_is_numeric (v_variables[i])) + { + msg (SE, _("REGRESSION requires numeric variables.")); + return false; + } + } + c = casereader_peek (input, 0); if (c == NULL) { @@ -892,124 +944,81 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, output_split_file_values (ds, c); case_unref (c); + dict = dataset_dict (ds); if (!v_variables) { - dict_get_vars (dataset_dict (ds), &v_variables, &n_variables, 0); + dict_get_vars (dict, &v_variables, &n_variables, 0); } - - for (i = 0; i < cmd->n_dependent; i++) + n_all_vars = get_n_all_vars (cmd); + all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); + fill_all_vars (all_vars, cmd); + vars = xnmalloc (n_variables, sizeof (*vars)); + means = xnmalloc (n_all_vars, sizeof (*means)); + cov = covariance_1pass_create (n_all_vars, all_vars, + dict_get_weight (dict), MV_ANY); + + reader = casereader_clone (input); + reader = casereader_create_filter_missing (reader, v_variables, n_variables, + MV_ANY, NULL, NULL); + for (; (c = casereader_read (reader)) != NULL; case_unref (c)) { - if (!var_is_numeric (cmd->v_dependent[i])) - { - msg (SE, _("Dependent variable must be numeric.")); - return false; - } + covariance_accumulate (cov, c); } - - mom = xnmalloc (n_variables, sizeof (*mom)); - for (i = 0; i < n_variables; i++) - { - (mom + i)->m = moments1_create (MOMENT_VARIANCE); - (mom + i)->v = v_variables[i]; - } - lopts.get_depvar_mean_std = 1; - - lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int)); - indep_vars = xnmalloc (n_variables, sizeof *indep_vars); - + for (k = 0; k < cmd->n_dependent; k++) { - const struct variable *dep_var; - struct casereader *reader; - casenumber row; - struct ccase *c; - size_t n_data; /* Number of valid cases. */ - + gsl_matrix *this_cm; dep_var = cmd->v_dependent[k]; - n_indep = identify_indep_vars (indep_vars, dep_var); - reader = casereader_clone (input); - reader = casereader_create_filter_missing (reader, indep_vars, n_indep, - MV_ANY, NULL, NULL); - reader = casereader_create_filter_missing (reader, &dep_var, 1, - MV_ANY, NULL, NULL); - n_data = prepare_categories (casereader_clone (reader), - indep_vars, n_indep, mom); - - if ((n_data > 0) && (n_indep > 0)) + n_indep = identify_indep_vars (vars, dep_var); + + this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); + n_data = fill_covariance (this_cm, cov, vars, n_indep, + dep_var, all_vars, n_all_vars, means); + models[k] = linreg_alloc (dep_var, (const struct variable **) vars, + n_data, n_indep); + models[k]->depvar = dep_var; + for (i = 0; i < n_indep; i++) + { + linreg_set_indep_variable_mean (models[k], i, means[i]); + } + linreg_set_depvar_mean (models[k], means[i]); + /* + For large data sets, use QR decomposition. + */ + if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) + { + models[k]->method = LINREG_QR; + } + + if (n_data > 0) { - Y = gsl_vector_alloc (n_data); - X = - design_matrix_create (n_indep, - (const struct variable **) indep_vars, - n_data); - for (i = 0; i < X->m->size2; i++) - { - lopts.get_indep_mean_std[i] = 1; - } - models[k] = pspp_linreg_cache_alloc (dep_var, (const struct variable **) indep_vars, - X->m->size1, X->m->size2); - models[k]->depvar = dep_var; - /* - For large data sets, use QR decomposition. - */ - if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) - { - models[k]->method = PSPP_LINREG_QR; - } - - /* - The second pass fills the design matrix. - */ - reader = casereader_create_counter (reader, &row, -1); - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) - { - for (i = 0; i < n_indep; ++i) - { - const struct variable *v = indep_vars[i]; - const union value *val = case_data (c, v); - if (var_is_alpha (v)) - design_matrix_set_categorical (X, row, v, val); - else - design_matrix_set_numeric (X, row, v, val); - } - gsl_vector_set (Y, row, case_num (c, dep_var)); - } - /* - Now that we know the number of coefficients, allocate space - and store pointers to the variables that correspond to the - coefficients. - */ - coeff_init (models[k], X); - /* - Find the least-squares estimates and other statistics. - */ - pspp_linreg ((const gsl_vector *) Y, X, &lopts, models[k]); - + Find the least-squares estimates and other statistics. + */ + linreg_fit (this_cm, models[k]); + if (!taint_has_tainted_successor (casereader_get_taint (input))) { - subcommand_statistics (cmd->a_statistics, models[k]); + subcommand_statistics (cmd->a_statistics, models[k], this_cm); } - - gsl_vector_free (Y); - design_matrix_destroy (X); } else { msg (SE, gettext ("No valid data found. This command was skipped.")); + linreg_free (models[k]); + models[k] = NULL; } - casereader_destroy (reader); + gsl_matrix_free (this_cm); } - for (i = 0; i < n_variables; i++) - { - moments1_destroy ((mom + i)->m); - } - free (mom); - free (indep_vars); - free (lopts.get_indep_mean_std); + + casereader_destroy (reader); + free (vars); + free (all_vars); + free (means); casereader_destroy (input); - + covariance_destroy (cov); + return true; }