X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.q;h=5ca581d8e14c7301ed4ff79b5df5c15bf3105ab5;hb=81579d9e9f994fb2908f50af41c3eb033d216e58;hp=0718b8d4d0bccce1da5ff3269ae0fd1b5ecc9157;hpb=e385eeb8a2ea75fb2d9c1c628619baa03c914dae;p=pspp-builds.git diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index 0718b8d4..5ca581d8 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009 Free Software Foundation, Inc. + Copyright (C) 2005, 2009, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,29 +21,30 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xalloc.h" + +#include "data/case.h" +#include "data/casegrouper.h" +#include "data/casereader.h" +#include "data/dictionary.h" +#include "data/missing-values.h" +#include "data/procedure.h" +#include "data/transformations.h" +#include "data/value-labels.h" +#include "data/variable.h" +#include "language/command.h" +#include "language/data-io/file-handle.h" +#include "language/dictionary/split-file.h" +#include "language/lexer/lexer.h" +#include "libpspp/compiler.h" +#include "libpspp/message.h" +#include "libpspp/taint.h" +#include "math/covariance.h" +#include "math/linreg.h" +#include "math/moments.h" +#include "output/tab.h" + +#include "gl/intprops.h" +#include "gl/xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -115,26 +116,26 @@ static bool run_regression (struct casereader *, struct cmd_regression *, /* STATISTICS subcommand output functions. */ -static void reg_stats_r (linreg *); -static void reg_stats_coeff (linreg *); -static void reg_stats_anova (linreg *); -static void reg_stats_outs (linreg *); -static void reg_stats_zpp (linreg *); -static void reg_stats_label (linreg *); -static void reg_stats_sha (linreg *); -static void reg_stats_ci (linreg *); -static void reg_stats_f (linreg *); -static void reg_stats_bcov (linreg *); -static void reg_stats_ses (linreg *); -static void reg_stats_xtx (linreg *); -static void reg_stats_collin (linreg *); -static void reg_stats_tol (linreg *); -static void reg_stats_selection (linreg *); -static void statistics_keyword_output (void (*)(linreg *), - int, linreg *); +static void reg_stats_r (linreg *, void *); +static void reg_stats_coeff (linreg *, void *); +static void reg_stats_anova (linreg *, void *); +static void reg_stats_outs (linreg *, void *); +static void reg_stats_zpp (linreg *, void *); +static void reg_stats_label (linreg *, void *); +static void reg_stats_sha (linreg *, void *); +static void reg_stats_ci (linreg *, void *); +static void reg_stats_f (linreg *, void *); +static void reg_stats_bcov (linreg *, void *); +static void reg_stats_ses (linreg *, void *); +static void reg_stats_xtx (linreg *, void *); +static void reg_stats_collin (linreg *, void *); +static void reg_stats_tol (linreg *, void *); +static void reg_stats_selection (linreg *, void *); +static void statistics_keyword_output (void (*)(linreg *, void *), + int, linreg *, void *); static void -reg_stats_r (linreg * c) +reg_stats_r (linreg *c, void *aux UNUSED) { struct tab_table *t; int n_rows = 2; @@ -147,8 +148,7 @@ reg_stats_r (linreg * c) rsq = linreg_ssreg (c) / linreg_sst (c); adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c)); std_error = sqrt (linreg_mse (c)); - t = tab_create (n_cols, n_rows, 0); - tab_dim (t, tab_natural_dimensions, NULL); + t = tab_create (n_cols, n_rows); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); tab_hline (t, TAL_2, 0, n_cols - 1, 1); tab_vline (t, TAL_2, 2, 0, n_rows - 1); @@ -170,7 +170,7 @@ reg_stats_r (linreg * c) Table showing estimated regression coefficients. */ static void -reg_stats_coeff (linreg * c) +reg_stats_coeff (linreg * c, void *aux_) { size_t j; int n_cols = 7; @@ -184,13 +184,13 @@ reg_stats_coeff (linreg * c) const struct variable *v; struct tab_table *t; + gsl_matrix *cov = aux_; assert (c != NULL); n_rows = linreg_n_coeffs (c) + 3; - t = tab_create (n_cols, n_rows, 0); + t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); - tab_dim (t, tab_natural_dimensions, NULL); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); tab_hline (t, TAL_2, 0, n_cols - 1, 1); tab_vline (t, TAL_2, 2, 0, n_rows - 1); @@ -208,7 +208,7 @@ reg_stats_coeff (linreg * c) tab_double (t, 4, 1, 0, 0.0, NULL); t_stat = linreg_intercept (c) / std_err; tab_double (t, 5, 1, 0, t_stat, NULL); - pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0); + pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); tab_double (t, 6, 1, 0, pval, NULL); for (j = 0; j < linreg_n_coeffs (c); j++) { @@ -234,8 +234,9 @@ reg_stats_coeff (linreg * c) Standardized coefficient, i.e., regression coefficient if all variables had unit variance. */ - beta = sqrt (gsl_matrix_get (linreg_cov (c), j, j)); - beta *= linreg_coeff (c, j) / c->depvar_std; + beta = sqrt (gsl_matrix_get (cov, j, j)); + beta *= linreg_coeff (c, j) / + sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); tab_double (t, 4, this_row, 0, beta, NULL); /* @@ -260,7 +261,7 @@ reg_stats_coeff (linreg * c) Display the ANOVA table. */ static void -reg_stats_anova (linreg * c) +reg_stats_anova (linreg * c, void *aux UNUSED) { int n_cols = 7; int n_rows = 4; @@ -272,9 +273,8 @@ reg_stats_anova (linreg * c) struct tab_table *t; assert (c != NULL); - t = tab_create (n_cols, n_rows, 0); + t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); - tab_dim (t, tab_natural_dimensions, NULL); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); @@ -316,40 +316,40 @@ reg_stats_anova (linreg * c) } static void -reg_stats_outs (linreg * c) +reg_stats_outs (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_zpp (linreg * c) +reg_stats_zpp (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_label (linreg * c) +reg_stats_label (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_sha (linreg * c) +reg_stats_sha (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_ci (linreg * c) +reg_stats_ci (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_f (linreg * c) +reg_stats_f (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_bcov (linreg * c) +reg_stats_bcov (linreg * c, void *aux UNUSED) { int n_cols; int n_rows; @@ -363,9 +363,8 @@ reg_stats_bcov (linreg * c) assert (c != NULL); n_cols = c->n_indeps + 1 + 2; n_rows = 2 * (c->n_indeps + 1); - t = tab_create (n_cols, n_rows, 0); + t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); - tab_dim (t, tab_natural_dimensions, NULL); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); tab_hline (t, TAL_2, 0, n_cols - 1, 1); tab_vline (t, TAL_2, 2, 0, n_rows - 1); @@ -390,43 +389,43 @@ reg_stats_bcov (linreg * c) tab_submit (t); } static void -reg_stats_ses (linreg * c) +reg_stats_ses (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_xtx (linreg * c) +reg_stats_xtx (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_collin (linreg * c) +reg_stats_collin (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_tol (linreg * c) +reg_stats_tol (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_selection (linreg * c) +reg_stats_selection (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -statistics_keyword_output (void (*function) (linreg *), - int keyword, linreg * c) +statistics_keyword_output (void (*function) (linreg *, void *), + int keyword, linreg * c, void *aux) { if (keyword) { - (*function) (c); + (*function) (c, aux); } } static void -subcommand_statistics (int *keywords, linreg * c) +subcommand_statistics (int *keywords, linreg * c, void *aux) { /* The order here must match the order in which the STATISTICS @@ -486,21 +485,21 @@ subcommand_statistics (int *keywords, linreg * c) keywords[r] = 1; } } - statistics_keyword_output (reg_stats_r, keywords[r], c); - statistics_keyword_output (reg_stats_anova, keywords[anova], c); - statistics_keyword_output (reg_stats_coeff, keywords[coeff], c); - statistics_keyword_output (reg_stats_outs, keywords[outs], c); - statistics_keyword_output (reg_stats_zpp, keywords[zpp], c); - statistics_keyword_output (reg_stats_label, keywords[label], c); - statistics_keyword_output (reg_stats_sha, keywords[sha], c); - statistics_keyword_output (reg_stats_ci, keywords[ci], c); - statistics_keyword_output (reg_stats_f, keywords[f], c); - statistics_keyword_output (reg_stats_bcov, keywords[bcov], c); - statistics_keyword_output (reg_stats_ses, keywords[ses], c); - statistics_keyword_output (reg_stats_xtx, keywords[xtx], c); - statistics_keyword_output (reg_stats_collin, keywords[collin], c); - statistics_keyword_output (reg_stats_tol, keywords[tol], c); - statistics_keyword_output (reg_stats_selection, keywords[selection], c); + statistics_keyword_output (reg_stats_r, keywords[r], c, aux); + statistics_keyword_output (reg_stats_anova, keywords[anova], c, aux); + statistics_keyword_output (reg_stats_coeff, keywords[coeff], c, aux); + statistics_keyword_output (reg_stats_outs, keywords[outs], c, aux); + statistics_keyword_output (reg_stats_zpp, keywords[zpp], c, aux); + statistics_keyword_output (reg_stats_label, keywords[label], c, aux); + statistics_keyword_output (reg_stats_sha, keywords[sha], c, aux); + statistics_keyword_output (reg_stats_ci, keywords[ci], c, aux); + statistics_keyword_output (reg_stats_f, keywords[f], c, aux); + statistics_keyword_output (reg_stats_bcov, keywords[bcov], c, aux); + statistics_keyword_output (reg_stats_ses, keywords[ses], c, aux); + statistics_keyword_output (reg_stats_xtx, keywords[xtx], c, aux); + statistics_keyword_output (reg_stats_collin, keywords[collin], c, aux); + statistics_keyword_output (reg_stats_tol, keywords[tol], c, aux); + statistics_keyword_output (reg_stats_selection, keywords[selection], c, aux); } /* @@ -605,29 +604,19 @@ regression_trns_resid_proc (void *t_, struct ccase **c, return TRNS_CONTINUE; } -/* - Returns false if NAME is a duplicate of any existing variable name. -*/ -static bool -try_name (const struct dictionary *dict, const char *name) +static char * +reg_get_name (const struct dictionary *dict, const char *prefix) { - if (dict_lookup_var (dict, name) != NULL) - return false; - - return true; -} - -static void -reg_get_name (const struct dictionary *dict, char name[VAR_NAME_LEN], - const char prefix[VAR_NAME_LEN]) -{ - int i = 1; + char *name; + int i; - snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i); - while (!try_name (dict, name)) + /* XXX handle too-long prefixes */ + name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1); + for (i = 1; ; i++) { - i++; - snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i); + sprintf (name, "%s%d", prefix, i); + if (dict_lookup_var (dict, name) == NULL) + return name; } } @@ -637,7 +626,7 @@ reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, { struct dictionary *dict = dataset_dict (ds); static int trns_index = 1; - char name[VAR_NAME_LEN]; + char *name; struct variable *new_var; struct reg_trns *t = NULL; @@ -645,9 +634,11 @@ reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, t->trns_id = trns_index; t->n_trns = n_trns; t->c = c; - reg_get_name (dict, name, prefix); - new_var = dict_create_var (dict, name, 0); - assert (new_var != NULL); + + name = reg_get_name (dict, prefix); + new_var = dict_create_var_assert (dict, name, 0); + free (name); + *v = new_var; add_transformation (ds, f, regression_trns_free, t); trns_index++; @@ -755,10 +746,10 @@ regression_custom_variables (struct lexer *lexer, struct dataset *ds, { const struct dictionary *dict = dataset_dict (ds); - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if ((lex_token (lexer) != T_ID - || dict_lookup_var (dict, lex_tokid (lexer)) == NULL) + || dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL) && lex_token (lexer) != T_ALL) return 2; @@ -813,11 +804,12 @@ fill_covariance (gsl_matrix *cov, struct covariance *all_cov, size_t dep_subscript; size_t *rows; const gsl_matrix *ssizes; - const gsl_matrix *cm; + gsl_matrix *cm; const gsl_matrix *mean_matrix; + const gsl_matrix *ssize_matrix; double result = 0.0; - cm = covariance_calculate (all_cov); + cm = covariance_calculate_unnormalized (all_cov); rows = xnmalloc (cov->size1 - 1, sizeof (*rows)); for (i = 0; i < n_all_vars; i++) @@ -835,16 +827,19 @@ fill_covariance (gsl_matrix *cov, struct covariance *all_cov, } } mean_matrix = covariance_moments (all_cov, MOMENT_MEAN); + ssize_matrix = covariance_moments (all_cov, MOMENT_NONE); for (i = 0; i < cov->size1 - 1; i++) { - means[i] = gsl_matrix_get (mean_matrix, rows[i], 0); + means[i] = gsl_matrix_get (mean_matrix, rows[i], 0) + / gsl_matrix_get (ssize_matrix, rows[i], 0); for (j = 0; j < cov->size2 - 1; j++) { gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j])); gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i])); } } - means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0); + means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0) + / gsl_matrix_get (ssize_matrix, dep_subscript, 0); ssizes = covariance_moments (all_cov, MOMENT_NONE); result = gsl_matrix_get (ssizes, dep_subscript, rows[0]); for (i = 0; i < cov->size1 - 1; i++) @@ -861,9 +856,57 @@ fill_covariance (gsl_matrix *cov, struct covariance *all_cov, gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, gsl_matrix_get (cm, dep_subscript, dep_subscript)); free (rows); + gsl_matrix_free (cm); return result; } +static size_t +get_n_all_vars (struct cmd_regression *cmd) +{ + size_t result = n_variables; + size_t i; + size_t j; + result += cmd->n_dependent; + for (i = 0; i < cmd->n_dependent; i++) + { + for (j = 0; j < n_variables; j++) + { + if (v_variables[j] == cmd->v_dependent[i]) + { + result--; + } + } + } + return result; +} +static void +fill_all_vars (const struct variable **vars, struct cmd_regression *cmd) +{ + size_t i; + size_t j; + bool absent; + + for (i = 0; i < n_variables; i++) + { + vars[i] = v_variables[i]; + } + for (i = 0; i < cmd->n_dependent; i++) + { + absent = true; + for (j = 0; j < n_variables; j++) + { + if (cmd->v_dependent[i] == v_variables[j]) + { + absent = false; + break; + } + } + if (absent) + { + vars[i + n_variables] = cmd->v_dependent[i]; + } + } +} static bool run_regression (struct casereader *input, struct cmd_regression *cmd, struct dataset *ds, linreg **models) @@ -876,10 +919,11 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, struct ccase *c; struct covariance *cov; const struct variable **vars; + const struct variable **all_vars; const struct variable *dep_var; struct casereader *reader; const struct dictionary *dict; - gsl_matrix *this_cm; + size_t n_all_vars; assert (models != NULL); @@ -906,9 +950,12 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, { dict_get_vars (dict, &v_variables, &n_variables, 0); } + n_all_vars = get_n_all_vars (cmd); + all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); + fill_all_vars (all_vars, cmd); vars = xnmalloc (n_variables, sizeof (*vars)); - means = xnmalloc (n_variables, sizeof (*means)); - cov = covariance_1pass_create (n_variables, v_variables, + means = xnmalloc (n_all_vars, sizeof (*means)); + cov = covariance_1pass_create (n_all_vars, all_vars, dict_get_weight (dict), MV_ANY); reader = casereader_clone (input); @@ -921,12 +968,13 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, for (k = 0; k < cmd->n_dependent; k++) { + gsl_matrix *this_cm; dep_var = cmd->v_dependent[k]; n_indep = identify_indep_vars (vars, dep_var); this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); n_data = fill_covariance (this_cm, cov, vars, n_indep, - dep_var, v_variables, n_variables, means); + dep_var, all_vars, n_all_vars, means); models[k] = linreg_alloc (dep_var, (const struct variable **) vars, n_data, n_indep); models[k]->depvar = dep_var; @@ -934,6 +982,7 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, { linreg_set_indep_variable_mean (models[k], i, means[i]); } + linreg_set_depvar_mean (models[k], means[i]); /* For large data sets, use QR decomposition. */ @@ -951,7 +1000,7 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, if (!taint_has_tainted_successor (casereader_get_taint (input))) { - subcommand_statistics (cmd->a_statistics, models[k]); + subcommand_statistics (cmd->a_statistics, models[k], this_cm); } } else @@ -961,10 +1010,12 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, linreg_free (models[k]); models[k] = NULL; } + gsl_matrix_free (this_cm); } casereader_destroy (reader); free (vars); + free (all_vars); free (means); casereader_destroy (input); covariance_destroy (cov);