From: Jason H Stover Date: Mon, 17 May 2010 20:39:28 +0000 (-0400) Subject: Rewrote to use new covariance functions. X-Git-Tag: v0.7.5~7^2~1 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp-builds.git;a=commitdiff_plain;h=e80304e52b4c7cb2e2924570f97c34c2f69f8aae Rewrote to use new covariance functions. --- diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index 0718b8d4..c27d8a40 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -115,26 +115,26 @@ static bool run_regression (struct casereader *, struct cmd_regression *, /* STATISTICS subcommand output functions. */ -static void reg_stats_r (linreg *); -static void reg_stats_coeff (linreg *); -static void reg_stats_anova (linreg *); -static void reg_stats_outs (linreg *); -static void reg_stats_zpp (linreg *); -static void reg_stats_label (linreg *); -static void reg_stats_sha (linreg *); -static void reg_stats_ci (linreg *); -static void reg_stats_f (linreg *); -static void reg_stats_bcov (linreg *); -static void reg_stats_ses (linreg *); -static void reg_stats_xtx (linreg *); -static void reg_stats_collin (linreg *); -static void reg_stats_tol (linreg *); -static void reg_stats_selection (linreg *); -static void statistics_keyword_output (void (*)(linreg *), - int, linreg *); +static void reg_stats_r (linreg *, void *); +static void reg_stats_coeff (linreg *, void *); +static void reg_stats_anova (linreg *, void *); +static void reg_stats_outs (linreg *, void *); +static void reg_stats_zpp (linreg *, void *); +static void reg_stats_label (linreg *, void *); +static void reg_stats_sha (linreg *, void *); +static void reg_stats_ci (linreg *, void *); +static void reg_stats_f (linreg *, void *); +static void reg_stats_bcov (linreg *, void *); +static void reg_stats_ses (linreg *, void *); +static void reg_stats_xtx (linreg *, void *); +static void reg_stats_collin (linreg *, void *); +static void reg_stats_tol (linreg *, void *); +static void reg_stats_selection (linreg *, void *); +static void statistics_keyword_output (void (*)(linreg *, void *), + int, linreg *, void *); static void -reg_stats_r (linreg * c) +reg_stats_r (linreg *c, void *aux UNUSED) { struct tab_table *t; int n_rows = 2; @@ -170,7 +170,7 @@ reg_stats_r (linreg * c) Table showing estimated regression coefficients. */ static void -reg_stats_coeff (linreg * c) +reg_stats_coeff (linreg * c, void *aux_) { size_t j; int n_cols = 7; @@ -186,6 +186,7 @@ reg_stats_coeff (linreg * c) struct tab_table *t; assert (c != NULL); + gsl_matrix *cov = (gsl_matrix *) aux_; n_rows = linreg_n_coeffs (c) + 3; t = tab_create (n_cols, n_rows, 0); @@ -234,8 +235,9 @@ reg_stats_coeff (linreg * c) Standardized coefficient, i.e., regression coefficient if all variables had unit variance. */ - beta = sqrt (gsl_matrix_get (linreg_cov (c), j, j)); - beta *= linreg_coeff (c, j) / c->depvar_std; + beta = sqrt (gsl_matrix_get (cov, j, j)); + beta *= linreg_coeff (c, j) / + sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); tab_double (t, 4, this_row, 0, beta, NULL); /* @@ -260,7 +262,7 @@ reg_stats_coeff (linreg * c) Display the ANOVA table. */ static void -reg_stats_anova (linreg * c) +reg_stats_anova (linreg * c, void *aux UNUSED) { int n_cols = 7; int n_rows = 4; @@ -316,40 +318,40 @@ reg_stats_anova (linreg * c) } static void -reg_stats_outs (linreg * c) +reg_stats_outs (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_zpp (linreg * c) +reg_stats_zpp (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_label (linreg * c) +reg_stats_label (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_sha (linreg * c) +reg_stats_sha (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_ci (linreg * c) +reg_stats_ci (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_f (linreg * c) +reg_stats_f (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_bcov (linreg * c) +reg_stats_bcov (linreg * c, void *aux UNUSED) { int n_cols; int n_rows; @@ -390,43 +392,43 @@ reg_stats_bcov (linreg * c) tab_submit (t); } static void -reg_stats_ses (linreg * c) +reg_stats_ses (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_xtx (linreg * c) +reg_stats_xtx (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_collin (linreg * c) +reg_stats_collin (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_tol (linreg * c) +reg_stats_tol (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -reg_stats_selection (linreg * c) +reg_stats_selection (linreg * c, void *aux UNUSED) { assert (c != NULL); } static void -statistics_keyword_output (void (*function) (linreg *), - int keyword, linreg * c) +statistics_keyword_output (void (*function) (linreg *, void *), + int keyword, linreg * c, void *aux) { if (keyword) { - (*function) (c); + (*function) (c, aux); } } static void -subcommand_statistics (int *keywords, linreg * c) +subcommand_statistics (int *keywords, linreg * c, void *aux) { /* The order here must match the order in which the STATISTICS @@ -486,21 +488,21 @@ subcommand_statistics (int *keywords, linreg * c) keywords[r] = 1; } } - statistics_keyword_output (reg_stats_r, keywords[r], c); - statistics_keyword_output (reg_stats_anova, keywords[anova], c); - statistics_keyword_output (reg_stats_coeff, keywords[coeff], c); - statistics_keyword_output (reg_stats_outs, keywords[outs], c); - statistics_keyword_output (reg_stats_zpp, keywords[zpp], c); - statistics_keyword_output (reg_stats_label, keywords[label], c); - statistics_keyword_output (reg_stats_sha, keywords[sha], c); - statistics_keyword_output (reg_stats_ci, keywords[ci], c); - statistics_keyword_output (reg_stats_f, keywords[f], c); - statistics_keyword_output (reg_stats_bcov, keywords[bcov], c); - statistics_keyword_output (reg_stats_ses, keywords[ses], c); - statistics_keyword_output (reg_stats_xtx, keywords[xtx], c); - statistics_keyword_output (reg_stats_collin, keywords[collin], c); - statistics_keyword_output (reg_stats_tol, keywords[tol], c); - statistics_keyword_output (reg_stats_selection, keywords[selection], c); + statistics_keyword_output (reg_stats_r, keywords[r], c, aux); + statistics_keyword_output (reg_stats_anova, keywords[anova], c, aux); + statistics_keyword_output (reg_stats_coeff, keywords[coeff], c, aux); + statistics_keyword_output (reg_stats_outs, keywords[outs], c, aux); + statistics_keyword_output (reg_stats_zpp, keywords[zpp], c, aux); + statistics_keyword_output (reg_stats_label, keywords[label], c, aux); + statistics_keyword_output (reg_stats_sha, keywords[sha], c, aux); + statistics_keyword_output (reg_stats_ci, keywords[ci], c, aux); + statistics_keyword_output (reg_stats_f, keywords[f], c, aux); + statistics_keyword_output (reg_stats_bcov, keywords[bcov], c, aux); + statistics_keyword_output (reg_stats_ses, keywords[ses], c, aux); + statistics_keyword_output (reg_stats_xtx, keywords[xtx], c, aux); + statistics_keyword_output (reg_stats_collin, keywords[collin], c, aux); + statistics_keyword_output (reg_stats_tol, keywords[tol], c, aux); + statistics_keyword_output (reg_stats_selection, keywords[selection], c, aux); } /* @@ -815,9 +817,10 @@ fill_covariance (gsl_matrix *cov, struct covariance *all_cov, const gsl_matrix *ssizes; const gsl_matrix *cm; const gsl_matrix *mean_matrix; + const gsl_matrix *ssize_matrix; double result = 0.0; - cm = covariance_calculate (all_cov); + cm = covariance_calculate_unnormalized (all_cov); rows = xnmalloc (cov->size1 - 1, sizeof (*rows)); for (i = 0; i < n_all_vars; i++) @@ -835,16 +838,19 @@ fill_covariance (gsl_matrix *cov, struct covariance *all_cov, } } mean_matrix = covariance_moments (all_cov, MOMENT_MEAN); + ssize_matrix = covariance_moments (all_cov, MOMENT_NONE); for (i = 0; i < cov->size1 - 1; i++) { - means[i] = gsl_matrix_get (mean_matrix, rows[i], 0); + means[i] = gsl_matrix_get (mean_matrix, rows[i], 0) + / gsl_matrix_get (ssize_matrix, rows[i], 0); for (j = 0; j < cov->size2 - 1; j++) { gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j])); gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i])); } } - means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0); + means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0) + / gsl_matrix_get (ssize_matrix, dep_subscript, 0); ssizes = covariance_moments (all_cov, MOMENT_NONE); result = gsl_matrix_get (ssizes, dep_subscript, rows[0]); for (i = 0; i < cov->size1 - 1; i++) @@ -934,6 +940,7 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, { linreg_set_indep_variable_mean (models[k], i, means[i]); } + linreg_set_depvar_mean (models[k], means[i]); /* For large data sets, use QR decomposition. */ @@ -951,7 +958,7 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, if (!taint_has_tainted_successor (casereader_get_taint (input))) { - subcommand_statistics (cmd->a_statistics, models[k]); + subcommand_statistics (cmd->a_statistics, models[k], this_cm); } } else diff --git a/src/math/linreg.c b/src/math/linreg.c index 6a0d57f4..e0083eca 100644 --- a/src/math/linreg.c +++ b/src/math/linreg.c @@ -81,9 +81,7 @@ linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, } c->indep_means = gsl_vector_alloc (p); c->indep_std = gsl_vector_alloc (p); - c->ssx = gsl_vector_alloc (p); /* Sums of squares for the - independent variables. - */ + c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model parameters. */ @@ -91,11 +89,13 @@ linreg_alloc (const struct variable *depvar, const struct variable **indep_vars, c->n_indeps = p; c->n_coeffs = p; c->coeff = xnmalloc (p, sizeof (*c->coeff)); - c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1); + c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1); c->dft = n - 1; c->dfm = p; c->dfe = c->dft - c->dfm; c->intercept = 0.0; + c->depvar_mean = 0.0; + c->depvar_std = 0.0; /* Default settings. */ @@ -115,7 +115,6 @@ linreg_free (void *m) gsl_vector_free (c->indep_means); gsl_vector_free (c->indep_std); gsl_matrix_free (c->cov); - gsl_vector_free (c->ssx); free (c->indep_vars); free (c->coeff); free (c); @@ -266,6 +265,34 @@ void linreg_set_indep_variable_mean (linreg *c, size_t j, double m) assert (c != NULL); gsl_vector_set (c->indep_means, j, m); } +static void invert_r (gsl_matrix *r, gsl_matrix *r_inv) +{ + size_t i; + size_t j; + size_t k; + size_t row; + double tmp; + + for (i = 0; i < r->size1; i++) + { + gsl_matrix_set (r_inv, i, i, 1.0 / gsl_matrix_get (r, i, i)); + } + for (i = 0; i < r->size1; i++) + { + row = 0; + for (j = row + 1 + i; j < r->size2; j++) + { + tmp = 0.0; + for (k = 1; k <= j - row; k++) + { + tmp += gsl_matrix_get (r, row, row + k) + * gsl_matrix_get (r_inv, row + k, j); + } + gsl_matrix_set (r_inv, row, j, -tmp / gsl_matrix_get (r, row, row)); + row++; + } + } +} static void linreg_fit_qr (const gsl_matrix *cov, linreg *l) @@ -296,52 +323,61 @@ linreg_fit_qr (const gsl_matrix *cov, linreg *l) gsl_linalg_QR_decomp (xtx, tau); q = gsl_matrix_alloc (xtx->size1, xtx->size2); r = gsl_matrix_alloc (xtx->size1, xtx->size2); + gsl_linalg_QR_unpack (xtx, tau, q, r); gsl_linalg_QR_solve (xtx, tau, xty, params); for (i = 0; i < params->size; i++) { l->coeff[i] = gsl_vector_get (params, i); } - - l->intercept = l->depvar_mean; + l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1); + l->ssm = 0.0; for (i = 0; i < l->n_indeps; i++) { - l->intercept -= l->coeff[i] * linreg_get_indep_variable_mean (l, i); + l->ssm += gsl_vector_get (xty, i) * l->coeff[i]; } + l->sse = l->sst - l->ssm; - l->sse = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1); - for (i = 0; i < l->n_indeps; i++) + gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l), + r, q); + /* Copy the lower triangle into the upper triangle. */ + double intercept_variance = 0.0; + for (i = 0; i < q->size1; i++) { - tmp += gsl_vector_get (xty, i) * l->coeff[i]; - } - l->sse -= 2.0 * tmp; - for (i = 0; i < xtx->size1; i++) - { - tmp = 0.0; - for (j = i; j < xtx->size2; j++) + gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i)); + for (j = i + 1; j < q->size2; j++) { - tmp += gsl_matrix_get (xtx, i, j) * l->coeff[j]; + intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) * + linreg_get_indep_variable_mean (l, i) * + linreg_get_indep_variable_mean (l, j); + gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i)); } - l->sse += tmp * tmp; + } + l->intercept = linreg_get_depvar_mean (l); + tmp = 0.0; + for (i = 0; i < l->n_indeps; i++) + { + tmp = linreg_get_indep_variable_mean (l, i); + l->intercept -= l->coeff[i] * tmp; + intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i); } -#if 0 - p = l->hat->size1 - 1; - for (i = 0; i < l->cov->size1 - 1; i++) + /* Covariances related to the intercept. */ + intercept_variance += linreg_mse (l) / linreg_n_obs (l); + gsl_matrix_set (l->cov, 0, 0, intercept_variance); + double intcpt_coef = 0.0; + for (i = 0; i < q->size1; i++) { - gsl_matrix_set (l->cov, p - i, p - i, 1.0 / gsl_matrix_get (r, p - i, p - i)); - for (j = 0; j < i; j++) + for (j = 0; j < q->size2; j++) { - tmp = -1.0 * gsl_matrix_get (r, p - i, p - j); - tmp /= gsl_matrix_get (r, p - i, p - i) * gsl_matrix_get (r, p - j, p - j); - gsl_matrix_set (l->cov, p - i, p - j, tmp); + intcpt_coef -= gsl_matrix_get (q, i, j) + * linreg_get_indep_variable_mean (l, j); } + gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef); + gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef); + intcpt_coef = 0.0; } -#endif - gsl_matrix_transpose_memcpy (l->cov, q); - gsl_blas_dtrsm (CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 1.0, - r, l->cov); - + gsl_matrix_free (q); gsl_matrix_free (r); gsl_vector_free (xty); @@ -359,24 +395,23 @@ linreg_fit_qr (const gsl_matrix *cov, linreg *l) void linreg_fit (const gsl_matrix *cov, linreg *l) { - gsl_matrix *params; assert (l != NULL); assert (cov != NULL); - params = gsl_matrix_calloc (cov->size1, cov->size2); - gsl_matrix_memcpy (params, cov); l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1); - if (l->method == LINREG_SWEEP) { + gsl_matrix *params; + params = gsl_matrix_calloc (cov->size1, cov->size2); + gsl_matrix_memcpy (params, cov); reg_sweep (params); post_sweep_computations (l, params); + gsl_matrix_free (params); } else if (l->method == LINREG_QR) { - linreg_fit_qr (params, l); + linreg_fit_qr (cov, l); } - gsl_matrix_free (params); } double linreg_mse (const linreg *c) @@ -414,7 +449,7 @@ linreg_n_coeffs (const linreg *c) return c->n_coeffs; } -size_t +double linreg_n_obs (const linreg *c) { return c->n_obs; @@ -442,3 +477,15 @@ linreg_dfmodel ( const linreg *c) { return c->dfm; } + +void +linreg_set_depvar_mean (linreg *c, double x) +{ + c->depvar_mean = x; +} + +double +linreg_get_depvar_mean (linreg *c) +{ + return c->depvar_mean; +} diff --git a/src/math/linreg.h b/src/math/linreg.h index 17fe3c73..349d5a90 100644 --- a/src/math/linreg.h +++ b/src/math/linreg.h @@ -129,11 +129,6 @@ struct linreg_struct dfe, but since it is the best unbiased estimate of the population variance, it has its own entry here. */ - gsl_vector *ssx; /* Centered sums of squares for independent - variables, i.e. \sum (x[i] - mean(x))^2. */ - double ssy; /* Centered sums of squares for dependent - variable. - */ /* Covariance matrix of the parameter estimates. */ @@ -145,12 +140,6 @@ struct linreg_struct double dfe; double dfm; - /* - 'Hat' or Hessian matrix, i.e. (X'X)^{-1}, where X is our - design matrix. - */ - gsl_matrix *hat; - struct variable *pred; struct variable *resid; }; @@ -197,9 +186,11 @@ gsl_matrix * linreg_cov (const linreg *); double linreg_coeff (const linreg *, size_t); const struct variable * linreg_indep_var (const linreg *, size_t); size_t linreg_n_coeffs (const linreg *); -size_t linreg_n_obs (const linreg *); +double linreg_n_obs (const linreg *); double linreg_sse (const linreg *); double linreg_ssreg (const linreg *); double linreg_dfmodel (const linreg *); double linreg_sst (const linreg *); +void linreg_set_depvar_mean (linreg *, double); +double linreg_get_depvar_mean (linreg *); #endif