From 840f7bace2423e6d240320ab308f0fbaa8c559f1 Mon Sep 17 00:00:00 2001 From: Jason H Stover Date: Fri, 24 Jun 2011 10:05:51 -0400 Subject: [PATCH] Added type 3 sums of squares to GLM --- src/language/stats/glm.c | 99 ++++++++++++++++++++++++++++++++++++++-- src/math/categoricals.c | 9 ++-- src/math/covariance.c | 25 ++++++++++ src/math/covariance.h | 2 + 4 files changed, 126 insertions(+), 9 deletions(-) diff --git a/src/language/stats/glm.c b/src/language/stats/glm.c index e09382a5..b8aa71cb 100644 --- a/src/language/stats/glm.c +++ b/src/language/stats/glm.c @@ -58,6 +58,13 @@ struct glm_spec /* The weight variable */ const struct variable *wv; + /* + Sums of squares due to different variables. Element 0 is the SSE + for the entire model. For i > 0, element i is the SS due to + variable i. + */ + gsl_vector * ssq; + bool intercept; }; @@ -67,8 +74,8 @@ struct glm_workspace struct moments *totals; }; -static void output_glm (const struct glm_spec *, const struct glm_workspace *ws); -static void run_glm (const struct glm_spec *cmd, struct casereader *input, const struct dataset *ds); +static void output_glm (struct glm_spec *, const struct glm_workspace *ws); +static void run_glm (struct glm_spec *cmd, struct casereader *input, const struct dataset *ds); int cmd_glm (struct lexer *lexer, struct dataset *ds) @@ -183,10 +190,86 @@ cmd_glm (struct lexer *lexer, struct dataset *ds) return CMD_FAILURE; } +static void get_ssq (struct covariance *, gsl_vector *, struct glm_spec *); + +static bool +not_dropped (size_t j, size_t * dropped, size_t n_dropped) +{ + size_t i; + + for (i = 0; i < n_dropped; i++) + { + if (j == dropped [i]) + return false; + } + return true; +} + +static void +get_ssq (struct covariance * cov, gsl_vector * ssq, struct glm_spec * cmd) +{ + const struct variable **vars; + gsl_matrix * small_cov = NULL; + gsl_matrix * cm = covariance_calculate_unnormalized (cov); + size_t i; + size_t j; + size_t k; + size_t n; + size_t m; + size_t * dropped; + size_t n_dropped; + + dropped = xcalloc (covariance_dim (cov), sizeof (*dropped)); + vars = xcalloc (covariance_dim (cov), sizeof (*vars)); + covariance_get_var_indices (cov, vars); + + for (k = 0; k < cmd->n_factor_vars; k++) + { + n_dropped = 0; + for (i = 1; i < covariance_dim (cov); i++) + { + if (vars [i] == cmd->factor_vars [k]) + { + dropped [n_dropped++] = i; + } + } + small_cov = gsl_matrix_alloc (cm->size1 - n_dropped, cm->size2 - n_dropped); + gsl_matrix_set (small_cov, 0, 0, gsl_matrix_get (cm, 0, 0)); + n = 0; + m = 0; + for (i = 0; i < cm->size1; i++) + { + if (not_dropped (i, dropped, n_dropped)) + { + m = 0; + for (j = 0; j < cm->size2; j++) + { + if (not_dropped (j, dropped, n_dropped)) + { + gsl_matrix_set (small_cov, n, m, gsl_matrix_get (cm, i, j)); + m++; + } + } + n++; + } + } + reg_sweep (small_cov, 0); + gsl_vector_set (ssq, k + 1, + gsl_matrix_get (small_cov, 0, 0) + - gsl_vector_get (ssq, 0)); + gsl_matrix_free (small_cov); + } + + free (dropped); + free (vars); + gsl_matrix_free (cm); + +} + static void dump_matrix (const gsl_matrix *m); static void -run_glm (const struct glm_spec *cmd, struct casereader *input, const struct dataset *ds) +run_glm (struct glm_spec *cmd, struct casereader *input, const struct dataset *ds) { int v; struct taint *taint; @@ -255,6 +338,14 @@ run_glm (const struct glm_spec *cmd, struct casereader *input, const struct data reg_sweep (cm, 0); + /* + Store the overall SSE. + */ + cmd->ssq = gsl_vector_alloc (cm->size1); + gsl_vector_set (cmd->ssq, 0, gsl_matrix_get (cm, 0, 0)); + get_ssq (cov, cmd->ssq, cmd); + + gsl_vector_free (cmd->ssq); dump_matrix (cm); gsl_matrix_free (cm); @@ -267,7 +358,7 @@ run_glm (const struct glm_spec *cmd, struct casereader *input, const struct data } static void -output_glm (const struct glm_spec *cmd, const struct glm_workspace *ws) +output_glm (struct glm_spec *cmd, const struct glm_workspace *ws) { const struct fmt_spec *wfmt = cmd->wv ? var_get_print_format (cmd->wv) : &F_8_0; diff --git a/src/math/categoricals.c b/src/math/categoricals.c index 215f4d1f..f40ae10b 100644 --- a/src/math/categoricals.c +++ b/src/math/categoricals.c @@ -177,7 +177,7 @@ categoricals_dump (const struct categoricals *cat) printf ("\nReverse variable map:\n"); - for (v = 0 ; v < cat->n_cats_total; ++v) + for (v = 0 ; v < cat->n_cats_total - cat->n_vars; ++v) printf ("%d ", cat->reverse_variable_map[v]); printf ("\n"); } @@ -323,7 +323,7 @@ categoricals_done (const struct categoricals *cat_) int v; int idx = 0; cat->reverse_variable_map = pool_calloc (cat->pool, - cat->n_cats_total, + cat->n_cats_total - cat->n_vars, sizeof *cat->reverse_variable_map); for (v = 0 ; v < cat->n_vp; ++v) @@ -349,12 +349,11 @@ categoricals_done (const struct categoricals *cat_) /* Populate the reverse variable map. */ - for (i = 0; i < vp->n_cats; ++i) + for (i = 0; i < vp->n_cats - 1; ++i) cat->reverse_variable_map[idx++] = v; } assert (cat->n_vars <= cat->n_vp); - } @@ -363,7 +362,7 @@ reverse_variable_lookup (const struct categoricals *cat, int subscript) { assert (cat->reverse_variable_map); assert (subscript >= 0); - assert (subscript < cat->n_cats_total); + assert (subscript < cat->n_cats_total - cat->n_vars); return cat->reverse_variable_map[subscript]; } diff --git a/src/math/covariance.c b/src/math/covariance.c index 446ee662..a7a5f131 100644 --- a/src/math/covariance.c +++ b/src/math/covariance.c @@ -731,3 +731,28 @@ covariance_destroy (struct covariance *cov) free (cov->cm); free (cov); } + +size_t +covariance_dim (const struct covariance * cov) +{ + return (cov->dim); +} + +/* + Returns an array of variables corresponding to rows of the covariance matrix. + In other words, element i of the array is the variable corresponding to + row (and column) i of the covariance matrix. + */ +void +covariance_get_var_indices (const struct covariance *cov, struct variable **vars) +{ + int i; + for (i = 0; i < cov->n_vars; i++) + { + vars[i] = cov->vars[i]; + } + for (i = cov->n_vars; i < cov->dim; i++) + { + vars[i] = categoricals_get_variable_by_subscript (cov->categoricals, i - cov->n_vars); + } +} diff --git a/src/math/covariance.h b/src/math/covariance.h index 4eb2f3f6..6ec645b7 100644 --- a/src/math/covariance.h +++ b/src/math/covariance.h @@ -48,4 +48,6 @@ const gsl_matrix *covariance_moments (const struct covariance *cov, int m); const struct categoricals * covariance_get_categoricals (const struct covariance *cov); +void covariance_get_var_indices (const struct covariance *cov, struct variable **vars); +size_t covariance_dim (const struct covariance * cov); #endif -- 2.30.2