From 840f7bace2423e6d240320ab308f0fbaa8c559f1 Mon Sep 17 00:00:00 2001
From: Jason H Stover <jhs@franz.gcsu.edu>
Date: Fri, 24 Jun 2011 10:05:51 -0400
Subject: [PATCH] Added type 3 sums of squares to GLM

---
 src/language/stats/glm.c | 99 ++++++++++++++++++++++++++++++++++++++--
 src/math/categoricals.c  |  9 ++--
 src/math/covariance.c    | 25 ++++++++++
 src/math/covariance.h    |  2 +
 4 files changed, 126 insertions(+), 9 deletions(-)

diff --git a/src/language/stats/glm.c b/src/language/stats/glm.c
index e09382a5c5..b8aa71cb4d 100644
--- a/src/language/stats/glm.c
+++ b/src/language/stats/glm.c
@@ -58,6 +58,13 @@ struct glm_spec
   /* The weight variable */
   const struct variable *wv;
 
+  /* 
+     Sums of squares due to different variables. Element 0 is the SSE
+     for the entire model. For i > 0, element i is the SS due to
+     variable i.
+   */
+  gsl_vector * ssq;
+
   bool intercept;
 };
 
@@ -67,8 +74,8 @@ struct glm_workspace
   struct moments *totals;
 };
 
-static void output_glm (const struct glm_spec *, const struct glm_workspace *ws);
-static void run_glm (const struct glm_spec *cmd, struct casereader *input, const struct dataset *ds);
+static void output_glm (struct glm_spec *, const struct glm_workspace *ws);
+static void run_glm (struct glm_spec *cmd, struct casereader *input, const struct dataset *ds);
 
 int
 cmd_glm (struct lexer *lexer, struct dataset *ds)
@@ -183,10 +190,86 @@ cmd_glm (struct lexer *lexer, struct dataset *ds)
   return CMD_FAILURE;
 }
 
+static void get_ssq (struct covariance *, gsl_vector *, struct glm_spec *);
+
+static bool
+not_dropped (size_t j, size_t * dropped, size_t n_dropped)
+{
+  size_t i;
+
+  for (i = 0; i < n_dropped; i++)
+    {
+      if (j == dropped [i])
+	return false;
+    }
+  return true;
+}
+
+static void
+get_ssq (struct covariance * cov, gsl_vector * ssq, struct glm_spec * cmd)
+{
+  const struct variable **vars;
+  gsl_matrix * small_cov = NULL;
+  gsl_matrix * cm = covariance_calculate_unnormalized (cov);
+  size_t i;
+  size_t j;
+  size_t k;
+  size_t n;
+  size_t m;
+  size_t * dropped;
+  size_t n_dropped;
+
+  dropped = xcalloc (covariance_dim (cov), sizeof (*dropped));
+  vars = xcalloc (covariance_dim (cov), sizeof (*vars));
+  covariance_get_var_indices (cov, vars);
+
+  for (k = 0; k < cmd->n_factor_vars; k++)
+    {
+      n_dropped = 0;
+      for (i = 1; i < covariance_dim (cov); i++)
+	{
+	  if (vars [i] == cmd->factor_vars [k])
+	    {
+	      dropped [n_dropped++] = i;
+	    }
+	}
+      small_cov = gsl_matrix_alloc (cm->size1 - n_dropped, cm->size2 - n_dropped);
+      gsl_matrix_set (small_cov, 0, 0, gsl_matrix_get (cm, 0, 0));
+      n = 0;
+      m = 0;
+      for (i = 0; i < cm->size1; i++)
+	{
+	  if (not_dropped (i, dropped, n_dropped))
+	    {
+	      m = 0;
+	      for (j = 0; j < cm->size2; j++)
+		{
+		  if (not_dropped (j, dropped, n_dropped))
+		    {
+		      gsl_matrix_set (small_cov, n, m, gsl_matrix_get (cm, i, j));
+		      m++;
+		    }
+		}
+	      n++;
+	    }
+	}
+      reg_sweep (small_cov, 0);
+      gsl_vector_set (ssq, k + 1, 
+		      gsl_matrix_get (small_cov, 0, 0)
+		      - gsl_vector_get (ssq, 0));
+      gsl_matrix_free (small_cov);
+    }
+
+  free (dropped);
+  free (vars);
+  gsl_matrix_free (cm);
+
+}
+
 static  void dump_matrix (const gsl_matrix *m);
 
 static void
-run_glm (const struct glm_spec *cmd, struct casereader *input, const struct dataset *ds)
+run_glm (struct glm_spec *cmd, struct casereader *input, const struct dataset *ds)
 {
   int v;
   struct taint *taint;
@@ -255,6 +338,14 @@ run_glm (const struct glm_spec *cmd, struct casereader *input, const struct data
 
     reg_sweep (cm, 0);
 
+    /*
+      Store the overall SSE.
+     */
+    cmd->ssq = gsl_vector_alloc (cm->size1);
+    gsl_vector_set (cmd->ssq, 0, gsl_matrix_get (cm, 0, 0));
+    get_ssq (cov, cmd->ssq, cmd);
+
+    gsl_vector_free (cmd->ssq);
     dump_matrix (cm);
 
     gsl_matrix_free (cm);
@@ -267,7 +358,7 @@ run_glm (const struct glm_spec *cmd, struct casereader *input, const struct data
 }
 
 static void
-output_glm (const struct glm_spec *cmd, const struct glm_workspace *ws)
+output_glm (struct glm_spec *cmd, const struct glm_workspace *ws)
 {
   const struct fmt_spec *wfmt = cmd->wv ? var_get_print_format (cmd->wv) : &F_8_0;
 
diff --git a/src/math/categoricals.c b/src/math/categoricals.c
index 215f4d1fbc..f40ae10b5d 100644
--- a/src/math/categoricals.c
+++ b/src/math/categoricals.c
@@ -177,7 +177,7 @@ categoricals_dump (const struct categoricals *cat)
 
   printf ("\nReverse variable map:\n");
 
-  for (v = 0 ; v < cat->n_cats_total; ++v)
+  for (v = 0 ; v < cat->n_cats_total - cat->n_vars; ++v)
     printf ("%d ", cat->reverse_variable_map[v]);
   printf ("\n");
 }
@@ -323,7 +323,7 @@ categoricals_done (const struct categoricals *cat_)
   int v;
   int idx = 0;
   cat->reverse_variable_map = pool_calloc (cat->pool,
-					   cat->n_cats_total,
+					   cat->n_cats_total - cat->n_vars,
 					   sizeof *cat->reverse_variable_map);
   
   for (v = 0 ; v < cat->n_vp; ++v)
@@ -349,12 +349,11 @@ categoricals_done (const struct categoricals *cat_)
 
       /* Populate the reverse variable map.
        */
-      for (i = 0; i < vp->n_cats; ++i)
+      for (i = 0; i < vp->n_cats - 1; ++i)
 	cat->reverse_variable_map[idx++] = v;
     }
 
   assert (cat->n_vars <= cat->n_vp);
-
 }
 
 
@@ -363,7 +362,7 @@ reverse_variable_lookup (const struct categoricals *cat, int subscript)
 {
   assert (cat->reverse_variable_map);
   assert (subscript >= 0);
-  assert (subscript < cat->n_cats_total);
+  assert (subscript < cat->n_cats_total - cat->n_vars);
 
   return cat->reverse_variable_map[subscript];
 }
diff --git a/src/math/covariance.c b/src/math/covariance.c
index 446ee6621a..a7a5f13165 100644
--- a/src/math/covariance.c
+++ b/src/math/covariance.c
@@ -731,3 +731,28 @@ covariance_destroy (struct covariance *cov)
   free (cov->cm);
   free (cov);
 }
+
+size_t
+covariance_dim (const struct covariance * cov)
+{
+  return (cov->dim);
+}
+
+/*
+  Returns an array of variables corresponding to rows of the covariance matrix.
+  In other words, element i of the array is the variable corresponding to 
+  row (and column) i of the covariance matrix.
+ */
+void
+covariance_get_var_indices (const struct covariance *cov, struct variable **vars)
+{
+  int i;
+  for (i = 0; i < cov->n_vars; i++)
+    {
+      vars[i] = cov->vars[i];
+    }
+  for (i = cov->n_vars; i < cov->dim; i++)
+    {
+      vars[i] = categoricals_get_variable_by_subscript (cov->categoricals, i - cov->n_vars);
+    }
+}
diff --git a/src/math/covariance.h b/src/math/covariance.h
index 4eb2f3f648..6ec645b7e2 100644
--- a/src/math/covariance.h
+++ b/src/math/covariance.h
@@ -48,4 +48,6 @@ const gsl_matrix *covariance_moments (const struct covariance *cov, int m);
 
 const struct categoricals * covariance_get_categoricals (const struct covariance *cov);
 
+void covariance_get_var_indices (const struct covariance *cov, struct variable **vars);
+size_t covariance_dim (const struct covariance * cov);
 #endif
-- 
2.30.2