From b4fdd51a0bf62800c53c8a805f31ea735d931029 Mon Sep 17 00:00:00 2001
From: Jason H Stover <jhs@math.gcsu.edu>
Date: Mon, 15 Sep 2008 12:35:11 -0400
Subject: [PATCH] pspp_coeff_var_to_coeff: Guard against a null pointer in
 coefs[i]->v_info. regression.q: Pass variable lists to
 pspp_linreg_cache_alloc. pspp_linreg_cache_alloc: Take variable lists as
 arguments to allow easier access 			 later and computation
 of number of coefficients during allocation. rearrange_covariance_matrix: Use
 the new stored list of variables in the linreg_cache 			    
 instead of calling pspp_linreg_get_vars(). cache_init: Move computation of
 n_coeffs to pspp_linreg_cache_alloc().

---
 src/language/stats/glm.q        |  14 ++--
 src/language/stats/regression.q |   3 +-
 src/math/coefficient.c          |  10 ++-
 src/math/linreg.c               | 113 +++++++++++++++-----------------
 src/math/linreg.h               |   6 +-
 5 files changed, 77 insertions(+), 69 deletions(-)

diff --git a/src/language/stats/glm.q b/src/language/stats/glm.q
index ad8bf55e..ddc2a78b 100644
--- a/src/language/stats/glm.q
+++ b/src/language/stats/glm.q
@@ -151,13 +151,18 @@ glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
   return 1;
 }
 
+/*
+  COV is the covariance matrix for variables included in the
+  model. That means the dependent variable is in there, too.
+ */
 static void
-coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
+coeff_init (pspp_linreg_cache * c, struct design_matrix *cov)
 {
-  c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
+  c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff));
+  c->n_coeffs = cov->m->size2 - 1;
   c->coeff[0] = xmalloc (sizeof (*(c->coeff[0])));	/* The first coefficient is the intercept. */
   c->coeff[0]->v_info = NULL;	/* Intercept has no associated variable. */
-  pspp_coeff_init (c->coeff + 1, dm);
+  pspp_coeff_init (c->coeff + 1, cov);
 }
 
 /*
@@ -339,8 +344,7 @@ run_glm (struct casereader *input,
 		}
 	    }
 	}
-      model = pspp_linreg_cache_alloc (n_data, n_indep);
-      model->depvar = v_dependent;
+      model = pspp_linreg_cache_alloc (v_dependent, indep_vars, n_data, n_indep);
       /*
 	For large data sets, use QR decomposition.
       */
diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q
index 0f8ae594..08954d87 100644
--- a/src/language/stats/regression.q
+++ b/src/language/stats/regression.q
@@ -973,7 +973,8 @@ run_regression (struct casereader *input, struct cmd_regression *cmd,
 	    {
 	      lopts.get_indep_mean_std[i] = 1;
 	    }
-	  models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
+	  models[k] = pspp_linreg_cache_alloc (dep_var, (const struct variable **) indep_vars,
+					       X->m->size1, X->m->size2);
 	  models[k]->depvar = dep_var;
 	  /*
 	     For large data sets, use QR decomposition.
diff --git a/src/math/coefficient.c b/src/math/coefficient.c
index 5872b576..2feeedba 100644
--- a/src/math/coefficient.c
+++ b/src/math/coefficient.c
@@ -160,13 +160,21 @@ pspp_coeff_var_to_coeff (const struct variable *v, struct pspp_coeff **coefs,
   size_t i = 0;
   size_t j = 0;
   size_t v_idx;
+  int found = 0;
   struct pspp_coeff *result = NULL;
 
   if (v != NULL)
     {
       v_idx = var_get_dict_index (v);
-      while (i < n_coef && var_get_dict_index (coefs[i]->v_info->v) != v_idx)
+      while (i < n_coef)
 	{
+	  if (coefs[i]->v_info != NULL)
+	    {
+	      if (var_get_dict_index (coefs[i]->v_info->v) == v_idx)
+		{
+		  break;
+		}
+	    }
 	  i++;
 	}
       result = coefs[i];
diff --git a/src/math/linreg.c b/src/math/linreg.c
index 355261b6..a7fa1565 100644
--- a/src/math/linreg.c
+++ b/src/math/linreg.c
@@ -99,7 +99,7 @@ int
 pspp_linreg_get_vars (const void *c_, struct variable **v)
 {
   const pspp_linreg_cache *c = c_;
-  const struct variable *tmp;
+  struct variable *tmp;
   int i;
   int j;
   int result = 0;
@@ -138,12 +138,15 @@ pspp_linreg_get_vars (const void *c_, struct variable **v)
   independent variables.
  */
 pspp_linreg_cache *
-pspp_linreg_cache_alloc (size_t n, size_t p)
+pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable **indep_vars,
+			 size_t n, size_t p)
 {
+  size_t i;
   pspp_linreg_cache *c;
 
   c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
-  c->depvar = NULL;
+  c->depvar = depvar;
+  c->indep_vars = indep_vars;
   c->indep_means = gsl_vector_alloc (p);
   c->indep_std = gsl_vector_alloc (p);
   c->ssx = gsl_vector_alloc (p);	/* Sums of squares for the
@@ -152,9 +155,22 @@ pspp_linreg_cache_alloc (size_t n, size_t p)
   c->ss_indeps = gsl_vector_alloc (p);	/* Sums of squares for the
 					   model parameters.
 					 */
-  c->cov = gsl_matrix_alloc (p + 1, p + 1);	/* Covariance matrix. */
   c->n_obs = n;
   c->n_indeps = p;
+  c->n_coeffs = 0;
+  for (i = 0; i < p; i++)
+    {
+      if (var_is_numeric (indep_vars[i]))
+	{
+	  c->n_coeffs++;
+	}
+      else
+	{
+	  c->n_coeffs += cat_get_n_categories (indep_vars[i]) - 1;
+	}
+    }
+
+  c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1);
   /*
      Default settings.
    */
@@ -194,13 +210,12 @@ pspp_linreg_cache_free (void *m)
   return true;
 }
 static void
-cache_init (pspp_linreg_cache *cache, const struct design_matrix *dm)
+cache_init (pspp_linreg_cache *cache)
 {
   assert (cache != NULL);
   cache->dft = cache->n_obs - 1;
   cache->dfm = cache->n_indeps;
   cache->dfe = cache->dft - cache->dfm;
-  cache->n_coeffs = dm->m->size2;
   cache->intercept = 0.0;
 }
 
@@ -320,7 +335,8 @@ pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
       cache->depvar_std = s;
       cache->sst = ss;
     }
-  cache_init (cache, dm);
+  cache_init (cache);
+  cache->n_coeffs = dm->m->size2;
   for (i = 0; i < dm->m->size2; i++)
     {
       if (opts->get_indep_mean_std[i])
@@ -647,83 +663,60 @@ void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct var
 static struct design_matrix *
 rearrange_covariance_matrix (const struct design_matrix *cov, pspp_linreg_cache *c)
 {
-  struct variable **v;
   struct variable **model_vars;
-  struct variable *tmp;
   struct design_matrix *result;
-  int n_vars;
-  int found;
-  size_t *columns;
+  size_t *permutation;
   size_t i;
   size_t j;
   size_t k;
-  size_t dep_col;
 
   assert (cov != NULL);
   assert (c != NULL);
   assert (cov->m->size1 > 0);
   assert (cov->m->size2 == cov->m->size1);
-  v = xnmalloc (c->n_coeffs, sizeof (*v));
-  model_vars = xnmalloc (c->n_coeffs, sizeof (*model_vars));
-  columns = xnmalloc (cov->m->size2, sizeof (*columns));
-  n_vars = pspp_linreg_get_vars (c, v);
-  dep_col = 0;
-  k = 0;
-  for (i = 0; i < cov->m->size2; i++)
-    {
-      tmp = design_matrix_col_to_var (cov, i);
-      found = 0;
-      j = 0;
-      while (!found && j < n_vars)
-	{
-	  if (tmp == v[j])
-	    {
-	      found = 1;
-	      if (tmp == c->depvar)
-		{
-		  dep_col = j;
-		}
-	      else
-		{
-		  columns[k] = j;
-		  k++;
-		}
-	    }
-	  j++;
-	}
-    }
-  k++;
-  columns[k] = dep_col;
-  /*
-    K should now be equal to C->N_INDEPS + 1. If it is not, then
-    either the code above is wrong or the caller didn't send us the
-    correct values in C.
-   */
-  assert (k == c->n_indeps + 1);
+  permutation = xnmalloc (cov->m->size2, sizeof (*permutation));
+  model_vars = xnmalloc (1 + c->n_indeps, sizeof (*model_vars));
+
   /*
     Put the model variables in the right order in MODEL_VARS.
    */
-  for (i = 0; i < k; i++)
+  for (i = 0; i < c->n_indeps; i++)
     {
-      model_vars[i] = v[columns[i]];
+      model_vars[i] = c->indep_vars[i];
     }
-
-  result = covariance_matrix_create (k, model_vars);
-  for (i = 0; i < result->m->size1; i++)
+  model_vars[i] = c->depvar;
+  result = covariance_matrix_create (1 + c->n_indeps, model_vars);
+  for (j = 0; j < cov->m->size2; j++)
     {
-      for (j = 0; j < result->m->size2; j++)
+      for (k = 0; k < result->m->size2; k++)
 	{
-	  gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, columns[i], columns[j]));
+	  if (design_matrix_col_to_var (cov, j) == design_matrix_col_to_var (result, k)) 
+	    {
+	      permutation[k] = j;
+	      break;
+	    }
 	}
     }
-  free (columns);
-  free (v);
+  for (j = 0; j < result->m->size2; j++)
+    for (i = 0; i < result->m->size1; i++)
+      {
+	gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, permutation[i], permutation[j]));
+      }
+  free (permutation);
+  free (model_vars);
   return result;
 }
 /*
   Estimate the model parameters from the covariance matrix only. This
   method uses less memory than PSPP_LINREG, which requires the entire
   data set to be stored in memory.
+
+  The function assumes FULL_COV may contain columns corresponding to
+  variables that are not in the model. It fixes this in
+  REARRANG_COVARIANCE_MATRIX. This allows the caller to compute a
+  large covariance matrix once before, then pass it to this without
+  having to alter it. The problem is that this means the caller must
+  set CACHE->N_COEFFS.
 */
 int
 pspp_linreg_with_cov (const struct design_matrix *full_cov, 
@@ -735,7 +728,7 @@ pspp_linreg_with_cov (const struct design_matrix *full_cov,
   assert (cache != NULL);
 
   cov = rearrange_covariance_matrix (full_cov, cache);
-  cache_init (cache, cov);
+  cache_init (cache);
   reg_sweep (cov->m);
   post_sweep_computations (cache, cov, cov->m);  
   covariance_matrix_destroy (cov);
diff --git a/src/math/linreg.h b/src/math/linreg.h
index 18f962c0..5a5c4c8e 100644
--- a/src/math/linreg.h
+++ b/src/math/linreg.h
@@ -96,9 +96,10 @@ struct pspp_linreg_cache_struct
 				   coefficient here. */
 
   /*
-    Pointer to the dependent variable.
+    Pointers to the variables.
    */
   const struct variable *depvar;
+  const struct variable **indep_vars;
 
   gsl_vector *residuals;
   struct pspp_coeff **coeff;
@@ -175,7 +176,8 @@ typedef struct pspp_linreg_cache_struct pspp_linreg_cache;
   to it. n is the number of cases, p is the number of
   independent variables.
  */
-pspp_linreg_cache *pspp_linreg_cache_alloc (size_t n, size_t p);
+pspp_linreg_cache *pspp_linreg_cache_alloc (const struct variable *, const struct variable **, 
+					    size_t, size_t);
 
 bool pspp_linreg_cache_free (void *);
 
-- 
2.30.2