pspp_coeff_var_to_coeff: Guard against a null pointer in coefs[i]->v_info.

author Jason H Stover <jhs@math.gcsu.edu>

Mon, 15 Sep 2008 16:35:11 +0000 (12:35 -0400)

committer Jason H Stover <jhs@math.gcsu.edu>

Mon, 15 Sep 2008 16:35:11 +0000 (12:35 -0400)
author Jason H Stover <jhs@math.gcsu.edu>
Mon, 15 Sep 2008 16:35:11 +0000 (12:35 -0400)
committer Jason H Stover <jhs@math.gcsu.edu>
Mon, 15 Sep 2008 16:35:11 +0000 (12:35 -0400)
diff --git a/src/language/stats/glm.q b/src/language/stats/glm.q

index ad8bf55e9dfd5dfee172289eb099e42c8922ab22..ddc2a78ba00388e8e507a2f32ce6e753aba25b77 100644 (file)
--- a/src/language/stats/glm.q
+++ b/src/language/stats/glm.q
@@ -151,13 +151,18 @@ glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
    return 1;
  }
  
+/*
+  COV is the covariance matrix for variables included in the
+  model. That means the dependent variable is in there, too.
+ */
  static void
-coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
+coeff_init (pspp_linreg_cache * c, struct design_matrix *cov)
  {
-  c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
+  c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff));
+  c->n_coeffs = cov->m->size2 - 1;
    c->coeff[0] = xmalloc (sizeof (*(c->coeff[0])));     /* The first coefficient is the intercept. */
    c->coeff[0]->v_info = NULL;  /* Intercept has no associated variable. */
-  pspp_coeff_init (c->coeff + 1, dm);
+  pspp_coeff_init (c->coeff + 1, cov);
  }
  
  /*
@@ -339,8 +344,7 @@ run_glm (struct casereader *input,
                 }
             }
         }
-      model = pspp_linreg_cache_alloc (n_data, n_indep);
-      model->depvar = v_dependent;
+      model = pspp_linreg_cache_alloc (v_dependent, indep_vars, n_data, n_indep);
        /*
         For large data sets, use QR decomposition.
        */
diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q

index 0f8ae59433b615f3fb53fae51d04264bdbab88f1..08954d87e5e63d778b258fcf3356e9a4c5ff8ece 100644 (file)
--- a/src/language/stats/regression.q
+++ b/src/language/stats/regression.q
@@ -973,7 +973,8 @@ run_regression (struct casereader *input, struct cmd_regression *cmd,
             {
               lopts.get_indep_mean_std[i] = 1;
             }
-         models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
+         models[k] = pspp_linreg_cache_alloc (dep_var, (const struct variable **) indep_vars,
+                                              X->m->size1, X->m->size2);
           models[k]->depvar = dep_var;
           /*
              For large data sets, use QR decomposition.
diff --git a/src/math/coefficient.c b/src/math/coefficient.c

index 5872b576fabfc82227aac28dae759ec43ded96ac..2feeedbaa8f2aff28ace93a23cd0a72fd0d43436 100644 (file)
--- a/src/math/coefficient.c
+++ b/src/math/coefficient.c
@@ -160,13 +160,21 @@ pspp_coeff_var_to_coeff (const struct variable *v, struct pspp_coeff **coefs,
    size_t i = 0;
    size_t j = 0;
    size_t v_idx;
+  int found = 0;
    struct pspp_coeff *result = NULL;
  
    if (v != NULL)
      {
        v_idx = var_get_dict_index (v);
-      while (i < n_coef && var_get_dict_index (coefs[i]->v_info->v) != v_idx)
+      while (i < n_coef)
         {
+         if (coefs[i]->v_info != NULL)
+           {
+             if (var_get_dict_index (coefs[i]->v_info->v) == v_idx)
+               {
+                 break;
+               }
+           }
           i++;
         }
        result = coefs[i];
diff --git a/src/math/linreg.c b/src/math/linreg.c

index 355261b6ea15d2eb2f86d44503f9bb1f258f5b40..a7fa15657344bd1cf52aad870acffd80c9d661f6 100644 (file)
--- a/src/math/linreg.c
+++ b/src/math/linreg.c
@@ -99,7 +99,7 @@ int
  pspp_linreg_get_vars (const void *c_, struct variable **v)
  {
    const pspp_linreg_cache *c = c_;
-  const struct variable *tmp;
+  struct variable *tmp;
    int i;
    int j;
    int result = 0;
@@ -138,12 +138,15 @@ pspp_linreg_get_vars (const void *c_, struct variable **v)
    independent variables.
   */
  pspp_linreg_cache *
-pspp_linreg_cache_alloc (size_t n, size_t p)
+pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable **indep_vars,
+                        size_t n, size_t p)
  {
+  size_t i;
    pspp_linreg_cache *c;
  
    c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
-  c->depvar = NULL;
+  c->depvar = depvar;
+  c->indep_vars = indep_vars;
    c->indep_means = gsl_vector_alloc (p);
    c->indep_std = gsl_vector_alloc (p);
    c->ssx = gsl_vector_alloc (p);       /* Sums of squares for the
@@ -152,9 +155,22 @@ pspp_linreg_cache_alloc (size_t n, size_t p)
    c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
                                            model parameters.
                                          */
-  c->cov = gsl_matrix_alloc (p + 1, p + 1);    /* Covariance matrix. */
    c->n_obs = n;
    c->n_indeps = p;
+  c->n_coeffs = 0;
+  for (i = 0; i < p; i++)
+    {
+      if (var_is_numeric (indep_vars[i]))
+       {
+         c->n_coeffs++;
+       }
+      else
+       {
+         c->n_coeffs += cat_get_n_categories (indep_vars[i]) - 1;
+       }
+    }
+
+  c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1);
    /*
       Default settings.
     */
@@ -194,13 +210,12 @@ pspp_linreg_cache_free (void *m)
    return true;
  }
  static void
-cache_init (pspp_linreg_cache *cache, const struct design_matrix *dm)
+cache_init (pspp_linreg_cache *cache)
  {
    assert (cache != NULL);
    cache->dft = cache->n_obs - 1;
    cache->dfm = cache->n_indeps;
    cache->dfe = cache->dft - cache->dfm;
-  cache->n_coeffs = dm->m->size2;
    cache->intercept = 0.0;
  }
  
@@ -320,7 +335,8 @@ pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
        cache->depvar_std = s;
        cache->sst = ss;
      }
-  cache_init (cache, dm);
+  cache_init (cache);
+  cache->n_coeffs = dm->m->size2;
    for (i = 0; i < dm->m->size2; i++)
      {
        if (opts->get_indep_mean_std[i])
@@ -647,83 +663,60 @@ void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct var
  static struct design_matrix *
  rearrange_covariance_matrix (const struct design_matrix *cov, pspp_linreg_cache *c)
  {
-  struct variable **v;
    struct variable **model_vars;
-  struct variable *tmp;
    struct design_matrix *result;
-  int n_vars;
-  int found;
-  size_t *columns;
+  size_t *permutation;
    size_t i;
    size_t j;
    size_t k;
-  size_t dep_col;
  
    assert (cov != NULL);
    assert (c != NULL);
    assert (cov->m->size1 > 0);
    assert (cov->m->size2 == cov->m->size1);
-  v = xnmalloc (c->n_coeffs, sizeof (*v));
-  model_vars = xnmalloc (c->n_coeffs, sizeof (*model_vars));
-  columns = xnmalloc (cov->m->size2, sizeof (*columns));
-  n_vars = pspp_linreg_get_vars (c, v);
-  dep_col = 0;
-  k = 0;
-  for (i = 0; i < cov->m->size2; i++)
-    {
-      tmp = design_matrix_col_to_var (cov, i);
-      found = 0;
-      j = 0;
-      while (!found && j < n_vars)
-       {
-         if (tmp == v[j])
-           {
-             found = 1;
-             if (tmp == c->depvar)
-               {
-                 dep_col = j;
-               }
-             else
-               {
-                 columns[k] = j;
-                 k++;
-               }
-           }
-         j++;
-       }
-    }
-  k++;
-  columns[k] = dep_col;
-  /*
-    K should now be equal to C->N_INDEPS + 1. If it is not, then
-    either the code above is wrong or the caller didn't send us the
-    correct values in C.
-   */
-  assert (k == c->n_indeps + 1);
+  permutation = xnmalloc (cov->m->size2, sizeof (*permutation));
+  model_vars = xnmalloc (1 + c->n_indeps, sizeof (*model_vars));
+
    /*
      Put the model variables in the right order in MODEL_VARS.
     */
-  for (i = 0; i < k; i++)
+  for (i = 0; i < c->n_indeps; i++)
      {
-      model_vars[i] = v[columns[i]];
+      model_vars[i] = c->indep_vars[i];
      }
-
-  result = covariance_matrix_create (k, model_vars);
-  for (i = 0; i < result->m->size1; i++)
+  model_vars[i] = c->depvar;
+  result = covariance_matrix_create (1 + c->n_indeps, model_vars);
+  for (j = 0; j < cov->m->size2; j++)
      {
-      for (j = 0; j < result->m->size2; j++)
+      for (k = 0; k < result->m->size2; k++)
         {
-         gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, columns[i], columns[j]));
+         if (design_matrix_col_to_var (cov, j) == design_matrix_col_to_var (result, k)) 
+           {
+             permutation[k] = j;
+             break;
+           }
         }
      }
-  free (columns);
-  free (v);
+  for (j = 0; j < result->m->size2; j++)
+    for (i = 0; i < result->m->size1; i++)
+      {
+       gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, permutation[i], permutation[j]));
+      }
+  free (permutation);
+  free (model_vars);
    return result;
  }
  /*
    Estimate the model parameters from the covariance matrix only. This
    method uses less memory than PSPP_LINREG, which requires the entire
    data set to be stored in memory.
+
+  The function assumes FULL_COV may contain columns corresponding to
+  variables that are not in the model. It fixes this in
+  REARRANG_COVARIANCE_MATRIX. This allows the caller to compute a
+  large covariance matrix once before, then pass it to this without
+  having to alter it. The problem is that this means the caller must
+  set CACHE->N_COEFFS.
  */
  int
  pspp_linreg_with_cov (const struct design_matrix *full_cov, 
@@ -735,7 +728,7 @@ pspp_linreg_with_cov (const struct design_matrix *full_cov,
    assert (cache != NULL);
  
    cov = rearrange_covariance_matrix (full_cov, cache);
-  cache_init (cache, cov);
+  cache_init (cache);
    reg_sweep (cov->m);
    post_sweep_computations (cache, cov, cov->m);  
    covariance_matrix_destroy (cov);
diff --git a/src/math/linreg.h b/src/math/linreg.h

index 18f962c06046b1af232af22b656d231722bcbd65..5a5c4c8eb3e8db543b1db068ae5670dd6f62f1da 100644 (file)
--- a/src/math/linreg.h
+++ b/src/math/linreg.h
@@ -96,9 +96,10 @@ struct pspp_linreg_cache_struct
                                    coefficient here. */
  
    /*
-    Pointer to the dependent variable.
+    Pointers to the variables.
     */
    const struct variable *depvar;
+  const struct variable **indep_vars;
  
    gsl_vector *residuals;
    struct pspp_coeff **coeff;
@@ -175,7 +176,8 @@ typedef struct pspp_linreg_cache_struct pspp_linreg_cache;
    to it. n is the number of cases, p is the number of
    independent variables.
   */
-pspp_linreg_cache *pspp_linreg_cache_alloc (size_t n, size_t p);
+pspp_linreg_cache *pspp_linreg_cache_alloc (const struct variable *, const struct variable **, 
+                                           size_t, size_t);
  
  bool pspp_linreg_cache_free (void *);
author	Jason H Stover <jhs@math.gcsu.edu>
	Mon, 15 Sep 2008 16:35:11 +0000 (12:35 -0400)
committer	Jason H Stover <jhs@math.gcsu.edu>
	Mon, 15 Sep 2008 16:35:11 +0000 (12:35 -0400)
src/language/stats/glm.q		patch \| blob \| history
src/language/stats/regression.q		patch \| blob \| history
src/math/coefficient.c		patch \| blob \| history
src/math/linreg.c		patch \| blob \| history
src/math/linreg.h		patch \| blob \| history