src/math/linreg.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18 #include <gsl/gsl_blas.h>
  19 #include <gsl/gsl_cblas.h>
  20 #include <gsl/gsl_errno.h>
  21 #include <gsl/gsl_fit.h>
  22 #include <gsl/gsl_multifit.h>
  23 #include <linreg/sweep.h>
  24 #include <math/coefficient.h>
  25 #include <math/linreg.h>
  26 #include <math/coefficient.h>
  27 #include <math/covariance-matrix.h>
  28 #include <math/design-matrix.h>
  29 #include <src/data/category.h>
  30 #include <src/data/variable.h>
  31 #include <src/data/value.h>
  32 #include <gl/xalloc.h>
  33
  34 /*
  35   Find the least-squares estimate of b for the linear model:
  36
  37   Y = Xb + Z
  38
  39   where Y is an n-by-1 column vector, X is an n-by-p matrix of
  40   independent variables, b is a p-by-1 vector of regression coefficients,
  41   and Z is an n-by-1 normally-distributed random vector with independent
  42   identically distributed components with mean 0.
  43
  44   This estimate is found via the sweep operator or singular-value
  45   decomposition with gsl.
  46
  47
  48   References:
  49
  50   1. Matrix Computations, third edition. GH Golub and CF Van Loan.
  51   The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
  52
  53   2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
  54   ISBN 0-387-94979-8.
  55
  56   3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
  57   Springer. 1998. ISBN 0-387-98542-5.
  58 */
  59
  60
  61 /*
  62   Get the mean and standard deviation of a vector
  63   of doubles via a form of the Kalman filter as
  64   described on page 32 of [3].
  65  */
  66 static int
  67 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
  68 {
  69   size_t i;
  70   double j = 0.0;
  71   double d;
  72   double tmp;
  73   double mean;
  74   double variance;
  75
  76   mean = gsl_vector_get (&v.vector, 0);
  77   variance = 0;
  78   for (i = 1; i < v.vector.size; i++)
  79     {
  80       j = (double) i + 1.0;
  81       tmp = gsl_vector_get (&v.vector, i);
  82       d = (tmp - mean) / j;
  83       mean += d;
  84       variance += j * (j - 1.0) * d * d;
  85     }
  86   *mp = mean;
  87   *sp = sqrt (variance / (j - 1.0));
  88   *ssp = variance;
  89
  90   return GSL_SUCCESS;
  91 }
  92
  93 /*
  94   Set V to contain an array of pointers to the variables
  95   used in the model. V must be at least C->N_COEFFS in length.
  96   The return value is the number of distinct variables found.
  97  */
  98 int
  99 pspp_linreg_get_vars (const void *c_, struct variable **v)
 100 {
 101   const pspp_linreg_cache *c = c_;
 102   const struct variable *tmp;
 103   int i;
 104   int j;
 105   int result = 0;
 106
 107   /*
 108      Make sure the caller doesn't try to sneak a variable
 109      into V that is not in the model.
 110    */
 111   for (i = 0; i < c->n_coeffs; i++)
 112     {
 113       v[i] = NULL;
 114     }
 115   for (j = 0; j < c->n_coeffs; j++)
 116     {
 117       tmp = pspp_coeff_get_var (c->coeff[j], 0);
 118       assert (tmp != NULL);
 119       /* Repeated variables are likely to bunch together, at the end
 120          of the array. */
 121       i = result - 1;
 122       while (i >= 0 && v[i] != tmp)
 123         {
 124           i--;
 125         }
 126       if (i < 0 && result < c->n_coeffs)
 127         {
 128           v[result] = tmp;
 129           result++;
 130         }
 131     }
 132   return result;
 133 }
 134
 135 /*
 136   Allocate a pspp_linreg_cache and return a pointer
 137   to it. n is the number of cases, p is the number of
 138   independent variables.
 139  */
 140 pspp_linreg_cache *
 141 pspp_linreg_cache_alloc (size_t n, size_t p)
 142 {
 143   pspp_linreg_cache *c;
 144
 145   c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
 146   c->depvar = NULL;
 147   c->indep_means = gsl_vector_alloc (p);
 148   c->indep_std = gsl_vector_alloc (p);
 149   c->ssx = gsl_vector_alloc (p);        /* Sums of squares for the
 150                                            independent variables.
 151                                          */
 152   c->ss_indeps = gsl_vector_alloc (p);  /* Sums of squares for the
 153                                            model parameters.
 154                                          */
 155   c->cov = gsl_matrix_alloc (p + 1, p + 1);     /* Covariance matrix. */
 156   c->n_obs = n;
 157   c->n_indeps = p;
 158   /*
 159      Default settings.
 160    */
 161   c->method = PSPP_LINREG_SWEEP;
 162   c->predict = pspp_linreg_predict;
 163   c->residual = pspp_linreg_residual;   /* The procedure to compute my
 164                                            residuals. */
 165   c->get_vars = pspp_linreg_get_vars;   /* The procedure that returns
 166                                            pointers to model
 167                                            variables. */
 168   c->resid = NULL;              /* The variable storing my residuals. */
 169   c->pred = NULL;               /* The variable storing my predicted values. */
 170
 171   return c;
 172 }
 173
 174 bool
 175 pspp_linreg_cache_free (void *m)
 176 {
 177   int i;
 178
 179   pspp_linreg_cache *c = m;
 180   if (c != NULL)
 181     {
 182       gsl_vector_free (c->indep_means);
 183       gsl_vector_free (c->indep_std);
 184       gsl_vector_free (c->ss_indeps);
 185       gsl_matrix_free (c->cov);
 186       gsl_vector_free (c->ssx);
 187       for (i = 0; i < c->n_coeffs; i++)
 188         {
 189           pspp_coeff_free (c->coeff[i]);
 190         }
 191       free (c->coeff);
 192       free (c);
 193     }
 194   return true;
 195 }
 196 static void
 197 cache_init (pspp_linreg_cache *cache, const struct design_matrix *dm)
 198 {
 199   assert (cache != NULL);
 200   cache->dft = cache->n_obs - 1;
 201   cache->dfm = cache->n_indeps;
 202   cache->dfe = cache->dft - cache->dfm;
 203   cache->n_coeffs = dm->m->size2;
 204   cache->intercept = 0.0;
 205 }
 206
/*
  Extract the results of a fit from SW and store them in CACHE.
  Both callers in this file invoke reg_sweep on the matrix
  immediately before passing it here.

  With p = CACHE->N_INDEPS, this function reads:
    - entry (p, p) of SW as the error sum of squares;
    - entries (i, p), i < p, of SW as the coefficient estimates;
    - the upper triangle of the leading p-by-p block of SW, which,
      multiplied by -MSE, becomes the covariance of the estimates
      (presumably that block holds the negated inverse of X'X after
      the sweep -- confirm against reg_sweep).

  It writes CACHE->SSE, CACHE->MSE, the ESTIMATE of each of the first
  p coefficients, row 0 and the upper triangle of CACHE->COV, and
  CACHE->INTERCEPT. DM is used only to map column numbers to
  variables when looking up the means of the independent variables.

  CACHE->DFE and CACHE->DEPVAR_MEAN must already be set. If the BLAS
  call fails, a message is printed to stderr and the process exits
  with the GSL error code.
 */
static void
post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *dm,
                         gsl_matrix *sw)
{
  gsl_matrix *xm;
  gsl_matrix_view xtx;
  gsl_matrix_view xmxtx;
  double m;
  double tmp;
  size_t i;
  size_t j;
  int rc;

  assert (sw != NULL);
  assert (cache != NULL);

  /* The bottom-right entry holds the error sum of squares. */
  cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
  cache->mse = cache->sse / cache->dfe;
  /*
    Get the intercept: start from the mean of the dependent variable
    and subtract each estimate times the mean of its variable. The
    estimates themselves are read from the last column of SW.
  */
  m = cache->depvar_mean;
  for (i = 0; i < cache->n_indeps; i++)
    {
      tmp = gsl_matrix_get (sw, i, cache->n_indeps);
      cache->coeff[i]->estimate = tmp;
      m -= tmp * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i));
    }
  /*
    Get the covariance matrix of the parameter estimates.
    Only the upper triangle is necessary.
  */

  /*
    The loops below do not compute the entries related
    to the estimated intercept. Row and column 0 of CACHE->COV are
    reserved for the intercept, hence the offset of 1.
  */
  for (i = 0; i < cache->n_indeps; i++)
    for (j = i; j < cache->n_indeps; j++)
      {
        tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
        gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
      }
  /*
    Get the covariances related to the intercept.

    XTX views the leading block of SW and XMXTX views row 0, columns
    1..p of CACHE->COV, so the BLAS call below writes straight into
    the covariance matrix. XM is a 1-by-p row holding the means of
    the independent variables.
  */
  xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
  xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
  xm = gsl_matrix_calloc (1, cache->n_indeps);
  for (i = 0; i < xm->size2; i++)
    {
      gsl_matrix_set (xm, 0, i,
                      pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i)));
    }
  /* With CblasRight this computes XMXTX = MSE * XM * XTX, treating
     XTX as symmetric and reading only its upper triangle. */
  rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
                       &xtx.matrix, xm, 0.0, &xmxtx.matrix);
  gsl_matrix_free (xm);
  if (rc == GSL_SUCCESS)
    {
      /* Variance of the intercept: MSE / n minus the sum over the
         independent variables of cov (intercept, b_i) * mean_i. */
      tmp = cache->mse / cache->n_obs;
      for (i = 1; i < 1 + cache->n_indeps; i++)
        {
          tmp -= gsl_matrix_get (cache->cov, 0, i)
            * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i - 1));
        }
      gsl_matrix_set (cache->cov, 0, 0, tmp);

      /* The intercept is stored only when the BLAS call succeeded. */
      cache->intercept = m;
    }
  else
    {
      fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
               __FILE__, __LINE__, gsl_strerror (rc));
      exit (rc);
    }
}
 283
 284 /*
 285   Fit the linear model via least squares. All pointers passed to pspp_linreg
 286   are assumed to be allocated to the correct size and initialized to the
 287   values as indicated by opts.
 288  */
 289 int
 290 pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
 291              const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
 292 {
 293   int rc;
 294   gsl_matrix *design = NULL;
 295   gsl_matrix_view xtx;
 296   gsl_vector_view xty;
 297   gsl_vector_view xi;
 298   gsl_vector_view xj;
 299   gsl_vector *param_estimates;
 300   struct pspp_coeff *coef;
 301   const struct variable *v;
 302   const union value *val;
 303
 304   size_t i;
 305   size_t j;
 306   double tmp;
 307   double m;
 308   double s;
 309   double ss;
 310
 311   if (cache == NULL)
 312     {
 313       return GSL_EFAULT;
 314     }
 315   if (opts->get_depvar_mean_std)
 316     {
 317       linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
 318                        &m, &s, &ss);
 319       cache->depvar_mean = m;
 320       cache->depvar_std = s;
 321       cache->sst = ss;
 322     }
 323   cache_init (cache, dm);
 324   for (i = 0; i < dm->m->size2; i++)
 325     {
 326       if (opts->get_indep_mean_std[i])
 327         {
 328           linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss);
 329           v = design_matrix_col_to_var (dm, i);
 330           val = NULL;
 331           if (var_is_alpha (v))
 332             {
 333               j = i - design_matrix_var_to_column (dm, v);
 334               val = cat_subscript_to_value (j, v);
 335             }
 336           coef = pspp_linreg_get_coeff (cache, v, val);
 337           pspp_coeff_set_mean (coef, m);
 338           pspp_coeff_set_sd (coef, s);
 339           gsl_vector_set (cache->ssx, i, ss);
 340
 341         }
 342     }
 343
 344   if (cache->method == PSPP_LINREG_SWEEP)
 345     {
 346       gsl_matrix *sw;
 347       /*
 348          Subtract the means to improve the condition of the design
 349          matrix. This requires copying dm->m and Y. We do not divide by the
 350          standard deviations of the independent variables here since doing
 351          so would cause a miscalculation of the residual sums of
 352          squares. Dividing by the standard deviation is done GSL's linear
 353          regression functions, so if the design matrix has a poor
 354          condition, use QR decomposition.
 355
 356          The design matrix here does not include a column for the intercept
 357          (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
 358          so design is allocated here when sweeping, or below if using QR.
 359        */
 360       design = gsl_matrix_alloc (dm->m->size1, dm->m->size2);
 361       for (i = 0; i < dm->m->size2; i++)
 362         {
 363           v = design_matrix_col_to_var (dm, i);
 364           m = pspp_linreg_get_indep_variable_mean (cache, v);
 365           for (j = 0; j < dm->m->size1; j++)
 366             {
 367               tmp = (gsl_matrix_get (dm->m, j, i) - m);
 368               gsl_matrix_set (design, j, i, tmp);
 369             }
 370         }
 371       sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
 372       xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
 373
 374       for (i = 0; i < xtx.matrix.size1; i++)
 375         {
 376           tmp = gsl_vector_get (cache->ssx, i);
 377           gsl_matrix_set (&(xtx.matrix), i, i, tmp);
 378           xi = gsl_matrix_column (design, i);
 379           for (j = (i + 1); j < xtx.matrix.size2; j++)
 380             {
 381               xj = gsl_matrix_column (design, j);
 382               gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
 383               gsl_matrix_set (&(xtx.matrix), i, j, tmp);
 384             }
 385         }
 386
 387       gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
 388       xty = gsl_matrix_column (sw, cache->n_indeps);
 389       /*
 390          This loop starts at 1, with i=0 outside the loop, so we can get
 391          the model sum of squares due to the first independent variable.
 392        */
 393       xi = gsl_matrix_column (design, 0);
 394       gsl_blas_ddot (&(xi.vector), Y, &tmp);
 395       gsl_vector_set (&(xty.vector), 0, tmp);
 396       tmp *= tmp / gsl_vector_get (cache->ssx, 0);
 397       gsl_vector_set (cache->ss_indeps, 0, tmp);
 398       for (i = 1; i < cache->n_indeps; i++)
 399         {
 400           xi = gsl_matrix_column (design, i);
 401           gsl_blas_ddot (&(xi.vector), Y, &tmp);
 402           gsl_vector_set (&(xty.vector), i, tmp);
 403         }
 404
 405       /*
 406          Sweep on the matrix sw, which contains XtX, XtY and YtY.
 407        */
 408       reg_sweep (sw);
 409       post_sweep_computations (cache, dm, sw);
 410       gsl_matrix_free (sw);
 411     }
 412   else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
 413     {
 414       /*
 415         Use the SVD of X^T X to find a conditional inverse of X^TX. If
 416         the SVD is X^T X = U D V^T, then set the conditional inverse
 417         to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
 418         (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
 419         sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
 420         equations by setting the estimated parameter vector to
 421         (X^TX)^c X^T Y.
 422        */
 423     }
 424   else
 425     {
 426       gsl_multifit_linear_workspace *wk;
 427       /*
 428          Use QR decomposition via GSL.
 429        */
 430
 431       param_estimates = gsl_vector_alloc (1 + dm->m->size2);
 432       design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2);
 433
 434       for (j = 0; j < dm->m->size1; j++)
 435         {
 436           gsl_matrix_set (design, j, 0, 1.0);
 437           for (i = 0; i < dm->m->size2; i++)
 438             {
 439               tmp = gsl_matrix_get (dm->m, j, i);
 440               gsl_matrix_set (design, j, i + 1, tmp);
 441             }
 442         }
 443
 444       wk = gsl_multifit_linear_alloc (design->size1, design->size2);
 445       rc = gsl_multifit_linear (design, Y, param_estimates,
 446                                 cache->cov, &(cache->sse), wk);
 447       for (i = 0; i < cache->n_coeffs; i++)
 448         {
 449           cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1);
 450         }
 451       cache->intercept = gsl_vector_get (param_estimates, 0);
 452       if (rc == GSL_SUCCESS)
 453         {
 454           gsl_multifit_linear_free (wk);
 455           gsl_vector_free (param_estimates);
 456         }
 457       else
 458         {
 459           fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
 460                    __FILE__, __LINE__, rc);
 461         }
 462     }
 463
 464
 465   cache->ssm = cache->sst - cache->sse;
 466   /*
 467      Get the remaining sums of squares for the independent
 468      variables.
 469    */
 470   m = 0;
 471   for (i = 1; i < cache->n_indeps; i++)
 472     {
 473       j = i - 1;
 474       m += gsl_vector_get (cache->ss_indeps, j);
 475       tmp = cache->ssm - m;
 476       gsl_vector_set (cache->ss_indeps, i, tmp);
 477     }
 478
 479   gsl_matrix_free (design);
 480   return GSL_SUCCESS;
 481 }
 482
 483 /*
 484   Is the coefficient COEF contained in the list of coefficients
 485   COEF_LIST?
 486  */
 487 static int
 488 has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef,
 489                  size_t n)
 490 {
 491   size_t i = 0;
 492
 493   while (i < n)
 494     {
 495       if (coef_list[i] == coef)
 496         {
 497           return 1;
 498         }
 499       i++;
 500     }
 501   return 0;
 502 }
 503 /*
 504   Predict the value of the dependent variable with the
 505   new set of predictors. PREDICTORS must point to a list
 506   of variables, each of whose values are stored in VALS,
 507   in the same order.
 508  */
 509 double
 510 pspp_linreg_predict (const struct variable **predictors,
 511                      const union value **vals, const void *c_, int n_vals)
 512 {
 513   const pspp_linreg_cache *c = c_;
 514   int j;
 515   size_t next_coef = 0;
 516   const struct pspp_coeff **coef_list;
 517   const struct pspp_coeff *coe;
 518   double result;
 519   double tmp;
 520
 521   if (predictors == NULL || vals == NULL || c == NULL)
 522     {
 523       return GSL_NAN;
 524     }
 525   if (c->coeff == NULL)
 526     {
 527       /* The stupid model: just guess the mean. */
 528       return c->depvar_mean;
 529     }
 530   coef_list = xnmalloc (c->n_coeffs, sizeof (*coef_list));
 531   result = c->intercept;
 532
 533   /*
 534      The loops guard against the possibility that the caller passed us
 535      inadequate information, such as too few or too many values, or
 536      a redundant list of variable names.
 537    */
 538   for (j = 0; j < n_vals; j++)
 539     {
 540       coe = pspp_linreg_get_coeff (c, predictors[j], vals[j]);
 541       if (!has_coefficient (coef_list, coe, next_coef))
 542         {
 543           tmp = pspp_coeff_get_est (coe);
 544           if (var_is_numeric (predictors[j]))
 545             {
 546               tmp *= vals[j]->f;
 547             }
 548           result += tmp;
 549           coef_list[next_coef++] = coe;
 550         }
 551     }
 552   free (coef_list);
 553
 554   return result;
 555 }
 556
 557 double
 558 pspp_linreg_residual (const struct variable **predictors,
 559                       const union value **vals,
 560                       const union value *obs, const void *c, int n_vals)
 561 {
 562   double pred;
 563   double result;
 564
 565   if (predictors == NULL || vals == NULL || c == NULL || obs == NULL)
 566     {
 567       return GSL_NAN;
 568     }
 569   pred = pspp_linreg_predict (predictors, vals, c, n_vals);
 570
 571   result = isnan (pred) ? GSL_NAN : (obs->f - pred);
 572   return result;
 573 }
 574
 575 /*
 576   Which coefficient is associated with V? The VAL argument is relevant
 577   only to categorical variables.
 578  */
 579 struct pspp_coeff *
 580 pspp_linreg_get_coeff (const pspp_linreg_cache * c,
 581                        const struct variable *v, const union value *val)
 582 {
 583   if (c == NULL)
 584     {
 585       return NULL;
 586     }
 587   if (c->coeff == NULL || c->n_indeps == 0 || v == NULL)
 588     {
 589       return NULL;
 590     }
 591   return pspp_coeff_var_to_coeff (v, c->coeff, c->n_coeffs, val);
 592 }
 593 /*
 594   Return the standard deviation of the independent variable.
 595  */
 596 double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v)
 597 {
 598   if (var_is_numeric (v))
 599     {
 600       const struct pspp_coeff *coef;
 601       coef = pspp_linreg_get_coeff (c, v, NULL);
 602       return pspp_coeff_get_sd (coef);
 603     }
 604   return GSL_NAN;
 605 }
 606
 607 void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v,
 608                                         double s)
 609 {
 610   if (var_is_numeric (v))
 611     {
 612       struct pspp_coeff *coef;
 613       coef = pspp_linreg_get_coeff (c, v, NULL);
 614       pspp_coeff_set_sd (coef, s);
 615     }
 616 }
 617
 618 /*
 619   Mean of the independent variable.
 620  */
 621 double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v)
 622 {
 623   if (var_is_numeric (v))
 624     {
 625       struct pspp_coeff *coef;
 626       coef = pspp_linreg_get_coeff (c, v, NULL);
 627       return pspp_coeff_get_mean (coef);
 628     }
 629   return GSL_NAN;
 630 }
 631
 632 void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v,
 633                                           double m)
 634 {
 635   if (var_is_numeric (v))
 636     {
 637       struct pspp_coeff *coef;
 638       coef = pspp_linreg_get_coeff (c, v, NULL);
 639       pspp_coeff_set_mean (coef, m);
 640     }
 641 }
 642
 643 /*
 644   Make sure the dependent variable is at the last column, and that
 645   only variables in the model are in the covariance matrix.
 646  */
 647 static struct design_matrix *
 648 rearrange_covariance_matrix (const struct design_matrix *cov, pspp_linreg_cache *c)
 649 {
 650   struct variable **v;
 651   struct variable **model_vars;
 652   struct variable *tmp;
 653   struct design_matrix *result;
 654   int n_vars;
 655   int found;
 656   size_t *columns;
 657   size_t i;
 658   size_t j;
 659   size_t k;
 660   size_t dep_col;
 661
 662   assert (cov != NULL);
 663   assert (c != NULL);
 664   assert (cov->m->size1 > 0);
 665   assert (cov->m->size2 == cov->m->size1);
 666   v = xnmalloc (c->n_coeffs, sizeof (*v));
 667   model_vars = xnmalloc (c->n_coeffs, sizeof (*model_vars));
 668   columns = xnmalloc (cov->m->size2, sizeof (*columns));
 669   n_vars = pspp_linreg_get_vars (c, v);
 670   dep_col = 0;
 671   k = 0;
 672   for (i = 0; i < cov->m->size2; i++)
 673     {
 674       tmp = design_matrix_col_to_var (cov, i);
 675       found = 0;
 676       j = 0;
 677       while (!found && j < n_vars)
 678         {
 679           if (tmp == v[j])
 680             {
 681               found = 1;
 682               if (tmp == c->depvar)
 683                 {
 684                   dep_col = j;
 685                 }
 686               else
 687                 {
 688                   columns[k] = j;
 689                   k++;
 690                 }
 691             }
 692           j++;
 693         }
 694     }
 695   k++;
 696   columns[k] = dep_col;
 697   /*
 698     K should now be equal to C->N_INDEPS + 1. If it is not, then
 699     either the code above is wrong or the caller didn't send us the
 700     correct values in C.
 701    */
 702   assert (k == c->n_indeps + 1);
 703   /*
 704     Put the model variables in the right order in MODEL_VARS.
 705    */
 706   for (i = 0; i < k; i++)
 707     {
 708       model_vars[i] = v[columns[i]];
 709     }
 710
 711   result = covariance_matrix_create (k, model_vars);
 712   for (i = 0; i < result->m->size1; i++)
 713     {
 714       for (j = 0; j < result->m->size2; j++)
 715         {
 716           gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, columns[i], columns[j]));
 717         }
 718     }
 719   free (columns);
 720   free (v);
 721   return result;
 722 }
 723 /*
 724   Estimate the model parameters from the covariance matrix only. This
 725   method uses less memory than PSPP_LINREG, which requires the entire
 726   data set to be stored in memory.
 727 */
 728 int
 729 pspp_linreg_with_cov (const struct design_matrix *full_cov,
 730                       pspp_linreg_cache * cache)
 731 {
 732   struct design_matrix *cov;
 733
 734   assert (full_cov != NULL);
 735   assert (cache != NULL);
 736
 737   cov = rearrange_covariance_matrix (full_cov, cache);
 738   cache_init (cache, cov);
 739   reg_sweep (cov->m);
 740   post_sweep_computations (cache, cov, cov->m);
 741   covariance_matrix_destroy (cov);
 742 }
 743