pintos-os.org Git - pspp/blob - src/language/stats/regression.q

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2005, 2009 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include <gsl/gsl_cdf.h>
  20 #include <gsl/gsl_matrix.h>
  21 #include <gsl/gsl_vector.h>
  22 #include <math.h>
  23 #include <stdlib.h>
  24 #include <data/case.h>
  25 #include <data/casegrouper.h>
  26 #include <data/casereader.h>
  27 #include <data/category.h>
  28 #include <data/dictionary.h>
  29 #include <data/missing-values.h>
  30 #include <data/procedure.h>
  31 #include <data/transformations.h>
  32 #include <data/value-labels.h>
  33 #include <data/variable.h>
  34 #include <language/command.h>
  35 #include <language/dictionary/split-file.h>
  36 #include <language/data-io/file-handle.h>
  37 #include <language/lexer/lexer.h>
  38 #include <libpspp/compiler.h>
  39 #include <libpspp/message.h>
  40 #include <libpspp/taint.h>
  41 #include <math/covariance.h>
  42 #include <math/linreg.h>
  43 #include <math/moments.h>
  44 #include <output/table.h>
  45
  46 #include "xalloc.h"
  47
  48 #include "gettext.h"
  49 #define _(msgid) gettext (msgid)
  50
  51 #define REG_LARGE_DATA 1000
  52
  53 /* (headers) */
  54
  55 /* (specification)
  56    "REGRESSION" (regression_):
  57    *variables=custom;
  58    +statistics[st_]=r,
  59                     coeff,
  60                     anova,
  61                     outs,
  62                     zpp,
  63                     label,
  64                     sha,
  65                     ci,
  66                     bcov,
  67                     ses,
  68                     xtx,
  69                     collin,
  70                     tol,
  71                     selection,
  72                     f,
  73                     defaults,
  74                     all;
  75    ^dependent=varlist;
  76    +save[sv_]=resid,pred;
  77    +method=enter.
  78 */
  79 /* (declarations) */
  80 /* (functions) */
  81 static struct cmd_regression cmd;
  82
  83 /*
  84   Moments for each of the variables used.
  85  */
  86 struct moments_var
  87 {
  88   struct moments1 *m;
  89   const struct variable *v;
  90 };
  91
  92 /*
  93   Transformations for saving predicted values
  94   and residuals, etc.
  95  */
  96 struct reg_trns
  97 {
  98   int n_trns;                   /* Number of transformations. */
  99   int trns_id;                  /* Which trns is this one? */
 100   linreg *c;            /* Linear model for this trns. */
 101 };
 102 /*
 103   Variables used (both explanatory and response).
 104  */
 105 static const struct variable **v_variables;
 106
 107 /*
 108   Number of variables.
 109  */
 110 static size_t n_variables;
 111
 112 static bool run_regression (struct casereader *, struct cmd_regression *,
 113                             struct dataset *, linreg **);
 114
 115 /*
 116    STATISTICS subcommand output functions.
 117  */
 118 static void reg_stats_r (linreg *);
 119 static void reg_stats_coeff (linreg *);
 120 static void reg_stats_anova (linreg *);
 121 static void reg_stats_outs (linreg *);
 122 static void reg_stats_zpp (linreg *);
 123 static void reg_stats_label (linreg *);
 124 static void reg_stats_sha (linreg *);
 125 static void reg_stats_ci (linreg *);
 126 static void reg_stats_f (linreg *);
 127 static void reg_stats_bcov (linreg *);
 128 static void reg_stats_ses (linreg *);
 129 static void reg_stats_xtx (linreg *);
 130 static void reg_stats_collin (linreg *);
 131 static void reg_stats_tol (linreg *);
 132 static void reg_stats_selection (linreg *);
 133 static void statistics_keyword_output (void (*)(linreg *),
 134                                        int, linreg *);
 135
 136 static void
 137 reg_stats_r (linreg * c)
 138 {
 139   struct tab_table *t;
 140   int n_rows = 2;
 141   int n_cols = 5;
 142   double rsq;
 143   double adjrsq;
 144   double std_error;
 145
 146   assert (c != NULL);
 147   rsq = c->ssm / c->sst;
 148   adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
 149   std_error = sqrt (linreg_mse (c));
 150   t = tab_create (n_cols, n_rows, 0);
 151   tab_dim (t, tab_natural_dimensions, NULL);
 152   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 153   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 154   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 155   tab_vline (t, TAL_0, 1, 0, 0);
 156
 157   tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
 158   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
 159   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
 160   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
 161   tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL);
 162   tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL);
 163   tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL);
 164   tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL);
 165   tab_title (t, _("Model Summary"));
 166   tab_submit (t);
 167 }
 168
 169 /*
 170   Table showing estimated regression coefficients.
 171  */
 172 static void
 173 reg_stats_coeff (linreg * c)
 174 {
 175   size_t j;
 176   int n_cols = 7;
 177   int n_rows;
 178   int this_row;
 179   double t_stat;
 180   double pval;
 181   double std_err;
 182   double beta;
 183   const char *label;
 184
 185   const struct variable *v;
 186   struct tab_table *t;
 187
 188   assert (c != NULL);
 189   n_rows = c->n_coeffs + 3;
 190
 191   t = tab_create (n_cols, n_rows, 0);
 192   tab_headers (t, 2, 0, 1, 0);
 193   tab_dim (t, tab_natural_dimensions, NULL);
 194   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 195   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 196   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 197   tab_vline (t, TAL_0, 1, 0, 0);
 198
 199   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
 200   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
 201   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
 202   tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
 203   tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
 204   tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
 205   tab_double (t, 2, 1, 0, linreg_intercept (c), NULL);
 206   std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0));
 207   tab_double (t, 3, 1, 0, std_err, NULL);
 208   tab_double (t, 4, 1, 0, 0.0, NULL);
 209   t_stat = linreg_intercept (c) / std_err;
 210   tab_double (t, 5, 1, 0, t_stat, NULL);
 211   pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
 212   tab_double (t, 6, 1, 0, pval, NULL);
 213   for (j = 0; j < linreg_n_coeffs (c); j++)
 214     {
 215       struct string tstr;
 216       ds_init_empty (&tstr);
 217       this_row = j + 2;
 218
 219       v = linreg_indep_var (c, j);
 220       label = var_to_string (v);
 221       /* Do not overwrite the variable's name. */
 222       ds_put_cstr (&tstr, label);
 223       tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr));
 224       /*
 225          Regression coefficients.
 226        */
 227       tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL);
 228       /*
 229          Standard error of the coefficients.
 230        */
 231       std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1));
 232       tab_double (t, 3, this_row, 0, std_err, NULL);
 233       /*
 234          Standardized coefficient, i.e., regression coefficient
 235          if all variables had unit variance.
 236        */
 237       beta = sqrt (gsl_matrix_get (linreg_cov (c), j, j));
 238       beta *= linreg_coeff (c, j) / c->depvar_std;
 239       tab_double (t, 4, this_row, 0, beta, NULL);
 240
 241       /*
 242          Test statistic for H0: coefficient is 0.
 243        */
 244       t_stat = linreg_coeff (c, j) / std_err;
 245       tab_double (t, 5, this_row, 0, t_stat, NULL);
 246       /*
 247          P values for the test statistic above.
 248        */
 249       pval =
 250         2 * gsl_cdf_tdist_Q (fabs (t_stat),
 251                              (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
 252       tab_double (t, 6, this_row, 0, pval, NULL);
 253       ds_destroy (&tstr);
 254     }
 255   tab_title (t, _("Coefficients"));
 256   tab_submit (t);
 257 }
 258
 259 /*
 260   Display the ANOVA table.
 261  */
 262 static void
 263 reg_stats_anova (linreg * c)
 264 {
 265   int n_cols = 7;
 266   int n_rows = 4;
 267   const double msm = linreg_ssreg (c) / linreg_dfmodel (c);
 268   const double mse = linreg_mse (c);
 269   const double F = msm / mse;
 270   const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
 271
 272   struct tab_table *t;
 273
 274   assert (c != NULL);
 275   t = tab_create (n_cols, n_rows, 0);
 276   tab_headers (t, 2, 0, 1, 0);
 277   tab_dim (t, tab_natural_dimensions, NULL);
 278
 279   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 280
 281   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 282   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 283   tab_vline (t, TAL_0, 1, 0, 0);
 284
 285   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
 286   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
 287   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
 288   tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
 289   tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
 290
 291   tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
 292   tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
 293   tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
 294
 295   /* Sums of Squares */
 296   tab_double (t, 2, 1, 0, c->ssm, NULL);
 297   tab_double (t, 2, 3, 0, c->sst, NULL);
 298   tab_double (t, 2, 2, 0, c->sse, NULL);
 299
 300
 301   /* Degrees of freedom */
 302   tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm);
 303   tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe);
 304   tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft);
 305
 306   /* Mean Squares */
 307   tab_double (t, 4, 1, TAB_RIGHT, msm, NULL);
 308   tab_double (t, 4, 2, TAB_RIGHT, mse, NULL);
 309
 310   tab_double (t, 5, 1, 0, F, NULL);
 311
 312   tab_double (t, 6, 1, 0, pval, NULL);
 313
 314   tab_title (t, _("ANOVA"));
 315   tab_submit (t);
 316 }
 317
 318 static void
 319 reg_stats_outs (linreg * c)
 320 {
 321   assert (c != NULL);
 322 }
 323
 324 static void
 325 reg_stats_zpp (linreg * c)
 326 {
 327   assert (c != NULL);
 328 }
 329
 330 static void
 331 reg_stats_label (linreg * c)
 332 {
 333   assert (c != NULL);
 334 }
 335
 336 static void
 337 reg_stats_sha (linreg * c)
 338 {
 339   assert (c != NULL);
 340 }
 341 static void
 342 reg_stats_ci (linreg * c)
 343 {
 344   assert (c != NULL);
 345 }
 346 static void
 347 reg_stats_f (linreg * c)
 348 {
 349   assert (c != NULL);
 350 }
 351 static void
 352 reg_stats_bcov (linreg * c)
 353 {
 354   int n_cols;
 355   int n_rows;
 356   int i;
 357   int k;
 358   int row;
 359   int col;
 360   const char *label;
 361   struct tab_table *t;
 362
 363   assert (c != NULL);
 364   n_cols = c->n_indeps + 1 + 2;
 365   n_rows = 2 * (c->n_indeps + 1);
 366   t = tab_create (n_cols, n_rows, 0);
 367   tab_headers (t, 2, 0, 1, 0);
 368   tab_dim (t, tab_natural_dimensions, NULL);
 369   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 370   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 371   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 372   tab_vline (t, TAL_0, 1, 0, 0);
 373   tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
 374   tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
 375   for (i = 0; i < linreg_n_coeffs (c); i++)
 376     {
 377       const struct variable *v = linreg_indep_var (c, i);
 378       label = var_to_string (v);
 379       tab_text (t, 2, i, TAB_CENTER, label);
 380       tab_text (t, i + 2, 0, TAB_CENTER, label);
 381       for (k = 1; k < linreg_n_coeffs (c); k++)
 382         {
 383           col = (i <= k) ? k : i;
 384           row = (i <= k) ? i : k;
 385           tab_double (t, k + 2, i, TAB_CENTER,
 386                      gsl_matrix_get (c->cov, row, col), NULL);
 387         }
 388     }
 389   tab_title (t, _("Coefficient Correlations"));
 390   tab_submit (t);
 391 }
 392 static void
 393 reg_stats_ses (linreg * c)
 394 {
 395   assert (c != NULL);
 396 }
 397 static void
 398 reg_stats_xtx (linreg * c)
 399 {
 400   assert (c != NULL);
 401 }
 402 static void
 403 reg_stats_collin (linreg * c)
 404 {
 405   assert (c != NULL);
 406 }
 407 static void
 408 reg_stats_tol (linreg * c)
 409 {
 410   assert (c != NULL);
 411 }
 412 static void
 413 reg_stats_selection (linreg * c)
 414 {
 415   assert (c != NULL);
 416 }
 417
 418 static void
 419 statistics_keyword_output (void (*function) (linreg *),
 420                            int keyword, linreg * c)
 421 {
 422   if (keyword)
 423     {
 424       (*function) (c);
 425     }
 426 }
 427
 428 static void
 429 subcommand_statistics (int *keywords, linreg * c)
 430 {
 431   /*
 432      The order here must match the order in which the STATISTICS
 433      keywords appear in the specification section above.
 434    */
 435   enum
 436   { r,
 437     coeff,
 438     anova,
 439     outs,
 440     zpp,
 441     label,
 442     sha,
 443     ci,
 444     bcov,
 445     ses,
 446     xtx,
 447     collin,
 448     tol,
 449     selection,
 450     f,
 451     defaults,
 452     all
 453   };
 454   int i;
 455   int d = 1;
 456
 457   if (keywords[all])
 458     {
 459       /*
 460          Set everything but F.
 461        */
 462       for (i = 0; i < f; i++)
 463         {
 464           keywords[i] = 1;
 465         }
 466     }
 467   else
 468     {
 469       for (i = 0; i < all; i++)
 470         {
 471           if (keywords[i])
 472             {
 473               d = 0;
 474             }
 475         }
 476       /*
 477          Default output: ANOVA table, parameter estimates,
 478          and statistics for variables not entered into model,
 479          if appropriate.
 480        */
 481       if (keywords[defaults] | d)
 482         {
 483           keywords[anova] = 1;
 484           keywords[outs] = 1;
 485           keywords[coeff] = 1;
 486           keywords[r] = 1;
 487         }
 488     }
 489   statistics_keyword_output (reg_stats_r, keywords[r], c);
 490   statistics_keyword_output (reg_stats_anova, keywords[anova], c);
 491   statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
 492   statistics_keyword_output (reg_stats_outs, keywords[outs], c);
 493   statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
 494   statistics_keyword_output (reg_stats_label, keywords[label], c);
 495   statistics_keyword_output (reg_stats_sha, keywords[sha], c);
 496   statistics_keyword_output (reg_stats_ci, keywords[ci], c);
 497   statistics_keyword_output (reg_stats_f, keywords[f], c);
 498   statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
 499   statistics_keyword_output (reg_stats_ses, keywords[ses], c);
 500   statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
 501   statistics_keyword_output (reg_stats_collin, keywords[collin], c);
 502   statistics_keyword_output (reg_stats_tol, keywords[tol], c);
 503   statistics_keyword_output (reg_stats_selection, keywords[selection], c);
 504 }
 505
 506 /*
 507   Free the transformation. Free its linear model if this
 508   transformation is the last one.
 509  */
 510 static bool
 511 regression_trns_free (void *t_)
 512 {
 513   bool result = true;
 514   struct reg_trns *t = t_;
 515
 516   if (t->trns_id == t->n_trns)
 517     {
 518       result = linreg_free (t->c);
 519     }
 520   free (t);
 521
 522   return result;
 523 }
 524
 525 /*
 526   Gets the predicted values.
 527  */
 528 static int
 529 regression_trns_pred_proc (void *t_, struct ccase **c,
 530                            casenumber case_idx UNUSED)
 531 {
 532   size_t i;
 533   size_t n_vals;
 534   struct reg_trns *trns = t_;
 535   linreg *model;
 536   union value *output = NULL;
 537   const union value *tmp;
 538   double *vals;
 539   const struct variable **vars = NULL;
 540
 541   assert (trns != NULL);
 542   model = trns->c;
 543   assert (model != NULL);
 544   assert (model->depvar != NULL);
 545   assert (model->pred != NULL);
 546
 547   vars = linreg_get_vars (model);
 548   n_vals = linreg_n_coeffs (model);
 549   vals = xnmalloc (n_vals, sizeof (*vals));
 550   *c = case_unshare (*c);
 551
 552   output = case_data_rw (*c, model->pred);
 553
 554   for (i = 0; i < n_vals; i++)
 555     {
 556       tmp = case_data (*c, vars[i]);
 557       vals[i] = tmp->f;
 558     }
 559   output->f = linreg_predict (model, vals, n_vals);
 560   free (vals);
 561   return TRNS_CONTINUE;
 562 }
 563
 564 /*
 565   Gets the residuals.
 566  */
 567 static int
 568 regression_trns_resid_proc (void *t_, struct ccase **c,
 569                             casenumber case_idx UNUSED)
 570 {
 571   size_t i;
 572   size_t n_vals;
 573   struct reg_trns *trns = t_;
 574   linreg *model;
 575   union value *output = NULL;
 576   const union value *tmp;
 577   double *vals = NULL;
 578   double obs;
 579   const struct variable **vars = NULL;
 580
 581   assert (trns != NULL);
 582   model = trns->c;
 583   assert (model != NULL);
 584   assert (model->depvar != NULL);
 585   assert (model->resid != NULL);
 586
 587   vars = linreg_get_vars (model);
 588   n_vals = linreg_n_coeffs (model);
 589
 590   vals = xnmalloc (n_vals, sizeof (*vals));
 591   *c = case_unshare (*c);
 592   output = case_data_rw (*c, model->resid);
 593   assert (output != NULL);
 594
 595   for (i = 0; i < n_vals; i++)
 596     {
 597       tmp = case_data (*c, vars[i]);
 598       vals[i] = tmp->f;
 599     }
 600   tmp = case_data (*c, model->depvar);
 601   obs = tmp->f;
 602   output->f = linreg_residual (model, obs, vals, n_vals);
 603   free (vals);
 604
 605   return TRNS_CONTINUE;
 606 }
 607
 608 /*
 609    Returns false if NAME is a duplicate of any existing variable name.
 610 */
 611 static bool
 612 try_name (const struct dictionary *dict, const char *name)
 613 {
 614   if (dict_lookup_var (dict, name) != NULL)
 615     return false;
 616
 617   return true;
 618 }
 619
 620 static void
 621 reg_get_name (const struct dictionary *dict, char name[VAR_NAME_LEN],
 622               const char prefix[VAR_NAME_LEN])
 623 {
 624   int i = 1;
 625
 626   snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
 627   while (!try_name (dict, name))
 628     {
 629       i++;
 630       snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
 631     }
 632 }
 633
 634 static void
 635 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
 636               linreg * c, struct variable **v, int n_trns)
 637 {
 638   struct dictionary *dict = dataset_dict (ds);
 639   static int trns_index = 1;
 640   char name[VAR_NAME_LEN];
 641   struct variable *new_var;
 642   struct reg_trns *t = NULL;
 643
 644   t = xmalloc (sizeof (*t));
 645   t->trns_id = trns_index;
 646   t->n_trns = n_trns;
 647   t->c = c;
 648   reg_get_name (dict, name, prefix);
 649   new_var = dict_create_var (dict, name, 0);
 650   assert (new_var != NULL);
 651   *v = new_var;
 652   add_transformation (ds, f, regression_trns_free, t);
 653   trns_index++;
 654 }
 655 static void
 656 subcommand_save (struct dataset *ds, int save, linreg ** models)
 657 {
 658   linreg **lc;
 659   int n_trns = 0;
 660   int i;
 661
 662   assert (models != NULL);
 663
 664   if (save)
 665     {
 666       /* Count the number of transformations we will need. */
 667       for (i = 0; i < REGRESSION_SV_count; i++)
 668         {
 669           if (cmd.a_save[i])
 670             {
 671               n_trns++;
 672             }
 673         }
 674       n_trns *= cmd.n_dependent;
 675
 676       for (lc = models; lc < models + cmd.n_dependent; lc++)
 677         {
 678           if (*lc != NULL)
 679             {
 680               if ((*lc)->depvar != NULL)
 681                 {
 682                   if (cmd.a_save[REGRESSION_SV_RESID])
 683                     {
 684                       reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
 685                                     &(*lc)->resid, n_trns);
 686                     }
 687                   if (cmd.a_save[REGRESSION_SV_PRED])
 688                     {
 689                       reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
 690                                     &(*lc)->pred, n_trns);
 691                     }
 692                 }
 693             }
 694         }
 695     }
 696   else
 697     {
 698       for (lc = models; lc < models + cmd.n_dependent; lc++)
 699         {
 700           if (*lc != NULL)
 701             {
 702               linreg_free (*lc);
 703             }
 704         }
 705     }
 706 }
 707
 708 int
 709 cmd_regression (struct lexer *lexer, struct dataset *ds)
 710 {
 711   struct casegrouper *grouper;
 712   struct casereader *group;
 713   linreg **models;
 714   bool ok;
 715   size_t i;
 716
 717   if (!parse_regression (lexer, ds, &cmd, NULL))
 718     {
 719       return CMD_FAILURE;
 720     }
 721
 722   models = xnmalloc (cmd.n_dependent, sizeof *models);
 723   for (i = 0; i < cmd.n_dependent; i++)
 724     {
 725       models[i] = NULL;
 726     }
 727
 728   /* Data pass. */
 729   grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
 730   while (casegrouper_get_next_group (grouper, &group))
 731     run_regression (group, &cmd, ds, models);
 732   ok = casegrouper_destroy (grouper);
 733   ok = proc_commit (ds) && ok;
 734
 735   subcommand_save (ds, cmd.sbc_save, models);
 736   free (v_variables);
 737   free (models);
 738   free_regression (&cmd);
 739
 740   return ok ? CMD_SUCCESS : CMD_FAILURE;
 741 }
 742
 743 /*
 744   Is variable k the dependent variable?
 745  */
 746 static bool
 747 is_depvar (size_t k, const struct variable *v)
 748 {
 749   return v == v_variables[k];
 750 }
 751
 752 /* Parser for the variables sub command */
 753 static int
 754 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
 755                              struct cmd_regression *cmd UNUSED,
 756                              void *aux UNUSED)
 757 {
 758   const struct dictionary *dict = dataset_dict (ds);
 759
 760   lex_match (lexer, '=');
 761
 762   if ((lex_token (lexer) != T_ID
 763        || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
 764       && lex_token (lexer) != T_ALL)
 765     return 2;
 766
 767
 768   if (!parse_variables_const
 769       (lexer, dict, &v_variables, &n_variables, PV_NONE))
 770     {
 771       free (v_variables);
 772       return 0;
 773     }
 774   assert (n_variables);
 775
 776   return 1;
 777 }
 778
 779 /* Identify the explanatory variables in v_variables.  Returns
 780    the number of independent variables. */
 781 static int
 782 identify_indep_vars (const struct variable **indep_vars,
 783                      const struct variable *depvar)
 784 {
 785   int n_indep_vars = 0;
 786   int i;
 787
 788   for (i = 0; i < n_variables; i++)
 789     if (!is_depvar (i, depvar))
 790       indep_vars[n_indep_vars++] = v_variables[i];
 791   if ((n_indep_vars < 1) && is_depvar (0, depvar))
 792     {
 793       /*
 794         There is only one independent variable, and it is the same
 795         as the dependent variable. Print a warning and continue.
 796        */
 797       msg (SE,
 798            gettext ("The dependent variable is equal to the independent variable."
 799                     "The least squares line is therefore Y=X."
 800                     "Standard errors and related statistics may be meaningless."));
 801       n_indep_vars = 1;
 802       indep_vars[0] = v_variables[0];
 803     }
 804   return n_indep_vars;
 805 }
 806 static double
 807 fill_covariance (gsl_matrix *cov, struct covariance *all_cov,
 808                  const struct variable **vars,
 809                  size_t n_vars, const struct variable *dep_var,
 810                  const struct variable **all_vars, size_t n_all_vars)
 811 {
 812   size_t i;
 813   size_t j;
 814   size_t k = 0;
 815   size_t dep_subscript;
 816   size_t *rows;
 817   const gsl_matrix *ssizes;
 818   const gsl_matrix *cm;
 819   double result = 0.0;
 820
 821   cm = covariance_calculate (all_cov);
 822   rows = xnmalloc (cov->size1 - 1, sizeof (*rows));
 823
 824   for (i = 0; i < n_all_vars; i++)
 825     {
 826       for (j = k; j < n_vars; j++)
 827         {
 828           if (vars[j] == all_vars[i])
 829             {
 830               if (vars[j] != dep_var)
 831                 {
 832                   rows[j] = i;
 833                 }
 834               else
 835                 {
 836                   dep_subscript = i;
 837                 }
 838               k++;
 839               break;
 840             }
 841         }
 842     }
 843   for (i = 0; i < cov->size1 - 1; i++)
 844     {
 845       for (j = 0; j < cov->size2 - 1; j++)
 846         {
 847           gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j]));
 848           gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i]));
 849         }
 850     }
 851   ssizes = covariance_moments (all_cov, MOMENT_NONE);
 852   result = gsl_matrix_get (ssizes, dep_subscript, rows[0]);
 853   for (i = 0; i < cov->size1 - 1; i++)
 854     {
 855       gsl_matrix_set (cov, i, cov->size1 - 1,
 856                       gsl_matrix_get (cm, rows[i], dep_subscript));
 857       gsl_matrix_set (cov, cov->size1 - 1, i,
 858                       gsl_matrix_get (cm, rows[i], dep_subscript));
 859       if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript))
 860         {
 861           result = gsl_matrix_get (ssizes, rows[i], dep_subscript);
 862         }
 863     }
 864   free (rows);
 865 }
 866
 867 static bool
 868 run_regression (struct casereader *input, struct cmd_regression *cmd,
 869                 struct dataset *ds, linreg **models)
 870 {
 871   size_t i;
 872   int n_indep = 0;
 873   int k;
 874   double n_data;
 875   struct ccase *c;
 876   struct covariance *cov;
 877   const struct variable **vars;
 878   const struct variable *dep_var;
 879   struct casereader *reader;
 880   const struct dictionary *dict;
 881   gsl_matrix *this_cm;
 882
 883   assert (models != NULL);
 884
 885   for (i = 0; i < n_variables; i++)
 886     {
 887       if (!var_is_numeric (v_variables[i]))
 888         {
 889           msg (SE, _("REGRESSION requires numeric variables."));
 890           return false;
 891         }
 892     }
 893
 894   c = casereader_peek (input, 0);
 895   if (c == NULL)
 896     {
 897       casereader_destroy (input);
 898       return true;
 899     }
 900   output_split_file_values (ds, c);
 901   case_unref (c);
 902
 903   dict = dataset_dict (ds);
 904   if (!v_variables)
 905     {
 906       dict_get_vars (dict, &v_variables, &n_variables, 0);
 907     }
 908   vars = xnmalloc (n_variables, sizeof (*vars));
 909   cov = covariance_1pass_create (n_variables, v_variables,
 910                                  dict_get_weight (dict), MV_ANY);
 911
 912   reader = casereader_clone (input);
 913   reader = casereader_create_filter_missing (reader, v_variables, n_variables,
 914                                              MV_ANY, NULL, NULL);
 915   for (; (c = casereader_read (reader)) != NULL; case_unref (c))
 916     {
 917       covariance_accumulate (cov, c);
 918     }
 919
 920   for (k = 0; k < cmd->n_dependent; k++)
 921     {
 922       dep_var = cmd->v_dependent[k];
 923       n_indep = identify_indep_vars (vars, dep_var);
 924
 925       this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1);
 926       n_data = fill_covariance (this_cm, cov, vars, n_indep,
 927                                        dep_var, v_variables, n_variables);
 928       models[k] = linreg_alloc (dep_var, (const struct variable **) vars,
 929                                 n_data, n_indep);
 930       models[k]->depvar = dep_var;
 931
 932       /*
 933         For large data sets, use QR decomposition.
 934       */
 935       if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
 936         {
 937           models[k]->method = LINREG_QR;
 938         }
 939
 940       if (n_data > 0)
 941         {
 942           /*
 943             Find the least-squares estimates and other statistics.
 944           */
 945           linreg_fit (this_cm, models[k]);
 946
 947           if (!taint_has_tainted_successor (casereader_get_taint (input)))
 948             {
 949               subcommand_statistics (cmd->a_statistics, models[k]);
 950             }
 951         }
 952       else
 953         {
 954           msg (SE,
 955                gettext ("No valid data found. This command was skipped."));
 956         }
 957     }
 958
 959   casereader_destroy (reader);
 960   free (vars);
 961   casereader_destroy (input);
 962   covariance_destroy (cov);
 963
 964   return true;
 965 }
 966
 967 /*
 968   Local Variables:
 969   mode: c
 970   End:
 971 */