pintos-os.org Git - pspp/blob - src/language/stats/regression.q

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2005, 2009, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include <gsl/gsl_cdf.h>
  20 #include <gsl/gsl_matrix.h>
  21 #include <gsl/gsl_vector.h>
  22 #include <math.h>
  23 #include <stdlib.h>
  24 #include <data/case.h>
  25 #include <data/casegrouper.h>
  26 #include <data/casereader.h>
  27 #include <data/dictionary.h>
  28 #include <data/missing-values.h>
  29 #include <data/procedure.h>
  30 #include <data/transformations.h>
  31 #include <data/value-labels.h>
  32 #include <data/variable.h>
  33 #include <language/command.h>
  34 #include <language/dictionary/split-file.h>
  35 #include <language/data-io/file-handle.h>
  36 #include <language/lexer/lexer.h>
  37 #include <libpspp/compiler.h>
  38 #include <libpspp/message.h>
  39 #include <libpspp/taint.h>
  40 #include <math/covariance.h>
  41 #include <math/linreg.h>
  42 #include <math/moments.h>
  43 #include <output/tab.h>
  44
  45 #include "gl/intprops.h"
  46 #include "gl/xalloc.h"
  47
  48 #include "gettext.h"
  49 #define _(msgid) gettext (msgid)
  50
  51 #define REG_LARGE_DATA 1000
  52
  53 /* (headers) */
  54
  55 /* (specification)
  56    "REGRESSION" (regression_):
  57    *variables=custom;
  58    +statistics[st_]=r,
  59                     coeff,
  60                     anova,
  61                     outs,
  62                     zpp,
  63                     label,
  64                     sha,
  65                     ci,
  66                     bcov,
  67                     ses,
  68                     xtx,
  69                     collin,
  70                     tol,
  71                     selection,
  72                     f,
  73                     defaults,
  74                     all;
  75    ^dependent=varlist;
  76    +save[sv_]=resid,pred;
  77    +method=enter.
  78 */
  79 /* (declarations) */
  80 /* (functions) */
  81 static struct cmd_regression cmd;
  82
  83 /*
  84   Moments for each of the variables used.
  85  */
  86 struct moments_var
  87 {
  88   struct moments1 *m;
  89   const struct variable *v;
  90 };
  91
  92 /*
  93   Transformations for saving predicted values
  94   and residuals, etc.
  95  */
  96 struct reg_trns
  97 {
  98   int n_trns;                   /* Number of transformations. */
  99   int trns_id;                  /* Which trns is this one? */
 100   linreg *c;            /* Linear model for this trns. */
 101 };
 102 /*
 103   Variables used (both explanatory and response).
 104  */
 105 static const struct variable **v_variables;
 106
 107 /*
 108   Number of variables.
 109  */
 110 static size_t n_variables;
 111
 112 static bool run_regression (struct casereader *, struct cmd_regression *,
 113                             struct dataset *, linreg **);
 114
 115 /*
 116    STATISTICS subcommand output functions.
 117  */
 118 static void reg_stats_r (linreg *, void *);
 119 static void reg_stats_coeff (linreg *, void *);
 120 static void reg_stats_anova (linreg *, void *);
 121 static void reg_stats_outs (linreg *, void *);
 122 static void reg_stats_zpp (linreg *, void *);
 123 static void reg_stats_label (linreg *, void *);
 124 static void reg_stats_sha (linreg *, void *);
 125 static void reg_stats_ci (linreg *, void *);
 126 static void reg_stats_f (linreg *, void *);
 127 static void reg_stats_bcov (linreg *, void *);
 128 static void reg_stats_ses (linreg *, void *);
 129 static void reg_stats_xtx (linreg *, void *);
 130 static void reg_stats_collin (linreg *, void *);
 131 static void reg_stats_tol (linreg *, void *);
 132 static void reg_stats_selection (linreg *, void *);
 133 static void statistics_keyword_output (void (*)(linreg *, void *),
 134                                        int, linreg *, void *);
 135
 136 static void
 137 reg_stats_r (linreg *c, void *aux UNUSED)
 138 {
 139   struct tab_table *t;
 140   int n_rows = 2;
 141   int n_cols = 5;
 142   double rsq;
 143   double adjrsq;
 144   double std_error;
 145
 146   assert (c != NULL);
 147   rsq = linreg_ssreg (c) / linreg_sst (c);
 148   adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c));
 149   std_error = sqrt (linreg_mse (c));
 150   t = tab_create (n_cols, n_rows);
 151   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 152   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 153   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 154   tab_vline (t, TAL_0, 1, 0, 0);
 155
 156   tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
 157   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
 158   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
 159   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
 160   tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL);
 161   tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL);
 162   tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL);
 163   tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL);
 164   tab_title (t, _("Model Summary"));
 165   tab_submit (t);
 166 }
 167
 168 /*
 169   Table showing estimated regression coefficients.
 170  */
 171 static void
 172 reg_stats_coeff (linreg * c, void *aux_)
 173 {
 174   size_t j;
 175   int n_cols = 7;
 176   int n_rows;
 177   int this_row;
 178   double t_stat;
 179   double pval;
 180   double std_err;
 181   double beta;
 182   const char *label;
 183
 184   const struct variable *v;
 185   struct tab_table *t;
 186   gsl_matrix *cov = aux_;
 187
 188   assert (c != NULL);
 189   n_rows = linreg_n_coeffs (c) + 3;
 190
 191   t = tab_create (n_cols, n_rows);
 192   tab_headers (t, 2, 0, 1, 0);
 193   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 194   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 195   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 196   tab_vline (t, TAL_0, 1, 0, 0);
 197
 198   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
 199   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
 200   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
 201   tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
 202   tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
 203   tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
 204   tab_double (t, 2, 1, 0, linreg_intercept (c), NULL);
 205   std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0));
 206   tab_double (t, 3, 1, 0, std_err, NULL);
 207   tab_double (t, 4, 1, 0, 0.0, NULL);
 208   t_stat = linreg_intercept (c) / std_err;
 209   tab_double (t, 5, 1, 0, t_stat, NULL);
 210   pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
 211   tab_double (t, 6, 1, 0, pval, NULL);
 212   for (j = 0; j < linreg_n_coeffs (c); j++)
 213     {
 214       struct string tstr;
 215       ds_init_empty (&tstr);
 216       this_row = j + 2;
 217
 218       v = linreg_indep_var (c, j);
 219       label = var_to_string (v);
 220       /* Do not overwrite the variable's name. */
 221       ds_put_cstr (&tstr, label);
 222       tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr));
 223       /*
 224          Regression coefficients.
 225        */
 226       tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL);
 227       /*
 228          Standard error of the coefficients.
 229        */
 230       std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1));
 231       tab_double (t, 3, this_row, 0, std_err, NULL);
 232       /*
 233          Standardized coefficient, i.e., regression coefficient
 234          if all variables had unit variance.
 235        */
 236       beta = sqrt (gsl_matrix_get (cov, j, j));
 237       beta *= linreg_coeff (c, j) /
 238         sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1));
 239       tab_double (t, 4, this_row, 0, beta, NULL);
 240
 241       /*
 242          Test statistic for H0: coefficient is 0.
 243        */
 244       t_stat = linreg_coeff (c, j) / std_err;
 245       tab_double (t, 5, this_row, 0, t_stat, NULL);
 246       /*
 247          P values for the test statistic above.
 248        */
 249       pval =
 250         2 * gsl_cdf_tdist_Q (fabs (t_stat),
 251                              (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
 252       tab_double (t, 6, this_row, 0, pval, NULL);
 253       ds_destroy (&tstr);
 254     }
 255   tab_title (t, _("Coefficients"));
 256   tab_submit (t);
 257 }
 258
 259 /*
 260   Display the ANOVA table.
 261  */
 262 static void
 263 reg_stats_anova (linreg * c, void *aux UNUSED)
 264 {
 265   int n_cols = 7;
 266   int n_rows = 4;
 267   const double msm = linreg_ssreg (c) / linreg_dfmodel (c);
 268   const double mse = linreg_mse (c);
 269   const double F = msm / mse;
 270   const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
 271
 272   struct tab_table *t;
 273
 274   assert (c != NULL);
 275   t = tab_create (n_cols, n_rows);
 276   tab_headers (t, 2, 0, 1, 0);
 277
 278   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 279
 280   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 281   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 282   tab_vline (t, TAL_0, 1, 0, 0);
 283
 284   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
 285   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
 286   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
 287   tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
 288   tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
 289
 290   tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
 291   tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
 292   tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
 293
 294   /* Sums of Squares */
 295   tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL);
 296   tab_double (t, 2, 3, 0, linreg_sst (c), NULL);
 297   tab_double (t, 2, 2, 0, linreg_sse (c), NULL);
 298
 299
 300   /* Degrees of freedom */
 301   tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm);
 302   tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe);
 303   tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft);
 304
 305   /* Mean Squares */
 306   tab_double (t, 4, 1, TAB_RIGHT, msm, NULL);
 307   tab_double (t, 4, 2, TAB_RIGHT, mse, NULL);
 308
 309   tab_double (t, 5, 1, 0, F, NULL);
 310
 311   tab_double (t, 6, 1, 0, pval, NULL);
 312
 313   tab_title (t, _("ANOVA"));
 314   tab_submit (t);
 315 }
 316
 317 static void
 318 reg_stats_outs (linreg * c, void *aux UNUSED)
 319 {
 320   assert (c != NULL);
 321 }
 322
 323 static void
 324 reg_stats_zpp (linreg * c, void *aux UNUSED)
 325 {
 326   assert (c != NULL);
 327 }
 328
 329 static void
 330 reg_stats_label (linreg * c, void *aux UNUSED)
 331 {
 332   assert (c != NULL);
 333 }
 334
 335 static void
 336 reg_stats_sha (linreg * c, void *aux UNUSED)
 337 {
 338   assert (c != NULL);
 339 }
 340 static void
 341 reg_stats_ci (linreg * c, void *aux UNUSED)
 342 {
 343   assert (c != NULL);
 344 }
 345 static void
 346 reg_stats_f (linreg * c, void *aux UNUSED)
 347 {
 348   assert (c != NULL);
 349 }
 350 static void
 351 reg_stats_bcov (linreg * c, void *aux UNUSED)
 352 {
 353   int n_cols;
 354   int n_rows;
 355   int i;
 356   int k;
 357   int row;
 358   int col;
 359   const char *label;
 360   struct tab_table *t;
 361
 362   assert (c != NULL);
 363   n_cols = c->n_indeps + 1 + 2;
 364   n_rows = 2 * (c->n_indeps + 1);
 365   t = tab_create (n_cols, n_rows);
 366   tab_headers (t, 2, 0, 1, 0);
 367   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 368   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 369   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 370   tab_vline (t, TAL_0, 1, 0, 0);
 371   tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
 372   tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
 373   for (i = 0; i < linreg_n_coeffs (c); i++)
 374     {
 375       const struct variable *v = linreg_indep_var (c, i);
 376       label = var_to_string (v);
 377       tab_text (t, 2, i, TAB_CENTER, label);
 378       tab_text (t, i + 2, 0, TAB_CENTER, label);
 379       for (k = 1; k < linreg_n_coeffs (c); k++)
 380         {
 381           col = (i <= k) ? k : i;
 382           row = (i <= k) ? i : k;
 383           tab_double (t, k + 2, i, TAB_CENTER,
 384                      gsl_matrix_get (c->cov, row, col), NULL);
 385         }
 386     }
 387   tab_title (t, _("Coefficient Correlations"));
 388   tab_submit (t);
 389 }
 390 static void
 391 reg_stats_ses (linreg * c, void *aux UNUSED)
 392 {
 393   assert (c != NULL);
 394 }
 395 static void
 396 reg_stats_xtx (linreg * c, void *aux UNUSED)
 397 {
 398   assert (c != NULL);
 399 }
 400 static void
 401 reg_stats_collin (linreg * c, void *aux UNUSED)
 402 {
 403   assert (c != NULL);
 404 }
 405 static void
 406 reg_stats_tol (linreg * c, void *aux UNUSED)
 407 {
 408   assert (c != NULL);
 409 }
 410 static void
 411 reg_stats_selection (linreg * c, void *aux UNUSED)
 412 {
 413   assert (c != NULL);
 414 }
 415
 416 static void
 417 statistics_keyword_output (void (*function) (linreg *, void *),
 418                            int keyword, linreg * c, void *aux)
 419 {
 420   if (keyword)
 421     {
 422       (*function) (c, aux);
 423     }
 424 }
 425
 426 static void
 427 subcommand_statistics (int *keywords, linreg * c, void *aux)
 428 {
 429   /*
 430      The order here must match the order in which the STATISTICS
 431      keywords appear in the specification section above.
 432    */
 433   enum
 434   { r,
 435     coeff,
 436     anova,
 437     outs,
 438     zpp,
 439     label,
 440     sha,
 441     ci,
 442     bcov,
 443     ses,
 444     xtx,
 445     collin,
 446     tol,
 447     selection,
 448     f,
 449     defaults,
 450     all
 451   };
 452   int i;
 453   int d = 1;
 454
 455   if (keywords[all])
 456     {
 457       /*
 458          Set everything but F.
 459        */
 460       for (i = 0; i < f; i++)
 461         {
 462           keywords[i] = 1;
 463         }
 464     }
 465   else
 466     {
 467       for (i = 0; i < all; i++)
 468         {
 469           if (keywords[i])
 470             {
 471               d = 0;
 472             }
 473         }
 474       /*
 475          Default output: ANOVA table, parameter estimates,
 476          and statistics for variables not entered into model,
 477          if appropriate.
 478        */
 479       if (keywords[defaults] | d)
 480         {
 481           keywords[anova] = 1;
 482           keywords[outs] = 1;
 483           keywords[coeff] = 1;
 484           keywords[r] = 1;
 485         }
 486     }
 487   statistics_keyword_output (reg_stats_r, keywords[r], c, aux);
 488   statistics_keyword_output (reg_stats_anova, keywords[anova], c, aux);
 489   statistics_keyword_output (reg_stats_coeff, keywords[coeff], c, aux);
 490   statistics_keyword_output (reg_stats_outs, keywords[outs], c, aux);
 491   statistics_keyword_output (reg_stats_zpp, keywords[zpp], c, aux);
 492   statistics_keyword_output (reg_stats_label, keywords[label], c, aux);
 493   statistics_keyword_output (reg_stats_sha, keywords[sha], c, aux);
 494   statistics_keyword_output (reg_stats_ci, keywords[ci], c, aux);
 495   statistics_keyword_output (reg_stats_f, keywords[f], c, aux);
 496   statistics_keyword_output (reg_stats_bcov, keywords[bcov], c, aux);
 497   statistics_keyword_output (reg_stats_ses, keywords[ses], c, aux);
 498   statistics_keyword_output (reg_stats_xtx, keywords[xtx], c, aux);
 499   statistics_keyword_output (reg_stats_collin, keywords[collin], c, aux);
 500   statistics_keyword_output (reg_stats_tol, keywords[tol], c, aux);
 501   statistics_keyword_output (reg_stats_selection, keywords[selection], c, aux);
 502 }
 503
 504 /*
 505   Free the transformation. Free its linear model if this
 506   transformation is the last one.
 507  */
 508 static bool
 509 regression_trns_free (void *t_)
 510 {
 511   bool result = true;
 512   struct reg_trns *t = t_;
 513
 514   if (t->trns_id == t->n_trns)
 515     {
 516       result = linreg_free (t->c);
 517     }
 518   free (t);
 519
 520   return result;
 521 }
 522
 523 /*
 524   Gets the predicted values.
 525  */
 526 static int
 527 regression_trns_pred_proc (void *t_, struct ccase **c,
 528                            casenumber case_idx UNUSED)
 529 {
 530   size_t i;
 531   size_t n_vals;
 532   struct reg_trns *trns = t_;
 533   linreg *model;
 534   union value *output = NULL;
 535   const union value *tmp;
 536   double *vals;
 537   const struct variable **vars = NULL;
 538
 539   assert (trns != NULL);
 540   model = trns->c;
 541   assert (model != NULL);
 542   assert (model->depvar != NULL);
 543   assert (model->pred != NULL);
 544
 545   vars = linreg_get_vars (model);
 546   n_vals = linreg_n_coeffs (model);
 547   vals = xnmalloc (n_vals, sizeof (*vals));
 548   *c = case_unshare (*c);
 549
 550   output = case_data_rw (*c, model->pred);
 551
 552   for (i = 0; i < n_vals; i++)
 553     {
 554       tmp = case_data (*c, vars[i]);
 555       vals[i] = tmp->f;
 556     }
 557   output->f = linreg_predict (model, vals, n_vals);
 558   free (vals);
 559   return TRNS_CONTINUE;
 560 }
 561
 562 /*
 563   Gets the residuals.
 564  */
 565 static int
 566 regression_trns_resid_proc (void *t_, struct ccase **c,
 567                             casenumber case_idx UNUSED)
 568 {
 569   size_t i;
 570   size_t n_vals;
 571   struct reg_trns *trns = t_;
 572   linreg *model;
 573   union value *output = NULL;
 574   const union value *tmp;
 575   double *vals = NULL;
 576   double obs;
 577   const struct variable **vars = NULL;
 578
 579   assert (trns != NULL);
 580   model = trns->c;
 581   assert (model != NULL);
 582   assert (model->depvar != NULL);
 583   assert (model->resid != NULL);
 584
 585   vars = linreg_get_vars (model);
 586   n_vals = linreg_n_coeffs (model);
 587
 588   vals = xnmalloc (n_vals, sizeof (*vals));
 589   *c = case_unshare (*c);
 590   output = case_data_rw (*c, model->resid);
 591   assert (output != NULL);
 592
 593   for (i = 0; i < n_vals; i++)
 594     {
 595       tmp = case_data (*c, vars[i]);
 596       vals[i] = tmp->f;
 597     }
 598   tmp = case_data (*c, model->depvar);
 599   obs = tmp->f;
 600   output->f = linreg_residual (model, obs, vals, n_vals);
 601   free (vals);
 602
 603   return TRNS_CONTINUE;
 604 }
 605
 606 static char *
 607 reg_get_name (const struct dictionary *dict, const char *prefix)
 608 {
 609   char *name;
 610   int i;
 611
 612   /* XXX handle too-long prefixes */
 613   name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1);
 614   for (i = 1; ; i++)
 615     {
 616       sprintf (name, "%s%d", prefix, i);
 617       if (dict_lookup_var (dict, name) == NULL)
 618         return name;
 619     }
 620 }
 621
 622 static void
 623 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
 624               linreg * c, struct variable **v, int n_trns)
 625 {
 626   struct dictionary *dict = dataset_dict (ds);
 627   static int trns_index = 1;
 628   char *name;
 629   struct variable *new_var;
 630   struct reg_trns *t = NULL;
 631
 632   t = xmalloc (sizeof (*t));
 633   t->trns_id = trns_index;
 634   t->n_trns = n_trns;
 635   t->c = c;
 636
 637   name = reg_get_name (dict, prefix);
 638   new_var = dict_create_var_assert (dict, name, 0);
 639   free (name);
 640
 641   *v = new_var;
 642   add_transformation (ds, f, regression_trns_free, t);
 643   trns_index++;
 644 }
 645 static void
 646 subcommand_save (struct dataset *ds, int save, linreg ** models)
 647 {
 648   linreg **lc;
 649   int n_trns = 0;
 650   int i;
 651
 652   if (save)
 653     {
 654       /* Count the number of transformations we will need. */
 655       for (i = 0; i < REGRESSION_SV_count; i++)
 656         {
 657           if (cmd.a_save[i])
 658             {
 659               n_trns++;
 660             }
 661         }
 662       n_trns *= cmd.n_dependent;
 663
 664       for (lc = models; lc < models + cmd.n_dependent; lc++)
 665         {
 666           if (*lc != NULL)
 667             {
 668               if ((*lc)->depvar != NULL)
 669                 {
 670                   if (cmd.a_save[REGRESSION_SV_RESID])
 671                     {
 672                       reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
 673                                     &(*lc)->resid, n_trns);
 674                     }
 675                   if (cmd.a_save[REGRESSION_SV_PRED])
 676                     {
 677                       reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
 678                                     &(*lc)->pred, n_trns);
 679                     }
 680                 }
 681             }
 682         }
 683     }
 684   else
 685     {
 686       for (lc = models; lc < models + cmd.n_dependent; lc++)
 687         {
 688           if (*lc != NULL)
 689             {
 690               linreg_free (*lc);
 691             }
 692         }
 693     }
 694 }
 695
 696 int
 697 cmd_regression (struct lexer *lexer, struct dataset *ds)
 698 {
 699   struct casegrouper *grouper;
 700   struct casereader *group;
 701   linreg **models;
 702   bool ok;
 703   size_t i;
 704
 705   if (!parse_regression (lexer, ds, &cmd, NULL))
 706     {
 707       return CMD_FAILURE;
 708     }
 709
 710   models = xnmalloc (cmd.n_dependent, sizeof *models);
 711   for (i = 0; i < cmd.n_dependent; i++)
 712     {
 713       models[i] = NULL;
 714     }
 715
 716   /* Data pass. */
 717   grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
 718   while (casegrouper_get_next_group (grouper, &group))
 719     run_regression (group, &cmd, ds, models);
 720   ok = casegrouper_destroy (grouper);
 721   ok = proc_commit (ds) && ok;
 722
 723   subcommand_save (ds, cmd.sbc_save, models);
 724   free (v_variables);
 725   free (models);
 726   free_regression (&cmd);
 727
 728   return ok ? CMD_SUCCESS : CMD_FAILURE;
 729 }
 730
 731 /*
 732   Is variable k the dependent variable?
 733  */
 734 static bool
 735 is_depvar (size_t k, const struct variable *v)
 736 {
 737   return v == v_variables[k];
 738 }
 739
 740 /* Parser for the variables sub command */
 741 static int
 742 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
 743                              struct cmd_regression *cmd UNUSED,
 744                              void *aux UNUSED)
 745 {
 746   const struct dictionary *dict = dataset_dict (ds);
 747
 748   lex_match (lexer, T_EQUALS);
 749
 750   if ((lex_token (lexer) != T_ID
 751        || dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL)
 752       && lex_token (lexer) != T_ALL)
 753     return 2;
 754
 755
 756   if (!parse_variables_const
 757       (lexer, dict, &v_variables, &n_variables, PV_NONE))
 758     {
 759       free (v_variables);
 760       return 0;
 761     }
 762   assert (n_variables);
 763
 764   return 1;
 765 }
 766
 767 /* Identify the explanatory variables in v_variables.  Returns
 768    the number of independent variables. */
 769 static int
 770 identify_indep_vars (const struct variable **indep_vars,
 771                      const struct variable *depvar)
 772 {
 773   int n_indep_vars = 0;
 774   int i;
 775
 776   for (i = 0; i < n_variables; i++)
 777     if (!is_depvar (i, depvar))
 778       indep_vars[n_indep_vars++] = v_variables[i];
 779   if ((n_indep_vars < 1) && is_depvar (0, depvar))
 780     {
 781       /*
 782         There is only one independent variable, and it is the same
 783         as the dependent variable. Print a warning and continue.
 784        */
 785       msg (SE,
 786            gettext ("The dependent variable is equal to the independent variable."
 787                     "The least squares line is therefore Y=X."
 788                     "Standard errors and related statistics may be meaningless."));
 789       n_indep_vars = 1;
 790       indep_vars[0] = v_variables[0];
 791     }
 792   return n_indep_vars;
 793 }
 794 static double
 795 fill_covariance (gsl_matrix *cov, struct covariance *all_cov,
 796                  const struct variable **vars,
 797                  size_t n_vars, const struct variable *dep_var,
 798                  const struct variable **all_vars, size_t n_all_vars,
 799                  double *means)
 800 {
 801   size_t i;
 802   size_t j;
 803   size_t dep_subscript;
 804   size_t *rows;
 805   const gsl_matrix *ssizes;
 806   gsl_matrix *cm;
 807   const gsl_matrix *mean_matrix;
 808   const gsl_matrix *ssize_matrix;
 809   double result = 0.0;
 810
 811   cm = covariance_calculate_unnormalized (all_cov);
 812   rows = xnmalloc (cov->size1 - 1, sizeof (*rows));
 813
 814   for (i = 0; i < n_all_vars; i++)
 815     {
 816       for (j = 0; j < n_vars; j++)
 817         {
 818           if (vars[j] == all_vars[i])
 819             {
 820               rows[j] = i;
 821             }
 822         }
 823       if (all_vars[i] == dep_var)
 824         {
 825           dep_subscript = i;
 826         }
 827     }
 828   mean_matrix = covariance_moments (all_cov, MOMENT_MEAN);
 829   ssize_matrix = covariance_moments (all_cov, MOMENT_NONE);
 830   for (i = 0; i < cov->size1 - 1; i++)
 831     {
 832       means[i] = gsl_matrix_get (mean_matrix, rows[i], 0)
 833         / gsl_matrix_get (ssize_matrix, rows[i], 0);
 834       for (j = 0; j < cov->size2 - 1; j++)
 835         {
 836           gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j]));
 837           gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i]));
 838         }
 839     }
 840   means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0)
 841     / gsl_matrix_get (ssize_matrix, dep_subscript, 0);
 842   ssizes = covariance_moments (all_cov, MOMENT_NONE);
 843   result = gsl_matrix_get (ssizes, dep_subscript, rows[0]);
 844   for (i = 0; i < cov->size1 - 1; i++)
 845     {
 846       gsl_matrix_set (cov, i, cov->size1 - 1,
 847                       gsl_matrix_get (cm, rows[i], dep_subscript));
 848       gsl_matrix_set (cov, cov->size1 - 1, i,
 849                       gsl_matrix_get (cm, rows[i], dep_subscript));
 850       if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript))
 851         {
 852           result = gsl_matrix_get (ssizes, rows[i], dep_subscript);
 853         }
 854     }
 855   gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1,
 856                   gsl_matrix_get (cm, dep_subscript, dep_subscript));
 857   free (rows);
 858   gsl_matrix_free (cm);
 859   return result;
 860 }
 861 static size_t
 862 get_n_all_vars (struct cmd_regression *cmd)
 863 {
 864   size_t result = n_variables;
 865   size_t i;
 866   size_t j;
 867
 868   result += cmd->n_dependent;
 869   for (i = 0; i < cmd->n_dependent; i++)
 870     {
 871       for (j = 0; j < n_variables; j++)
 872         {
 873           if (v_variables[j] == cmd->v_dependent[i])
 874             {
 875               result--;
 876             }
 877         }
 878     }
 879   return result;
 880 }
 881 static void
 882 fill_all_vars (const struct variable **vars, struct cmd_regression *cmd)
 883 {
 884   size_t i;
 885   size_t j;
 886   bool absent;
 887
 888   for (i = 0; i < n_variables; i++)
 889     {
 890       vars[i] = v_variables[i];
 891     }
 892   for (i = 0; i < cmd->n_dependent; i++)
 893     {
 894       absent = true;
 895       for (j = 0; j < n_variables; j++)
 896         {
 897           if (cmd->v_dependent[i] == v_variables[j])
 898             {
 899               absent = false;
 900               break;
 901             }
 902         }
 903       if (absent)
 904         {
 905           vars[i + n_variables] = cmd->v_dependent[i];
 906         }
 907     }
 908 }
 909 static bool
 910 run_regression (struct casereader *input, struct cmd_regression *cmd,
 911                 struct dataset *ds, linreg **models)
 912 {
 913   size_t i;
 914   int n_indep = 0;
 915   int k;
 916   double n_data;
 917   double *means;
 918   struct ccase *c;
 919   struct covariance *cov;
 920   const struct variable **vars;
 921   const struct variable **all_vars;
 922   const struct variable *dep_var;
 923   struct casereader *reader;
 924   const struct dictionary *dict;
 925   size_t n_all_vars;
 926
 927   assert (models != NULL);
 928
 929   for (i = 0; i < n_variables; i++)
 930     {
 931       if (!var_is_numeric (v_variables[i]))
 932         {
 933           msg (SE, _("REGRESSION requires numeric variables."));
 934           return false;
 935         }
 936     }
 937
 938   c = casereader_peek (input, 0);
 939   if (c == NULL)
 940     {
 941       casereader_destroy (input);
 942       return true;
 943     }
 944   output_split_file_values (ds, c);
 945   case_unref (c);
 946
 947   dict = dataset_dict (ds);
 948   if (!v_variables)
 949     {
 950       dict_get_vars (dict, &v_variables, &n_variables, 0);
 951     }
 952   n_all_vars = get_n_all_vars (cmd);
 953   all_vars = xnmalloc (n_all_vars, sizeof (*all_vars));
 954   fill_all_vars (all_vars, cmd);
 955   vars = xnmalloc (n_variables, sizeof (*vars));
 956   means  = xnmalloc (n_all_vars, sizeof (*means));
 957   cov = covariance_1pass_create (n_all_vars, all_vars,
 958                                  dict_get_weight (dict), MV_ANY);
 959
 960   reader = casereader_clone (input);
 961   reader = casereader_create_filter_missing (reader, v_variables, n_variables,
 962                                              MV_ANY, NULL, NULL);
 963   for (; (c = casereader_read (reader)) != NULL; case_unref (c))
 964     {
 965       covariance_accumulate (cov, c);
 966     }
 967
 968   for (k = 0; k < cmd->n_dependent; k++)
 969     {
 970       gsl_matrix *this_cm;
 971       dep_var = cmd->v_dependent[k];
 972       n_indep = identify_indep_vars (vars, dep_var);
 973
 974       this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1);
 975       n_data = fill_covariance (this_cm, cov, vars, n_indep,
 976                                 dep_var, all_vars, n_all_vars, means);
 977       models[k] = linreg_alloc (dep_var, (const struct variable **) vars,
 978                                 n_data, n_indep);
 979       models[k]->depvar = dep_var;
 980       for (i = 0; i < n_indep; i++)
 981         {
 982           linreg_set_indep_variable_mean (models[k], i, means[i]);
 983         }
 984       linreg_set_depvar_mean (models[k], means[i]);
 985       /*
 986         For large data sets, use QR decomposition.
 987       */
 988       if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
 989         {
 990           models[k]->method = LINREG_QR;
 991         }
 992
 993       if (n_data > 0)
 994         {
 995           /*
 996             Find the least-squares estimates and other statistics.
 997           */
 998           linreg_fit (this_cm, models[k]);
 999
1000           if (!taint_has_tainted_successor (casereader_get_taint (input)))
1001             {
1002               subcommand_statistics (cmd->a_statistics, models[k], this_cm);
1003             }
1004         }
1005       else
1006         {
1007           msg (SE,
1008                gettext ("No valid data found. This command was skipped."));
1009           linreg_free (models[k]);
1010           models[k] = NULL;
1011         }
1012       gsl_matrix_free (this_cm);
1013     }
1014
1015   casereader_destroy (reader);
1016   free (vars);
1017   free (all_vars);
1018   free (means);
1019   casereader_destroy (input);
1020   covariance_destroy (cov);
1021
1022   return true;
1023 }
1024
1025 /*
1026   Local Variables:
1027   mode: c
1028   End:
1029 */