pintos-os.org Git - pspp/blob - src/language/stats/regression.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include <float.h>
  20 #include <stdbool.h>
  21
  22 #include <gsl/gsl_cdf.h>
  23 #include <gsl/gsl_matrix.h>
  24
  25 #include <data/dataset.h>
  26 #include <data/casewriter.h>
  27
  28 #include "language/command.h"
  29 #include "language/lexer/lexer.h"
  30 #include "language/lexer/value-parser.h"
  31 #include "language/lexer/variable-parser.h"
  32
  33
  34 #include "data/casegrouper.h"
  35 #include "data/casereader.h"
  36 #include "data/dictionary.h"
  37
  38 #include "math/covariance.h"
  39 #include "math/linreg.h"
  40 #include "math/moments.h"
  41
  42 #include "libpspp/message.h"
  43 #include "libpspp/taint.h"
  44
  45 #include "output/tab.h"
  46
  47 #include "gettext.h"
  48 #define _(msgid) gettext (msgid)
  49 #define N_(msgid) msgid
  50
  51
  52 #include <gl/intprops.h>
  53
  54 #define REG_LARGE_DATA 1000
  55
  56 #define STATS_R      1
  57 #define STATS_COEFF  2
  58 #define STATS_ANOVA  4
  59 #define STATS_OUTS   8
  60 #define STATS_CI    16
  61 #define STATS_BCOV  32
  62
  63 #define STATS_DEFAULT  (STATS_R | STATS_COEFF | STATS_ANOVA | STATS_OUTS)
  64
  65
  66
/* Settings collected by cmd_regression while parsing one REGRESSION
   command, and passed read-only to run_regression. */
struct regression
{
  /* The dataset the command operates on. */
  struct dataset *ds;

  /* Variables from the leading variable list (optionally introduced by
     /VARIABLES=). */
  const struct variable **vars;
  size_t n_vars;

  /* Variables named on the DEPENDENT subcommand; one model is fitted
     per entry. */
  const struct variable **dep_vars;
  size_t n_dep_vars;

  /* Bitwise OR of STATS_* flags. */
  unsigned int stats;
  /* Confidence level for STATS_CI as a fraction (0.95 unless CI(n)
     overrides it). */
  double ci;

  /* True if /SAVE RESID (respectively /SAVE PRED) was given. */
  bool resid;
  bool pred;
};
  83
/* State needed to carry the values requested by /SAVE from
   run_regression to the transformation that stores them in the
   active dataset. */
struct regression_workspace
{
  /* The new variables which will be introduced by /SAVE */
  const struct variable **predvars;
  const struct variable **residvars;

  /* A reader/writer pair to temporarily hold the
     values of the new variables */
  struct casewriter *writer;
  struct casereader *reader;

  /* Indices of the new values in the reader/writer (-1 if not applicable) */
  int res_idx;
  int pred_idx;

  /* 0, 1 or 2 depending on what new variables are to be created */
  int extras;
};

/* Fits and reports the models for one group of cases; defined further
   down in this file. */
static void run_regression (const struct regression *cmd,
                            struct regression_workspace *ws,
                            struct casereader *input);
 106
 107
 108 /* Return a string based on PREFIX which may be used as the name
 109    of a new variable in DICT */
 110 static char *
 111 reg_get_name (const struct dictionary *dict, const char *prefix)
 112 {
 113   char *name;
 114   int i;
 115
 116   /* XXX handle too-long prefixes */
 117   name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1);
 118   for (i = 1;; i++)
 119     {
 120       sprintf (name, "%s%d", prefix, i);
 121       if (dict_lookup_var (dict, name) == NULL)
 122         return name;
 123     }
 124 }
 125
 126
 127 static const struct variable *
 128 create_aux_var (struct dataset *ds, const char *prefix)
 129 {
 130   struct variable *var;
 131   struct dictionary *dict = dataset_dict (ds);
 132   char *name = reg_get_name (dict, prefix);
 133   var = dict_create_var_assert (dict, name, 0);
 134   free (name);
 135   return var;
 136 }
 137
/* Auxiliary data for the transformation that is installed when /SAVE
   is entered.  It is released by save_trans_free. */
struct save_trans_data
{
  /* Number of dependent variables, i.e. of fitted models. */
  int n_dep_vars;
  /* Heap copy of the command's workspace, including the reader that
     supplies the saved values. */
  struct regression_workspace *ws;
};
 144
 145 static bool
 146 save_trans_free (void *aux)
 147 {
 148   struct save_trans_data *save_trans_data = aux;
 149   free (save_trans_data->ws->predvars);
 150   free (save_trans_data->ws->residvars);
 151
 152   casereader_destroy (save_trans_data->ws->reader);
 153   free (save_trans_data->ws);
 154   free (save_trans_data);
 155   return true;
 156 }
 157
 158 static int
 159 save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED)
 160 {
 161   struct save_trans_data *save_trans_data = aux;
 162   struct regression_workspace *ws = save_trans_data->ws;
 163   struct ccase *in =  casereader_read (ws->reader);
 164
 165   if (in)
 166     {
 167       int k;
 168       *c = case_unshare (*c);
 169
 170       for (k = 0; k < save_trans_data->n_dep_vars; ++k)
 171         {
 172           if (ws->pred_idx != -1)
 173             {
 174               double pred = case_data_idx (in, ws->extras * k + ws->pred_idx)->f;
 175               case_data_rw (*c, ws->predvars[k])->f = pred;
 176             }
 177
 178           if (ws->res_idx != -1)
 179             {
 180               double resid = case_data_idx (in, ws->extras * k + ws->res_idx)->f;
 181               case_data_rw (*c, ws->residvars[k])->f = resid;
 182             }
 183         }
 184       case_unref (in);
 185     }
 186
 187   return TRNS_CONTINUE;
 188 }
 189
 190
 191 int
 192 cmd_regression (struct lexer *lexer, struct dataset *ds)
 193 {
 194   struct regression_workspace workspace;
 195   struct regression regression;
 196   const struct dictionary *dict = dataset_dict (ds);
 197   bool save;
 198
 199   memset (&regression, 0, sizeof (struct regression));
 200
 201   regression.ci = 0.95;
 202   regression.stats = STATS_DEFAULT;
 203   regression.pred = false;
 204   regression.resid = false;
 205
 206   regression.ds = ds;
 207
 208   /* Accept an optional, completely pointless "/VARIABLES=" */
 209   lex_match (lexer, T_SLASH);
 210   if (lex_match_id (lexer, "VARIABLES"))
 211     {
 212       if (!lex_force_match (lexer, T_EQUALS))
 213         goto error;
 214     }
 215
 216   if (!parse_variables_const (lexer, dict,
 217                               &regression.vars, &regression.n_vars,
 218                               PV_NO_DUPLICATE | PV_NUMERIC))
 219     goto error;
 220
 221
 222   while (lex_token (lexer) != T_ENDCMD)
 223     {
 224       lex_match (lexer, T_SLASH);
 225
 226       if (lex_match_id (lexer, "DEPENDENT"))
 227         {
 228           if (!lex_force_match (lexer, T_EQUALS))
 229             goto error;
 230
 231           free (regression.dep_vars);
 232           regression.n_dep_vars = 0;
 233
 234           if (!parse_variables_const (lexer, dict,
 235                                       &regression.dep_vars,
 236                                       &regression.n_dep_vars,
 237                                       PV_NO_DUPLICATE | PV_NUMERIC))
 238             goto error;
 239         }
 240       else if (lex_match_id (lexer, "METHOD"))
 241         {
 242           lex_match (lexer, T_EQUALS);
 243
 244           if (!lex_force_match_id (lexer, "ENTER"))
 245             {
 246               goto error;
 247             }
 248         }
 249       else if (lex_match_id (lexer, "STATISTICS"))
 250         {
 251           lex_match (lexer, T_EQUALS);
 252
 253           while (lex_token (lexer) != T_ENDCMD
 254                  && lex_token (lexer) != T_SLASH)
 255             {
 256               if (lex_match (lexer, T_ALL))
 257                 {
 258                   regression.stats = ~0;
 259                 }
 260               else if (lex_match_id (lexer, "DEFAULTS"))
 261                 {
 262                   regression.stats |= STATS_DEFAULT;
 263                 }
 264               else if (lex_match_id (lexer, "R"))
 265                 {
 266                   regression.stats |= STATS_R;
 267                 }
 268               else if (lex_match_id (lexer, "COEFF"))
 269                 {
 270                   regression.stats |= STATS_COEFF;
 271                 }
 272               else if (lex_match_id (lexer, "ANOVA"))
 273                 {
 274                   regression.stats |= STATS_ANOVA;
 275                 }
 276               else if (lex_match_id (lexer, "BCOV"))
 277                 {
 278                   regression.stats |= STATS_BCOV;
 279                 }
 280               else if (lex_match_id (lexer, "CI"))
 281                 {
 282                   regression.stats |= STATS_CI;
 283
 284                   if (lex_match (lexer, T_LPAREN))
 285                     {
 286                       regression.ci = lex_number (lexer) / 100.0;
 287                       lex_get (lexer);
 288                       lex_force_match (lexer, T_RPAREN);
 289                     }
 290                 }
 291               else
 292                 {
 293                   lex_error (lexer, NULL);
 294                   goto error;
 295                 }
 296             }
 297         }
 298       else if (lex_match_id (lexer, "SAVE"))
 299         {
 300           lex_match (lexer, T_EQUALS);
 301
 302           while (lex_token (lexer) != T_ENDCMD
 303                  && lex_token (lexer) != T_SLASH)
 304             {
 305               if (lex_match_id (lexer, "PRED"))
 306                 {
 307                   regression.pred = true;
 308                 }
 309               else if (lex_match_id (lexer, "RESID"))
 310                 {
 311                   regression.resid = true;
 312                 }
 313               else
 314                 {
 315                   lex_error (lexer, NULL);
 316                   goto error;
 317                 }
 318             }
 319         }
 320       else
 321         {
 322           lex_error (lexer, NULL);
 323           goto error;
 324         }
 325     }
 326
 327   if (!regression.vars)
 328     {
 329       dict_get_vars (dict, &regression.vars, &regression.n_vars, 0);
 330     }
 331
 332   save = regression.pred || regression.resid;
 333   workspace.extras = 0;
 334   workspace.res_idx = -1;
 335   workspace.pred_idx = -1;
 336   workspace.writer = NULL;
 337   workspace.reader = NULL;
 338   workspace.residvars = NULL;
 339   workspace.predvars = NULL;
 340   if (save)
 341     {
 342       int i;
 343       struct caseproto *proto = caseproto_create ();
 344
 345       if (regression.resid)
 346         {
 347           workspace.extras ++;
 348           workspace.res_idx = 0;
 349           workspace.residvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.residvars));
 350
 351           for (i = 0; i < regression.n_dep_vars; ++i)
 352             {
 353               workspace.residvars[i] = create_aux_var (ds, "RES");
 354               proto = caseproto_add_width (proto, 0);
 355             }
 356         }
 357
 358       if (regression.pred)
 359         {
 360           workspace.extras ++;
 361           workspace.pred_idx = 1;
 362           workspace.predvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.predvars));
 363
 364           for (i = 0; i < regression.n_dep_vars; ++i)
 365             {
 366               workspace.predvars[i] = create_aux_var (ds, "PRED");
 367               proto = caseproto_add_width (proto, 0);
 368             }
 369         }
 370
 371       if (proc_make_temporary_transformations_permanent (ds))
 372         msg (SW, _("REGRESSION with SAVE ignores TEMPORARY.  "
 373                    "Temporary transformations will be made permanent."));
 374
 375       workspace.writer = autopaging_writer_create (proto);
 376       caseproto_unref (proto);
 377     }
 378
 379
 380   {
 381     struct casegrouper *grouper;
 382     struct casereader *group;
 383     bool ok;
 384
 385     grouper = casegrouper_create_splits (proc_open_filtering (ds, !save), dict);
 386
 387
 388     while (casegrouper_get_next_group (grouper, &group))
 389       {
 390         run_regression (&regression,
 391                         &workspace,
 392                         group);
 393
 394       }
 395     ok = casegrouper_destroy (grouper);
 396     ok = proc_commit (ds) && ok;
 397   }
 398
 399   if (workspace.writer)
 400     {
 401       struct save_trans_data *save_trans_data = xmalloc (sizeof *save_trans_data);
 402       struct casereader *r = casewriter_make_reader (workspace.writer);
 403       workspace.writer = NULL;
 404       workspace.reader = r;
 405       save_trans_data->ws = xmalloc (sizeof (workspace));
 406       memcpy (save_trans_data->ws, &workspace, sizeof (workspace));
 407       save_trans_data->n_dep_vars = regression.n_dep_vars;
 408
 409       add_transformation (ds, save_trans_func, save_trans_free, save_trans_data);
 410     }
 411
 412
 413   free (regression.vars);
 414   free (regression.dep_vars);
 415   return CMD_SUCCESS;
 416
 417 error:
 418
 419   free (regression.vars);
 420   free (regression.dep_vars);
 421   return CMD_FAILURE;
 422 }
 423
 424 /* Return the size of the union of dependent and independent variables */
 425 static size_t
 426 get_n_all_vars (const struct regression *cmd)
 427 {
 428   size_t result = cmd->n_vars;
 429   size_t i;
 430   size_t j;
 431
 432   result += cmd->n_dep_vars;
 433   for (i = 0; i < cmd->n_dep_vars; i++)
 434     {
 435       for (j = 0; j < cmd->n_vars; j++)
 436         {
 437           if (cmd->vars[j] == cmd->dep_vars[i])
 438             {
 439               result--;
 440             }
 441         }
 442     }
 443   return result;
 444 }
 445
 446 /* Fill VARS with the union of dependent and independent variables */
 447 static void
 448 fill_all_vars (const struct variable **vars, const struct regression *cmd)
 449 {
 450   size_t x = 0;
 451   size_t i;
 452   for (i = 0; i < cmd->n_vars; i++)
 453     {
 454       vars[i] = cmd->vars[i];
 455     }
 456
 457   for (i = 0; i < cmd->n_dep_vars; i++)
 458     {
 459       size_t j;
 460       bool absent = true;
 461       for (j = 0; j < cmd->n_vars; j++)
 462         {
 463           if (cmd->dep_vars[i] == cmd->vars[j])
 464             {
 465               absent = false;
 466               break;
 467             }
 468         }
 469       if (absent)
 470         {
 471           vars[cmd->n_vars + x++] = cmd->dep_vars[i];
 472         }
 473     }
 474 }
 475
 476 /*
 477   Is variable k the dependent variable?
 478 */
 479 static bool
 480 is_depvar (const struct regression *cmd, size_t k, const struct variable *v)
 481 {
 482   return v == cmd->vars[k];
 483 }
 484
 485
 486 /* Identify the explanatory variables in v_variables.  Returns
 487    the number of independent variables. */
 488 static int
 489 identify_indep_vars (const struct regression *cmd,
 490                      const struct variable **indep_vars,
 491                      const struct variable *depvar)
 492 {
 493   int n_indep_vars = 0;
 494   int i;
 495
 496   for (i = 0; i < cmd->n_vars; i++)
 497     if (!is_depvar (cmd, i, depvar))
 498       indep_vars[n_indep_vars++] = cmd->vars[i];
 499   if ((n_indep_vars < 1) && is_depvar (cmd, 0, depvar))
 500     {
 501       /*
 502          There is only one independent variable, and it is the same
 503          as the dependent variable. Print a warning and continue.
 504        */
 505       msg (SW,
 506            gettext
 507            ("The dependent variable is equal to the independent variable. "
 508             "The least squares line is therefore Y=X. "
 509             "Standard errors and related statistics may be meaningless."));
 510       n_indep_vars = 1;
 511       indep_vars[0] = cmd->vars[0];
 512     }
 513   return n_indep_vars;
 514 }
 515
 516
 517 static double
 518 fill_covariance (gsl_matrix * cov, struct covariance *all_cov,
 519                  const struct variable **vars,
 520                  size_t n_vars, const struct variable *dep_var,
 521                  const struct variable **all_vars, size_t n_all_vars,
 522                  double *means)
 523 {
 524   size_t i;
 525   size_t j;
 526   size_t dep_subscript;
 527   size_t *rows;
 528   const gsl_matrix *ssizes;
 529   const gsl_matrix *mean_matrix;
 530   const gsl_matrix *ssize_matrix;
 531   double result = 0.0;
 532
 533   const gsl_matrix *cm = covariance_calculate_unnormalized (all_cov);
 534
 535   if (cm == NULL)
 536     return 0;
 537
 538   rows = xnmalloc (cov->size1 - 1, sizeof (*rows));
 539
 540   for (i = 0; i < n_all_vars; i++)
 541     {
 542       for (j = 0; j < n_vars; j++)
 543         {
 544           if (vars[j] == all_vars[i])
 545             {
 546               rows[j] = i;
 547             }
 548         }
 549       if (all_vars[i] == dep_var)
 550         {
 551           dep_subscript = i;
 552         }
 553     }
 554   mean_matrix = covariance_moments (all_cov, MOMENT_MEAN);
 555   ssize_matrix = covariance_moments (all_cov, MOMENT_NONE);
 556   for (i = 0; i < cov->size1 - 1; i++)
 557     {
 558       means[i] = gsl_matrix_get (mean_matrix, rows[i], 0)
 559         / gsl_matrix_get (ssize_matrix, rows[i], 0);
 560       for (j = 0; j < cov->size2 - 1; j++)
 561         {
 562           gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j]));
 563           gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i]));
 564         }
 565     }
 566   means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0)
 567     / gsl_matrix_get (ssize_matrix, dep_subscript, 0);
 568   ssizes = covariance_moments (all_cov, MOMENT_NONE);
 569   result = gsl_matrix_get (ssizes, dep_subscript, rows[0]);
 570   for (i = 0; i < cov->size1 - 1; i++)
 571     {
 572       gsl_matrix_set (cov, i, cov->size1 - 1,
 573                       gsl_matrix_get (cm, rows[i], dep_subscript));
 574       gsl_matrix_set (cov, cov->size1 - 1, i,
 575                       gsl_matrix_get (cm, rows[i], dep_subscript));
 576       if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript))
 577         {
 578           result = gsl_matrix_get (ssizes, rows[i], dep_subscript);
 579         }
 580     }
 581   gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1,
 582                   gsl_matrix_get (cm, dep_subscript, dep_subscript));
 583   free (rows);
 584   return result;
 585 }
 586
 587 \f
 588
 589 /*
 590   STATISTICS subcommand output functions.
 591 */
 592 static void reg_stats_r (const linreg *,     const struct variable *);
 593 static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *, const struct regression *);
 594 static void reg_stats_anova (const linreg *, const struct variable *);
 595 static void reg_stats_bcov (const linreg *,  const struct variable *);
 596
 597
 598 static void
 599 subcommand_statistics (const struct regression *cmd, const linreg * c, const gsl_matrix * cm,
 600                        const struct variable *var)
 601 {
 602   if (cmd->stats & STATS_R)
 603     reg_stats_r     (c, var);
 604
 605   if (cmd->stats & STATS_ANOVA)
 606     reg_stats_anova (c, var);
 607
 608   if (cmd->stats & STATS_COEFF)
 609     reg_stats_coeff (c, cm, var, cmd);
 610
 611   if (cmd->stats & STATS_BCOV)
 612     reg_stats_bcov  (c, var);
 613 }
 614
 615
 616 static void
 617 run_regression (const struct regression *cmd,
 618                 struct regression_workspace *ws,
 619                 struct casereader *input)
 620 {
 621   size_t i;
 622   linreg **models;
 623
 624   int k;
 625   struct ccase *c;
 626   struct covariance *cov;
 627   struct casereader *reader;
 628   size_t n_all_vars = get_n_all_vars (cmd);
 629   const struct variable **all_vars = xnmalloc (n_all_vars, sizeof (*all_vars));
 630
 631   double *means = xnmalloc (n_all_vars, sizeof (*means));
 632
 633   fill_all_vars (all_vars, cmd);
 634   cov = covariance_1pass_create (n_all_vars, all_vars,
 635                                  dict_get_weight (dataset_dict (cmd->ds)),
 636                                  MV_ANY);
 637
 638   reader = casereader_clone (input);
 639   reader = casereader_create_filter_missing (reader, all_vars, n_all_vars,
 640                                              MV_ANY, NULL, NULL);
 641
 642
 643   {
 644     struct casereader *r = casereader_clone (reader);
 645
 646     for (; (c = casereader_read (r)) != NULL; case_unref (c))
 647       {
 648         covariance_accumulate (cov, c);
 649       }
 650     casereader_destroy (r);
 651   }
 652
 653   models = xcalloc (cmd->n_dep_vars, sizeof (*models));
 654   for (k = 0; k < cmd->n_dep_vars; k++)
 655     {
 656       const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars));
 657       const struct variable *dep_var = cmd->dep_vars[k];
 658       int n_indep = identify_indep_vars (cmd, vars, dep_var);
 659       gsl_matrix *this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1);
 660       double n_data = fill_covariance (this_cm, cov, vars, n_indep,
 661                                 dep_var, all_vars, n_all_vars, means);
 662       models[k] = linreg_alloc (dep_var, vars,  n_data, n_indep);
 663       models[k]->depvar = dep_var;
 664       for (i = 0; i < n_indep; i++)
 665         {
 666           linreg_set_indep_variable_mean (models[k], i, means[i]);
 667         }
 668       linreg_set_depvar_mean (models[k], means[i]);
 669       /*
 670          For large data sets, use QR decomposition.
 671        */
 672       if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
 673         {
 674           models[k]->method = LINREG_QR;
 675         }
 676
 677       if (n_data > 0)
 678         {
 679           /*
 680              Find the least-squares estimates and other statistics.
 681            */
 682           linreg_fit (this_cm, models[k]);
 683
 684           if (!taint_has_tainted_successor (casereader_get_taint (input)))
 685             {
 686               subcommand_statistics (cmd, models[k], this_cm, dep_var);
 687             }
 688         }
 689       else
 690         {
 691           msg (SE, _("No valid data found. This command was skipped."));
 692         }
 693       gsl_matrix_free (this_cm);
 694       free (vars);
 695     }
 696
 697
 698   if (ws->extras > 0)
 699    {
 700       struct casereader *r = casereader_clone (reader);
 701
 702       for (; (c = casereader_read (r)) != NULL; case_unref (c))
 703         {
 704           struct ccase *outc = case_clone (c);
 705           for (k = 0; k < cmd->n_dep_vars; k++)
 706             {
 707               const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars));
 708               const struct variable *dep_var = cmd->dep_vars[k];
 709               int n_indep = identify_indep_vars (cmd, vars, dep_var);
 710               double *vals = xnmalloc (n_indep, sizeof (*vals));
 711               for (i = 0; i < n_indep; i++)
 712                 {
 713                   const union value *tmp = case_data (c, vars[i]);
 714                   vals[i] = tmp->f;
 715                 }
 716
 717               if (cmd->pred)
 718                 {
 719                   double pred = linreg_predict (models[k], vals, n_indep);
 720                   case_data_rw_idx (outc, k * ws->extras + ws->pred_idx)->f = pred;
 721                 }
 722
 723               if (cmd->resid)
 724                 {
 725                   double obs = case_data (c, models[k]->depvar)->f;
 726                   double res = linreg_residual (models[k], obs,  vals, n_indep);
 727                   case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res;
 728                 }
 729               free (vals);
 730               free (vars);
 731             }
 732           casewriter_write (ws->writer, outc);
 733         }
 734       casereader_destroy (r);
 735     }
 736
 737   casereader_destroy (reader);
 738
 739   for (k = 0; k < cmd->n_dep_vars; k++)
 740     {
 741       linreg_unref (models[k]);
 742     }
 743   free (models);
 744
 745   free (all_vars);
 746   free (means);
 747   casereader_destroy (input);
 748   covariance_destroy (cov);
 749 }
 750
 751 \f
 752
 753
 754 static void
 755 reg_stats_r (const linreg * c, const struct variable *var)
 756 {
 757   struct tab_table *t;
 758   int n_rows = 2;
 759   int n_cols = 5;
 760   double rsq;
 761   double adjrsq;
 762   double std_error;
 763
 764   assert (c != NULL);
 765   rsq = linreg_ssreg (c) / linreg_sst (c);
 766   adjrsq = rsq -
 767     (1.0 - rsq) * linreg_n_coeffs (c) / (linreg_n_obs (c) -
 768                                          linreg_n_coeffs (c) - 1);
 769   std_error = sqrt (linreg_mse (c));
 770   t = tab_create (n_cols, n_rows);
 771   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 772   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 773   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 774   tab_vline (t, TAL_0, 1, 0, 0);
 775
 776   tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
 777   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
 778   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
 779   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
 780   tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL, RC_OTHER);
 781   tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL, RC_OTHER);
 782   tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL, RC_OTHER);
 783   tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL, RC_OTHER);
 784   tab_title (t, _("Model Summary (%s)"), var_to_string (var));
 785   tab_submit (t);
 786 }
 787
 788 /*
 789   Table showing estimated regression coefficients.
 790 */
 791 static void
 792 reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var, const struct regression *cmd)
 793 {
 794   size_t j;
 795   int n_cols = 7;
 796   const int heading_rows = 2;
 797   int n_rows;
 798   int this_row;
 799   double t_stat;
 800   double pval;
 801   double std_err;
 802   double beta;
 803   const char *label;
 804
 805   const struct variable *v;
 806   struct tab_table *t;
 807
 808   const double df = linreg_n_obs (c) - linreg_n_coeffs (c) - 1;
 809   double q = (1 - cmd->ci) / 2.0;  /* 2-tailed test */
 810   double tval = gsl_cdf_tdist_Qinv (q, df);
 811
 812   assert (c != NULL);
 813   n_rows = linreg_n_coeffs (c) + heading_rows + 1;
 814
 815   if (cmd->stats & STATS_CI)
 816     n_cols += 2;
 817
 818   t = tab_create (n_cols, n_rows);
 819   tab_headers (t, 2, 0, 1, 0);
 820   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 821   tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows);
 822   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 823   tab_vline (t, TAL_0, 1, 0, 0);
 824
 825
 826   tab_hline (t, TAL_1, 2, 4, 1);
 827   tab_joint_text (t, 2, 0, 3, 0, TAB_CENTER | TAT_TITLE, _("Unstandardized Coefficients"));
 828   tab_text (t, 2, 1, TAB_CENTER | TAT_TITLE, _("B"));
 829   tab_text (t, 3, 1, TAB_CENTER | TAT_TITLE, _("Std. Error"));
 830   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Standardized Coefficients"));
 831   tab_text (t, 4, 1, TAB_CENTER | TAT_TITLE, _("Beta"));
 832   tab_text (t, 5, 1, TAB_CENTER | TAT_TITLE, _("t"));
 833   tab_text (t, 6, 1, TAB_CENTER | TAT_TITLE, _("Sig."));
 834   tab_text (t, 1, heading_rows, TAB_LEFT | TAT_TITLE, _("(Constant)"));
 835   tab_double (t, 2, heading_rows, 0, linreg_intercept (c), NULL, RC_OTHER);
 836   std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0));
 837
 838   if (cmd->stats & STATS_CI)
 839     {
 840       double lower = linreg_intercept (c) - tval * std_err ;
 841       double upper = linreg_intercept (c) + tval * std_err ;
 842       tab_double (t, 7, heading_rows, 0, lower, NULL, RC_OTHER);
 843       tab_double (t, 8, heading_rows, 0, upper, NULL, RC_OTHER);
 844
 845       tab_joint_text_format (t, 7, 0, 8, 0, TAB_CENTER | TAT_TITLE, _("%g%% Confidence Interval for B"), cmd->ci * 100);
 846       tab_hline (t, TAL_1, 7, 8, 1);
 847       tab_text (t, 7, 1, TAB_CENTER | TAT_TITLE, _("Lower Bound"));
 848       tab_text (t, 8, 1, TAB_CENTER | TAT_TITLE, _("Upper Bound"));
 849     }
 850   tab_double (t, 3, heading_rows, 0, std_err, NULL, RC_OTHER);
 851   tab_double (t, 4, heading_rows, 0, 0.0, NULL, RC_OTHER);
 852   t_stat = linreg_intercept (c) / std_err;
 853   tab_double (t, 5, heading_rows, 0, t_stat, NULL, RC_OTHER);
 854   pval =
 855     2 * gsl_cdf_tdist_Q (fabs (t_stat),
 856                          (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
 857   tab_double (t, 6, heading_rows, 0, pval, NULL, RC_PVALUE);
 858
 859   for (j = 0; j < linreg_n_coeffs (c); j++)
 860     {
 861       struct string tstr;
 862       ds_init_empty (&tstr);
 863       this_row = j + heading_rows + 1;
 864
 865       v = linreg_indep_var (c, j);
 866       label = var_to_string (v);
 867       /* Do not overwrite the variable's name. */
 868       ds_put_cstr (&tstr, label);
 869       tab_text (t, 1, this_row, TAB_LEFT, ds_cstr (&tstr));
 870       /*
 871          Regression coefficients.
 872        */
 873       tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL, RC_OTHER);
 874       /*
 875          Standard error of the coefficients.
 876        */
 877       std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1));
 878       tab_double (t, 3, this_row, 0, std_err, NULL, RC_OTHER);
 879       /*
 880          Standardized coefficient, i.e., regression coefficient
 881          if all variables had unit variance.
 882        */
 883       beta = sqrt (gsl_matrix_get (cov, j, j));
 884       beta *= linreg_coeff (c, j) /
 885         sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1));
 886       tab_double (t, 4, this_row, 0, beta, NULL, RC_OTHER);
 887
 888       /*
 889          Test statistic for H0: coefficient is 0.
 890        */
 891       t_stat = linreg_coeff (c, j) / std_err;
 892       tab_double (t, 5, this_row, 0, t_stat, NULL, RC_OTHER);
 893       /*
 894          P values for the test statistic above.
 895        */
 896       pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), df);
 897       tab_double (t, 6, this_row, 0, pval, NULL, RC_PVALUE);
 898       ds_destroy (&tstr);
 899
 900       if (cmd->stats & STATS_CI)
 901         {
 902           double lower = linreg_coeff (c, j)  - tval * std_err ;
 903           double upper = linreg_coeff (c, j)  + tval * std_err ;
 904
 905           tab_double (t, 7, this_row, 0, lower, NULL, RC_OTHER);
 906           tab_double (t, 8, this_row, 0, upper, NULL, RC_OTHER);
 907         }
 908     }
 909   tab_title (t, _("Coefficients (%s)"), var_to_string (var));
 910   tab_submit (t);
 911 }
 912
 913 /*
 914   Display the ANOVA table.
 915 */
 916 static void
 917 reg_stats_anova (const linreg * c, const struct variable *var)
 918 {
 919   int n_cols = 7;
 920   int n_rows = 4;
 921   const double msm = linreg_ssreg (c) / linreg_dfmodel (c);
 922   const double mse = linreg_mse (c);
 923   const double F = msm / mse;
 924   const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
 925
 926   struct tab_table *t;
 927
 928   assert (c != NULL);
 929   t = tab_create (n_cols, n_rows);
 930   tab_headers (t, 2, 0, 1, 0);
 931
 932   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 933
 934   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 935   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 936   tab_vline (t, TAL_0, 1, 0, 0);
 937
 938   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
 939   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
 940   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
 941   tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
 942   tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Sig."));
 943
 944   tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
 945   tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
 946   tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
 947
 948   /* Sums of Squares */
 949   tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL, RC_OTHER);
 950   tab_double (t, 2, 3, 0, linreg_sst (c), NULL, RC_OTHER);
 951   tab_double (t, 2, 2, 0, linreg_sse (c), NULL, RC_OTHER);
 952
 953
 954   /* Degrees of freedom */
 955   tab_text_format (t, 3, 1, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dfm);
 956   tab_text_format (t, 3, 2, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dfe);
 957   tab_text_format (t, 3, 3, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dft);
 958
 959   /* Mean Squares */
 960   tab_double (t, 4, 1, TAB_RIGHT, msm, NULL, RC_OTHER);
 961   tab_double (t, 4, 2, TAB_RIGHT, mse, NULL, RC_OTHER);
 962
 963   tab_double (t, 5, 1, 0, F, NULL, RC_OTHER);
 964
 965   tab_double (t, 6, 1, 0, pval, NULL, RC_PVALUE);
 966
 967   tab_title (t, _("ANOVA (%s)"), var_to_string (var));
 968   tab_submit (t);
 969 }
 970
 971
 972 static void
 973 reg_stats_bcov (const linreg * c, const struct variable *var)
 974 {
 975   int n_cols;
 976   int n_rows;
 977   int i;
 978   int k;
 979   int row;
 980   int col;
 981   const char *label;
 982   struct tab_table *t;
 983
 984   assert (c != NULL);
 985   n_cols = c->n_indeps + 1 + 2;
 986   n_rows = 2 * (c->n_indeps + 1);
 987   t = tab_create (n_cols, n_rows);
 988   tab_headers (t, 2, 0, 1, 0);
 989   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 990   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 991   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 992   tab_vline (t, TAL_0, 1, 0, 0);
 993   tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
 994   tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
 995   for (i = 0; i < linreg_n_coeffs (c); i++)
 996     {
 997       const struct variable *v = linreg_indep_var (c, i);
 998       label = var_to_string (v);
 999       tab_text (t, 2, i, TAB_CENTER, label);
1000       tab_text (t, i + 2, 0, TAB_CENTER, label);
1001       for (k = 1; k < linreg_n_coeffs (c); k++)
1002         {
1003           col = (i <= k) ? k : i;
1004           row = (i <= k) ? i : k;
1005           tab_double (t, k + 2, i, TAB_CENTER,
1006                       gsl_matrix_get (c->cov, row, col), NULL, RC_OTHER);
1007         }
1008     }
1009   tab_title (t, _("Coefficient Correlations (%s)"), var_to_string (var));
1010   tab_submit (t);
1011 }
1012