pintos-os.org Git - pspp/blob - src/language/stats/regression.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include <stdbool.h>
  20
  21 #include <gsl/gsl_cdf.h>
  22 #include <gsl/gsl_matrix.h>
  23
  24 #include <data/dataset.h>
  25 #include <data/casewriter.h>
  26
  27 #include "language/command.h"
  28 #include "language/lexer/lexer.h"
  29 #include "language/lexer/value-parser.h"
  30 #include "language/lexer/variable-parser.h"
  31
  32
  33 #include "data/casegrouper.h"
  34 #include "data/casereader.h"
  35 #include "data/dictionary.h"
  36
  37 #include "math/covariance.h"
  38 #include "math/linreg.h"
  39 #include "math/moments.h"
  40
  41 #include "libpspp/message.h"
  42 #include "libpspp/taint.h"
  43
  44 #include "output/tab.h"
  45
  46 #include "gettext.h"
  47 #define _(msgid) gettext (msgid)
  48 #define N_(msgid) msgid
  49
  50
  51 #include <gl/intprops.h>
  52
  53 #define REG_LARGE_DATA 1000
  54
/* Settings for one invocation of the REGRESSION command, as filled in
   by cmd_regression. */
struct regression
{
  /* Dataset the command operates on. */
  struct dataset *ds;

  /* Variables named in the initial variable list. */
  const struct variable **vars;
  size_t n_vars;

  /* Variables named on the DEPENDENT subcommand. */
  const struct variable **dep_vars;
  size_t n_dep_vars;

  /* Which output tables subcommand_statistics produces:
     reg_stats_r, reg_stats_coeff, reg_stats_anova and reg_stats_bcov
     respectively. */
  bool r;
  bool coeff;
  bool anova;
  bool bcov;


  /* Which values the SAVE subcommand asked for (RESID and PRED). */
  bool resid;
  bool pred;
};
  74
/* Per-group results: run_regression stores in MODELS one fitted model
   for each dependent variable, in the order of the DEPENDENT list. */
struct per_split_ws
{
  linreg **models;
};
  79
/* State shared between the data pass and the /SAVE transformation. */
struct regression_workspace
{
  /* One entry per group processed by run_regression. */
  struct per_split_ws *psw;

  /* The new variables which will be introduced by /SAVE */
  const struct variable **predvars;
  const struct variable **residvars;

  /* A reader/writer pair to temporarily hold the
     values of the new variables */
  struct casewriter *writer;
  struct casereader *reader;

  /* Indices of the new values in the reader/writer (-1 if not applicable) */
  int res_idx;
  int pred_idx;

  /* 0, 1 or 2 depending on what new variables are to be created */
  int extras;
};
 100
/* Forward declaration: fits the models for one group of cases.  Defined
   further down, after the helpers it relies on. */
static void run_regression (const struct regression *cmd,
                            struct per_split_ws *psw,
                            struct regression_workspace *ws,
                            struct casereader *input);
 105
 106
 107 /* Return a string based on PREFIX which may be used as the name
 108    of a new variable in DICT */
 109 static char *
 110 reg_get_name (const struct dictionary *dict, const char *prefix)
 111 {
 112   char *name;
 113   int i;
 114
 115   /* XXX handle too-long prefixes */
 116   name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1);
 117   for (i = 1;; i++)
 118     {
 119       sprintf (name, "%s%d", prefix, i);
 120       if (dict_lookup_var (dict, name) == NULL)
 121         return name;
 122     }
 123 }
 124
 125
 126 static const struct variable *
 127 create_aux_var (struct dataset *ds, const char *prefix)
 128 {
 129   struct variable *var;
 130   struct dictionary *dict = dataset_dict (ds);
 131   char *name = reg_get_name (dict, prefix);
 132   var = dict_create_var_assert (dict, name, 0);
 133   free (name);
 134   return var;
 135 }
 136
/* Auxiliary data for the transformation added when /SAVE is entered */
struct save_trans_data
{
  /* Number of dependent variables, i.e. how many predicted values and/or
     residuals are stored per case. */
  int n_dep_vars;

  /* Heap copy of the command's workspace; released by save_trans_free. */
  struct regression_workspace *ws;
};
 143
 144 static bool
 145 save_trans_free (void *aux)
 146 {
 147   struct save_trans_data *save_trans_data = aux;
 148   free (save_trans_data->ws->predvars);
 149   free (save_trans_data->ws->residvars);
 150
 151   casereader_destroy (save_trans_data->ws->reader);
 152   free (save_trans_data->ws);
 153   free (save_trans_data);
 154   return true;
 155 }
 156
 157 static int
 158 save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED)
 159 {
 160   struct save_trans_data *save_trans_data = aux;
 161   struct regression_workspace *ws = save_trans_data->ws;
 162   struct ccase *in =  casereader_read (ws->reader);
 163
 164   if (in)
 165     {
 166       int k;
 167       *c = case_unshare (*c);
 168
 169       for (k = 0; k < save_trans_data->n_dep_vars; ++k)
 170         {
 171           if (ws->pred_idx != -1)
 172             {
 173               double pred = case_data_idx (in, ws->extras * k + ws->pred_idx)->f;
 174               case_data_rw (*c, ws->predvars[k])->f = pred;
 175             }
 176
 177           if (ws->res_idx != -1)
 178             {
 179               double resid = case_data_idx (in, ws->extras * k + ws->res_idx)->f;
 180               case_data_rw (*c, ws->residvars[k])->f = resid;
 181             }
 182         }
 183       case_unref (in);
 184     }
 185
 186   return TRNS_CONTINUE;
 187 }
 188
 189
 190 int
 191 cmd_regression (struct lexer *lexer, struct dataset *ds)
 192 {
 193   int i;
 194   int n_splits = 0;
 195   struct regression_workspace workspace;
 196   struct regression regression;
 197   const struct dictionary *dict = dataset_dict (ds);
 198   bool save;
 199   workspace.psw = NULL;
 200
 201   memset (&regression, 0, sizeof (struct regression));
 202
 203   regression.anova = true;
 204   regression.coeff = true;
 205   regression.r = true;
 206
 207   regression.pred = false;
 208   regression.resid = false;
 209
 210   regression.ds = ds;
 211
 212   /* Accept an optional, completely pointless "/VARIABLES=" */
 213   lex_match (lexer, T_SLASH);
 214   if (lex_match_id (lexer, "VARIABLES"))
 215     {
 216       if (!lex_force_match (lexer, T_EQUALS))
 217         goto error;
 218     }
 219
 220   if (!parse_variables_const (lexer, dict,
 221                               &regression.vars, &regression.n_vars,
 222                               PV_NO_DUPLICATE | PV_NUMERIC))
 223     goto error;
 224
 225
 226   while (lex_token (lexer) != T_ENDCMD)
 227     {
 228       lex_match (lexer, T_SLASH);
 229
 230       if (lex_match_id (lexer, "DEPENDENT"))
 231         {
 232           if (!lex_force_match (lexer, T_EQUALS))
 233             goto error;
 234
 235           if (!parse_variables_const (lexer, dict,
 236                                       &regression.dep_vars,
 237                                       &regression.n_dep_vars,
 238                                       PV_NO_DUPLICATE | PV_NUMERIC))
 239             goto error;
 240         }
 241       else if (lex_match_id (lexer, "METHOD"))
 242         {
 243           lex_match (lexer, T_EQUALS);
 244
 245           if (!lex_force_match_id (lexer, "ENTER"))
 246             {
 247               goto error;
 248             }
 249         }
 250       else if (lex_match_id (lexer, "STATISTICS"))
 251         {
 252           lex_match (lexer, T_EQUALS);
 253
 254           while (lex_token (lexer) != T_ENDCMD
 255                  && lex_token (lexer) != T_SLASH)
 256             {
 257               if (lex_match (lexer, T_ALL))
 258                 {
 259                 }
 260               else if (lex_match_id (lexer, "DEFAULTS"))
 261                 {
 262                 }
 263               else if (lex_match_id (lexer, "R"))
 264                 {
 265                 }
 266               else if (lex_match_id (lexer, "COEFF"))
 267                 {
 268                 }
 269               else if (lex_match_id (lexer, "ANOVA"))
 270                 {
 271                 }
 272               else if (lex_match_id (lexer, "BCOV"))
 273                 {
 274                 }
 275               else
 276                 {
 277                   lex_error (lexer, NULL);
 278                   goto error;
 279                 }
 280             }
 281         }
 282       else if (lex_match_id (lexer, "SAVE"))
 283         {
 284           lex_match (lexer, T_EQUALS);
 285
 286           while (lex_token (lexer) != T_ENDCMD
 287                  && lex_token (lexer) != T_SLASH)
 288             {
 289               if (lex_match_id (lexer, "PRED"))
 290                 {
 291                   regression.pred = true;
 292                 }
 293               else if (lex_match_id (lexer, "RESID"))
 294                 {
 295                   regression.resid = true;
 296                 }
 297               else
 298                 {
 299                   lex_error (lexer, NULL);
 300                   goto error;
 301                 }
 302             }
 303         }
 304       else
 305         {
 306           lex_error (lexer, NULL);
 307           goto error;
 308         }
 309     }
 310
 311   if (!regression.vars)
 312     {
 313       dict_get_vars (dict, &regression.vars, &regression.n_vars, 0);
 314     }
 315
 316   save = regression.pred || regression.resid;
 317   workspace.extras = 0;
 318   workspace.res_idx = -1;
 319   workspace.pred_idx = -1;
 320   workspace.writer = NULL;
 321   workspace.reader = NULL;
 322   if (save)
 323     {
 324       int i;
 325       struct caseproto *proto = caseproto_create ();
 326
 327       if (regression.resid)
 328         {
 329           workspace.extras ++;
 330           workspace.res_idx = 0;
 331           workspace.residvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.residvars));
 332
 333           for (i = 0; i < regression.n_dep_vars; ++i)
 334             {
 335               workspace.residvars[i] = create_aux_var (ds, "RES");
 336               proto = caseproto_add_width (proto, 0);
 337             }
 338         }
 339
 340       if (regression.pred)
 341         {
 342           workspace.extras ++;
 343           workspace.pred_idx = 1;
 344           workspace.predvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.predvars));
 345
 346           for (i = 0; i < regression.n_dep_vars; ++i)
 347             {
 348               workspace.predvars[i] = create_aux_var (ds, "PRED");
 349               proto = caseproto_add_width (proto, 0);
 350             }
 351         }
 352
 353       if (proc_make_temporary_transformations_permanent (ds))
 354         msg (SW, _("REGRESSION with SAVE ignores TEMPORARY.  "
 355                    "Temporary transformations will be made permanent."));
 356
 357       workspace.writer = autopaging_writer_create (proto);
 358       caseproto_unref (proto);
 359     }
 360
 361
 362   n_splits = 0;
 363   {
 364     struct casegrouper *grouper;
 365     struct casereader *group;
 366     bool ok;
 367
 368     grouper = casegrouper_create_splits (proc_open_filtering (ds, !save), dict);
 369
 370
 371     while (casegrouper_get_next_group (grouper, &group))
 372       {
 373         workspace.psw = xrealloc (workspace.psw, ++n_splits * sizeof (*workspace.psw));
 374
 375         run_regression (&regression, &workspace.psw[n_splits - 1],
 376                         &workspace,
 377                         group);
 378
 379       }
 380     ok = casegrouper_destroy (grouper);
 381     ok = proc_commit (ds) && ok;
 382   }
 383
 384   if (workspace.writer)
 385     {
 386       struct save_trans_data *save_trans_data = xmalloc (sizeof *save_trans_data);
 387       struct casereader *r = casewriter_make_reader (workspace.writer);
 388       workspace.writer = NULL;
 389       workspace.reader = r;
 390       save_trans_data->ws = xmalloc (sizeof (workspace));
 391       memcpy (save_trans_data->ws, &workspace, sizeof (workspace));
 392       save_trans_data->n_dep_vars = regression.n_dep_vars;
 393
 394       add_transformation (ds, save_trans_func, save_trans_free, save_trans_data);
 395     }
 396
 397   for (i = 0; i < n_splits; ++i)
 398     {
 399       int k;
 400
 401       for (k = 0; k < regression.n_dep_vars; ++k)
 402         linreg_unref (workspace.psw[i].models[k]);
 403
 404       free (workspace.psw[i].models);
 405     }
 406   free (workspace.psw);
 407
 408
 409   free (regression.vars);
 410   free (regression.dep_vars);
 411   return CMD_SUCCESS;
 412
 413 error:
 414
 415   free (regression.vars);
 416   free (regression.dep_vars);
 417   return CMD_FAILURE;
 418 }
 419
 420 /* Return the size of the union of dependent and independent variables */
 421 static size_t
 422 get_n_all_vars (const struct regression *cmd)
 423 {
 424   size_t result = cmd->n_vars;
 425   size_t i;
 426   size_t j;
 427
 428   result += cmd->n_dep_vars;
 429   for (i = 0; i < cmd->n_dep_vars; i++)
 430     {
 431       for (j = 0; j < cmd->n_vars; j++)
 432         {
 433           if (cmd->vars[j] == cmd->dep_vars[i])
 434             {
 435               result--;
 436             }
 437         }
 438     }
 439   return result;
 440 }
 441
 442 /* Fill VARS with the union of dependent and independent variables */
 443 static void
 444 fill_all_vars (const struct variable **vars, const struct regression *cmd)
 445 {
 446   size_t i;
 447   size_t j;
 448   bool absent;
 449
 450   for (i = 0; i < cmd->n_vars; i++)
 451     {
 452       vars[i] = cmd->vars[i];
 453     }
 454   for (i = 0; i < cmd->n_dep_vars; i++)
 455     {
 456       absent = true;
 457       for (j = 0; j < cmd->n_vars; j++)
 458         {
 459           if (cmd->dep_vars[i] == cmd->vars[j])
 460             {
 461               absent = false;
 462               break;
 463             }
 464         }
 465       if (absent)
 466         {
 467           vars[i + cmd->n_vars] = cmd->dep_vars[i];
 468         }
 469     }
 470 }
 471
 472 /*
 473   Is variable k the dependent variable?
 474 */
 475 static bool
 476 is_depvar (const struct regression *cmd, size_t k, const struct variable *v)
 477 {
 478   return v == cmd->vars[k];
 479 }
 480
 481
 482 /* Identify the explanatory variables in v_variables.  Returns
 483    the number of independent variables. */
 484 static int
 485 identify_indep_vars (const struct regression *cmd,
 486                      const struct variable **indep_vars,
 487                      const struct variable *depvar)
 488 {
 489   int n_indep_vars = 0;
 490   int i;
 491
 492   for (i = 0; i < cmd->n_vars; i++)
 493     if (!is_depvar (cmd, i, depvar))
 494       indep_vars[n_indep_vars++] = cmd->vars[i];
 495   if ((n_indep_vars < 1) && is_depvar (cmd, 0, depvar))
 496     {
 497       /*
 498          There is only one independent variable, and it is the same
 499          as the dependent variable. Print a warning and continue.
 500        */
 501       msg (SW,
 502            gettext
 503            ("The dependent variable is equal to the independent variable."
 504             "The least squares line is therefore Y=X."
 505             "Standard errors and related statistics may be meaningless."));
 506       n_indep_vars = 1;
 507       indep_vars[0] = cmd->vars[0];
 508     }
 509   return n_indep_vars;
 510 }
 511
 512
 513 static double
 514 fill_covariance (gsl_matrix * cov, struct covariance *all_cov,
 515                  const struct variable **vars,
 516                  size_t n_vars, const struct variable *dep_var,
 517                  const struct variable **all_vars, size_t n_all_vars,
 518                  double *means)
 519 {
 520   size_t i;
 521   size_t j;
 522   size_t dep_subscript;
 523   size_t *rows;
 524   const gsl_matrix *ssizes;
 525   const gsl_matrix *mean_matrix;
 526   const gsl_matrix *ssize_matrix;
 527   double result = 0.0;
 528
 529   const gsl_matrix *cm = covariance_calculate_unnormalized (all_cov);
 530
 531   if (cm == NULL)
 532     return 0;
 533
 534   rows = xnmalloc (cov->size1 - 1, sizeof (*rows));
 535
 536   for (i = 0; i < n_all_vars; i++)
 537     {
 538       for (j = 0; j < n_vars; j++)
 539         {
 540           if (vars[j] == all_vars[i])
 541             {
 542               rows[j] = i;
 543             }
 544         }
 545       if (all_vars[i] == dep_var)
 546         {
 547           dep_subscript = i;
 548         }
 549     }
 550   mean_matrix = covariance_moments (all_cov, MOMENT_MEAN);
 551   ssize_matrix = covariance_moments (all_cov, MOMENT_NONE);
 552   for (i = 0; i < cov->size1 - 1; i++)
 553     {
 554       means[i] = gsl_matrix_get (mean_matrix, rows[i], 0)
 555         / gsl_matrix_get (ssize_matrix, rows[i], 0);
 556       for (j = 0; j < cov->size2 - 1; j++)
 557         {
 558           gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j]));
 559           gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i]));
 560         }
 561     }
 562   means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0)
 563     / gsl_matrix_get (ssize_matrix, dep_subscript, 0);
 564   ssizes = covariance_moments (all_cov, MOMENT_NONE);
 565   result = gsl_matrix_get (ssizes, dep_subscript, rows[0]);
 566   for (i = 0; i < cov->size1 - 1; i++)
 567     {
 568       gsl_matrix_set (cov, i, cov->size1 - 1,
 569                       gsl_matrix_get (cm, rows[i], dep_subscript));
 570       gsl_matrix_set (cov, cov->size1 - 1, i,
 571                       gsl_matrix_get (cm, rows[i], dep_subscript));
 572       if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript))
 573         {
 574           result = gsl_matrix_get (ssizes, rows[i], dep_subscript);
 575         }
 576     }
 577   gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1,
 578                   gsl_matrix_get (cm, dep_subscript, dep_subscript));
 579   free (rows);
 580   return result;
 581 }
 582
 583 \f
 584
/*
  STATISTICS subcommand output functions.

  Each one submits a single output table for a fitted model; the
  variable argument is the dependent variable, which is used in the
  table's title.  They are dispatched by subcommand_statistics.
*/
static void reg_stats_r (const linreg *,     const struct variable *);
static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *);
static void reg_stats_anova (const linreg *, const struct variable *);
static void reg_stats_bcov (const linreg *,  const struct variable *);
 592
 593
 594 static void
 595 subcommand_statistics (const struct regression *cmd, const linreg * c, const gsl_matrix * cm,
 596                        const struct variable *var)
 597 {
 598   if (cmd->r)
 599     reg_stats_r     (c, var);
 600
 601   if (cmd->anova)
 602     reg_stats_anova (c, var);
 603
 604   if (cmd->coeff)
 605     reg_stats_coeff (c, cm, var);
 606
 607   if (cmd->bcov)
 608     reg_stats_bcov  (c, var);
 609 }
 610
 611
 612 static void
 613 run_regression (const struct regression *cmd,
 614                 struct per_split_ws *psw,
 615                 struct regression_workspace *ws,
 616                 struct casereader *input)
 617 {
 618   size_t i;
 619
 620   int k;
 621   struct ccase *c;
 622   struct covariance *cov;
 623   struct casereader *reader;
 624   size_t n_all_vars = get_n_all_vars (cmd);
 625   const struct variable **all_vars = xnmalloc (n_all_vars, sizeof (*all_vars));
 626
 627   double *means = xnmalloc (n_all_vars, sizeof (*means));
 628
 629   fill_all_vars (all_vars, cmd);
 630   cov = covariance_1pass_create (n_all_vars, all_vars,
 631                                  dict_get_weight (dataset_dict (cmd->ds)),
 632                                  MV_ANY);
 633
 634   reader = casereader_clone (input);
 635   reader = casereader_create_filter_missing (reader, all_vars, n_all_vars,
 636                                              MV_ANY, NULL, NULL);
 637
 638
 639   {
 640     struct casereader *r = casereader_clone (reader);
 641
 642     for (; (c = casereader_read (r)) != NULL; case_unref (c))
 643       {
 644         covariance_accumulate (cov, c);
 645       }
 646     casereader_destroy (r);
 647   }
 648
 649   psw->models = xcalloc (cmd->n_dep_vars, sizeof (*psw->models));
 650   for (k = 0; k < cmd->n_dep_vars; k++)
 651     {
 652
 653       const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars));
 654       const struct variable *dep_var = cmd->dep_vars[k];
 655       int n_indep = identify_indep_vars (cmd, vars, dep_var);
 656       gsl_matrix *this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1);
 657       double n_data = fill_covariance (this_cm, cov, vars, n_indep,
 658                                 dep_var, all_vars, n_all_vars, means);
 659       psw->models[k] = linreg_alloc (dep_var, vars,  n_data, n_indep);
 660       psw->models[k]->depvar = dep_var;
 661       for (i = 0; i < n_indep; i++)
 662         {
 663           linreg_set_indep_variable_mean (psw->models[k], i, means[i]);
 664         }
 665       linreg_set_depvar_mean (psw->models[k], means[i]);
 666       /*
 667          For large data sets, use QR decomposition.
 668        */
 669       if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
 670         {
 671           psw->models[k]->method = LINREG_QR;
 672         }
 673
 674       if (n_data > 0)
 675         {
 676           /*
 677              Find the least-squares estimates and other statistics.
 678            */
 679           linreg_fit (this_cm, psw->models[k]);
 680
 681           if (!taint_has_tainted_successor (casereader_get_taint (input)))
 682             {
 683               subcommand_statistics (cmd, psw->models[k], this_cm, dep_var);
 684             }
 685         }
 686       else
 687         {
 688           msg (SE, _("No valid data found. This command was skipped."));
 689         }
 690       gsl_matrix_free (this_cm);
 691       free (vars);
 692     }
 693
 694
 695   if (ws->extras > 0)
 696    {
 697       struct casereader *r = casereader_clone (reader);
 698
 699       for (; (c = casereader_read (r)) != NULL; case_unref (c))
 700         {
 701           struct ccase *outc = case_clone (c);
 702           for (k = 0; k < cmd->n_dep_vars; k++)
 703             {
 704               const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars));
 705               const struct variable *dep_var = cmd->dep_vars[k];
 706               int n_indep = identify_indep_vars (cmd, vars, dep_var);
 707               double *vals = xnmalloc (n_indep, sizeof (*vals));
 708               for (i = 0; i < n_indep; i++)
 709                 {
 710                   const union value *tmp = case_data (c, vars[i]);
 711                   vals[i] = tmp->f;
 712                 }
 713
 714               if (cmd->pred)
 715                 {
 716                   double pred = linreg_predict (psw->models[k], vals, n_indep);
 717                   case_data_rw_idx (outc, k * ws->extras + ws->pred_idx)->f = pred;
 718                 }
 719
 720               if (cmd->resid)
 721                 {
 722                   double obs = case_data (c, psw->models[k]->depvar)->f;
 723                   double res = linreg_residual (psw->models[k], obs,  vals, n_indep);
 724                   case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res;
 725                 }
 726               free (vals);
 727               free (vars);
 728             }
 729           casewriter_write (ws->writer, outc);
 730         }
 731       casereader_destroy (r);
 732     }
 733
 734   casereader_destroy (reader);
 735
 736
 737   free (all_vars);
 738   free (means);
 739   casereader_destroy (input);
 740   covariance_destroy (cov);
 741 }
 742 \f
 743
 744
 745
 746
 747 static void
 748 reg_stats_r (const linreg * c, const struct variable *var)
 749 {
 750   struct tab_table *t;
 751   int n_rows = 2;
 752   int n_cols = 5;
 753   double rsq;
 754   double adjrsq;
 755   double std_error;
 756
 757   assert (c != NULL);
 758   rsq = linreg_ssreg (c) / linreg_sst (c);
 759   adjrsq = rsq -
 760     (1.0 - rsq) * linreg_n_coeffs (c) / (linreg_n_obs (c) -
 761                                          linreg_n_coeffs (c) - 1);
 762   std_error = sqrt (linreg_mse (c));
 763   t = tab_create (n_cols, n_rows);
 764   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 765   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 766   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 767   tab_vline (t, TAL_0, 1, 0, 0);
 768
 769   tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
 770   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
 771   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
 772   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
 773   tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL);
 774   tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL);
 775   tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL);
 776   tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL);
 777   tab_title (t, _("Model Summary (%s)"), var_to_string (var));
 778   tab_submit (t);
 779 }
 780
 781 /*
 782   Table showing estimated regression coefficients.
 783 */
 784 static void
 785 reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var)
 786 {
 787   size_t j;
 788   int n_cols = 7;
 789   int n_rows;
 790   int this_row;
 791   double t_stat;
 792   double pval;
 793   double std_err;
 794   double beta;
 795   const char *label;
 796
 797   const struct variable *v;
 798   struct tab_table *t;
 799
 800   assert (c != NULL);
 801   n_rows = linreg_n_coeffs (c) + 3;
 802
 803   t = tab_create (n_cols, n_rows);
 804   tab_headers (t, 2, 0, 1, 0);
 805   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 806   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 807   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 808   tab_vline (t, TAL_0, 1, 0, 0);
 809
 810   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
 811   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
 812   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
 813   tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
 814   tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
 815   tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
 816   tab_double (t, 2, 1, 0, linreg_intercept (c), NULL);
 817   std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0));
 818   tab_double (t, 3, 1, 0, std_err, NULL);
 819   tab_double (t, 4, 1, 0, 0.0, NULL);
 820   t_stat = linreg_intercept (c) / std_err;
 821   tab_double (t, 5, 1, 0, t_stat, NULL);
 822   pval =
 823     2 * gsl_cdf_tdist_Q (fabs (t_stat),
 824                          (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
 825   tab_double (t, 6, 1, 0, pval, NULL);
 826   for (j = 0; j < linreg_n_coeffs (c); j++)
 827     {
 828       struct string tstr;
 829       ds_init_empty (&tstr);
 830       this_row = j + 2;
 831
 832       v = linreg_indep_var (c, j);
 833       label = var_to_string (v);
 834       /* Do not overwrite the variable's name. */
 835       ds_put_cstr (&tstr, label);
 836       tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr));
 837       /*
 838          Regression coefficients.
 839        */
 840       tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL);
 841       /*
 842          Standard error of the coefficients.
 843        */
 844       std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1));
 845       tab_double (t, 3, this_row, 0, std_err, NULL);
 846       /*
 847          Standardized coefficient, i.e., regression coefficient
 848          if all variables had unit variance.
 849        */
 850       beta = sqrt (gsl_matrix_get (cov, j, j));
 851       beta *= linreg_coeff (c, j) /
 852         sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1));
 853       tab_double (t, 4, this_row, 0, beta, NULL);
 854
 855       /*
 856          Test statistic for H0: coefficient is 0.
 857        */
 858       t_stat = linreg_coeff (c, j) / std_err;
 859       tab_double (t, 5, this_row, 0, t_stat, NULL);
 860       /*
 861          P values for the test statistic above.
 862        */
 863       pval =
 864         2 * gsl_cdf_tdist_Q (fabs (t_stat),
 865                              (double) (linreg_n_obs (c) -
 866                                        linreg_n_coeffs (c) - 1));
 867       tab_double (t, 6, this_row, 0, pval, NULL);
 868       ds_destroy (&tstr);
 869     }
 870   tab_title (t, _("Coefficients (%s)"), var_to_string (var));
 871   tab_submit (t);
 872 }
 873
 874 /*
 875   Display the ANOVA table.
 876 */
 877 static void
 878 reg_stats_anova (const linreg * c, const struct variable *var)
 879 {
 880   int n_cols = 7;
 881   int n_rows = 4;
 882   const double msm = linreg_ssreg (c) / linreg_dfmodel (c);
 883   const double mse = linreg_mse (c);
 884   const double F = msm / mse;
 885   const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
 886
 887   struct tab_table *t;
 888
 889   assert (c != NULL);
 890   t = tab_create (n_cols, n_rows);
 891   tab_headers (t, 2, 0, 1, 0);
 892
 893   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 894
 895   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 896   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 897   tab_vline (t, TAL_0, 1, 0, 0);
 898
 899   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
 900   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
 901   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
 902   tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
 903   tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
 904
 905   tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
 906   tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
 907   tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
 908
 909   /* Sums of Squares */
 910   tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL);
 911   tab_double (t, 2, 3, 0, linreg_sst (c), NULL);
 912   tab_double (t, 2, 2, 0, linreg_sse (c), NULL);
 913
 914
 915   /* Degrees of freedom */
 916   tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm);
 917   tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe);
 918   tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft);
 919
 920   /* Mean Squares */
 921   tab_double (t, 4, 1, TAB_RIGHT, msm, NULL);
 922   tab_double (t, 4, 2, TAB_RIGHT, mse, NULL);
 923
 924   tab_double (t, 5, 1, 0, F, NULL);
 925
 926   tab_double (t, 6, 1, 0, pval, NULL);
 927
 928   tab_title (t, _("ANOVA (%s)"), var_to_string (var));
 929   tab_submit (t);
 930 }
 931
 932
 933 static void
 934 reg_stats_bcov (const linreg * c, const struct variable *var)
 935 {
 936   int n_cols;
 937   int n_rows;
 938   int i;
 939   int k;
 940   int row;
 941   int col;
 942   const char *label;
 943   struct tab_table *t;
 944
 945   assert (c != NULL);
 946   n_cols = c->n_indeps + 1 + 2;
 947   n_rows = 2 * (c->n_indeps + 1);
 948   t = tab_create (n_cols, n_rows);
 949   tab_headers (t, 2, 0, 1, 0);
 950   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 951   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 952   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 953   tab_vline (t, TAL_0, 1, 0, 0);
 954   tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
 955   tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
 956   for (i = 0; i < linreg_n_coeffs (c); i++)
 957     {
 958       const struct variable *v = linreg_indep_var (c, i);
 959       label = var_to_string (v);
 960       tab_text (t, 2, i, TAB_CENTER, label);
 961       tab_text (t, i + 2, 0, TAB_CENTER, label);
 962       for (k = 1; k < linreg_n_coeffs (c); k++)
 963         {
 964           col = (i <= k) ? k : i;
 965           row = (i <= k) ? i : k;
 966           tab_double (t, k + 2, i, TAB_CENTER,
 967                       gsl_matrix_get (c->cov, row, col), NULL);
 968         }
 969     }
 970   tab_title (t, _("Coefficient Correlations (%s)"), var_to_string (var));
 971   tab_submit (t);
 972 }
 973