pintos-os.org Git - pspp/blob - src/language/stats/regression.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014,
   3    2016, 2017 Free Software Foundation, Inc.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  17
  18 #include <config.h>
  19
  20 #include <float.h>
  21 #include <stdbool.h>
  22
  23 #include <gsl/gsl_math.h>
  24 #include <gsl/gsl_cdf.h>
  25 #include <gsl/gsl_matrix.h>
  26
  27 #include <data/dataset.h>
  28 #include <data/casewriter.h>
  29
  30 #include "language/command.h"
  31 #include "language/lexer/lexer.h"
  32 #include "language/lexer/value-parser.h"
  33 #include "language/lexer/variable-parser.h"
  34
  35
  36 #include "data/casegrouper.h"
  37 #include "data/casereader.h"
  38 #include "data/dictionary.h"
  39
  40 #include "math/covariance.h"
  41 #include "math/linreg.h"
  42 #include "math/moments.h"
  43
  44 #include "libpspp/message.h"
  45 #include "libpspp/taint.h"
  46
  47 #include "output/tab.h"
  48
  49 #include "gettext.h"
  50 #define _(msgid) gettext (msgid)
  51 #define N_(msgid) msgid
  52
  53
  54 #include <gl/intprops.h>
  55
/* Bit flags for the keywords of the STATISTICS subcommand.  They are
   OR-ed together and stored in the `stats' member of struct regression. */
#define STATS_R      1
#define STATS_COEFF  2
#define STATS_ANOVA  4
#define STATS_OUTS   8
#define STATS_CI    16
#define STATS_BCOV  32

/* The set used when STATISTICS is not given, and the set added by the
   DEFAULTS keyword. */
#define STATS_DEFAULT  (STATS_R | STATS_COEFF | STATS_ANOVA | STATS_OUTS)
  64
  65
  66
/* Settings of one REGRESSION command, as filled in by cmd_regression. */
struct regression
{
  /* The dataset the command was invoked on. */
  struct dataset *ds;

  /* Variables parsed from VARIABLES or METHOD=ENTER; when neither
     supplied a list, every variable in the dictionary. */
  const struct variable **vars;
  size_t n_vars;

  /* Variables parsed from DEPENDENT.  One model is fitted per entry. */
  const struct variable **dep_vars;
  size_t n_dep_vars;

  /* OR of STATS_* flags selecting the tables to print. */
  unsigned int stats;
  /* Confidence level as a fraction (0.95 unless CI (n) overrides it). */
  double ci;

  /* True if SAVE requested RESID, respectively PRED. */
  bool resid;
  bool pred;

  /* True if ORIGIN was given; the "(Constant)" row is then left out of
     the coefficients table. */
  bool origin;
};
  85
/* State needed to implement /SAVE: the variables to be filled in and
   the temporary storage for their values. */
struct regression_workspace
{
  /* The new variables which will be introduced by /SAVE */
  const struct variable **predvars;
  const struct variable **residvars;

  /* A reader/writer pair to temporarily hold the
     values of the new variables */
  struct casewriter *writer;
  struct casereader *reader;

  /* Indices of the new values in the reader/writer (-1 if not applicable) */
  int res_idx;
  int pred_idx;

  /* 0, 1 or 2 depending on what new variables are to be created */
  int extras;
};
 104
/* Fits one model per dependent variable of CMD using the cases in INPUT,
   and writes predicted values and residuals through WS when /SAVE asked
   for them.  INPUT is destroyed by the call. */
static void run_regression (const struct regression *cmd,
                            struct regression_workspace *ws,
                            struct casereader *input);
 108
 109
 110 /* Return a string based on PREFIX which may be used as the name
 111    of a new variable in DICT */
 112 static char *
 113 reg_get_name (const struct dictionary *dict, const char *prefix)
 114 {
 115   char *name;
 116   int i;
 117
 118   /* XXX handle too-long prefixes */
 119   name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1);
 120   for (i = 1;; i++)
 121     {
 122       sprintf (name, "%s%d", prefix, i);
 123       if (dict_lookup_var (dict, name) == NULL)
 124         return name;
 125     }
 126 }
 127
 128
 129 static const struct variable *
 130 create_aux_var (struct dataset *ds, const char *prefix)
 131 {
 132   struct variable *var;
 133   struct dictionary *dict = dataset_dict (ds);
 134   char *name = reg_get_name (dict, prefix);
 135   var = dict_create_var_assert (dict, name, 0);
 136   free (name);
 137   return var;
 138 }
 139
/* Auxiliary data for the transformation installed when /SAVE is entered */
struct save_trans_data
{
  /* Number of dependent variables, i.e. of models whose values were saved. */
  int n_dep_vars;
  /* Heap copy of the command's workspace; released by save_trans_free. */
  struct regression_workspace *ws;
};
 146
 147 static bool
 148 save_trans_free (void *aux)
 149 {
 150   struct save_trans_data *save_trans_data = aux;
 151   free (save_trans_data->ws->predvars);
 152   free (save_trans_data->ws->residvars);
 153
 154   casereader_destroy (save_trans_data->ws->reader);
 155   free (save_trans_data->ws);
 156   free (save_trans_data);
 157   return true;
 158 }
 159
 160 static int
 161 save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED)
 162 {
 163   struct save_trans_data *save_trans_data = aux;
 164   struct regression_workspace *ws = save_trans_data->ws;
 165   struct ccase *in =  casereader_read (ws->reader);
 166
 167   if (in)
 168     {
 169       int k;
 170       *c = case_unshare (*c);
 171
 172       for (k = 0; k < save_trans_data->n_dep_vars; ++k)
 173         {
 174           if (ws->pred_idx != -1)
 175             {
 176               double pred = case_data_idx (in, ws->extras * k + ws->pred_idx)->f;
 177               case_data_rw (*c, ws->predvars[k])->f = pred;
 178             }
 179
 180           if (ws->res_idx != -1)
 181             {
 182               double resid = case_data_idx (in, ws->extras * k + ws->res_idx)->f;
 183               case_data_rw (*c, ws->residvars[k])->f = resid;
 184             }
 185         }
 186       case_unref (in);
 187     }
 188
 189   return TRNS_CONTINUE;
 190 }
 191
 192
 193 int
 194 cmd_regression (struct lexer *lexer, struct dataset *ds)
 195 {
 196   struct regression_workspace workspace;
 197   struct regression regression;
 198   const struct dictionary *dict = dataset_dict (ds);
 199   bool save;
 200
 201   memset (&regression, 0, sizeof (struct regression));
 202
 203   regression.ci = 0.95;
 204   regression.stats = STATS_DEFAULT;
 205   regression.pred = false;
 206   regression.resid = false;
 207
 208   regression.ds = ds;
 209   regression.origin = false;
 210
 211   bool variables_seen = false;
 212   bool method_seen = false;
 213   bool dependent_seen = false;
 214   while (lex_token (lexer) != T_ENDCMD)
 215     {
 216       lex_match (lexer, T_SLASH);
 217
 218       if (lex_match_id (lexer, "VARIABLES"))
 219         {
 220           if (method_seen)
 221             {
 222               msg (SE, _("VARIABLES may not appear after %s"), "METHOD");
 223               goto error;
 224             }
 225           if (dependent_seen)
 226             {
 227               msg (SE, _("VARIABLES may not appear after %s"), "DEPENDENT");
 228               goto error;
 229             }
 230           variables_seen = true;
 231           lex_match (lexer, T_EQUALS);
 232
 233           if (!parse_variables_const (lexer, dict,
 234                                       &regression.vars, &regression.n_vars,
 235                                       PV_NO_DUPLICATE | PV_NUMERIC))
 236             goto error;
 237         }
 238       else if (lex_match_id (lexer, "DEPENDENT"))
 239         {
 240           dependent_seen = true;
 241           lex_match (lexer, T_EQUALS);
 242
 243           free (regression.dep_vars);
 244           regression.n_dep_vars = 0;
 245
 246           if (!parse_variables_const (lexer, dict,
 247                                       &regression.dep_vars,
 248                                       &regression.n_dep_vars,
 249                                       PV_NO_DUPLICATE | PV_NUMERIC))
 250             goto error;
 251         }
 252       else if (lex_match_id (lexer, "ORIGIN"))
 253         {
 254           regression.origin = true;
 255         }
 256       else if (lex_match_id (lexer, "NOORIGIN"))
 257         {
 258           regression.origin = false;
 259         }
 260       else if (lex_match_id (lexer, "METHOD"))
 261         {
 262           method_seen = true;
 263           lex_match (lexer, T_EQUALS);
 264
 265           if (!lex_force_match_id (lexer, "ENTER"))
 266             {
 267               goto error;
 268             }
 269
 270           if (! variables_seen)
 271             {
 272               if (!parse_variables_const (lexer, dict,
 273                                           &regression.vars, &regression.n_vars,
 274                                           PV_NO_DUPLICATE | PV_NUMERIC))
 275                 goto error;
 276             }
 277         }
 278       else if (lex_match_id (lexer, "STATISTICS"))
 279         {
 280           unsigned long statistics = 0;
 281           lex_match (lexer, T_EQUALS);
 282
 283           while (lex_token (lexer) != T_ENDCMD
 284                  && lex_token (lexer) != T_SLASH)
 285             {
 286               if (lex_match (lexer, T_ALL))
 287                 {
 288                   statistics = ~0;
 289                 }
 290               else if (lex_match_id (lexer, "DEFAULTS"))
 291                 {
 292                   statistics |= STATS_DEFAULT;
 293                 }
 294               else if (lex_match_id (lexer, "R"))
 295                 {
 296                   statistics |= STATS_R;
 297                 }
 298               else if (lex_match_id (lexer, "COEFF"))
 299                 {
 300                   statistics |= STATS_COEFF;
 301                 }
 302               else if (lex_match_id (lexer, "ANOVA"))
 303                 {
 304                   statistics |= STATS_ANOVA;
 305                 }
 306               else if (lex_match_id (lexer, "BCOV"))
 307                 {
 308                   statistics |= STATS_BCOV;
 309                 }
 310               else if (lex_match_id (lexer, "CI"))
 311                 {
 312                   statistics |= STATS_CI;
 313
 314                   if (lex_match (lexer, T_LPAREN) &&
 315                       lex_force_num (lexer))
 316                     {
 317                       regression.ci = lex_number (lexer) / 100.0;
 318                       lex_get (lexer);
 319                       if (! lex_force_match (lexer, T_RPAREN))
 320                         goto error;
 321                     }
 322                 }
 323               else
 324                 {
 325                   lex_error (lexer, NULL);
 326                   goto error;
 327                 }
 328             }
 329
 330           if (statistics)
 331             regression.stats = statistics;
 332
 333         }
 334       else if (lex_match_id (lexer, "SAVE"))
 335         {
 336           lex_match (lexer, T_EQUALS);
 337
 338           while (lex_token (lexer) != T_ENDCMD
 339                  && lex_token (lexer) != T_SLASH)
 340             {
 341               if (lex_match_id (lexer, "PRED"))
 342                 {
 343                   regression.pred = true;
 344                 }
 345               else if (lex_match_id (lexer, "RESID"))
 346                 {
 347                   regression.resid = true;
 348                 }
 349               else
 350                 {
 351                   lex_error (lexer, NULL);
 352                   goto error;
 353                 }
 354             }
 355         }
 356       else
 357         {
 358           lex_error (lexer, NULL);
 359           goto error;
 360         }
 361     }
 362
 363   if (!regression.vars)
 364     {
 365       dict_get_vars (dict, &regression.vars, &regression.n_vars, 0);
 366     }
 367
 368   save = regression.pred || regression.resid;
 369   workspace.extras = 0;
 370   workspace.res_idx = -1;
 371   workspace.pred_idx = -1;
 372   workspace.writer = NULL;
 373   workspace.reader = NULL;
 374   workspace.residvars = NULL;
 375   workspace.predvars = NULL;
 376   if (save)
 377     {
 378       int i;
 379       struct caseproto *proto = caseproto_create ();
 380
 381       if (regression.resid)
 382         {
 383           workspace.res_idx = workspace.extras ++;
 384           workspace.residvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.residvars));
 385
 386           for (i = 0; i < regression.n_dep_vars; ++i)
 387             {
 388               workspace.residvars[i] = create_aux_var (ds, "RES");
 389               proto = caseproto_add_width (proto, 0);
 390             }
 391         }
 392
 393       if (regression.pred)
 394         {
 395           workspace.pred_idx = workspace.extras ++;
 396           workspace.predvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.predvars));
 397
 398           for (i = 0; i < regression.n_dep_vars; ++i)
 399             {
 400               workspace.predvars[i] = create_aux_var (ds, "PRED");
 401               proto = caseproto_add_width (proto, 0);
 402             }
 403         }
 404
 405       if (proc_make_temporary_transformations_permanent (ds))
 406         msg (SW, _("REGRESSION with SAVE ignores TEMPORARY.  "
 407                    "Temporary transformations will be made permanent."));
 408
 409       if (dict_get_filter (dict))
 410         msg (SW, _("REGRESSION with SAVE ignores FILTER.  "
 411                    "All cases will be processed."));
 412
 413       workspace.writer = autopaging_writer_create (proto);
 414       caseproto_unref (proto);
 415     }
 416
 417
 418   {
 419     struct casegrouper *grouper;
 420     struct casereader *group;
 421     bool ok;
 422
 423     grouper = casegrouper_create_splits (proc_open_filtering (ds, !save), dict);
 424
 425
 426     while (casegrouper_get_next_group (grouper, &group))
 427       {
 428         run_regression (&regression,
 429                         &workspace,
 430                         group);
 431
 432       }
 433     ok = casegrouper_destroy (grouper);
 434     ok = proc_commit (ds) && ok;
 435   }
 436
 437   if (workspace.writer)
 438     {
 439       struct save_trans_data *save_trans_data = xmalloc (sizeof *save_trans_data);
 440       struct casereader *r = casewriter_make_reader (workspace.writer);
 441       workspace.writer = NULL;
 442       workspace.reader = r;
 443       save_trans_data->ws = xmalloc (sizeof (workspace));
 444       memcpy (save_trans_data->ws, &workspace, sizeof (workspace));
 445       save_trans_data->n_dep_vars = regression.n_dep_vars;
 446
 447       add_transformation (ds, save_trans_func, save_trans_free, save_trans_data);
 448     }
 449
 450
 451   free (regression.vars);
 452   free (regression.dep_vars);
 453   return CMD_SUCCESS;
 454
 455 error:
 456
 457   free (regression.vars);
 458   free (regression.dep_vars);
 459   return CMD_FAILURE;
 460 }
 461
 462 /* Return the size of the union of dependent and independent variables */
 463 static size_t
 464 get_n_all_vars (const struct regression *cmd)
 465 {
 466   size_t result = cmd->n_vars;
 467   size_t i;
 468   size_t j;
 469
 470   result += cmd->n_dep_vars;
 471   for (i = 0; i < cmd->n_dep_vars; i++)
 472     {
 473       for (j = 0; j < cmd->n_vars; j++)
 474         {
 475           if (cmd->vars[j] == cmd->dep_vars[i])
 476             {
 477               result--;
 478             }
 479         }
 480     }
 481   return result;
 482 }
 483
 484 /* Fill VARS with the union of dependent and independent variables */
 485 static void
 486 fill_all_vars (const struct variable **vars, const struct regression *cmd)
 487 {
 488   size_t x = 0;
 489   size_t i;
 490   for (i = 0; i < cmd->n_vars; i++)
 491     {
 492       vars[i] = cmd->vars[i];
 493     }
 494
 495   for (i = 0; i < cmd->n_dep_vars; i++)
 496     {
 497       size_t j;
 498       bool absent = true;
 499       for (j = 0; j < cmd->n_vars; j++)
 500         {
 501           if (cmd->dep_vars[i] == cmd->vars[j])
 502             {
 503               absent = false;
 504               break;
 505             }
 506         }
 507       if (absent)
 508         {
 509           vars[cmd->n_vars + x++] = cmd->dep_vars[i];
 510         }
 511     }
 512 }
 513
 514 /*
 515   Is variable k the dependent variable?
 516 */
 517 static bool
 518 is_depvar (const struct regression *cmd, size_t k, const struct variable *v)
 519 {
 520   return v == cmd->vars[k];
 521 }
 522
 523
 524 /* Identify the explanatory variables in v_variables.  Returns
 525    the number of independent variables. */
 526 static int
 527 identify_indep_vars (const struct regression *cmd,
 528                      const struct variable **indep_vars,
 529                      const struct variable *depvar)
 530 {
 531   int n_indep_vars = 0;
 532   int i;
 533
 534   for (i = 0; i < cmd->n_vars; i++)
 535     if (!is_depvar (cmd, i, depvar))
 536       indep_vars[n_indep_vars++] = cmd->vars[i];
 537   if ((n_indep_vars < 1) && is_depvar (cmd, 0, depvar))
 538     {
 539       /*
 540          There is only one independent variable, and it is the same
 541          as the dependent variable. Print a warning and continue.
 542        */
 543       msg (SW,
 544            gettext
 545            ("The dependent variable is equal to the independent variable. "
 546             "The least squares line is therefore Y=X. "
 547             "Standard errors and related statistics may be meaningless."));
 548       n_indep_vars = 1;
 549       indep_vars[0] = cmd->vars[0];
 550     }
 551   return n_indep_vars;
 552 }
 553
 554 static double
 555 fill_covariance (gsl_matrix * cov, struct covariance *all_cov,
 556                  const struct variable **vars,
 557                  size_t n_vars, const struct variable *dep_var,
 558                  const struct variable **all_vars, size_t n_all_vars,
 559                  double *means)
 560 {
 561   size_t i;
 562   size_t j;
 563   size_t dep_subscript;
 564   size_t *rows;
 565   const gsl_matrix *ssizes;
 566   const gsl_matrix *mean_matrix;
 567   const gsl_matrix *ssize_matrix;
 568   double result = 0.0;
 569
 570   const gsl_matrix *cm = covariance_calculate_unnormalized (all_cov);
 571
 572   if (cm == NULL)
 573     return 0;
 574
 575   rows = xnmalloc (cov->size1 - 1, sizeof (*rows));
 576
 577   for (i = 0; i < n_all_vars; i++)
 578     {
 579       for (j = 0; j < n_vars; j++)
 580         {
 581           if (vars[j] == all_vars[i])
 582             {
 583               rows[j] = i;
 584             }
 585         }
 586       if (all_vars[i] == dep_var)
 587         {
 588           dep_subscript = i;
 589         }
 590     }
 591   mean_matrix = covariance_moments (all_cov, MOMENT_MEAN);
 592   ssize_matrix = covariance_moments (all_cov, MOMENT_NONE);
 593   for (i = 0; i < cov->size1 - 1; i++)
 594     {
 595       means[i] = gsl_matrix_get (mean_matrix, rows[i], 0)
 596         / gsl_matrix_get (ssize_matrix, rows[i], 0);
 597       for (j = 0; j < cov->size2 - 1; j++)
 598         {
 599           gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j]));
 600           gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i]));
 601         }
 602     }
 603   means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0)
 604     / gsl_matrix_get (ssize_matrix, dep_subscript, 0);
 605   ssizes = covariance_moments (all_cov, MOMENT_NONE);
 606   result = gsl_matrix_get (ssizes, dep_subscript, rows[0]);
 607   for (i = 0; i < cov->size1 - 1; i++)
 608     {
 609       gsl_matrix_set (cov, i, cov->size1 - 1,
 610                       gsl_matrix_get (cm, rows[i], dep_subscript));
 611       gsl_matrix_set (cov, cov->size1 - 1, i,
 612                       gsl_matrix_get (cm, rows[i], dep_subscript));
 613       if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript))
 614         {
 615           result = gsl_matrix_get (ssizes, rows[i], dep_subscript);
 616         }
 617     }
 618   gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1,
 619                   gsl_matrix_get (cm, dep_subscript, dep_subscript));
 620   free (rows);
 621   return result;
 622 }
 623
 624 \f
 625
/*
  STATISTICS subcommand output functions.

  Each one builds and submits a single output table for the fitted model
  it is given.  The variable argument is the dependent variable, whose
  name appears in the table title.
*/
static void reg_stats_r (const struct linreg *,     const struct variable *);
static void reg_stats_coeff (const struct linreg *, const gsl_matrix *, const struct variable *, const struct regression *);
static void reg_stats_anova (const struct linreg *, const struct variable *);
static void reg_stats_bcov (const struct linreg *,  const struct variable *);
 633
 634
 635 static void
 636 subcommand_statistics (const struct regression *cmd, const struct linreg * c, const gsl_matrix * cm,
 637                        const struct variable *var)
 638 {
 639   if (cmd->stats & STATS_R)
 640     reg_stats_r     (c, var);
 641
 642   if (cmd->stats & STATS_ANOVA)
 643     reg_stats_anova (c, var);
 644
 645   if (cmd->stats & STATS_COEFF)
 646     reg_stats_coeff (c, cm, var, cmd);
 647
 648   if (cmd->stats & STATS_BCOV)
 649     reg_stats_bcov  (c, var);
 650 }
 651
 652
 653 static void
 654 run_regression (const struct regression *cmd,
 655                 struct regression_workspace *ws,
 656                 struct casereader *input)
 657 {
 658   size_t i;
 659   struct linreg **models;
 660
 661   int k;
 662   struct ccase *c;
 663   struct covariance *cov;
 664   struct casereader *reader;
 665   size_t n_all_vars = get_n_all_vars (cmd);
 666   const struct variable **all_vars = xnmalloc (n_all_vars, sizeof (*all_vars));
 667
 668   double *means = xnmalloc (n_all_vars, sizeof (*means));
 669
 670   fill_all_vars (all_vars, cmd);
 671   cov = covariance_1pass_create (n_all_vars, all_vars,
 672                                  dict_get_weight (dataset_dict (cmd->ds)),
 673                                  MV_ANY, cmd->origin == false);
 674
 675   reader = casereader_clone (input);
 676   reader = casereader_create_filter_missing (reader, all_vars, n_all_vars,
 677                                              MV_ANY, NULL, NULL);
 678
 679
 680   {
 681     struct casereader *r = casereader_clone (reader);
 682
 683     for (; (c = casereader_read (r)) != NULL; case_unref (c))
 684       {
 685         covariance_accumulate (cov, c);
 686       }
 687     casereader_destroy (r);
 688   }
 689
 690   models = xcalloc (cmd->n_dep_vars, sizeof (*models));
 691   for (k = 0; k < cmd->n_dep_vars; k++)
 692     {
 693       const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars));
 694       const struct variable *dep_var = cmd->dep_vars[k];
 695       int n_indep = identify_indep_vars (cmd, vars, dep_var);
 696       gsl_matrix *this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1);
 697       double n_data = fill_covariance (this_cm, cov, vars, n_indep,
 698                                 dep_var, all_vars, n_all_vars, means);
 699       models[k] = linreg_alloc (dep_var, vars,  n_data, n_indep, cmd->origin);
 700       for (i = 0; i < n_indep; i++)
 701         {
 702           linreg_set_indep_variable_mean (models[k], i, means[i]);
 703         }
 704       linreg_set_depvar_mean (models[k], means[i]);
 705       if (n_data > 0)
 706         {
 707           /*
 708              Find the least-squares estimates and other statistics.
 709            */
 710           linreg_fit (this_cm, models[k]);
 711
 712           if (!taint_has_tainted_successor (casereader_get_taint (input)))
 713             {
 714               subcommand_statistics (cmd, models[k], this_cm, dep_var);
 715             }
 716         }
 717       else
 718         {
 719           msg (SE, _("No valid data found. This command was skipped."));
 720         }
 721       gsl_matrix_free (this_cm);
 722       free (vars);
 723     }
 724
 725
 726   if (ws->extras > 0)
 727    {
 728       struct casereader *r = casereader_clone (reader);
 729
 730       for (; (c = casereader_read (r)) != NULL; case_unref (c))
 731         {
 732           struct ccase *outc = case_create (casewriter_get_proto (ws->writer));
 733           for (k = 0; k < cmd->n_dep_vars; k++)
 734             {
 735               const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars));
 736               const struct variable *dep_var = cmd->dep_vars[k];
 737               int n_indep = identify_indep_vars (cmd, vars, dep_var);
 738               double *vals = xnmalloc (n_indep, sizeof (*vals));
 739               for (i = 0; i < n_indep; i++)
 740                 {
 741                   const union value *tmp = case_data (c, vars[i]);
 742                   vals[i] = tmp->f;
 743                 }
 744
 745               if (cmd->pred)
 746                 {
 747                   double pred = linreg_predict (models[k], vals, n_indep);
 748                   case_data_rw_idx (outc, k * ws->extras + ws->pred_idx)->f = pred;
 749                 }
 750
 751               if (cmd->resid)
 752                 {
 753                   double obs = case_data (c, linreg_dep_var (models[k]))->f;
 754                   double res = linreg_residual (models[k], obs,  vals, n_indep);
 755                   case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res;
 756                 }
 757               free (vals);
 758               free (vars);
 759             }
 760           casewriter_write (ws->writer, outc);
 761         }
 762       casereader_destroy (r);
 763     }
 764
 765   casereader_destroy (reader);
 766
 767   for (k = 0; k < cmd->n_dep_vars; k++)
 768     {
 769       linreg_unref (models[k]);
 770     }
 771   free (models);
 772
 773   free (all_vars);
 774   free (means);
 775   casereader_destroy (input);
 776   covariance_destroy (cov);
 777 }
 778
 779 \f
 780
 781
 782 static void
 783 reg_stats_r (const struct linreg * c, const struct variable *var)
 784 {
 785   struct tab_table *t;
 786   int n_rows = 2;
 787   int n_cols = 5;
 788   double rsq;
 789   double adjrsq;
 790   double std_error;
 791
 792   assert (c != NULL);
 793   rsq = linreg_ssreg (c) / linreg_sst (c);
 794   adjrsq = rsq -
 795     (1.0 - rsq) * linreg_n_coeffs (c) / (linreg_n_obs (c) -
 796                                          linreg_n_coeffs (c) - 1);
 797   std_error = sqrt (linreg_mse (c));
 798   t = tab_create (n_cols, n_rows);
 799   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 800   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 801   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 802   tab_vline (t, TAL_0, 1, 0, 0);
 803
 804   tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
 805   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
 806   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
 807   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
 808   tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL, RC_OTHER);
 809   tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL, RC_OTHER);
 810   tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL, RC_OTHER);
 811   tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL, RC_OTHER);
 812   tab_title (t, _("Model Summary (%s)"), var_to_string (var));
 813   tab_submit (t);
 814 }
 815
 816 /*
 817   Table showing estimated regression coefficients.
 818 */
 819 static void
 820 reg_stats_coeff (const struct linreg * c, const gsl_matrix *cov, const struct variable *var, const struct regression *cmd)
 821 {
 822   size_t j;
 823   int n_cols = 7;
 824   const int heading_rows = 2;
 825   int n_rows;
 826   int this_row = heading_rows;
 827   double pval;
 828   double std_err;
 829   double beta;
 830   const char *label;
 831
 832   const struct variable *v;
 833   struct tab_table *t;
 834
 835   const double df = linreg_n_obs (c) - linreg_n_coeffs (c) - 1;
 836   double q = (1 - cmd->ci) / 2.0;  /* 2-tailed test */
 837   double tval = gsl_cdf_tdist_Qinv (q, df);
 838
 839   assert (c != NULL);
 840   n_rows = linreg_n_coeffs (c) + heading_rows + 1;
 841
 842   if (cmd->stats & STATS_CI)
 843     n_cols += 2;
 844
 845   t = tab_create (n_cols, n_rows);
 846   tab_headers (t, 2, 0, 1, 0);
 847   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 848   tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows);
 849   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 850   tab_vline (t, TAL_0, 1, 0, 0);
 851
 852
 853   tab_hline (t, TAL_1, 2, 4, 1);
 854   tab_joint_text (t, 2, 0, 3, 0, TAB_CENTER | TAT_TITLE, _("Unstandardized Coefficients"));
 855   tab_text (t, 2, 1, TAB_CENTER | TAT_TITLE, _("B"));
 856   tab_text (t, 3, 1, TAB_CENTER | TAT_TITLE, _("Std. Error"));
 857   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Standardized Coefficients"));
 858   tab_text (t, 4, 1, TAB_CENTER | TAT_TITLE, _("Beta"));
 859   tab_text (t, 5, 1, TAB_CENTER | TAT_TITLE, _("t"));
 860   tab_text (t, 6, 1, TAB_CENTER | TAT_TITLE, _("Sig."));
 861
 862   std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0));
 863
 864   if (cmd->stats & STATS_CI)
 865     {
 866       double lower = linreg_intercept (c) - tval * std_err ;
 867       double upper = linreg_intercept (c) + tval * std_err ;
 868       tab_double (t, 7, heading_rows, 0, lower, NULL, RC_OTHER);
 869       tab_double (t, 8, heading_rows, 0, upper, NULL, RC_OTHER);
 870
 871       tab_joint_text_format (t, 7, 0, 8, 0, TAB_CENTER | TAT_TITLE, _("%g%% Confidence Interval for B"), cmd->ci * 100);
 872       tab_hline (t, TAL_1, 7, 8, 1);
 873       tab_text (t, 7, 1, TAB_CENTER | TAT_TITLE, _("Lower Bound"));
 874       tab_text (t, 8, 1, TAB_CENTER | TAT_TITLE, _("Upper Bound"));
 875     }
 876
 877   if (!cmd->origin)
 878     {
 879       tab_text (t, 1, this_row, TAB_LEFT | TAT_TITLE, _("(Constant)"));
 880       tab_double (t, 2, this_row, 0, linreg_intercept (c), NULL, RC_OTHER);
 881       tab_double (t, 3, this_row, 0, std_err, NULL, RC_OTHER);
 882       tab_double (t, 4, this_row, 0, 0.0, NULL, RC_OTHER);
 883       double t_stat = linreg_intercept (c) / std_err;
 884       tab_double (t, 5, this_row, 0, t_stat, NULL, RC_OTHER);
 885
 886       double pval =
 887         2 * gsl_cdf_tdist_Q (fabs (t_stat),
 888                              (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
 889       tab_double (t, 6, this_row, 0, pval, NULL, RC_PVALUE);
 890       this_row++;
 891     }
 892
 893   for (j = 0; j < linreg_n_coeffs (c); j++, this_row++)
 894     {
 895       struct string tstr;
 896       ds_init_empty (&tstr);
 897
 898       v = linreg_indep_var (c, j);
 899       label = var_to_string (v);
 900       /* Do not overwrite the variable's name. */
 901       ds_put_cstr (&tstr, label);
 902       tab_text (t, 1, this_row, TAB_LEFT, ds_cstr (&tstr));
 903       /*
 904          Regression coefficients.
 905        */
 906       tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL, RC_OTHER);
 907       /*
 908          Standard error of the coefficients.
 909        */
 910       std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1));
 911       tab_double (t, 3, this_row, 0, std_err, NULL, RC_OTHER);
 912       /*
 913          Standardized coefficient, i.e., regression coefficient
 914          if all variables had unit variance.
 915        */
 916       beta = sqrt (gsl_matrix_get (cov, j, j));
 917       beta *= linreg_coeff (c, j) /
 918         sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1));
 919       tab_double (t, 4, this_row, 0, beta, NULL, RC_OTHER);
 920
 921       /*
 922          Test statistic for H0: coefficient is 0.
 923        */
 924       double t_stat = linreg_coeff (c, j) / std_err;
 925       tab_double (t, 5, this_row, 0, t_stat, NULL, RC_OTHER);
 926       /*
 927          P values for the test statistic above.
 928        */
 929       pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), df);
 930       tab_double (t, 6, this_row, 0, pval, NULL, RC_PVALUE);
 931       ds_destroy (&tstr);
 932
 933       if (cmd->stats & STATS_CI)
 934         {
 935           double lower = linreg_coeff (c, j)  - tval * std_err ;
 936           double upper = linreg_coeff (c, j)  + tval * std_err ;
 937
 938           tab_double (t, 7, this_row, 0, lower, NULL, RC_OTHER);
 939           tab_double (t, 8, this_row, 0, upper, NULL, RC_OTHER);
 940         }
 941     }
 942   tab_title (t, _("Coefficients (%s)"), var_to_string (var));
 943   tab_submit (t);
 944 }
 945
 946 /*
 947   Display the ANOVA table.
 948 */
 949 static void
 950 reg_stats_anova (const struct linreg * c, const struct variable *var)
 951 {
 952   int n_cols = 7;
 953   int n_rows = 4;
 954   const double msm = linreg_ssreg (c) / linreg_dfmodel (c);
 955   const double mse = linreg_mse (c);
 956   const double F = msm / mse;
 957   const double pval = gsl_cdf_fdist_Q (F, linreg_dfmodel (c),
 958                                        linreg_dferror (c));
 959
 960   struct tab_table *t;
 961
 962   assert (c != NULL);
 963   t = tab_create (n_cols, n_rows);
 964   tab_headers (t, 2, 0, 1, 0);
 965
 966   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
 967
 968   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
 969   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
 970   tab_vline (t, TAL_0, 1, 0, 0);
 971
 972   tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
 973   tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
 974   tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
 975   tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
 976   tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Sig."));
 977
 978   tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
 979   tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
 980   tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
 981
 982   /* Sums of Squares */
 983   tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL, RC_OTHER);
 984   tab_double (t, 2, 3, 0, linreg_sst (c), NULL, RC_OTHER);
 985   tab_double (t, 2, 2, 0, linreg_sse (c), NULL, RC_OTHER);
 986
 987
 988   /* Degrees of freedom */
 989   tab_text_format (t, 3, 1, TAB_RIGHT, "%.*g", DBL_DIG + 1, linreg_dfmodel (c));
 990   tab_text_format (t, 3, 2, TAB_RIGHT, "%.*g", DBL_DIG + 1, linreg_dferror (c));
 991   tab_text_format (t, 3, 3, TAB_RIGHT, "%.*g", DBL_DIG + 1, linreg_dftotal (c));
 992
 993   /* Mean Squares */
 994   tab_double (t, 4, 1, TAB_RIGHT, msm, NULL, RC_OTHER);
 995   tab_double (t, 4, 2, TAB_RIGHT, mse, NULL, RC_OTHER);
 996
 997   tab_double (t, 5, 1, 0, F, NULL, RC_OTHER);
 998
 999   tab_double (t, 6, 1, 0, pval, NULL, RC_PVALUE);
1000
1001   tab_title (t, _("ANOVA (%s)"), var_to_string (var));
1002   tab_submit (t);
1003 }
1004
1005
1006 static void
1007 reg_stats_bcov (const struct linreg * c, const struct variable *var)
1008 {
1009   int n_cols;
1010   int n_rows;
1011   int i;
1012   int k;
1013   int row;
1014   int col;
1015   const char *label;
1016   struct tab_table *t;
1017
1018   assert (c != NULL);
1019   n_cols = linreg_n_indeps (c) + 1 + 2;
1020   n_rows = 2 * (linreg_n_indeps (c) + 1);
1021   t = tab_create (n_cols, n_rows);
1022   tab_headers (t, 2, 0, 1, 0);
1023   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
1024   tab_hline (t, TAL_2, 0, n_cols - 1, 1);
1025   tab_vline (t, TAL_2, 2, 0, n_rows - 1);
1026   tab_vline (t, TAL_0, 1, 0, 0);
1027   tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
1028   tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
1029   for (i = 0; i < linreg_n_coeffs (c); i++)
1030     {
1031       const struct variable *v = linreg_indep_var (c, i);
1032       label = var_to_string (v);
1033       tab_text (t, 2, i, TAB_CENTER, label);
1034       tab_text (t, i + 2, 0, TAB_CENTER, label);
1035       for (k = 1; k < linreg_n_coeffs (c); k++)
1036         {
1037           col = (i <= k) ? k : i;
1038           row = (i <= k) ? i : k;
1039           tab_double (t, k + 2, i, TAB_CENTER,
1040                       gsl_matrix_get (linreg_cov (c), row, col), NULL, RC_OTHER);
1041         }
1042     }
1043   tab_title (t, _("Coefficient Correlations (%s)"), var_to_string (var));
1044   tab_submit (t);
1045 }
1046