pintos-os.org Git - pspp/blob - src/language/stats/logistic.c

   1 /* pspp - a program for statistical analysis.
   2    Copyright (C) 2012 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17
  18 /*
  19    References:
  20    1. "Coding Logistic Regression with Newton-Raphson", James McCaffrey
  21    http://msdn.microsoft.com/en-us/magazine/jj618304.aspx
  22
  23    2. "SPSS Statistical Algorithms" Chapter LOGISTIC REGRESSION Algorithms
  24
  25
  26    The Newton Raphson method finds successive approximations to $\bf b$ where
  27    approximation ${\bf b}_t$ is (hopefully) better than the previous ${\bf b}_{t-1}$.
  28
  29    $ {\bf b}_t = {\bf b}_{t -1} + ({\bf X}^T{\bf W}_{t-1}{\bf X})^{-1}{\bf X}^T({\bf y} - {\bf \pi}_{t-1})$
  30    where:
  31
  32    $\bf X$ is the $n \times p$ design matrix, $n$ being the number of cases,
  33    $p$ the number of parameters, \par
  34    $\bf W$ is the diagonal matrix whose diagonal elements are
  35    $\hat{\pi}_0(1 - \hat{\pi}_0), \, \hat{\pi}_1(1 - \hat{\pi}_2)\dots \hat{\pi}_{n-1}(1 - \hat{\pi}_{n-1})$
  36    \par
  37
  38 */
  39
  40 #include <config.h>
  41
  42 #include <gsl/gsl_blas.h>
  43
  44 #include <gsl/gsl_linalg.h>
  45 #include <gsl/gsl_cdf.h>
  46 #include <gsl/gsl_matrix.h>
  47 #include <gsl/gsl_vector.h>
  48 #include <math.h>
  49
  50 #include "data/case.h"
  51 #include "data/casegrouper.h"
  52 #include "data/casereader.h"
  53 #include "data/dataset.h"
  54 #include "data/dictionary.h"
  55 #include "data/format.h"
  56 #include "data/value.h"
  57 #include "language/command.h"
  58 #include "language/dictionary/split-file.h"
  59 #include "language/lexer/lexer.h"
  60 #include "language/lexer/value-parser.h"
  61 #include "language/lexer/variable-parser.h"
  62 #include "libpspp/assertion.h"
  63 #include "libpspp/ll.h"
  64 #include "libpspp/message.h"
  65 #include "libpspp/misc.h"
  66 #include "math/categoricals.h"
  67 #include "math/interaction.h"
  68 #include "libpspp/hmap.h"
  69 #include "libpspp/hash-functions.h"
  70
  71 #include "output/tab.h"
  72
  73 #include "gettext.h"
  74 #define _(msgid) gettext (msgid)
  75
  76
  77
  78
  79 #define   PRINT_EACH_STEP  0x01
  80 #define   PRINT_SUMMARY    0x02
  81 #define   PRINT_CORR       0x04
  82 #define   PRINT_ITER       0x08
  83 #define   PRINT_GOODFIT    0x10
  84 #define   PRINT_CI         0x20
  85
  86
  87 #define PRINT_DEFAULT (PRINT_SUMMARY | PRINT_EACH_STEP)
  88
  89 /*
  90   The constant parameters of the procedure.
  91   That is, those which are set by the user.
  92 */
  93 struct lr_spec
  94 {
  95   /* The dependent variable */
  96   const struct variable *dep_var;
  97
  98   /* The predictor variables (excluding categorical ones) */
  99   const struct variable **predictor_vars;
 100   size_t n_predictor_vars;
 101
 102   /* The categorical predictors */
 103   struct interaction **cat_predictors;
 104   size_t n_cat_predictors;
 105
 106
 107   /* The union of the categorical and non-categorical variables */
 108   const struct variable **indep_vars;
 109   size_t n_indep_vars;
 110
 111
 112   /* Which classes of missing vars are to be excluded */
 113   enum mv_class exclude;
 114
 115   /* The weight variable */
 116   const struct variable *wv;
 117
 118   /* The dictionary of the dataset */
 119   const struct dictionary *dict;
 120
 121   /* True iff the constant (intercept) is to be included in the model */
 122   bool constant;
 123
 124   /* Ths maximum number of iterations */
 125   int max_iter;
 126
 127   /* Other iteration limiting conditions */
 128   double bcon;
 129   double min_epsilon;
 130   double lcon;
 131
 132   /* The confidence interval (in percent) */
 133   int confidence;
 134
 135   /* What results should be presented */
 136   unsigned int print;
 137
 138   double cut_point;
 139 };
 140
 141
 142 /* The results and intermediate result of the procedure.
 143    These are mutated as the procedure runs. Used for
 144    temporary variables etc.
 145 */
 146 struct lr_result
 147 {
 148   /* Used to indicate if a pass should flag a warning when
 149      invalid (ie negative or missing) weight values are encountered */
 150   bool warn_bad_weight;
 151
 152   /* The two values of the dependent variable. */
 153   union value y0;
 154   union value y1;
 155
 156
 157   /* The sum of caseweights */
 158   double cc;
 159
 160   /* The number of missing and nonmissing cases */
 161   casenumber n_missing;
 162   casenumber n_nonmissing;
 163
 164
 165   gsl_matrix *hessian;
 166
 167   /* The categoricals and their payload. Null if  the analysis has no
 168    categorical predictors */
 169   struct categoricals *cats;
 170   struct payload cp;
 171
 172
 173   /* The estimates of the predictor coefficients */
 174   gsl_vector *beta_hat;
 175
 176   /* The predicted classifications:
 177      True Negative, True Positive, False Negative, False Positive */
 178   double tn, tp, fn, fp;
 179 };
 180
 181
 182 /*
 183   Convert INPUT into a dichotomous scalar, according to how the dependent variable's
 184   values are mapped.
 185   For simple cases, this is a 1:1 mapping
 186   The return value is always either 0 or 1
 187 */
 188 static double
 189 map_dependent_var (const struct lr_spec *cmd, const struct lr_result *res, const union value *input)
 190 {
 191   const int width = var_get_width (cmd->dep_var);
 192   if (value_equal (input, &res->y0, width))
 193     return 0;
 194
 195   if (value_equal (input, &res->y1, width))
 196     return 1;
 197
 198   /* This should never happen.  If it does,  then y0 and/or y1 have probably not been set */
 199   NOT_REACHED ();
 200
 201   return SYSMIS;
 202 }
 203
 204 static void output_classification_table (const struct lr_spec *cmd, const struct lr_result *res);
 205
 206 static void output_categories (const struct lr_spec *cmd, const struct lr_result *res);
 207
 208 static void output_depvarmap (const struct lr_spec *cmd, const struct lr_result *);
 209
 210 static void output_variables (const struct lr_spec *cmd,
 211                               const struct lr_result *);
 212
 213 static void output_model_summary (const struct lr_result *,
 214                                   double initial_likelihood, double likelihood);
 215
 216 static void case_processing_summary (const struct lr_result *);
 217
 218
 219 /* Return the value of case C corresponding to the INDEX'th entry in the
 220    model */
 221 static double
 222 predictor_value (const struct ccase *c,
 223                     const struct variable **x, size_t n_x,
 224                     const struct categoricals *cats,
 225                     size_t index)
 226 {
 227   /* Values of the scalar predictor variables */
 228   if (index < n_x)
 229     return case_data (c, x[index])->f;
 230
 231   /* Coded values of categorical predictor variables (or interactions) */
 232   if (cats && index - n_x  < categoricals_df_total (cats))
 233     {
 234       double x = categoricals_get_dummy_code_for_case (cats, index - n_x, c);
 235       return x;
 236     }
 237
 238   /* The constant term */
 239   return 1.0;
 240 }
 241
 242
 243 /*
 244   Return the probability beta_hat (that is the estimator logit(y) )
 245   corresponding to the coefficient estimator for case C
 246 */
 247 static double
 248 pi_hat (const struct lr_spec *cmd,
 249         const struct lr_result *res,
 250         const struct variable **x, size_t n_x,
 251         const struct ccase *c)
 252 {
 253   int v0;
 254   double pi = 0;
 255   size_t n_coeffs = res->beta_hat->size;
 256
 257   if (cmd->constant)
 258     {
 259       pi += gsl_vector_get (res->beta_hat, res->beta_hat->size - 1);
 260       n_coeffs--;
 261     }
 262
 263   for (v0 = 0; v0 < n_coeffs; ++v0)
 264     {
 265       pi += gsl_vector_get (res->beta_hat, v0) *
 266         predictor_value (c, x, n_x, res->cats, v0);
 267     }
 268
 269   pi = 1.0 / (1.0 + exp(-pi));
 270
 271   return pi;
 272 }
 273
 274
 275 /*
 276   Calculates the Hessian matrix X' V  X,
 277   where: X is the n by N_X matrix comprising the n cases in INPUT
 278   V is a diagonal matrix { (pi_hat_0)(1 - pi_hat_0), (pi_hat_1)(1 - pi_hat_1), ... (pi_hat_{N-1})(1 - pi_hat_{N-1})}
 279   (the partial derivative of the predicted values)
 280
 281   If ALL predicted values derivatives are close to zero or one, then CONVERGED
 282   will be set to true.
 283 */
 284 static void
 285 hessian (const struct lr_spec *cmd,
 286          struct lr_result *res,
 287          struct casereader *input,
 288          const struct variable **x, size_t n_x,
 289          bool *converged)
 290 {
 291   struct casereader *reader;
 292   struct ccase *c;
 293
 294   double max_w = -DBL_MAX;
 295
 296   gsl_matrix_set_zero (res->hessian);
 297
 298   for (reader = casereader_clone (input);
 299        (c = casereader_read (reader)) != NULL; case_unref (c))
 300     {
 301       int v0, v1;
 302       double pi = pi_hat (cmd, res, x, n_x, c);
 303
 304       double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
 305       double w = pi * (1 - pi);
 306       if (w > max_w)
 307         max_w = w;
 308       w *= weight;
 309
 310       for (v0 = 0; v0 < res->beta_hat->size; ++v0)
 311         {
 312           double in0 = predictor_value (c, x, n_x, res->cats, v0);
 313           for (v1 = 0; v1 < res->beta_hat->size; ++v1)
 314             {
 315               double in1 = predictor_value (c, x, n_x, res->cats, v1);
 316               double *o = gsl_matrix_ptr (res->hessian, v0, v1);
 317               *o += in0 * w * in1;
 318             }
 319         }
 320     }
 321   casereader_destroy (reader);
 322
 323   if ( max_w < cmd->min_epsilon)
 324     {
 325       *converged = true;
 326       msg (MN, _("All predicted values are either 1 or 0"));
 327     }
 328 }
 329
 330
 331 /* Calculates the value  X' (y - pi)
 332    where X is the design model,
 333    y is the vector of observed independent variables
 334    pi is the vector of estimates for y
 335
 336    Side effects:
 337      the likelihood is stored in LIKELIHOOD;
 338      the predicted values are placed in the respective tn, fn, tp fp values in RES
 339 */
 340 static gsl_vector *
 341 xt_times_y_pi (const struct lr_spec *cmd,
 342                struct lr_result *res,
 343                struct casereader *input,
 344                const struct variable **x, size_t n_x,
 345                const struct variable *y_var,
 346                double *likelihood)
 347 {
 348   struct casereader *reader;
 349   struct ccase *c;
 350   gsl_vector *output = gsl_vector_calloc (res->beta_hat->size);
 351
 352   *likelihood = 1.0;
 353   res->tn = res->tp = res->fn = res->fp = 0;
 354   for (reader = casereader_clone (input);
 355        (c = casereader_read (reader)) != NULL; case_unref (c))
 356     {
 357       double pred_y = 0;
 358       int v0;
 359       double pi = pi_hat (cmd, res, x, n_x, c);
 360       double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
 361
 362
 363       double y = map_dependent_var (cmd, res, case_data (c, y_var));
 364
 365       *likelihood *= pow (pi, weight * y) * pow (1 - pi, weight * (1 - y));
 366
 367       for (v0 = 0; v0 < res->beta_hat->size; ++v0)
 368         {
 369           double in0 = predictor_value (c, x, n_x, res->cats, v0);
 370           double *o = gsl_vector_ptr (output, v0);
 371           *o += in0 * (y - pi) * weight;
 372           pred_y += gsl_vector_get (res->beta_hat, v0) * in0;
 373         }
 374
 375       pred_y = 1 / (1.0 + exp(-pred_y));
 376       assert (pred_y >= 0);
 377       assert (pred_y <= 1);
 378
 379       if (pred_y <= cmd->cut_point)
 380         {
 381           if (y == 0)
 382             res->tn += weight;
 383           else
 384             res->fn += weight;
 385         }
 386       else
 387         {
 388           if (y == 0)
 389             res->fp += weight;
 390           else
 391             res->tp += weight;
 392         }
 393     }
 394
 395   casereader_destroy (reader);
 396
 397   return output;
 398 }
 399
 400 \f
 401
 402 /* "payload" functions for the categoricals.
 403    The only function is to accumulate the frequency of each
 404    category.
 405  */
 406
 407 static void *
 408 frq_create  (const void *aux1 UNUSED, void *aux2 UNUSED)
 409 {
 410   return xzalloc (sizeof (double));
 411 }
 412
 413 static void
 414 frq_update  (const void *aux1 UNUSED, void *aux2 UNUSED,
 415              void *ud, const struct ccase *c UNUSED , double weight)
 416 {
 417   double *freq = ud;
 418   *freq += weight;
 419 }
 420
 421 static void
 422 frq_destroy (const void *aux1 UNUSED, void *aux2 UNUSED, void *user_data UNUSED)
 423 {
 424   free (user_data);
 425 }
 426
 427 \f
 428
 429 /*
 430    Makes an initial pass though the data, doing the following:
 431
 432    * Checks that the dependent variable is  dichotomous,
 433    * Creates and initialises the categoricals,
 434    * Accumulates summary results,
 435    * Calculates necessary initial values.
 436    * Creates an initial value for \hat\beta the vector of beta_hats of \beta
 437
 438    Returns true if successful
 439 */
 440 static bool
 441 initial_pass (const struct lr_spec *cmd, struct lr_result *res, struct casereader *input)
 442 {
 443   const int width = var_get_width (cmd->dep_var);
 444
 445   struct ccase *c;
 446   struct casereader *reader;
 447
 448   double sum;
 449   double sumA = 0.0;
 450   double sumB = 0.0;
 451
 452   bool v0set = false;
 453   bool v1set = false;
 454
 455   size_t n_coefficients = cmd->n_predictor_vars;
 456   if (cmd->constant)
 457     n_coefficients++;
 458
 459   /* Create categoricals if appropriate */
 460   if (cmd->n_cat_predictors > 0)
 461     {
 462       res->cp.create = frq_create;
 463       res->cp.update = frq_update;
 464       res->cp.calculate = NULL;
 465       res->cp.destroy = frq_destroy;
 466
 467       res->cats = categoricals_create (cmd->cat_predictors, cmd->n_cat_predictors,
 468                                        cmd->wv, cmd->exclude, MV_ANY);
 469
 470       categoricals_set_payload (res->cats, &res->cp, cmd, res);
 471     }
 472
 473   res->cc = 0;
 474   for (reader = casereader_clone (input);
 475        (c = casereader_read (reader)) != NULL; case_unref (c))
 476     {
 477       int v;
 478       bool missing = false;
 479       double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight);
 480       const union value *depval = case_data (c, cmd->dep_var);
 481
 482       for (v = 0; v < cmd->n_indep_vars; ++v)
 483         {
 484           const union value *val = case_data (c, cmd->indep_vars[v]);
 485           if (var_is_value_missing (cmd->indep_vars[v], val, cmd->exclude))
 486             {
 487               missing = true;
 488               break;
 489             }
 490         }
 491
 492       /* Accumulate the missing and non-missing counts */
 493       if (missing)
 494         {
 495           res->n_missing++;
 496           continue;
 497         }
 498       res->n_nonmissing++;
 499
 500       /* Find the values of the dependent variable */
 501       if (!v0set)
 502         {
 503           value_clone (&res->y0, depval, width);
 504           v0set = true;
 505         }
 506       else if (!v1set)
 507         {
 508           if ( !value_equal (&res->y0, depval, width))
 509             {
 510               value_clone (&res->y1, depval, width);
 511               v1set = true;
 512             }
 513         }
 514       else
 515         {
 516           if (! value_equal (&res->y0, depval, width)
 517               &&
 518               ! value_equal (&res->y1, depval, width)
 519               )
 520             {
 521               msg (ME, _("Dependent variable's values are not dichotomous."));
 522               goto error;
 523             }
 524         }
 525
 526       if (v0set && value_equal (&res->y0, depval, width))
 527           sumA += weight;
 528
 529       if (v1set && value_equal (&res->y1, depval, width))
 530           sumB += weight;
 531
 532
 533       res->cc += weight;
 534
 535       categoricals_update (res->cats, c);
 536     }
 537   casereader_destroy (reader);
 538
 539   categoricals_done (res->cats);
 540
 541   sum = sumB;
 542
 543   /* Ensure that Y0 is less than Y1.  Otherwise the mapping gets
 544      inverted, which is confusing to users */
 545   if (var_is_numeric (cmd->dep_var) && value_compare_3way (&res->y0, &res->y1, width) > 0)
 546     {
 547       union value tmp;
 548       value_clone (&tmp, &res->y0, width);
 549       value_copy (&res->y0, &res->y1, width);
 550       value_copy (&res->y1, &tmp, width);
 551       value_destroy (&tmp, width);
 552       sum = sumA;
 553     }
 554
 555   n_coefficients += categoricals_df_total (res->cats);
 556   res->beta_hat = gsl_vector_calloc (n_coefficients);
 557
 558   if (cmd->constant)
 559     {
 560       double mean = sum / res->cc;
 561       gsl_vector_set (res->beta_hat, res->beta_hat->size - 1, log (mean / (1 - mean)));
 562     }
 563
 564   return true;
 565
 566  error:
 567   casereader_destroy (reader);
 568   return false;
 569 }
 570
 571
 572
 573 /* Start of the logistic regression routine proper */
 574 static bool
 575 run_lr (const struct lr_spec *cmd, struct casereader *input,
 576         const struct dataset *ds UNUSED)
 577 {
 578   int i;
 579
 580   bool converged = false;
 581
 582   /* Set the likelihoods to a negative sentinel value */
 583   double likelihood = -1;
 584   double prev_likelihood = -1;
 585   double initial_likelihood = -1;
 586
 587   struct lr_result work;
 588   work.n_missing = 0;
 589   work.n_nonmissing = 0;
 590   work.warn_bad_weight = true;
 591   work.cats = NULL;
 592   work.beta_hat = NULL;
 593
 594   /* Get the initial estimates of \beta and their standard errors.
 595      And perform other auxilliary initialisation.  */
 596   if (! initial_pass (cmd, &work, input))
 597     return false;
 598
 599   for (i = 0; i < cmd->n_cat_predictors; ++i)
 600     {
 601       if (1 >= categoricals_n_count (work.cats, i))
 602         {
 603           struct string str;
 604           ds_init_empty (&str);
 605
 606           interaction_to_string (cmd->cat_predictors[i], &str);
 607
 608           msg (ME, _("Category %s does not have at least two distinct values. Logistic regression will not be run."),
 609                ds_cstr(&str));
 610           ds_destroy (&str);
 611           return false;
 612         }
 613     }
 614
 615   output_depvarmap (cmd, &work);
 616
 617   case_processing_summary (&work);
 618
 619
 620   input = casereader_create_filter_missing (input,
 621                                             cmd->indep_vars,
 622                                             cmd->n_indep_vars,
 623                                             cmd->exclude,
 624                                             NULL,
 625                                             NULL);
 626
 627
 628   work.hessian = gsl_matrix_calloc (work.beta_hat->size, work.beta_hat->size);
 629
 630   /* Start the Newton Raphson iteration process... */
 631   for( i = 0 ; i < cmd->max_iter ; ++i)
 632     {
 633       double min, max;
 634       gsl_vector *v ;
 635
 636
 637       hessian (cmd, &work, input,
 638                cmd->predictor_vars, cmd->n_predictor_vars,
 639                &converged);
 640
 641       gsl_linalg_cholesky_decomp (work.hessian);
 642       gsl_linalg_cholesky_invert (work.hessian);
 643
 644       v = xt_times_y_pi (cmd, &work, input,
 645                          cmd->predictor_vars, cmd->n_predictor_vars,
 646                          cmd->dep_var,
 647                          &likelihood);
 648
 649       {
 650         /* delta = M.v */
 651         gsl_vector *delta = gsl_vector_alloc (v->size);
 652         gsl_blas_dgemv (CblasNoTrans, 1.0, work.hessian, v, 0, delta);
 653         gsl_vector_free (v);
 654
 655
 656         gsl_vector_add (work.beta_hat, delta);
 657
 658         gsl_vector_minmax (delta, &min, &max);
 659
 660         if ( fabs (min) < cmd->bcon && fabs (max) < cmd->bcon)
 661           {
 662             msg (MN, _("Estimation terminated at iteration number %d because parameter estimates changed by less than %g"),
 663                  i + 1, cmd->bcon);
 664             converged = true;
 665           }
 666
 667         gsl_vector_free (delta);
 668       }
 669
 670       if ( prev_likelihood >= 0)
 671         {
 672           if (-log (likelihood) > -(1.0 - cmd->lcon) * log (prev_likelihood))
 673             {
 674               msg (MN, _("Estimation terminated at iteration number %d because Log Likelihood decreased by less than %g%%"), i + 1, 100 * cmd->lcon);
 675               converged = true;
 676             }
 677         }
 678       if (i == 0)
 679         initial_likelihood = likelihood;
 680       prev_likelihood = likelihood;
 681
 682       if (converged)
 683         break;
 684     }
 685   casereader_destroy (input);
 686   assert (initial_likelihood >= 0);
 687
 688   if ( ! converged)
 689     msg (MW, _("Estimation terminated at iteration number %d because maximum iterations has been reached"), i );
 690
 691
 692   output_model_summary (&work, initial_likelihood, likelihood);
 693
 694   if (work.cats)
 695     output_categories (cmd, &work);
 696
 697   output_classification_table (cmd, &work);
 698   output_variables (cmd, &work);
 699
 700   gsl_matrix_free (work.hessian);
 701   gsl_vector_free (work.beta_hat);
 702
 703   categoricals_destroy (work.cats);
 704
 705   return true;
 706 }
 707
 708 struct variable_node
 709 {
 710   struct hmap_node node;      /* Node in hash map. */
 711   const struct variable *var; /* The variable */
 712 };
 713
 714 static struct variable_node *
 715 lookup_variable (const struct hmap *map, const struct variable *var, unsigned int hash)
 716 {
 717   struct variable_node *vn = NULL;
 718   HMAP_FOR_EACH_WITH_HASH (vn, struct variable_node, node, hash, map)
 719     {
 720       if (vn->var == var)
 721         break;
 722
 723       fprintf (stderr, "Warning: Hash table collision\n");
 724     }
 725
 726   return vn;
 727 }
 728
 729
 730 /* Parse the LOGISTIC REGRESSION command syntax */
 731 int
 732 cmd_logistic (struct lexer *lexer, struct dataset *ds)
 733 {
 734   /* Temporary location for the predictor variables.
 735      These may or may not include the categorical predictors */
 736   const struct variable **pred_vars;
 737   size_t n_pred_vars;
 738
 739   int v, x;
 740   struct lr_spec lr;
 741   lr.dict = dataset_dict (ds);
 742   lr.n_predictor_vars = 0;
 743   lr.predictor_vars = NULL;
 744   lr.exclude = MV_ANY;
 745   lr.wv = dict_get_weight (lr.dict);
 746   lr.max_iter = 20;
 747   lr.lcon = 0.0000;
 748   lr.bcon = 0.001;
 749   lr.min_epsilon = 0.00000001;
 750   lr.cut_point = 0.5;
 751   lr.constant = true;
 752   lr.confidence = 95;
 753   lr.print = PRINT_DEFAULT;
 754   lr.cat_predictors = NULL;
 755   lr.n_cat_predictors = 0;
 756   lr.indep_vars = NULL;
 757
 758
 759   if (lex_match_id (lexer, "VARIABLES"))
 760     lex_match (lexer, T_EQUALS);
 761
 762   if (! (lr.dep_var = parse_variable_const (lexer, lr.dict)))
 763     goto error;
 764
 765   lex_force_match (lexer, T_WITH);
 766
 767   if (!parse_variables_const (lexer, lr.dict,
 768                               &pred_vars, &n_pred_vars,
 769                               PV_NO_DUPLICATE))
 770     goto error;
 771
 772
 773   while (lex_token (lexer) != T_ENDCMD)
 774     {
 775       lex_match (lexer, T_SLASH);
 776
 777       if (lex_match_id (lexer, "MISSING"))
 778         {
 779           lex_match (lexer, T_EQUALS);
 780           while (lex_token (lexer) != T_ENDCMD
 781                  && lex_token (lexer) != T_SLASH)
 782             {
 783               if (lex_match_id (lexer, "INCLUDE"))
 784                 {
 785                   lr.exclude = MV_SYSTEM;
 786                 }
 787               else if (lex_match_id (lexer, "EXCLUDE"))
 788                 {
 789                   lr.exclude = MV_ANY;
 790                 }
 791               else
 792                 {
 793                   lex_error (lexer, NULL);
 794                   goto error;
 795                 }
 796             }
 797         }
 798       else if (lex_match_id (lexer, "ORIGIN"))
 799         {
 800           lr.constant = false;
 801         }
 802       else if (lex_match_id (lexer, "NOORIGIN"))
 803         {
 804           lr.constant = true;
 805         }
 806       else if (lex_match_id (lexer, "NOCONST"))
 807         {
 808           lr.constant = false;
 809         }
 810       else if (lex_match_id (lexer, "EXTERNAL"))
 811         {
 812           /* This is for compatibility.  It does nothing */
 813         }
 814       else if (lex_match_id (lexer, "CATEGORICAL"))
 815         {
 816           lex_match (lexer, T_EQUALS);
 817           do
 818             {
 819               lr.cat_predictors = xrealloc (lr.cat_predictors,
 820                                   sizeof (*lr.cat_predictors) * ++lr.n_cat_predictors);
 821               lr.cat_predictors[lr.n_cat_predictors - 1] = 0;
 822             }
 823           while (parse_design_interaction (lexer, lr.dict,
 824                                            lr.cat_predictors + lr.n_cat_predictors - 1));
 825           lr.n_cat_predictors--;
 826         }
 827       else if (lex_match_id (lexer, "PRINT"))
 828         {
 829           lex_match (lexer, T_EQUALS);
 830           while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
 831             {
 832               if (lex_match_id (lexer, "DEFAULT"))
 833                 {
 834                   lr.print |= PRINT_DEFAULT;
 835                 }
 836               else if (lex_match_id (lexer, "SUMMARY"))
 837                 {
 838                   lr.print |= PRINT_SUMMARY;
 839                 }
 840 #if 0
 841               else if (lex_match_id (lexer, "CORR"))
 842                 {
 843                   lr.print |= PRINT_CORR;
 844                 }
 845               else if (lex_match_id (lexer, "ITER"))
 846                 {
 847                   lr.print |= PRINT_ITER;
 848                 }
 849               else if (lex_match_id (lexer, "GOODFIT"))
 850                 {
 851                   lr.print |= PRINT_GOODFIT;
 852                 }
 853 #endif
 854               else if (lex_match_id (lexer, "CI"))
 855                 {
 856                   lr.print |= PRINT_CI;
 857                   if (lex_force_match (lexer, T_LPAREN))
 858                     {
 859                       if (! lex_force_int (lexer))
 860                         {
 861                           lex_error (lexer, NULL);
 862                           goto error;
 863                         }
 864                       lr.confidence = lex_integer (lexer);
 865                       lex_get (lexer);
 866                       if ( ! lex_force_match (lexer, T_RPAREN))
 867                         {
 868                           lex_error (lexer, NULL);
 869                           goto error;
 870                         }
 871                     }
 872                 }
 873               else if (lex_match_id (lexer, "ALL"))
 874                 {
 875                   lr.print = ~0x0000;
 876                 }
 877               else
 878                 {
 879                   lex_error (lexer, NULL);
 880                   goto error;
 881                 }
 882             }
 883         }
 884       else if (lex_match_id (lexer, "CRITERIA"))
 885         {
 886           lex_match (lexer, T_EQUALS);
 887           while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
 888             {
 889               if (lex_match_id (lexer, "BCON"))
 890                 {
 891                   if (lex_force_match (lexer, T_LPAREN))
 892                     {
 893                       if (! lex_force_num (lexer))
 894                         {
 895                           lex_error (lexer, NULL);
 896                           goto error;
 897                         }
 898                       lr.bcon = lex_number (lexer);
 899                       lex_get (lexer);
 900                       if ( ! lex_force_match (lexer, T_RPAREN))
 901                         {
 902                           lex_error (lexer, NULL);
 903                           goto error;
 904                         }
 905                     }
 906                 }
 907               else if (lex_match_id (lexer, "ITERATE"))
 908                 {
 909                   if (lex_force_match (lexer, T_LPAREN))
 910                     {
 911                       if (! lex_force_int (lexer))
 912                         {
 913                           lex_error (lexer, NULL);
 914                           goto error;
 915                         }
 916                       lr.max_iter = lex_integer (lexer);
 917                       lex_get (lexer);
 918                       if ( ! lex_force_match (lexer, T_RPAREN))
 919                         {
 920                           lex_error (lexer, NULL);
 921                           goto error;
 922                         }
 923                     }
 924                 }
 925               else if (lex_match_id (lexer, "LCON"))
 926                 {
 927                   if (lex_force_match (lexer, T_LPAREN))
 928                     {
 929                       if (! lex_force_num (lexer))
 930                         {
 931                           lex_error (lexer, NULL);
 932                           goto error;
 933                         }
 934                       lr.lcon = lex_number (lexer);
 935                       lex_get (lexer);
 936                       if ( ! lex_force_match (lexer, T_RPAREN))
 937                         {
 938                           lex_error (lexer, NULL);
 939                           goto error;
 940                         }
 941                     }
 942                 }
 943               else if (lex_match_id (lexer, "EPS"))
 944                 {
 945                   if (lex_force_match (lexer, T_LPAREN))
 946                     {
 947                       if (! lex_force_num (lexer))
 948                         {
 949                           lex_error (lexer, NULL);
 950                           goto error;
 951                         }
 952                       lr.min_epsilon = lex_number (lexer);
 953                       lex_get (lexer);
 954                       if ( ! lex_force_match (lexer, T_RPAREN))
 955                         {
 956                           lex_error (lexer, NULL);
 957                           goto error;
 958                         }
 959                     }
 960                 }
 961               else if (lex_match_id (lexer, "CUT"))
 962                 {
 963                   if (lex_force_match (lexer, T_LPAREN))
 964                     {
 965                       if (! lex_force_num (lexer))
 966                         {
 967                           lex_error (lexer, NULL);
 968                           goto error;
 969                         }
 970                       lr.cut_point = lex_number (lexer);
 971                       if (lr.cut_point < 0 || lr.cut_point > 1.0)
 972                         {
 973                           msg (ME, _("Cut point value must be in the range [0,1]"));
 974                           goto error;
 975                         }
 976                       lex_get (lexer);
 977                       if ( ! lex_force_match (lexer, T_RPAREN))
 978                         {
 979                           lex_error (lexer, NULL);
 980                           goto error;
 981                         }
 982                     }
 983                 }
 984               else
 985                 {
 986                   lex_error (lexer, NULL);
 987                   goto error;
 988                 }
 989             }
 990         }
 991       else
 992         {
 993           lex_error (lexer, NULL);
 994           goto error;
 995         }
 996     }
 997
 998   /* Copy the predictor variables from the temporary location into the
 999      final one, dropping any categorical variables which appear there.
1000      FIXME: This is O(NxM).
1001   */
1002
1003   {
1004   struct variable_node *vn, *next;
1005   struct hmap allvars;
1006   hmap_init (&allvars);
1007   for (v = x = 0; v < n_pred_vars; ++v)
1008     {
1009       bool drop = false;
1010       const struct variable *var = pred_vars[v];
1011       int cv = 0;
1012
1013       unsigned int hash = hash_pointer (var, 0);
1014       struct variable_node *vn = lookup_variable (&allvars, var, hash);
1015       if (vn == NULL)
1016         {
1017           vn = xmalloc (sizeof *vn);
1018           vn->var = var;
1019           hmap_insert (&allvars, &vn->node,  hash);
1020         }
1021
1022       for (cv = 0; cv < lr.n_cat_predictors ; ++cv)
1023         {
1024           int iv;
1025           const struct interaction *iact = lr.cat_predictors[cv];
1026           for (iv = 0 ; iv < iact->n_vars ; ++iv)
1027             {
1028               const struct variable *ivar = iact->vars[iv];
1029               unsigned int hash = hash_pointer (ivar, 0);
1030               struct variable_node *vn = lookup_variable (&allvars, ivar, hash);
1031               if (vn == NULL)
1032                 {
1033                   vn = xmalloc (sizeof *vn);
1034                   vn->var = ivar;
1035
1036                   hmap_insert (&allvars, &vn->node,  hash);
1037                 }
1038
1039               if (var == ivar)
1040                 {
1041                   drop = true;
1042                 }
1043             }
1044         }
1045
1046       if (drop)
1047         continue;
1048
1049       lr.predictor_vars = xrealloc (lr.predictor_vars, sizeof *lr.predictor_vars * (x + 1));
1050       lr.predictor_vars[x++] = var;
1051       lr.n_predictor_vars++;
1052     }
1053   free (pred_vars);
1054
1055   lr.n_indep_vars = hmap_count (&allvars);
1056   lr.indep_vars = xmalloc (lr.n_indep_vars * sizeof *lr.indep_vars);
1057
1058   /* Interate over each variable and push it into the array */
1059   x = 0;
1060   HMAP_FOR_EACH_SAFE (vn, next, struct variable_node, node, &allvars)
1061     {
1062       lr.indep_vars[x++] = vn->var;
1063       free (vn);
1064     }
1065   hmap_destroy (&allvars);
1066   }
1067
1068
1069   /* logistical regression for each split group */
1070   {
1071     struct casegrouper *grouper;
1072     struct casereader *group;
1073     bool ok;
1074
1075     grouper = casegrouper_create_splits (proc_open (ds), lr.dict);
1076     while (casegrouper_get_next_group (grouper, &group))
1077       ok = run_lr (&lr, group, ds);
1078     ok = casegrouper_destroy (grouper);
1079     ok = proc_commit (ds) && ok;
1080   }
1081
1082   free (lr.predictor_vars);
1083   free (lr.cat_predictors);
1084   free (lr.indep_vars);
1085
1086   return CMD_SUCCESS;
1087
1088  error:
1089
1090   free (lr.predictor_vars);
1091   free (lr.cat_predictors);
1092   free (lr.indep_vars);
1093
1094   return CMD_FAILURE;
1095 }
1096
1097
1098 \f
1099
1100 /* Show the Dependent Variable Encoding box.
1101    This indicates how the dependent variable
1102    is mapped to the internal zero/one values.
1103 */
1104 static void
1105 output_depvarmap (const struct lr_spec *cmd, const struct lr_result *res)
1106 {
1107   const int heading_columns = 0;
1108   const int heading_rows = 1;
1109   struct tab_table *t;
1110   struct string str;
1111
1112   const int nc = 2;
1113   int nr = heading_rows + 2;
1114
1115   t = tab_create (nc, nr);
1116   tab_title (t, _("Dependent Variable Encoding"));
1117
1118   tab_headers (t, heading_columns, 0, heading_rows, 0);
1119
1120   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1121
1122   tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1123   tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1124
1125   tab_text (t,  0, 0, TAB_CENTER | TAT_TITLE, _("Original Value"));
1126   tab_text (t,  1, 0, TAB_CENTER | TAT_TITLE, _("Internal Value"));
1127
1128
1129
1130   ds_init_empty (&str);
1131   var_append_value_name (cmd->dep_var, &res->y0, &str);
1132   tab_text (t,  0, 0 + heading_rows,  0, ds_cstr (&str));
1133
1134   ds_clear (&str);
1135   var_append_value_name (cmd->dep_var, &res->y1, &str);
1136   tab_text (t,  0, 1 + heading_rows,  0, ds_cstr (&str));
1137
1138
1139   tab_double (t, 1, 0 + heading_rows, 0, map_dependent_var (cmd, res, &res->y0), &F_8_0);
1140   tab_double (t, 1, 1 + heading_rows, 0, map_dependent_var (cmd, res, &res->y1), &F_8_0);
1141   ds_destroy (&str);
1142
1143   tab_submit (t);
1144 }
1145
1146
1147 /* Show the Variables in the Equation box */
1148 static void
1149 output_variables (const struct lr_spec *cmd,
1150                   const struct lr_result *res)
1151 {
1152   int row = 0;
1153   const int heading_columns = 1;
1154   int heading_rows = 1;
1155   struct tab_table *t;
1156
1157   int nc = 8;
1158   int nr ;
1159   int i = 0;
1160   int ivar = 0;
1161   int idx_correction = 0;
1162
1163   if (cmd->print & PRINT_CI)
1164     {
1165       nc += 2;
1166       heading_rows += 1;
1167       row++;
1168     }
1169   nr = heading_rows + cmd->n_predictor_vars;
1170   if (cmd->constant)
1171     nr++;
1172
1173   if (res->cats)
1174     nr += categoricals_df_total (res->cats) + cmd->n_cat_predictors;
1175
1176   t = tab_create (nc, nr);
1177   tab_title (t, _("Variables in the Equation"));
1178
1179   tab_headers (t, heading_columns, 0, heading_rows, 0);
1180
1181   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1182
1183   tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1184   tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1185
1186   tab_text (t,  0, row + 1, TAB_CENTER | TAT_TITLE, _("Step 1"));
1187
1188   tab_text (t,  2, row, TAB_CENTER | TAT_TITLE, _("B"));
1189   tab_text (t,  3, row, TAB_CENTER | TAT_TITLE, _("S.E."));
1190   tab_text (t,  4, row, TAB_CENTER | TAT_TITLE, _("Wald"));
1191   tab_text (t,  5, row, TAB_CENTER | TAT_TITLE, _("df"));
1192   tab_text (t,  6, row, TAB_CENTER | TAT_TITLE, _("Sig."));
1193   tab_text (t,  7, row, TAB_CENTER | TAT_TITLE, _("Exp(B)"));
1194
1195   if (cmd->print & PRINT_CI)
1196     {
1197       tab_joint_text_format (t, 8, 0, 9, 0,
1198                              TAB_CENTER | TAT_TITLE, _("%d%% CI for Exp(B)"), cmd->confidence);
1199
1200       tab_text (t,  8, row, TAB_CENTER | TAT_TITLE, _("Lower"));
1201       tab_text (t,  9, row, TAB_CENTER | TAT_TITLE, _("Upper"));
1202     }
1203
1204   for (row = heading_rows ; row < nr; ++row)
1205     {
1206       const int idx = row - heading_rows - idx_correction;
1207
1208       const double b = gsl_vector_get (res->beta_hat, idx);
1209       const double sigma2 = gsl_matrix_get (res->hessian, idx, idx);
1210       const double wald = pow2 (b) / sigma2;
1211       const double df = 1;
1212
1213       if (idx < cmd->n_predictor_vars)
1214         {
1215           tab_text (t, 1, row, TAB_LEFT | TAT_TITLE,
1216                     var_to_string (cmd->predictor_vars[idx]));
1217         }
1218       else if (i < cmd->n_cat_predictors)
1219         {
1220           double wald;
1221           bool summary = false;
1222           struct string str;
1223           const struct interaction *cat_predictors = cmd->cat_predictors[i];
1224           const int df = categoricals_df (res->cats, i);
1225
1226           ds_init_empty (&str);
1227           interaction_to_string (cat_predictors, &str);
1228
1229           if (ivar == 0)
1230             {
1231               /* Calculate the Wald statistic,
1232                  which is \beta' C^-1 \beta .
1233                  where \beta is the vector of the coefficient estimates comprising this
1234                  categorial variable. and C is the corresponding submatrix of the
1235                  hessian matrix.
1236               */
1237               gsl_matrix_const_view mv =
1238                 gsl_matrix_const_submatrix (res->hessian, idx, idx, df, df);
1239               gsl_matrix *subhessian = gsl_matrix_alloc (mv.matrix.size1, mv.matrix.size2);
1240               gsl_vector_const_view vv = gsl_vector_const_subvector (res->beta_hat, idx, df);
1241               gsl_vector *temp = gsl_vector_alloc (df);
1242
1243               gsl_matrix_memcpy (subhessian, &mv.matrix);
1244               gsl_linalg_cholesky_decomp (subhessian);
1245               gsl_linalg_cholesky_invert (subhessian);
1246
1247               gsl_blas_dgemv (CblasTrans, 1.0, subhessian, &vv.vector, 0, temp);
1248               gsl_blas_ddot (temp, &vv.vector, &wald);
1249
1250               tab_double (t, 4, row, 0, wald, 0);
1251               tab_double (t, 5, row, 0, df, &F_8_0);
1252               tab_double (t, 6, row, 0, gsl_cdf_chisq_Q (wald, df), 0);
1253
1254               idx_correction ++;
1255               summary = true;
1256               gsl_matrix_free (subhessian);
1257               gsl_vector_free (temp);
1258             }
1259           else
1260             {
1261               ds_put_format (&str, "(%d)", ivar);
1262             }
1263
1264           tab_text (t, 1, row, TAB_LEFT | TAT_TITLE, ds_cstr (&str));
1265           if (ivar++ == df)
1266             {
1267               ++i; /* next interaction */
1268               ivar = 0;
1269             }
1270
1271           ds_destroy (&str);
1272
1273           if (summary)
1274             continue;
1275         }
1276       else
1277         {
1278           tab_text (t, 1, row, TAB_LEFT | TAT_TITLE, _("Constant"));
1279         }
1280
1281       tab_double (t, 2, row, 0, b, 0);
1282       tab_double (t, 3, row, 0, sqrt (sigma2), 0);
1283       tab_double (t, 4, row, 0, wald, 0);
1284       tab_double (t, 5, row, 0, df, &F_8_0);
1285       tab_double (t, 6, row, 0, gsl_cdf_chisq_Q (wald, df), 0);
1286       tab_double (t, 7, row, 0, exp (b), 0);
1287
1288       if (cmd->print & PRINT_CI)
1289         {
1290           double wc = gsl_cdf_ugaussian_Pinv (0.5 + cmd->confidence / 200.0);
1291           wc *= sqrt (sigma2);
1292
1293           if (idx < cmd->n_predictor_vars)
1294             {
1295               tab_double (t, 8, row, 0, exp (b - wc), 0);
1296               tab_double (t, 9, row, 0, exp (b + wc), 0);
1297             }
1298         }
1299     }
1300
1301   tab_submit (t);
1302 }
1303
1304
1305 /* Show the model summary box */
1306 static void
1307 output_model_summary (const struct lr_result *res,
1308                       double initial_likelihood, double likelihood)
1309 {
1310   const int heading_columns = 0;
1311   const int heading_rows = 1;
1312   struct tab_table *t;
1313
1314   const int nc = 4;
1315   int nr = heading_rows + 1;
1316   double cox;
1317
1318   t = tab_create (nc, nr);
1319   tab_title (t, _("Model Summary"));
1320
1321   tab_headers (t, heading_columns, 0, heading_rows, 0);
1322
1323   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1324
1325   tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1326   tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1327
1328   tab_text (t,  0, 0, TAB_LEFT | TAT_TITLE, _("Step 1"));
1329   tab_text (t,  1, 0, TAB_CENTER | TAT_TITLE, _("-2 Log likelihood"));
1330   tab_double (t,  1, 1, 0, -2 * log (likelihood), 0);
1331
1332
1333   tab_text (t,  2, 0, TAB_CENTER | TAT_TITLE, _("Cox & Snell R Square"));
1334   cox =  1.0 - pow (initial_likelihood /likelihood, 2 / res->cc);
1335   tab_double (t,  2, 1, 0, cox, 0);
1336
1337   tab_text (t,  3, 0, TAB_CENTER | TAT_TITLE, _("Nagelkerke R Square"));
1338   tab_double (t,  3, 1, 0, cox / ( 1.0 - pow (initial_likelihood, 2 / res->cc)), 0);
1339
1340
1341   tab_submit (t);
1342 }
1343
1344 /* Show the case processing summary box */
1345 static void
1346 case_processing_summary (const struct lr_result *res)
1347 {
1348   const int heading_columns = 1;
1349   const int heading_rows = 1;
1350   struct tab_table *t;
1351
1352   const int nc = 3;
1353   const int nr = heading_rows + 3;
1354   casenumber total;
1355
1356   t = tab_create (nc, nr);
1357   tab_title (t, _("Case Processing Summary"));
1358
1359   tab_headers (t, heading_columns, 0, heading_rows, 0);
1360
1361   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1362
1363   tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1364   tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1365
1366   tab_text (t,  0, 0, TAB_LEFT | TAT_TITLE, _("Unweighted Cases"));
1367   tab_text (t,  1, 0, TAB_CENTER | TAT_TITLE, _("N"));
1368   tab_text (t,  2, 0, TAB_CENTER | TAT_TITLE, _("Percent"));
1369
1370
1371   tab_text (t,  0, 1, TAB_LEFT | TAT_TITLE, _("Included in Analysis"));
1372   tab_text (t,  0, 2, TAB_LEFT | TAT_TITLE, _("Missing Cases"));
1373   tab_text (t,  0, 3, TAB_LEFT | TAT_TITLE, _("Total"));
1374
1375   tab_double (t,  1, 1, 0, res->n_nonmissing, &F_8_0);
1376   tab_double (t,  1, 2, 0, res->n_missing, &F_8_0);
1377
1378   total = res->n_nonmissing + res->n_missing;
1379   tab_double (t,  1, 3, 0, total , &F_8_0);
1380
1381   tab_double (t,  2, 1, 0, 100 * res->n_nonmissing / (double) total, 0);
1382   tab_double (t,  2, 2, 0, 100 * res->n_missing / (double) total, 0);
1383   tab_double (t,  2, 3, 0, 100 * total / (double) total, 0);
1384
1385   tab_submit (t);
1386 }
1387
1388 static void
1389 output_categories (const struct lr_spec *cmd, const struct lr_result *res)
1390 {
1391   const struct fmt_spec *wfmt =
1392     cmd->wv ? var_get_print_format (cmd->wv) : &F_8_0;
1393
1394   int cumulative_df;
1395   int i = 0;
1396   const int heading_columns = 2;
1397   const int heading_rows = 2;
1398   struct tab_table *t;
1399
1400   int nc ;
1401   int nr ;
1402
1403   int v;
1404   int r = 0;
1405
1406   int max_df = 0;
1407   int total_cats = 0;
1408   for (i = 0; i < cmd->n_cat_predictors; ++i)
1409     {
1410       size_t n = categoricals_n_count (res->cats, i);
1411       size_t df = categoricals_df (res->cats, i);
1412       if (max_df < df)
1413         max_df = df;
1414       total_cats += n;
1415     }
1416
1417   nc = heading_columns + 1 + max_df;
1418   nr = heading_rows + total_cats;
1419
1420   t = tab_create (nc, nr);
1421   tab_title (t, _("Categorical Variables' Codings"));
1422
1423   tab_headers (t, heading_columns, 0, heading_rows, 0);
1424
1425   tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1426
1427   tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1428   tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1429
1430
1431   tab_text (t, heading_columns, 1, TAB_CENTER | TAT_TITLE, _("Frequency"));
1432
1433   tab_joint_text_format (t, heading_columns + 1, 0, nc - 1, 0,
1434                          TAB_CENTER | TAT_TITLE, _("Parameter coding"));
1435
1436
1437   for (i = 0; i < max_df; ++i)
1438     {
1439       int c = heading_columns + 1 + i;
1440       tab_text_format (t,  c, 1, TAB_CENTER | TAT_TITLE, _("(%d)"), i + 1);
1441     }
1442
1443   cumulative_df = 0;
1444   for (v = 0; v < cmd->n_cat_predictors; ++v)
1445     {
1446       int cat;
1447       const struct interaction *cat_predictors = cmd->cat_predictors[v];
1448       int df =  categoricals_df (res->cats, v);
1449       struct string str;
1450       ds_init_empty (&str);
1451
1452       interaction_to_string (cat_predictors, &str);
1453
1454       tab_text (t, 0, heading_rows + r, TAB_LEFT | TAT_TITLE, ds_cstr (&str) );
1455
1456       ds_destroy (&str);
1457
1458       for (cat = 0; cat < categoricals_n_count (res->cats, v) ; ++cat)
1459         {
1460           struct string str;
1461           const struct ccase *c = categoricals_get_case_by_category_real (res->cats, v, cat);
1462           const double *freq = categoricals_get_user_data_by_category_real (res->cats, v, cat);
1463
1464           int x;
1465           ds_init_empty (&str);
1466
1467           for (x = 0; x < cat_predictors->n_vars; ++x)
1468             {
1469               const union value *val = case_data (c, cat_predictors->vars[x]);
1470               var_append_value_name (cat_predictors->vars[x], val, &str);
1471
1472               if (x < cat_predictors->n_vars - 1)
1473                 ds_put_cstr (&str, " ");
1474             }
1475
1476           tab_text   (t, 1, heading_rows + r, 0, ds_cstr (&str));
1477           ds_destroy (&str);
1478           tab_double (t, 2, heading_rows + r, 0, *freq, wfmt);
1479
1480           for (x = 0; x < df; ++x)
1481             {
1482               tab_double (t, heading_columns + 1 + x, heading_rows + r, 0, (cat == x), &F_8_0);
1483             }
1484           ++r;
1485         }
1486       cumulative_df += df;
1487     }
1488
1489   tab_submit (t);
1490
1491 }
1492
1493
1494 static void
1495 output_classification_table (const struct lr_spec *cmd, const struct lr_result *res)
1496 {
1497   const struct fmt_spec *wfmt =
1498     cmd->wv ? var_get_print_format (cmd->wv) : &F_8_0;
1499
1500   const int heading_columns = 3;
1501   const int heading_rows = 3;
1502
1503   struct string sv0, sv1;
1504
1505   const int nc = heading_columns + 3;
1506   const int nr = heading_rows + 3;
1507
1508   struct tab_table *t = tab_create (nc, nr);
1509
1510   ds_init_empty (&sv0);
1511   ds_init_empty (&sv1);
1512
1513   tab_title (t, _("Classification Table"));
1514
1515   tab_headers (t, heading_columns, 0, heading_rows, 0);
1516
1517   tab_box (t, TAL_2, TAL_2, -1, -1, 0, 0, nc - 1, nr - 1);
1518   tab_box (t, -1, -1, -1, TAL_1, heading_columns, 0, nc - 1, nr - 1);
1519
1520   tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1521   tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1522
1523   tab_text (t,  0, heading_rows, TAB_CENTER | TAT_TITLE, _("Step 1"));
1524
1525
1526   tab_joint_text (t, heading_columns, 0, nc - 1, 0,
1527                   TAB_CENTER | TAT_TITLE, _("Predicted"));
1528
1529   tab_joint_text (t, heading_columns, 1, heading_columns + 1, 1,
1530                   0, var_to_string (cmd->dep_var) );
1531
1532   tab_joint_text (t, 1, 2, 2, 2,
1533                   TAB_LEFT | TAT_TITLE, _("Observed"));
1534
1535   tab_text (t, 1, 3, TAB_LEFT, var_to_string (cmd->dep_var) );
1536
1537
1538   tab_joint_text (t, nc - 1, 1, nc - 1, 2,
1539                   TAB_CENTER | TAT_TITLE, _("Percentage\nCorrect"));
1540
1541
1542   tab_joint_text (t, 1, nr - 1, 2, nr - 1,
1543                   TAB_LEFT | TAT_TITLE, _("Overall Percentage"));
1544
1545
1546   tab_hline (t, TAL_1, 1, nc - 1, nr - 1);
1547
1548   var_append_value_name (cmd->dep_var, &res->y0, &sv0);
1549   var_append_value_name (cmd->dep_var, &res->y1, &sv1);
1550
1551   tab_text (t, 2, heading_rows,     TAB_LEFT,  ds_cstr (&sv0));
1552   tab_text (t, 2, heading_rows + 1, TAB_LEFT,  ds_cstr (&sv1));
1553
1554   tab_text (t, heading_columns,     2, 0,  ds_cstr (&sv0));
1555   tab_text (t, heading_columns + 1, 2, 0,  ds_cstr (&sv1));
1556
1557   ds_destroy (&sv0);
1558   ds_destroy (&sv1);
1559
1560   tab_double (t, heading_columns, 3,     0, res->tn, wfmt);
1561   tab_double (t, heading_columns + 1, 4, 0, res->tp, wfmt);
1562
1563   tab_double (t, heading_columns + 1, 3, 0, res->fp, wfmt);
1564   tab_double (t, heading_columns,     4, 0, res->fn, wfmt);
1565
1566   tab_double (t, heading_columns + 2, 3, 0, 100 * res->tn / (res->tn + res->fp), 0);
1567   tab_double (t, heading_columns + 2, 4, 0, 100 * res->tp / (res->tp + res->fn), 0);
1568
1569   tab_double (t, heading_columns + 2, 5, 0,
1570               100 * (res->tp + res->tn) / (res->tp  + res->tn + res->fp + res->fn), 0);
1571
1572
1573   tab_submit (t);
1574 }