pintos-os.org Git - pspp/blob - src/language/stats/t-test.q

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include <gsl/gsl_cdf.h>
  20 #include <math.h>
  21 #include <stdint.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24
  25 #include <data/case.h>
  26 #include <data/casegrouper.h>
  27 #include <data/casereader.h>
  28 #include <data/dictionary.h>
  29 #include <data/procedure.h>
  30 #include <data/value-labels.h>
  31 #include <data/variable.h>
  32 #include <language/command.h>
  33 #include <language/dictionary/split-file.h>
  34 #include <language/lexer/lexer.h>
  35 #include <language/lexer/value-parser.h>
  36 #include <libpspp/array.h>
  37 #include <libpspp/assertion.h>
  38 #include <libpspp/compiler.h>
  39 #include <libpspp/hash.h>
  40 #include <libpspp/message.h>
  41 #include <libpspp/misc.h>
  42 #include <libpspp/str.h>
  43 #include <libpspp/taint.h>
  44 #include <math/group-proc.h>
  45 #include <math/levene.h>
  46 #include <math/correlation.h>
  47 #include <output/manager.h>
  48 #include <output/table.h>
  49 #include <data/format.h>
  50
  51 #include "minmax.h"
  52 #include "xalloc.h"
  53 #include "xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57
  58 /* (headers) */
  59
  60 /* (specification)
  61    "T-TEST" (tts_):
  62    +groups=custom;
  63    testval=double;
  64    +variables=varlist("PV_NO_SCRATCH | PV_NUMERIC");
  65    +pairs=custom;
  66    missing=miss:!analysis/listwise,
  67    incl:include/!exclude;
  68    +format=fmt:!labels/nolabels;
  69    criteria=:cin(d:criteria,"%s > 0. && %s < 1.").
  70 */
  71 /* (declarations) */
  72 /* (functions) */
  73
  74 enum comparison
  75   {
  76     CMP_LE,
  77     CMP_EQ,
  78   };
  79
  80 /* A pair of variables to be compared. */
  81 struct pair
  82   {
  83     const struct variable *v[2]; /* The paired variables. */
  84     double n;             /* The number of valid variable pairs */
  85     double sum[2];        /* The sum of the members */
  86     double ssq[2];        /* sum of squares of the members */
  87     double std_dev[2];    /* Std deviation of the members */
  88     double s_std_dev[2];  /* Sample Std deviation of the members */
  89     double mean[2];       /* The means of the members */
  90     double correlation;   /* Correlation coefficient between the variables. */
  91     double sum_of_diffs;  /* The sum of the differences */
  92     double sum_of_prod;   /* The sum of the products */
  93     double mean_diff;     /* The mean of the differences */
  94     double ssq_diffs;     /* The sum of the squares of the differences */
  95     double std_dev_diff;  /* The std deviation of the differences */
  96   };
  97
  98 /* Which mode was T-TEST invoked */
  99 enum t_test_mode {
 100   T_1_SAMPLE,                   /* One-sample tests. */
 101   T_IND_SAMPLES,                /* Independent-sample tests. */
 102   T_PAIRED                      /* Paired-sample tests. */
 103 };
 104
 105 /* Total state of a T-TEST procedure. */
 106 struct t_test_proc
 107   {
 108     enum t_test_mode mode;      /* Mode that T-TEST was invoked in. */
 109     double criteria;            /* Confidence interval in (0, 1). */
 110     enum mv_class exclude;      /* Classes of missing values to exclude. */
 111     bool listwise_missing;      /* Drop whole case if one missing var? */
 112     struct fmt_spec weight_format; /* Format of weight variable. */
 113
 114     /* Dependent variables. */
 115     const struct variable **vars;
 116     size_t n_vars;
 117
 118     /* For mode == T_1_SAMPLE. */
 119     double testval;
 120
 121     /* For mode == T_PAIRED only. */
 122     struct pair *pairs;
 123     size_t n_pairs;
 124
 125     /* For mode == T_IND_SAMPLES only. */
 126     struct variable *indep_var; /* Independent variable. */
 127     enum comparison criterion;  /* Type of comparison. */
 128     double critical_value;      /* CMP_LE only: Grouping threshold value. */
 129     union value g_value[2];     /* CMP_EQ only: Per-group indep var values. */
 130   };
 131
 132 /* Statistics Summary Box */
 133 struct ssbox
 134   {
 135     struct tab_table *t;
 136     void (*populate) (struct ssbox *, struct t_test_proc *);
 137     void (*finalize) (struct ssbox *);
 138   };
 139
 140 static void ssbox_create (struct ssbox *, struct t_test_proc *);
 141 static void ssbox_populate (struct ssbox *, struct t_test_proc *);
 142 static void ssbox_finalize (struct ssbox *);
 143
 144 /* Paired Samples Correlation box */
 145 static void pscbox (struct t_test_proc *);
 146
 147
 148 /* Test Results Box. */
 149 struct trbox {
 150   struct tab_table *t;
 151   void (*populate) (struct trbox *, struct t_test_proc *);
 152   void (*finalize) (struct trbox *);
 153   };
 154
 155 static void trbox_create (struct trbox *, struct t_test_proc *);
 156 static void trbox_populate (struct trbox *, struct t_test_proc *);
 157 static void trbox_finalize (struct trbox *);
 158
 159 static void calculate (struct t_test_proc *, struct casereader *,
 160                        const struct dataset *);
 161
 162 static int compare_group_binary (const struct group_statistics *a,
 163                                  const struct group_statistics *b,
 164                                  const struct t_test_proc *);
 165 static unsigned hash_group_binary (const struct group_statistics *g,
 166                                    const struct t_test_proc *p);
 167
 168 int
 169 cmd_t_test (struct lexer *lexer, struct dataset *ds)
 170 {
 171   struct cmd_t_test cmd;
 172   struct t_test_proc proc;
 173   struct casegrouper *grouper;
 174   struct casereader *group;
 175   struct variable *wv;
 176   bool ok = false;
 177
 178   proc.pairs = NULL;
 179   proc.n_pairs = 0;
 180   proc.vars = NULL;
 181   proc.indep_var = NULL;
 182   if (!parse_t_test (lexer, ds, &cmd, &proc))
 183     goto parse_failed;
 184
 185   wv = dict_get_weight (dataset_dict (ds));
 186   proc.weight_format = wv ? *var_get_print_format (wv) : F_8_0;
 187
 188   if ((cmd.sbc_testval != 0) + (cmd.sbc_groups != 0) + (cmd.sbc_pairs != 0)
 189       != 1)
 190     {
 191       msg (SE, _("Exactly one of TESTVAL, GROUPS and PAIRS subcommands "
 192                  "must be specified."));
 193       goto done;
 194     }
 195
 196   proc.mode = (cmd.sbc_testval ? T_1_SAMPLE
 197                : cmd.sbc_groups ? T_IND_SAMPLES
 198                : T_PAIRED);
 199   proc.criteria = cmd.sbc_criteria ? cmd.criteria : 0.95;
 200   proc.exclude = cmd.incl != TTS_INCLUDE ? MV_ANY : MV_SYSTEM;
 201   proc.listwise_missing = cmd.miss == TTS_LISTWISE;
 202
 203   if (proc.mode == T_1_SAMPLE)
 204     proc.testval = cmd.n_testval[0];
 205
 206   if (proc.mode == T_PAIRED)
 207     {
 208       size_t i, j;
 209
 210       if (cmd.sbc_variables)
 211         {
 212           msg (SE, _("VARIABLES subcommand may not be used with PAIRS."));
 213           goto done;
 214         }
 215
 216       /* Fill proc.vars with the unique variables from pairs. */
 217       proc.n_vars = proc.n_pairs * 2;
 218       proc.vars = xmalloc (sizeof *proc.vars * proc.n_vars);
 219       for (i = j = 0; i < proc.n_pairs; i++)
 220         {
 221           proc.vars[j++] = proc.pairs[i].v[0];
 222           proc.vars[j++] = proc.pairs[i].v[1];
 223         }
 224       proc.n_vars = sort_unique (proc.vars, proc.n_vars, sizeof *proc.vars,
 225                                  compare_var_ptrs_by_name, NULL);
 226     }
 227   else
 228     {
 229       if (!cmd.n_variables)
 230         {
 231           msg (SE, _("One or more VARIABLES must be specified."));
 232           goto done;
 233         }
 234       proc.n_vars = cmd.n_variables;
 235       proc.vars = cmd.v_variables;
 236       cmd.v_variables = NULL;
 237     }
 238
 239   /* Data pass. */
 240   grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
 241   while (casegrouper_get_next_group (grouper, &group))
 242     calculate (&proc, group, ds);
 243   ok = casegrouper_destroy (grouper);
 244   ok = proc_commit (ds) && ok;
 245
 246   if (proc.mode == T_IND_SAMPLES)
 247     {
 248       int v;
 249       /* Destroy any group statistics we created */
 250       for (v = 0; v < proc.n_vars; v++)
 251         {
 252           struct group_proc *grpp = group_proc_get (proc.vars[v]);
 253           hsh_destroy (grpp->group_hash);
 254         }
 255     }
 256
 257 done:
 258   free_t_test (&cmd);
 259 parse_failed:
 260   if (proc.indep_var != NULL)
 261     {
 262       int width = var_get_width (proc.indep_var);
 263       value_destroy (&proc.g_value[0], width);
 264       value_destroy (&proc.g_value[1], width);
 265     }
 266   free (proc.vars);
 267   free (proc.pairs);
 268   return ok ? CMD_SUCCESS : CMD_FAILURE;
 269 }
 270
 271 static int
 272 tts_custom_groups (struct lexer *lexer, struct dataset *ds,
 273                    struct cmd_t_test *cmd UNUSED, void *proc_)
 274 {
 275   struct t_test_proc *proc = proc_;
 276   int n_values;
 277   int width;
 278
 279   lex_match (lexer, '=');
 280
 281   proc->indep_var = parse_variable (lexer, dataset_dict (ds));
 282   if (proc->indep_var == NULL)
 283     {
 284       lex_error (lexer, "expecting variable name in GROUPS subcommand");
 285       return 0;
 286     }
 287   width = var_get_width (proc->indep_var);
 288   value_init (&proc->g_value[0], width);
 289   value_init (&proc->g_value[1], width);
 290
 291   if (!lex_match (lexer, '('))
 292     n_values = 0;
 293   else
 294     {
 295       if (!parse_value (lexer, &proc->g_value[0], width))
 296         return 0;
 297       lex_match (lexer, ',');
 298       if (lex_match (lexer, ')'))
 299         n_values = 1;
 300       else
 301         {
 302           if (!parse_value (lexer, &proc->g_value[1], width)
 303               || !lex_force_match (lexer, ')'))
 304             return 0;
 305           n_values = 2;
 306         }
 307     }
 308
 309   if (var_is_numeric (proc->indep_var))
 310     {
 311       proc->criterion = n_values == 1 ? CMP_LE : CMP_EQ;
 312       if (n_values == 1)
 313         proc->critical_value = proc->g_value[0].f;
 314       else if (n_values == 0)
 315         {
 316           proc->g_value[0].f = 1;
 317           proc->g_value[1].f = 2;
 318         }
 319     }
 320   else
 321     {
 322       proc->criterion = CMP_EQ;
 323       if (n_values != 2)
 324         {
 325           msg (SE, _("When applying GROUPS to a string variable, two "
 326                      "values must be specified."));
 327           return 0;
 328         }
 329     }
 330   return 1;
 331 }
 332
 333 static void
 334 add_pair (struct t_test_proc *proc,
 335           const struct variable *v0, const struct variable *v1)
 336 {
 337   struct pair *p = &proc->pairs[proc->n_pairs++];
 338   p->v[0] = v0;
 339   p->v[1] = v1;
 340 }
 341
 342 static int
 343 tts_custom_pairs (struct lexer *lexer, struct dataset *ds,
 344                   struct cmd_t_test *cmd UNUSED, void *proc_)
 345 {
 346   struct t_test_proc *proc = proc_;
 347
 348   const struct variable **vars1 = NULL;
 349   size_t n_vars1 = 0;
 350
 351   const struct variable **vars2 = NULL;
 352   size_t n_vars2 = 0;
 353
 354   bool paired = false;
 355
 356   size_t n_total_pairs;
 357   size_t i, j;
 358
 359   lex_match (lexer, '=');
 360
 361   if (!parse_variables_const (lexer, dataset_dict (ds), &vars1, &n_vars1,
 362                               PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH))
 363     return 0;
 364
 365   if (lex_match (lexer, T_WITH))
 366     {
 367       if (!parse_variables_const (lexer, dataset_dict (ds), &vars2, &n_vars2,
 368                                   PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH))
 369         {
 370           free (vars1);
 371           return 0;
 372         }
 373
 374       if (lex_match (lexer, '(')
 375           && lex_match_id (lexer, "PAIRED")
 376           && lex_match (lexer, ')'))
 377         {
 378           paired = true;
 379           if (n_vars1 != n_vars2)
 380             {
 381               msg (SE, _("PAIRED was specified but the number of variables "
 382                          "preceding WITH (%zu) did not match the number "
 383                          "following (%zu)."),
 384                    n_vars1, n_vars2);
 385               free (vars1);
 386               free (vars2);
 387               return 0;
 388             }
 389         }
 390     }
 391   else
 392     {
 393       if (n_vars1 < 2)
 394         {
 395           free (vars1);
 396           msg (SE, _("At least two variables must be specified on PAIRS."));
 397           return 0;
 398         }
 399     }
 400
 401   /* Allocate storage for the new pairs. */
 402   n_total_pairs = proc->n_pairs + (paired ? n_vars1
 403                                    : n_vars2 > 0 ? n_vars1 * n_vars2
 404                                    : n_vars1 * (n_vars1 - 1) / 2);
 405   proc->pairs = xnrealloc (proc->pairs, n_total_pairs, sizeof *proc->pairs);
 406
 407   /* Populate the pairs with the appropriate variables. */
 408   if (paired)
 409     for (i = 0; i < n_vars1; i++)
 410       add_pair (proc, vars1[i], vars2[i]);
 411   else if (n_vars2 > 0)
 412     for (i = 0; i < n_vars1; i++)
 413       for (j = 0; j < n_vars2; j++)
 414         add_pair (proc, vars1[i], vars2[j]);
 415   else
 416     for (i = 0; i < n_vars1; i++)
 417       for (j = i + 1; j < n_vars1; j++)
 418         add_pair (proc, vars1[i], vars1[j]);
 419   assert (proc->n_pairs == n_total_pairs);
 420
 421   free (vars1);
 422   free (vars2);
 423   return 1;
 424 }
 425 \f
 426 /* Implementation of the SSBOX object. */
 427
 428 static void ssbox_base_init (struct ssbox *, int cols, int rows);
 429 static void ssbox_base_finalize (struct ssbox *);
 430 static void ssbox_one_sample_init (struct ssbox *, struct t_test_proc *);
 431 static void ssbox_independent_samples_init (struct ssbox *, struct t_test_proc *);
 432 static void ssbox_paired_init (struct ssbox *, struct t_test_proc *);
 433
 434 /* Factory to create an ssbox. */
 435 static void
 436 ssbox_create (struct ssbox *ssb, struct t_test_proc *proc)
 437 {
 438   switch (proc->mode)
 439     {
 440     case T_1_SAMPLE:
 441       ssbox_one_sample_init (ssb, proc);
 442       break;
 443     case T_IND_SAMPLES:
 444       ssbox_independent_samples_init (ssb, proc);
 445       break;
 446     case T_PAIRED:
 447       ssbox_paired_init (ssb, proc);
 448       break;
 449     default:
 450       NOT_REACHED ();
 451     }
 452 }
 453
 454 /* Despatcher for the populate method */
 455 static void
 456 ssbox_populate (struct ssbox *ssb, struct t_test_proc *proc)
 457 {
 458   ssb->populate (ssb, proc);
 459 }
 460
 461 /* Despatcher for finalize */
 462 static void
 463 ssbox_finalize (struct ssbox *ssb)
 464 {
 465   ssb->finalize (ssb);
 466 }
 467
 468 /* Submit the box and clear up */
 469 static void
 470 ssbox_base_finalize (struct ssbox *ssb)
 471 {
 472   tab_submit (ssb->t);
 473 }
 474
 475 /* Initialize a ssbox struct */
 476 static void
 477 ssbox_base_init (struct ssbox *this, int cols, int rows)
 478 {
 479   this->finalize = ssbox_base_finalize;
 480   this->t = tab_create (cols, rows, 0);
 481
 482   tab_columns (this->t, SOM_COL_DOWN, 1);
 483   tab_headers (this->t, 0, 0, 1, 0);
 484   tab_box (this->t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1);
 485   tab_hline (this->t, TAL_2, 0, cols- 1, 1);
 486   tab_dim (this->t, tab_natural_dimensions, NULL);
 487 }
 488 \f
 489 /* ssbox implementations. */
 490
 491 static void ssbox_one_sample_populate (struct ssbox *, struct t_test_proc *);
 492 static void ssbox_independent_samples_populate (struct ssbox *,
 493                                                 struct t_test_proc *);
 494 static void ssbox_paired_populate (struct ssbox *, struct t_test_proc *);
 495
 496 /* Initialize the one_sample ssbox */
 497 static void
 498 ssbox_one_sample_init (struct ssbox *this, struct t_test_proc *proc)
 499 {
 500   const int hsize = 5;
 501   const int vsize = proc->n_vars + 1;
 502
 503   this->populate = ssbox_one_sample_populate;
 504
 505   ssbox_base_init (this, hsize, vsize);
 506   tab_title (this->t, _("One-Sample Statistics"));
 507   tab_vline (this->t, TAL_2, 1, 0, vsize - 1);
 508   tab_text (this->t, 1, 0, TAB_CENTER | TAT_TITLE, _("N"));
 509   tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("Mean"));
 510   tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
 511   tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean"));
 512 }
 513
 514 /* Initialize the independent samples ssbox */
 515 static void
 516 ssbox_independent_samples_init (struct ssbox *this, struct t_test_proc *proc)
 517 {
 518   int hsize=6;
 519   int vsize = proc->n_vars * 2 + 1;
 520
 521   this->populate = ssbox_independent_samples_populate;
 522
 523   ssbox_base_init (this, hsize, vsize);
 524   tab_vline (this->t, TAL_GAP, 1, 0, vsize - 1);
 525   tab_title (this->t, _("Group Statistics"));
 526   tab_text (this->t, 1, 0, TAB_CENTER | TAT_TITLE,
 527             var_get_name (proc->indep_var));
 528   tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("N"));
 529   tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("Mean"));
 530   tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
 531   tab_text (this->t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean"));
 532 }
 533
 534 /* Populate the ssbox for independent samples */
 535 static void
 536 ssbox_independent_samples_populate (struct ssbox *ssb,
 537                                     struct t_test_proc *proc)
 538 {
 539   int i;
 540
 541   char *val_lab[2];
 542   double indep_value[2];
 543
 544   char prefix[2][3];
 545
 546   for (i = 0; i < 2; i++)
 547     {
 548       union value *value = &proc->g_value[i];
 549       int width = var_get_width (proc->indep_var);
 550
 551       indep_value[i] = (proc->criterion == CMP_LE ? proc->critical_value
 552                         : value->f);
 553
 554       if (val_type_from_width (width) == VAL_NUMERIC)
 555         {
 556           const char *s = var_lookup_value_label (proc->indep_var, value);
 557           val_lab[i] = s ? xstrdup (s) : xasprintf ("%g", indep_value[i]);
 558         }
 559       else
 560         val_lab[i] = xmemdup0 (value_str (value, width), width);
 561     }
 562
 563   if (proc->criterion == CMP_LE)
 564     {
 565       strcpy (prefix[0], ">=");
 566       strcpy (prefix[1], "<");
 567     }
 568   else
 569     {
 570       strcpy (prefix[0], "");
 571       strcpy (prefix[1], "");
 572     }
 573
 574   for (i = 0; i < proc->n_vars; i++)
 575     {
 576       const struct variable *var = proc->vars[i];
 577       struct hsh_table *grp_hash = group_proc_get (var)->group_hash;
 578       int count=0;
 579
 580       tab_text (ssb->t, 0, i * 2 + 1, TAB_LEFT,
 581                 var_get_name (proc->vars[i]));
 582       tab_text_format (ssb->t, 1, i * 2 + 1, TAB_LEFT,
 583                        "%s%s", prefix[0], val_lab[0]);
 584       tab_text_format (ssb->t, 1, i * 2 + 1+ 1, TAB_LEFT,
 585                        "%s%s", prefix[1], val_lab[1]);
 586
 587       /* Fill in the group statistics */
 588       for (count = 0; count < 2; count++)
 589         {
 590           union value search_val;
 591           struct group_statistics *gs;
 592
 593           if (proc->criterion == CMP_LE)
 594             search_val.f = proc->critical_value + (count == 0 ? 1.0 : -1.0);
 595           else
 596             search_val = proc->g_value[count];
 597
 598           gs = hsh_find (grp_hash, &search_val);
 599           assert (gs);
 600
 601           tab_double (ssb->t, 2, i * 2 + count+ 1, TAB_RIGHT, gs->n,
 602                       &proc->weight_format);
 603           tab_double (ssb->t, 3, i * 2 + count+ 1, TAB_RIGHT, gs->mean, NULL);
 604           tab_double (ssb->t, 4, i * 2 + count+ 1, TAB_RIGHT, gs->std_dev,
 605                       NULL);
 606           tab_double (ssb->t, 5, i * 2 + count+ 1, TAB_RIGHT, gs->se_mean,
 607                       NULL);
 608         }
 609     }
 610   free (val_lab[0]);
 611   free (val_lab[1]);
 612 }
 613
 614 /* Initialize the paired values ssbox */
 615 static void
 616 ssbox_paired_init (struct ssbox *this, struct t_test_proc *proc)
 617 {
 618   int hsize = 6;
 619   int vsize = proc->n_pairs * 2 + 1;
 620
 621   this->populate = ssbox_paired_populate;
 622
 623   ssbox_base_init (this, hsize, vsize);
 624   tab_title (this->t, _("Paired Sample Statistics"));
 625   tab_vline (this->t, TAL_GAP, 1, 0, vsize - 1);
 626   tab_vline (this->t, TAL_2, 2, 0, vsize - 1);
 627   tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("Mean"));
 628   tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("N"));
 629   tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
 630   tab_text (this->t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean"));
 631 }
 632
 633 /* Populate the ssbox for paired values */
 634 static void
 635 ssbox_paired_populate (struct ssbox *ssb, struct t_test_proc *proc)
 636 {
 637   int i;
 638
 639   for (i = 0; i < proc->n_pairs; i++)
 640     {
 641       struct pair *p = &proc->pairs[i];
 642       int j;
 643
 644       tab_text_format (ssb->t, 0, i * 2 + 1, TAB_LEFT, _("Pair %d"), i);
 645       for (j=0; j < 2; j++)
 646         {
 647           /* Titles */
 648           tab_text (ssb->t, 1, i * 2 + j + 1, TAB_LEFT,
 649                     var_get_name (p->v[j]));
 650
 651           /* Values */
 652           tab_double (ssb->t, 2, i * 2 + j + 1, TAB_RIGHT, p->mean[j], NULL);
 653           tab_double (ssb->t, 3, i * 2 + j + 1, TAB_RIGHT, p->n,
 654                       &proc->weight_format);
 655           tab_double (ssb->t, 4, i * 2 + j + 1, TAB_RIGHT, p->std_dev[j],
 656                       NULL);
 657           tab_double (ssb->t, 5, i * 2 + j + 1, TAB_RIGHT,
 658                      p->std_dev[j] /sqrt (p->n), NULL);
 659         }
 660     }
 661 }
 662
 663 /* Populate the one sample ssbox */
 664 static void
 665 ssbox_one_sample_populate (struct ssbox *ssb, struct t_test_proc *proc)
 666 {
 667   int i;
 668
 669   for (i = 0; i < proc->n_vars; i++)
 670     {
 671       struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs;
 672
 673       tab_text (ssb->t, 0, i + 1, TAB_LEFT, var_get_name (proc->vars[i]));
 674       tab_double (ssb->t, 1, i + 1, TAB_RIGHT, gs->n, &proc->weight_format);
 675       tab_double (ssb->t, 2, i + 1, TAB_RIGHT, gs->mean, NULL);
 676       tab_double (ssb->t, 3, i + 1, TAB_RIGHT, gs->std_dev, NULL);
 677       tab_double (ssb->t, 4, i + 1, TAB_RIGHT, gs->se_mean, NULL);
 678     }
 679 }
 680 \f
 681 /* Implementation of the Test Results box struct */
 682
 683 static void trbox_base_init (struct trbox *, size_t n_vars, int cols);
 684 static void trbox_base_finalize (struct trbox *);
 685 static void trbox_independent_samples_init (struct trbox *,
 686                                             struct t_test_proc *);
 687 static void trbox_independent_samples_populate (struct trbox *,
 688                                                 struct t_test_proc *);
 689 static void trbox_one_sample_init (struct trbox *, struct t_test_proc *);
 690 static void trbox_one_sample_populate (struct trbox *, struct t_test_proc *);
 691 static void trbox_paired_init (struct trbox *, struct t_test_proc *);
 692 static void trbox_paired_populate (struct trbox *, struct t_test_proc *);
 693
 694 /* Create a trbox according to mode*/
 695 static void
 696 trbox_create (struct trbox *trb, struct t_test_proc *proc)
 697 {
 698   switch (proc->mode)
 699     {
 700     case T_1_SAMPLE:
 701       trbox_one_sample_init (trb, proc);
 702       break;
 703     case T_IND_SAMPLES:
 704       trbox_independent_samples_init (trb, proc);
 705       break;
 706     case T_PAIRED:
 707       trbox_paired_init (trb, proc);
 708       break;
 709     default:
 710       NOT_REACHED ();
 711     }
 712 }
 713
 714 /* Populate a trbox according to proc */
 715 static void
 716 trbox_populate (struct trbox *trb, struct t_test_proc *proc)
 717 {
 718   trb->populate (trb, proc);
 719 }
 720
 721 /* Submit and destroy a trbox */
 722 static void
 723 trbox_finalize (struct trbox *trb)
 724 {
 725   trb->finalize (trb);
 726 }
 727
 728 /* Initialize the independent samples trbox */
 729 static void
 730 trbox_independent_samples_init (struct trbox *self,
 731                                 struct t_test_proc *proc)
 732 {
 733   const int hsize = 11;
 734   const int vsize = proc->n_vars * 2 + 3;
 735
 736   assert (self);
 737   self->populate = trbox_independent_samples_populate;
 738
 739   trbox_base_init (self, proc->n_vars * 2, hsize);
 740   tab_title (self->t, _("Independent Samples Test"));
 741   tab_hline (self->t, TAL_1, 2, hsize - 1, 1);
 742   tab_vline (self->t, TAL_2, 2, 0, vsize - 1);
 743   tab_vline (self->t, TAL_1, 4, 0, vsize - 1);
 744   tab_box (self->t, -1, -1, -1, TAL_1, 2, 1, hsize - 2, vsize - 1);
 745   tab_hline (self->t, TAL_1, hsize - 2, hsize - 1, 2);
 746   tab_box (self->t, -1, -1, -1, TAL_1, hsize - 2, 2, hsize - 1, vsize - 1);
 747   tab_joint_text (self->t, 2, 0, 3, 0,
 748                   TAB_CENTER, _("Levene's Test for Equality of Variances"));
 749   tab_joint_text (self->t, 4, 0, hsize- 1, 0,
 750                   TAB_CENTER, _("t-test for Equality of Means"));
 751
 752   tab_text (self->t, 2, 2, TAB_CENTER | TAT_TITLE, _("F"));
 753   tab_text (self->t, 3, 2, TAB_CENTER | TAT_TITLE, _("Sig."));
 754   tab_text (self->t, 4, 2, TAB_CENTER | TAT_TITLE, _("t"));
 755   tab_text (self->t, 5, 2, TAB_CENTER | TAT_TITLE, _("df"));
 756   tab_text (self->t, 6, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)"));
 757   tab_text (self->t, 7, 2, TAB_CENTER | TAT_TITLE, _("Mean Difference"));
 758   tab_text (self->t, 8, 2, TAB_CENTER | TAT_TITLE, _("Std. Error Difference"));
 759   tab_text (self->t, 9, 2, TAB_CENTER | TAT_TITLE, _("Lower"));
 760   tab_text (self->t, 10, 2, TAB_CENTER | TAT_TITLE, _("Upper"));
 761
 762   tab_joint_text_format (self->t, 9, 1, 10, 1, TAB_CENTER,
 763                          _("%g%% Confidence Interval of the Difference"),
 764                          proc->criteria * 100.0);
 765 }
 766
 767 /* Populate the independent samples trbox */
 768 static void
 769 trbox_independent_samples_populate (struct trbox *self,
 770                                     struct t_test_proc *proc)
 771 {
 772   int i;
 773
 774   for (i = 0; i < proc->n_vars; i++)
 775     {
 776       double p, q;
 777
 778       double t;
 779       double df;
 780
 781       double df1, df2;
 782
 783       double pooled_variance;
 784       double std_err_diff;
 785       double mean_diff;
 786
 787       double se2;
 788
 789       const struct variable *var = proc->vars[i];
 790       struct group_proc *grp_data = group_proc_get (var);
 791
 792       struct hsh_table *grp_hash = grp_data->group_hash;
 793
 794       struct group_statistics *gs0;
 795       struct group_statistics *gs1;
 796
 797       union value search_val;
 798
 799       if (proc->criterion == CMP_LE)
 800         search_val.f = proc->critical_value - 1.0;
 801       else
 802         search_val = proc->g_value[0];
 803
 804       gs0 = hsh_find (grp_hash, &search_val);
 805       assert (gs0);
 806
 807       if (proc->criterion == CMP_LE)
 808         search_val.f = proc->critical_value + 1.0;
 809       else
 810         search_val = proc->g_value[1];
 811
 812       gs1 = hsh_find (grp_hash, &search_val);
 813       assert (gs1);
 814
 815
 816       tab_text (self->t, 0, i * 2 + 3, TAB_LEFT, var_get_name (proc->vars[i]));
 817       tab_text (self->t, 1, i * 2 + 3, TAB_LEFT, _("Equal variances assumed"));
 818       tab_double (self->t, 2, i * 2 + 3, TAB_CENTER, grp_data->levene, NULL);
 819
 820       /* Now work out the significance of the Levene test */
 821       df1 = 1;
 822       df2 = grp_data->ugs.n - 2;
 823       q = gsl_cdf_fdist_Q (grp_data->levene, df1, df2);
 824       tab_double (self->t, 3, i * 2 + 3, TAB_CENTER, q, NULL);
 825
 826       df = gs0->n + gs1->n - 2.0;
 827       tab_double (self->t, 5, i * 2 + 3, TAB_RIGHT, df, NULL);
 828
 829       pooled_variance = (gs0->n * pow2 (gs0->s_std_dev)
 830                          + gs1->n *pow2 (gs1->s_std_dev)) / df ;
 831
 832       t = (gs0->mean - gs1->mean) / sqrt (pooled_variance);
 833       t /= sqrt ((gs0->n + gs1->n) / (gs0->n * gs1->n));
 834
 835       tab_double (self->t, 4, i * 2 + 3, TAB_RIGHT, t, NULL);
 836
 837       p = gsl_cdf_tdist_P (t, df);
 838       q = gsl_cdf_tdist_Q (t, df);
 839
 840       tab_double (self->t, 6, i * 2 + 3, TAB_RIGHT, 2.0 * (t > 0 ? q : p),
 841                   NULL);
 842
 843       mean_diff = gs0->mean - gs1->mean;
 844       tab_double (self->t, 7, i * 2 + 3, TAB_RIGHT, mean_diff, NULL);
 845
 846
 847       std_err_diff = sqrt (pow2 (gs0->se_mean) + pow2 (gs1->se_mean));
 848       tab_double (self->t, 8, i * 2 + 3, TAB_RIGHT, std_err_diff, NULL);
 849
 850       /* Now work out the confidence interval */
 851       q = (1 - proc->criteria)/2.0;  /* 2-tailed test */
 852
 853       t = gsl_cdf_tdist_Qinv (q, df);
 854       tab_double (self->t, 9, i * 2 + 3, TAB_RIGHT,
 855                  mean_diff - t * std_err_diff, NULL);
 856
 857       tab_double (self->t, 10, i * 2 + 3, TAB_RIGHT,
 858                  mean_diff + t * std_err_diff, NULL);
 859
 860
 861       /* Now for the \sigma_1 != \sigma_2 case */
 862       tab_text (self->t, 1, i * 2 + 3 + 1,
 863                 TAB_LEFT, _("Equal variances not assumed"));
 864
 865       se2 = ((pow2 (gs0->s_std_dev) / (gs0->n - 1)) +
 866              (pow2 (gs1->s_std_dev) / (gs1->n - 1)));
 867
 868       t = mean_diff / sqrt (se2);
 869       tab_double (self->t, 4, i * 2 + 3 + 1, TAB_RIGHT, t, NULL);
 870
 871       df = pow2 (se2) / ((pow2 (pow2 (gs0->s_std_dev) / (gs0->n - 1))
 872                           / (gs0->n - 1))
 873                          + (pow2 (pow2 (gs1->s_std_dev) / (gs1->n - 1))
 874                             / (gs1->n - 1)));
 875       tab_double (self->t, 5, i * 2 + 3 + 1, TAB_RIGHT, df, NULL);
 876
 877       p = gsl_cdf_tdist_P (t, df);
 878       q = gsl_cdf_tdist_Q (t, df);
 879
 880       tab_double (self->t, 6, i * 2 + 3 + 1, TAB_RIGHT, 2.0 * (t > 0 ? q : p),
 881                   NULL);
 882
 883       /* Now work out the confidence interval */
 884       q = (1 - proc->criteria) / 2.0;  /* 2-tailed test */
 885
 886       t = gsl_cdf_tdist_Qinv (q, df);
 887
 888       tab_double (self->t, 7, i * 2 + 3 + 1, TAB_RIGHT, mean_diff, NULL);
 889       tab_double (self->t, 8, i * 2 + 3 + 1, TAB_RIGHT, std_err_diff, NULL);
 890       tab_double (self->t, 9, i * 2 + 3 + 1, TAB_RIGHT,
 891                  mean_diff - t * std_err_diff, NULL);
 892       tab_double (self->t, 10, i * 2 + 3 + 1, TAB_RIGHT,
 893                  mean_diff + t * std_err_diff, NULL);
 894     }
 895 }
 896
 897 /* Initialize the paired samples trbox */
 898 static void
 899 trbox_paired_init (struct trbox *self, struct t_test_proc *proc)
 900 {
 901   const int hsize=10;
 902   const int vsize=proc->n_pairs+ 3;
 903
 904   self->populate = trbox_paired_populate;
 905
 906   trbox_base_init (self, proc->n_pairs, hsize);
 907   tab_title (self->t, _("Paired Samples Test"));
 908   tab_hline (self->t, TAL_1, 2, 6, 1);
 909   tab_vline (self->t, TAL_2, 2, 0, vsize - 1);
 910   tab_joint_text (self->t, 2, 0, 6, 0, TAB_CENTER, _("Paired Differences"));
 911   tab_box (self->t, -1, -1, -1, TAL_1, 2, 1, 6, vsize - 1);
 912   tab_box (self->t, -1, -1, -1, TAL_1, 6, 0, hsize - 1, vsize - 1);
 913   tab_hline (self->t, TAL_1, 5, 6, 2);
 914   tab_vline (self->t, TAL_GAP, 6, 0, 1);
 915
 916   tab_joint_text_format (self->t, 5, 1, 6, 1, TAB_CENTER,
 917                          _("%g%% Confidence Interval of the Difference"),
 918                          proc->criteria*100.0);
 919
 920   tab_text (self->t, 2, 2, TAB_CENTER | TAT_TITLE, _("Mean"));
 921   tab_text (self->t, 3, 2, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
 922   tab_text (self->t, 4, 2, TAB_CENTER | TAT_TITLE, _("Std. Error Mean"));
 923   tab_text (self->t, 5, 2, TAB_CENTER | TAT_TITLE, _("Lower"));
 924   tab_text (self->t, 6, 2, TAB_CENTER | TAT_TITLE, _("Upper"));
 925   tab_text (self->t, 7, 2, TAB_CENTER | TAT_TITLE, _("t"));
 926   tab_text (self->t, 8, 2, TAB_CENTER | TAT_TITLE, _("df"));
 927   tab_text (self->t, 9, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)"));
 928 }
 929
 930 /* Populate the paired samples trbox */
 931 static void
 932 trbox_paired_populate (struct trbox *trb,
 933                        struct t_test_proc *proc)
 934 {
 935   int i;
 936
 937   for (i = 0; i < proc->n_pairs; i++)
 938     {
 939       struct pair *pair = &proc->pairs[i];
 940       double p, q;
 941       double se_mean;
 942
 943       double n = pair->n;
 944       double t;
 945       double df = n - 1;
 946
 947       tab_text_format (trb->t, 0, i + 3, TAB_LEFT, _("Pair %d"), i);
 948       tab_text_format (trb->t, 1, i + 3, TAB_LEFT, "%s - %s",
 949                        var_get_name (pair->v[0]),
 950                        var_get_name (pair->v[1]));
 951       tab_double (trb->t, 2, i + 3, TAB_RIGHT, pair->mean_diff, NULL);
 952       tab_double (trb->t, 3, i + 3, TAB_RIGHT, pair->std_dev_diff, NULL);
 953
 954       /* SE Mean */
 955       se_mean = pair->std_dev_diff / sqrt (n);
 956       tab_double (trb->t, 4, i + 3, TAB_RIGHT, se_mean, NULL);
 957
 958       /* Now work out the confidence interval */
 959       q = (1 - proc->criteria) / 2.0;  /* 2-tailed test */
 960
 961       t = gsl_cdf_tdist_Qinv (q, df);
 962
 963       tab_double (trb->t, 5, i + 3, TAB_RIGHT,
 964                  pair->mean_diff - t * se_mean, NULL);
 965       tab_double (trb->t, 6, i + 3, TAB_RIGHT,
 966                  pair->mean_diff + t * se_mean, NULL);
 967
 968       t = ((pair->mean[0] - pair->mean[1])
 969            / sqrt ((pow2 (pair->s_std_dev[0]) + pow2 (pair->s_std_dev[1])
 970                     - (2 * pair->correlation
 971                        * pair->s_std_dev[0] * pair->s_std_dev[1]))
 972                    / (n - 1)));
 973
 974       tab_double (trb->t, 7, i + 3, TAB_RIGHT, t, NULL);
 975
 976       /* Degrees of freedom */
 977       tab_double (trb->t, 8, i + 3, TAB_RIGHT, df, &proc->weight_format);
 978
 979       p = gsl_cdf_tdist_P (t,df);
 980       q = gsl_cdf_tdist_Q (t,df);
 981
 982       tab_double (trb->t, 9, i + 3, TAB_RIGHT, 2.0 * (t > 0 ? q : p), NULL);
 983     }
 984 }
 985
 986 /* Initialize the one sample trbox */
 987 static void
 988 trbox_one_sample_init (struct trbox *self, struct t_test_proc *proc)
 989 {
 990   const int hsize = 7;
 991   const int vsize = proc->n_vars + 3;
 992
 993   self->populate = trbox_one_sample_populate;
 994
 995   trbox_base_init (self, proc->n_vars, hsize);
 996   tab_title (self->t, _("One-Sample Test"));
 997   tab_hline (self->t, TAL_1, 1, hsize - 1, 1);
 998   tab_vline (self->t, TAL_2, 1, 0, vsize - 1);
 999
1000   tab_joint_text_format (self->t, 1, 0, hsize - 1, 0, TAB_CENTER,
1001                          _("Test Value = %f"), proc->testval);
1002
1003   tab_box (self->t, -1, -1, -1, TAL_1, 1, 1, hsize - 1, vsize - 1);
1004
1005
1006   tab_joint_text_format (self->t, 5, 1, 6, 1, TAB_CENTER,
1007                          _("%g%% Confidence Interval of the Difference"),
1008                          proc->criteria * 100.0);
1009
1010   tab_vline (self->t, TAL_GAP, 6, 1, 1);
1011   tab_hline (self->t, TAL_1, 5, 6, 2);
1012   tab_text (self->t, 1, 2, TAB_CENTER | TAT_TITLE, _("t"));
1013   tab_text (self->t, 2, 2, TAB_CENTER | TAT_TITLE, _("df"));
1014   tab_text (self->t, 3, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)"));
1015   tab_text (self->t, 4, 2, TAB_CENTER | TAT_TITLE, _("Mean Difference"));
1016   tab_text (self->t, 5, 2, TAB_CENTER | TAT_TITLE, _("Lower"));
1017   tab_text (self->t, 6, 2, TAB_CENTER | TAT_TITLE, _("Upper"));
1018 }
1019
1020 /* Populate the one sample trbox */
1021 static void
1022 trbox_one_sample_populate (struct trbox *trb, struct t_test_proc *proc)
1023 {
1024   int i;
1025
1026   assert (trb->t);
1027
1028   for (i = 0; i < proc->n_vars; i++)
1029     {
1030       double t;
1031       double p, q;
1032       double df;
1033       struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs;
1034
1035       tab_text (trb->t, 0, i + 3, TAB_LEFT, var_get_name (proc->vars[i]));
1036
1037       t = (gs->mean - proc->testval) * sqrt (gs->n) / gs->std_dev;
1038
1039       tab_double (trb->t, 1, i + 3, TAB_RIGHT, t, NULL);
1040
1041       /* degrees of freedom */
1042       df = gs->n - 1;
1043
1044       tab_double (trb->t, 2, i + 3, TAB_RIGHT, df, &proc->weight_format);
1045
1046       p = gsl_cdf_tdist_P (t, df);
1047       q = gsl_cdf_tdist_Q (t, df);
1048
1049       /* Multiply by 2 to get 2-tailed significance, makeing sure we've got
1050          the correct tail*/
1051       tab_double (trb->t, 3, i + 3, TAB_RIGHT, 2.0 * (t > 0 ? q : p), NULL);
1052       tab_double (trb->t, 4, i + 3, TAB_RIGHT, gs->mean_diff, NULL);
1053
1054
1055       q = (1 - proc->criteria) / 2.0;  /* 2-tailed test */
1056       t = gsl_cdf_tdist_Qinv (q, df);
1057
1058       tab_double (trb->t, 5, i + 3, TAB_RIGHT,
1059                  gs->mean_diff - t * gs->se_mean, NULL);
1060       tab_double (trb->t, 6, i + 3, TAB_RIGHT,
1061                  gs->mean_diff + t * gs->se_mean, NULL);
1062     }
1063 }
1064
1065 /* Base initializer for the generalized trbox */
1066 static void
1067 trbox_base_init (struct trbox *self, size_t data_rows, int cols)
1068 {
1069   const size_t rows = 3 + data_rows;
1070
1071   self->finalize = trbox_base_finalize;
1072   self->t = tab_create (cols, rows, 0);
1073   tab_headers (self->t, 0, 0, 3, 0);
1074   tab_box (self->t, TAL_2, TAL_2, TAL_0, TAL_0, 0, 0, cols - 1, rows - 1);
1075   tab_hline (self->t, TAL_2, 0, cols- 1, 3);
1076   tab_dim (self->t, tab_natural_dimensions, NULL);
1077 }
1078
1079 /* Base finalizer for the trbox */
1080 static void
1081 trbox_base_finalize (struct trbox *trb)
1082 {
1083   tab_submit (trb->t);
1084 }
1085
1086 /* Create, populate and submit the Paired Samples Correlation box */
1087 static void
1088 pscbox (struct t_test_proc *proc)
1089 {
1090   const int rows=1+proc->n_pairs;
1091   const int cols=5;
1092   int i;
1093
1094   struct tab_table *table;
1095
1096   table = tab_create (cols, rows, 0);
1097
1098   tab_columns (table, SOM_COL_DOWN, 1);
1099   tab_headers (table, 0, 0, 1, 0);
1100   tab_box (table, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1);
1101   tab_hline (table, TAL_2, 0, cols - 1, 1);
1102   tab_vline (table, TAL_2, 2, 0, rows - 1);
1103   tab_dim (table, tab_natural_dimensions, NULL);
1104   tab_title (table, _("Paired Samples Correlations"));
1105
1106   /* column headings */
1107   tab_text (table, 2, 0, TAB_CENTER | TAT_TITLE, _("N"));
1108   tab_text (table, 3, 0, TAB_CENTER | TAT_TITLE, _("Correlation"));
1109   tab_text (table, 4, 0, TAB_CENTER | TAT_TITLE, _("Sig."));
1110
1111   for (i = 0; i < proc->n_pairs; i++)
1112     {
1113       struct pair *pair = &proc->pairs[i];
1114
1115       /* row headings */
1116       tab_text_format (table, 0, i + 1, TAB_LEFT | TAT_TITLE,
1117                        _("Pair %d"), i);
1118       tab_text_format (table, 1, i + 1, TAB_LEFT | TAT_TITLE,
1119                        _("%s & %s"),
1120                        var_get_name (pair->v[0]),
1121                        var_get_name (pair->v[1]));
1122
1123       /* row data */
1124       tab_double (table, 2, i + 1, TAB_RIGHT, pair->n, &proc->weight_format);
1125       tab_double (table, 3, i + 1, TAB_RIGHT, pair->correlation, NULL);
1126
1127       tab_double (table, 4, i + 1, TAB_RIGHT,
1128                   2.0 * significance_of_correlation (pair->correlation, pair->n), NULL);
1129     }
1130
1131   tab_submit (table);
1132 }
1133 \f
1134 /* Calculation Implementation */
1135
1136 /* Calculations common to all variants of the T test. */
1137 static void
1138 common_calc (const struct dictionary *dict,
1139              struct t_test_proc *proc,
1140              struct casereader *reader)
1141 {
1142   struct ccase *c;
1143   int i;
1144
1145   for (i = 0; i < proc->n_vars; i++)
1146     {
1147       struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs;
1148       gs->sum = 0;
1149       gs->n = 0;
1150       gs->ssq = 0;
1151       gs->sum_diff = 0;
1152     }
1153
1154   for (; (c = casereader_read (reader)) != NULL; case_unref (c))
1155     {
1156       double weight = dict_get_case_weight (dict, c, NULL);
1157
1158       /* Listwise has to be implicit if the independent variable
1159          is missing ?? */
1160       if (proc->mode == T_IND_SAMPLES)
1161         {
1162           if (var_is_value_missing (proc->indep_var,
1163                                     case_data (c, proc->indep_var),
1164                                     proc->exclude))
1165             continue;
1166         }
1167
1168       for (i = 0; i < proc->n_vars; i++)
1169         {
1170           const struct variable *v = proc->vars[i];
1171           const union value *val = case_data (c, v);
1172
1173           if (!var_is_value_missing (v, val, proc->exclude))
1174             {
1175               struct group_statistics *gs;
1176               gs = &group_proc_get (v)->ugs;
1177
1178               gs->n += weight;
1179               gs->sum += weight * val->f;
1180               gs->ssq += weight * pow2 (val->f);
1181             }
1182         }
1183     }
1184   casereader_destroy (reader);
1185
1186   for (i = 0; i < proc->n_vars; i++)
1187     {
1188       struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs;
1189
1190       gs->mean = gs->sum / gs->n;
1191       gs->s_std_dev = sqrt (((gs->ssq / gs->n) - pow2 (gs->mean)));
1192       gs->std_dev = sqrt (gs->n / (gs->n- 1)
1193                           * ((gs->ssq / gs->n) - pow2 (gs->mean)));
1194       gs->se_mean = gs->std_dev / sqrt (gs->n);
1195       gs->mean_diff = gs->sum_diff / gs->n;
1196     }
1197 }
1198
1199 /* Calculations for one sample T test. */
1200 static int
1201 one_sample_calc (const struct dictionary *dict, struct t_test_proc *proc,
1202                  struct casereader *reader)
1203 {
1204   struct ccase *c;
1205   int i;
1206
1207   for (i = 0; i < proc->n_vars; i++)
1208     {
1209       struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs;
1210       gs->sum_diff = 0;
1211     }
1212
1213   for (; (c = casereader_read (reader)) != NULL; case_unref (c))
1214     {
1215       double weight = dict_get_case_weight (dict, c, NULL);
1216       for (i = 0; i < proc->n_vars; i++)
1217         {
1218           const struct variable *v = proc->vars[i];
1219           struct group_statistics *gs = &group_proc_get (v)->ugs;
1220           const union value *val = case_data (c, v);
1221           if (!var_is_value_missing (v, val, proc->exclude))
1222             gs->sum_diff += weight * (val->f - proc->testval);
1223         }
1224     }
1225
1226   for (i = 0; i < proc->n_vars; i++)
1227     {
1228       struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs;
1229       gs->mean_diff = gs->sum_diff / gs->n;
1230     }
1231
1232   casereader_destroy (reader);
1233
1234   return 0;
1235 }
1236
1237 static int
1238 paired_calc (const struct dictionary *dict, struct t_test_proc *proc,
1239              struct casereader *reader)
1240 {
1241   struct ccase *c;
1242   int i;
1243
1244   for (i = 0; i < proc->n_pairs; i++)
1245     {
1246       struct pair *pair = &proc->pairs[i];
1247       pair->n = 0;
1248       pair->sum[0] = pair->sum[1] = 0;
1249       pair->ssq[0] = pair->ssq[1] = 0;
1250       pair->sum_of_prod = 0;
1251       pair->correlation = 0;
1252       pair->sum_of_diffs = 0;
1253       pair->ssq_diffs = 0;
1254     }
1255
1256   for (; (c = casereader_read (reader)) != NULL; case_unref (c))
1257     {
1258       double weight = dict_get_case_weight (dict, c, NULL);
1259       for (i = 0; i < proc->n_pairs; i++)
1260         {
1261           struct pair *pair = &proc->pairs[i];
1262           const struct variable *v0 = pair->v[0];
1263           const struct variable *v1 = pair->v[1];
1264
1265           const union value *val0 = case_data (c, v0);
1266           const union value *val1 = case_data (c, v1);
1267
1268           if (!var_is_value_missing (v0, val0, proc->exclude)
1269               && !var_is_value_missing (v1, val1, proc->exclude))
1270             {
1271               pair->n += weight;
1272               pair->sum[0] += weight * val0->f;
1273               pair->sum[1] += weight * val1->f;
1274               pair->ssq[0] += weight * pow2 (val0->f);
1275               pair->ssq[1] += weight * pow2 (val1->f);
1276               pair->sum_of_prod += weight * val0->f * val1->f;
1277               pair->sum_of_diffs += weight * (val0->f - val1->f);
1278               pair->ssq_diffs += weight * pow2 (val0->f - val1->f);
1279             }
1280         }
1281     }
1282
1283   for (i = 0; i < proc->n_pairs; i++)
1284     {
1285       struct pair *pair = &proc->pairs[i];
1286       const double n = pair->n;
1287       int j;
1288
1289       for (j=0; j < 2; j++)
1290         {
1291           pair->mean[j] = pair->sum[j] / n;
1292           pair->s_std_dev[j] = sqrt ((pair->ssq[j] / n
1293                                       - pow2 (pair->mean[j])));
1294           pair->std_dev[j] = sqrt (n / (n- 1) * (pair->ssq[j] / n
1295                                                 - pow2 (pair->mean[j])));
1296         }
1297
1298       pair->correlation = (pair->sum_of_prod / pair->n
1299                            - pair->mean[0] * pair->mean[1]);
1300       /* correlation now actually contains the covariance */
1301       pair->correlation /= pair->std_dev[0] * pair->std_dev[1];
1302       pair->correlation *= pair->n / (pair->n - 1);
1303
1304       pair->mean_diff = pair->sum_of_diffs / n;
1305       pair->std_dev_diff = sqrt (n / (n - 1) * ((pair->ssq_diffs / n)
1306                                                 - pow2 (pair->mean_diff)));
1307     }
1308
1309   casereader_destroy (reader);
1310   return 0;
1311 }
1312
1313 static int
1314 group_calc (const struct dictionary *dict, struct t_test_proc *proc,
1315             struct casereader *reader)
1316 {
1317   struct ccase *c;
1318   int i;
1319
1320   for (i = 0; i < proc->n_vars; i++)
1321     {
1322       struct group_proc *ttpr = group_proc_get (proc->vars[i]);
1323       int j;
1324
1325       /* There's always 2 groups for a T - TEST */
1326       ttpr->n_groups = 2;
1327       ttpr->group_hash = hsh_create (2,
1328                                      (hsh_compare_func *) compare_group_binary,
1329                                      (hsh_hash_func *) hash_group_binary,
1330                                      (hsh_free_func *) free_group,
1331                                      proc);
1332
1333       for (j = 0; j < 2; j++)
1334         {
1335           struct group_statistics *gs = xmalloc (sizeof *gs);
1336           gs->sum = 0;
1337           gs->n = 0;
1338           gs->ssq = 0;
1339           if (proc->criterion == CMP_EQ)
1340             gs->id = proc->g_value[j];
1341           else
1342             {
1343               if (j == 0)
1344                 gs->id.f = proc->critical_value - 1.0;
1345               else
1346                 gs->id.f = proc->critical_value + 1.0;
1347             }
1348
1349           hsh_insert (ttpr->group_hash, gs);
1350         }
1351     }
1352
1353   for (; (c = casereader_read (reader)) != NULL; case_unref (c))
1354     {
1355       const double weight = dict_get_case_weight (dict, c, NULL);
1356       const union value *gv;
1357
1358       if (var_is_value_missing (proc->indep_var,
1359                                 case_data (c, proc->indep_var), proc->exclude))
1360         continue;
1361
1362       gv = case_data (c, proc->indep_var);
1363       for (i = 0; i < proc->n_vars; i++)
1364         {
1365           const struct variable *var = proc->vars[i];
1366           const union value *val = case_data (c, var);
1367           struct hsh_table *grp_hash = group_proc_get (var)->group_hash;
1368           struct group_statistics *gs = hsh_find (grp_hash, gv);
1369
1370           /* If the independent variable doesn't match either of the values
1371              for this case then move on to the next case. */
1372           if (gs == NULL)
1373             break;
1374
1375           if (!var_is_value_missing (var, val, proc->exclude))
1376             {
1377               gs->n += weight;
1378               gs->sum += weight * val->f;
1379               gs->ssq += weight * pow2 (val->f);
1380             }
1381         }
1382     }
1383
1384   for (i = 0; i < proc->n_vars; i++)
1385     {
1386       const struct variable *var = proc->vars[i];
1387       struct hsh_table *grp_hash = group_proc_get (var)->group_hash;
1388       struct hsh_iterator g;
1389       struct group_statistics *gs;
1390       int count = 0;
1391
1392       for (gs = hsh_first (grp_hash, &g); gs != NULL;
1393            gs = hsh_next (grp_hash, &g))
1394         {
1395           gs->mean = gs->sum / gs->n;
1396           gs->s_std_dev = sqrt (((gs->ssq / gs->n) - pow2 (gs->mean)));
1397           gs->std_dev = sqrt (gs->n / (gs->n- 1)
1398                               * ((gs->ssq / gs->n) - pow2 (gs->mean)));
1399           gs->se_mean = gs->std_dev / sqrt (gs->n);
1400           count++;
1401         }
1402       assert (count == 2);
1403     }
1404
1405   casereader_destroy (reader);
1406
1407   return 0;
1408 }
1409
1410 static void
1411 calculate (struct t_test_proc *proc,
1412            struct casereader *input, const struct dataset *ds)
1413 {
1414   const struct dictionary *dict = dataset_dict (ds);
1415   struct ssbox stat_summary_box;
1416   struct trbox test_results_box;
1417   struct taint *taint;
1418   struct ccase *c;
1419
1420   c = casereader_peek (input, 0);
1421   if (c == NULL)
1422     {
1423       casereader_destroy (input);
1424       return;
1425     }
1426   output_split_file_values (ds, c);
1427   case_unref (c);
1428
1429   if (proc->listwise_missing)
1430     input = casereader_create_filter_missing (input,
1431                                               proc->vars,
1432                                               proc->n_vars,
1433                                               proc->exclude, NULL, NULL);
1434   input = casereader_create_filter_weight (input, dict, NULL, NULL);
1435   taint = taint_clone (casereader_get_taint (input));
1436
1437   common_calc (dict, proc, casereader_clone (input));
1438   switch (proc->mode)
1439     {
1440     case T_1_SAMPLE:
1441       one_sample_calc (dict, proc, input);
1442       break;
1443     case T_PAIRED:
1444       paired_calc (dict, proc, input);
1445       break;
1446     case T_IND_SAMPLES:
1447       group_calc (dict, proc, casereader_clone (input));
1448       levene (dict, input, proc->indep_var, proc->n_vars, proc->vars,
1449               proc->exclude);
1450       break;
1451     default:
1452       NOT_REACHED ();
1453     }
1454
1455   if (!taint_has_tainted_successor (taint))
1456     {
1457       ssbox_create (&stat_summary_box, proc);
1458       ssbox_populate (&stat_summary_box, proc);
1459       ssbox_finalize (&stat_summary_box);
1460
1461       if (proc->mode == T_PAIRED)
1462         pscbox (proc);
1463
1464       trbox_create (&test_results_box, proc);
1465       trbox_populate (&test_results_box, proc);
1466       trbox_finalize (&test_results_box);
1467     }
1468
1469   taint_destroy (taint);
1470 }
1471
1472 /* return 0 if G belongs to group 0,
1473           1 if it belongs to group 1,
1474           2 if it belongs to neither group */
1475 static int
1476 which_group (const struct group_statistics *g,
1477              const struct t_test_proc *proc)
1478 {
1479   int width = var_get_width (proc->indep_var);
1480
1481   if (0 == value_compare_3way (&g->id, &proc->g_value[0], width))
1482     return 0;
1483
1484   if (0 == value_compare_3way (&g->id, &proc->g_value[1], width))
1485     return 1;
1486
1487   return 2;
1488 }
1489
1490 /* Return -1 if the id of a is less than b; +1 if greater than and
1491    0 if equal */
1492 static int
1493 compare_group_binary (const struct group_statistics *a,
1494                       const struct group_statistics *b,
1495                       const struct t_test_proc *proc)
1496 {
1497   int flag_a;
1498   int flag_b;
1499
1500   if (proc->criterion == CMP_LE)
1501     {
1502       flag_a = (a->id.f < proc->critical_value);
1503       flag_b = (b->id.f < proc->critical_value);
1504     }
1505   else
1506     {
1507       flag_a = which_group (a, proc);
1508       flag_b = which_group (b, proc);
1509     }
1510
1511   if (flag_a < flag_b)
1512     return - 1;
1513
1514   return (flag_a > flag_b);
1515 }
1516
1517 /* This is a degenerate case of a hash, since it can only return three possible
1518    values.  It's really a comparison, being used as a hash function */
1519
1520 static unsigned
1521 hash_group_binary (const struct group_statistics *g,
1522                    const struct t_test_proc *proc)
1523 {
1524   return (proc->criterion == CMP_LE
1525           ? g->id.f < proc->critical_value
1526           : which_group (g, proc));
1527 }
1528
1529 /*
1530   Local Variables:
1531   mode: c
1532   End:
1533 */