Added type 3 sums of squares to GLM

[pspp] / src / language / stats / t-test.q
diff --git a/src/language/stats/t-test.q b/src/language/stats/t-test.q

index a3f4cf8f522516d810ae29948a8402becd01c636..d26fc8a4cffe6385cab26340e656698cab237d02 100644 (file)
--- a/src/language/stats/t-test.q
+++ b/src/language/stats/t-test.q
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -22,32 +22,34 @@
  #include <stdio.h>
  #include <stdlib.h>
  
-#include <data/case.h>
-#include <data/casegrouper.h>
-#include <data/casereader.h>
-#include <data/dictionary.h>
-#include <data/procedure.h>
-#include <data/value-labels.h>
-#include <data/variable.h>
-#include <language/command.h>
-#include <language/dictionary/split-file.h>
-#include <language/lexer/lexer.h>
-#include <libpspp/array.h>
-#include <libpspp/assertion.h>
-#include <libpspp/compiler.h>
-#include <libpspp/hash.h>
-#include <libpspp/message.h>
-#include <libpspp/misc.h>
-#include <libpspp/str.h>
-#include <libpspp/taint.h>
-#include <math/group-proc.h>
-#include <math/levene.h>
-#include <output/manager.h>
-#include <output/table.h>
-#include <data/format.h>
-
-#include "xalloc.h"
-#include "xmemdup0.h"
+#include "data/case.h"
+#include "data/casegrouper.h"
+#include "data/casereader.h"
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/format.h"
+#include "data/value-labels.h"
+#include "data/variable.h"
+#include "language/command.h"
+#include "language/dictionary/split-file.h"
+#include "language/lexer/lexer.h"
+#include "language/lexer/value-parser.h"
+#include "libpspp/array.h"
+#include "libpspp/assertion.h"
+#include "libpspp/compiler.h"
+#include "libpspp/hash.h"
+#include "libpspp/message.h"
+#include "libpspp/misc.h"
+#include "libpspp/str.h"
+#include "libpspp/taint.h"
+#include "math/correlation.h"
+#include "math/group-proc.h"
+#include "math/levene.h"
+#include "output/tab.h"
+
+#include "gl/minmax.h"
+#include "gl/xalloc.h"
+#include "gl/xmemdup0.h"
  
  #include "gettext.h"
  #define _(msgid) gettext (msgid)
@@ -126,8 +128,6 @@ struct t_test_proc
      union value g_value[2];     /* CMP_EQ only: Per-group indep var values. */
    };
  
-static int parse_value (struct lexer *, union value *, int width);
-
  /* Statistics Summary Box */
  struct ssbox
    {
@@ -164,6 +164,8 @@ static int compare_group_binary (const struct group_statistics *a,
  static unsigned hash_group_binary (const struct group_statistics *g,
                                    const struct t_test_proc *p);
  
+static void t_test_proc_destroy (struct t_test_proc *proc);
+
  int
  cmd_t_test (struct lexer *lexer, struct dataset *ds)
  {
@@ -189,7 +191,7 @@ cmd_t_test (struct lexer *lexer, struct dataset *ds)
      {
        msg (SE, _("Exactly one of TESTVAL, GROUPS and PAIRS subcommands "
                   "must be specified."));
-      goto done;
+      goto error;
      }
  
    proc.mode = (cmd.sbc_testval ? T_1_SAMPLE
@@ -209,7 +211,7 @@ cmd_t_test (struct lexer *lexer, struct dataset *ds)
        if (cmd.sbc_variables)
         {
           msg (SE, _("VARIABLES subcommand may not be used with PAIRS."));
-          goto done;
+          goto error;
         }
  
        /* Fill proc.vars with the unique variables from pairs. */
@@ -228,7 +230,7 @@ cmd_t_test (struct lexer *lexer, struct dataset *ds)
        if (!cmd.n_variables)
          {
            msg (SE, _("One or more VARIABLES must be specified."));
-          goto done;
+          goto error;
          }
        proc.n_vars = cmd.n_variables;
        proc.vars = cmd.v_variables;
@@ -240,31 +242,33 @@ cmd_t_test (struct lexer *lexer, struct dataset *ds)
    while (casegrouper_get_next_group (grouper, &group))
      calculate (&proc, group, ds);
    ok = casegrouper_destroy (grouper);
+
+  /* Free 'proc' then commit the procedure.  Must happen in this order because
+     if proc.indep_var was created by a temporary transformation then
+     committing will destroy it.  */
+  t_test_proc_destroy (&proc);
    ok = proc_commit (ds) && ok;
  
-  if (proc.mode == T_IND_SAMPLES)
-    {
-      int v;
-      /* Destroy any group statistics we created */
-      for (v = 0; v < proc.n_vars; v++)
-       {
-         struct group_proc *grpp = group_proc_get (proc.vars[v]);
-         hsh_destroy (grpp->group_hash);
-       }
-    }
+  return ok ? CMD_SUCCESS : CMD_FAILURE;
  
-done:
+error:
    free_t_test (&cmd);
  parse_failed:
-  if (proc.indep_var != NULL)
+  t_test_proc_destroy (&proc);
+  return CMD_FAILURE;
+}
+
+/* Releases the resources owned by PROC.  The two group values are destroyed
+   only when an independent variable is set, because that variable's width
+   is what value_destroy needs.  The variable and pair arrays are then
+   freed.  PROC itself is not freed. */
+static void
+t_test_proc_destroy (struct t_test_proc *proc)
+{
+  if (proc->indep_var != NULL)
      {
-      int width = var_get_width (proc.indep_var);
-      value_destroy (&proc.g_value[0], width);
-      value_destroy (&proc.g_value[1], width);
+      int width = var_get_width (proc->indep_var);
+      value_destroy (&proc->g_value[0], width);
+      value_destroy (&proc->g_value[1], width);
      }
-  free (proc.vars);
-  free (proc.pairs);
-  return ok ? CMD_SUCCESS : CMD_FAILURE;
+  free (proc->vars);
+  free (proc->pairs);
  }
  
  static int
@@ -275,7 +279,7 @@ tts_custom_groups (struct lexer *lexer, struct dataset *ds,
    int n_values;
    int width;
  
-  lex_match (lexer, '=');
+  lex_match (lexer, T_EQUALS);
  
    proc->indep_var = parse_variable (lexer, dataset_dict (ds));
    if (proc->indep_var == NULL)
@@ -287,19 +291,19 @@ tts_custom_groups (struct lexer *lexer, struct dataset *ds,
    value_init (&proc->g_value[0], width);
    value_init (&proc->g_value[1], width);
  
-  if (!lex_match (lexer, '('))
+  if (!lex_match (lexer, T_LPAREN))
      n_values = 0;
    else
      {
-      if (!parse_value (lexer, &proc->g_value[0], width))
+      if (!parse_value (lexer, &proc->g_value[0], proc->indep_var))
          return 0;
-      lex_match (lexer, ',');
-      if (lex_match (lexer, ')'))
+      lex_match (lexer, T_COMMA);
+      if (lex_match (lexer, T_RPAREN))
          n_values = 1;
        else
          {
-          if (!parse_value (lexer, &proc->g_value[1], width)
-              || !lex_force_match (lexer, ')'))
+          if (!parse_value (lexer, &proc->g_value[1], proc->indep_var)
+              || !lex_force_match (lexer, T_RPAREN))
              return 0;
            n_values = 2;
          }
@@ -355,7 +359,7 @@ tts_custom_pairs (struct lexer *lexer, struct dataset *ds,
    size_t n_total_pairs;
    size_t i, j;
  
-  lex_match (lexer, '=');
+  lex_match (lexer, T_EQUALS);
  
    if (!parse_variables_const (lexer, dataset_dict (ds), &vars1, &n_vars1,
                                PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH))
@@ -370,9 +374,9 @@ tts_custom_pairs (struct lexer *lexer, struct dataset *ds,
            return 0;
          }
  
-      if (lex_match (lexer, '(')
+      if (lex_match (lexer, T_LPAREN)
            && lex_match_id (lexer, "PAIRED")
-          && lex_match (lexer, ')'))
+          && lex_match (lexer, T_RPAREN))
          {
            paired = true;
            if (n_vars1 != n_vars2)
@@ -421,29 +425,6 @@ tts_custom_pairs (struct lexer *lexer, struct dataset *ds,
    free (vars2);
    return 1;
  }
-
-/* Parses the current token (numeric or string, depending on type)
-   value v and returns success. */
-static int
-parse_value (struct lexer *lexer, union value *v, int width)
-{
-  if (width == 0)
-    {
-      if (!lex_force_num (lexer))
-       return 0;
-      v->f = lex_tokval (lexer);
-    }
-  else
-    {
-      if (!lex_force_string (lexer))
-       return 0;
-      value_copy_str_rpad (v, width, ds_cstr (lex_tokstr (lexer)), ' ');
-    }
-
-  lex_get (lexer);
-
-  return 1;
-}
  \f
  /* Implementation of the SSBOX object. */
  
@@ -499,13 +480,11 @@ static void
  ssbox_base_init (struct ssbox *this, int cols, int rows)
  {
    this->finalize = ssbox_base_finalize;
-  this->t = tab_create (cols, rows, 0);
+  this->t = tab_create (cols, rows);
  
-  tab_columns (this->t, SOM_COL_DOWN, 1);
    tab_headers (this->t, 0, 0, 1, 0);
    tab_box (this->t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1);
    tab_hline (this->t, TAL_2, 0, cols- 1, 1);
-  tab_dim (this->t, tab_natural_dimensions, NULL);
  }
  \f
  /* ssbox implementations. */
@@ -530,7 +509,7 @@ ssbox_one_sample_init (struct ssbox *this, struct t_test_proc *proc)
    tab_text (this->t, 1, 0, TAB_CENTER | TAT_TITLE, _("N"));
    tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("Mean"));
    tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
-  tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("SE. Mean"));
+  tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean"));
  }
  
  /* Initialize the independent samples ssbox */
@@ -550,7 +529,7 @@ ssbox_independent_samples_init (struct ssbox *this, struct t_test_proc *proc)
    tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("N"));
    tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("Mean"));
    tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
-  tab_text (this->t, 5, 0, TAB_CENTER | TAT_TITLE, _("SE. Mean"));
+  tab_text (this->t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean"));
  }
  
  /* Populate the ssbox for independent samples */
@@ -601,10 +580,10 @@ ssbox_independent_samples_populate (struct ssbox *ssb,
  
        tab_text (ssb->t, 0, i * 2 + 1, TAB_LEFT,
                  var_get_name (proc->vars[i]));
-      tab_text (ssb->t, 1, i * 2 + 1, TAB_LEFT | TAT_PRINTF,
-                "%s%s", prefix[0], val_lab[0]);
-      tab_text (ssb->t, 1, i * 2 + 1+ 1, TAB_LEFT | TAT_PRINTF,
-                "%s%s", prefix[1], val_lab[1]);
+      tab_text_format (ssb->t, 1, i * 2 + 1, TAB_LEFT,
+                       "%s%s", prefix[0], val_lab[0]);
+      tab_text_format (ssb->t, 1, i * 2 + 1+ 1, TAB_LEFT,
+                       "%s%s", prefix[1], val_lab[1]);
  
        /* Fill in the group statistics */
        for (count = 0; count < 2; count++)
@@ -649,7 +628,7 @@ ssbox_paired_init (struct ssbox *this, struct t_test_proc *proc)
    tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("Mean"));
    tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("N"));
    tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
-  tab_text (this->t, 5, 0, TAB_CENTER | TAT_TITLE, _("SE. Mean"));
+  tab_text (this->t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean"));
  }
  
  /* Populate the ssbox for paired values */
@@ -663,7 +642,7 @@ ssbox_paired_populate (struct ssbox *ssb, struct t_test_proc *proc)
        struct pair *p = &proc->pairs[i];
        int j;
  
-      tab_text (ssb->t, 0, i * 2 + 1, TAB_LEFT | TAT_PRINTF, _("Pair %d"), i);
+      tab_text_format (ssb->t, 0, i * 2 + 1, TAB_LEFT, _("Pair %d"), i);
        for (j=0; j < 2; j++)
         {
           /* Titles */
@@ -781,9 +760,9 @@ trbox_independent_samples_init (struct trbox *self,
    tab_text (self->t, 9, 2, TAB_CENTER | TAT_TITLE, _("Lower"));
    tab_text (self->t, 10, 2, TAB_CENTER | TAT_TITLE, _("Upper"));
  
-  tab_joint_text (self->t, 9, 1, 10, 1, TAB_CENTER | TAT_PRINTF,
-                  _("%g%% Confidence Interval of the Difference"),
-                  proc->criteria * 100.0);
+  tab_joint_text_format (self->t, 9, 1, 10, 1, TAB_CENTER,
+                         _("%g%% Confidence Interval of the Difference"),
+                         proc->criteria * 100.0);
  }
  
  /* Populate the independent samples trbox */
@@ -935,9 +914,9 @@ trbox_paired_init (struct trbox *self, struct t_test_proc *proc)
    tab_hline (self->t, TAL_1, 5, 6, 2);
    tab_vline (self->t, TAL_GAP, 6, 0, 1);
  
-  tab_joint_text (self->t, 5, 1, 6, 1, TAB_CENTER | TAT_PRINTF,
-                  _("%g%% Confidence Interval of the Difference"),
-                  proc->criteria*100.0);
+  tab_joint_text_format (self->t, 5, 1, 6, 1, TAB_CENTER,
+                         _("%g%% Confidence Interval of the Difference"),
+                         proc->criteria*100.0);
  
    tab_text (self->t, 2, 2, TAB_CENTER | TAT_TITLE, _("Mean"));
    tab_text (self->t, 3, 2, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
@@ -966,10 +945,10 @@ trbox_paired_populate (struct trbox *trb,
        double t;
        double df = n - 1;
  
-      tab_text (trb->t, 0, i + 3, TAB_LEFT | TAT_PRINTF, _("Pair %d"), i);
-      tab_text (trb->t, 1, i + 3, TAB_LEFT | TAT_PRINTF, "%s - %s",
-               var_get_name (pair->v[0]),
-                var_get_name (pair->v[1]));
+      tab_text_format (trb->t, 0, i + 3, TAB_LEFT, _("Pair %d"), i);
+      tab_text_format (trb->t, 1, i + 3, TAB_LEFT, "%s - %s",
+                       var_get_name (pair->v[0]),
+                       var_get_name (pair->v[1]));
        tab_double (trb->t, 2, i + 3, TAB_RIGHT, pair->mean_diff, NULL);
        tab_double (trb->t, 3, i + 3, TAB_RIGHT, pair->std_dev_diff, NULL);
  
@@ -998,8 +977,8 @@ trbox_paired_populate (struct trbox *trb,
        /* Degrees of freedom */
        tab_double (trb->t, 8, i + 3, TAB_RIGHT, df, &proc->weight_format);
  
-      p = gsl_cdf_tdist_P (t, df);
-      q = gsl_cdf_tdist_P (t, df);
+      p = gsl_cdf_tdist_P (t,df);
+      q = gsl_cdf_tdist_Q (t,df);
  
        tab_double (trb->t, 9, i + 3, TAB_RIGHT, 2.0 * (t > 0 ? q : p), NULL);
      }
@@ -1019,15 +998,15 @@ trbox_one_sample_init (struct trbox *self, struct t_test_proc *proc)
    tab_hline (self->t, TAL_1, 1, hsize - 1, 1);
    tab_vline (self->t, TAL_2, 1, 0, vsize - 1);
  
-  tab_joint_text (self->t, 1, 0, hsize - 1, 0, TAB_CENTER | TAT_PRINTF,
-                  _("Test Value = %f"), proc->testval);
+  tab_joint_text_format (self->t, 1, 0, hsize - 1, 0, TAB_CENTER,
+                         _("Test Value = %f"), proc->testval);
  
    tab_box (self->t, -1, -1, -1, TAL_1, 1, 1, hsize - 1, vsize - 1);
  
  
-  tab_joint_text (self->t, 5, 1, 6, 1, TAB_CENTER  | TAT_PRINTF,
-                  _("%g%% Confidence Interval of the Difference"),
-                  proc->criteria * 100.0);
+  tab_joint_text_format (self->t, 5, 1, 6, 1, TAB_CENTER,
+                         _("%g%% Confidence Interval of the Difference"),
+                         proc->criteria * 100.0);
  
    tab_vline (self->t, TAL_GAP, 6, 1, 1);
    tab_hline (self->t, TAL_1, 5, 6, 2);
@@ -1091,11 +1070,10 @@ trbox_base_init (struct trbox *self, size_t data_rows, int cols)
    const size_t rows = 3 + data_rows;
  
    self->finalize = trbox_base_finalize;
-  self->t = tab_create (cols, rows, 0);
+  self->t = tab_create (cols, rows);
    tab_headers (self->t, 0, 0, 3, 0);
    tab_box (self->t, TAL_2, TAL_2, TAL_0, TAL_0, 0, 0, cols - 1, rows - 1);
    tab_hline (self->t, TAL_2, 0, cols- 1, 3);
-  tab_dim (self->t, tab_natural_dimensions, NULL);
  }
  
  /* Base finalizer for the trbox */
@@ -1115,14 +1093,12 @@ pscbox (struct t_test_proc *proc)
  
    struct tab_table *table;
  
-  table = tab_create (cols, rows, 0);
+  table = tab_create (cols, rows);
  
-  tab_columns (table, SOM_COL_DOWN, 1);
    tab_headers (table, 0, 0, 1, 0);
    tab_box (table, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1);
    tab_hline (table, TAL_2, 0, cols - 1, 1);
    tab_vline (table, TAL_2, 2, 0, rows - 1);
-  tab_dim (table, tab_natural_dimensions, NULL);
    tab_title (table, _("Paired Samples Correlations"));
  
    /* column headings */
@@ -1133,27 +1109,21 @@ pscbox (struct t_test_proc *proc)
    for (i = 0; i < proc->n_pairs; i++)
      {
        struct pair *pair = &proc->pairs[i];
-      double p, q;
-      double df = pair->n -2;
-      double correlation_t = (pair->correlation * sqrt (df) /
-                              sqrt (1 - pow2 (pair->correlation)));
  
        /* row headings */
-      tab_text (table, 0, i + 1, TAB_LEFT | TAT_TITLE | TAT_PRINTF,
-                _("Pair %d"), i);
-      tab_text (table, 1, i + 1, TAB_LEFT | TAT_TITLE | TAT_PRINTF,
-                _("%s & %s"),
-                var_get_name (pair->v[0]),
-                var_get_name (pair->v[1]));
+      tab_text_format (table, 0, i + 1, TAB_LEFT | TAT_TITLE,
+                       _("Pair %d"), i);
+      tab_text_format (table, 1, i + 1, TAB_LEFT | TAT_TITLE,
+                       _("%s & %s"),
+                       var_get_name (pair->v[0]),
+                       var_get_name (pair->v[1]));
  
        /* row data */
        tab_double (table, 2, i + 1, TAB_RIGHT, pair->n, &proc->weight_format);
        tab_double (table, 3, i + 1, TAB_RIGHT, pair->correlation, NULL);
  
-      p = gsl_cdf_tdist_P (correlation_t, df);
-      q = gsl_cdf_tdist_Q (correlation_t, df);
-      tab_double (table, 4, i + 1, TAB_RIGHT,
-                 2.0 * (correlation_t > 0 ? q : p), NULL);
+      tab_double (table, 4, i + 1, TAB_RIGHT, 
+                 2.0 * significance_of_correlation (pair->correlation, pair->n), NULL);
      }
  
    tab_submit (table);
@@ -1435,6 +1405,23 @@ group_calc (const struct dictionary *dict, struct t_test_proc *proc,
    return 0;
  }
  
+
+/* Case predicate.  AUX must point to the struct t_test_proc in use.
+   Returns true when case C's value of the independent variable equals
+   either of the two group values stored in the procedure, false
+   otherwise. */
+static bool
+is_criteria_value (const struct ccase *c, void *aux)
+{
+  const struct t_test_proc *proc = aux;
+  const union value *indep_value = case_data (c, proc->indep_var);
+  const int width = var_get_width (proc->indep_var);
+
+  /* Short-circuits exactly like two sequential tests: the second group
+     value is compared only when the first does not match. */
+  return (value_equal (indep_value, &proc->g_value[0], width)
+          || value_equal (indep_value, &proc->g_value[1], width));
+}
+
  static void
  calculate (struct t_test_proc *proc,
             struct casereader *input, const struct dataset *ds)
@@ -1444,7 +1431,7 @@ calculate (struct t_test_proc *proc,
    struct trbox test_results_box;
    struct taint *taint;
    struct ccase *c;
-
+  int i;
    c = casereader_peek (input, 0);
    if (c == NULL)
      {
@@ -1473,8 +1460,20 @@ calculate (struct t_test_proc *proc,
        break;
      case T_IND_SAMPLES:
        group_calc (dict, proc, casereader_clone (input));
-      levene (dict, input, proc->indep_var, proc->n_vars, proc->vars,
-              proc->exclude);
+
+      for (i = 0; i < proc->n_vars; ++i)
+       {
+         struct group_proc *grp_data = group_proc_get (proc->vars[i]);
+
+         if ( proc->criterion == CMP_EQ )
+           {
+             input = casereader_create_filter_func (input, is_criteria_value, NULL,
+                                                    proc, 
+                                                    NULL);
+           }
+
+         grp_data->levene = levene ( input, proc->indep_var, proc->vars[i], dict_get_weight (dict), proc->exclude);
+       }
        break;
      default:
        NOT_REACHED ();