pintos-os.org Git - pspp/blob - src/language/stats/t-test-indep.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "t-test.h"
  20
  21 #include <gsl/gsl_cdf.h>
  22 #include <math.h>
  23 #include "libpspp/misc.h"
  24
  25 #include "libpspp/str.h"
  26 #include "data/casereader.h"
  27 #include "data/dictionary.h"
  28 #include "data/format.h"
  29 #include "data/variable.h"
  30
  31 #include "math/moments.h"
  32 #include "math/levene.h"
  33
  34 #include <output/tab.h>
  35 #include "gettext.h"
  36 #define _(msgid) gettext (msgid)
  37
  38
  39 struct indep_samples
  40 {
  41   const struct variable *gvar;
  42   bool cut;
  43   const union value *gval0;
  44   const union value *gval1;
  45 };
  46
  47 struct pair_stats
  48 {
  49   struct moments *mom[2];
  50   double lev ;
  51   struct levene *nl;
  52 };
  53
  54
  55 static void indep_summary (const struct tt *tt, struct indep_samples *is, const struct pair_stats *ps);
  56 static void indep_test (const struct tt *tt, const struct pair_stats *ps);
  57
  58 static int
  59 which_group (const union value *v, const struct indep_samples *is)
  60 {
  61   int width = var_get_width (is->gvar);
  62   int cmp = value_compare_3way (v, is->gval0, width);
  63   if ( is->cut )
  64     return  (cmp < 0);
  65
  66   if (cmp == 0)
  67     return 0;
  68
  69   if (0 == value_compare_3way (v, is->gval1, width))
  70     return 1;
  71
  72   return -1;
  73 }
  74
  75 void
  76 indep_run (struct tt *tt, const struct variable *gvar,
  77            bool cut,
  78            const union value *gval0, const union value *gval1,
  79            struct casereader *reader)
  80 {
  81   struct indep_samples is;
  82   struct ccase *c;
  83   struct casereader *r;
  84
  85   struct pair_stats *ps = xcalloc (tt->n_vars, sizeof *ps);
  86
  87   int v;
  88
  89   for (v = 0; v < tt->n_vars; ++v)
  90     {
  91       ps[v].mom[0] = moments_create (MOMENT_VARIANCE);
  92       ps[v].mom[1] = moments_create (MOMENT_VARIANCE);
  93       ps[v].nl = levene_create (var_get_width (gvar), cut ? gval0: NULL);
  94     }
  95
  96   is.gvar = gvar;
  97   is.gval0 = gval0;
  98   is.gval1 = gval1;
  99   is.cut = cut;
 100
 101   r = casereader_clone (reader);
 102   for ( ; (c = casereader_read (r) ); case_unref (c))
 103     {
 104       double w = dict_get_case_weight (tt->dict, c, NULL);
 105
 106       const union value *gv = case_data (c, gvar);
 107
 108       int grp = which_group (gv, &is);
 109       if ( grp < 0)
 110         continue;
 111
 112       for (v = 0; v < tt->n_vars; ++v)
 113         {
 114           const union value *val = case_data (c, tt->vars[v]);
 115           if (var_is_value_missing (tt->vars[v], val, tt->exclude))
 116             continue;
 117
 118           moments_pass_one (ps[v].mom[grp], val->f, w);
 119           levene_pass_one (ps[v].nl, val->f, w, gv);
 120         }
 121     }
 122   casereader_destroy (r);
 123
 124   r = casereader_clone (reader);
 125   for ( ; (c = casereader_read (r) ); case_unref (c))
 126     {
 127       double w = dict_get_case_weight (tt->dict, c, NULL);
 128
 129       const union value *gv = case_data (c, gvar);
 130
 131       int grp = which_group (gv, &is);
 132       if ( grp < 0)
 133         continue;
 134
 135       for (v = 0; v < tt->n_vars; ++v)
 136         {
 137           const union value *val = case_data (c, tt->vars[v]);
 138           if (var_is_value_missing (tt->vars[v], val, tt->exclude))
 139             continue;
 140
 141           moments_pass_two (ps[v].mom[grp], val->f, w);
 142           levene_pass_two (ps[v].nl, val->f, w, gv);
 143         }
 144     }
 145   casereader_destroy (r);
 146
 147   r = reader;
 148   for ( ; (c = casereader_read (r) ); case_unref (c))
 149     {
 150       double w = dict_get_case_weight (tt->dict, c, NULL);
 151
 152       const union value *gv = case_data (c, gvar);
 153
 154       int grp = which_group (gv, &is);
 155       if ( grp < 0)
 156         continue;
 157
 158       for (v = 0; v < tt->n_vars; ++v)
 159         {
 160           const union value *val = case_data (c, tt->vars[v]);
 161           if (var_is_value_missing (tt->vars[v], val, tt->exclude))
 162             continue;
 163
 164           levene_pass_three (ps[v].nl, val->f, w, gv);
 165         }
 166     }
 167   casereader_destroy (r);
 168
 169
 170   for (v = 0; v < tt->n_vars; ++v)
 171     ps[v].lev = levene_calculate (ps[v].nl);
 172
 173   indep_summary (tt, &is, ps);
 174   indep_test (tt, ps);
 175
 176
 177   for (v = 0; v < tt->n_vars; ++v)
 178     {
 179       moments_destroy (ps[v].mom[0]);
 180       moments_destroy (ps[v].mom[1]);
 181       levene_destroy (ps[v].nl);
 182     }
 183   free (ps);
 184 }
 185
 186
 187 static void
 188 indep_summary (const struct tt *tt, struct indep_samples *is, const struct pair_stats *ps)
 189 {
 190   const struct fmt_spec *wfmt = tt->wv ? var_get_print_format (tt->wv) : & F_8_0;
 191
 192   int v;
 193   int cols = 6;
 194   const int heading_rows = 1;
 195   int rows = tt->n_vars * 2 + heading_rows;
 196
 197   struct string vallab0 ;
 198   struct string vallab1 ;
 199   struct tab_table *t = tab_create (cols, rows);
 200   tab_set_format (t, RC_WEIGHT, wfmt);
 201   ds_init_empty (&vallab0);
 202   ds_init_empty (&vallab1);
 203
 204   tab_headers (t, 0, 0, 1, 0);
 205   tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1);
 206   tab_hline (t, TAL_2, 0, cols - 1, 1);
 207
 208   tab_vline (t, TAL_GAP, 1, 0, rows - 1);
 209   tab_title (t, _("Group Statistics"));
 210   tab_text  (t, 1, 0, TAB_CENTER | TAT_TITLE, var_to_string (is->gvar));
 211   tab_text  (t, 2, 0, TAB_CENTER | TAT_TITLE, _("N"));
 212   tab_text  (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Mean"));
 213   tab_text  (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation"));
 214   tab_text  (t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean"));
 215
 216   if (is->cut)
 217     {
 218       ds_put_cstr (&vallab0, "≥");
 219       ds_put_cstr (&vallab1, "<");
 220
 221       var_append_value_name (is->gvar, is->gval0, &vallab0);
 222       var_append_value_name (is->gvar, is->gval0, &vallab1);
 223     }
 224   else
 225     {
 226       var_append_value_name (is->gvar, is->gval0, &vallab0);
 227       var_append_value_name (is->gvar, is->gval1, &vallab1);
 228     }
 229
 230   tab_vline (t, TAL_1, 1, heading_rows,  rows - 1);
 231
 232   for (v = 0; v < tt->n_vars; ++v)
 233     {
 234       int i;
 235       const struct variable *var = tt->vars[v];
 236
 237       tab_text (t, 0, v * 2 + heading_rows, TAB_LEFT,
 238                 var_to_string (var));
 239
 240       tab_text (t, 1, v * 2 + heading_rows, TAB_LEFT,
 241                        ds_cstr (&vallab0));
 242
 243       tab_text (t, 1, v * 2 + 1 + heading_rows, TAB_LEFT,
 244                        ds_cstr (&vallab1));
 245
 246       for (i = 0 ; i < 2; ++i)
 247         {
 248           double cc, mean, sigma;
 249           moments_calculate (ps[v].mom[i], &cc, &mean, &sigma, NULL, NULL);
 250
 251           tab_double (t, 2, v * 2 + i + heading_rows, TAB_RIGHT, cc, NULL, RC_WEIGHT);
 252           tab_double (t, 3, v * 2 + i + heading_rows, TAB_RIGHT, mean, NULL, RC_OTHER);
 253           tab_double (t, 4, v * 2 + i + heading_rows, TAB_RIGHT, sqrt (sigma), NULL, RC_OTHER);
 254           tab_double (t, 5, v * 2 + i + heading_rows, TAB_RIGHT, sqrt (sigma / cc), NULL, RC_OTHER);
 255         }
 256     }
 257
 258   tab_submit (t);
 259
 260   ds_destroy (&vallab0);
 261   ds_destroy (&vallab1);
 262 }
 263
 264
 265 static void
 266 indep_test (const struct tt *tt, const struct pair_stats *ps)
 267 {
 268   int v;
 269   const int heading_rows = 3;
 270   const int rows= tt->n_vars * 2 + heading_rows;
 271
 272   const size_t cols = 11;
 273
 274   struct tab_table *t = tab_create (cols, rows);
 275   tab_headers (t, 0, 0, 3, 0);
 276   tab_box (t, TAL_2, TAL_2, TAL_0, TAL_0, 0, 0, cols - 1, rows - 1);
 277   tab_hline (t, TAL_2, 0, cols - 1, 3);
 278
 279   tab_title (t, _("Independent Samples Test"));
 280
 281   tab_hline (t, TAL_1, 2, cols - 1, 1);
 282   tab_vline (t, TAL_2, 2, 0, rows - 1);
 283   tab_vline (t, TAL_1, 4, 0, rows - 1);
 284   tab_box (t, -1, -1, -1, TAL_1, 2, 1, cols - 2, rows - 1);
 285   tab_hline (t, TAL_1, cols - 2, cols - 1, 2);
 286   tab_box (t, -1, -1, -1, TAL_1, cols - 2, 2, cols - 1, rows - 1);
 287   tab_joint_text (t, 2, 0, 3, 0, TAB_CENTER, _("Levene's Test for Equality of Variances"));
 288   tab_joint_text (t, 4, 0, cols - 1, 0, TAB_CENTER, _("t-test for Equality of Means"));
 289
 290   tab_text (t, 2, 2, TAB_CENTER | TAT_TITLE, _("F"));
 291   tab_text (t, 3, 2, TAB_CENTER | TAT_TITLE, _("Sig."));
 292   tab_text (t, 4, 2, TAB_CENTER | TAT_TITLE, _("t"));
 293   tab_text (t, 5, 2, TAB_CENTER | TAT_TITLE, _("df"));
 294   tab_text (t, 6, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)"));
 295   tab_text (t, 7, 2, TAB_CENTER | TAT_TITLE, _("Mean Difference"));
 296   tab_text (t, 8, 2, TAB_CENTER | TAT_TITLE, _("Std. Error Difference"));
 297   tab_text (t, 9, 2, TAB_CENTER | TAT_TITLE, _("Lower"));
 298   tab_text (t, 10, 2, TAB_CENTER | TAT_TITLE, _("Upper"));
 299
 300   tab_joint_text_format (t, 9, 1, 10, 1, TAB_CENTER,
 301                          _("%g%% Confidence Interval of the Difference"),
 302                          tt->confidence * 100.0);
 303
 304   tab_vline (t, TAL_1, 1, heading_rows,  rows - 1);
 305
 306   for (v = 0; v < tt->n_vars; ++v)
 307   {
 308     double df, pooled_variance, mean_diff, tval;
 309     double se2, std_err_diff;
 310     double p, q;
 311     double cc0, mean0, sigma0;
 312     double cc1, mean1, sigma1;
 313     moments_calculate (ps[v].mom[0], &cc0, &mean0, &sigma0, NULL, NULL);
 314     moments_calculate (ps[v].mom[1], &cc1, &mean1, &sigma1, NULL, NULL);
 315
 316     tab_text (t, 0, v * 2 + heading_rows, TAB_LEFT, var_to_string (tt->vars[v]));
 317     tab_text (t, 1, v * 2 + heading_rows, TAB_LEFT, _("Equal variances assumed"));
 318
 319     df = cc0 + cc1 - 2.0;
 320     tab_double (t, 5, v * 2 + heading_rows, TAB_RIGHT, df, NULL, RC_OTHER);
 321
 322     pooled_variance = ((cc0 - 1)* sigma0 + (cc1 - 1) * sigma1) / df ;
 323
 324     tval = (mean0 - mean1) / sqrt (pooled_variance);
 325     tval /= sqrt ((cc0 + cc1) / (cc0 * cc1));
 326
 327     tab_double (t, 4, v * 2 + heading_rows, TAB_RIGHT, tval, NULL, RC_OTHER);
 328
 329     p = gsl_cdf_tdist_P (tval, df);
 330     q = gsl_cdf_tdist_Q (tval, df);
 331
 332     mean_diff = mean0 - mean1;
 333
 334     tab_double (t, 6, v * 2 + heading_rows, TAB_RIGHT, 2.0 * (tval > 0 ? q : p),   NULL, RC_PVALUE);
 335     tab_double (t, 7, v * 2 + heading_rows, TAB_RIGHT, mean_diff, NULL, RC_OTHER);
 336
 337     std_err_diff = sqrt (pooled_variance * (1.0/cc0 + 1.0/cc1));
 338     tab_double (t, 8, v * 2 + heading_rows, TAB_RIGHT, std_err_diff, NULL, RC_OTHER);
 339
 340
 341     /* Now work out the confidence interval */
 342     q = (1 - tt->confidence)/2.0;  /* 2-tailed test */
 343
 344     tval = gsl_cdf_tdist_Qinv (q, df);
 345     tab_double (t,  9, v * 2 + heading_rows, TAB_RIGHT, mean_diff - tval * std_err_diff, NULL, RC_OTHER);
 346     tab_double (t, 10, v * 2 + heading_rows, TAB_RIGHT, mean_diff + tval * std_err_diff, NULL, RC_OTHER);
 347
 348     /* Equal variances not assumed */
 349     tab_text (t, 1, v * 2 + heading_rows + 1,  TAB_LEFT, _("Equal variances not assumed"));
 350     std_err_diff = sqrt ((sigma0 / cc0) + (sigma1 / cc1));
 351
 352     se2 = sigma0 / cc0 + sigma1 / cc1;
 353     tval = mean_diff / sqrt (se2);
 354     tab_double (t, 4, v * 2 + heading_rows + 1, TAB_RIGHT, tval, NULL, RC_OTHER);
 355
 356     {
 357       double p, q;
 358       const double s0 = sigma0 / (cc0);
 359       const double s1 = sigma1 / (cc1);
 360       double df = pow2 (s0 + s1) ;
 361       df /= pow2 (s0) / (cc0 - 1) + pow2 (s1) / (cc1 - 1);
 362
 363       tab_double (t, 5, v * 2 + heading_rows + 1, TAB_RIGHT, df, NULL, RC_OTHER);
 364
 365       p = gsl_cdf_tdist_P (tval, df);
 366       q = gsl_cdf_tdist_Q (tval, df);
 367
 368       tab_double (t, 6, v * 2 + heading_rows + 1, TAB_RIGHT, 2.0 * (tval > 0 ? q : p), NULL, RC_PVALUE);
 369
 370       /* Now work out the confidence interval */
 371       q = (1 - tt->confidence) / 2.0;  /* 2-tailed test */
 372
 373       tval = gsl_cdf_tdist_Qinv (q, df);
 374     }
 375
 376     tab_double (t, 7, v * 2 + heading_rows + 1, TAB_RIGHT, mean_diff, NULL, RC_OTHER);
 377     tab_double (t, 8, v * 2 + heading_rows + 1, TAB_RIGHT, std_err_diff, NULL, RC_OTHER);
 378     tab_double (t, 9, v * 2 + heading_rows + 1, TAB_RIGHT,  mean_diff - tval * std_err_diff, NULL, RC_OTHER);
 379     tab_double (t, 10, v * 2 + heading_rows + 1, TAB_RIGHT, mean_diff + tval * std_err_diff, NULL, RC_OTHER);
 380
 381     tab_double (t, 2, v * 2 + heading_rows, TAB_CENTER, ps[v].lev, NULL, RC_OTHER);
 382
 383
 384     {
 385       /* Now work out the significance of the Levene test */
 386       double df1 = 1;
 387       double df2 = cc0 + cc1 - 2;
 388       double q = gsl_cdf_fdist_Q (ps[v].lev, df1, df2);
 389       tab_double (t, 3, v * 2 + heading_rows, TAB_CENTER, q, NULL, RC_PVALUE);
 390     }
 391   }
 392
 393   tab_submit (t);
 394 }