Implemented the POSTHOC subcommand for the ONEWAY command.

author John Darrington <john@darrington.wattle.id.au>

Fri, 1 Jul 2011 18:04:58 +0000 (20:04 +0200)

committer John Darrington <john@darrington.wattle.id.au>

Fri, 1 Jul 2011 18:04:58 +0000 (20:04 +0200)
author John Darrington <john@darrington.wattle.id.au>
Fri, 1 Jul 2011 18:04:58 +0000 (20:04 +0200)
committer John Darrington <john@darrington.wattle.id.au>
Fri, 1 Jul 2011 18:04:58 +0000 (20:04 +0200)
diff --git a/NEWS b/NEWS

index bd228d919aa7610b62ce9e7024d595a8228a7a81..5509d344af0f965803f8cff648136770887277a3 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -44,6 +44,8 @@ Changes from 0.6.2 to 0.7.8:
     - MISSING VALUES can now assign missing values to long string
       variables.
  
+   - ONEWAY: the POSTHOC subcommand is now implemented.
+
     - The following new subcommands to NPAR TESTS have been implemented:
       COCHRAN, FRIEDMAN, KENDALL, KRUSKAL-WALLIS, MANN-WHITNEY, MCNEMAR,
       SIGN, WILCOXON, and RUNS
diff --git a/doc/statistics.texi b/doc/statistics.texi

index 18fa79ac5209956065e069ddb7505f0537a4c8f2..599e7e42903bab70945735e8f455232af8230886 100644 (file)
--- a/doc/statistics.texi
+++ b/doc/statistics.texi
@@ -1101,7 +1101,7 @@ ONEWAY
          /MISSING=@{ANALYSIS,LISTWISE@} @{EXCLUDE,INCLUDE@}
          /CONTRAST= value1 [, value2] ... [,valueN]
          /STATISTICS=@{DESCRIPTIVES,HOMOGENEITY@}
-
+        /POSTHOC=@{BONFERRONI, GH, LSD, SCHEFFE, SIDAK, TUKEY, ALPHA ([value])@}
  @end display
  
  The @cmd{ONEWAY} procedure performs a one-way analysis of variance of
@@ -1145,6 +1145,29 @@ A setting of EXCLUDE means that variables whose values are
  user-missing are to be excluded from the analysis. A setting of
  INCLUDE means they are to be included.  The default is EXCLUDE.
  
+Using the @code{POSTHOC} subcommand you can perform multiple
+pairwise comparisons on the data. The following comparison methods
+are available:
+@itemize
+@item LSD
+Least Significant Difference.
+@item TUKEY
+Tukey Honestly Significant Difference.
+@item BONFERRONI
+Bonferroni test.
+@item SCHEFFE
+Scheff@'e's test.
+@item SIDAK
+Sidak test.
+@item GH
+The Games-Howell test.
+@end itemize
+
+@noindent
+The optional syntax @code{ALPHA(@var{value})} is used to indicate
+that @var{value} should be used as the
+confidence level for which the posthoc tests will be performed.
+The default is 0.05.
  
  @node RANK
  @comment  node-name,  next,  previous,  up
diff --git a/lib/automake.mk b/lib/automake.mk

index 9bde4afcf26d77b9950e960cdfd29063b7cbbbeb..3fb94bfe6fff0dc9fda19095aadbeeb0fe0cb878 100644 (file)
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -1,6 +1,7 @@
  ## Process this file with automake to produce Makefile.in  -*- makefile -*-
  
  include $(top_srcdir)/lib/linreg/automake.mk
+include $(top_srcdir)/lib/tukey/automake.mk
  
  if HAVE_GUI
  include $(top_srcdir)/lib/gtk-contrib/automake.mk
diff --git a/lib/tukey/README b/lib/tukey/README

new file mode 100644 (file)

index 0000000..e427872
--- /dev/null
+++ b/lib/tukey/README
@@ -0,0 +1 @@
+This is not part of the GNU PSPP program, but is used with GNU PSPP.
diff --git a/lib/tukey/automake.mk b/lib/tukey/automake.mk

new file mode 100644 (file)

index 0000000..85bcd62
--- /dev/null
+++ b/lib/tukey/automake.mk
@@ -0,0 +1,11 @@
+## Process this file with automake to produce Makefile.in  -*- makefile -*-
+
+noinst_LTLIBRARIES += lib/tukey/libtukey.la
+
+lib_tukey_libtukey_la_SOURCES = \
+       lib/tukey/ptukey.c \
+       lib/tukey/qtukey.c \
+       lib/tukey/tukey.h
+
+EXTRA_DIST += lib/tukey/README 
+
diff --git a/lib/tukey/ptukey.c b/lib/tukey/ptukey.c

new file mode 100644 (file)

index 0000000..c86ef23
--- /dev/null
+++ b/lib/tukey/ptukey.c
@@ -0,0 +1,486 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* This file is taken from the R project source code, and modified.
+   The original copyright notice is reproduced below: */
+
+/*
+ *  Mathlib : A C Library of Special Functions
+ *  Copyright (C) 1998       Ross Ihaka
+ *  Copyright (C) 2000--2007 The R Development Core Team
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, a copy is available at
+ *  http://www.r-project.org/Licenses/
+ *
+ *  SYNOPSIS
+ *
+ *    #include <Rmath.h>
+ *    double ptukey(q, rr, cc, df, lower_tail, log_p);
+ *
+ *  DESCRIPTION
+ *
+ *    Computes the probability that the maximum of rr studentized
+ *    ranges, each based on cc means and with df degrees of freedom
+ *    for the standard error, is less than q.
+ *
+ *    The algorithm is based on that of the reference.
+ *
+ *  REFERENCE
+ *
+ *    Copenhaver, Margaret Diponzio & Holland, Burt S.
+ *    Multiple comparisons of simple effects in
+ *    the two-way analysis of variance with fixed effects.
+ *    Journal of Statistical Computation and Simulation,
+ *    Vol.30, pp.1-15, 1988.
+ */
+
+ 
+
+#include <config.h>
+
+#include "libpspp/compiler.h"
+#include "tukey.h"
+
+#include <gsl/gsl_sf_gamma.h>
+#include <gsl/gsl_cdf.h>
+#include <assert.h>
+#include <math.h>
+
+#define R_D__0 (log_p ? ML_NEGINF : 0.)                /* 0 */
+#define R_D__1 (log_p ? 0. : 1.)                       /* 1 */
+#define R_DT_0 (lower_tail ? R_D__0 : R_D__1)          /* 0 */
+#define R_DT_1 (lower_tail ? R_D__1 : R_D__0)          /* 1 */
+
+#define R_D_val(x)     (log_p  ? log(x) : (x))         /*  x  in pF(x,..) */
+#define R_D_Clog(p)    (log_p  ? log1p(-(p)) : (0.5 - (p) + 0.5)) /* [log](1-p) */
+#define R_DT_val(x)    (lower_tail ? R_D_val(x)  : R_D_Clog(x))
+
+
+#define ME_PRECISION   8
+
+
+static inline double 
+pnorm(double x, double mu, double sigma, int lower_tail, int log_p)
+{
+  assert (lower_tail == 1);
+  assert (log_p == 0);
+  assert (sigma == 1.0);
+  
+  return gsl_cdf_gaussian_P (x - mu, sigma);
+}
+
+
+static double
+wprob (double w, double rr, double cc)
+{
+  const double M_1_SQRT_2PI = 1 / sqrt (2 * M_PI);
+
+
+/*  wprob() :
+
+       This function calculates probability integral of Hartley's
+       form of the range.
+
+       w     = value of range
+       rr    = no. of rows or groups
+       cc    = no. of columns or treatments
+       ir    = error flag = 1 if pr_w probability > 1
+       pr_w = returned probability integral from (0, w)
+
+       program will not terminate if ir is raised.
+
+       bb = upper limit of legendre integration
+       iMax = maximum acceptable value of integral
+       nleg = order of legendre quadrature
+       ihalf = int ((nleg + 1) / 2)
+       wlar = value of range above which wincr1 intervals are used to
+              calculate second part of integral,
+              else wincr2 intervals are used.
+       C1, C2, C3 = values which are used as cutoffs for terminating
+       or modifying a calculation.
+
+       M_1_SQRT_2PI = 1 / sqrt(2 * pi);  from abramowitz & stegun, p. 3.
+       M_SQRT2 = sqrt(2)
+       xleg = legendre 12-point nodes
+       aleg = legendre 12-point coefficients
+ */
+#define nleg   12
+#define ihalf  6
+
+  /* looks like this is suboptimal for double precision.
+     (see how C1-C3 are used) <MM>
+   */
+  /* const double iMax  = 1.; not used if = 1 */
+  static const double C1 = -30.;
+  static const double C2 = -50.;
+  static const double C3 = 60.;
+  static const double bb = 8.;
+  static const double wlar = 3.;
+  static const double wincr1 = 2.;
+  static const double wincr2 = 3.;
+  static const double xleg[ihalf] = {
+    0.981560634246719250690549090149,
+    0.904117256370474856678465866119,
+    0.769902674194304687036893833213,
+    0.587317954286617447296702418941,
+    0.367831498998180193752691536644,
+    0.125233408511468915472441369464
+  };
+  static const double aleg[ihalf] = {
+    0.047175336386511827194615961485,
+    0.106939325995318430960254718194,
+    0.160078328543346226334652529543,
+    0.203167426723065921749064455810,
+    0.233492536538354808760849898925,
+    0.249147045813402785000562436043
+  };
+  double a, ac, pr_w, b, binc, blb, c, cc1,
+    pminus, pplus, qexpo, qsqz, rinsum, wi, wincr, xx;
+  long double bub, einsum, elsum;
+  int j, jj;
+
+
+  qsqz = w * 0.5;
+
+  /* if w >= 16 then the integral lower bound (occurs for c=20) */
+  /* is 0.99999999999995 so return a value of 1. */
+
+  if (qsqz >= bb)
+    return 1.0;
+
+  /* find (f(w/2) - 1) ^ cc */
+  /* (first term in integral of hartley's form). */
+
+  pr_w = 2 * pnorm (qsqz, 0., 1., 1, 0) - 1.;  /* erf(qsqz / M_SQRT2) */
+  /* if pr_w ^ cc < 2e-22 then set pr_w = 0 */
+  if (pr_w >= exp (C2 / cc))
+    pr_w = pow (pr_w, cc);
+  else
+    pr_w = 0.0;
+
+  /* if w is large then the second component of the */
+  /* integral is small, so fewer intervals are needed. */
+
+  if (w > wlar)
+    wincr = wincr1;
+  else
+    wincr = wincr2;
+
+  /* find the integral of second term of hartley's form */
+  /* for the integral of the range for equal-length */
+  /* intervals using legendre quadrature.  limits of */
+  /* integration are from (w/2, 8).  two or three */
+  /* equal-length intervals are used. */
+
+  /* blb and bub are lower and upper limits of integration. */
+
+  blb = qsqz;
+  binc = (bb - qsqz) / wincr;
+  bub = blb + binc;
+  einsum = 0.0;
+
+  /* integrate over each interval */
+
+  cc1 = cc - 1.0;
+  for (wi = 1; wi <= wincr; wi++)
+    {
+      elsum = 0.0;
+      a = 0.5 * (bub + blb);
+
+      /* legendre quadrature with order = nleg */
+
+      b = 0.5 * (bub - blb);
+
+      for (jj = 1; jj <= nleg; jj++)
+       {
+         if (ihalf < jj)
+           {
+             j = (nleg - jj) + 1;
+             xx = xleg[j - 1];
+           }
+         else
+           {
+             j = jj;
+             xx = -xleg[j - 1];
+           }
+         c = b * xx;
+         ac = a + c;
+
+         /* if exp(-qexpo/2) < 9e-14, */
+         /* then doesn't contribute to integral */
+
+         qexpo = ac * ac;
+         if (qexpo > C3)
+           break;
+
+         pplus = 2 * pnorm (ac, 0., 1., 1, 0);
+         pminus = 2 * pnorm (ac, w, 1., 1, 0);
+
+         /* if rinsum ^ (cc-1) < 9e-14, */
+         /* then doesn't contribute to integral */
+
+         rinsum = (pplus * 0.5) - (pminus * 0.5);
+         if (rinsum >= exp (C1 / cc1))
+           {
+             rinsum =
+               (aleg[j - 1] * exp (-(0.5 * qexpo))) * pow (rinsum, cc1);
+             elsum += rinsum;
+           }
+       }
+      elsum *= (((2.0 * b) * cc) * M_1_SQRT_2PI);
+      einsum += elsum;
+      blb = bub;
+      bub += binc;
+    }
+
+  /* if pr_w ^ rr < 9e-14, then return 0 */
+  pr_w = einsum + pr_w;
+  if (pr_w <= exp (C1 / rr))
+    return 0.;
+
+  pr_w = pow (pr_w, rr);
+  if (pr_w >= 1.)              /* 1 was iMax was eps */
+    return 1.;
+  return pr_w;
+}                              /* wprob() */
+
+double
+ptukey (double q, double rr, double cc, double df, int lower_tail, int log_p)
+{
+  const double ML_NEGINF = -1.0 / 0.0;
+/*  function ptukey() [was qprob() ]:
+
+       q = value of studentized range
+       rr = no. of rows or groups
+       cc = no. of columns or treatments
+       df = degrees of freedom of error term
+       ir[0] = error flag = 1 if wprob probability > 1
+       ir[1] = error flag = 1 if qprob probability > 1
+
+       qprob = returned probability integral over [0, q]
+
+       The program will not terminate if ir[0] or ir[1] are raised.
+
+       All references in wprob to Abramowitz and Stegun
+       are from the following reference:
+
+       Abramowitz, Milton and Stegun, Irene A.
+       Handbook of Mathematical Functions.
+       New York:  Dover publications, Inc. (1970).
+
+       All constants taken from this text are
+       given to 25 significant digits.
+
+       nlegq = order of legendre quadrature
+       ihalfq = int ((nlegq + 1) / 2)
+       eps = max. allowable value of integral
+       eps1 & eps2 = values below which there is
+                     no contribution to integral.
+
+       d.f. <= dhaf:   integral is divided into ulen1 length intervals.  else
+       d.f. <= dquar:  integral is divided into ulen2 length intervals.  else
+       d.f. <= deigh:  integral is divided into ulen3 length intervals.  else
+       d.f. <= dlarg:  integral is divided into ulen4 length intervals.
+
+       d.f. > dlarg:   the range is used to calculate integral.
+
+       M_LN2 = log(2)
+
+       xlegq = legendre 16-point nodes
+       alegq = legendre 16-point coefficients
+
+       The coefficients and nodes for the legendre quadrature used in
+       qprob and wprob were calculated using the algorithms found in:
+
+       Stroud, A. H. and Secrest, D.
+       Gaussian Quadrature Formulas.
+       Englewood Cliffs,
+       New Jersey:  Prentice-Hall, Inc, 1966.
+
+       All values matched the tables (provided in same reference)
+       to 30 significant digits.
+
+       f(x) = .5 + erf(x / sqrt(2)) / 2      for x > 0
+
+       f(x) = erfc( -x / sqrt(2)) / 2        for x < 0
+
+       where f(x) is standard normal c. d. f.
+
+       if degrees of freedom large, approximate integral
+       with range distribution.
+ */
+#define nlegq  16
+#define ihalfq 8
+
+/*  const double eps = 1.0; not used if = 1 */
+  static const double eps1 = -30.0;
+  static const double eps2 = 1.0e-14;
+  static const double dhaf = 100.0;
+  static const double dquar = 800.0;
+  static const double deigh = 5000.0;
+  static const double dlarg = 25000.0;
+  static const double ulen1 = 1.0;
+  static const double ulen2 = 0.5;
+  static const double ulen3 = 0.25;
+  static const double ulen4 = 0.125;
+  static const double xlegq[ihalfq] = {
+    0.989400934991649932596154173450,
+    0.944575023073232576077988415535,
+    0.865631202387831743880467897712,
+    0.755404408355003033895101194847,
+    0.617876244402643748446671764049,
+    0.458016777657227386342419442984,
+    0.281603550779258913230460501460,
+    0.950125098376374401853193354250e-1
+  };
+  static const double alegq[ihalfq] = {
+    0.271524594117540948517805724560e-1,
+    0.622535239386478928628438369944e-1,
+    0.951585116824927848099251076022e-1,
+    0.124628971255533872052476282192,
+    0.149595988816576732081501730547,
+    0.169156519395002538189312079030,
+    0.182603415044923588866763667969,
+    0.189450610455068496285396723208
+  };
+  double ans, f2, f21, f2lf, ff4, otsum, qsqz, rotsum, t1, twa1, ulen, wprb;
+  int i, j, jj;
+
+#ifdef IEEE_754
+  abort (! (ISNAN (q) || ISNAN (rr) || ISNAN (cc) || ISNAN (df)));
+#endif
+
+  if (q <= 0)
+    return R_DT_0;
+
+  /* df must be > 1 */
+  /* there must be at least two values */
+  assert (! (df < 2 || rr < 1 || cc < 2));
+
+  if (!isfinite (q))
+    return R_DT_1;
+
+  if (df > dlarg)
+    return R_DT_val (wprob (q, rr, cc));
+
+  /* calculate leading constant */
+
+  f2 = df * 0.5;
+  /* lgammafn(u) = log(gamma(u)) */
+  f2lf = ((f2 * log (df)) - (df * M_LN2)) - gsl_sf_lngamma (f2);
+  f21 = f2 - 1.0;
+
+  /* integral is divided into unit, half-unit, quarter-unit, or */
+  /* eighth-unit length intervals depending on the value of the */
+  /* degrees of freedom. */
+
+  ff4 = df * 0.25;
+  if (df <= dhaf)
+    ulen = ulen1;
+  else if (df <= dquar)
+    ulen = ulen2;
+  else if (df <= deigh)
+    ulen = ulen3;
+  else
+    ulen = ulen4;
+
+  f2lf += log (ulen);
+
+  /* integrate over each subinterval */
+
+  ans = 0.0;
+
+  for (i = 1; i <= 50; i++)
+    {
+      otsum = 0.0;
+
+      /* legendre quadrature with order = nlegq */
+      /* nodes (stored in xlegq) are symmetric around zero. */
+
+      twa1 = (2 * i - 1) * ulen;
+
+      for (jj = 1; jj <= nlegq; jj++)
+       {
+         if (ihalfq < jj)
+           {
+             j = jj - ihalfq - 1;
+             t1 = (f2lf + (f21 * log (twa1 + (xlegq[j] * ulen))))
+               - (((xlegq[j] * ulen) + twa1) * ff4);
+           }
+         else
+           {
+             j = jj - 1;
+             t1 = (f2lf + (f21 * log (twa1 - (xlegq[j] * ulen))))
+               + (((xlegq[j] * ulen) - twa1) * ff4);
+
+           }
+
+         /* if exp(t1) < 9e-14, then doesn't contribute to integral */
+         if (t1 >= eps1)
+           {
+             if (ihalfq < jj)
+               {
+                 qsqz = q * sqrt (((xlegq[j] * ulen) + twa1) * 0.5);
+               }
+             else
+               {
+                 qsqz = q * sqrt (((-(xlegq[j] * ulen)) + twa1) * 0.5);
+               }
+
+             /* call wprob to find integral of range portion */
+
+             wprb = wprob (qsqz, rr, cc);
+             rotsum = (wprb * alegq[j]) * exp (t1);
+             otsum += rotsum;
+           }
+         /* end legendre integral for interval i */
+         /* L200: */
+       }
+
+      /* if integral for interval i < 1e-14, then stop.
+       * However, in order to avoid small area under left tail,
+       * at least  1 / ulen  intervals are calculated.
+       */
+      if (i * ulen >= 1.0 && otsum <= eps2)
+       break;
+
+      /* end of interval i */
+      /* L330: */
+
+      ans += otsum;
+    }
+
+  assert (otsum <= eps2); /* not converged */
+
+  if (ans > 1.)
+    ans = 1.;
+  return R_DT_val (ans);
+}
+
+
+
diff --git a/lib/tukey/qtukey.c b/lib/tukey/qtukey.c

new file mode 100644 (file)

index 0000000..08253df
--- /dev/null
+++ b/lib/tukey/qtukey.c
@@ -0,0 +1,253 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* This file is taken from the R project source code, and modified.
+   The original copyright notice is reproduced below: */
+
+/*
+ *  Mathlib : A C Library of Special Functions
+ *  Copyright (C) 1998              Ross Ihaka
+ *  Copyright (C) 2000--2005 The R Development Core Team
+ *  based in part on AS70 (C) 1974 Royal Statistical Society
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, a copy is available at
+ *  http://www.r-project.org/Licenses/
+ *
+ *  SYNOPSIS
+ *
+ *     #include <Rmath.h>
+ *     double qtukey(p, rr, cc, df, lower_tail, log_p);
+ *
+ *  DESCRIPTION
+ *
+ *     Computes the quantiles of the maximum of rr studentized
+ *     ranges, each based on cc means and with df degrees of freedom
+ *     for the standard error, is less than q.
+ *
+ *     The algorithm is based on that of the reference.
+ *
+ *  REFERENCE
+ *
+ *     Copenhaver, Margaret Diponzio & Holland, Burt S.
+ *     Multiple comparisons of simple effects in
+ *     the two-way analysis of variance with fixed effects.
+ *     Journal of Statistical Computation and Simulation,
+ *     Vol.30, pp.1-15, 1988.
+ */
+
+#include <config.h>
+
+#include "tukey.h"
+
+#include <assert.h>
+#include <math.h>
+
+#define TRUE (1)
+#define FALSE (0)
+
+#define ML_POSINF      (1.0 / 0.0)
+#define ML_NEGINF      (-1.0 / 0.0)
+
+#define R_D_Lval(p)    (lower_tail ? (p) : (0.5 - (p) + 0.5))  /*  p  */
+
+#define R_DT_qIv(p)    (log_p ? (lower_tail ? exp(p) : - expm1(p)) \
+                              : R_D_Lval(p))
+
+
+static double fmax2(double x, double y)
+{
+#ifdef IEEE_754
+       if (ISNAN(x) || ISNAN(y))
+               return x + y;
+#endif
+       return (x < y) ? y : x;
+}
+
+
+#define R_Q_P01_boundaries(p, _LEFT_, _RIGHT_)         \
+    if (log_p) {                                       \
+      assert (p <= 0);                                 \
+       if(p == 0) /* upper bound*/                     \
+           return lower_tail ? _RIGHT_ : _LEFT_;       \
+       if(p == ML_NEGINF)                              \
+           return lower_tail ? _LEFT_ : _RIGHT_;       \
+    }                                                  \
+    else { /* !log_p */                                        \
+      assert (p >= 0 && p <= 1);                       \
+       if(p == 0)                                      \
+           return lower_tail ? _LEFT_ : _RIGHT_;       \
+       if(p == 1)                                      \
+           return lower_tail ? _RIGHT_ : _LEFT_;       \
+    }
+
+
+/* qinv() :
+ *     this function finds percentage point of the studentized range
+ *     which is used as initial estimate for the secant method.
+ *     function is adapted from portion of algorithm as 70
+ *     from applied statistics (1974) ,vol. 23, no. 1
+ *     by odeh, r. e. and evans, j. o.
+ *
+ *       p = percentage point
+ *       c = no. of columns or treatments
+ *       v = degrees of freedom
+ *       qinv = returned initial estimate
+ *
+ *     vmax is cutoff above which degrees of freedom
+ *     is treated as infinity.
+ */
+
+static double qinv(double p, double c, double v)
+{
+    static const double p0 = 0.322232421088;
+    static const double q0 = 0.993484626060e-01;
+    static const double p1 = -1.0;
+    static const double q1 = 0.588581570495;
+    static const double p2 = -0.342242088547;
+    static const double q2 = 0.531103462366;
+    static const double p3 = -0.204231210125;
+    static const double q3 = 0.103537752850;
+    static const double p4 = -0.453642210148e-04;
+    static const double q4 = 0.38560700634e-02;
+    static const double c1 = 0.8832;
+    static const double c2 = 0.2368;
+    static const double c3 = 1.214;
+    static const double c4 = 1.208;
+    static const double c5 = 1.4142;
+    static const double vmax = 120.0;
+
+    double ps, q, t, yi;
+
+    ps = 0.5 - 0.5 * p;
+    yi = sqrt (log (1.0 / (ps * ps)));
+    t = yi + (((( yi * p4 + p3) * yi + p2) * yi + p1) * yi + p0)
+          / (((( yi * q4 + q3) * yi + q2) * yi + q1) * yi + q0);
+    if (v < vmax) t += (t * t * t + t) / v / 4.0;
+    q = c1 - c2 * t;
+    if (v < vmax) q += -c3 / v + c4 * t / v;
+    return t * (q * log (c - 1.0) + c5);
+}
+
+/*
+ *  Copenhaver, Margaret Diponzio & Holland, Burt S.
+ *  Multiple comparisons of simple effects in
+ *  the two-way analysis of variance with fixed effects.
+ *  Journal of Statistical Computation and Simulation,
+ *  Vol.30, pp.1-15, 1988.
+ *
+ *  Uses the secant method to find critical values.
+ *
+ *  p = confidence level (1 - alpha)
+ *  rr = no. of rows or groups
+ *  cc = no. of columns or treatments
+ *  df = degrees of freedom of error term
+ *
+ *  ir(1) = error flag = 1 if wprob probability > 1
+ *  ir(2) = error flag = 1 if ptukey probability > 1
+ *  ir(3) = error flag = 1 if convergence not reached in 50 iterations
+ *                    = 2 if df < 2
+ *
+ *  qtukey = returned critical value
+ *
+ *  If the difference between successive iterates is less than eps,
+ *  the search is terminated
+ */
+
+
+double qtukey(double p, double rr, double cc, double df,
+             int lower_tail, int log_p)
+{
+    static const double eps = 0.0001;
+    const int maxiter = 50;
+
+    double ans = 0.0, valx0, valx1, x0, x1, xabs;
+    int iter;
+
+#ifdef IEEE_754
+    if (ISNAN(p) || ISNAN(rr) || ISNAN(cc) || ISNAN(df)) {
+       ML_ERROR(ME_DOMAIN, "qtukey");
+       return p + rr + cc + df;
+    }
+#endif
+
+    /* df must be > 1 ; there must be at least two values */
+    assert (! (df < 2 || rr < 1 || cc < 2) );
+
+    R_Q_P01_boundaries (p, 0, ML_POSINF);
+
+    p = R_DT_qIv(p); /* lower_tail,non-log "p" */
+
+    /* Initial value */
+
+    x0 = qinv(p, cc, df);
+
+    /* Find prob(value < x0) */
+
+    valx0 = ptukey(x0, rr, cc, df, /*LOWER*/TRUE, /*LOG_P*/FALSE) - p;
+
+    /* Find the second iterate and prob(value < x1). */
+    /* If the first iterate has probability value */
+    /* exceeding p then second iterate is 1 less than */
+    /* first iterate; otherwise it is 1 greater. */
+
+    if (valx0 > 0.0)
+       x1 = fmax2(0.0, x0 - 1.0);
+    else
+       x1 = x0 + 1.0;
+    valx1 = ptukey(x1, rr, cc, df, /*LOWER*/TRUE, /*LOG_P*/FALSE) - p;
+
+    /* Find new iterate */
+
+    for(iter=1 ; iter < maxiter ; iter++) {
+       ans = x1 - ((valx1 * (x1 - x0)) / (valx1 - valx0));
+       valx0 = valx1;
+
+       /* New iterate must be >= 0 */
+
+       x0 = x1;
+       if (ans < 0.0) {
+           ans = 0.0;
+           valx1 = -p;
+       }
+       /* Find prob(value < new iterate) */
+
+       valx1 = ptukey(ans, rr, cc, df, /*LOWER*/TRUE, /*LOG_P*/FALSE) - p;
+       x1 = ans;
+
+       /* If the difference between two successive */
+       /* iterates is less than eps, stop */
+
+       xabs = fabs(x1 - x0);
+       if (xabs < eps)
+           return ans;
+    }
+
+    /* The process did not converge in 'maxiter' iterations */
+    assert (0);
+    return ans;
+}
diff --git a/lib/tukey/tukey.h b/lib/tukey/tukey.h

new file mode 100644 (file)

index 0000000..69050e8
--- /dev/null
+++ b/lib/tukey/tukey.h
@@ -0,0 +1,25 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef TUKEY_H
+#define TUKEY_H 1
+
+double qtukey(double p, double rr, double cc, double df,  int lower_tail, int log_p);
+
+double ptukey (double q, double rr, double cc, double df, int lower_tail, int log_p);
+
+#endif
diff --git a/src/language/stats/oneway.c b/src/language/stats/oneway.c

index 8e2805b3b1d54ca618b91d2b23889d10a84cdd75..bdac117c7ce860b88b73fc1af24a7989eb1fa3bd 100644 (file)
--- a/src/language/stats/oneway.c
+++ b/src/language/stats/oneway.c
@@ -37,6 +37,7 @@
  #include "libpspp/misc.h"
  #include "libpspp/taint.h"
  #include "linreg/sweep.h"
+#include "tukey/tukey.h"
  #include "math/categoricals.h"
  #include "math/covariance.h"
  #include "math/levene.h"
@@ -45,7 +46,35 @@
  
  #include "gettext.h"
  #define _(msgid) gettext (msgid)
+#define N_(msgid) msgid
  
+/* Workspace variable for each dependent variable */
+struct per_var_ws
+{
+  struct categoricals *cat;
+  struct covariance *cov;
+  struct levene *nl;
+
+  double n;
+
+  double sst;
+  double sse;
+  double ssa;
+
+  int n_groups;
+
+  double mse;
+};
+
+/* Per category data */
+struct descriptive_data
+{
+  const struct variable *var;
+  struct moments1 *mom;
+
+  double minimum;
+  double maximum;
+};
  
  enum missing_type
    {
@@ -70,8 +99,28 @@ struct contrasts_node
  {
    struct ll ll; 
    struct ll_list coefficient_list;
+};
  
-  bool bad_count; /* True if the number of coefficients does not equal the number of groups */
+
+struct oneway_spec;
+
+typedef double df_func (const struct per_var_ws *pvw, const struct moments1 *mom_i, const struct moments1 *mom_j);
+typedef double ts_func (int k, const struct moments1 *mom_i, const struct moments1 *mom_j, double std_err);
+typedef double p1tail_func (double ts, double df1, double df2);
+
+typedef double pinv_func (double std_err, double alpha, double df, int k, const struct moments1 *mom_i, const struct moments1 *mom_j);
+
+
+struct posthoc
+{
+  const char *syntax;
+  const char *label;
+
+  df_func *dff;
+  ts_func *tsf;
+  p1tail_func *p1f;
+
+  pinv_func *pinv;
  };
  
  struct oneway_spec
@@ -91,33 +140,206 @@ struct oneway_spec
  
    /* The weight variable */
    const struct variable *wv;
+
+  /* The confidence level for multiple comparisons */
+  double alpha;
+
+  int *posthoc;
+  int n_posthoc;
  };
  
-/* Per category data */
-struct descriptive_data
+static double
+df_common (const struct per_var_ws *pvw, const struct moments1 *mom_i UNUSED, const struct moments1 *mom_j UNUSED)
  {
-  const struct variable *var;
-  struct moments1 *mom;
+  return  pvw->n - pvw->n_groups;
+}
  
-  double minimum;
-  double maximum;
-};
+static double
+df_individual (const struct per_var_ws *pvw UNUSED, const struct moments1 *mom_i, const struct moments1 *mom_j)
+{
+  double n_i, var_i;
+  double n_j, var_j;
+  double nom,denom;
  
-/* Workspace variable for each dependent variable */
-struct per_var_ws
+  moments1_calculate (mom_i, &n_i, NULL, &var_i, 0, 0);  
+  moments1_calculate (mom_j, &n_j, NULL, &var_j, 0, 0);
+
+  nom = pow2 (var_i/n_i + var_j/n_j);
+  denom = pow2 (var_i/n_i) / (n_i - 1) + pow2 (var_j/n_j) / (n_j - 1);
+
+  return nom / denom;
+}
+
+static double lsd_pinv (double std_err, double alpha, double df, int k UNUSED, const struct moments1 *mom_i UNUSED, const struct moments1 *mom_j UNUSED)
  {
-  struct categoricals *cat;
-  struct covariance *cov;
-  struct levene *nl;
+  return std_err * gsl_cdf_tdist_Pinv (1.0 - alpha / 2.0, df);
+}
  
-  double sst;
-  double sse;
-  double ssa;
+static double bonferroni_pinv (double std_err, double alpha, double df, int k, const struct moments1 *mom_i UNUSED, const struct moments1 *mom_j UNUSED)
+{
+  const int m = k * (k - 1) / 2;
+  return std_err * gsl_cdf_tdist_Pinv (1.0 - alpha / (2.0 * m), df);
+}
  
-  int n_groups;
+static double sidak_pinv (double std_err, double alpha, double df, int k, const struct moments1 *mom_i, const struct moments1 *mom_j)
+{
+  const double m = k * (k - 1) / 2;
+  double lp = 1.0 - exp (log (1.0 - alpha) / m ) ;
+  return std_err * gsl_cdf_tdist_Pinv (1.0 - lp / 2.0, df);
+}
+
+static double tukey_pinv (double std_err, double alpha, double df, int k, const struct moments1 *mom_i UNUSED, const struct moments1 *mom_j UNUSED)
+{
+  return std_err / sqrt (2.0)  * qtukey (1 - alpha, 1.0, k, df, 1, 0);
+}
+
+static double scheffe_pinv (double std_err, double alpha, double df, int k, const struct moments1 *mom_i UNUSED, const struct moments1 *mom_j UNUSED)
+{
+  double x = (k - 1) * gsl_cdf_fdist_Pinv (1.0 - alpha, k - 1, df);
+  return std_err * sqrt (x);
+}
+
+static double gh_pinv (double std_err UNUSED, double alpha, double df, int k, const struct moments1 *mom_i, const struct moments1 *mom_j)
+{
+  double n_i, mean_i, var_i;
+  double n_j, mean_j, var_j;
+  double m;
+
+  moments1_calculate (mom_i, &n_i, &mean_i, &var_i, 0, 0);  
+  moments1_calculate (mom_j, &n_j, &mean_j, &var_j, 0, 0);
+
+  m = sqrt ((var_i/n_i + var_j/n_j) / 2.0);
+
+  return m * qtukey (1 - alpha, 1.0, k, df, 1, 0);
+}
+
+
+static double 
+multiple_comparison_sig (double std_err,
+                                      const struct per_var_ws *pvw,
+                                      const struct descriptive_data *dd_i, const struct descriptive_data *dd_j,
+                                      const struct posthoc *ph)
+{
+  int k = pvw->n_groups;
+  double df = ph->dff (pvw, dd_i->mom, dd_j->mom);
+  double ts = ph->tsf (k, dd_i->mom, dd_j->mom, std_err);
+  return  ph->p1f (ts, k - 1, df);
+}
+
+static double 
+mc_half_range (const struct oneway_spec *cmd, const struct per_var_ws *pvw, double std_err, const struct descriptive_data *dd_i, const struct descriptive_data *dd_j, const struct posthoc *ph)
+{
+  int k = pvw->n_groups;
+  double df = ph->dff (pvw, dd_i->mom, dd_j->mom);
+
+  return ph->pinv (std_err, cmd->alpha, df, k, dd_i->mom, dd_j->mom);
+}
+
+static double tukey_1tailsig (double ts, double df1, double df2)
+{
+  double twotailedsig = 1.0 - ptukey (ts, 1.0, df1 + 1, df2, 1, 0);
+
+  return twotailedsig / 2.0;
+}
+
+static double lsd_1tailsig (double ts, double df1 UNUSED, double df2)
+{
+  return ts < 0 ? gsl_cdf_tdist_P (ts, df2) : gsl_cdf_tdist_Q (ts, df2);
+}
+
+static double sidak_1tailsig (double ts, double df1, double df2)
+{
+  double ex = (df1 + 1.0) * df1 / 2.0;
+  double lsd_sig = 2 * lsd_1tailsig (ts, df1, df2);
+
+  return 0.5 * (1.0 - pow (1.0 - lsd_sig, ex));
+}
+
+static double bonferroni_1tailsig (double ts, double df1, double df2)
+{
+  const int m = (df1 + 1) * df1 / 2;
+
+  double p = ts < 0 ? gsl_cdf_tdist_P (ts, df2) : gsl_cdf_tdist_Q (ts, df2);
+  p *= m;
+
+  return p > 0.5 ? 0.5 : p;
+}
+
+static double scheffe_1tailsig (double ts, double df1, double df2)
+{
+  return 0.5 * gsl_cdf_fdist_Q (ts, df1, df2);
+}
+
+
+static double tukey_test_stat (int k, const struct moments1 *mom_i, const struct moments1 *mom_j, double std_err)
+{
+  double n_i, mean_i, var_i;
+  double n_j, mean_j, var_j;
+
+  moments1_calculate (mom_i, &n_i, &mean_i, &var_i, 0, 0);  
+  moments1_calculate (mom_j, &n_j, &mean_j, &var_j, 0, 0);
+
+  double ts =  (mean_i - mean_j) / std_err;
+  ts = fabs (ts) * sqrt (2.0);
+
+  return ts;
+}
+
+static double lsd_test_stat (int k, const struct moments1 *mom_i, const struct moments1 *mom_j, double std_err)
+{
+  double n_i, mean_i, var_i;
+  double n_j, mean_j, var_j;
+
+  moments1_calculate (mom_i, &n_i, &mean_i, &var_i, 0, 0);  
+  moments1_calculate (mom_j, &n_j, &mean_j, &var_j, 0, 0);
+
+  return (mean_i - mean_j) / std_err;
+}
+
+static double scheffe_test_stat (int k, const struct moments1 *mom_i, const struct moments1 *mom_j, double std_err)
+{
+  double n_i, mean_i, var_i;
+  double n_j, mean_j, var_j;
+
+  moments1_calculate (mom_i, &n_i, &mean_i, &var_i, 0, 0);  
+  moments1_calculate (mom_j, &n_j, &mean_j, &var_j, 0, 0);
+
+  double t = (mean_i - mean_j) / std_err;
+  t = pow2 (t);
+  t /= k - 1;
+
+  return t;
+}
+
+static double gh_test_stat (int k, const struct moments1 *mom_i, const struct moments1 *mom_j, double std_err)
+{
+  double n_i, mean_i, var_i;
+  double n_j, mean_j, var_j;
+
+  moments1_calculate (mom_i, &n_i, &mean_i, &var_i, 0, 0);  
+  moments1_calculate (mom_j, &n_j, &mean_j, &var_j, 0, 0);
+
+  double thing = var_i / n_i + var_j / n_j;
+  thing /= 2.0;
+  thing = sqrt (thing);
+
+  double ts = (mean_i - mean_j) / thing;
+
+  return fabs (ts);
+}
+
+
+
+static const struct posthoc ph_tests [] = 
+  {
+    { "LSD",        N_("LSD"),          df_common, lsd_test_stat,     lsd_1tailsig,          lsd_pinv},
+    { "TUKEY",      N_("Tukey HSD"),    df_common, tukey_test_stat,   tukey_1tailsig,        tukey_pinv},
+    { "BONFERRONI", N_("Bonferroni"),   df_common, lsd_test_stat,     bonferroni_1tailsig,   bonferroni_pinv},
+    { "SCHEFFE",    N_("Scheffé"),      df_common, scheffe_test_stat, scheffe_1tailsig,      scheffe_pinv},
+    { "GH",         N_("Games-Howell"), df_individual, gh_test_stat,  tukey_1tailsig,        gh_pinv},
+    { "SIDAK",      N_("Šidák"),        df_common, lsd_test_stat,     sidak_1tailsig,        sidak_pinv}
+  };
  
-  double mse;
-};
  
  struct oneway_workspace
  {
@@ -151,6 +373,9 @@ cmd_oneway (struct lexer *lexer, struct dataset *ds)
    oneway.missing_type = MISS_ANALYSIS;
    oneway.exclude = MV_ANY;
    oneway.wv = dict_get_weight (dict);
+  oneway.alpha = 0.05;
+  oneway.posthoc = NULL;
+  oneway.n_posthoc = 0;
  
    ll_init (&oneway.contrast_list);
  
@@ -197,6 +422,45 @@ cmd_oneway (struct lexer *lexer, struct dataset *ds)
                 }
             }
         }
+      else if (lex_match_id (lexer, "POSTHOC"))
+       {
+          lex_match (lexer, T_EQUALS);
+          while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
+           {
+             int p;
+             bool method = false;
+             for (p = 0 ; p < sizeof (ph_tests) / sizeof (struct posthoc); ++p)
+               {
+                 if (lex_match_id (lexer, ph_tests[p].syntax))
+                   {
+                     oneway.n_posthoc++;
+                     oneway.posthoc = xrealloc (oneway.posthoc, sizeof (*oneway.posthoc) * oneway.n_posthoc);
+                     oneway.posthoc[oneway.n_posthoc - 1] = p;
+                     method = true;
+                     break;
+                   }
+               }
+             if ( method == false)
+               {
+                 if (lex_match_id (lexer, "ALPHA"))
+                   {
+                     if ( !lex_force_match (lexer, T_LPAREN))
+                       goto error;
+                     lex_force_num (lexer);
+                     oneway.alpha = lex_number (lexer);
+                     lex_get (lexer);
+                     if ( !lex_force_match (lexer, T_RPAREN))
+                       goto error;
+                   }
+                 else
+                   {
+                     msg (SE, _("The post hoc analysis method %s is not supported."), lex_tokcstr (lexer));
+                     lex_error (lexer, NULL);
+                     goto error;
+                   }
+               }
+           }
+       }
        else if (lex_match_id (lexer, "CONTRAST"))
         {
           struct contrasts_node *cl = xzalloc (sizeof *cl);
@@ -490,8 +754,7 @@ run_oneway (const struct oneway_spec *cmd,
        gsl_matrix *cm = covariance_calculate_unnormalized (pvw->cov);
        const struct categoricals *cats = covariance_get_categoricals (pvw->cov);
  
-      double n;
-      moments1_calculate (ws.dd_total[v]->mom, &n, NULL, NULL, NULL, NULL);
+      moments1_calculate (ws.dd_total[v]->mom, &pvw->n, NULL, NULL, NULL, NULL);
  
        pvw->sst = gsl_matrix_get (cm, 0, 0);
  
@@ -503,7 +766,7 @@ run_oneway (const struct oneway_spec *cmd,
  
        pvw->n_groups = categoricals_total (cats);
  
-      pvw->mse = (pvw->sst - pvw->ssa) / (n - pvw->n_groups);
+      pvw->mse = (pvw->sst - pvw->ssa) / (pvw->n - pvw->n_groups);
  
        gsl_matrix_free (cm);
      }
@@ -538,6 +801,7 @@ run_oneway (const struct oneway_spec *cmd,
  
  static void show_contrast_coeffs (const struct oneway_spec *cmd, const struct oneway_workspace *ws);
  static void show_contrast_tests (const struct oneway_spec *cmd, const struct oneway_workspace *ws);
+static void show_comparisons (const struct oneway_spec *cmd, const struct oneway_workspace *ws, int depvar);
  
  static void
  output_oneway (const struct oneway_spec *cmd, struct oneway_workspace *ws)
@@ -546,7 +810,8 @@ output_oneway (const struct oneway_spec *cmd, struct oneway_workspace *ws)
  
    /* Check the sanity of the given contrast values */
    struct contrasts_node *coeff_list  = NULL;
-  ll_for_each (coeff_list, struct contrasts_node, ll, &cmd->contrast_list)
+  struct contrasts_node *coeff_next  = NULL;
+  ll_for_each_safe (coeff_list, coeff_next, struct contrasts_node, ll, &cmd->contrast_list)
      {
        struct coeff_node *cn = NULL;
        double sum = 0;
@@ -556,8 +821,10 @@ output_oneway (const struct oneway_spec *cmd, struct oneway_workspace *ws)
        if (ll_count (cl) != ws->actual_number_of_groups)
         {
           msg (SW,
-              _("Number of contrast coefficients must equal the number of groups"));
-         coeff_list->bad_count = true;
+              _("In contrast list %zu, the number of coefficients (%d) does not equal the number of groups (%d). This contrast list will be ignored."),
+              i, ll_count (cl), ws->actual_number_of_groups);
+
+         ll_remove (&coeff_list->ll);
           continue;
         }
  
@@ -581,6 +848,13 @@ output_oneway (const struct oneway_spec *cmd, struct oneway_workspace *ws)
        show_contrast_coeffs (cmd, ws);
        show_contrast_tests (cmd, ws);
      }
+
+  if ( cmd->posthoc )
+    {
+      int v;
+      for (v = 0 ; v < cmd->n_vars; ++v)
+       show_comparisons (cmd, ws, v);
+    }
  }
  
  
@@ -954,6 +1228,7 @@ show_contrast_coeffs (const struct oneway_spec *cmd, const struct oneway_workspa
         {
           const struct categoricals *cats = covariance_get_categoricals (cov);
           const union value *val = categoricals_get_value_by_category (cats, count);
+         struct coeff_node *coeffn = ll_data (coeffi, struct coeff_node, ll);
           struct string vstr;
  
           ds_init_empty (&vstr);
@@ -964,14 +1239,7 @@ show_contrast_coeffs (const struct oneway_spec *cmd, const struct oneway_workspa
  
           ds_destroy (&vstr);
  
-         if (cn->bad_count)
-           tab_text (t, count + 2, c_num + 2, TAB_RIGHT, "?" );
-         else
-           {
-             struct coeff_node *coeffn = ll_data (coeffi, struct coeff_node, ll);
-
-             tab_text_format (t, count + 2, c_num + 2, TAB_RIGHT, "%g", coeffn->coeff);
-           }
+         tab_text_format (t, count + 2, c_num + 2, TAB_RIGHT, "%g", coeffn->coeff);
         }
        ++c_num;
      }
@@ -1082,9 +1350,6 @@ show_contrast_tests (const struct oneway_spec *cmd, const struct oneway_workspac
                             (v * lines_per_variable) + i + 1 + n_contrasts,
                             TAB_CENTER | TAT_TITLE, "%d", i + 1);
  
-         if (cn->bad_count)
-           continue;
-
           for (coeffi = ll_head (&cn->coefficient_list);
                coeffi != ll_null (&cn->coefficient_list);
                ++ci, coeffi = ll_next (coeffi))
@@ -1182,3 +1447,124 @@ show_contrast_tests (const struct oneway_spec *cmd, const struct oneway_workspac
  }
  
  
+
+static void
+show_comparisons (const struct oneway_spec *cmd, const struct oneway_workspace *ws, int v)
+{
+  const int n_cols = 8;
+  const int heading_rows = 2;
+  const int heading_cols = 3;
+
+  int p;
+  int r = heading_rows ;
+
+  const struct per_var_ws *pvw = &ws->vws[v];
+  const struct categoricals *cat = pvw->cat;
+  const int n_rows = heading_rows + cmd->n_posthoc * pvw->n_groups * (pvw->n_groups - 1);
+
+  struct tab_table *t = tab_create (n_cols, n_rows);
+
+  tab_headers (t, heading_cols, 0, heading_rows, 0);
+
+  /* Put a frame around the entire box, and vertical lines inside */
+  tab_box (t,
+          TAL_2, TAL_2,
+          -1, -1,
+          0, 0,
+          n_cols - 1, n_rows - 1);
+
+  tab_box (t,
+          -1, -1,
+          -1, TAL_1,
+          heading_cols, 0,
+          n_cols - 1, n_rows - 1);
+
+  tab_vline (t, TAL_2, heading_cols, 0, n_rows - 1);
+
+  tab_title (t, _("Multiple Comparisons"));
+
+  tab_text_format (t,  1, 1, TAB_LEFT | TAT_TITLE, _("(I) %s"), var_to_string (cmd->indep_var));
+  tab_text_format (t,  2, 1, TAB_LEFT | TAT_TITLE, _("(J) %s"), var_to_string (cmd->indep_var));
+  tab_text (t,  3, 0, TAB_CENTER | TAT_TITLE, _("Mean Difference"));
+  tab_text (t,  3, 1, TAB_CENTER | TAT_TITLE, _("(I - J)"));
+  tab_text (t,  4, 1, TAB_CENTER | TAT_TITLE, _("Std. Error"));
+  tab_text (t,  5, 1, TAB_CENTER | TAT_TITLE, _("Sig."));
+
+  tab_joint_text_format (t, 6, 0, 7, 0, TAB_CENTER | TAT_TITLE,
+                         _("%g%% Confidence Interval"),
+                         (1 - cmd->alpha) * 100.0);
+
+  tab_text (t,  6, 1, TAB_CENTER | TAT_TITLE, _("Lower Bound"));
+  tab_text (t,  7, 1, TAB_CENTER | TAT_TITLE, _("Upper Bound"));
+
+
+  for (p = 0; p < cmd->n_posthoc; ++p)
+    {
+      int i;
+      const struct posthoc *ph = &ph_tests[cmd->posthoc[p]];
+
+      tab_hline (t, TAL_2, 0, n_cols - 1, r);
+
+      tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, gettext (ph->label));
+
+      for (i = 0; i < pvw->n_groups ; ++i)
+       {
+         double weight_i, mean_i, var_i;
+         int rx = 0;
+         struct string vstr;
+         int j;
+         struct descriptive_data *dd_i = categoricals_get_user_data_by_category (cat, i);
+         const union value *gval = categoricals_get_value_by_category (cat, i);
+
+         ds_init_empty (&vstr);
+         var_append_value_name (cmd->indep_var, gval, &vstr);
+
+         if ( i != 0)
+           tab_hline (t, TAL_1, 1, n_cols - 1, r);
+         tab_text (t, 1, r, TAB_LEFT | TAT_TITLE, ds_cstr (&vstr));
+
+         moments1_calculate (dd_i->mom, &weight_i, &mean_i, &var_i, 0, 0);
+
+         for (j = 0 ; j < pvw->n_groups; ++j)
+           {
+             double weight_j, mean_j, var_j;
+             double half_range;
+             struct descriptive_data *dd_j = categoricals_get_user_data_by_category (cat, j);
+             if (j == i)
+               continue;
+
+             ds_clear (&vstr);
+             gval = categoricals_get_value_by_category (cat, j);
+             var_append_value_name (cmd->indep_var, gval, &vstr);
+             tab_text (t, 2, r + rx, TAB_LEFT | TAT_TITLE, ds_cstr (&vstr));
+
+             moments1_calculate (dd_j->mom, &weight_j, &mean_j, &var_j, 0, 0);
+
+             tab_double  (t, 3, r + rx, 0, mean_i - mean_j, 0);
+
+             double std_err = pvw->mse;
+             std_err *= weight_i + weight_j;
+             std_err /= weight_i * weight_j;
+             std_err = sqrt (std_err);
+
+             tab_double  (t, 4, r + rx, 0, std_err, 0);
+         
+             tab_double (t, 5, r + rx, 0, 2 * multiple_comparison_sig (std_err, pvw, dd_i, dd_j, ph), 0);
+
+             half_range = mc_half_range (cmd, pvw, std_err, dd_i, dd_j, ph);
+
+             tab_double (t, 6, r + rx, 0,
+                          (mean_i - mean_j) - half_range, 0 );
+
+             tab_double (t, 7, r + rx, 0,
+                          (mean_i - mean_j) + half_range, 0 );
+
+             rx++;
+           }
+         ds_destroy (&vstr);
+         r += pvw->n_groups - 1;
+       }
+    }
+
+  tab_submit (t);
+}
diff --git a/src/math/automake.mk b/src/math/automake.mk

index b29d9baad19290f23e3a84c0552c190b3d199f08..cdcc2c8cb79355d6434a32a6019f21aa6edc3851 100644 (file)
--- a/src/math/automake.mk
+++ b/src/math/automake.mk
@@ -4,7 +4,8 @@
  noinst_LTLIBRARIES += src/math/libpspp-math.la
  
  src_math_libpspp_math_la_LIBADD = \
-       lib/linreg/liblinreg.la
+       lib/linreg/liblinreg.la \
+       lib/tukey/libtukey.la
  
  src_math_libpspp_math_la_SOURCES = \
         src/math/chart-geometry.c \
diff --git a/tests/language/stats/oneway.at b/tests/language/stats/oneway.at

index a0a3bafb745abeff97c97e06f8213a0bb328543c..ce121beb977dac2bda33886450fd4689d24e09e9 100644 (file)
--- a/tests/language/stats/oneway.at
+++ b/tests/language/stats/oneway.at
@@ -546,8 +546,358 @@ ONEWAY
  
  AT_CHECK([pspp -o pspp-weighted.csv oneway-weighted.sps], [0], [ignore], [ignore])
  
-
  AT_CHECK([diff pspp-weighted.csv pspp-unweighted.csv], [0])
  
+AT_CLEANUP
+
+
+
+AT_SETUP([ONEWAY posthoc LSD and BONFERRONI])
+AT_DATA([oneway-pig.sps],[dnl
+SET FORMAT F12.3.
+data list notable list /pigmentation * family *.
+begin data.
+36 1
+39 1
+43 1
+38 1
+37 1
+46 2
+47 2
+47 2
+47 2
+43 2
+40 3
+50 3
+44 3
+48 3
+50 3
+45 4
+53 4
+56 4
+52 4
+56 4
+end data.
+
+
+oneway pigmentation by family
+       /statistics = descriptives
+       /posthoc = lsd bonferroni alpha (0.05)
+        .
+])
+
+AT_CHECK([pspp -O format=csv oneway-pig.sps], [0], 
+[Table: Descriptives
+,,,,,,95% Confidence Interval for Mean,,,
+,,N,Mean,Std. Deviation,Std. Error,Lower Bound,Upper Bound,Minimum,Maximum
+pigmentation,1.000,5,38.600,2.702,1.208,35.245,41.955,36.000,43.000
+,2.000,5,46.000,1.732,.775,43.849,48.151,43.000,47.000
+,3.000,5,46.400,4.336,1.939,41.016,51.784,40.000,50.000
+,4.000,5,52.400,4.506,2.015,46.806,57.994,45.000,56.000
+,Total,20,45.850,5.967,1.334,43.057,48.643,36.000,56.000
+
+Table: ANOVA
+,,Sum of Squares,df,Mean Square,F,Significance
+pigmentation,Between Groups,478.950,3,159.650,12.927,.000
+,Within Groups,197.600,16,12.350,,
+,Total,676.550,19,,,
+
+Table: Multiple Comparisons
+,,,Mean Difference,,,95% Confidence Interval,
+,(I) family,(J) family,(I - J),Std. Error,Sig.,Lower Bound,Upper Bound
+LSD,1.000,2.000,-7.400,2.223,.004,-12.112,-2.688
+,,3.000,-7.800,2.223,.003,-12.512,-3.088
+,,4.000,-13.800,2.223,.000,-18.512,-9.088
+,2.000,1.000,7.400,2.223,.004,2.688,12.112
+,,3.000,-.400,2.223,.859,-5.112,4.312
+,,4.000,-6.400,2.223,.011,-11.112,-1.688
+,3.000,1.000,7.800,2.223,.003,3.088,12.512
+,,2.000,.400,2.223,.859,-4.312,5.112
+,,4.000,-6.000,2.223,.016,-10.712,-1.288
+,4.000,1.000,13.800,2.223,.000,9.088,18.512
+,,2.000,6.400,2.223,.011,1.688,11.112
+,,3.000,6.000,2.223,.016,1.288,10.712
+Bonferroni,1.000,2.000,-7.400,2.223,.025,-14.086,-.714
+,,3.000,-7.800,2.223,.017,-14.486,-1.114
+,,4.000,-13.800,2.223,.000,-20.486,-7.114
+,2.000,1.000,7.400,2.223,.025,.714,14.086
+,,3.000,-.400,2.223,1.000,-7.086,6.286
+,,4.000,-6.400,2.223,.065,-13.086,.286
+,3.000,1.000,7.800,2.223,.017,1.114,14.486
+,,2.000,.400,2.223,1.000,-6.286,7.086
+,,4.000,-6.000,2.223,.095,-12.686,.686
+,4.000,1.000,13.800,2.223,.000,7.114,20.486
+,,2.000,6.400,2.223,.065,-.286,13.086
+,,3.000,6.000,2.223,.095,-.686,12.686
+])
+
+AT_CLEANUP
+
+
+AT_SETUP([ONEWAY posthoc Tukey HSD and Games-Howell])
+AT_DATA([oneway-tukey.sps],[dnl
+set format = f11.3.
+data list notable list /libido * dose *.
+begin data.
+3 0
+2 0
+1 0
+1 0
+4 0
+5 1
+2 1
+4 1
+2 1
+3 1
+7 2
+4 2
+5 2
+3 2
+6 2
+end data.
+
+variable label dose 'Dose of Viagra'.
+
+add value labels dose 0 'Placebo' 1 '1 Dose' 2 '2 Doses'.
+
+oneway libido by dose
+       /posthoc tukey gh.
+])
+
+AT_CHECK([pspp -O format=csv oneway-tukey.sps], [0], 
+[Table: ANOVA
+,,Sum of Squares,df,Mean Square,F,Significance
+libido,Between Groups,20.133,2,10.067,5.119,.025
+,Within Groups,23.600,12,1.967,,
+,Total,43.733,14,,,
+
+Table: Multiple Comparisons
+,,,Mean Difference,,,95% Confidence Interval,
+,(I) Dose of Viagra,(J) Dose of Viagra,(I - J),Std. Error,Sig.,Lower Bound,Upper Bound
+Tukey HSD,Placebo,1 Dose,-1.000,.887,.516,-3.366,1.366
+,,2 Doses,-2.800,.887,.021,-5.166,-.434
+,1 Dose,Placebo,1.000,.887,.516,-1.366,3.366
+,,2 Doses,-1.800,.887,.147,-4.166,.566
+,2 Doses,Placebo,2.800,.887,.021,.434,5.166
+,,1 Dose,1.800,.887,.147,-.566,4.166
+Games-Howell,Placebo,1 Dose,-1.000,.887,.479,-3.356,1.356
+,,2 Doses,-2.800,.887,.039,-5.439,-.161
+,1 Dose,Placebo,1.000,.887,.479,-1.356,3.356
+,,2 Doses,-1.800,.887,.185,-4.439,.839
+,2 Doses,Placebo,2.800,.887,.039,.161,5.439
+,,1 Dose,1.800,.887,.185,-.839,4.439
+])
  
  AT_CLEANUP
+
+AT_SETUP([ONEWAY posthoc Sidak])
+AT_DATA([oneway-sidak.sps],[dnl
+SET FORMAT F20.4.
+
+DATA LIST notable LIST /program score.
+BEGIN DATA.
+1   9
+1  12
+1  14
+1  11
+1  13
+2  10
+2   6
+2   9
+2   9
+2  10
+3  12
+3  14
+3  11
+3  13
+3  11
+4   9
+4   8
+4  11
+4   7
+4   8
+END DATA.
+
+ONEWAY
+  score BY program
+  /MISSING ANALYSIS
+  /POSTHOC = SIDAK.
+])
+
+AT_CHECK([pspp -O format=csv oneway-sidak.sps], [0], 
+[Table: ANOVA
+,,Sum of Squares,df,Mean Square,F,Significance
+score,Between Groups,54.9500,3,18.3167,7.0449,.0031
+,Within Groups,41.6000,16,2.6000,,
+,Total,96.5500,19,,,
+
+Table: Multiple Comparisons
+,,,Mean Difference,,,95% Confidence Interval,
+,(I) program,(J) program,(I - J),Std. Error,Sig.,Lower Bound,Upper Bound
+Šidák,1.0000,2.0000,3.0000,1.0198,.0561,-.0575,6.0575
+,,3.0000,-.4000,1.0198,.9993,-3.4575,2.6575
+,,4.0000,3.2000,1.0198,.0375,.1425,6.2575
+,2.0000,1.0000,-3.0000,1.0198,.0561,-6.0575,.0575
+,,3.0000,-3.4000,1.0198,.0250,-6.4575,-.3425
+,,4.0000,.2000,1.0198,1.0000,-2.8575,3.2575
+,3.0000,1.0000,.4000,1.0198,.9993,-2.6575,3.4575
+,,2.0000,3.4000,1.0198,.0250,.3425,6.4575
+,,4.0000,3.6000,1.0198,.0166,.5425,6.6575
+,4.0000,1.0000,-3.2000,1.0198,.0375,-6.2575,-.1425
+,,2.0000,-.2000,1.0198,1.0000,-3.2575,2.8575
+,,3.0000,-3.6000,1.0198,.0166,-6.6575,-.5425
+])
+
+AT_CLEANUP
+
+AT_SETUP([ONEWAY posthoc Scheffe])
+AT_DATA([oneway-scheffe.sps],[dnl
+set format = f11.3.
+data list notable list /usage * group *.
+begin data.
+21.00     1
+19.00     1
+18.00     1
+25.00     1
+14.00     1
+13.00     1
+24.00     1
+19.00     1
+20.00     1
+21.00     1
+15.00     2
+10.00     2
+13.00     2
+16.00     2
+14.00     2
+24.00     2
+16.00     2
+14.00     2
+18.00     2
+16.00     2
+10.00     3
+ 7.00     3
+13.00     3
+20.00     3
+  .00     3
+ 8.00     3
+ 6.00     3
+ 1.00     3
+12.00     3
+14.00     3
+18.00     4
+15.00     4
+ 3.00     4
+27.00     4
+ 6.00     4
+14.00     4
+13.00     4
+11.00     4
+ 9.00     4
+18.00     4
+end data.
+
+variable label usage 'Days of Use'.
+
+add value labels group 0 'none' 1 'one' 2 'two' 3 'three' 4 'four'.
+
+oneway usage by group
+       /posthoc scheffe.
+])
+
+AT_CHECK([pspp -O format=csv oneway-scheffe.sps], [0], 
+[Table: ANOVA
+,,Sum of Squares,df,Mean Square,F,Significance
+Days of Use,Between Groups,555.275,3,185.092,6.663,.001
+,Within Groups,1000.100,36,27.781,,
+,Total,1555.375,39,,,
+
+Table: Multiple Comparisons
+,,,Mean Difference,,,95% Confidence Interval,
+,(I) group,(J) group,(I - J),Std. Error,Sig.,Lower Bound,Upper Bound
+Scheffé,one,two,3.800,2.357,.467,-3.112,10.712
+,,three,10.300,2.357,.001,3.388,17.212
+,,four,6.000,2.357,.110,-.912,12.912
+,two,one,-3.800,2.357,.467,-10.712,3.112
+,,three,6.500,2.357,.072,-.412,13.412
+,,four,2.200,2.357,.832,-4.712,9.112
+,three,one,-10.300,2.357,.001,-17.212,-3.388
+,,two,-6.500,2.357,.072,-13.412,.412
+,,four,-4.300,2.357,.358,-11.212,2.612
+,four,one,-6.000,2.357,.110,-12.912,.912
+,,two,-2.200,2.357,.832,-9.112,4.712
+,,three,4.300,2.357,.358,-2.612,11.212
+])
+
+AT_CLEANUP
+
+
+AT_SETUP([ONEWAY bad contrast count])
+
+AT_DATA([oneway-bad-contrast.sps],[dnl
+DATA LIST NOTABLE LIST /height * weight * temperature * sex *.
+BEGIN DATA.
+1884     88.6       39.97     0
+1801     90.9       39.03     0
+1801     91.7       38.98     0
+1607     56.3       36.26     1
+1608     46.3       46.26     1
+1607     55.9       37.84     1
+1604     56.6       36.81     1
+1606     56.1       34.56     1
+END DATA.
+
+ONEWAY /VARIABLES= height weight temperature BY sex
+ /CONTRAST = -1  1 
+ /CONTRAST = -3  3 
+ /CONTRAST =  2 -2  1
+ /CONTRAST = -9  9
+ . 
+])
+
+
+AT_CHECK([pspp -O format=csv oneway-bad-contrast.sps], [0], [dnl
+"oneway-bad-contrast.sps:18: warning: ONEWAY: In contrast list 3, the number of coefficients (3) does not equal the number of groups (2). This contrast list will be ignored."
+
+Table: ANOVA
+,,Sum of Squares,df,Mean Square,F,Significance
+height,Between Groups,92629.63,1,92629.63,120.77,.00
+,Within Groups,4601.87,6,766.98,,
+,Total,97231.50,7,,,
+weight,Between Groups,2451.65,1,2451.65,174.59,.00
+,Within Groups,84.25,6,14.04,,
+,Total,2535.90,7,,,
+temperature,Between Groups,1.80,1,1.80,.13,.73
+,Within Groups,84.55,6,14.09,,
+,Total,86.36,7,,,
+
+Table: Contrast Coefficients
+,,sex,
+,,.00,1.00
+Contrast,1,-1,1
+,2,-3,3
+,3,-9,9
+
+Table: Contrast Tests
+,,Contrast,Value of Contrast,Std. Error,t,df,Sig. (2-tailed)
+height,Assume equal variances,1,-222.27,20.23,10.99,6,.00
+,,2,-666.80,60.68,10.99,6,.00
+,,3,-2000.40,182.03,10.99,6,.00
+,Does not assume equal,1,-222.27,27.67,-8.03,2.00,1.98
+,,2,-666.80,83.02,-8.03,2.00,1.98
+,,3,-2000.40,249.07,-8.03,2.00,1.98
+weight,Assume equal variances,1,-36.16,2.74,13.21,6,.00
+,,2,-108.48,8.21,13.21,6,.00
+,,3,-325.44,24.63,13.21,6,.00
+,Does not assume equal,1,-36.16,2.19,-16.48,5.42,2.00
+,,2,-108.48,6.58,-16.48,5.42,2.00
+,,3,-325.44,19.75,-16.48,5.42,2.00
+temperature,Assume equal variances,1,-.98,2.74,.36,6,.73
+,,2,-2.94,8.22,.36,6,.73
+,,3,-8.83,24.67,.36,6,.73
+,Does not assume equal,1,-.98,2.07,-.47,4.19,1.34
+,,2,-2.94,6.22,-.47,4.19,1.34
+,,3,-8.83,18.66,-.47,4.19,1.34
+])
+
+AT_CLEANUP
+\ No newline at end of file
author	John Darrington <john@darrington.wattle.id.au>
	Fri, 1 Jul 2011 18:04:58 +0000 (20:04 +0200)
committer	John Darrington <john@darrington.wattle.id.au>
	Fri, 1 Jul 2011 18:04:58 +0000 (20:04 +0200)
NEWS		patch \| blob \| history
doc/statistics.texi		patch \| blob \| history
lib/automake.mk		patch \| blob \| history
lib/tukey/README	[new file with mode: 0644]	patch \| blob
lib/tukey/automake.mk	[new file with mode: 0644]	patch \| blob
lib/tukey/ptukey.c	[new file with mode: 0644]	patch \| blob
lib/tukey/qtukey.c	[new file with mode: 0644]	patch \| blob
lib/tukey/tukey.h	[new file with mode: 0644]	patch \| blob
src/language/stats/oneway.c		patch \| blob \| history
src/math/automake.mk		patch \| blob \| history
tests/language/stats/oneway.at		patch \| blob \| history