1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by John Williams <johnr.williams@stonebow.otago.ac.nz>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27 #include "dcdflib/cdflib.h"
32 #include "value-labels.h"
39 variables=varlist("PV_NO_SCRATCH | PV_NUMERIC");
41 +missing=miss:!analysis/listwise,
42 incl:include/!exclude;
43 +format=fmt:!labels/nolabels;
44 +criteria=:ci(d:criteria,"%s > 0. && %s < 1.").
49 #include "debug-print.h"
51 /* Command parsing information. */
52 static struct cmd_t_test cmd;
54 /* Variable for the GROUPS subcommand, if given. */
55 static struct variable *groups;
57 /* GROUPS: Number of values specified by the user; the values
59 static int n_groups_values;
60 static union value groups_values[2];
62 /* PAIRED: Number of pairs; each pair. */
64 static struct variable *(*pairs)[2];
66 /* Routines to scan data and perform t-tests */
67 static void precalc (void);
68 static void postcalc (void);
69 static void g_postcalc (void);
70 static void t_pairs (void);
71 static void t_groups (void);
72 static int groups_calc (struct ccase *);
73 static int pairs_calc (struct ccase *);
74 static int z_calc (struct ccase *);
81 struct value_list *next;
84 /* general workhorses - should move these to a separate library... */
85 double variance (double n, double ss, double sum);
87 double covariance (double x_sum, double x_n,
88 double y_sum, double y_n, double ss);
90 double pooled_variance (double n_1, double var_1,
91 double n_2, double var_2);
93 double oneway (double *f, double *p, struct value_list *list);
95 double pearson_r (double c_xy, double c_xx, double c_yy);
97 double f_sig (double f, double dfn, double dfd);
98 double t_crt (double df, double q);
99 double t_sig (double t, double df);
101 /* massive function simply to remove any responsibility for output
102 from the function which does the actual t-test calculations */
103 void print_t_groups (struct variable * grps, union value * g1, union value * g2,
104 double n1, double n2, double mean1, double mean2,
105 double sd1, double sd2, double se1, double se2,
106 double diff, double l_f, double l_p,
107 double p_t, double p_sig, double p_df, double p_sed,
108 double p_l, double p_h,
109 double s_t, double s_sig, double s_df, double s_sed,
110 double s_l, double s_h);
112 /* Global variables to communicate between calc() and postcalc()
113 should move to a structure in the p union of variable... */
114 static double v1_n, v1_ss, v1_sum, v1_se, v1_var, v1_mean;
115 static double v2_n, v2_ss, v2_sum, v2_se, v2_var, v2_mean;
116 static double v1_z_sum, v1_z_ss;
117 static double v2_z_sum, v2_z_ss;
118 static double diff, se_diff, sp, xy_sum, xy_diff, xy_ss;
121 /* some defines for CDFlib */
123 #define FIND_CRITICAL_VALUE 2
127 static void debug_print (void);
130 /* Parses and executes the T-TEST procedure. */
134 struct cmd_t_test cmd;
136 if (!lex_force_match_id ("T"))
139 lex_match_id ("TEST");
141 if (!parse_t_test (&cmd))
149 procedure (precalc, pairs_calc, postcalc);
151 /* probably groups then... */
153 printf ("\n\n t-tests for independent samples of %s %s\n",
154 groups->name, groups->label);
156 for (cur_var = 0; cur_var < cmd.n_variables; cur_var++)
158 v1_n = v1_ss = v1_sum = v1_se = v1_var = v1_mean = 0.0;
159 v2_n = v2_ss = v2_sum = v2_se = v2_var = v2_mean = 0.0;
160 v1_z_sum = v1_z_ss = v2_z_sum = v2_z_ss = 0.0;
161 diff = se_diff = sp = xy_diff = xy_ss = xy_sum = 0.0;
163 procedure (precalc, groups_calc, g_postcalc);
164 procedure (precalc, z_calc, postcalc);
174 return; /* rilly void... */
178 groups_calc (struct ccase * c)
182 struct variable *v = cmd.v_variables[cur_var];
183 double X = c->data[v->fv].f;
185 /* Get the weight for this case. */
186 w = dict_get_case_weight (default_dict, c);
187 if (w <= 0.0 || w == SYSMIS)
191 printf ("Bad weight\n");
194 if (X == SYSMIS || X == 0.0) /* FIXME: should be USER_MISSING? */
196 /* printf("Missing value\n"); */
202 group = c->data[groups->fv].f;
204 if (group == groups_values[0].f)
210 else if (group == groups_values[1].f)
224 v1_mean = v1_sum / v1_n;
225 v2_mean = v2_sum / v2_n;
229 int /* this pass generates the z-zcores */
230 z_calc (struct ccase * c)
233 struct variable *v = cmd.v_variables[cur_var];
234 double X = c->data[v->fv].f;
238 /* Get the weight for this case. */
239 w = dict_get_case_weight (default_dict, c);
241 if (X == SYSMIS || X == 0.0) /* FIXME: how to specify user missing? */
247 group = c->data[groups->fv].f;
250 if (group == groups_values[0].f)
252 z = fabs (X - v1_mean);
254 v1_z_ss += pow (z, 2);
256 else if (group == groups_values[1].f)
258 z = fabs (X - v2_mean);
259 v2_z_ss += pow (z, 2);
269 pairs_calc (struct ccase * c)
272 struct variable *v1, *v2;
275 for (i = 0; i < n_pairs; i++)
280 X = c->data[v1->fv].f;
281 Y = c->data[v2->fv].f;
283 if (X == SYSMIS || Y == SYSMIS)
285 printf ("Missing value\n");
291 xy_ss += pow ((X - Y), 2);
307 /* Calculate basic statistics */
308 v1_var = variance (v1_n, v1_ss, v1_sum); /* variances */
309 v2_var = variance (v2_n, v2_ss, v2_sum);
310 v1_se = sqrt (v1_var / v1_n); /* standard errors */
311 v2_se = sqrt (v2_var / v2_n);
312 diff = v1_mean - v2_mean;
329 double df_pooled, t_pooled, t_sep, p_pooled, p_sep;
330 double crt_t_p, crt_t_s, tmp, v1_z, v2_z, f_levene, p_levene;
331 double df_sep, se_diff_s, se_diff_p;
332 struct value_list *val_1, *val_2;
335 val_1 = malloc (sizeof (struct value_list));
336 val_1->sum = v1_z_sum;
339 val_2 = malloc (sizeof (struct value_list));
340 val_2->sum = v2_z_sum;
347 f_levene = oneway (&f_levene, &p_levene, val_1);
349 /* T test results for pooled variances */
350 se_diff_p = sqrt (pooled_variance (v1_n, v1_var, v2_n, v2_var));
351 df_pooled = v1_n + v2_n - 2.0;
352 t_pooled = diff / se_diff_p;
353 p_pooled = t_sig (t_pooled, df_pooled);
354 crt_t_p = t_crt (df_pooled, 0.025);
356 if ((2.0 * p_pooled) >= 1.0)
357 p_pooled = 1.0 - p_pooled;
359 /* oh god, the separate variance calculations... */
360 t_sep = diff / sqrt ((v1_var / v1_n) + (v2_var / v2_n));
362 tmp = (v1_var / v1_n) + (v2_var / v2_n);
363 tmp = (v1_var / v1_n) / tmp;
365 tmp = tmp / (v1_n - 1.0);
368 tmp = (v1_var / v1_n) + (v2_var / v2_n);
369 tmp = (v2_var / v2_n) / tmp;
371 tmp = tmp / (v2_n - 1.0);
374 tmp = 1.0 / (v1_z + v2_z);
377 p_sep = t_sig (t_sep, df_sep);
378 if ((2.0 * p_sep) >= 1.0)
380 crt_t_s = t_crt (df_sep, 0.025);
381 se_diff_s = sqrt ((v1_var / v1_n) + (v2_var / v2_n));
383 /* FIXME: convert to a proper PSPP output call */
384 print_t_groups (groups, &groups_values[0], &groups_values[1],
385 v1_n, v2_n, v1_mean, v2_mean,
386 sqrt (v1_var), sqrt (v2_var), v1_se, v2_se,
387 diff, f_levene, p_levene,
388 t_pooled, 2.0 * p_pooled, df_pooled, se_diff_p,
389 diff - (crt_t_p * se_diff_p), diff + (crt_t_p * se_diff_p),
390 t_sep, 2.0 * p_sep, df_sep, se_diff_s,
391 diff - (crt_t_s * se_diff_s), diff + (crt_t_s * se_diff_s));
398 double cov12, cov11, cov22, r, t, p, crt_t, sp, r_t, r_p;
399 struct variable *v1, *v2;
403 cov12 = covariance (v1_sum, v1_n, v2_sum, v2_n, xy_sum);
404 cov11 = covariance (v1_sum, v1_n, v1_sum, v1_n, v1_ss);
405 cov22 = covariance (v2_sum, v2_n, v2_sum, v2_n, v2_ss);
406 r = pearson_r (cov12, cov11, cov22);
407 /* this t and it's associated p is a significance test for the pearson's r */
408 r_t = r * sqrt ((v1_n - 2.0) / (1.0 - (r * r)));
409 r_p = t_sig (r_t, v1_n - 2.0);
411 /* now we move to the t test for the difference in means */
412 diff = xy_diff / v1_n;
413 sp = sqrt (variance (v1_n, xy_ss, xy_diff));
414 se_diff = sp / sqrt (v1_n);
416 crt_t = t_crt (v1_n - 1.0, 0.025);
417 p = t_sig (t, v1_n - 1.0);
420 printf (" Number of 2-tail\n");
421 printf (" Variable pairs Corr Sig Mean SD SE of Mean\n");
422 printf ("---------------------------------------------------------------\n");
423 printf ("%s %8.4f %8.4f %8.4f\n",
424 v1->name, v1_mean, sqrt (v1_var), v1_se);
425 printf (" %8.4f %0.4f %0.4f\n", v1_n, r, r_p);
426 printf ("%s %8.4f %8.4f %8.4f\n",
427 v2->name, v2_mean, sqrt (v2_var), v2_se);
428 printf ("---------------------------------------------------------------\n");
431 printf (" Paired Differences |\n");
432 printf (" Mean SD SE of Mean | t-value df 2-tail Sig\n");
433 printf ("--------------------------------------|---------------------------\n");
435 printf ("%8.4f %8.4f %8.4f | %8.4f %8.4f %8.4f\n",
436 diff, sp, se_diff, t, v1_n - 1.0, 2.0 * (1.0 - p));
438 printf ("95pc CI (%8.4f, %8.4f) |\n\n",
439 diff - (se_diff * crt_t), diff + (se_diff * crt_t));
444 static int parse_value (union value *);
446 /* Parses the GROUPS subcommand. */
448 tts_custom_groups (struct cmd_t_test *cmd unused)
450 groups = parse_variable ();
453 lex_error (_("expecting variable name in GROUPS subcommand"));
456 if (groups->type == T_STRING && groups->width > MAX_SHORT_STRING)
458 msg (SE, _("Long string variable %s is not valid here."),
463 if (!lex_match ('('))
465 if (groups->type == NUMERIC)
468 groups_values[0].f = 1;
469 groups_values[1].f = 2;
474 msg (SE, _("When applying GROUPS to a string variable, at "
475 "least one value must be specified."));
480 if (!parse_value (&groups_values[0]))
489 if (!parse_value (&groups_values[1]))
493 if (!lex_force_match (')'))
499 /* Parses the current token (numeric or string, depending on the
500 variable in `groups') into value V and returns success. */
502 parse_value (union value * v)
504 if (groups->type == NUMERIC)
506 if (!lex_force_num ())
512 if (!lex_force_string ())
514 strncpy (v->s, ds_value (&tokstr), ds_length (&tokstr));
522 /* Parses the PAIRS subcommand. */
524 tts_custom_pairs (struct cmd_t_test *cmd unused)
526 struct variable **vars;
535 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
538 if (!parse_variables (default_dict, &vars, &n_vars,
539 PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH))
543 if (lex_match (T_WITH))
545 n_before_WITH = n_vars;
547 if (!parse_variables (default_dict, &vars, &n_vars,
548 PV_DUPLICATE | PV_APPEND
549 | PV_NUMERIC | PV_NO_SCRATCH))
558 paired = (lex_match ('(') && lex_match_id ("PAIRED") && lex_match (')'));
562 if (n_before_WITH * 2 != n_vars)
565 msg (SE, _("PAIRED was specified but the number of variables "
566 "preceding WITH (%d) did not match the number "
568 n_before_WITH, n_vars - n_before_WITH);
572 extra = n_before_WITH;
574 else if (n_before_WITH)
575 extra = n_before_WITH * (n_vars - n_before_WITH);
581 msg (SE, _("At least two variables must be specified "
586 extra = n_vars * (n_vars - 1) / 2;
590 n_predicted = n_pairs + extra;
593 pairs = xrealloc (pairs, sizeof (struct variable *[2]) * (n_pairs + extra));
599 for (i = 0; i < extra; i++)
601 pairs[n_pairs][0] = vars[i];
602 pairs[n_pairs++][1] = vars[i + extra];
605 else if (n_before_WITH)
609 for (i = 0; i < n_before_WITH; i++)
613 for (j = n_before_WITH; j < n_vars; j++)
615 pairs[n_pairs][0] = vars[i];
616 pairs[n_pairs++][1] = vars[j];
624 for (i = 0; i < n_vars; i++)
628 for (j = i + 1; j < n_vars; j++)
630 pairs[n_pairs][0] = vars[i];
631 pairs[n_pairs++][1] = vars[j];
637 assert (n_pairs == n_predicted);
651 printf (" GROUPS=%s", groups->name);
657 for (i = 0; i < n_groups_values; i++)
658 if (groups->type == NUMERIC)
659 printf ("%g%s", groups_values[i].f, i ? " " : "");
661 printf ("%.*s%s", groups->width, groups_values[i].s,
671 printf (" VARIABLES=");
672 for (i = 0; i < cmd.n_variables; i++)
673 printf ("%s ", cmd.v_variables[i]->name);
681 for (i = 0; i < n_pairs; i++)
682 printf ("%s ", pairs[i][0]->name);
684 for (i = 0; i < n_pairs; i++)
685 printf (" %s", pairs[i][1]->name);
686 printf (" (PAIRED)\n");
688 printf (" MISSING=%s %s\n",
689 cmd.miss == TTS_ANALYSIS ? "ANALYSIS" : "LISTWISE",
690 cmd.miss == TTS_INCLUDE ? "INCLUDE" : "EXCLUDE");
691 printf (" FORMAT=%s\n",
692 cmd.fmt == TTS_LABELS ? "LABELS" : "NOLABELS");
693 if (cmd.criteria != NOT_LONG)
694 printf (" CRITERIA=%f\n", cmd.criteria);
697 #endif /* DEBUGGING */
699 /* Here are some general routines tha should probably be moved into
700 a separate library and documented as part of the PSPP "API" */
702 variance (double n, double ss, double sum)
704 return ((ss - ((sum * sum) / n)) / (n - 1.0));
708 pooled_variance (double n_1, double var_1, double n_2, double var_2)
712 tmp = n_1 + n_2 - 2.0;
713 tmp = (((n_1 - 1.0) * var_1) + ((n_2 - 1.0) * var_2)) / tmp;
714 tmp = tmp * ((n_1 + n_2) / (n_1 * n_2));
719 oneway (double *f, double *p, struct value_list *levels)
721 double k, SSTR, SSE, SSTO, N, MSTR, MSE, sum, dftr, dfe, print;
722 struct value_list *g;
726 for (g = levels; g != NULL; g = g->next)
731 SSTR += g->ss - (pow (g->sum, 2) / g->n);
735 SSTO = SSTO - (pow (sum, 2) / N);
744 *p = f_sig (*f, dfe, dftr);
749 printf ("sum1 %f, sum2 %f, ss1 %f, ss2 %f\n",
750 levels->sum, levels->next->sum, levels->ss, levels->next->ss);
751 printf (" - - - - - - O N E W A Y - - - - - -\n\n");
752 printf (" Variable %s %s\n",
753 cmd.v_variables[0]->name, cmd.v_variables[0]->label);
754 printf ("By Variable %s %s\n", groups->name, groups->label);
755 printf ("\n Analysis of Variance\n\n");
756 printf (" Sum of Mean F F\n");
757 printf ("Source D.F. Squares Squares Ratio Prob\n\n");
758 printf ("Between %8.0f %8.4f %8.4f %8.4f %8.4f\n",
759 dfe, SSE, MSE, *f, *p);
760 printf ("Within %8.0f %8.4f %8.4f\n", dftr, SSTR, MSTR);
761 printf ("Total %8.0f %8.4f\n\n\n", N - 1.0, SSTO);
767 f_sig (double f, double dfn, double dfd)
775 cdff (&which, &p, &q, &f, &dfn, &dfd, &status, &bound);
781 printf ("Parameter 1 is out of range\n");
786 printf ("Parameter 2 is out of range\n");
791 printf ("Parameter 3 is out of range\n");
796 printf ("Parameter 4 is out of range\n");
801 printf ("Parameter 5 is out of range\n");
806 printf ("Parameter 6 is out of range\n");
811 printf ("Parameter 7 is out of range\n");
816 printf ("Parameter 8 is out of range\n");
821 /* printf( "Command completed successfully\n" ); */
826 printf ("Answer appears to be lower than the lowest search bound\n");
831 printf ("Answer appears to be higher than the greatest search bound\n");
836 printf ("P - Q NE 1\n");
843 return (double) ERROR_SIG;
852 t_crt (double df, double q)
857 which = FIND_CRITICAL_VALUE;
862 cdft (&which, &p, &q, &t, &df, &status, &bound);
868 printf ("t_crt: Parameter 1 is out of range\n");
873 printf ("t_crt: value of p (%f) is out of range\n", p);
878 printf ("t_crt: value of q (%f) is out of range\n", q);
883 printf ("t_crt: value of df (%f) is out of range\n", df);
888 printf ("t_crt: Parameter 5 is out of range\n");
893 printf ("t_crt: Parameter 6 is out of range\n");
898 printf ("t_crt: Parameter 7 is out of range\n");
903 /* printf( "Command completed successfully\n" ); */
908 printf ("t_crt: Answer appears to be lower than the lowest search bound\n");
913 printf ("t_crt: Answer appears to be higher than the greatest search bound\n");
918 printf ("t_crt: P - Q NE 1\n");
925 return (double) ERROR_SIG;
934 t_sig (double t, double df)
944 cdft (&which, &p, &q, &t, &df, &status, &bound);
950 printf ("t-sig: Parameter 1 is out of range\n");
955 printf ("t-sig: Parameter 2 is out of range\n");
960 printf ("t-sig: Parameter 3 is out of range\n");
965 printf ("t-sig: Parameter 4 is out of range\n");
970 printf ("t-sig: Parameter 5 is out of range\n");
975 printf ("t-sig: Parameter 6 is out of range\n");
980 printf ("t-sig: Parameter 7 is out of range\n");
985 /* printf( "Command completed successfully\n" ); */
990 printf ("t-sig: Answer appears to be lower than the lowest search bound\n");
995 printf ("t-sig: Answer appears to be higher than the greatest search bound\n");
1000 printf ("t-sig: P - Q NE 1\n");
1007 return (double) ERROR_SIG;
1016 covariance (double x_sum, double x_n, double y_sum, double y_n, double ss)
1020 tmp = x_sum * y_sum;
1023 tmp = (tmp / (x_n + y_n - 1.0));
1028 pearson_r (double c_xy, double c_xx, double c_yy)
1030 return (c_xy / (sqrt (c_xx * c_yy)));
1034 print_t_groups (struct variable * grps, union value * g1, union value * g2,
1035 double n1, double n2, double mean1, double mean2,
1036 double sd1, double sd2, double se1, double se2,
1037 double diff, double l_f, double l_p,
1038 double p_t, double p_sig, double p_df, double p_sed,
1039 double p_l, double p_h,
1040 double s_t, double s_sig, double s_df, double s_sed,
1041 double s_l, double s_h)
1044 /* Display all this shit as SPSS 6.0 does (roughly) */
1045 printf ("\n\n Number \n");
1046 printf (" Variable of Cases Mean SD SE of Mean\n");
1047 printf ("-----------------------------------------------------------\n");
1048 printf (" %s %s\n\n", cmd.v_variables[cur_var]->name, cmd.v_variables[cur_var]->label);
1049 printf ("%s %8.4f %8.0f %8.4f %8.3f %8.3f\n",
1050 val_labs_find (grps->val_labs, *g1), g1->f, n1, mean1, sd1, se1);
1051 printf ("%s %8.4f %8.0f %8.4f %8.3f %8.3f\n",
1052 val_labs_find (grps->val_labs, *g2), g2->f, n2, mean2, sd2, se2);
1053 printf ("-----------------------------------------------------------\n");
1054 printf ("\n Mean Difference = %8.4f\n", diff);
1055 printf ("\n Levene's Test for Equality of Variances: F= %.3f P= %.3f\n",
1057 printf ("\n\n t-test for Equality of Means 95pc \n");
1058 printf ("Variances t-value df 2-Tail Sig SE of Diff CI for Diff \n");
1059 printf ("-----------------------------------------------------------------\n");
1060 printf ("Equal %8.2f %8.0f %8.3f %8.3f (%8.3f, %8.3f)\n",
1061 p_t, p_df, p_sig, p_sed, p_l, p_h);
1062 printf ("Unequal %8.2f %8.2f %8.3f %8.3f (%8.3f, %8.3f)\n",
1063 s_t, s_df, s_sig, s_sed, s_l, s_h);
1064 printf ("-----------------------------------------------------------------\n");