1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27 #include "dcdflib/cdflib.h"
38 variables=varlist("PV_NO_SCRATCH | PV_NUMERIC");
40 +missing=miss:!analysis/listwise,
41 incl:include/!exclude;
42 +format=fmt:!labels/nolabels;
43 +criteria=:ci(d:criteria,"%s > 0. && %s < 1.").
48 #include "debug-print.h"
50 /* Command parsing information. */
51 static struct cmd_t_test cmd;
53 /* Variable for the GROUPS subcommand, if given. */
54 static struct variable *groups;
56 /* GROUPS: Number of values specified by the user; the values
58 static int n_groups_values;
59 static union value groups_values[2];
61 /* PAIRED: Number of pairs; each pair. */
63 static struct variable *(*pairs)[2];
65 /* Routines to scan data and perform t-tests */
66 static void precalc (void);
67 static void postcalc (void);
68 static void g_postcalc (void);
69 static void t_pairs (void);
70 static void t_groups (void);
71 static int groups_calc (struct ccase *);
72 static int pairs_calc (struct ccase *);
73 static int z_calc (struct ccase *);
80 struct value_list *next;
83 /* general workhorses - should move these to a separate library... */
84 double variance (double n, double ss, double sum);
86 double covariance (double x_sum, double x_n,
87 double y_sum, double y_n, double ss);
89 double pooled_variance (double n_1, double var_1,
90 double n_2, double var_2);
92 double oneway (double *f, double *p, struct value_list *list);
94 double pearson_r (double c_xy, double c_xx, double c_yy);
96 double f_sig (double f, double dfn, double dfd);
97 double t_crt (double df, double q);
98 double t_sig (double t, double df);
100 /* massive function simply to remove any responsibility for output
101 from the function which does the actual t-test calculations */
102 void print_t_groups (struct variable * grps, union value * g1, union value * g2,
103 double n1, double n2, double mean1, double mean2,
104 double sd1, double sd2, double se1, double se2,
105 double diff, double l_f, double l_p,
106 double p_t, double p_sig, double p_df, double p_sed,
107 double p_l, double p_h,
108 double s_t, double s_sig, double s_df, double s_sed,
109 double s_l, double s_h);
111 /* Global variables to communicate between calc() and postcalc()
112 should move to a structure in the p union of variable... */
113 static double v1_n, v1_ss, v1_sum, v1_se, v1_var, v1_mean;
114 static double v2_n, v2_ss, v2_sum, v2_se, v2_var, v2_mean;
115 static double v1_z_sum, v1_z_ss;
116 static double v2_z_sum, v2_z_ss;
117 static double diff, se_diff, sp, xy_sum, xy_diff, xy_ss;
120 /* some defines for CDFlib */
122 #define FIND_CRITICAL_VALUE 2
126 static void debug_print (void);
129 /* Parses and executes the T-TEST procedure. */
133 struct cmd_t_test cmd;
135 if (!lex_force_match_id ("T"))
138 lex_match_id ("TEST");
140 if (!parse_t_test (&cmd))
148 procedure (precalc, pairs_calc, postcalc);
150 /* probably groups then... */
152 printf ("\n\n t-tests for independent samples of %s %s\n",
153 groups->name, groups->label);
155 for (cur_var = 0; cur_var < cmd.n_variables; cur_var++)
157 v1_n = v1_ss = v1_sum = v1_se = v1_var = v1_mean = 0.0;
158 v2_n = v2_ss = v2_sum = v2_se = v2_var = v2_mean = 0.0;
159 v1_z_sum = v1_z_ss = v2_z_sum = v2_z_ss = 0.0;
160 diff = se_diff = sp = xy_diff = xy_ss = xy_sum = 0.0;
162 procedure (precalc, groups_calc, g_postcalc);
163 procedure (precalc, z_calc, postcalc);
173 return; /* rilly void... */
177 groups_calc (struct ccase * c)
181 struct variable *v = cmd.v_variables[cur_var];
182 double X = c->data[v->fv].f;
184 /* Get the weight for this case. */
185 if (default_dict.weight_index == -1)
189 w = c->data[default_dict.weight_index].f;
190 if (w <= 0.0 || w == SYSMIS)
194 printf ("Bad weight\n");
198 if (X == SYSMIS || X == 0.0) /* FIXME: should be USER_MISSING? */
200 /* printf("Missing value\n"); */
206 group = c->data[groups->fv].f;
208 if (group == groups_values[0].f)
214 else if (group == groups_values[1].f)
228 v1_mean = v1_sum / v1_n;
229 v2_mean = v2_sum / v2_n;
233 int /* this pass generates the z-zcores */
234 z_calc (struct ccase * c)
238 struct variable *v = cmd.v_variables[cur_var];
239 double X = c->data[v->fv].f;
243 /* Get the weight for this case. */
244 if (default_dict.weight_index == -1)
248 w = c->data[default_dict.weight_index].f;
249 if (w <= 0.0 || w == SYSMIS)
256 if (X == SYSMIS || X == 0.0) /* FIXME: how to specify user missing? */
262 group = c->data[groups->fv].f;
265 if (group == groups_values[0].f)
267 z = fabs (X - v1_mean);
269 v1_z_ss += pow (z, 2);
271 else if (group == groups_values[1].f)
273 z = fabs (X - v2_mean);
274 v2_z_ss += pow (z, 2);
284 pairs_calc (struct ccase * c)
287 struct variable *v1, *v2;
290 for (i = 0; i < n_pairs; i++)
295 X = c->data[v1->fv].f;
296 Y = c->data[v2->fv].f;
298 if (X == SYSMIS || Y == SYSMIS)
300 printf ("Missing value\n");
306 xy_ss += pow ((X - Y), 2);
322 /* Calculate basic statistics */
323 v1_var = variance (v1_n, v1_ss, v1_sum); /* variances */
324 v2_var = variance (v2_n, v2_ss, v2_sum);
325 v1_se = sqrt (v1_var / v1_n); /* standard errors */
326 v2_se = sqrt (v2_var / v2_n);
327 diff = v1_mean - v2_mean;
344 double df_pooled, t_pooled, t_sep, p_pooled, p_sep;
345 double crt_t_p, crt_t_s, tmp, v1_z, v2_z, f_levene, p_levene;
346 double df_sep, se_diff_s, se_diff_p;
347 struct value_list *val_1, *val_2;
350 val_1 = malloc (sizeof (struct value_list));
351 val_1->sum = v1_z_sum;
354 val_2 = malloc (sizeof (struct value_list));
355 val_2->sum = v2_z_sum;
362 f_levene = oneway (&f_levene, &p_levene, val_1);
364 /* T test results for pooled variances */
365 se_diff_p = sqrt (pooled_variance (v1_n, v1_var, v2_n, v2_var));
366 df_pooled = v1_n + v2_n - 2.0;
367 t_pooled = diff / se_diff_p;
368 p_pooled = t_sig (t_pooled, df_pooled);
369 crt_t_p = t_crt (df_pooled, 0.025);
371 if ((2.0 * p_pooled) >= 1.0)
372 p_pooled = 1.0 - p_pooled;
374 /* oh god, the separate variance calculations... */
375 t_sep = diff / sqrt ((v1_var / v1_n) + (v2_var / v2_n));
377 tmp = (v1_var / v1_n) + (v2_var / v2_n);
378 tmp = (v1_var / v1_n) / tmp;
380 tmp = tmp / (v1_n - 1.0);
383 tmp = (v1_var / v1_n) + (v2_var / v2_n);
384 tmp = (v2_var / v2_n) / tmp;
386 tmp = tmp / (v2_n - 1.0);
389 tmp = 1.0 / (v1_z + v2_z);
392 p_sep = t_sig (t_sep, df_sep);
393 if ((2.0 * p_sep) >= 1.0)
395 crt_t_s = t_crt (df_sep, 0.025);
396 se_diff_s = sqrt ((v1_var / v1_n) + (v2_var / v2_n));
398 /* FIXME: convert to a proper PSPP output call */
399 print_t_groups (groups, &groups_values[0], &groups_values[1],
400 v1_n, v2_n, v1_mean, v2_mean,
401 sqrt (v1_var), sqrt (v2_var), v1_se, v2_se,
402 diff, f_levene, p_levene,
403 t_pooled, 2.0 * p_pooled, df_pooled, se_diff_p,
404 diff - (crt_t_p * se_diff_p), diff + (crt_t_p * se_diff_p),
405 t_sep, 2.0 * p_sep, df_sep, se_diff_s,
406 diff - (crt_t_s * se_diff_s), diff + (crt_t_s * se_diff_s));
413 double cov12, cov11, cov22, r, t, p, crt_t, sp, r_t, r_p;
414 struct variable *v1, *v2;
418 cov12 = covariance (v1_sum, v1_n, v2_sum, v2_n, xy_sum);
419 cov11 = covariance (v1_sum, v1_n, v1_sum, v1_n, v1_ss);
420 cov22 = covariance (v2_sum, v2_n, v2_sum, v2_n, v2_ss);
421 r = pearson_r (cov12, cov11, cov22);
422 /* this t and it's associated p is a significance test for the pearson's r */
423 r_t = r * sqrt ((v1_n - 2.0) / (1.0 - (r * r)));
424 r_p = t_sig (r_t, v1_n - 2.0);
426 /* now we move to the t test for the difference in means */
427 diff = xy_diff / v1_n;
428 sp = sqrt (variance (v1_n, xy_ss, xy_diff));
429 se_diff = sp / sqrt (v1_n);
431 crt_t = t_crt (v1_n - 1.0, 0.025);
432 p = t_sig (t, v1_n - 1.0);
435 printf (" Number of 2-tail\n");
436 printf (" Variable pairs Corr Sig Mean SD SE of Mean\n");
437 printf ("---------------------------------------------------------------\n");
438 printf ("%s %8.4f %8.4f %8.4f\n",
439 v1->name, v1_mean, sqrt (v1_var), v1_se);
440 printf (" %8.4f %0.4f %0.4f\n", v1_n, r, r_p);
441 printf ("%s %8.4f %8.4f %8.4f\n",
442 v2->name, v2_mean, sqrt (v2_var), v2_se);
443 printf ("---------------------------------------------------------------\n");
446 printf (" Paired Differences |\n");
447 printf (" Mean SD SE of Mean | t-value df 2-tail Sig\n");
448 printf ("--------------------------------------|---------------------------\n");
450 printf ("%8.4f %8.4f %8.4f | %8.4f %8.4f %8.4f\n",
451 diff, sp, se_diff, t, v1_n - 1.0, 2.0 * (1.0 - p));
453 printf ("95pc CI (%8.4f, %8.4f) |\n\n",
454 diff - (se_diff * crt_t), diff + (se_diff * crt_t));
459 static int parse_value (union value *);
461 /* Parses the GROUPS subcommand. */
463 tts_custom_groups (struct cmd_t_test *cmd unused)
465 groups = parse_variable ();
468 lex_error (_("expecting variable name in GROUPS subcommand"));
471 if (groups->type == T_STRING && groups->width > MAX_SHORT_STRING)
473 msg (SE, _("Long string variable %s is not valid here."),
478 if (!lex_match ('('))
480 if (groups->type == NUMERIC)
483 groups_values[0].f = 1;
484 groups_values[1].f = 2;
489 msg (SE, _("When applying GROUPS to a string variable, at "
490 "least one value must be specified."));
495 if (!parse_value (&groups_values[0]))
504 if (!parse_value (&groups_values[1]))
508 if (!lex_force_match (')'))
514 /* Parses the current token (numeric or string, depending on the
515 variable in `groups') into value V and returns success. */
517 parse_value (union value * v)
519 if (groups->type == NUMERIC)
521 if (!lex_force_num ())
527 if (!lex_force_string ())
529 strncpy (v->s, ds_value (&tokstr), ds_length (&tokstr));
537 /* Parses the PAIRS subcommand. */
539 tts_custom_pairs (struct cmd_t_test *cmd unused)
541 struct variable **vars;
550 if ((token != T_ID || !is_varname (tokid)) && token != T_ALL)
552 if (!parse_variables (&default_dict, &vars, &n_vars,
553 PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH))
557 if (lex_match (T_WITH))
559 n_before_WITH = n_vars;
561 if (!parse_variables (&default_dict, &vars, &n_vars,
562 PV_DUPLICATE | PV_APPEND
563 | PV_NUMERIC | PV_NO_SCRATCH))
572 paired = (lex_match ('(') && lex_match_id ("PAIRED") && lex_match (')'));
576 if (n_before_WITH * 2 != n_vars)
579 msg (SE, _("PAIRED was specified but the number of variables "
580 "preceding WITH (%d) did not match the number "
582 n_before_WITH, n_vars - n_before_WITH);
586 extra = n_before_WITH;
588 else if (n_before_WITH)
589 extra = n_before_WITH * (n_vars - n_before_WITH);
595 msg (SE, _("At least two variables must be specified "
600 extra = n_vars * (n_vars - 1) / 2;
604 n_predicted = n_pairs + extra;
607 pairs = xrealloc (pairs, sizeof (struct variable *[2]) * (n_pairs + extra));
613 for (i = 0; i < extra; i++)
615 pairs[n_pairs][0] = vars[i];
616 pairs[n_pairs++][1] = vars[i + extra];
619 else if (n_before_WITH)
623 for (i = 0; i < n_before_WITH; i++)
627 for (j = n_before_WITH; j < n_vars; j++)
629 pairs[n_pairs][0] = vars[i];
630 pairs[n_pairs++][1] = vars[j];
638 for (i = 0; i < n_vars; i++)
642 for (j = i + 1; j < n_vars; j++)
644 pairs[n_pairs][0] = vars[i];
645 pairs[n_pairs++][1] = vars[j];
651 assert (n_pairs == n_predicted);
665 printf (" GROUPS=%s", groups->name);
671 for (i = 0; i < n_groups_values; i++)
672 if (groups->type == NUMERIC)
673 printf ("%g%s", groups_values[i].f, i ? " " : "");
675 printf ("%.*s%s", groups->width, groups_values[i].s,
685 printf (" VARIABLES=");
686 for (i = 0; i < cmd.n_variables; i++)
687 printf ("%s ", cmd.v_variables[i]->name);
695 for (i = 0; i < n_pairs; i++)
696 printf ("%s ", pairs[i][0]->name);
698 for (i = 0; i < n_pairs; i++)
699 printf (" %s", pairs[i][1]->name);
700 printf (" (PAIRED)\n");
702 printf (" MISSING=%s %s\n",
703 cmd.miss == TTS_ANALYSIS ? "ANALYSIS" : "LISTWISE",
704 cmd.miss == TTS_INCLUDE ? "INCLUDE" : "EXCLUDE");
705 printf (" FORMAT=%s\n",
706 cmd.fmt == TTS_LABELS ? "LABELS" : "NOLABELS");
707 if (cmd.criteria != NOT_LONG)
708 printf (" CRITERIA=%f\n", cmd.criteria);
711 #endif /* DEBUGGING */
713 /* Here are some general routines tha should probably be moved into
714 a separate library and documented as part of the PSPP "API" */
716 variance (double n, double ss, double sum)
718 return ((ss - ((sum * sum) / n)) / (n - 1.0));
722 pooled_variance (double n_1, double var_1, double n_2, double var_2)
726 tmp = n_1 + n_2 - 2.0;
727 tmp = (((n_1 - 1.0) * var_1) + ((n_2 - 1.0) * var_2)) / tmp;
728 tmp = tmp * ((n_1 + n_2) / (n_1 * n_2));
733 oneway (double *f, double *p, struct value_list *levels)
735 double k, SSTR, SSE, SSTO, N, MSTR, MSE, sum, dftr, dfe, print;
736 struct value_list *g;
740 for (g = levels; g != NULL; g = g->next)
745 SSTR += g->ss - (pow (g->sum, 2) / g->n);
749 SSTO = SSTO - (pow (sum, 2) / N);
758 *p = f_sig (*f, dfe, dftr);
763 printf ("sum1 %f, sum2 %f, ss1 %f, ss2 %f\n",
764 levels->sum, levels->next->sum, levels->ss, levels->next->ss);
765 printf (" - - - - - - O N E W A Y - - - - - -\n\n");
766 printf (" Variable %s %s\n",
767 cmd.v_variables[0]->name, cmd.v_variables[0]->label);
768 printf ("By Variable %s %s\n", groups->name, groups->label);
769 printf ("\n Analysis of Variance\n\n");
770 printf (" Sum of Mean F F\n");
771 printf ("Source D.F. Squares Squares Ratio Prob\n\n");
772 printf ("Between %8.0f %8.4f %8.4f %8.4f %8.4f\n",
773 dfe, SSE, MSE, *f, *p);
774 printf ("Within %8.0f %8.4f %8.4f\n", dftr, SSTR, MSTR);
775 printf ("Total %8.0f %8.4f\n\n\n", N - 1.0, SSTO);
781 f_sig (double f, double dfn, double dfd)
789 cdff (&which, &p, &q, &f, &dfn, &dfd, &status, &bound);
795 printf ("Parameter 1 is out of range\n");
800 printf ("Parameter 2 is out of range\n");
805 printf ("Parameter 3 is out of range\n");
810 printf ("Parameter 4 is out of range\n");
815 printf ("Parameter 5 is out of range\n");
820 printf ("Parameter 6 is out of range\n");
825 printf ("Parameter 7 is out of range\n");
830 printf ("Parameter 8 is out of range\n");
835 /* printf( "Command completed successfully\n" ); */
840 printf ("Answer appears to be lower than the lowest search bound\n");
845 printf ("Answer appears to be higher than the greatest search bound\n");
850 printf ("P - Q NE 1\n");
857 return (double) ERROR_SIG;
866 t_crt (double df, double q)
871 which = FIND_CRITICAL_VALUE;
876 cdft (&which, &p, &q, &t, &df, &status, &bound);
882 printf ("t_crt: Parameter 1 is out of range\n");
887 printf ("t_crt: value of p (%f) is out of range\n", p);
892 printf ("t_crt: value of q (%f) is out of range\n", q);
897 printf ("t_crt: value of df (%f) is out of range\n", df);
902 printf ("t_crt: Parameter 5 is out of range\n");
907 printf ("t_crt: Parameter 6 is out of range\n");
912 printf ("t_crt: Parameter 7 is out of range\n");
917 /* printf( "Command completed successfully\n" ); */
922 printf ("t_crt: Answer appears to be lower than the lowest search bound\n");
927 printf ("t_crt: Answer appears to be higher than the greatest search bound\n");
932 printf ("t_crt: P - Q NE 1\n");
939 return (double) ERROR_SIG;
948 t_sig (double t, double df)
958 cdft (&which, &p, &q, &t, &df, &status, &bound);
964 printf ("t-sig: Parameter 1 is out of range\n");
969 printf ("t-sig: Parameter 2 is out of range\n");
974 printf ("t-sig: Parameter 3 is out of range\n");
979 printf ("t-sig: Parameter 4 is out of range\n");
984 printf ("t-sig: Parameter 5 is out of range\n");
989 printf ("t-sig: Parameter 6 is out of range\n");
994 printf ("t-sig: Parameter 7 is out of range\n");
999 /* printf( "Command completed successfully\n" ); */
1004 printf ("t-sig: Answer appears to be lower than the lowest search bound\n");
1009 printf ("t-sig: Answer appears to be higher than the greatest search bound\n");
1014 printf ("t-sig: P - Q NE 1\n");
1021 return (double) ERROR_SIG;
1030 covariance (double x_sum, double x_n, double y_sum, double y_n, double ss)
1034 tmp = x_sum * y_sum;
1037 tmp = (tmp / (x_n + y_n - 1.0));
1042 pearson_r (double c_xy, double c_xx, double c_yy)
1044 return (c_xy / (sqrt (c_xx * c_yy)));
1048 print_t_groups (struct variable * grps, union value * g1, union value * g2,
1049 double n1, double n2, double mean1, double mean2,
1050 double sd1, double sd2, double se1, double se2,
1051 double diff, double l_f, double l_p,
1052 double p_t, double p_sig, double p_df, double p_sed,
1053 double p_l, double p_h,
1054 double s_t, double s_sig, double s_df, double s_sed,
1055 double s_l, double s_h)
1058 /* Display all this shit as SPSS 6.0 does (roughly) */
1059 printf ("\n\n Number \n");
1060 printf (" Variable of Cases Mean SD SE of Mean\n");
1061 printf ("-----------------------------------------------------------\n");
1062 printf (" %s %s\n\n", cmd.v_variables[cur_var]->name, cmd.v_variables[cur_var]->label);
1063 printf ("%s %8.4f %8.0f %8.4f %8.3f %8.3f\n",
1064 get_val_lab (grps, *g1, 0), g1->f, n1, mean1, sd1, se1);
1065 printf ("%s %8.4f %8.0f %8.4f %8.3f %8.3f\n",
1066 get_val_lab (grps, *g2, 0), g2->f, n2, mean2, sd2, se2);
1067 printf ("-----------------------------------------------------------\n");
1068 printf ("\n Mean Difference = %8.4f\n", diff);
1069 printf ("\n Levene's Test for Equality of Variances: F= %.3f P= %.3f\n",
1071 printf ("\n\n t-test for Equality of Means 95pc \n");
1072 printf ("Variances t-value df 2-Tail Sig SE of Diff CI for Diff \n");
1073 printf ("-----------------------------------------------------------------\n");
1074 printf ("Equal %8.2f %8.0f %8.3f %8.3f (%8.3f, %8.3f)\n",
1075 p_t, p_df, p_sig, p_sed, p_l, p_h);
1076 printf ("Unequal %8.2f %8.2f %8.3f %8.3f (%8.3f, %8.3f)\n",
1077 s_t, s_df, s_sig, s_sed, s_l, s_h);
1078 printf ("-----------------------------------------------------------------\n");