1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by John Williams <johnr.williams@stonebow.otago.ac.nz>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27 #include "dcdflib/cdflib.h"
32 #include "value-labels.h"
39 variables=varlist("PV_NO_SCRATCH | PV_NUMERIC");
41 +missing=miss:!analysis/listwise,
42 incl:include/!exclude;
43 +format=fmt:!labels/nolabels;
44 +criteria=:ci(d:criteria,"%s > 0. && %s < 1.").
49 #include "debug-print.h"
51 /* Command parsing information. */
52 static struct cmd_t_test cmd;
54 /* Variable for the GROUPS subcommand, if given. */
55 static struct variable *groups;
57 /* GROUPS: Number of values specified by the user; the values
59 static int n_groups_values;
60 static union value groups_values[2];
62 /* PAIRED: Number of pairs; each pair. */
64 static struct variable *(*pairs)[2];
66 /* Routines to scan data and perform t-tests */
67 static void precalc (void);
68 static void postcalc (void);
69 static void g_postcalc (void);
70 static void t_pairs (void);
71 static void t_groups (void);
72 static int groups_calc (struct ccase *);
73 static int pairs_calc (struct ccase *);
74 static int z_calc (struct ccase *);
81 struct value_list *next;
84 /* general workhorses - should move these to a separate library... */
85 double variance (double n, double ss, double sum);
87 double covariance (double x_sum, double x_n,
88 double y_sum, double y_n, double ss);
90 double pooled_variance (double n_1, double var_1,
91 double n_2, double var_2);
93 double oneway (double *f, double *p, struct value_list *list);
95 double pearson_r (double c_xy, double c_xx, double c_yy);
97 double f_sig (double f, double dfn, double dfd);
98 double t_crt (double df, double q);
99 double t_sig (double t, double df);
101 /* massive function simply to remove any responsibility for output
102 from the function which does the actual t-test calculations */
103 void print_t_groups (struct variable * grps, union value * g1, union value * g2,
104 double n1, double n2, double mean1, double mean2,
105 double sd1, double sd2, double se1, double se2,
106 double diff, double l_f, double l_p,
107 double p_t, double p_sig, double p_df, double p_sed,
108 double p_l, double p_h,
109 double s_t, double s_sig, double s_df, double s_sed,
110 double s_l, double s_h);
112 /* Global variables to communicate between calc() and postcalc()
113 should move to a structure in the p union of variable... */
114 static double v1_n, v1_ss, v1_sum, v1_se, v1_var, v1_mean;
115 static double v2_n, v2_ss, v2_sum, v2_se, v2_var, v2_mean;
116 static double v1_z_sum, v1_z_ss;
117 static double v2_z_sum, v2_z_ss;
118 static double diff, se_diff, sp, xy_sum, xy_diff, xy_ss;
121 /* some defines for CDFlib */
123 #define FIND_CRITICAL_VALUE 2
127 static void debug_print (void);
130 /* Parses and executes the T-TEST procedure. */
134 struct cmd_t_test cmd;
136 if (!lex_force_match_id ("T"))
139 lex_match_id ("TEST");
141 if (!parse_t_test (&cmd))
149 procedure (precalc, pairs_calc, postcalc);
151 /* probably groups then... */
153 printf ("\n\n t-tests for independent samples of %s %s\n",
154 groups->name, groups->label);
156 for (cur_var = 0; cur_var < cmd.n_variables; cur_var++)
158 v1_n = v1_ss = v1_sum = v1_se = v1_var = v1_mean = 0.0;
159 v2_n = v2_ss = v2_sum = v2_se = v2_var = v2_mean = 0.0;
160 v1_z_sum = v1_z_ss = v2_z_sum = v2_z_ss = 0.0;
161 diff = se_diff = sp = xy_diff = xy_ss = xy_sum = 0.0;
163 procedure (precalc, groups_calc, g_postcalc);
164 procedure (precalc, z_calc, postcalc);
174 return; /* rilly void... */
178 groups_calc (struct ccase * c)
182 struct variable *v = cmd.v_variables[cur_var];
183 double X = c->data[v->fv].f;
185 /* Get the weight for this case. */
186 if (default_dict.weight_index == -1)
190 w = c->data[default_dict.weight_index].f;
191 if (w <= 0.0 || w == SYSMIS)
195 printf ("Bad weight\n");
199 if (X == SYSMIS || X == 0.0) /* FIXME: should be USER_MISSING? */
201 /* printf("Missing value\n"); */
207 group = c->data[groups->fv].f;
209 if (group == groups_values[0].f)
215 else if (group == groups_values[1].f)
229 v1_mean = v1_sum / v1_n;
230 v2_mean = v2_sum / v2_n;
234 int /* this pass generates the z-zcores */
235 z_calc (struct ccase * c)
239 struct variable *v = cmd.v_variables[cur_var];
240 double X = c->data[v->fv].f;
244 /* Get the weight for this case. */
245 if (default_dict.weight_index == -1)
249 w = c->data[default_dict.weight_index].f;
250 if (w <= 0.0 || w == SYSMIS)
257 if (X == SYSMIS || X == 0.0) /* FIXME: how to specify user missing? */
263 group = c->data[groups->fv].f;
266 if (group == groups_values[0].f)
268 z = fabs (X - v1_mean);
270 v1_z_ss += pow (z, 2);
272 else if (group == groups_values[1].f)
274 z = fabs (X - v2_mean);
275 v2_z_ss += pow (z, 2);
285 pairs_calc (struct ccase * c)
288 struct variable *v1, *v2;
291 for (i = 0; i < n_pairs; i++)
296 X = c->data[v1->fv].f;
297 Y = c->data[v2->fv].f;
299 if (X == SYSMIS || Y == SYSMIS)
301 printf ("Missing value\n");
307 xy_ss += pow ((X - Y), 2);
323 /* Calculate basic statistics */
324 v1_var = variance (v1_n, v1_ss, v1_sum); /* variances */
325 v2_var = variance (v2_n, v2_ss, v2_sum);
326 v1_se = sqrt (v1_var / v1_n); /* standard errors */
327 v2_se = sqrt (v2_var / v2_n);
328 diff = v1_mean - v2_mean;
345 double df_pooled, t_pooled, t_sep, p_pooled, p_sep;
346 double crt_t_p, crt_t_s, tmp, v1_z, v2_z, f_levene, p_levene;
347 double df_sep, se_diff_s, se_diff_p;
348 struct value_list *val_1, *val_2;
351 val_1 = malloc (sizeof (struct value_list));
352 val_1->sum = v1_z_sum;
355 val_2 = malloc (sizeof (struct value_list));
356 val_2->sum = v2_z_sum;
363 f_levene = oneway (&f_levene, &p_levene, val_1);
365 /* T test results for pooled variances */
366 se_diff_p = sqrt (pooled_variance (v1_n, v1_var, v2_n, v2_var));
367 df_pooled = v1_n + v2_n - 2.0;
368 t_pooled = diff / se_diff_p;
369 p_pooled = t_sig (t_pooled, df_pooled);
370 crt_t_p = t_crt (df_pooled, 0.025);
372 if ((2.0 * p_pooled) >= 1.0)
373 p_pooled = 1.0 - p_pooled;
375 /* oh god, the separate variance calculations... */
376 t_sep = diff / sqrt ((v1_var / v1_n) + (v2_var / v2_n));
378 tmp = (v1_var / v1_n) + (v2_var / v2_n);
379 tmp = (v1_var / v1_n) / tmp;
381 tmp = tmp / (v1_n - 1.0);
384 tmp = (v1_var / v1_n) + (v2_var / v2_n);
385 tmp = (v2_var / v2_n) / tmp;
387 tmp = tmp / (v2_n - 1.0);
390 tmp = 1.0 / (v1_z + v2_z);
393 p_sep = t_sig (t_sep, df_sep);
394 if ((2.0 * p_sep) >= 1.0)
396 crt_t_s = t_crt (df_sep, 0.025);
397 se_diff_s = sqrt ((v1_var / v1_n) + (v2_var / v2_n));
399 /* FIXME: convert to a proper PSPP output call */
400 print_t_groups (groups, &groups_values[0], &groups_values[1],
401 v1_n, v2_n, v1_mean, v2_mean,
402 sqrt (v1_var), sqrt (v2_var), v1_se, v2_se,
403 diff, f_levene, p_levene,
404 t_pooled, 2.0 * p_pooled, df_pooled, se_diff_p,
405 diff - (crt_t_p * se_diff_p), diff + (crt_t_p * se_diff_p),
406 t_sep, 2.0 * p_sep, df_sep, se_diff_s,
407 diff - (crt_t_s * se_diff_s), diff + (crt_t_s * se_diff_s));
414 double cov12, cov11, cov22, r, t, p, crt_t, sp, r_t, r_p;
415 struct variable *v1, *v2;
419 cov12 = covariance (v1_sum, v1_n, v2_sum, v2_n, xy_sum);
420 cov11 = covariance (v1_sum, v1_n, v1_sum, v1_n, v1_ss);
421 cov22 = covariance (v2_sum, v2_n, v2_sum, v2_n, v2_ss);
422 r = pearson_r (cov12, cov11, cov22);
423 /* this t and it's associated p is a significance test for the pearson's r */
424 r_t = r * sqrt ((v1_n - 2.0) / (1.0 - (r * r)));
425 r_p = t_sig (r_t, v1_n - 2.0);
427 /* now we move to the t test for the difference in means */
428 diff = xy_diff / v1_n;
429 sp = sqrt (variance (v1_n, xy_ss, xy_diff));
430 se_diff = sp / sqrt (v1_n);
432 crt_t = t_crt (v1_n - 1.0, 0.025);
433 p = t_sig (t, v1_n - 1.0);
436 printf (" Number of 2-tail\n");
437 printf (" Variable pairs Corr Sig Mean SD SE of Mean\n");
438 printf ("---------------------------------------------------------------\n");
439 printf ("%s %8.4f %8.4f %8.4f\n",
440 v1->name, v1_mean, sqrt (v1_var), v1_se);
441 printf (" %8.4f %0.4f %0.4f\n", v1_n, r, r_p);
442 printf ("%s %8.4f %8.4f %8.4f\n",
443 v2->name, v2_mean, sqrt (v2_var), v2_se);
444 printf ("---------------------------------------------------------------\n");
447 printf (" Paired Differences |\n");
448 printf (" Mean SD SE of Mean | t-value df 2-tail Sig\n");
449 printf ("--------------------------------------|---------------------------\n");
451 printf ("%8.4f %8.4f %8.4f | %8.4f %8.4f %8.4f\n",
452 diff, sp, se_diff, t, v1_n - 1.0, 2.0 * (1.0 - p));
454 printf ("95pc CI (%8.4f, %8.4f) |\n\n",
455 diff - (se_diff * crt_t), diff + (se_diff * crt_t));
460 static int parse_value (union value *);
462 /* Parses the GROUPS subcommand. */
464 tts_custom_groups (struct cmd_t_test *cmd unused)
466 groups = parse_variable ();
469 lex_error (_("expecting variable name in GROUPS subcommand"));
472 if (groups->type == T_STRING && groups->width > MAX_SHORT_STRING)
474 msg (SE, _("Long string variable %s is not valid here."),
479 if (!lex_match ('('))
481 if (groups->type == NUMERIC)
484 groups_values[0].f = 1;
485 groups_values[1].f = 2;
490 msg (SE, _("When applying GROUPS to a string variable, at "
491 "least one value must be specified."));
496 if (!parse_value (&groups_values[0]))
505 if (!parse_value (&groups_values[1]))
509 if (!lex_force_match (')'))
515 /* Parses the current token (numeric or string, depending on the
516 variable in `groups') into value V and returns success. */
518 parse_value (union value * v)
520 if (groups->type == NUMERIC)
522 if (!lex_force_num ())
528 if (!lex_force_string ())
530 strncpy (v->s, ds_value (&tokstr), ds_length (&tokstr));
538 /* Parses the PAIRS subcommand. */
540 tts_custom_pairs (struct cmd_t_test *cmd unused)
542 struct variable **vars;
551 if ((token != T_ID || !is_varname (tokid)) && token != T_ALL)
553 if (!parse_variables (&default_dict, &vars, &n_vars,
554 PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH))
558 if (lex_match (T_WITH))
560 n_before_WITH = n_vars;
562 if (!parse_variables (&default_dict, &vars, &n_vars,
563 PV_DUPLICATE | PV_APPEND
564 | PV_NUMERIC | PV_NO_SCRATCH))
573 paired = (lex_match ('(') && lex_match_id ("PAIRED") && lex_match (')'));
577 if (n_before_WITH * 2 != n_vars)
580 msg (SE, _("PAIRED was specified but the number of variables "
581 "preceding WITH (%d) did not match the number "
583 n_before_WITH, n_vars - n_before_WITH);
587 extra = n_before_WITH;
589 else if (n_before_WITH)
590 extra = n_before_WITH * (n_vars - n_before_WITH);
596 msg (SE, _("At least two variables must be specified "
601 extra = n_vars * (n_vars - 1) / 2;
605 n_predicted = n_pairs + extra;
608 pairs = xrealloc (pairs, sizeof (struct variable *[2]) * (n_pairs + extra));
614 for (i = 0; i < extra; i++)
616 pairs[n_pairs][0] = vars[i];
617 pairs[n_pairs++][1] = vars[i + extra];
620 else if (n_before_WITH)
624 for (i = 0; i < n_before_WITH; i++)
628 for (j = n_before_WITH; j < n_vars; j++)
630 pairs[n_pairs][0] = vars[i];
631 pairs[n_pairs++][1] = vars[j];
639 for (i = 0; i < n_vars; i++)
643 for (j = i + 1; j < n_vars; j++)
645 pairs[n_pairs][0] = vars[i];
646 pairs[n_pairs++][1] = vars[j];
652 assert (n_pairs == n_predicted);
666 printf (" GROUPS=%s", groups->name);
672 for (i = 0; i < n_groups_values; i++)
673 if (groups->type == NUMERIC)
674 printf ("%g%s", groups_values[i].f, i ? " " : "");
676 printf ("%.*s%s", groups->width, groups_values[i].s,
686 printf (" VARIABLES=");
687 for (i = 0; i < cmd.n_variables; i++)
688 printf ("%s ", cmd.v_variables[i]->name);
696 for (i = 0; i < n_pairs; i++)
697 printf ("%s ", pairs[i][0]->name);
699 for (i = 0; i < n_pairs; i++)
700 printf (" %s", pairs[i][1]->name);
701 printf (" (PAIRED)\n");
703 printf (" MISSING=%s %s\n",
704 cmd.miss == TTS_ANALYSIS ? "ANALYSIS" : "LISTWISE",
705 cmd.miss == TTS_INCLUDE ? "INCLUDE" : "EXCLUDE");
706 printf (" FORMAT=%s\n",
707 cmd.fmt == TTS_LABELS ? "LABELS" : "NOLABELS");
708 if (cmd.criteria != NOT_LONG)
709 printf (" CRITERIA=%f\n", cmd.criteria);
712 #endif /* DEBUGGING */
714 /* Here are some general routines tha should probably be moved into
715 a separate library and documented as part of the PSPP "API" */
717 variance (double n, double ss, double sum)
719 return ((ss - ((sum * sum) / n)) / (n - 1.0));
723 pooled_variance (double n_1, double var_1, double n_2, double var_2)
727 tmp = n_1 + n_2 - 2.0;
728 tmp = (((n_1 - 1.0) * var_1) + ((n_2 - 1.0) * var_2)) / tmp;
729 tmp = tmp * ((n_1 + n_2) / (n_1 * n_2));
734 oneway (double *f, double *p, struct value_list *levels)
736 double k, SSTR, SSE, SSTO, N, MSTR, MSE, sum, dftr, dfe, print;
737 struct value_list *g;
741 for (g = levels; g != NULL; g = g->next)
746 SSTR += g->ss - (pow (g->sum, 2) / g->n);
750 SSTO = SSTO - (pow (sum, 2) / N);
759 *p = f_sig (*f, dfe, dftr);
764 printf ("sum1 %f, sum2 %f, ss1 %f, ss2 %f\n",
765 levels->sum, levels->next->sum, levels->ss, levels->next->ss);
766 printf (" - - - - - - O N E W A Y - - - - - -\n\n");
767 printf (" Variable %s %s\n",
768 cmd.v_variables[0]->name, cmd.v_variables[0]->label);
769 printf ("By Variable %s %s\n", groups->name, groups->label);
770 printf ("\n Analysis of Variance\n\n");
771 printf (" Sum of Mean F F\n");
772 printf ("Source D.F. Squares Squares Ratio Prob\n\n");
773 printf ("Between %8.0f %8.4f %8.4f %8.4f %8.4f\n",
774 dfe, SSE, MSE, *f, *p);
775 printf ("Within %8.0f %8.4f %8.4f\n", dftr, SSTR, MSTR);
776 printf ("Total %8.0f %8.4f\n\n\n", N - 1.0, SSTO);
782 f_sig (double f, double dfn, double dfd)
790 cdff (&which, &p, &q, &f, &dfn, &dfd, &status, &bound);
796 printf ("Parameter 1 is out of range\n");
801 printf ("Parameter 2 is out of range\n");
806 printf ("Parameter 3 is out of range\n");
811 printf ("Parameter 4 is out of range\n");
816 printf ("Parameter 5 is out of range\n");
821 printf ("Parameter 6 is out of range\n");
826 printf ("Parameter 7 is out of range\n");
831 printf ("Parameter 8 is out of range\n");
836 /* printf( "Command completed successfully\n" ); */
841 printf ("Answer appears to be lower than the lowest search bound\n");
846 printf ("Answer appears to be higher than the greatest search bound\n");
851 printf ("P - Q NE 1\n");
858 return (double) ERROR_SIG;
867 t_crt (double df, double q)
872 which = FIND_CRITICAL_VALUE;
877 cdft (&which, &p, &q, &t, &df, &status, &bound);
883 printf ("t_crt: Parameter 1 is out of range\n");
888 printf ("t_crt: value of p (%f) is out of range\n", p);
893 printf ("t_crt: value of q (%f) is out of range\n", q);
898 printf ("t_crt: value of df (%f) is out of range\n", df);
903 printf ("t_crt: Parameter 5 is out of range\n");
908 printf ("t_crt: Parameter 6 is out of range\n");
913 printf ("t_crt: Parameter 7 is out of range\n");
918 /* printf( "Command completed successfully\n" ); */
923 printf ("t_crt: Answer appears to be lower than the lowest search bound\n");
928 printf ("t_crt: Answer appears to be higher than the greatest search bound\n");
933 printf ("t_crt: P - Q NE 1\n");
940 return (double) ERROR_SIG;
949 t_sig (double t, double df)
959 cdft (&which, &p, &q, &t, &df, &status, &bound);
965 printf ("t-sig: Parameter 1 is out of range\n");
970 printf ("t-sig: Parameter 2 is out of range\n");
975 printf ("t-sig: Parameter 3 is out of range\n");
980 printf ("t-sig: Parameter 4 is out of range\n");
985 printf ("t-sig: Parameter 5 is out of range\n");
990 printf ("t-sig: Parameter 6 is out of range\n");
995 printf ("t-sig: Parameter 7 is out of range\n");
1000 /* printf( "Command completed successfully\n" ); */
1005 printf ("t-sig: Answer appears to be lower than the lowest search bound\n");
1010 printf ("t-sig: Answer appears to be higher than the greatest search bound\n");
1015 printf ("t-sig: P - Q NE 1\n");
1022 return (double) ERROR_SIG;
1031 covariance (double x_sum, double x_n, double y_sum, double y_n, double ss)
1035 tmp = x_sum * y_sum;
1038 tmp = (tmp / (x_n + y_n - 1.0));
1043 pearson_r (double c_xy, double c_xx, double c_yy)
1045 return (c_xy / (sqrt (c_xx * c_yy)));
1049 print_t_groups (struct variable * grps, union value * g1, union value * g2,
1050 double n1, double n2, double mean1, double mean2,
1051 double sd1, double sd2, double se1, double se2,
1052 double diff, double l_f, double l_p,
1053 double p_t, double p_sig, double p_df, double p_sed,
1054 double p_l, double p_h,
1055 double s_t, double s_sig, double s_df, double s_sed,
1056 double s_l, double s_h)
1059 /* Display all this shit as SPSS 6.0 does (roughly) */
1060 printf ("\n\n Number \n");
1061 printf (" Variable of Cases Mean SD SE of Mean\n");
1062 printf ("-----------------------------------------------------------\n");
1063 printf (" %s %s\n\n", cmd.v_variables[cur_var]->name, cmd.v_variables[cur_var]->label);
1064 printf ("%s %8.4f %8.0f %8.4f %8.3f %8.3f\n",
1065 val_labs_find (grps->val_labs, *g1), g1->f, n1, mean1, sd1, se1);
1066 printf ("%s %8.4f %8.0f %8.4f %8.3f %8.3f\n",
1067 val_labs_find (grps->val_labs, *g2), g2->f, n2, mean2, sd2, se2);
1068 printf ("-----------------------------------------------------------\n");
1069 printf ("\n Mean Difference = %8.4f\n", diff);
1070 printf ("\n Levene's Test for Equality of Variances: F= %.3f P= %.3f\n",
1072 printf ("\n\n t-test for Equality of Means 95pc \n");
1073 printf ("Variances t-value df 2-Tail Sig SE of Diff CI for Diff \n");
1074 printf ("-----------------------------------------------------------------\n");
1075 printf ("Equal %8.2f %8.0f %8.3f %8.3f (%8.3f, %8.3f)\n",
1076 p_t, p_df, p_sig, p_sed, p_l, p_h);
1077 printf ("Unequal %8.2f %8.2f %8.3f %8.3f (%8.3f, %8.3f)\n",
1078 s_t, s_df, s_sig, s_sed, s_l, s_h);
1079 printf ("-----------------------------------------------------------------\n");