1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27 #include "dcdflib/cdflib.h"
38 variables=varlist("PV_NO_SCRATCH | PV_NUMERIC");
40 +missing=miss:!analysis/listwise,
41 incl:include/!exclude;
42 +format=fmt:!labels/nolabels;
43 +criteria=:ci(d:criteria,"%s > 0. && %s < 1.").
50 #include "debug-print.h"
52 /* Command parsing information. */
53 static struct cmd_t_test cmd;
55 /* Variable for the GROUPS subcommand, if given. */
56 static struct variable *groups;
58 /* GROUPS: Number of values specified by the user; the values
60 static int n_groups_values;
61 static union value groups_values[2];
63 /* PAIRED: Number of pairs; each pair. */
65 static struct variable *(*pairs)[2];
67 /* Routines to scan data and perform t-tests */
68 static void precalc (void);
69 static void postcalc (void);
70 static void g_postcalc (void);
71 static void t_pairs (void);
72 static void t_groups (void);
73 static int groups_calc (struct ccase *);
74 static int pairs_calc (struct ccase *);
75 static int z_calc (struct ccase *);
82 struct value_list *next;
85 /* general workhorses - should move these to a separate library... */
86 double variance (double n, double ss, double sum);
88 double covariance (double x_sum, double x_n,
89 double y_sum, double y_n, double ss);
91 double pooled_variance (double n_1, double var_1,
92 double n_2, double var_2);
94 double oneway (double *f, double *p, struct value_list *list);
96 double pearson_r (double c_xy, double c_xx, double c_yy);
98 double f_sig (double f, double dfn, double dfd);
99 double t_crt (double df, double q);
100 double t_sig (double t, double df);
102 /* massive function simply to remove any responsibility for output
103 from the function which does the actual t-test calculations */
104 void print_t_groups (struct variable * grps, union value * g1, union value * g2,
105 double n1, double n2, double mean1, double mean2,
106 double sd1, double sd2, double se1, double se2,
107 double diff, double l_f, double l_p,
108 double p_t, double p_sig, double p_df, double p_sed,
109 double p_l, double p_h,
110 double s_t, double s_sig, double s_df, double s_sed,
111 double s_l, double s_h);
113 /* Global variables to communicate between calc() and postcalc()
114 should move to a structure in the p union of variable... */
115 static double v1_n, v1_ss, v1_sum, v1_se, v1_var, v1_mean;
116 static double v2_n, v2_ss, v2_sum, v2_se, v2_var, v2_mean;
117 static double v1_z_sum, v1_z_ss;
118 static double v2_z_sum, v2_z_ss;
119 static double diff, se_diff, sp, xy_sum, xy_diff, xy_ss;
122 /* some defines for CDFlib */
124 #define FIND_CRITICAL_VALUE 2
128 static void debug_print (void);
131 /* Parses and executes the T-TEST procedure. */
135 struct cmd_t_test cmd;
137 if (!lex_force_match_id ("T"))
140 lex_match_id ("TEST");
142 if (!parse_t_test (&cmd))
150 procedure (precalc, pairs_calc, postcalc);
152 /* probably groups then... */
154 printf ("\n\n t-tests for independent samples of %s %s\n",
155 groups->name, groups->label);
157 for (cur_var = 0; cur_var < cmd.n_variables; cur_var++)
159 v1_n = v1_ss = v1_sum = v1_se = v1_var = v1_mean = 0.0;
160 v2_n = v2_ss = v2_sum = v2_se = v2_var = v2_mean = 0.0;
161 v1_z_sum = v1_z_ss = v2_z_sum = v2_z_ss = 0.0;
162 diff = se_diff = sp = xy_diff = xy_ss = xy_sum = 0.0;
164 procedure (precalc, groups_calc, g_postcalc);
165 procedure (precalc, z_calc, postcalc);
175 return; /* rilly void... */
179 groups_calc (struct ccase * c)
183 struct variable *v = cmd.v_variables[cur_var];
184 double X = c->data[v->fv].f;
186 /* Get the weight for this case. */
187 if (default_dict.weight_index == -1)
191 w = c->data[default_dict.weight_index].f;
192 if (w <= 0.0 || w == SYSMIS)
196 printf ("Bad weight\n");
200 if (X == SYSMIS || X == 0.0) /* FIXME: should be USER_MISSING? */
202 /* printf("Missing value\n"); */
208 group = c->data[groups->fv].f;
210 if (group == groups_values[0].f)
216 else if (group == groups_values[1].f)
230 v1_mean = v1_sum / v1_n;
231 v2_mean = v2_sum / v2_n;
235 int /* this pass generates the z-zcores */
236 z_calc (struct ccase * c)
240 struct variable *v = cmd.v_variables[cur_var];
241 double X = c->data[v->fv].f;
245 /* Get the weight for this case. */
246 if (default_dict.weight_index == -1)
250 w = c->data[default_dict.weight_index].f;
251 if (w <= 0.0 || w == SYSMIS)
258 if (X == SYSMIS || X == 0.0) /* FIXME: how to specify user missing? */
264 group = c->data[groups->fv].f;
267 if (group == groups_values[0].f)
269 z = fabs (X - v1_mean);
271 v1_z_ss += pow (z, 2);
273 else if (group == groups_values[1].f)
275 z = fabs (X - v2_mean);
276 v2_z_ss += pow (z, 2);
286 pairs_calc (struct ccase * c)
289 struct variable *v1, *v2;
292 for (i = 0; i < n_pairs; i++)
297 X = c->data[v1->fv].f;
298 Y = c->data[v2->fv].f;
300 if (X == SYSMIS || Y == SYSMIS)
302 printf ("Missing value\n");
308 xy_ss += pow ((X - Y), 2);
324 /* Calculate basic statistics */
325 v1_var = variance (v1_n, v1_ss, v1_sum); /* variances */
326 v2_var = variance (v2_n, v2_ss, v2_sum);
327 v1_se = sqrt (v1_var / v1_n); /* standard errors */
328 v2_se = sqrt (v2_var / v2_n);
329 diff = v1_mean - v2_mean;
346 double df_pooled, t_pooled, t_sep, p_pooled, p_sep;
347 double crt_t_p, crt_t_s, tmp, v1_z, v2_z, f_levene, p_levene;
348 double df_sep, se_diff_s, se_diff_p;
349 struct value_list *val_1, *val_2;
352 val_1 = malloc (sizeof (struct value_list));
353 val_1->sum = v1_z_sum;
356 val_2 = malloc (sizeof (struct value_list));
357 val_2->sum = v2_z_sum;
364 f_levene = oneway (&f_levene, &p_levene, val_1);
366 /* T test results for pooled variances */
367 se_diff_p = sqrt (pooled_variance (v1_n, v1_var, v2_n, v2_var));
368 df_pooled = v1_n + v2_n - 2.0;
369 t_pooled = diff / se_diff_p;
370 p_pooled = t_sig (t_pooled, df_pooled);
371 crt_t_p = t_crt (df_pooled, 0.025);
373 if ((2.0 * p_pooled) >= 1.0)
374 p_pooled = 1.0 - p_pooled;
376 /* oh god, the separate variance calculations... */
377 t_sep = diff / sqrt ((v1_var / v1_n) + (v2_var / v2_n));
379 tmp = (v1_var / v1_n) + (v2_var / v2_n);
380 tmp = (v1_var / v1_n) / tmp;
382 tmp = tmp / (v1_n - 1.0);
385 tmp = (v1_var / v1_n) + (v2_var / v2_n);
386 tmp = (v2_var / v2_n) / tmp;
388 tmp = tmp / (v2_n - 1.0);
391 tmp = 1.0 / (v1_z + v2_z);
394 p_sep = t_sig (t_sep, df_sep);
395 if ((2.0 * p_sep) >= 1.0)
397 crt_t_s = t_crt (df_sep, 0.025);
398 se_diff_s = sqrt ((v1_var / v1_n) + (v2_var / v2_n));
400 /* FIXME: convert to a proper PSPP output call */
401 print_t_groups (groups, &groups_values[0], &groups_values[1],
402 v1_n, v2_n, v1_mean, v2_mean,
403 sqrt (v1_var), sqrt (v2_var), v1_se, v2_se,
404 diff, f_levene, p_levene,
405 t_pooled, 2.0 * p_pooled, df_pooled, se_diff_p,
406 diff - (crt_t_p * se_diff_p), diff + (crt_t_p * se_diff_p),
407 t_sep, 2.0 * p_sep, df_sep, se_diff_s,
408 diff - (crt_t_s * se_diff_s), diff + (crt_t_s * se_diff_s));
415 double cov12, cov11, cov22, r, t, p, crt_t, sp, r_t, r_p;
416 struct variable *v1, *v2;
420 cov12 = covariance (v1_sum, v1_n, v2_sum, v2_n, xy_sum);
421 cov11 = covariance (v1_sum, v1_n, v1_sum, v1_n, v1_ss);
422 cov22 = covariance (v2_sum, v2_n, v2_sum, v2_n, v2_ss);
423 r = pearson_r (cov12, cov11, cov22);
424 /* this t and it's associated p is a significance test for the pearson's r */
425 r_t = r * sqrt ((v1_n - 2.0) / (1.0 - (r * r)));
426 r_p = t_sig (r_t, v1_n - 2.0);
428 /* now we move to the t test for the difference in means */
429 diff = xy_diff / v1_n;
430 sp = sqrt (variance (v1_n, xy_ss, xy_diff));
431 se_diff = sp / sqrt (v1_n);
433 crt_t = t_crt (v1_n - 1.0, 0.025);
434 p = t_sig (t, v1_n - 1.0);
437 printf (" Number of 2-tail\n");
438 printf (" Variable pairs Corr Sig Mean SD SE of Mean\n");
439 printf ("---------------------------------------------------------------\n");
440 printf ("%s %8.4f %8.4f %8.4f\n",
441 v1->name, v1_mean, sqrt (v1_var), v1_se);
442 printf (" %8.4f %0.4f %0.4f\n", v1_n, r, r_p);
443 printf ("%s %8.4f %8.4f %8.4f\n",
444 v2->name, v2_mean, sqrt (v2_var), v2_se);
445 printf ("---------------------------------------------------------------\n");
448 printf (" Paired Differences |\n");
449 printf (" Mean SD SE of Mean | t-value df 2-tail Sig\n");
450 printf ("--------------------------------------|---------------------------\n");
452 printf ("%8.4f %8.4f %8.4f | %8.4f %8.4f %8.4f\n",
453 diff, sp, se_diff, t, v1_n - 1.0, 2.0 * (1.0 - p));
455 printf ("95pc CI (%8.4f, %8.4f) |\n\n",
456 diff - (se_diff * crt_t), diff + (se_diff * crt_t));
461 static int parse_value (union value *);
463 /* Parses the GROUPS subcommand. */
465 tts_custom_groups (struct cmd_t_test *cmd unused)
467 groups = parse_variable ();
470 lex_error (_("expecting variable name in GROUPS subcommand"));
473 if (groups->type == T_STRING && groups->width > MAX_SHORT_STRING)
475 msg (SE, _("Long string variable %s is not valid here."),
480 if (!lex_match ('('))
482 if (groups->type == NUMERIC)
485 groups_values[0].f = 1;
486 groups_values[1].f = 2;
491 msg (SE, _("When applying GROUPS to a string variable, at "
492 "least one value must be specified."));
497 if (!parse_value (&groups_values[0]))
506 if (!parse_value (&groups_values[1]))
510 if (!lex_force_match (')'))
516 /* Parses the current token (numeric or string, depending on the
517 variable in `groups') into value V and returns success. */
519 parse_value (union value * v)
521 if (groups->type == NUMERIC)
523 if (!lex_force_num ())
529 if (!lex_force_string ())
531 strncpy (v->s, ds_value (&tokstr), ds_length (&tokstr));
539 /* Parses the PAIRS subcommand. */
541 tts_custom_pairs (struct cmd_t_test *cmd unused)
543 struct variable **vars;
552 if ((token != T_ID || !is_varname (tokid)) && token != T_ALL)
554 if (!parse_variables (&default_dict, &vars, &n_vars,
555 PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH))
559 if (lex_match (T_WITH))
561 n_before_WITH = n_vars;
563 if (!parse_variables (&default_dict, &vars, &n_vars,
564 PV_DUPLICATE | PV_APPEND
565 | PV_NUMERIC | PV_NO_SCRATCH))
574 paired = (lex_match ('(') && lex_match_id ("PAIRED") && lex_match (')'));
578 if (n_before_WITH * 2 != n_vars)
581 msg (SE, _("PAIRED was specified but the number of variables "
582 "preceding WITH (%d) did not match the number "
584 n_before_WITH, n_vars - n_before_WITH);
588 extra = n_before_WITH;
590 else if (n_before_WITH)
591 extra = n_before_WITH * (n_vars - n_before_WITH);
597 msg (SE, _("At least two variables must be specified "
602 extra = n_vars * (n_vars - 1) / 2;
606 n_predicted = n_pairs + extra;
609 pairs = xrealloc (pairs, sizeof (struct variable *[2]) * (n_pairs + extra));
615 for (i = 0; i < extra; i++)
617 pairs[n_pairs][0] = vars[i];
618 pairs[n_pairs++][1] = vars[i + extra];
621 else if (n_before_WITH)
625 for (i = 0; i < n_before_WITH; i++)
629 for (j = n_before_WITH; j < n_vars; j++)
631 pairs[n_pairs][0] = vars[i];
632 pairs[n_pairs++][1] = vars[j];
640 for (i = 0; i < n_vars; i++)
644 for (j = i + 1; j < n_vars; j++)
646 pairs[n_pairs][0] = vars[i];
647 pairs[n_pairs++][1] = vars[j];
653 assert (n_pairs == n_predicted);
667 printf (" GROUPS=%s", groups->name);
673 for (i = 0; i < n_groups_values; i++)
674 if (groups->type == NUMERIC)
675 printf ("%g%s", groups_values[i].f, i ? " " : "");
677 printf ("%.*s%s", groups->width, groups_values[i].s,
687 printf (" VARIABLES=");
688 for (i = 0; i < cmd.n_variables; i++)
689 printf ("%s ", cmd.v_variables[i]->name);
697 for (i = 0; i < n_pairs; i++)
698 printf ("%s ", pairs[i][0]->name);
700 for (i = 0; i < n_pairs; i++)
701 printf (" %s", pairs[i][1]->name);
702 printf (" (PAIRED)\n");
704 printf (" MISSING=%s %s\n",
705 cmd.miss == TTS_ANALYSIS ? "ANALYSIS" : "LISTWISE",
706 cmd.miss == TTS_INCLUDE ? "INCLUDE" : "EXCLUDE");
707 printf (" FORMAT=%s\n",
708 cmd.fmt == TTS_LABELS ? "LABELS" : "NOLABELS");
709 if (cmd.criteria != NOT_LONG)
710 printf (" CRITERIA=%f\n", cmd.criteria);
713 #endif /* DEBUGGING */
715 /* Here are some general routines tha should probably be moved into
716 a separate library and documented as part of the PSPP "API" */
718 variance (double n, double ss, double sum)
720 return ((ss - ((sum * sum) / n)) / (n - 1.0));
724 pooled_variance (double n_1, double var_1, double n_2, double var_2)
728 tmp = n_1 + n_2 - 2.0;
729 tmp = (((n_1 - 1.0) * var_1) + ((n_2 - 1.0) * var_2)) / tmp;
730 tmp = tmp * ((n_1 + n_2) / (n_1 * n_2));
735 oneway (double *f, double *p, struct value_list *levels)
737 double k, SSTR, SSE, SSTO, N, MSTR, MSE, sum, dftr, dfe, print;
738 struct value_list *g;
742 for (g = levels; g != NULL; g = g->next)
747 SSTR += g->ss - (pow (g->sum, 2) / g->n);
751 SSTO = SSTO - (pow (sum, 2) / N);
760 *p = f_sig (*f, dfe, dftr);
765 printf ("sum1 %f, sum2 %f, ss1 %f, ss2 %f\n",
766 levels->sum, levels->next->sum, levels->ss, levels->next->ss);
767 printf (" - - - - - - O N E W A Y - - - - - -\n\n");
768 printf (" Variable %s %s\n",
769 cmd.v_variables[0]->name, cmd.v_variables[0]->label);
770 printf ("By Variable %s %s\n", groups->name, groups->label);
771 printf ("\n Analysis of Variance\n\n");
772 printf (" Sum of Mean F F\n");
773 printf ("Source D.F. Squares Squares Ratio Prob\n\n");
774 printf ("Between %8.0f %8.4f %8.4f %8.4f %8.4f\n",
775 dfe, SSE, MSE, *f, *p);
776 printf ("Within %8.0f %8.4f %8.4f\n", dftr, SSTR, MSTR);
777 printf ("Total %8.0f %8.4f\n\n\n", N - 1.0, SSTO);
783 f_sig (double f, double dfn, double dfd)
791 cdff (&which, &p, &q, &f, &dfn, &dfd, &status, &bound);
797 printf ("Parameter 1 is out of range\n");
802 printf ("Parameter 2 is out of range\n");
807 printf ("Parameter 3 is out of range\n");
812 printf ("Parameter 4 is out of range\n");
817 printf ("Parameter 5 is out of range\n");
822 printf ("Parameter 6 is out of range\n");
827 printf ("Parameter 7 is out of range\n");
832 printf ("Parameter 8 is out of range\n");
837 /* printf( "Command completed successfully\n" ); */
842 printf ("Answer appears to be lower than the lowest search bound\n");
847 printf ("Answer appears to be higher than the greatest search bound\n");
852 printf ("P - Q NE 1\n");
859 return (double) ERROR_SIG;
868 t_crt (double df, double q)
873 which = FIND_CRITICAL_VALUE;
878 cdft (&which, &p, &q, &t, &df, &status, &bound);
884 printf ("t_crt: Parameter 1 is out of range\n");
889 printf ("t_crt: value of p (%f) is out of range\n", p);
894 printf ("t_crt: value of q (%f) is out of range\n", q);
899 printf ("t_crt: value of df (%f) is out of range\n", df);
904 printf ("t_crt: Parameter 5 is out of range\n");
909 printf ("t_crt: Parameter 6 is out of range\n");
914 printf ("t_crt: Parameter 7 is out of range\n");
919 /* printf( "Command completed successfully\n" ); */
924 printf ("t_crt: Answer appears to be lower than the lowest search bound\n");
929 printf ("t_crt: Answer appears to be higher than the greatest search bound\n");
934 printf ("t_crt: P - Q NE 1\n");
941 return (double) ERROR_SIG;
950 t_sig (double t, double df)
960 cdft (&which, &p, &q, &t, &df, &status, &bound);
966 printf ("t-sig: Parameter 1 is out of range\n");
971 printf ("t-sig: Parameter 2 is out of range\n");
976 printf ("t-sig: Parameter 3 is out of range\n");
981 printf ("t-sig: Parameter 4 is out of range\n");
986 printf ("t-sig: Parameter 5 is out of range\n");
991 printf ("t-sig: Parameter 6 is out of range\n");
996 printf ("t-sig: Parameter 7 is out of range\n");
1001 /* printf( "Command completed successfully\n" ); */
1006 printf ("t-sig: Answer appears to be lower than the lowest search bound\n");
1011 printf ("t-sig: Answer appears to be higher than the greatest search bound\n");
1016 printf ("t-sig: P - Q NE 1\n");
1023 return (double) ERROR_SIG;
1032 covariance (double x_sum, double x_n, double y_sum, double y_n, double ss)
1036 tmp = x_sum * y_sum;
1039 tmp = (tmp / (x_n + y_n - 1.0));
1044 pearson_r (double c_xy, double c_xx, double c_yy)
1046 return (c_xy / (sqrt (c_xx * c_yy)));
1050 print_t_groups (struct variable * grps, union value * g1, union value * g2,
1051 double n1, double n2, double mean1, double mean2,
1052 double sd1, double sd2, double se1, double se2,
1053 double diff, double l_f, double l_p,
1054 double p_t, double p_sig, double p_df, double p_sed,
1055 double p_l, double p_h,
1056 double s_t, double s_sig, double s_df, double s_sed,
1057 double s_l, double s_h)
1060 /* Display all this shit as SPSS 6.0 does (roughly) */
1061 printf ("\n\n Number \n");
1062 printf (" Variable of Cases Mean SD SE of Mean\n");
1063 printf ("-----------------------------------------------------------\n");
1064 printf (" %s %s\n\n", cmd.v_variables[cur_var]->name, cmd.v_variables[cur_var]->label);
1065 printf ("%s %8.4f %8.0f %8.4f %8.3f %8.3f\n",
1066 get_val_lab (grps, *g1, 0), g1->f, n1, mean1, sd1, se1);
1067 printf ("%s %8.4f %8.0f %8.4f %8.3f %8.3f\n",
1068 get_val_lab (grps, *g2, 0), g2->f, n2, mean2, sd2, se2);
1069 printf ("-----------------------------------------------------------\n");
1070 printf ("\n Mean Difference = %8.4f\n", diff);
1071 printf ("\n Levene's Test for Equality of Variances: F= %.3f P= %.3f\n",
1073 printf ("\n\n t-test for Equality of Means 95pc \n");
1074 printf ("Variances t-value df 2-Tail Sig SE of Diff CI for Diff \n");
1075 printf ("-----------------------------------------------------------------\n");
1076 printf ("Equal %8.2f %8.0f %8.3f %8.3f (%8.3f, %8.3f)\n",
1077 p_t, p_df, p_sig, p_sed, p_l, p_h);
1078 printf ("Unequal %8.2f %8.2f %8.3f %8.3f (%8.3f, %8.3f)\n",
1079 s_t, s_df, s_sig, s_sed, s_l, s_h);
1080 printf ("-----------------------------------------------------------------\n");