From 71eea20b080f51f1aa00ef35acf4f49ce742d10a Mon Sep 17 00:00:00 2001 From: John Darrington Date: Fri, 1 Jul 2011 16:47:33 +0200 Subject: [PATCH] Re-implemented the T-TEST command and the levene calculation. Also updated the ONEWAY command to use the new levene test. The changes to t-test.at in this changeset affect only whitespace or the order of reported significance. The numbers remain unchanged. --- src/language/stats/automake.mk | 8 +- src/language/stats/oneway.c | 49 +- src/language/stats/t-test-indep.c | 390 ++++++ src/language/stats/t-test-one-sample.c | 253 ++++ src/language/stats/t-test-paired.c | 356 ++++++ src/language/stats/t-test-parser.c | 366 ++++++ src/language/stats/t-test.h | 61 + src/language/stats/t-test.q | 1560 ------------------------ src/math/levene.c | 292 +++-- src/math/levene.h | 17 +- tests/language/stats/t-test.at | 36 +- 11 files changed, 1679 insertions(+), 1709 deletions(-) create mode 100644 src/language/stats/t-test-indep.c create mode 100644 src/language/stats/t-test-one-sample.c create mode 100644 src/language/stats/t-test-paired.c create mode 100644 src/language/stats/t-test-parser.c create mode 100644 src/language/stats/t-test.h delete mode 100644 src/language/stats/t-test.q diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 4d4b662c..f7fd6373 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -7,8 +7,7 @@ src_language_stats_built_sources = \ src/language/stats/examine.c \ src/language/stats/frequencies.c \ src/language/stats/rank.c \ - src/language/stats/regression.c \ - src/language/stats/t-test.c + src/language/stats/regression.c language_stats_sources = \ src/language/stats/aggregate.c \ @@ -49,6 +48,11 @@ language_stats_sources = \ src/language/stats/sort-cases.c \ src/language/stats/sort-criteria.c \ src/language/stats/sort-criteria.h \ + src/language/stats/t-test.h \ + src/language/stats/t-test-indep.c \ + src/language/stats/t-test-one-sample.c \ + src/language/stats/t-test-paired.c \ + src/language/stats/t-test-parser.c \ src/language/stats/wilcoxon.c \ src/language/stats/wilcoxon.h diff --git a/src/language/stats/oneway.c b/src/language/stats/oneway.c index 9d4fdb8a..8e2805b3 100644 --- a/src/language/stats/oneway.c +++ b/src/language/stats/oneway.c @@ -91,7 +91,6 @@ struct oneway_spec /* The weight variable */ const struct variable *wv; - }; /* Per category data */ @@ -109,6 +108,7 @@ struct per_var_ws { struct categoricals *cat; struct covariance *cov; + struct levene *nl; double sst; double sse; @@ -117,7 +117,6 @@ struct per_var_ws int n_groups; double mse; - double levene_w; }; struct oneway_workspace @@ -393,6 +392,7 @@ run_oneway (const struct oneway_spec *cmd, ws.vws[v].cov = covariance_2pass_create (1, &cmd->vars[v], ws.vws[v].cat, cmd->wv, cmd->exclude); + ws.vws[v].nl = levene_create (var_get_width (cmd->indep_var), NULL); } c = casereader_peek (input, 0); @@ -413,20 +413,11 @@ run_oneway (const struct oneway_spec *cmd, cmd->exclude, NULL, NULL); input = casereader_create_filter_weight (input, dict, NULL, NULL); - - if (cmd->stats & STATS_HOMOGENEITY) - for (v = 0; v < cmd->n_vars; ++v) - { - struct per_var_ws *pvw = &ws.vws[v]; - - pvw->levene_w = levene (input, cmd->indep_var, cmd->vars[v], cmd->wv, cmd->exclude); - } - reader = casereader_clone (input); - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) { int i; + double w = dict_get_case_weight (dict, c, NULL); for (i = 0; i < cmd->n_vars; ++i) { @@ -441,13 +432,16 @@ run_oneway (const struct oneway_spec *cmd, } covariance_accumulate_pass1 (pvw->cov, c); + levene_pass_one (pvw->nl, val->f, w, case_data (c, cmd->indep_var)); } } casereader_destroy (reader); + reader = casereader_clone (input); for ( ; (c = casereader_read (reader) ); case_unref (c)) { int i; + double w = dict_get_case_weight (dict, c, NULL); for (i = 0; i < cmd->n_vars; ++i) { struct per_var_ws *pvw = &ws.vws[i]; @@ -461,10 +455,35 @@ run_oneway (const struct oneway_spec *cmd, } covariance_accumulate_pass2 (pvw->cov, c); + levene_pass_two (pvw->nl, val->f, w, case_data (c, cmd->indep_var)); } } casereader_destroy (reader); + reader = casereader_clone (input); + for ( ; (c = casereader_read (reader) ); case_unref (c)) + { + int i; + double w = dict_get_case_weight (dict, c, NULL); + + for (i = 0; i < cmd->n_vars; ++i) + { + struct per_var_ws *pvw = &ws.vws[i]; + const struct variable *v = cmd->vars[i]; + const union value *val = case_data (c, v); + + if ( MISS_ANALYSIS == cmd->missing_type) + { + if ( var_is_value_missing (v, val, cmd->exclude)) + continue; + } + + levene_pass_three (pvw->nl, val->f, w, case_data (c, cmd->indep_var)); + } + } + casereader_destroy (reader); + + for (v = 0; v < cmd->n_vars; ++v) { struct per_var_ws *pvw = &ws.vws[v]; @@ -476,8 +495,6 @@ run_oneway (const struct oneway_spec *cmd, pvw->sst = gsl_matrix_get (cm, 0, 0); - // gsl_matrix_fprintf (stdout, cm, "%g "); - reg_sweep (cm, 0); pvw->sse = gsl_matrix_get (cm, 0, 0); @@ -512,11 +529,11 @@ run_oneway (const struct oneway_spec *cmd, for (v = 0; v < cmd->n_vars; ++v) { covariance_destroy (ws.vws[v].cov); + levene_destroy (ws.vws[v].nl); dd_destroy (ws.dd_total[v]); } free (ws.vws); free (ws.dd_total); - } static void show_contrast_coeffs (const struct oneway_spec *cmd, const struct oneway_workspace *ws); @@ -846,7 +863,7 @@ show_homogeneity (const struct oneway_spec *cmd, const struct oneway_workspace * { double n; const struct per_var_ws *pvw = &ws->vws[v]; - double F = pvw->levene_w; + double F = levene_calculate (pvw->nl); const struct variable *var = cmd->vars[v]; const char *s = var_to_string (var); diff --git a/src/language/stats/t-test-indep.c b/src/language/stats/t-test-indep.c new file mode 100644 index 00000000..876b9857 --- /dev/null +++ b/src/language/stats/t-test-indep.c @@ -0,0 +1,390 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "t-test.h" + +#include +#include +#include "libpspp/misc.h" + +#include "libpspp/str.h" +#include "data/casereader.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/variable.h" + +#include "math/moments.h" +#include "math/levene.h" + +#include +#include "gettext.h" +#define _(msgid) gettext (msgid) + + +struct indep_samples +{ + const struct variable *gvar; + bool cut; + const union value *gval0; + const union value *gval1; +}; + +struct pair_stats +{ + struct moments *mom[2]; + double lev ; + struct levene *nl; +}; + + +static void indep_summary (const struct tt *tt, struct indep_samples *is, const struct pair_stats *ps); +static void indep_test (const struct tt *tt, const struct pair_stats *ps); + +static int +which_group (const union value *v, const struct indep_samples *is) +{ + int width = var_get_width (is->gvar); + int cmp = value_compare_3way (v, is->gval0, width); + if ( is->cut ) + return (cmp < 0); + + if (cmp == 0) + return 0; + + if (0 == value_compare_3way (v, is->gval1, width)) + return 1; + + return -1; +} + +void +indep_run (struct tt *tt, const struct variable *gvar, + bool cut, + const union value *gval0, const union value *gval1, + struct casereader *reader) +{ + struct indep_samples is; + struct ccase *c; + struct casereader *r; + + struct pair_stats *ps = xcalloc (sizeof (*ps), tt->n_vars); + + int v; + + for (v = 0; v < tt->n_vars; ++v) + { + ps[v].mom[0] = moments_create (MOMENT_VARIANCE); + ps[v].mom[1] = moments_create (MOMENT_VARIANCE); + ps[v].nl = levene_create (var_get_width (gvar), cut ? gval0: NULL); + } + + is.gvar = gvar; + is.gval0 = gval0; + is.gval1 = gval1; + is.cut = cut; + + r = casereader_clone (reader); + for ( ; (c = casereader_read (r) ); case_unref (c)) + { + double w = dict_get_case_weight (tt->dict, c, NULL); + + const union value *gv = case_data (c, gvar); + + int grp = which_group (gv, &is); + if ( grp < 0) + continue; + + for (v = 0; v < tt->n_vars; ++v) + { + const union value *val = case_data (c, tt->vars[v]); + if (var_is_value_missing (tt->vars[v], val, tt->exclude)) + continue; + + moments_pass_one (ps[v].mom[grp], val->f, w); + levene_pass_one (ps[v].nl, val->f, w, gv); + } + } + casereader_destroy (r); + + r = casereader_clone (reader); + for ( ; (c = casereader_read (r) ); case_unref (c)) + { + double w = dict_get_case_weight (tt->dict, c, NULL); + + const union value *gv = case_data (c, gvar); + + int grp = which_group (gv, &is); + if ( grp < 0) + continue; + + for (v = 0; v < tt->n_vars; ++v) + { + const union value *val = case_data (c, tt->vars[v]); + if (var_is_value_missing (tt->vars[v], val, tt->exclude)) + continue; + + moments_pass_two (ps[v].mom[grp], val->f, w); + levene_pass_two (ps[v].nl, val->f, w, gv); + } + } + casereader_destroy (r); + + r = reader; + for ( ; (c = casereader_read (r) ); case_unref (c)) + { + double w = dict_get_case_weight (tt->dict, c, NULL); + + const union value *gv = case_data (c, gvar); + + int grp = which_group (gv, &is); + if ( grp < 0) + continue; + + for (v = 0; v < tt->n_vars; ++v) + { + const union value *val = case_data (c, tt->vars[v]); + if (var_is_value_missing (tt->vars[v], val, tt->exclude)) + continue; + + levene_pass_three (ps[v].nl, val->f, w, gv); + } + } + casereader_destroy (r); + + + for (v = 0; v < tt->n_vars; ++v) + ps[v].lev = levene_calculate (ps[v].nl); + + indep_summary (tt, &is, ps); + indep_test (tt, ps); + + + for (v = 0; v < tt->n_vars; ++v) + { + moments_destroy (ps[v].mom[0]); + moments_destroy (ps[v].mom[1]); + levene_destroy (ps[v].nl); + } + free (ps); +} + + +static void +indep_summary (const struct tt *tt, struct indep_samples *is, const struct pair_stats *ps) +{ + const struct fmt_spec *wfmt = tt->wv ? var_get_print_format (tt->wv) : & F_8_0; + + int v; + int cols = 6; + const int heading_rows = 1; + int rows = tt->n_vars * 2 + heading_rows; + + struct string vallab0 ; + struct string vallab1 ; + struct tab_table *t = tab_create (cols, rows); + + ds_init_empty (&vallab0); + ds_init_empty (&vallab1); + + tab_headers (t, 0, 0, 1, 0); + tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1); + tab_hline (t, TAL_2, 0, cols - 1, 1); + + tab_vline (t, TAL_GAP, 1, 0, rows - 1); + tab_title (t, _("Group Statistics")); + tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, var_to_string (is->gvar)); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("N")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Mean")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); + tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean")); + + if (is->cut) + { + /* ds_put_cstr (&vallab0, "≥"); */ + ds_put_cstr (&vallab0, ">="); + ds_put_cstr (&vallab1, "<"); + + var_append_value_name (is->gvar, is->gval0, &vallab0); + var_append_value_name (is->gvar, is->gval0, &vallab1); + } + else + { + var_append_value_name (is->gvar, is->gval0, &vallab0); + var_append_value_name (is->gvar, is->gval1, &vallab1); + } + + for (v = 0; v < tt->n_vars; ++v) + { + int i; + const struct variable *var = tt->vars[v]; + + tab_text (t, 0, v * 2 + heading_rows, TAB_LEFT, + var_to_string (var)); + + tab_text (t, 1, v * 2 + heading_rows, TAB_LEFT, + ds_cstr (&vallab0)); + + tab_text (t, 1, v * 2 + 1 + heading_rows, TAB_LEFT, + ds_cstr (&vallab1)); + + for (i = 0 ; i < 2; ++i) + { + double cc, mean, sigma; + moments_calculate (ps[v].mom[i], &cc, &mean, &sigma, NULL, NULL); + + tab_double (t, 2, v * 2 + i + heading_rows, TAB_RIGHT, cc, wfmt); + tab_double (t, 3, v * 2 + i + heading_rows, TAB_RIGHT, mean, NULL); + tab_double (t, 4, v * 2 + i + heading_rows, TAB_RIGHT, sqrt (sigma), NULL); + tab_double (t, 5, v * 2 + i + heading_rows, TAB_RIGHT, sqrt (sigma / cc), NULL); + } + } + + tab_submit (t); + + ds_destroy (&vallab0); + ds_destroy (&vallab1); +} + + +static void +indep_test (const struct tt *tt, const struct pair_stats *ps) +{ + int v; + const int heading_rows = 3; + const int rows= tt->n_vars * 2 + heading_rows; + + const size_t cols = 11; + + struct tab_table *t = tab_create (cols, rows); + tab_headers (t, 0, 0, 3, 0); + tab_box (t, TAL_2, TAL_2, TAL_0, TAL_0, 0, 0, cols - 1, rows - 1); + tab_hline (t, TAL_2, 0, cols - 1, 3); + + tab_title (t, _("Independent Samples Test")); + + tab_hline (t, TAL_1, 2, cols - 1, 1); + tab_vline (t, TAL_2, 2, 0, rows - 1); + tab_vline (t, TAL_1, 4, 0, rows - 1); + tab_box (t, -1, -1, -1, TAL_1, 2, 1, cols - 2, rows - 1); + tab_hline (t, TAL_1, cols - 2, cols - 1, 2); + tab_box (t, -1, -1, -1, TAL_1, cols - 2, 2, cols - 1, rows - 1); + tab_joint_text (t, 2, 0, 3, 0, TAB_CENTER, _("Levene's Test for Equality of Variances")); + tab_joint_text (t, 4, 0, cols - 1, 0, TAB_CENTER, _("t-test for Equality of Means")); + + tab_text (t, 2, 2, TAB_CENTER | TAT_TITLE, _("F")); + tab_text (t, 3, 2, TAB_CENTER | TAT_TITLE, _("Sig.")); + tab_text (t, 4, 2, TAB_CENTER | TAT_TITLE, _("t")); + tab_text (t, 5, 2, TAB_CENTER | TAT_TITLE, _("df")); + tab_text (t, 6, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)")); + tab_text (t, 7, 2, TAB_CENTER | TAT_TITLE, _("Mean Difference")); + tab_text (t, 8, 2, TAB_CENTER | TAT_TITLE, _("Std. Error Difference")); + tab_text (t, 9, 2, TAB_CENTER | TAT_TITLE, _("Lower")); + tab_text (t, 10, 2, TAB_CENTER | TAT_TITLE, _("Upper")); + + tab_joint_text_format (t, 9, 1, 10, 1, TAB_CENTER, + _("%g%% Confidence Interval of the Difference"), + tt->confidence * 100.0); + + for (v = 0; v < tt->n_vars; ++v) + { + double df, pooled_variance, mean_diff, tval; + double se2, std_err_diff; + double p, q; + double cc0, mean0, sigma0; + double cc1, mean1, sigma1; + moments_calculate (ps[v].mom[0], &cc0, &mean0, &sigma0, NULL, NULL); + moments_calculate (ps[v].mom[1], &cc1, &mean1, &sigma1, NULL, NULL); + + tab_text (t, 0, v * 2 + heading_rows, TAB_LEFT, var_to_string (tt->vars[v])); + tab_text (t, 1, v * 2 + heading_rows, TAB_LEFT, _("Equal variances assumed")); + + df = cc0 + cc1 - 2.0; + tab_double (t, 5, v * 2 + heading_rows, TAB_RIGHT, df, NULL); + + pooled_variance = ((cc0 - 1)* sigma0 + (cc1 - 1) * sigma1) / df ; + + tval = (mean0 - mean1) / sqrt (pooled_variance); + tval /= sqrt ((cc0 + cc1) / (cc0 * cc1)); + + tab_double (t, 4, v * 2 + heading_rows, TAB_RIGHT, tval, NULL); + + p = gsl_cdf_tdist_P (tval, df); + q = gsl_cdf_tdist_Q (tval, df); + + mean_diff = mean0 - mean1; + + tab_double (t, 6, v * 2 + heading_rows, TAB_RIGHT, 2.0 * (tval > 0 ? q : p), NULL); + tab_double (t, 7, v * 2 + heading_rows, TAB_RIGHT, mean_diff, NULL); + + std_err_diff = sqrt ((sigma0 / cc0) + (sigma1 / cc1)); + tab_double (t, 8, v * 2 + heading_rows, TAB_RIGHT, std_err_diff, NULL); + + + /* Now work out the confidence interval */ + q = (1 - tt->confidence)/2.0; /* 2-tailed test */ + + tval = gsl_cdf_tdist_Qinv (q, df); + tab_double (t, 9, v * 2 + heading_rows, TAB_RIGHT, mean_diff - tval * std_err_diff, NULL); + tab_double (t, 10, v * 2 + heading_rows, TAB_RIGHT, mean_diff + tval * std_err_diff, NULL); + + /* Equal variances not assumed */ + tab_text (t, 1, v * 2 + heading_rows + 1, TAB_LEFT, _("Equal variances not assumed")); + + se2 = sigma0 / cc0 + sigma1 / cc1; + tval = mean_diff / sqrt (se2); + tab_double (t, 4, v * 2 + heading_rows + 1, TAB_RIGHT, tval, NULL); + + { + double p, q; + const double s0 = sigma0 / (cc0); + const double s1 = sigma1 / (cc1); + double df = pow2 (s0 + s1) ; + df /= pow2 (s0) / (cc0 - 1) + pow2 (s1) / (cc1 - 1); + + tab_double (t, 5, v * 2 + heading_rows + 1, TAB_RIGHT, df, NULL); + + p = gsl_cdf_tdist_P (tval, df); + q = gsl_cdf_tdist_Q (tval, df); + + tab_double (t, 6, v * 2 + heading_rows + 1, TAB_RIGHT, 2.0 * (tval > 0 ? q : p), NULL); + + /* Now work out the confidence interval */ + q = (1 - tt->confidence) / 2.0; /* 2-tailed test */ + + tval = gsl_cdf_tdist_Qinv (q, df); + } + + tab_double (t, 7, v * 2 + heading_rows + 1, TAB_RIGHT, mean_diff, NULL); + tab_double (t, 8, v * 2 + heading_rows + 1, TAB_RIGHT, std_err_diff, NULL); + tab_double (t, 9, v * 2 + heading_rows + 1, TAB_RIGHT, mean_diff - tval * std_err_diff, NULL); + tab_double (t, 10, v * 2 + heading_rows + 1, TAB_RIGHT, mean_diff + tval * std_err_diff, NULL); + + tab_double (t, 2, v * 2 + heading_rows, TAB_CENTER, ps[v].lev, NULL); + + + { + /* Now work out the significance of the Levene test */ + double df1 = 1; + double df2 = cc0 + cc1 - 2; + double q = gsl_cdf_fdist_Q (ps[v].lev, df1, df2); + tab_double (t, 3, v * 2 + heading_rows, TAB_CENTER, q, NULL); + } + } + + tab_submit (t); +} diff --git a/src/language/stats/t-test-one-sample.c b/src/language/stats/t-test-one-sample.c new file mode 100644 index 00000000..38c1eff6 --- /dev/null +++ b/src/language/stats/t-test-one-sample.c @@ -0,0 +1,253 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + + +#include "t-test.h" + +#include +#include + +#include "data/variable.h" +#include "data/format.h" +#include "data/casereader.h" +#include "data/dictionary.h" +#include "libpspp/hash-functions.h" +#include "libpspp/hmapx.h" +#include "math/moments.h" + +#include "output/tab.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) + + +struct per_var_stats +{ + const struct variable *var; + + /* The position for reporting purposes */ + int posn; + + /* N, Mean, Variance */ + struct moments *mom; + + /* Sum of the differences */ + double sum_diff; +}; + + +struct one_samp +{ + struct hmapx hmap; + double testval; +}; + + +static void +one_sample_test (const struct tt *tt, const struct one_samp *os) +{ + struct hmapx_node *node; + struct per_var_stats *per_var_stats; + + const int heading_rows = 3; + const size_t rows = heading_rows + tt->n_vars; + const size_t cols = 7; + const struct fmt_spec *wfmt = tt->wv ? var_get_print_format (tt->wv) : & F_8_0; + + struct tab_table *t = tab_create (cols, rows); + + tab_headers (t, 0, 0, heading_rows, 0); + tab_box (t, TAL_2, TAL_2, TAL_0, TAL_0, 0, 0, cols - 1, rows - 1); + tab_hline (t, TAL_2, 0, cols - 1, 3); + + tab_title (t, _("One-Sample Test")); + tab_hline (t, TAL_1, 1, cols - 1, 1); + tab_vline (t, TAL_2, 1, 0, rows - 1); + + tab_joint_text_format (t, 1, 0, cols - 1, 0, TAB_CENTER, + _("Test Value = %f"), os->testval); + + tab_box (t, -1, -1, -1, TAL_1, 1, 1, cols - 1, rows - 1); + + tab_joint_text_format (t, 5, 1, 6, 1, TAB_CENTER, + _("%g%% Confidence Interval of the Difference"), + tt->confidence * 100.0); + + tab_vline (t, TAL_GAP, 6, 1, 1); + tab_hline (t, TAL_1, 5, 6, 2); + tab_text (t, 1, 2, TAB_CENTER | TAT_TITLE, _("t")); + tab_text (t, 2, 2, TAB_CENTER | TAT_TITLE, _("df")); + tab_text (t, 3, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)")); + tab_text (t, 4, 2, TAB_CENTER | TAT_TITLE, _("Mean Difference")); + tab_text (t, 5, 2, TAB_CENTER | TAT_TITLE, _("Lower")); + tab_text (t, 6, 2, TAB_CENTER | TAT_TITLE, _("Upper")); + + HMAPX_FOR_EACH (per_var_stats, node, &os->hmap) + { + const struct moments *m = per_var_stats->mom; + double cc, mean, sigma; + double tval, df; + double p, q; + double mean_diff; + double se_mean ; + const int v = per_var_stats->posn; + + moments_calculate (m, &cc, &mean, &sigma, NULL, NULL); + + tval = (mean - os->testval) * sqrt (cc / sigma); + + mean_diff = per_var_stats->sum_diff / cc; + se_mean = sqrt (sigma / cc); + df = cc - 1.0; + p = gsl_cdf_tdist_P (tval, df); + q = gsl_cdf_tdist_Q (tval, df); + + tab_text (t, 0, v + heading_rows, TAB_LEFT, var_to_string (per_var_stats->var)); + tab_double (t, 1, v + heading_rows, TAB_RIGHT, tval, NULL); + tab_double (t, 2, v + heading_rows, TAB_RIGHT, df, wfmt); + + /* Multiply by 2 to get 2-tailed significance, makeing sure we've got + the correct tail*/ + tab_double (t, 3, v + heading_rows, TAB_RIGHT, 2.0 * (tval > 0 ? q : p), NULL); + + tab_double (t, 4, v + heading_rows, TAB_RIGHT, mean_diff, NULL); + + tval = gsl_cdf_tdist_Qinv ( (1.0 - tt->confidence) / 2.0, df); + + tab_double (t, 5, v + heading_rows, TAB_RIGHT, mean_diff - tval * se_mean, NULL); + tab_double (t, 6, v + heading_rows, TAB_RIGHT, mean_diff + tval * se_mean, NULL); + } + + tab_submit (t); +} + +static void +one_sample_summary (const struct tt *tt, const struct one_samp *os) +{ + struct hmapx_node *node; + struct per_var_stats *per_var_stats; + + const int cols = 5; + const int heading_rows = 1; + const int rows = tt->n_vars + heading_rows; + struct tab_table *t = tab_create (cols, rows); + const struct fmt_spec *wfmt = tt->wv ? var_get_print_format (tt->wv) : & F_8_0; + + tab_headers (t, 0, 0, heading_rows, 0); + tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1); + tab_hline (t, TAL_2, 0, cols - 1, 1); + + tab_title (t, _("One-Sample Statistics")); + tab_vline (t, TAL_2, 1, 0, rows - 1); + tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("N")); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Mean")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean")); + + HMAPX_FOR_EACH (per_var_stats, node, &os->hmap) + { + const struct moments *m = per_var_stats->mom; + const int v = per_var_stats->posn; + double cc, mean, sigma; + moments_calculate (m, &cc, &mean, &sigma, NULL, NULL); + + tab_text (t, 0, v + heading_rows, TAB_LEFT, var_to_string (per_var_stats->var)); + tab_double (t, 1, v + heading_rows, TAB_RIGHT, cc, wfmt); + tab_double (t, 2, v + heading_rows, TAB_RIGHT, mean, NULL); + tab_double (t, 3, v + heading_rows, TAB_RIGHT, sqrt (sigma), NULL); + tab_double (t, 4, v + heading_rows, TAB_RIGHT, sqrt (sigma / cc), NULL); + } + + tab_submit (t); +} + +void +one_sample_run (const struct tt *tt, double testval, struct casereader *reader) +{ + int i; + struct ccase *c; + struct one_samp os; + struct casereader *r; + struct hmapx_node *node; + struct per_var_stats *per_var_stats; + + os.testval = testval; + hmapx_init (&os.hmap); + + /* Insert all the variables into the map */ + for (i = 0; i < tt->n_vars; ++i) + { + struct per_var_stats *per_var_stats = xzalloc (sizeof (*per_var_stats)); + + per_var_stats->posn = i; + per_var_stats->var = tt->vars[i]; + per_var_stats->mom = moments_create (MOMENT_VARIANCE); + + hmapx_insert (&os.hmap, per_var_stats, hash_pointer (per_var_stats->var, 0)); + } + + r = casereader_clone (reader); + for ( ; (c = casereader_read (r) ); case_unref (c)) + { + double w = dict_get_case_weight (tt->dict, c, NULL); + struct hmapx_node *node; + struct per_var_stats *per_var_stats; + HMAPX_FOR_EACH (per_var_stats, node, &os.hmap) + { + const struct variable *var = per_var_stats->var; + const union value *val = case_data (c, var); + if (var_is_value_missing (var, val, tt->exclude)) + continue; + + moments_pass_one (per_var_stats->mom, val->f, w); + } + } + casereader_destroy (r); + + r = reader; + for ( ; (c = casereader_read (r) ); case_unref (c)) + { + double w = dict_get_case_weight (tt->dict, c, NULL); + struct hmapx_node *node; + struct per_var_stats *per_var_stats; + HMAPX_FOR_EACH (per_var_stats, node, &os.hmap) + { + const struct variable *var = per_var_stats->var; + const union value *val = case_data (c, var); + if (var_is_value_missing (var, val, tt->exclude)) + continue; + + moments_pass_two (per_var_stats->mom, val->f, w); + per_var_stats->sum_diff += w * (val->f - os.testval); + } + } + casereader_destroy (r); + + one_sample_summary (tt, &os); + one_sample_test (tt, &os); + + HMAPX_FOR_EACH (per_var_stats, node, &os.hmap) + { + moments_destroy (per_var_stats->mom); + free (per_var_stats); + } + + hmapx_destroy (&os.hmap); +} + diff --git a/src/language/stats/t-test-paired.c b/src/language/stats/t-test-paired.c new file mode 100644 index 00000000..edeaa382 --- /dev/null +++ b/src/language/stats/t-test-paired.c @@ -0,0 +1,356 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include +#include + +#include "t-test.h" + +#include "math/moments.h" +#include "math/correlation.h" +#include "data/casereader.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/variable.h" +#include "libpspp/hmapx.h" +#include "libpspp/hash-functions.h" + +#include "output/tab.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) + + +struct pair_stats +{ + int posn; + double sum_of_prod; + struct moments *mom0; + const struct variable *var0; + + struct moments *mom1; + const struct variable *var1; + + struct moments *mom_diff; +}; + +struct paired_samp +{ + struct hmapx hmap; +}; + +static void paired_summary (const struct tt *tt, struct paired_samp *os); +static void paired_correlations (const struct tt *tt, struct paired_samp *os); +static void paired_test (const struct tt *tt, const struct paired_samp *os); + +void +paired_run (const struct tt *tt, size_t n_pairs, vp *pairs, struct casereader *reader) +{ + int i; + struct ccase *c; + struct paired_samp ps; + struct casereader *r; + struct hmapx_node *node; + struct pair_stats *pp = NULL; + + hmapx_init (&ps.hmap); + + for (i = 0; i < n_pairs; ++i) + { + vp *pair = &pairs[i]; + unsigned int hash; + struct pair_stats *pp = xzalloc (sizeof *pp); + pp->posn = i; + pp->var0 = (*pair)[0]; + pp->var1 = (*pair)[1]; + pp->mom0 = moments_create (MOMENT_VARIANCE); + pp->mom1 = moments_create (MOMENT_VARIANCE); + pp->mom_diff = moments_create (MOMENT_VARIANCE); + + hash = hash_pointer ((*pair)[0], 0); + hash = hash_pointer ((*pair)[1], hash); + + hmapx_insert (&ps.hmap, pp, hash); + } + + r = casereader_clone (reader); + for ( ; (c = casereader_read (r) ); case_unref (c)) + { + double w = dict_get_case_weight (tt->dict, c, NULL); + + struct hmapx_node *node; + struct pair_stats *pp = NULL; + HMAPX_FOR_EACH (pp, node, &ps.hmap) + { + const union value *val0 = case_data (c, pp->var0); + const union value *val1 = case_data (c, pp->var1); + if (var_is_value_missing (pp->var0, val0, tt->exclude)) + continue; + + if (var_is_value_missing (pp->var1, val1, tt->exclude)) + continue; + + moments_pass_one (pp->mom0, val0->f, w); + moments_pass_one (pp->mom1, val1->f, w); + moments_pass_one (pp->mom_diff, val0->f - val1->f, w); + } + } + casereader_destroy (r); + + r = reader; + for ( ; (c = casereader_read (r) ); case_unref (c)) + { + double w = dict_get_case_weight (tt->dict, c, NULL); + + struct hmapx_node *node; + struct pair_stats *pp = NULL; + HMAPX_FOR_EACH (pp, node, &ps.hmap) + { + const union value *val0 = case_data (c, pp->var0); + const union value *val1 = case_data (c, pp->var1); + if (var_is_value_missing (pp->var0, val0, tt->exclude)) + continue; + + if (var_is_value_missing (pp->var1, val1, tt->exclude)) + continue; + + moments_pass_two (pp->mom0, val0->f, w); + moments_pass_two (pp->mom1, val1->f, w); + moments_pass_two (pp->mom_diff, val0->f - val1->f, w); + pp->sum_of_prod += val0->f * val1->f; + } + } + casereader_destroy (r); + + paired_summary (tt, &ps); + paired_correlations (tt, &ps); + paired_test (tt, &ps); + + /* Clean up */ + HMAPX_FOR_EACH (pp, node, &ps.hmap) + { + moments_destroy (pp->mom0); + moments_destroy (pp->mom1); + moments_destroy (pp->mom_diff); + free (pp); + } + + hmapx_destroy (&ps.hmap); +} + +static void +paired_summary (const struct tt *tt, struct paired_samp *os) +{ + size_t n_pairs = hmapx_count (&os->hmap); + struct hmapx_node *node; + struct pair_stats *pp = NULL; + + const int heading_rows = 1; + const int heading_cols = 2; + + const int cols = 4 + heading_cols; + const int rows = n_pairs * 2 + heading_rows; + struct tab_table *t = tab_create (cols, rows); + const struct fmt_spec *wfmt = tt->wv ? var_get_print_format (tt->wv) : & F_8_0; + + tab_headers (t, 0, 0, heading_rows, 0); + tab_box (t, TAL_2, TAL_2, TAL_0, TAL_0, 0, 0, cols - 1, rows - 1); + tab_box (t, -1, -1, TAL_0, TAL_1, heading_cols, 0, cols - 1, rows - 1); + + tab_hline (t, TAL_2, 0, cols - 1, 1); + + tab_title (t, _("Paired Sample Statistics")); + tab_vline (t, TAL_2, heading_cols, 0, rows - 1); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("N")); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Mean")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); + tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean")); + + HMAPX_FOR_EACH (pp, node, &os->hmap) + { + int v = pp->posn; + double cc, mean, sigma; + + tab_text_format (t, 0, v * 2 + heading_rows, TAB_LEFT, _("Pair %d"), pp->posn); + + /* first var */ + moments_calculate (pp->mom0, &cc, &mean, &sigma, NULL, NULL); + tab_text (t, 1, v * 2 + heading_rows, TAB_LEFT, var_to_string (pp->var0)); + tab_double (t, 3, v * 2 + heading_rows, TAB_RIGHT, cc, wfmt); + tab_double (t, 2, v * 2 + heading_rows, TAB_RIGHT, mean, NULL); + tab_double (t, 4, v * 2 + heading_rows, TAB_RIGHT, sqrt (sigma), NULL); + tab_double (t, 5, v * 2 + heading_rows, TAB_RIGHT, sqrt (sigma / cc), NULL); + + /* second var */ + moments_calculate (pp->mom1, &cc, &mean, &sigma, NULL, NULL); + tab_text (t, 1, v * 2 + 1 + heading_rows, TAB_LEFT, var_to_string (pp->var1)); + tab_double (t, 3, v * 2 + 1 + heading_rows, TAB_RIGHT, cc, wfmt); + tab_double (t, 2, v * 2 + 1 + heading_rows, TAB_RIGHT, mean, NULL); + tab_double (t, 4, v * 2 + 1 + heading_rows, TAB_RIGHT, sqrt (sigma), NULL); + tab_double (t, 5, v * 2 + 1 + heading_rows, TAB_RIGHT, sqrt (sigma / cc), NULL); + } + + tab_submit (t); +} + + +static void +paired_correlations (const struct tt *tt, struct paired_samp *os) +{ + size_t n_pairs = hmapx_count (&os->hmap); + struct hmapx_node *node; + struct pair_stats *pp = NULL; + const int heading_rows = 1; + const int heading_cols = 2; + + const int cols = 5; + const int rows = n_pairs + heading_rows; + struct tab_table *t = tab_create (cols, rows); + const struct fmt_spec *wfmt = tt->wv ? var_get_print_format (tt->wv) : & F_8_0; + + tab_headers (t, 0, 0, heading_rows, 0); + tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1); + + tab_hline (t, TAL_2, 0, cols - 1, 1); + + tab_title (t, _("Paired Samples Correlations")); + tab_vline (t, TAL_2, heading_cols, 0, rows - 1); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("N")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Correlation")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Sig.")); + + HMAPX_FOR_EACH (pp, node, &os->hmap) + { + double corr; + double cc0, mean0, sigma0; + double cc1, mean1, sigma1; + int v = pp->posn; + + tab_text_format (t, 0, v + heading_rows, TAB_LEFT, _("Pair %d"), pp->posn); + + tab_text_format (t, 1, v + heading_rows, TAB_LEFT, _("%s & %s"), + var_to_string (pp->var0), + var_to_string (pp->var1)); + + moments_calculate (pp->mom0, &cc0, &mean0, &sigma0, NULL, NULL); + moments_calculate (pp->mom1, &cc1, &mean1, &sigma1, NULL, NULL); + + /* If this fails, then we're not dealing with missing values properly */ + assert (cc0 == cc1); + + tab_double (t, 2, v + heading_rows, TAB_RIGHT, cc0, wfmt); + + corr = pp->sum_of_prod / cc0 - (mean0 * mean1); + corr /= sqrt (sigma0 * sigma1); + corr *= cc0 / (cc0 - 1); + + tab_double (t, 3, v + heading_rows, TAB_RIGHT, corr, NULL); + tab_double (t, 4, v + heading_rows, TAB_RIGHT, + 2.0 * significance_of_correlation (corr, cc0), NULL); + } + + tab_submit (t); +} + + +static void +paired_test (const struct tt *tt, const struct paired_samp *os) +{ + size_t n_pairs = hmapx_count (&os->hmap); + struct hmapx_node *node; + struct pair_stats *pp = NULL; + + const int heading_rows = 3; + const int heading_cols = 2; + const size_t rows = heading_rows + n_pairs; + const size_t cols = 10; + const struct fmt_spec *wfmt = tt->wv ? var_get_print_format (tt->wv) : & F_8_0; + + struct tab_table *t = tab_create (cols, rows); + + tab_headers (t, 0, 0, heading_rows, 0); + tab_box (t, TAL_2, TAL_2, TAL_0, TAL_0, 0, 0, cols - 1, rows - 1); + tab_hline (t, TAL_2, 0, cols - 1, 3); + + tab_title (t, _("Paired Samples Test")); + tab_hline (t, TAL_1, heading_cols, 6, 1); + tab_vline (t, TAL_2, heading_cols, 0, rows - 1); + + tab_box (t, -1, -1, -1, TAL_1, heading_cols, 0, cols - 1, rows - 1); + + tab_joint_text (t, 2, 0, 6, 0, TAB_CENTER, + _("Paired Differences")); + + tab_joint_text_format (t, 5, 1, 6, 1, TAB_CENTER, + _("%g%% Confidence Interval of the Difference"), + tt->confidence * 100.0); + + tab_vline (t, TAL_GAP, 6, 1, 1); + tab_hline (t, TAL_1, 5, 6, 2); + tab_text (t, 7, 2, TAB_CENTER | TAT_TITLE, _("t")); + tab_text (t, 8, 2, TAB_CENTER | TAT_TITLE, _("df")); + tab_text (t, 9, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)")); + tab_text (t, 4, 2, TAB_CENTER | TAT_TITLE, _("Std. Error Mean")); + tab_text (t, 3, 2, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); + tab_text (t, 2, 2, TAB_CENTER | TAT_TITLE, _("Mean")); + + tab_text (t, 5, 2, TAB_CENTER | TAT_TITLE, _("Lower")); + tab_text (t, 6, 2, TAB_CENTER | TAT_TITLE, _("Upper")); + + HMAPX_FOR_EACH (pp, node, &os->hmap) + { + int v = pp->posn; + double cc, mean, sigma; + double df ; + double tval; + double p, q; + double se_mean; + + moments_calculate (pp->mom_diff, &cc, &mean, &sigma, NULL, NULL); + + df = cc - 1.0; + tab_text_format (t, 0, v + heading_rows, TAB_LEFT, _("Pair %d"), v); + + tab_text_format (t, 1, v + heading_rows, TAB_LEFT, _("%s - %s"), + var_to_string (pp->var0), + var_to_string (pp->var1)); + + tval = mean * sqrt (cc / sigma); + se_mean = sqrt (sigma / cc); + + tab_double (t, 2, v + heading_rows, TAB_RIGHT, mean, NULL); + tab_double (t, 3, v + heading_rows, TAB_RIGHT, sqrt (sigma), NULL); + tab_double (t, 4, v + heading_rows, TAB_RIGHT, se_mean, NULL); + + tab_double (t, 7, v + heading_rows, TAB_RIGHT, tval, NULL); + tab_double (t, 8, v + heading_rows, TAB_RIGHT, df, wfmt); + + + p = gsl_cdf_tdist_P (tval, df); + q = gsl_cdf_tdist_Q (tval, df); + + tab_double (t, 9, v + heading_rows, TAB_RIGHT, 2.0 * (tval > 0 ? q : p), NULL); + + tval = gsl_cdf_tdist_Qinv ( (1.0 - tt->confidence) / 2.0, df); + + tab_double (t, 5, v + heading_rows, TAB_RIGHT, mean - tval * se_mean, NULL); + tab_double (t, 6, v + heading_rows, TAB_RIGHT, mean + tval * se_mean, NULL); + } + + tab_submit (t); +} diff --git a/src/language/stats/t-test-parser.c b/src/language/stats/t-test-parser.c new file mode 100644 index 00000000..51f30b9c --- /dev/null +++ b/src/language/stats/t-test-parser.c @@ -0,0 +1,366 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "libpspp/message.h" + +#include "data/casegrouper.h" +#include "data/casereader.h" +#include "data/dictionary.h" +#include "data/dataset.h" +#include "data/missing-values.h" + +#include "language/lexer/lexer.h" +#include "language/command.h" +#include "language/lexer/variable-parser.h" +#include "language/lexer/value-parser.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) + +#include "t-test.h" + +int +cmd_t_test (struct lexer *lexer, struct dataset *ds) +{ + bool ok; + const struct dictionary *dict = dataset_dict (ds); + struct tt tt; + int mode_count = 0; + + /* Variables pertaining to the paired mode */ + const struct variable **v1 = NULL; + size_t n_v1; + const struct variable **v2 = NULL; + size_t n_v2; + + size_t n_pairs; + vp *pairs = NULL; + + + /* One sample mode */ + double testval; + + /* Independent samples mode */ + const struct variable *gvar; + union value gval0; + union value gval1; + bool cut; + + tt.wv = dict_get_weight (dict); + tt.dict = dict; + tt.confidence = 0.95; + tt.exclude = MV_ANY; + tt.missing_type = MISS_ANALYSIS; + tt.n_vars = 0; + tt.vars = NULL; + + lex_match (lexer, T_EQUALS); + + for (; lex_token (lexer) != T_ENDCMD; ) + { + lex_match (lexer, T_SLASH); + if (lex_match_id (lexer, "TESTVAL")) + { + mode_count++; + tt.mode = MODE_SINGLE; + lex_match (lexer, T_EQUALS); + lex_force_num (lexer); + testval = lex_number (lexer); + lex_get (lexer); + } + else if (lex_match_id (lexer, "GROUPS")) + { + mode_count++; + cut = false; + tt.mode = MODE_INDEP; + lex_match (lexer, T_EQUALS); + + if (NULL == (gvar = parse_variable (lexer, dict))) + goto parse_failed; + + if (lex_match (lexer, T_LPAREN)) + { + + value_init (&gval0, var_get_width (gvar)); + parse_value (lexer, &gval0, gvar); + cut = true; + if (lex_match (lexer, T_COMMA)) + { + value_init (&gval1, var_get_width (gvar)); + parse_value (lexer, &gval1, gvar); + cut = false; + } + + lex_force_match (lexer, T_RPAREN); + } + else + { + value_init (&gval0, 0); + value_init (&gval1, 0); + gval0.f = 1.0; + gval1.f = 2.0; + cut = false; + } + + if ( cut == true && var_is_alpha (gvar)) + { + msg (SE, _("When applying GROUPS to a string variable, two " + "values must be specified.")); + goto parse_failed; + } + } + else if (lex_match_id (lexer, "PAIRS")) + { + bool with = false; + bool paired = false; + mode_count++; + tt.mode = MODE_PAIRED; + lex_match (lexer, T_EQUALS); + + if (!parse_variables_const (lexer, dict, + &v1, &n_v1, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto parse_failed; + + if ( lex_match (lexer, T_WITH)) + { + with = true; + if (!parse_variables_const (lexer, dict, + &v2, &n_v2, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto parse_failed; + + if (lex_match (lexer, T_LPAREN) + && lex_match_id (lexer, "PAIRED") + && lex_match (lexer, T_RPAREN)) + { + paired = true; + if (n_v1 != n_v2) + { + msg (SE, _("PAIRED was specified but the number of variables " + "preceding WITH (%zu) did not match the number " + "following (%zu)."), + n_v1, n_v2); + goto parse_failed; + } + } + } + { + int i; + + if ( !with ) + n_pairs = (n_v1 * (n_v1 - 1)) / 2.0; + else if ( paired ) + n_pairs = n_v1; + else + n_pairs = n_v1 * n_v2; + + pairs = xcalloc (sizeof *pairs, n_pairs); + + + if ( with) + { + int x = 0; + if (paired) + { + for (i = 0 ; i < n_v1; ++i) + { + vp *pair = &pairs[i]; + (*pair)[0] = v1[i]; + (*pair)[1] = v2[i]; + } + } + else + { + for (i = 0 ; i < n_v1; ++i) + { + int j; + for (j = 0 ; j < n_v2; ++j) + { + vp *pair = &pairs[x++]; + (*pair)[0] = v1[i]; + (*pair)[1] = v2[j]; + } + } + } + } + else + { + int x = 0; + for (i = 0 ; i < n_v1; ++i) + { + int j; + + for (j = i + 1 ; j < n_v1; ++j) + { + vp *pair = &pairs[x++]; + (*pair)[0] = v1[i]; + (*pair)[1] = v1[j]; + } + } + } + + } + } + else if (lex_match_id (lexer, "VARIABLES")) + { + if ( tt.mode == MODE_PAIRED) + { + msg (SE, _("VARIABLES subcommand may not be used with PAIRS.")); + goto parse_failed; + } + + lex_match (lexer, T_EQUALS); + + if (!parse_variables_const (lexer, dict, + &tt.vars, + &tt.n_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto parse_failed; + } + else if ( lex_match_id (lexer, "MISSING")) + { + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "INCLUDE")) + { + tt.exclude = MV_SYSTEM; + } + else if (lex_match_id (lexer, "EXCLUDE")) + { + tt.exclude = MV_ANY; + } + else if (lex_match_id (lexer, "LISTWISE")) + { + tt.missing_type = MISS_LISTWISE; + } + else if (lex_match_id (lexer, "ANALYSIS")) + { + tt.missing_type = MISS_ANALYSIS; + } + else + { + lex_error (lexer, NULL); + goto parse_failed; + } + lex_match (lexer, T_COMMA); + } + } + else if (lex_match_id (lexer, "CRITERIA")) + { + lex_match (lexer, T_EQUALS); + if ( lex_force_match_id (lexer, "CIN")) + if ( lex_force_match (lexer, T_LPAREN)) + { + lex_force_num (lexer); + tt.confidence = lex_number (lexer); + lex_get (lexer); + lex_force_match (lexer, T_RPAREN); + } + } + else + { + lex_error (lexer, NULL); + goto parse_failed; + } + } + + if ( mode_count != 1) + { + msg (SE, _("Exactly one of TESTVAL, GROUPS and PAIRS subcommands " + "must be specified.")); + goto parse_failed; + } + + if (tt.n_vars == 0 && tt.mode != MODE_PAIRED) + { + msg (SE, _("One or more VARIABLES must be specified.")); + goto parse_failed; + } + + + + /* Deal with splits etc */ + { + struct casereader *group; + struct casegrouper *grouper = casegrouper_create_splits (proc_open (ds), dict); + + while (casegrouper_get_next_group (grouper, &group)) + { + if ( tt.mode == MODE_SINGLE) + { + if ( tt.missing_type == MISS_LISTWISE ) + group = casereader_create_filter_missing (group, + tt.vars, tt.n_vars, + tt.exclude, + NULL, NULL); + one_sample_run (&tt, testval, group); + } + else if ( tt.mode == MODE_PAIRED) + { + if ( tt.missing_type == MISS_LISTWISE ) + { + group = casereader_create_filter_missing (group, + v1, n_v1, + tt.exclude, + NULL, NULL); + group = casereader_create_filter_missing (group, + v2, n_v2, + tt.exclude, + NULL, NULL); + } + + paired_run (&tt, n_pairs, pairs, group); + } + else /* tt.mode == MODE_INDEP */ + { + if ( tt.missing_type == MISS_LISTWISE ) + { + group = casereader_create_filter_missing (group, + tt.vars, tt.n_vars, + tt.exclude, + NULL, NULL); + + group = casereader_create_filter_missing (group, + &gvar, 1, + tt.exclude, + NULL, NULL); + + } + + indep_run (&tt, gvar, cut, &gval0, &gval1, group); + } + } + + ok = casegrouper_destroy (grouper); + ok = proc_commit (ds) && ok; + } + + free (pairs); + free (v1); + free (v2); + + free (tt.vars); + + return ok ? CMD_SUCCESS : CMD_FAILURE; + + parse_failed: + return CMD_FAILURE; +} + diff --git a/src/language/stats/t-test.h b/src/language/stats/t-test.h new file mode 100644 index 00000000..2b4dba44 --- /dev/null +++ b/src/language/stats/t-test.h @@ -0,0 +1,61 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef T_TEST_H +#define T_TEST_H 1 + +#include "data/missing-values.h" + +struct variable; +typedef const struct variable *vp[2]; + +enum missing_type + { + MISS_LISTWISE, + MISS_ANALYSIS, + }; + +enum mode + { + MODE_PAIRED, + MODE_INDEP, + MODE_SINGLE, + }; + +struct tt +{ + size_t n_vars; + const struct variable **vars; + enum mode mode; + enum missing_type missing_type; + enum mv_class exclude; + double confidence; + const struct variable *wv; + const struct dictionary *dict; +}; + +struct casereader; +union value; + +void one_sample_run (const struct tt *tt, double testval, struct casereader *reader); +void paired_run (const struct tt *tt, size_t n_pairs, vp *pairs, struct casereader *reader); +void indep_run (struct tt *tt, const struct variable *gvar, + bool cut, + const union value *gval0, const union value *gval1, + struct casereader *reader); + + +#endif diff --git a/src/language/stats/t-test.q b/src/language/stats/t-test.q deleted file mode 100644 index d26fc8a4..00000000 --- a/src/language/stats/t-test.q +++ /dev/null @@ -1,1560 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#include - -#include -#include -#include -#include -#include - -#include "data/case.h" -#include "data/casegrouper.h" -#include "data/casereader.h" -#include "data/dataset.h" -#include "data/dictionary.h" -#include "data/format.h" -#include "data/value-labels.h" -#include "data/variable.h" -#include "language/command.h" -#include "language/dictionary/split-file.h" -#include "language/lexer/lexer.h" -#include "language/lexer/value-parser.h" -#include "libpspp/array.h" -#include "libpspp/assertion.h" -#include "libpspp/compiler.h" -#include "libpspp/hash.h" -#include "libpspp/message.h" -#include "libpspp/misc.h" -#include "libpspp/str.h" -#include "libpspp/taint.h" -#include "math/correlation.h" -#include "math/group-proc.h" -#include "math/levene.h" -#include "output/tab.h" - -#include "gl/minmax.h" -#include "gl/xalloc.h" -#include "gl/xmemdup0.h" - -#include "gettext.h" -#define _(msgid) gettext (msgid) - -/* (headers) */ - -/* (specification) - "T-TEST" (tts_): - +groups=custom; - testval=double; - +variables=varlist("PV_NO_SCRATCH | PV_NUMERIC"); - +pairs=custom; - missing=miss:!analysis/listwise, - incl:include/!exclude; - +format=fmt:!labels/nolabels; - criteria=:cin(d:criteria,"%s > 0. && %s < 1."). -*/ -/* (declarations) */ -/* (functions) */ - -enum comparison - { - CMP_LE, - CMP_EQ, - }; - -/* A pair of variables to be compared. */ -struct pair - { - const struct variable *v[2]; /* The paired variables. */ - double n; /* The number of valid variable pairs */ - double sum[2]; /* The sum of the members */ - double ssq[2]; /* sum of squares of the members */ - double std_dev[2]; /* Std deviation of the members */ - double s_std_dev[2]; /* Sample Std deviation of the members */ - double mean[2]; /* The means of the members */ - double correlation; /* Correlation coefficient between the variables. */ - double sum_of_diffs; /* The sum of the differences */ - double sum_of_prod; /* The sum of the products */ - double mean_diff; /* The mean of the differences */ - double ssq_diffs; /* The sum of the squares of the differences */ - double std_dev_diff; /* The std deviation of the differences */ - }; - -/* Which mode was T-TEST invoked */ -enum t_test_mode { - T_1_SAMPLE, /* One-sample tests. */ - T_IND_SAMPLES, /* Independent-sample tests. */ - T_PAIRED /* Paired-sample tests. */ -}; - -/* Total state of a T-TEST procedure. */ -struct t_test_proc - { - enum t_test_mode mode; /* Mode that T-TEST was invoked in. */ - double criteria; /* Confidence interval in (0, 1). */ - enum mv_class exclude; /* Classes of missing values to exclude. */ - bool listwise_missing; /* Drop whole case if one missing var? */ - struct fmt_spec weight_format; /* Format of weight variable. */ - - /* Dependent variables. */ - const struct variable **vars; - size_t n_vars; - - /* For mode == T_1_SAMPLE. */ - double testval; - - /* For mode == T_PAIRED only. */ - struct pair *pairs; - size_t n_pairs; - - /* For mode == T_IND_SAMPLES only. */ - struct variable *indep_var; /* Independent variable. */ - enum comparison criterion; /* Type of comparison. */ - double critical_value; /* CMP_LE only: Grouping threshold value. */ - union value g_value[2]; /* CMP_EQ only: Per-group indep var values. */ - }; - -/* Statistics Summary Box */ -struct ssbox - { - struct tab_table *t; - void (*populate) (struct ssbox *, struct t_test_proc *); - void (*finalize) (struct ssbox *); - }; - -static void ssbox_create (struct ssbox *, struct t_test_proc *); -static void ssbox_populate (struct ssbox *, struct t_test_proc *); -static void ssbox_finalize (struct ssbox *); - -/* Paired Samples Correlation box */ -static void pscbox (struct t_test_proc *); - - -/* Test Results Box. */ -struct trbox { - struct tab_table *t; - void (*populate) (struct trbox *, struct t_test_proc *); - void (*finalize) (struct trbox *); - }; - -static void trbox_create (struct trbox *, struct t_test_proc *); -static void trbox_populate (struct trbox *, struct t_test_proc *); -static void trbox_finalize (struct trbox *); - -static void calculate (struct t_test_proc *, struct casereader *, - const struct dataset *); - -static int compare_group_binary (const struct group_statistics *a, - const struct group_statistics *b, - const struct t_test_proc *); -static unsigned hash_group_binary (const struct group_statistics *g, - const struct t_test_proc *p); - -static void t_test_proc_destroy (struct t_test_proc *proc); - -int -cmd_t_test (struct lexer *lexer, struct dataset *ds) -{ - struct cmd_t_test cmd; - struct t_test_proc proc; - struct casegrouper *grouper; - struct casereader *group; - struct variable *wv; - bool ok = false; - - proc.pairs = NULL; - proc.n_pairs = 0; - proc.vars = NULL; - proc.indep_var = NULL; - if (!parse_t_test (lexer, ds, &cmd, &proc)) - goto parse_failed; - - wv = dict_get_weight (dataset_dict (ds)); - proc.weight_format = wv ? *var_get_print_format (wv) : F_8_0; - - if ((cmd.sbc_testval != 0) + (cmd.sbc_groups != 0) + (cmd.sbc_pairs != 0) - != 1) - { - msg (SE, _("Exactly one of TESTVAL, GROUPS and PAIRS subcommands " - "must be specified.")); - goto error; - } - - proc.mode = (cmd.sbc_testval ? T_1_SAMPLE - : cmd.sbc_groups ? T_IND_SAMPLES - : T_PAIRED); - proc.criteria = cmd.sbc_criteria ? cmd.criteria : 0.95; - proc.exclude = cmd.incl != TTS_INCLUDE ? MV_ANY : MV_SYSTEM; - proc.listwise_missing = cmd.miss == TTS_LISTWISE; - - if (proc.mode == T_1_SAMPLE) - proc.testval = cmd.n_testval[0]; - - if (proc.mode == T_PAIRED) - { - size_t i, j; - - if (cmd.sbc_variables) - { - msg (SE, _("VARIABLES subcommand may not be used with PAIRS.")); - goto error; - } - - /* Fill proc.vars with the unique variables from pairs. */ - proc.n_vars = proc.n_pairs * 2; - proc.vars = xmalloc (sizeof *proc.vars * proc.n_vars); - for (i = j = 0; i < proc.n_pairs; i++) - { - proc.vars[j++] = proc.pairs[i].v[0]; - proc.vars[j++] = proc.pairs[i].v[1]; - } - proc.n_vars = sort_unique (proc.vars, proc.n_vars, sizeof *proc.vars, - compare_var_ptrs_by_name, NULL); - } - else - { - if (!cmd.n_variables) - { - msg (SE, _("One or more VARIABLES must be specified.")); - goto error; - } - proc.n_vars = cmd.n_variables; - proc.vars = cmd.v_variables; - cmd.v_variables = NULL; - } - - /* Data pass. */ - grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); - while (casegrouper_get_next_group (grouper, &group)) - calculate (&proc, group, ds); - ok = casegrouper_destroy (grouper); - - /* Free 'proc' then commit the procedure. Must happen in this order because - if proc->indep_var was created by a temporary transformation then - committing will destroy it. */ - t_test_proc_destroy (&proc); - ok = proc_commit (ds) && ok; - - return ok ? CMD_SUCCESS : CMD_FAILURE; - -error: - free_t_test (&cmd); -parse_failed: - t_test_proc_destroy (&proc); - return CMD_FAILURE; -} - -static void -t_test_proc_destroy (struct t_test_proc *proc) -{ - if (proc->indep_var != NULL) - { - int width = var_get_width (proc->indep_var); - value_destroy (&proc->g_value[0], width); - value_destroy (&proc->g_value[1], width); - } - free (proc->vars); - free (proc->pairs); -} - -static int -tts_custom_groups (struct lexer *lexer, struct dataset *ds, - struct cmd_t_test *cmd UNUSED, void *proc_) -{ - struct t_test_proc *proc = proc_; - int n_values; - int width; - - lex_match (lexer, T_EQUALS); - - proc->indep_var = parse_variable (lexer, dataset_dict (ds)); - if (proc->indep_var == NULL) - { - lex_error (lexer, "expecting variable name in GROUPS subcommand"); - return 0; - } - width = var_get_width (proc->indep_var); - value_init (&proc->g_value[0], width); - value_init (&proc->g_value[1], width); - - if (!lex_match (lexer, T_LPAREN)) - n_values = 0; - else - { - if (!parse_value (lexer, &proc->g_value[0], proc->indep_var)) - return 0; - lex_match (lexer, T_COMMA); - if (lex_match (lexer, T_RPAREN)) - n_values = 1; - else - { - if (!parse_value (lexer, &proc->g_value[1], proc->indep_var) - || !lex_force_match (lexer, T_RPAREN)) - return 0; - n_values = 2; - } - } - - if (var_is_numeric (proc->indep_var)) - { - proc->criterion = n_values == 1 ? CMP_LE : CMP_EQ; - if (n_values == 1) - proc->critical_value = proc->g_value[0].f; - else if (n_values == 0) - { - proc->g_value[0].f = 1; - proc->g_value[1].f = 2; - } - } - else - { - proc->criterion = CMP_EQ; - if (n_values != 2) - { - msg (SE, _("When applying GROUPS to a string variable, two " - "values must be specified.")); - return 0; - } - } - return 1; -} - -static void -add_pair (struct t_test_proc *proc, - const struct variable *v0, const struct variable *v1) -{ - struct pair *p = &proc->pairs[proc->n_pairs++]; - p->v[0] = v0; - p->v[1] = v1; -} - -static int -tts_custom_pairs (struct lexer *lexer, struct dataset *ds, - struct cmd_t_test *cmd UNUSED, void *proc_) -{ - struct t_test_proc *proc = proc_; - - const struct variable **vars1 = NULL; - size_t n_vars1 = 0; - - const struct variable **vars2 = NULL; - size_t n_vars2 = 0; - - bool paired = false; - - size_t n_total_pairs; - size_t i, j; - - lex_match (lexer, T_EQUALS); - - if (!parse_variables_const (lexer, dataset_dict (ds), &vars1, &n_vars1, - PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH)) - return 0; - - if (lex_match (lexer, T_WITH)) - { - if (!parse_variables_const (lexer, dataset_dict (ds), &vars2, &n_vars2, - PV_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH)) - { - free (vars1); - return 0; - } - - if (lex_match (lexer, T_LPAREN) - && lex_match_id (lexer, "PAIRED") - && lex_match (lexer, T_RPAREN)) - { - paired = true; - if (n_vars1 != n_vars2) - { - msg (SE, _("PAIRED was specified but the number of variables " - "preceding WITH (%zu) did not match the number " - "following (%zu)."), - n_vars1, n_vars2); - free (vars1); - free (vars2); - return 0; - } - } - } - else - { - if (n_vars1 < 2) - { - free (vars1); - msg (SE, _("At least two variables must be specified on PAIRS.")); - return 0; - } - } - - /* Allocate storage for the new pairs. */ - n_total_pairs = proc->n_pairs + (paired ? n_vars1 - : n_vars2 > 0 ? n_vars1 * n_vars2 - : n_vars1 * (n_vars1 - 1) / 2); - proc->pairs = xnrealloc (proc->pairs, n_total_pairs, sizeof *proc->pairs); - - /* Populate the pairs with the appropriate variables. */ - if (paired) - for (i = 0; i < n_vars1; i++) - add_pair (proc, vars1[i], vars2[i]); - else if (n_vars2 > 0) - for (i = 0; i < n_vars1; i++) - for (j = 0; j < n_vars2; j++) - add_pair (proc, vars1[i], vars2[j]); - else - for (i = 0; i < n_vars1; i++) - for (j = i + 1; j < n_vars1; j++) - add_pair (proc, vars1[i], vars1[j]); - assert (proc->n_pairs == n_total_pairs); - - free (vars1); - free (vars2); - return 1; -} - -/* Implementation of the SSBOX object. */ - -static void ssbox_base_init (struct ssbox *, int cols, int rows); -static void ssbox_base_finalize (struct ssbox *); -static void ssbox_one_sample_init (struct ssbox *, struct t_test_proc *); -static void ssbox_independent_samples_init (struct ssbox *, struct t_test_proc *); -static void ssbox_paired_init (struct ssbox *, struct t_test_proc *); - -/* Factory to create an ssbox. */ -static void -ssbox_create (struct ssbox *ssb, struct t_test_proc *proc) -{ - switch (proc->mode) - { - case T_1_SAMPLE: - ssbox_one_sample_init (ssb, proc); - break; - case T_IND_SAMPLES: - ssbox_independent_samples_init (ssb, proc); - break; - case T_PAIRED: - ssbox_paired_init (ssb, proc); - break; - default: - NOT_REACHED (); - } -} - -/* Despatcher for the populate method */ -static void -ssbox_populate (struct ssbox *ssb, struct t_test_proc *proc) -{ - ssb->populate (ssb, proc); -} - -/* Despatcher for finalize */ -static void -ssbox_finalize (struct ssbox *ssb) -{ - ssb->finalize (ssb); -} - -/* Submit the box and clear up */ -static void -ssbox_base_finalize (struct ssbox *ssb) -{ - tab_submit (ssb->t); -} - -/* Initialize a ssbox struct */ -static void -ssbox_base_init (struct ssbox *this, int cols, int rows) -{ - this->finalize = ssbox_base_finalize; - this->t = tab_create (cols, rows); - - tab_headers (this->t, 0, 0, 1, 0); - tab_box (this->t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1); - tab_hline (this->t, TAL_2, 0, cols- 1, 1); -} - -/* ssbox implementations. */ - -static void ssbox_one_sample_populate (struct ssbox *, struct t_test_proc *); -static void ssbox_independent_samples_populate (struct ssbox *, - struct t_test_proc *); -static void ssbox_paired_populate (struct ssbox *, struct t_test_proc *); - -/* Initialize the one_sample ssbox */ -static void -ssbox_one_sample_init (struct ssbox *this, struct t_test_proc *proc) -{ - const int hsize = 5; - const int vsize = proc->n_vars + 1; - - this->populate = ssbox_one_sample_populate; - - ssbox_base_init (this, hsize, vsize); - tab_title (this->t, _("One-Sample Statistics")); - tab_vline (this->t, TAL_2, 1, 0, vsize - 1); - tab_text (this->t, 1, 0, TAB_CENTER | TAT_TITLE, _("N")); - tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("Mean")); - tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); - tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean")); -} - -/* Initialize the independent samples ssbox */ -static void -ssbox_independent_samples_init (struct ssbox *this, struct t_test_proc *proc) -{ - int hsize=6; - int vsize = proc->n_vars * 2 + 1; - - this->populate = ssbox_independent_samples_populate; - - ssbox_base_init (this, hsize, vsize); - tab_vline (this->t, TAL_GAP, 1, 0, vsize - 1); - tab_title (this->t, _("Group Statistics")); - tab_text (this->t, 1, 0, TAB_CENTER | TAT_TITLE, - var_get_name (proc->indep_var)); - tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("N")); - tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("Mean")); - tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); - tab_text (this->t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean")); -} - -/* Populate the ssbox for independent samples */ -static void -ssbox_independent_samples_populate (struct ssbox *ssb, - struct t_test_proc *proc) -{ - int i; - - char *val_lab[2]; - double indep_value[2]; - - char prefix[2][3]; - - for (i = 0; i < 2; i++) - { - union value *value = &proc->g_value[i]; - int width = var_get_width (proc->indep_var); - - indep_value[i] = (proc->criterion == CMP_LE ? proc->critical_value - : value->f); - - if (val_type_from_width (width) == VAL_NUMERIC) - { - const char *s = var_lookup_value_label (proc->indep_var, value); - val_lab[i] = s ? xstrdup (s) : xasprintf ("%g", indep_value[i]); - } - else - val_lab[i] = xmemdup0 (value_str (value, width), width); - } - - if (proc->criterion == CMP_LE) - { - strcpy (prefix[0], ">="); - strcpy (prefix[1], "<"); - } - else - { - strcpy (prefix[0], ""); - strcpy (prefix[1], ""); - } - - for (i = 0; i < proc->n_vars; i++) - { - const struct variable *var = proc->vars[i]; - struct hsh_table *grp_hash = group_proc_get (var)->group_hash; - int count=0; - - tab_text (ssb->t, 0, i * 2 + 1, TAB_LEFT, - var_get_name (proc->vars[i])); - tab_text_format (ssb->t, 1, i * 2 + 1, TAB_LEFT, - "%s%s", prefix[0], val_lab[0]); - tab_text_format (ssb->t, 1, i * 2 + 1+ 1, TAB_LEFT, - "%s%s", prefix[1], val_lab[1]); - - /* Fill in the group statistics */ - for (count = 0; count < 2; count++) - { - union value search_val; - struct group_statistics *gs; - - if (proc->criterion == CMP_LE) - search_val.f = proc->critical_value + (count == 0 ? 1.0 : -1.0); - else - search_val = proc->g_value[count]; - - gs = hsh_find (grp_hash, &search_val); - assert (gs); - - tab_double (ssb->t, 2, i * 2 + count+ 1, TAB_RIGHT, gs->n, - &proc->weight_format); - tab_double (ssb->t, 3, i * 2 + count+ 1, TAB_RIGHT, gs->mean, NULL); - tab_double (ssb->t, 4, i * 2 + count+ 1, TAB_RIGHT, gs->std_dev, - NULL); - tab_double (ssb->t, 5, i * 2 + count+ 1, TAB_RIGHT, gs->se_mean, - NULL); - } - } - free (val_lab[0]); - free (val_lab[1]); -} - -/* Initialize the paired values ssbox */ -static void -ssbox_paired_init (struct ssbox *this, struct t_test_proc *proc) -{ - int hsize = 6; - int vsize = proc->n_pairs * 2 + 1; - - this->populate = ssbox_paired_populate; - - ssbox_base_init (this, hsize, vsize); - tab_title (this->t, _("Paired Sample Statistics")); - tab_vline (this->t, TAL_GAP, 1, 0, vsize - 1); - tab_vline (this->t, TAL_2, 2, 0, vsize - 1); - tab_text (this->t, 2, 0, TAB_CENTER | TAT_TITLE, _("Mean")); - tab_text (this->t, 3, 0, TAB_CENTER | TAT_TITLE, _("N")); - tab_text (this->t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); - tab_text (this->t, 5, 0, TAB_CENTER | TAT_TITLE, _("S.E. Mean")); -} - -/* Populate the ssbox for paired values */ -static void -ssbox_paired_populate (struct ssbox *ssb, struct t_test_proc *proc) -{ - int i; - - for (i = 0; i < proc->n_pairs; i++) - { - struct pair *p = &proc->pairs[i]; - int j; - - tab_text_format (ssb->t, 0, i * 2 + 1, TAB_LEFT, _("Pair %d"), i); - for (j=0; j < 2; j++) - { - /* Titles */ - tab_text (ssb->t, 1, i * 2 + j + 1, TAB_LEFT, - var_get_name (p->v[j])); - - /* Values */ - tab_double (ssb->t, 2, i * 2 + j + 1, TAB_RIGHT, p->mean[j], NULL); - tab_double (ssb->t, 3, i * 2 + j + 1, TAB_RIGHT, p->n, - &proc->weight_format); - tab_double (ssb->t, 4, i * 2 + j + 1, TAB_RIGHT, p->std_dev[j], - NULL); - tab_double (ssb->t, 5, i * 2 + j + 1, TAB_RIGHT, - p->std_dev[j] /sqrt (p->n), NULL); - } - } -} - -/* Populate the one sample ssbox */ -static void -ssbox_one_sample_populate (struct ssbox *ssb, struct t_test_proc *proc) -{ - int i; - - for (i = 0; i < proc->n_vars; i++) - { - struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs; - - tab_text (ssb->t, 0, i + 1, TAB_LEFT, var_get_name (proc->vars[i])); - tab_double (ssb->t, 1, i + 1, TAB_RIGHT, gs->n, &proc->weight_format); - tab_double (ssb->t, 2, i + 1, TAB_RIGHT, gs->mean, NULL); - tab_double (ssb->t, 3, i + 1, TAB_RIGHT, gs->std_dev, NULL); - tab_double (ssb->t, 4, i + 1, TAB_RIGHT, gs->se_mean, NULL); - } -} - -/* Implementation of the Test Results box struct */ - -static void trbox_base_init (struct trbox *, size_t n_vars, int cols); -static void trbox_base_finalize (struct trbox *); -static void trbox_independent_samples_init (struct trbox *, - struct t_test_proc *); -static void trbox_independent_samples_populate (struct trbox *, - struct t_test_proc *); -static void trbox_one_sample_init (struct trbox *, struct t_test_proc *); -static void trbox_one_sample_populate (struct trbox *, struct t_test_proc *); -static void trbox_paired_init (struct trbox *, struct t_test_proc *); -static void trbox_paired_populate (struct trbox *, struct t_test_proc *); - -/* Create a trbox according to mode*/ -static void -trbox_create (struct trbox *trb, struct t_test_proc *proc) -{ - switch (proc->mode) - { - case T_1_SAMPLE: - trbox_one_sample_init (trb, proc); - break; - case T_IND_SAMPLES: - trbox_independent_samples_init (trb, proc); - break; - case T_PAIRED: - trbox_paired_init (trb, proc); - break; - default: - NOT_REACHED (); - } -} - -/* Populate a trbox according to proc */ -static void -trbox_populate (struct trbox *trb, struct t_test_proc *proc) -{ - trb->populate (trb, proc); -} - -/* Submit and destroy a trbox */ -static void -trbox_finalize (struct trbox *trb) -{ - trb->finalize (trb); -} - -/* Initialize the independent samples trbox */ -static void -trbox_independent_samples_init (struct trbox *self, - struct t_test_proc *proc) -{ - const int hsize = 11; - const int vsize = proc->n_vars * 2 + 3; - - assert (self); - self->populate = trbox_independent_samples_populate; - - trbox_base_init (self, proc->n_vars * 2, hsize); - tab_title (self->t, _("Independent Samples Test")); - tab_hline (self->t, TAL_1, 2, hsize - 1, 1); - tab_vline (self->t, TAL_2, 2, 0, vsize - 1); - tab_vline (self->t, TAL_1, 4, 0, vsize - 1); - tab_box (self->t, -1, -1, -1, TAL_1, 2, 1, hsize - 2, vsize - 1); - tab_hline (self->t, TAL_1, hsize - 2, hsize - 1, 2); - tab_box (self->t, -1, -1, -1, TAL_1, hsize - 2, 2, hsize - 1, vsize - 1); - tab_joint_text (self->t, 2, 0, 3, 0, - TAB_CENTER, _("Levene's Test for Equality of Variances")); - tab_joint_text (self->t, 4, 0, hsize- 1, 0, - TAB_CENTER, _("t-test for Equality of Means")); - - tab_text (self->t, 2, 2, TAB_CENTER | TAT_TITLE, _("F")); - tab_text (self->t, 3, 2, TAB_CENTER | TAT_TITLE, _("Sig.")); - tab_text (self->t, 4, 2, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (self->t, 5, 2, TAB_CENTER | TAT_TITLE, _("df")); - tab_text (self->t, 6, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)")); - tab_text (self->t, 7, 2, TAB_CENTER | TAT_TITLE, _("Mean Difference")); - tab_text (self->t, 8, 2, TAB_CENTER | TAT_TITLE, _("Std. Error Difference")); - tab_text (self->t, 9, 2, TAB_CENTER | TAT_TITLE, _("Lower")); - tab_text (self->t, 10, 2, TAB_CENTER | TAT_TITLE, _("Upper")); - - tab_joint_text_format (self->t, 9, 1, 10, 1, TAB_CENTER, - _("%g%% Confidence Interval of the Difference"), - proc->criteria * 100.0); -} - -/* Populate the independent samples trbox */ -static void -trbox_independent_samples_populate (struct trbox *self, - struct t_test_proc *proc) -{ - int i; - - for (i = 0; i < proc->n_vars; i++) - { - double p, q; - - double t; - double df; - - double df1, df2; - - double pooled_variance; - double std_err_diff; - double mean_diff; - - double se2; - - const struct variable *var = proc->vars[i]; - struct group_proc *grp_data = group_proc_get (var); - - struct hsh_table *grp_hash = grp_data->group_hash; - - struct group_statistics *gs0; - struct group_statistics *gs1; - - union value search_val; - - if (proc->criterion == CMP_LE) - search_val.f = proc->critical_value - 1.0; - else - search_val = proc->g_value[0]; - - gs0 = hsh_find (grp_hash, &search_val); - assert (gs0); - - if (proc->criterion == CMP_LE) - search_val.f = proc->critical_value + 1.0; - else - search_val = proc->g_value[1]; - - gs1 = hsh_find (grp_hash, &search_val); - assert (gs1); - - - tab_text (self->t, 0, i * 2 + 3, TAB_LEFT, var_get_name (proc->vars[i])); - tab_text (self->t, 1, i * 2 + 3, TAB_LEFT, _("Equal variances assumed")); - tab_double (self->t, 2, i * 2 + 3, TAB_CENTER, grp_data->levene, NULL); - - /* Now work out the significance of the Levene test */ - df1 = 1; - df2 = grp_data->ugs.n - 2; - q = gsl_cdf_fdist_Q (grp_data->levene, df1, df2); - tab_double (self->t, 3, i * 2 + 3, TAB_CENTER, q, NULL); - - df = gs0->n + gs1->n - 2.0; - tab_double (self->t, 5, i * 2 + 3, TAB_RIGHT, df, NULL); - - pooled_variance = (gs0->n * pow2 (gs0->s_std_dev) - + gs1->n *pow2 (gs1->s_std_dev)) / df ; - - t = (gs0->mean - gs1->mean) / sqrt (pooled_variance); - t /= sqrt ((gs0->n + gs1->n) / (gs0->n * gs1->n)); - - tab_double (self->t, 4, i * 2 + 3, TAB_RIGHT, t, NULL); - - p = gsl_cdf_tdist_P (t, df); - q = gsl_cdf_tdist_Q (t, df); - - tab_double (self->t, 6, i * 2 + 3, TAB_RIGHT, 2.0 * (t > 0 ? q : p), - NULL); - - mean_diff = gs0->mean - gs1->mean; - tab_double (self->t, 7, i * 2 + 3, TAB_RIGHT, mean_diff, NULL); - - - std_err_diff = sqrt (pow2 (gs0->se_mean) + pow2 (gs1->se_mean)); - tab_double (self->t, 8, i * 2 + 3, TAB_RIGHT, std_err_diff, NULL); - - /* Now work out the confidence interval */ - q = (1 - proc->criteria)/2.0; /* 2-tailed test */ - - t = gsl_cdf_tdist_Qinv (q, df); - tab_double (self->t, 9, i * 2 + 3, TAB_RIGHT, - mean_diff - t * std_err_diff, NULL); - - tab_double (self->t, 10, i * 2 + 3, TAB_RIGHT, - mean_diff + t * std_err_diff, NULL); - - - /* Now for the \sigma_1 != \sigma_2 case */ - tab_text (self->t, 1, i * 2 + 3 + 1, - TAB_LEFT, _("Equal variances not assumed")); - - se2 = ((pow2 (gs0->s_std_dev) / (gs0->n - 1)) + - (pow2 (gs1->s_std_dev) / (gs1->n - 1))); - - t = mean_diff / sqrt (se2); - tab_double (self->t, 4, i * 2 + 3 + 1, TAB_RIGHT, t, NULL); - - df = pow2 (se2) / ((pow2 (pow2 (gs0->s_std_dev) / (gs0->n - 1)) - / (gs0->n - 1)) - + (pow2 (pow2 (gs1->s_std_dev) / (gs1->n - 1)) - / (gs1->n - 1))); - tab_double (self->t, 5, i * 2 + 3 + 1, TAB_RIGHT, df, NULL); - - p = gsl_cdf_tdist_P (t, df); - q = gsl_cdf_tdist_Q (t, df); - - tab_double (self->t, 6, i * 2 + 3 + 1, TAB_RIGHT, 2.0 * (t > 0 ? q : p), - NULL); - - /* Now work out the confidence interval */ - q = (1 - proc->criteria) / 2.0; /* 2-tailed test */ - - t = gsl_cdf_tdist_Qinv (q, df); - - tab_double (self->t, 7, i * 2 + 3 + 1, TAB_RIGHT, mean_diff, NULL); - tab_double (self->t, 8, i * 2 + 3 + 1, TAB_RIGHT, std_err_diff, NULL); - tab_double (self->t, 9, i * 2 + 3 + 1, TAB_RIGHT, - mean_diff - t * std_err_diff, NULL); - tab_double (self->t, 10, i * 2 + 3 + 1, TAB_RIGHT, - mean_diff + t * std_err_diff, NULL); - } -} - -/* Initialize the paired samples trbox */ -static void -trbox_paired_init (struct trbox *self, struct t_test_proc *proc) -{ - const int hsize=10; - const int vsize=proc->n_pairs+ 3; - - self->populate = trbox_paired_populate; - - trbox_base_init (self, proc->n_pairs, hsize); - tab_title (self->t, _("Paired Samples Test")); - tab_hline (self->t, TAL_1, 2, 6, 1); - tab_vline (self->t, TAL_2, 2, 0, vsize - 1); - tab_joint_text (self->t, 2, 0, 6, 0, TAB_CENTER, _("Paired Differences")); - tab_box (self->t, -1, -1, -1, TAL_1, 2, 1, 6, vsize - 1); - tab_box (self->t, -1, -1, -1, TAL_1, 6, 0, hsize - 1, vsize - 1); - tab_hline (self->t, TAL_1, 5, 6, 2); - tab_vline (self->t, TAL_GAP, 6, 0, 1); - - tab_joint_text_format (self->t, 5, 1, 6, 1, TAB_CENTER, - _("%g%% Confidence Interval of the Difference"), - proc->criteria*100.0); - - tab_text (self->t, 2, 2, TAB_CENTER | TAT_TITLE, _("Mean")); - tab_text (self->t, 3, 2, TAB_CENTER | TAT_TITLE, _("Std. Deviation")); - tab_text (self->t, 4, 2, TAB_CENTER | TAT_TITLE, _("Std. Error Mean")); - tab_text (self->t, 5, 2, TAB_CENTER | TAT_TITLE, _("Lower")); - tab_text (self->t, 6, 2, TAB_CENTER | TAT_TITLE, _("Upper")); - tab_text (self->t, 7, 2, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (self->t, 8, 2, TAB_CENTER | TAT_TITLE, _("df")); - tab_text (self->t, 9, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)")); -} - -/* Populate the paired samples trbox */ -static void -trbox_paired_populate (struct trbox *trb, - struct t_test_proc *proc) -{ - int i; - - for (i = 0; i < proc->n_pairs; i++) - { - struct pair *pair = &proc->pairs[i]; - double p, q; - double se_mean; - - double n = pair->n; - double t; - double df = n - 1; - - tab_text_format (trb->t, 0, i + 3, TAB_LEFT, _("Pair %d"), i); - tab_text_format (trb->t, 1, i + 3, TAB_LEFT, "%s - %s", - var_get_name (pair->v[0]), - var_get_name (pair->v[1])); - tab_double (trb->t, 2, i + 3, TAB_RIGHT, pair->mean_diff, NULL); - tab_double (trb->t, 3, i + 3, TAB_RIGHT, pair->std_dev_diff, NULL); - - /* SE Mean */ - se_mean = pair->std_dev_diff / sqrt (n); - tab_double (trb->t, 4, i + 3, TAB_RIGHT, se_mean, NULL); - - /* Now work out the confidence interval */ - q = (1 - proc->criteria) / 2.0; /* 2-tailed test */ - - t = gsl_cdf_tdist_Qinv (q, df); - - tab_double (trb->t, 5, i + 3, TAB_RIGHT, - pair->mean_diff - t * se_mean, NULL); - tab_double (trb->t, 6, i + 3, TAB_RIGHT, - pair->mean_diff + t * se_mean, NULL); - - t = ((pair->mean[0] - pair->mean[1]) - / sqrt ((pow2 (pair->s_std_dev[0]) + pow2 (pair->s_std_dev[1]) - - (2 * pair->correlation - * pair->s_std_dev[0] * pair->s_std_dev[1])) - / (n - 1))); - - tab_double (trb->t, 7, i + 3, TAB_RIGHT, t, NULL); - - /* Degrees of freedom */ - tab_double (trb->t, 8, i + 3, TAB_RIGHT, df, &proc->weight_format); - - p = gsl_cdf_tdist_P (t,df); - q = gsl_cdf_tdist_Q (t,df); - - tab_double (trb->t, 9, i + 3, TAB_RIGHT, 2.0 * (t > 0 ? q : p), NULL); - } -} - -/* Initialize the one sample trbox */ -static void -trbox_one_sample_init (struct trbox *self, struct t_test_proc *proc) -{ - const int hsize = 7; - const int vsize = proc->n_vars + 3; - - self->populate = trbox_one_sample_populate; - - trbox_base_init (self, proc->n_vars, hsize); - tab_title (self->t, _("One-Sample Test")); - tab_hline (self->t, TAL_1, 1, hsize - 1, 1); - tab_vline (self->t, TAL_2, 1, 0, vsize - 1); - - tab_joint_text_format (self->t, 1, 0, hsize - 1, 0, TAB_CENTER, - _("Test Value = %f"), proc->testval); - - tab_box (self->t, -1, -1, -1, TAL_1, 1, 1, hsize - 1, vsize - 1); - - - tab_joint_text_format (self->t, 5, 1, 6, 1, TAB_CENTER, - _("%g%% Confidence Interval of the Difference"), - proc->criteria * 100.0); - - tab_vline (self->t, TAL_GAP, 6, 1, 1); - tab_hline (self->t, TAL_1, 5, 6, 2); - tab_text (self->t, 1, 2, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (self->t, 2, 2, TAB_CENTER | TAT_TITLE, _("df")); - tab_text (self->t, 3, 2, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)")); - tab_text (self->t, 4, 2, TAB_CENTER | TAT_TITLE, _("Mean Difference")); - tab_text (self->t, 5, 2, TAB_CENTER | TAT_TITLE, _("Lower")); - tab_text (self->t, 6, 2, TAB_CENTER | TAT_TITLE, _("Upper")); -} - -/* Populate the one sample trbox */ -static void -trbox_one_sample_populate (struct trbox *trb, struct t_test_proc *proc) -{ - int i; - - assert (trb->t); - - for (i = 0; i < proc->n_vars; i++) - { - double t; - double p, q; - double df; - struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs; - - tab_text (trb->t, 0, i + 3, TAB_LEFT, var_get_name (proc->vars[i])); - - t = (gs->mean - proc->testval) * sqrt (gs->n) / gs->std_dev; - - tab_double (trb->t, 1, i + 3, TAB_RIGHT, t, NULL); - - /* degrees of freedom */ - df = gs->n - 1; - - tab_double (trb->t, 2, i + 3, TAB_RIGHT, df, &proc->weight_format); - - p = gsl_cdf_tdist_P (t, df); - q = gsl_cdf_tdist_Q (t, df); - - /* Multiply by 2 to get 2-tailed significance, makeing sure we've got - the correct tail*/ - tab_double (trb->t, 3, i + 3, TAB_RIGHT, 2.0 * (t > 0 ? q : p), NULL); - tab_double (trb->t, 4, i + 3, TAB_RIGHT, gs->mean_diff, NULL); - - - q = (1 - proc->criteria) / 2.0; /* 2-tailed test */ - t = gsl_cdf_tdist_Qinv (q, df); - - tab_double (trb->t, 5, i + 3, TAB_RIGHT, - gs->mean_diff - t * gs->se_mean, NULL); - tab_double (trb->t, 6, i + 3, TAB_RIGHT, - gs->mean_diff + t * gs->se_mean, NULL); - } -} - -/* Base initializer for the generalized trbox */ -static void -trbox_base_init (struct trbox *self, size_t data_rows, int cols) -{ - const size_t rows = 3 + data_rows; - - self->finalize = trbox_base_finalize; - self->t = tab_create (cols, rows); - tab_headers (self->t, 0, 0, 3, 0); - tab_box (self->t, TAL_2, TAL_2, TAL_0, TAL_0, 0, 0, cols - 1, rows - 1); - tab_hline (self->t, TAL_2, 0, cols- 1, 3); -} - -/* Base finalizer for the trbox */ -static void -trbox_base_finalize (struct trbox *trb) -{ - tab_submit (trb->t); -} - -/* Create, populate and submit the Paired Samples Correlation box */ -static void -pscbox (struct t_test_proc *proc) -{ - const int rows=1+proc->n_pairs; - const int cols=5; - int i; - - struct tab_table *table; - - table = tab_create (cols, rows); - - tab_headers (table, 0, 0, 1, 0); - tab_box (table, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, cols - 1, rows - 1); - tab_hline (table, TAL_2, 0, cols - 1, 1); - tab_vline (table, TAL_2, 2, 0, rows - 1); - tab_title (table, _("Paired Samples Correlations")); - - /* column headings */ - tab_text (table, 2, 0, TAB_CENTER | TAT_TITLE, _("N")); - tab_text (table, 3, 0, TAB_CENTER | TAT_TITLE, _("Correlation")); - tab_text (table, 4, 0, TAB_CENTER | TAT_TITLE, _("Sig.")); - - for (i = 0; i < proc->n_pairs; i++) - { - struct pair *pair = &proc->pairs[i]; - - /* row headings */ - tab_text_format (table, 0, i + 1, TAB_LEFT | TAT_TITLE, - _("Pair %d"), i); - tab_text_format (table, 1, i + 1, TAB_LEFT | TAT_TITLE, - _("%s & %s"), - var_get_name (pair->v[0]), - var_get_name (pair->v[1])); - - /* row data */ - tab_double (table, 2, i + 1, TAB_RIGHT, pair->n, &proc->weight_format); - tab_double (table, 3, i + 1, TAB_RIGHT, pair->correlation, NULL); - - tab_double (table, 4, i + 1, TAB_RIGHT, - 2.0 * significance_of_correlation (pair->correlation, pair->n), NULL); - } - - tab_submit (table); -} - -/* Calculation Implementation */ - -/* Calculations common to all variants of the T test. */ -static void -common_calc (const struct dictionary *dict, - struct t_test_proc *proc, - struct casereader *reader) -{ - struct ccase *c; - int i; - - for (i = 0; i < proc->n_vars; i++) - { - struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs; - gs->sum = 0; - gs->n = 0; - gs->ssq = 0; - gs->sum_diff = 0; - } - - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) - { - double weight = dict_get_case_weight (dict, c, NULL); - - /* Listwise has to be implicit if the independent variable - is missing ?? */ - if (proc->mode == T_IND_SAMPLES) - { - if (var_is_value_missing (proc->indep_var, - case_data (c, proc->indep_var), - proc->exclude)) - continue; - } - - for (i = 0; i < proc->n_vars; i++) - { - const struct variable *v = proc->vars[i]; - const union value *val = case_data (c, v); - - if (!var_is_value_missing (v, val, proc->exclude)) - { - struct group_statistics *gs; - gs = &group_proc_get (v)->ugs; - - gs->n += weight; - gs->sum += weight * val->f; - gs->ssq += weight * pow2 (val->f); - } - } - } - casereader_destroy (reader); - - for (i = 0; i < proc->n_vars; i++) - { - struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs; - - gs->mean = gs->sum / gs->n; - gs->s_std_dev = sqrt (((gs->ssq / gs->n) - pow2 (gs->mean))); - gs->std_dev = sqrt (gs->n / (gs->n- 1) - * ((gs->ssq / gs->n) - pow2 (gs->mean))); - gs->se_mean = gs->std_dev / sqrt (gs->n); - gs->mean_diff = gs->sum_diff / gs->n; - } -} - -/* Calculations for one sample T test. */ -static int -one_sample_calc (const struct dictionary *dict, struct t_test_proc *proc, - struct casereader *reader) -{ - struct ccase *c; - int i; - - for (i = 0; i < proc->n_vars; i++) - { - struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs; - gs->sum_diff = 0; - } - - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) - { - double weight = dict_get_case_weight (dict, c, NULL); - for (i = 0; i < proc->n_vars; i++) - { - const struct variable *v = proc->vars[i]; - struct group_statistics *gs = &group_proc_get (v)->ugs; - const union value *val = case_data (c, v); - if (!var_is_value_missing (v, val, proc->exclude)) - gs->sum_diff += weight * (val->f - proc->testval); - } - } - - for (i = 0; i < proc->n_vars; i++) - { - struct group_statistics *gs = &group_proc_get (proc->vars[i])->ugs; - gs->mean_diff = gs->sum_diff / gs->n; - } - - casereader_destroy (reader); - - return 0; -} - -static int -paired_calc (const struct dictionary *dict, struct t_test_proc *proc, - struct casereader *reader) -{ - struct ccase *c; - int i; - - for (i = 0; i < proc->n_pairs; i++) - { - struct pair *pair = &proc->pairs[i]; - pair->n = 0; - pair->sum[0] = pair->sum[1] = 0; - pair->ssq[0] = pair->ssq[1] = 0; - pair->sum_of_prod = 0; - pair->correlation = 0; - pair->sum_of_diffs = 0; - pair->ssq_diffs = 0; - } - - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) - { - double weight = dict_get_case_weight (dict, c, NULL); - for (i = 0; i < proc->n_pairs; i++) - { - struct pair *pair = &proc->pairs[i]; - const struct variable *v0 = pair->v[0]; - const struct variable *v1 = pair->v[1]; - - const union value *val0 = case_data (c, v0); - const union value *val1 = case_data (c, v1); - - if (!var_is_value_missing (v0, val0, proc->exclude) - && !var_is_value_missing (v1, val1, proc->exclude)) - { - pair->n += weight; - pair->sum[0] += weight * val0->f; - pair->sum[1] += weight * val1->f; - pair->ssq[0] += weight * pow2 (val0->f); - pair->ssq[1] += weight * pow2 (val1->f); - pair->sum_of_prod += weight * val0->f * val1->f; - pair->sum_of_diffs += weight * (val0->f - val1->f); - pair->ssq_diffs += weight * pow2 (val0->f - val1->f); - } - } - } - - for (i = 0; i < proc->n_pairs; i++) - { - struct pair *pair = &proc->pairs[i]; - const double n = pair->n; - int j; - - for (j=0; j < 2; j++) - { - pair->mean[j] = pair->sum[j] / n; - pair->s_std_dev[j] = sqrt ((pair->ssq[j] / n - - pow2 (pair->mean[j]))); - pair->std_dev[j] = sqrt (n / (n- 1) * (pair->ssq[j] / n - - pow2 (pair->mean[j]))); - } - - pair->correlation = (pair->sum_of_prod / pair->n - - pair->mean[0] * pair->mean[1]); - /* correlation now actually contains the covariance */ - pair->correlation /= pair->std_dev[0] * pair->std_dev[1]; - pair->correlation *= pair->n / (pair->n - 1); - - pair->mean_diff = pair->sum_of_diffs / n; - pair->std_dev_diff = sqrt (n / (n - 1) * ((pair->ssq_diffs / n) - - pow2 (pair->mean_diff))); - } - - casereader_destroy (reader); - return 0; -} - -static int -group_calc (const struct dictionary *dict, struct t_test_proc *proc, - struct casereader *reader) -{ - struct ccase *c; - int i; - - for (i = 0; i < proc->n_vars; i++) - { - struct group_proc *ttpr = group_proc_get (proc->vars[i]); - int j; - - /* There's always 2 groups for a T - TEST */ - ttpr->n_groups = 2; - ttpr->group_hash = hsh_create (2, - (hsh_compare_func *) compare_group_binary, - (hsh_hash_func *) hash_group_binary, - (hsh_free_func *) free_group, - proc); - - for (j = 0; j < 2; j++) - { - struct group_statistics *gs = xmalloc (sizeof *gs); - gs->sum = 0; - gs->n = 0; - gs->ssq = 0; - if (proc->criterion == CMP_EQ) - gs->id = proc->g_value[j]; - else - { - if (j == 0) - gs->id.f = proc->critical_value - 1.0; - else - gs->id.f = proc->critical_value + 1.0; - } - - hsh_insert (ttpr->group_hash, gs); - } - } - - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) - { - const double weight = dict_get_case_weight (dict, c, NULL); - const union value *gv; - - if (var_is_value_missing (proc->indep_var, - case_data (c, proc->indep_var), proc->exclude)) - continue; - - gv = case_data (c, proc->indep_var); - for (i = 0; i < proc->n_vars; i++) - { - const struct variable *var = proc->vars[i]; - const union value *val = case_data (c, var); - struct hsh_table *grp_hash = group_proc_get (var)->group_hash; - struct group_statistics *gs = hsh_find (grp_hash, gv); - - /* If the independent variable doesn't match either of the values - for this case then move on to the next case. */ - if (gs == NULL) - break; - - if (!var_is_value_missing (var, val, proc->exclude)) - { - gs->n += weight; - gs->sum += weight * val->f; - gs->ssq += weight * pow2 (val->f); - } - } - } - - for (i = 0; i < proc->n_vars; i++) - { - const struct variable *var = proc->vars[i]; - struct hsh_table *grp_hash = group_proc_get (var)->group_hash; - struct hsh_iterator g; - struct group_statistics *gs; - int count = 0; - - for (gs = hsh_first (grp_hash, &g); gs != NULL; - gs = hsh_next (grp_hash, &g)) - { - gs->mean = gs->sum / gs->n; - gs->s_std_dev = sqrt (((gs->ssq / gs->n) - pow2 (gs->mean))); - gs->std_dev = sqrt (gs->n / (gs->n- 1) - * ((gs->ssq / gs->n) - pow2 (gs->mean))); - gs->se_mean = gs->std_dev / sqrt (gs->n); - count++; - } - assert (count == 2); - } - - casereader_destroy (reader); - - return 0; -} - - -static bool -is_criteria_value (const struct ccase *c, void *aux) -{ - const struct t_test_proc *proc = aux; - const union value *val = case_data (c, proc->indep_var); - int width = var_get_width (proc->indep_var); - - if ( value_equal (val, &proc->g_value[0], width)) - return true; - - if ( value_equal (val, &proc->g_value[1], width)) - return true; - - return false; -} - -static void -calculate (struct t_test_proc *proc, - struct casereader *input, const struct dataset *ds) -{ - const struct dictionary *dict = dataset_dict (ds); - struct ssbox stat_summary_box; - struct trbox test_results_box; - struct taint *taint; - struct ccase *c; - int i; - c = casereader_peek (input, 0); - if (c == NULL) - { - casereader_destroy (input); - return; - } - output_split_file_values (ds, c); - case_unref (c); - - if (proc->listwise_missing) - input = casereader_create_filter_missing (input, - proc->vars, - proc->n_vars, - proc->exclude, NULL, NULL); - input = casereader_create_filter_weight (input, dict, NULL, NULL); - taint = taint_clone (casereader_get_taint (input)); - - common_calc (dict, proc, casereader_clone (input)); - switch (proc->mode) - { - case T_1_SAMPLE: - one_sample_calc (dict, proc, input); - break; - case T_PAIRED: - paired_calc (dict, proc, input); - break; - case T_IND_SAMPLES: - group_calc (dict, proc, casereader_clone (input)); - - for (i = 0; i < proc->n_vars; ++i) - { - struct group_proc *grp_data = group_proc_get (proc->vars[i]); - - if ( proc->criterion == CMP_EQ ) - { - input = casereader_create_filter_func (input, is_criteria_value, NULL, - proc, - NULL); - } - - grp_data->levene = levene ( input, proc->indep_var, proc->vars[i], dict_get_weight (dict), proc->exclude); - } - break; - default: - NOT_REACHED (); - } - - if (!taint_has_tainted_successor (taint)) - { - ssbox_create (&stat_summary_box, proc); - ssbox_populate (&stat_summary_box, proc); - ssbox_finalize (&stat_summary_box); - - if (proc->mode == T_PAIRED) - pscbox (proc); - - trbox_create (&test_results_box, proc); - trbox_populate (&test_results_box, proc); - trbox_finalize (&test_results_box); - } - - taint_destroy (taint); -} - -/* return 0 if G belongs to group 0, - 1 if it belongs to group 1, - 2 if it belongs to neither group */ -static int -which_group (const struct group_statistics *g, - const struct t_test_proc *proc) -{ - int width = var_get_width (proc->indep_var); - - if (0 == value_compare_3way (&g->id, &proc->g_value[0], width)) - return 0; - - if (0 == value_compare_3way (&g->id, &proc->g_value[1], width)) - return 1; - - return 2; -} - -/* Return -1 if the id of a is less than b; +1 if greater than and - 0 if equal */ -static int -compare_group_binary (const struct group_statistics *a, - const struct group_statistics *b, - const struct t_test_proc *proc) -{ - int flag_a; - int flag_b; - - if (proc->criterion == CMP_LE) - { - flag_a = (a->id.f < proc->critical_value); - flag_b = (b->id.f < proc->critical_value); - } - else - { - flag_a = which_group (a, proc); - flag_b = which_group (b, proc); - } - - if (flag_a < flag_b) - return - 1; - - return (flag_a > flag_b); -} - -/* This is a degenerate case of a hash, since it can only return three possible - values. It's really a comparison, being used as a hash function */ - -static unsigned -hash_group_binary (const struct group_statistics *g, - const struct t_test_proc *proc) -{ - return (proc->criterion == CMP_LE - ? g->id.f < proc->critical_value - : which_group (g, proc)); -} - -/* - Local Variables: - mode: c - End: -*/ diff --git a/src/math/levene.c b/src/math/levene.c index 13fad93e..fe04d294 100644 --- a/src/math/levene.c +++ b/src/math/levene.c @@ -16,165 +16,245 @@ #include -#include "math/levene.h" - +#include "levene.h" #include -#include "data/case.h" -#include "data/casereader.h" -#include "data/variable.h" -#include "libpspp/hmap.h" #include "libpspp/misc.h" +#include "libpspp/hmap.h" +#include "data/value.h" + +#include +#include struct lev { struct hmap_node node; union value group; - int width; double t_bar; double z_mean; double n; }; +typedef unsigned int hash_func (const struct levene *, const union value *v); +typedef bool cmp_func (const struct levene *, const union value *v0, const union value *v1); + +struct levene +{ + /* Width of the categorical variable */ + int gvw ; + + /* The value deviding the the groups. Valid only for dichotomous categorical variable.*/ + const union value *cutpoint; + + + /* A hashtable of struct lev objects indexed by union value */ + struct hmap hmap; + + hash_func *hash; + cmp_func *cmp; + + + /* A state variable indicating how many passes have been done */ + int pass; + + double grand_n; + double z_grand_mean; + + double denominator; +}; + + +static unsigned int +unique_hash (const struct levene *nl, const union value *val) +{ + return value_hash (val, nl->gvw, 0); +} + +static bool +unique_cmp (const struct levene *nl, const union value *val0, const union value *val1) +{ + return value_equal (val0, val1, nl->gvw); +} + +static unsigned int +cutpoint_hash (const struct levene *nl, const union value *val) +{ + int x = value_compare_3way (val, nl->cutpoint, nl->gvw); + + return (x < 0); +} + +static bool +cutpoint_cmp (const struct levene *nl, const union value *val0, const union value *val1) +{ + int x = value_compare_3way (val0, nl->cutpoint, nl->gvw); + + int y = value_compare_3way (val1, nl->cutpoint, nl->gvw); + + if ( x == 0) x = 1; + if ( y == 0) y = 1; + + return ( x == y); +} + + static struct lev * -find_group (struct hmap *map, const union value *target, int width) +find_group (const struct levene *nl, const union value *target) { struct lev *l = NULL; - unsigned int hash = value_hash (target, width, 0); - HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, hash, map) + + HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, nl->hash (nl, target), &nl->hmap) { - if (value_equal (&l->group, target, width)) + if (nl->cmp (nl, &l->group, target)) break; l = NULL; } - return l; } -double -levene (struct casereader *rx, const struct variable *gvar, - const struct variable *var, const struct variable *wv, - enum mv_class exclude) +struct levene * +levene_create (int indep_width, const union value *cutpoint) { - double numerator = 0.0; - double denominator = 0.0; - int n_groups = 0; - double z_grand_mean = 0.0; - double grand_n = 0.0; + struct levene *nl = xzalloc (sizeof *nl); + + hmap_init (&nl->hmap); + + nl->gvw = indep_width; + nl->cutpoint = cutpoint; - struct hmap map = HMAP_INITIALIZER (map); + nl->hash = cutpoint ? cutpoint_hash : unique_hash; + nl->cmp = cutpoint ? cutpoint_cmp : unique_cmp; + + return nl; +} - struct ccase *c; - struct casereader *r = casereader_clone (rx); - for (; (c = casereader_read (r)) != NULL; case_unref (c)) +/* Data accumulation. First pass */ +void +levene_pass_one (struct levene *nl, double value, double weight, const union value *gv) +{ + struct lev *lev = find_group (nl, gv); + + if ( nl->pass == 0 ) + { + nl->pass = 1; + } + assert (nl->pass == 1); + + if ( NULL == lev) { - struct lev *l = NULL; - const union value *target = case_data (c, gvar); - int width = var_get_width (gvar); - unsigned int hash = value_hash (target, width, 0); - const double x = case_data (c, var)->f; - const double weight = wv ? case_data (c, wv)->f : 1.0; - - if (var_is_value_missing (var, case_data (c, var), exclude)) - continue; - - l = find_group (&map, target, width); - - if (l == NULL) - { - l = xzalloc (sizeof *l); - value_clone (&l->group, target, width); - hmap_insert (&map, &l->node, hash); - } - - l->n += weight; - l->t_bar += x * weight; - grand_n += weight; + struct lev *l = xzalloc (sizeof *l); + value_clone (&l->group, gv, nl->gvw); + hmap_insert (&nl->hmap, &l->node, nl->hash (nl, &l->group)); + lev = l; } - casereader_destroy (r); - { - struct lev *l; - HMAP_FOR_EACH (l, struct lev, node, &map) + lev->n += weight; + lev->t_bar += value * weight; + + nl->grand_n += weight; +} + +/* Data accumulation. Second pass */ +void +levene_pass_two (struct levene *nl, double value, double weight, const union value *gv) +{ + struct lev *lev = NULL; + + if ( nl->pass == 1 ) + { + struct lev *next; + struct lev *l; + + nl->pass = 2; + + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) { l->t_bar /= l->n; } - } + } + assert (nl->pass == 2); - n_groups = hmap_count (&map); + lev = find_group (nl, gv); - r = casereader_clone (rx); - for (; (c = casereader_read (r)) != NULL; case_unref (c)) + lev->z_mean += fabs (value - lev->t_bar) * weight; + nl->z_grand_mean += fabs (value - lev->t_bar) * weight; +} + +/* Data accumulation. Third pass */ +void +levene_pass_three (struct levene *nl, double value, double weight, const union value *gv) +{ + double z; + struct lev *lev = NULL; + + if ( nl->pass == 2 ) { - struct lev *l = NULL; - const union value *target = case_data (c, gvar); - int width = var_get_width (gvar); - const double x = case_data (c, var)->f; - const double weight = wv ? case_data (c, wv)->f : 1.0; - - if (var_is_value_missing (var, case_data (c, var), exclude)) - continue; - - l = find_group (&map, target, width); - assert (l); - - l->z_mean += fabs (x - l->t_bar) * weight; - z_grand_mean += fabs (x - l->t_bar) * weight; - } - casereader_destroy (r); + struct lev *next; + struct lev *l; + + nl->pass = 3; - { - struct lev *l; - HMAP_FOR_EACH (l, struct lev, node, &map) + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) { l->z_mean /= l->n; } + + nl->z_grand_mean /= nl->grand_n; } - z_grand_mean /= grand_n; + assert (nl->pass == 3); + lev = find_group (nl, gv); + + z = fabs (value - lev->t_bar); + nl->denominator += pow2 (z - lev->z_mean) * weight; +} + - r = casereader_clone (rx); - for (; (c = casereader_read (r)) != NULL; case_unref (c)) +/* Return the value of the levene statistic */ +double +levene_calculate (struct levene *nl) +{ + struct lev *next; + struct lev *l; + + double numerator = 0.0; + double nn = 0.0; + if ( nl->pass == 3 ) { - double z; - struct lev *l; - const union value *target = case_data (c, gvar); - int width = var_get_width (gvar); - const double x = case_data (c, var)->f; - const double weight = wv ? case_data (c, wv)->f : 1.0; + nl->pass = 4; + } - if (var_is_value_missing (var, case_data (c, var), exclude)) - continue; + assert (nl->pass == 4); - l = find_group (&map, target, width); - assert (l); + nl->denominator *= hmap_count (&nl->hmap) - 1; - z = fabs (x - l->t_bar); - denominator += pow2 (z - l->z_mean) * weight; + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + numerator += l->n * pow2 (l->z_mean - nl->z_grand_mean); + nn += l->n; } - casereader_destroy (r); - denominator *= n_groups - 1; + numerator *= nn - hmap_count (&nl->hmap); + + return numerator / nl->denominator; +} - { - double grand_n = 0.0; - struct lev *next; - struct lev *l; - HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &map) - { - numerator += l->n * pow2 (l->z_mean - z_grand_mean); - grand_n += l->n; +void +levene_destroy (struct levene *nl) +{ + struct lev *next; + struct lev *l; - /* We don't need these anymore */ - free (l); - } - numerator *= grand_n - n_groups ; - hmap_destroy (&map); - } + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + value_destroy (&l->group, nl->gvw); + free (l); + } - return numerator/ denominator; + hmap_destroy (&nl->hmap); + free (nl); } diff --git a/src/math/levene.h b/src/math/levene.h index c5e8bc3b..4351ee09 100644 --- a/src/math/levene.h +++ b/src/math/levene.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2004, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,15 +17,18 @@ #if !levene_h #define levene_h 1 -struct casereader; -struct variable; +struct nl; +union value; -enum mv_class; +struct levene *levene_create (int indep_width, const union value *cutpoint); -double -levene (struct casereader *rx, const struct variable *gvar, - const struct variable *var, const struct variable *wv, enum mv_class exclude ); +void levene_pass_one (struct levene *, double value, double weight, const union value *gv); +void levene_pass_two (struct levene *, double value, double weight, const union value *gv); +void levene_pass_three (struct levene *, double value, double weight, const union value *gv); +double levene_calculate (struct levene*); + +void levene_destroy (struct levene*); #endif diff --git a/tests/language/stats/t-test.at b/tests/language/stats/t-test.at index 90b7ba9e..b97db81f 100644 --- a/tests/language/stats/t-test.at +++ b/tests/language/stats/t-test.at @@ -231,10 +231,10 @@ DEP2,F8.0 Table: Group Statistics ,INDEP,N,Mean,Std. Deviation,S.E. Mean -DEP1,1.1,5,2.00,.71,.32 -,2.1,5,4.00,.71,.32 -DEP2,1.1,5,4.00,.71,.32 -,2.1,5,2.00,.71,.32 +DEP1,1.10,5,2.00,.71,.32 +,2.10,5,4.00,.71,.32 +DEP2,1.10,5,4.00,.71,.32 +,2.10,5,2.00,.71,.32 Table: Independent Samples Test ,,Levene's Test for Equality of Variances,,t-test for Equality of Means,,,,,, @@ -285,15 +285,15 @@ DEP,F8.0 Table: Group Statistics ,INDEP,N,Mean,Std. Deviation,S.E. Mean -DEP,>=1.514,11,9.00,3.82,1.15 -,<1.514,11,8.00,2.86,.86 +DEP,>= 1.51,11,9.00,3.82,1.15 +,< 1.51,11,8.00,2.86,.86 Table: Independent Samples Test ,,Levene's Test for Equality of Variances,,t-test for Equality of Means,,,,,, ,,,,,,,,,95% Confidence Interval of the Difference, ,,F,Sig.,t,df,Sig. (2-tailed),Mean Difference,Std. Error Difference,Lower,Upper -DEP,Equal variances assumed,.17,.68,-.69,20.00,.50,-1.00,1.44,-4.00,2.00 -,Equal variances not assumed,,,-.69,18.54,.50,-1.00,1.44,-4.02,2.02 +DEP,Equal variances assumed,.17,.68,.69,20.00,.50,1.00,1.44,-2.00,4.00 +,Equal variances not assumed,,,.69,18.54,.50,1.00,1.44,-2.02,4.02 ]) AT_CLEANUP @@ -320,10 +320,10 @@ dep2,F8.0 Table: Group Statistics ,indep,N,Mean,Std. Deviation,S.E. Mean -dep1,1,3,2.50,.87,.50 -,2,2,3.25,.35,.25 -dep2,1,3,5.00,1.00,.58 -,2,2,2.00,1.41,1.00 +dep1,1.00,3,2.50,.87,.50 +,2.00,2,3.25,.35,.25 +dep2,1.00,3,5.00,1.00,.58 +,2.00,2,2.00,1.41,1.00 Table: Independent Samples Test ,,Levene's Test for Equality of Variances,,t-test for Equality of Means,,,,,, @@ -381,10 +381,10 @@ dep2,F8.0 Table: Group Statistics ,indep,N,Mean,Std. Deviation,S.E. Mean -dep1,1,3,2.50,.87,.50 -,2,3,3.50,.50,.29 -dep2,1,3,5.00,1.00,.58 -,2,3,2.00,1.00,.58 +dep1,1.00,3,2.50,.87,.50 +,2.00,3,3.50,.50,.29 +dep2,1.00,3,5.00,1.00,.58 +,2.00,3,2.00,1.00,.58 Table: Independent Samples Test ,,Levene's Test for Equality of Variances,,t-test for Equality of Means,,,,,, @@ -576,8 +576,8 @@ x,F8.0 Table: Group Statistics ,ind,N,Mean,Std. Deviation,S.E. Mean -x,1,3,2.50,.87,.50 -,2,3,3.50,.50,.29 +x,1.00,3,2.50,.87,.50 +,2.00,3,3.50,.50,.29 Table: Independent Samples Test ,,Levene's Test for Equality of Variances,,t-test for Equality of Means,,,,,, -- 2.30.2