From ec19587bd09b05588cd757533e5366d1e2fbf447 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sun, 5 Feb 2012 17:15:02 +0100 Subject: [PATCH 1/1] Added an implementation of the Jonckheere-Terpstra test --- NEWS | 4 +- src/language/stats/automake.mk | 2 + src/language/stats/jonckheere-terpstra.c | 417 +++++++++++++++++++++++ src/language/stats/jonckheere-terpstra.h | 41 +++ src/language/stats/npar.c | 46 +++ tests/language/stats/npar.at | 44 ++- 6 files changed, 551 insertions(+), 3 deletions(-) create mode 100644 src/language/stats/jonckheere-terpstra.c create mode 100644 src/language/stats/jonckheere-terpstra.h diff --git a/NEWS b/NEWS index b5c6d84b57..916c7d4848 100644 --- a/NEWS +++ b/NEWS @@ -48,8 +48,8 @@ Changes from 0.6.2 to 0.7.9: - ONEWAY: the POSTHOC subcommand is now implemented. - The following new subcommands to NPAR TESTS have been implemented: - COCHRAN, FRIEDMAN, KENDALL, KRUSKAL-WALLIS, MANN-WHITNEY, MCNEMAR, - SIGN, WILCOXON, and RUNS + COCHRAN, FRIEDMAN, JONCKHEERE-TERPSTRA, KENDALL, KRUSKAL-WALLIS, MANN-WHITNEY, + MCNEMAR, SIGN, WILCOXON, and RUNS - SET and SHOW no longer have ENDCMD, NULLINE, PROMPT, CPROMPT, and DPROMPT subcommands. The defaults are now fixed values. diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 1ca22afc85..583835498b 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -31,6 +31,8 @@ language_stats_sources = \ src/language/stats/kruskal-wallis.h \ src/language/stats/ks-one-sample.c \ src/language/stats/ks-one-sample.h \ + src/language/stats/jonckheere-terpstra.c \ + src/language/stats/jonckheere-terpstra.h \ src/language/stats/mann-whitney.c \ src/language/stats/mann-whitney.h \ src/language/stats/means.c \ diff --git a/src/language/stats/jonckheere-terpstra.c b/src/language/stats/jonckheere-terpstra.c new file mode 100644 index 0000000000..087f655b3d --- /dev/null +++ b/src/language/stats/jonckheere-terpstra.c @@ -0,0 +1,417 @@ +/* Pspp - a program for statistical analysis. + Copyright (C) 2012 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + + +#include + +#include "jonckheere-terpstra.h" + +#include +#include + +#include "data/casegrouper.h" +#include "data/casereader.h" +#include "data/casewriter.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/subcase.h" +#include "data/variable.h" +#include "libpspp/assertion.h" +#include "libpspp/hmap.h" +#include "libpspp/message.h" +#include "libpspp/misc.h" +#include "math/sort.h" +#include "output/tab.h" + +#include "gl/minmax.h" +#include "gl/xalloc.h" + +/* Returns true iff the independent variable lies in the + between val1 and val2. Regardless of which is the greater value. +*/ +static bool +include_func_bi (const struct ccase *c, void *aux) +{ + const struct n_sample_test *nst = aux; + const union value *bigger = NULL; + const union value *smaller = NULL; + + if (0 > value_compare_3way (&nst->val1, &nst->val2, var_get_width (nst->indep_var))) + { + bigger = &nst->val2; + smaller = &nst->val1; + } + else + { + smaller = &nst->val2; + bigger = &nst->val1; + } + + if (0 < value_compare_3way (smaller, case_data (c, nst->indep_var), var_get_width (nst->indep_var))) + return false; + + if (0 > value_compare_3way (bigger, case_data (c, nst->indep_var), var_get_width (nst->indep_var))) + return false; + + return true; +} + +struct group_data +{ + /* The total of the caseweights in the group */ + double cc; + + /* A casereader containing the group data. + This casereader contains just two values: + 0: The raw value of the data + 1: The cumulative caseweight + */ + struct casereader *reader; +}; + + +static double +u (const struct group_data *grp0, const struct group_data *grp1) +{ + struct ccase *c0; + + struct casereader *r0 = casereader_clone (grp0->reader); + double usum = 0; + double prev_cc0 = 0.0; + for (; (c0 = casereader_read (r0)); case_unref (c0)) + { + struct ccase *c1; + struct casereader *r1 = casereader_clone (grp1->reader); + double x0 = case_data_idx (c0, 0)->f; + double cc0 = case_data_idx (c0, 1)->f; + double w0 = cc0 - prev_cc0; + + double prev_cc1 = 0; + + for (; (c1 = casereader_read (r1)); case_unref (c1)) + { + double x1 = case_data_idx (c1, 0)->f; + double cc1 = case_data_idx (c1, 1)->f; + + if (x0 > x1) + { + /* Do nothing */ + } + else if (x0 < x1) + { + usum += w0 * (grp1->cc - prev_cc1); + case_unref (c1); + break; + } + else + { +#if 1 + usum += w0 * ( (grp1->cc - prev_cc1) / 2.0); +#else + usum += w0 * (grp1->cc - (prev_cc1 + cc) / 2.0); +#endif + case_unref (c1); + break; + } + + prev_cc1 = cc1; + } + casereader_destroy (r1); + prev_cc0 = cc0; + } + casereader_destroy (r0); + + return usum; +} + + +typedef double func_f (double e_l); + +/* + These 3 functions are used repeatedly in the calculation of the + variance of the JT statistic. + Having them explicitly defined makes the variance calculation + a lot simpler. +*/ +static double +ff1 (double e) +{ + return e * (e - 1) * (2*e + 5); +} + +static double +ff2 (double e) +{ + return e * (e - 1) * (e - 2); +} + +static double +ff3 (double e) +{ + return e * (e - 1) ; +} + +static func_f *mff[3] = + { + ff1, ff2, ff3 + }; + + +/* + This function does the following: + It creates an ordered set of *distinct* values from IR. + For each case in that set, it calls f[0..N] passing it the caseweight. + It returns the sum of f[j] in result[j]. + + result and f must be allocated prior to calling this function. + */ +static +void variance_calculation (struct casereader *ir, const struct variable *var, + const struct dictionary *dict, + func_f **f, double *result, size_t n) +{ + int i; + struct casereader *r = casereader_clone (ir); + struct ccase *c; + const struct variable *wv = dict_get_weight (dict); + const int w_idx = wv ? + var_get_case_index (wv) : + caseproto_get_n_widths (casereader_get_proto (r)) ; + + r = sort_execute_1var (r, var); + + r = casereader_create_distinct (r, var, dict_get_weight (dict)); + + for (; (c = casereader_read (r)); case_unref (c)) + { + double w = case_data_idx (c, w_idx)->f; + + for (i = 0; i < n; ++i) + result[i] += f[i] (w); + } + + casereader_destroy (r); +} + +struct jt +{ + int levels; + double n; + double obs; + double mean; + double stddev; +}; + +static void show_jt (const struct n_sample_test *nst, const struct jt *jt, const struct variable *wv); + + +void +jonckheere_terpstra_execute (const struct dataset *ds, + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact UNUSED, + double timer UNUSED) +{ + int v; + bool warn = true; + const struct dictionary *dict = dataset_dict (ds); + const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test, parent); + + struct caseproto *proto = caseproto_create (); + proto = caseproto_add_width (proto, 0); + proto = caseproto_add_width (proto, 0); + + /* If the independent variable is missing, then we ignore the case */ + input = casereader_create_filter_missing (input, + &nst->indep_var, 1, + exclude, + NULL, NULL); + + /* Remove cases with invalid weigths */ + input = casereader_create_filter_weight (input, dict, &warn, NULL); + + /* Remove all those cases which are outside the range (val1, val2) */ + input = casereader_create_filter_func (input, include_func_bi, NULL, + CONST_CAST (struct n_sample_test *, nst), NULL); + + /* Sort the data by the independent variable */ + input = sort_execute_1var (input, nst->indep_var); + + for (v = 0; v < nst->n_vars ; ++v) + { + struct jt jt; + double variance; + int g0; + double nn = 0; + int i; + double sums[3] = {0,0,0}; + double e_sum[3] = {0,0,0}; + + struct group_data *grp = NULL; + double ccsq_sum = 0; + + struct casegrouper *grouper; + struct casereader *group; + struct casereader *vreader= casereader_clone (input); + + /* Get a few values into e_sum - we'll be needing these later */ + variance_calculation (vreader, nst->vars[v], dict, mff, e_sum, 3); + + grouper = + casegrouper_create_vars (vreader, &nst->indep_var, 1); + + jt.obs = 0; + jt.levels = 0; + jt.n = 0; + for (; casegrouper_get_next_group (grouper, &group); + casereader_destroy (group) ) + { + struct casewriter *writer = autopaging_writer_create (proto); + struct ccase *c; + double cc = 0; + + group = sort_execute_1var (group, nst->vars[v]); + for (; (c = casereader_read (group)); case_unref (c)) + { + struct ccase *c_out = case_create (proto); + const union value *x = case_data (c, nst->vars[v]); + + case_data_rw_idx (c_out, 0)->f = x->f; + + cc += dict_get_case_weight (dict, c, &warn); + case_data_rw_idx (c_out, 1)->f = cc; + casewriter_write (writer, c_out); + } + + grp = xrealloc (grp, sizeof *grp * (jt.levels + 1)); + + grp[jt.levels].reader = casewriter_make_reader (writer); + grp[jt.levels].cc = cc; + + jt.levels++; + jt.n += cc; + ccsq_sum += pow2 (cc); + } + + casegrouper_destroy (grouper); + + for (g0 = 0; g0 < jt.levels; ++g0) + { + int g1; + for (g1 = g0 +1 ; g1 < jt.levels; ++g1) + { + double uu = u (&grp[g0], &grp[g1]); + jt.obs += uu; + } + nn += pow2 (grp[g0].cc) * (2 * grp[g0].cc + 3); + + for (i = 0; i < 3; ++i) + sums[i] += mff[i] (grp[g0].cc); + + casereader_destroy (grp[g0].reader); + } + + free (grp); + + variance = (mff[0](jt.n) - sums[0] - e_sum[0]) / 72.0; + variance += sums[1] * e_sum[1] / (36.0 * mff[1] (jt.n)); + variance += sums[2] * e_sum[2] / (8.0 * mff[2] (jt.n)); + + jt.stddev = sqrt (variance); + + jt.mean = (pow2 (jt.n) - ccsq_sum) / 4.0; + + show_jt (nst, &jt, dict_get_weight (dict)); + } + + casereader_destroy (input); + caseproto_unref (proto); +} + + + +#include "gettext.h" +#define _(msgid) gettext (msgid) + +static void +show_jt (const struct n_sample_test *nst, const struct jt *jt, const struct variable *wv) +{ + int i; + const int row_headers = 1; + const int column_headers = 1; + const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : & F_8_0; + + struct tab_table *table = + tab_create (row_headers + 7, column_headers + nst->n_vars); + + tab_headers (table, row_headers, 0, column_headers, 0); + + tab_title (table, _("Jonckheere-Terpstra Test")); + + /* Vertical lines inside the box */ + tab_box (table, 1, 0, -1, TAL_1, + row_headers, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + /* Box around the table */ + tab_box (table, TAL_2, TAL_2, -1, -1, + 0, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers); + tab_vline (table, TAL_2, row_headers, 0, tab_nr (table) - 1); + + tab_text_format (table, 1, 0, TAT_TITLE | TAB_CENTER, + _("Number of levels in %s"), + var_to_string (nst->indep_var)); + tab_text (table, 2, 0, TAT_TITLE | TAB_CENTER, _("N")); + tab_text (table, 3, 0, TAT_TITLE | TAB_CENTER, _("Observed J-T Statistic")); + tab_text (table, 4, 0, TAT_TITLE | TAB_CENTER, _("Mean J-T Statistic")); + tab_text (table, 5, 0, TAT_TITLE | TAB_CENTER, _("Std. Deviation of J-T Statistic")); + tab_text (table, 6, 0, TAT_TITLE | TAB_CENTER, _("Std. J-T Statistic")); + tab_text (table, 7, 0, TAT_TITLE | TAB_CENTER, _("Asymp. Sig. (2-tailed)")); + + + for (i = 0; i < nst->n_vars; ++i) + { + tab_text (table, 0, i + row_headers, TAT_TITLE, + var_to_string (nst->vars[i]) ); + + tab_double (table, 1, i + row_headers, TAT_TITLE, + jt[0].levels, &F_8_0); + + tab_double (table, 2, i + row_headers, TAT_TITLE, + jt[0].n, wfmt); + + tab_double (table, 3, i + row_headers, TAT_TITLE, + jt[0].obs, 0); + + tab_double (table, 4, i + row_headers, TAT_TITLE, + jt[0].mean, 0); + + tab_double (table, 5, i + row_headers, TAT_TITLE, + jt[0].stddev, 0); + + double std_jt = (jt[0].obs - jt[0].mean) / jt[0].stddev; + tab_double (table, 6, i + row_headers, TAT_TITLE, + std_jt, 0); + + tab_double (table, 7, i + row_headers, TAT_TITLE, + 2.0 * gsl_cdf_ugaussian_P (std_jt), 0); + } + + tab_submit (table); +} diff --git a/src/language/stats/jonckheere-terpstra.h b/src/language/stats/jonckheere-terpstra.h new file mode 100644 index 0000000000..5bd54b1473 --- /dev/null +++ b/src/language/stats/jonckheere-terpstra.h @@ -0,0 +1,41 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2012 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#if !jonckheere_terpstra_h +#define jonckheere_terpstra_h 1 + +#include +#include +#include "data/case.h" +#include "language/stats/npar.h" + +struct jonckheere_terpstra_test +{ + struct two_sample_test parent; +}; + +struct casereader; +struct dataset; + +void jonckheere_terpstra_execute (const struct dataset *ds, + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact, + double timer + ); + +#endif diff --git a/src/language/stats/npar.c b/src/language/stats/npar.c index 3260117d4f..8d40faad01 100644 --- a/src/language/stats/npar.c +++ b/src/language/stats/npar.c @@ -37,6 +37,7 @@ #include "language/stats/ks-one-sample.h" #include "language/stats/cochran.h" #include "language/stats/friedman.h" +#include "language/stats/jonckheere-terpstra.h" #include "language/stats/kruskal-wallis.h" #include "language/stats/mann-whitney.h" #include "language/stats/mcnemar.h" @@ -95,6 +96,7 @@ struct cmd_npar_tests int mann_whitney; int mcnemar; int median; + int jonckheere_terpstra; int missing; int method; int statistics; @@ -138,6 +140,7 @@ static int npar_cochran (struct lexer *, struct dataset *, struct npar_specs *); static int npar_wilcoxon (struct lexer *, struct dataset *, struct npar_specs *); static int npar_sign (struct lexer *, struct dataset *, struct npar_specs *); static int npar_kruskal_wallis (struct lexer *, struct dataset *, struct npar_specs *); +static int npar_jonckheere_terpstra (struct lexer *, struct dataset *, struct npar_specs *); static int npar_mann_whitney (struct lexer *, struct dataset *, struct npar_specs *); static int npar_mcnemar (struct lexer *, struct dataset *, struct npar_specs *); static int npar_median (struct lexer *, struct dataset *, struct npar_specs *); @@ -159,6 +162,7 @@ parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests npt->friedman = 0; npt->kruskal_wallis = 0; npt->mann_whitney = 0; + npt->jonckheere_terpstra = 0; npt->mcnemar = 0; npt->runs = 0; npt->sign = 0; @@ -288,6 +292,24 @@ parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests NOT_REACHED (); } } + else if (lex_match_phrase (lexer, "J-T") || + lex_match_phrase (lexer, "JONCKHEERE-TERPSTRA")) + { + lex_match (lexer, T_EQUALS); + npt->jonckheere_terpstra++; + switch (npar_jonckheere_terpstra (lexer, ds, nps)) + { + case 0: + goto lossage; + case 1: + break; + case 2: + lex_error (lexer, NULL); + goto lossage; + default: + NOT_REACHED (); + } + } else if (lex_match_phrase (lexer, "K-W") || lex_match_phrase (lexer, "KRUSKAL-WALLIS")) { @@ -1317,6 +1339,30 @@ npar_mcnemar (struct lexer *lexer, struct dataset *ds, } +static int +npar_jonckheere_terpstra (struct lexer *lexer, struct dataset *ds, + struct npar_specs *specs) +{ + struct n_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp)); + struct npar_test *nt = &tp->parent; + + nt->insert_variables = n_sample_insert_variables; + + nt->execute = jonckheere_terpstra_execute; + + if (!parse_n_sample_related_test (lexer, dataset_dict (ds), + tp, specs->pool) ) + return 0; + + specs->n_tests++; + specs->test = pool_realloc (specs->pool, + specs->test, + sizeof (*specs->test) * specs->n_tests); + specs->test[specs->n_tests - 1] = nt; + + return 1; +} + static int npar_kruskal_wallis (struct lexer *lexer, struct dataset *ds, struct npar_specs *specs) diff --git a/tests/language/stats/npar.at b/tests/language/stats/npar.at index f4cec2edb9..4fa26ecf47 100644 --- a/tests/language/stats/npar.at +++ b/tests/language/stats/npar.at @@ -1582,4 +1582,46 @@ xx,9,7.000,NaN,1,NaN Years expected,9,7.000,.900,1,.343 ]) -AT_CLEANUP \ No newline at end of file +AT_CLEANUP + + +AT_SETUP([NPAR TESTS Jonckheere-Terpstra]) + +AT_DATA([jt.sps], [dnl +set format = F12.3. +data list notable list /x * g * w *. +begin data. +52 2 2 +58 2 1 +60 2 1 +62 2 1 +58 0 1 +44 2 1 +46 2 1 +14 3 1 +32 2 1 +16 3 1 +56 2 1 +26 3 1 +40 3 2 +50 4 1 +6 5 1 +34 2 3 +36 2 2 +40 2 2 +50 2 1 +end data. + +weight by w. + +npar test /jonckheere-terpstra = x by g (5, 2). +]) + + +AT_CHECK([pspp -O format=csv jt.sps], [0], [dnl +Table: Jonckheere-Terpstra Test +,Number of levels in g,N,Observed J-T Statistic,Mean J-T Statistic,Std. Deviation of J-T Statistic,Std. J-T Statistic,Asymp. Sig. (2-tailed) +x,4,24.000,29.500,65.000,15.902,-2.232,.026 +]) + +AT_CLEANUP -- 2.30.2