From: John Darrington
Date: Sun, 28 Aug 2011 10:39:51 +0000 (+0200)
Subject: Added an implementation of the median test
X-Git-Tag: v0.7.9~149
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1babb2ecc20dcae51f6cc639d39cce197d31dbdc;p=pspp-builds.git

Added an implementation of the median test
---

diff --git a/doc/statistics.texi b/doc/statistics.texi
index b1121b49..18cc79e7 100644
--- a/doc/statistics.texi
+++ b/doc/statistics.texi
@@ -691,6 +691,7 @@ is used.
 * KRUSKAL-WALLIS:: Kruskal-Wallis Test
 * MANN-WHITNEY:: Mann Whitney U Test
 * MCNEMAR:: McNemar Test
+* MEDIAN:: Median Test
 * RUNS:: Runs Test
 * SIGN:: The Sign Test
 * WILCOXON:: Wilcoxon Signed Ranks Test
@@ -931,6 +932,31 @@ The data in each variable must be dichotomous. If there are more than
 two distinct variables an error will occur and the test will not be run.
+
+@node MEDIAN
+@subsection Median Test
+@vindex MEDIAN
+@cindex Median test
+
+@display
+ [ /MEDIAN [(value)] = varlist BY variable (value1, value2) ]
+@end display
+
+The median test is used to test whether independent samples come from
+populations with a common median.
+The median of the populations against which the samples are to be tested
+may be given in parentheses immediately after the
+/MEDIAN subcommand.  If it is not given, the median will be imputed from the
+union of all the samples.
+
+The variables of the samples to be tested should immediately follow the @samp{=} sign.  The
+keyword @code{BY} must come next, followed by the grouping variable.  Two values
+in parentheses should follow.  If the first value is greater than the second,
+then a 2-sample test is performed using these two values to determine the groups.
+If, however, the first value is less than the second, then a @i{k}-sample test is
+conducted and the group values used are all values encountered which lie in the
+range [@var{value1},@var{value2}].
+
+
 @node RUNS
 @subsection Runs Test
 @vindex RUNS
diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk
index 4b4510e9..b44ebb2d 100644
--- a/src/language/stats/automake.mk
+++ b/src/language/stats/automake.mk
@@ -36,6 +36,8 @@ language_stats_sources = \
 	src/language/stats/mann-whitney.h \
 	src/language/stats/mcnemar.c \
 	src/language/stats/mcnemar.h \
+	src/language/stats/median.c \
+	src/language/stats/median.h \
 	src/language/stats/npar.c \
 	src/language/stats/npar.h \
 	src/language/stats/npar-summary.c \
diff --git a/src/language/stats/median.c b/src/language/stats/median.c
new file mode 100644
index 00000000..2db57ab7
--- /dev/null
+++ b/src/language/stats/median.c
@@ -0,0 +1,461 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include "median.h"
+
+#include <gsl/gsl_cdf.h>
+
+#include "data/format.h"
+#include "libpspp/cast.h"
+
+#include "data/variable.h"
+#include "data/case.h"
+#include "data/dictionary.h"
+#include "data/dataset.h"
+#include "data/casereader.h"
+#include "data/casewriter.h"
+#include "data/subcase.h"
+
+
+#include "data/casereader.h"
+#include "math/percentiles.h"
+
+#include "math/sort.h"
+
+#include "libpspp/hmap.h"
+#include "libpspp/array.h"
+#include "libpspp/str.h"
+#include "data/value.h"
+#include "libpspp/misc.h"
+
+#include "output/tab.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+
+struct val_node
+{
+  struct hmap_node node;
+  union value val;
+  casenumber le;
+  casenumber gt;
+};
+
+struct results
+{
+  const struct variable *var;
+  struct val_node **sorted_array;
+  double n;
+  double median;
+  double chisq;
+};
+
+
+
+static int
+val_node_cmp_3way (const void *a_, const void *b_, const void *aux)
+{
+  const struct variable *indep_var = aux;
+  const struct val_node *const *a = a_;
+  const struct val_node *const *b = b_;
+
+  return value_compare_3way (&(*a)->val, &(*b)->val, var_get_width (indep_var));
+}
+
+static void
+show_frequencies (const struct n_sample_test *nst, const struct results *results, int n_vals, const struct dictionary *);
+
+static void
+show_test_statistics (const struct n_sample_test *nst, const struct results *results, int, const struct dictionary *);
+
+
+static struct val_node *
+find_value (const struct hmap *map, const union value *val,
+            const struct variable *var)
+{
+  struct val_node *foo = NULL;
+  size_t hash = value_hash (val, var_get_width (var), 0);
+  HMAP_FOR_EACH_WITH_HASH (foo, struct val_node, node, hash, map)
+    if (value_equal (val, &foo->val, var_get_width (var)))
+      break;
+
+  return foo;
+}
+
+void
+median_execute (const struct dataset *ds,
+                struct casereader *input,
+                enum mv_class exclude,
+                const struct npar_test *test,
+                bool exact UNUSED,
+                double timer UNUSED)
+{
+  const struct dictionary *dict = dataset_dict (ds);
+  const struct variable *wvar = dict_get_weight (dict);
+  bool warn = true;
+  int v;
+  const struct median_test *mt = UP_CAST (test, const struct median_test,
+                                          parent.parent);
+
+  const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test,
+                                             parent);
+
+  const bool n_sample_test = (value_compare_3way (&nst->val2, &nst->val1,
+                                                  var_get_width (nst->indep_var)) > 0);
+
+  struct results *results = xcalloc (nst->n_vars, sizeof (*results));
+  int n_vals = 0;
+  for (v = 0; v < nst->n_vars; ++v)
+    {
+      double count = 0;
+      double cc = 0;
+      double median = mt->median;
+      const struct variable *var = nst->vars[v];
+      struct ccase *c;
+      struct hmap map = HMAP_INITIALIZER (map);
+      struct casereader *r = casereader_clone (input);
+
+
+
+      if (n_sample_test == false)
+        {
+          struct val_node *vn = xzalloc (sizeof *vn);
+          value_clone (&vn->val, &nst->val1, var_get_width (nst->indep_var));
+          hmap_insert (&map, &vn->node, value_hash (&nst->val1,
+                                                    var_get_width (nst->indep_var), 0));
+
+          vn = xzalloc (sizeof *vn);
+          value_clone (&vn->val, &nst->val2, var_get_width (nst->indep_var));
+          hmap_insert (&map, &vn->node, value_hash (&nst->val2,
+                                                    var_get_width (nst->indep_var), 0));
+        }
+
+      if ( median == SYSMIS)
+        {
+          struct percentile *ptl;
+          struct order_stats *os;
+
+          struct casereader *rr;
+          struct subcase sc;
+          struct casewriter *writer;
+          subcase_init_var (&sc, var, SC_ASCEND);
+          rr = casereader_clone (r);
+          writer = sort_create_writer (&sc, casereader_get_proto (rr));
+
+          for (; (c = casereader_read (rr)) != NULL; )
+            {
+              if ( var_is_value_missing (var, case_data (c, var), exclude))
+                {
+                  case_unref (c);
+                  continue;
+                }
+
+              cc += dict_get_case_weight (dict, c, &warn);
+              casewriter_write (writer, c);
+            }
+          subcase_destroy (&sc);
+          casereader_destroy (rr);
+
+          rr = casewriter_make_reader (writer);
+
+          ptl = percentile_create (0.5, cc);
+          os = &ptl->parent;
+
+          order_stats_accumulate (&os, 1,
+                                  rr,
+                                  wvar,
+                                  var,
+                                  exclude);
+
+          median = percentile_calculate (ptl, PC_HAVERAGE);
+          statistic_destroy (&ptl->parent.parent);
+        }
+
+      results[v].median = median;
+
+
+      for (; (c = casereader_read (r)) != NULL; case_unref (c))
+        {
+          struct val_node *vn;
+          const double weight = dict_get_case_weight (dict, c, &warn);
+          const union value *val = case_data (c, var);
+          const union value *indep_val = case_data (c, nst->indep_var);
+
+          if ( var_is_value_missing (var, case_data (c, var), exclude))
+            {
+              /* The loop's increment expression unrefs the case,
+                 so it must not be unreffed again here. */
+              continue;
+            }
+
+          if (n_sample_test)
+            {
+              int width = var_get_width (nst->indep_var);
+              /* Ignore out of range values */
+              if (
+                  value_compare_3way (indep_val, &nst->val1, width) < 0
+                  ||
+                  value_compare_3way (indep_val, &nst->val2, width) > 0
+                  )
+                {
+                  continue;
+                }
+            }
+
+          vn = find_value (&map, indep_val, nst->indep_var);
+          if ( vn == NULL)
+            {
+              if ( n_sample_test == true)
+                {
+                  int width = var_get_width (nst->indep_var);
+                  vn = xzalloc (sizeof *vn);
+                  value_clone (&vn->val, indep_val, width);
+
+                  hmap_insert (&map, &vn->node, value_hash (indep_val, width, 0));
+                }
+              else
+                {
+                  continue;
+                }
+            }
+
+          if (val->f <= median)
+            vn->le += weight;
+          else
+            vn->gt += weight;
+
+          count += weight;
+        }
+      casereader_destroy (r);
+
+      {
+        int x = 0;
+        struct val_node *vn = NULL;
+        double r_0 = 0;
+        double r_1 = 0;
+        HMAP_FOR_EACH (vn, struct val_node, node, &map)
+          {
+            r_0 += vn->le;
+            r_1 += vn->gt;
+          }
+
+        results[v].n = count;
+        results[v].sorted_array = xcalloc (hmap_count (&map), sizeof (void*));
+        results[v].var = var;
+
+        HMAP_FOR_EACH (vn, struct val_node, node, &map)
+          {
+            double e_0j = r_0 * (vn->le + vn->gt) / count;
+            double e_1j = r_1 * (vn->le + vn->gt) / count;
+
+            results[v].chisq += pow2 (vn->le - e_0j) / e_0j;
+            results[v].chisq += pow2 (vn->gt - e_1j) / e_1j;
+
+            results[v].sorted_array[x++] = vn;
+          }
+
+        n_vals = x;
+        hmap_destroy (&map);
+
+        sort (results[v].sorted_array, x, sizeof (*results[v].sorted_array),
+              val_node_cmp_3way, nst->indep_var);
+
+      }
+    }
+
+  casereader_destroy (input);
+
+  show_frequencies (nst, results, n_vals, dict);
+  show_test_statistics (nst, results, n_vals, dict);
+
+  for (v = 0; v < nst->n_vars; ++v)
+    {
+      int i;
+      const struct results *rs = results + v;
+
+      for (i = 0; i < n_vals; ++i)
+        {
+          struct val_node *vn = rs->sorted_array[i];
+          value_destroy (&vn->val, var_get_width (nst->indep_var));
+          free (vn);
+        }
+      free (rs->sorted_array);
+    }
+  free (results);
+}
+
+
+
+static void
+show_frequencies (const struct n_sample_test *nst, const struct results *results, int n_vals, const struct dictionary *dict)
+{
+  const struct variable *weight = dict_get_weight (dict);
+  const struct fmt_spec *wfmt = weight ?
var_get_print_format (weight) : &F_8_0; + + int i; + int v; + + const int row_headers = 2; + const int column_headers = 2; + const int nc = row_headers + n_vals; + const int nr = column_headers + nst->n_vars * 2; + + struct tab_table *table = tab_create (nc, nr); + + tab_headers (table, row_headers, 0, column_headers, 0); + + tab_title (table, _("Frequencies")); + + /* Box around the table and vertical lines inside*/ + tab_box (table, TAL_2, TAL_2, -1, TAL_1, + 0, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers); + tab_vline (table, TAL_2, row_headers, 0, tab_nr (table) - 1); + + tab_joint_text (table, + row_headers, 0, row_headers + n_vals - 1, 0, + TAT_TITLE | TAB_CENTER, var_to_string (nst->indep_var)); + + + tab_hline (table, TAL_1, row_headers, tab_nc (table) - 1, 1); + + + for (i = 0; i < n_vals; ++i) + { + const struct results *rs = results + 0; + struct string label; + ds_init_empty (&label); + + var_append_value_name (nst->indep_var, &rs->sorted_array[i]->val, + &label); + + tab_text (table, row_headers + i, 1, + TAT_TITLE | TAB_LEFT, ds_cstr (&label)); + + ds_destroy (&label); + } + + for (v = 0; v < nst->n_vars; ++v) + { + const struct results *rs = &results[v]; + tab_text (table, 0, column_headers + v * 2, + TAT_TITLE | TAB_LEFT, var_to_string (rs->var) ); + + tab_text (table, 1, column_headers + v * 2, + TAT_TITLE | TAB_LEFT, _("> Median") ); + + tab_text (table, 1, column_headers + v * 2 + 1, + TAT_TITLE | TAB_LEFT, _("≤ Median") ); + + if ( v > 0) + tab_hline (table, TAL_1, 0, tab_nc (table) - 1, column_headers + v * 2); + } + + for (v = 0; v < nst->n_vars; ++v) + { + int i; + const struct results *rs = &results[v]; + + for (i = 0; i < n_vals; ++i) + { + const struct val_node *vn = rs->sorted_array[i]; + tab_double (table, row_headers + i, column_headers + v * 2, + 0, vn->gt, wfmt); + + tab_double (table, row_headers + i, column_headers + v * 2 + 1, + 0, vn->le, wfmt); + } + } + + tab_submit (table); +} + + +static void +show_test_statistics (const struct n_sample_test *nst, + const struct results *results, + int n_vals, + const struct dictionary *dict) +{ + const struct variable *weight = dict_get_weight (dict); + const struct fmt_spec *wfmt = weight ? var_get_print_format (weight) : &F_8_0; + + int v; + + const int row_headers = 1; + const int column_headers = 1; + const int nc = row_headers + 5; + const int nr = column_headers + nst->n_vars; + + struct tab_table *table = tab_create (nc, nr); + + tab_headers (table, row_headers, 0, column_headers, 0); + + tab_title (table, _("Test Statistics")); + + + tab_box (table, TAL_2, TAL_2, -1, TAL_1, + 0, 0, tab_nc (table) - 1, tab_nr (table) - 1 ); + + tab_hline (table, TAL_2, 0, tab_nc (table) -1, column_headers); + tab_vline (table, TAL_2, row_headers, 0, tab_nr (table) - 1); + + tab_text (table, row_headers + 0, 0, + TAT_TITLE | TAB_CENTER, _("N")); + + tab_text (table, row_headers + 1, 0, + TAT_TITLE | TAB_CENTER, _("Median")); + + tab_text (table, row_headers + 2, 0, + TAT_TITLE | TAB_CENTER, _("Chi-Square")); + + tab_text (table, row_headers + 3, 0, + TAT_TITLE | TAB_CENTER, _("df")); + + tab_text (table, row_headers + 4, 0, + TAT_TITLE | TAB_CENTER, _("Asymp. 
Sig."));
+
+
+  for (v = 0; v < nst->n_vars; ++v)
+    {
+      double df = n_vals - 1;
+      const struct results *rs = &results[v];
+      tab_text (table, 0, column_headers + v,
+                TAT_TITLE | TAB_LEFT, var_to_string (rs->var));
+
+
+      tab_double (table, row_headers + 0, column_headers + v,
+                  0, rs->n, wfmt);
+
+      tab_double (table, row_headers + 1, column_headers + v,
+                  0, rs->median, NULL);
+
+      tab_double (table, row_headers + 2, column_headers + v,
+                  0, rs->chisq, NULL);
+
+      tab_double (table, row_headers + 3, column_headers + v,
+                  0, df, wfmt);
+
+      tab_double (table, row_headers + 4, column_headers + v,
+                  0, gsl_cdf_chisq_Q (rs->chisq, df), NULL);
+    }
+
+  tab_submit (table);
+}
diff --git a/src/language/stats/median.h b/src/language/stats/median.h
new file mode 100644
index 00000000..236984cf
--- /dev/null
+++ b/src/language/stats/median.h
@@ -0,0 +1,41 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#if !median_h
+#define median_h 1
+
+#include <stddef.h>
+#include <stdbool.h>
+#include "language/stats/npar.h"
+
+struct median_test
+{
+  struct n_sample_test parent;
+  double median;
+};
+
+struct casereader;
+struct dataset;
+
+void median_execute (const struct dataset *ds,
+                     struct casereader *input,
+                     enum mv_class exclude,
+                     const struct npar_test *test,
+                     bool exact,
+                     double timer
+                     );
+
+#endif
diff --git a/src/language/stats/npar.c b/src/language/stats/npar.c
index fda7b0c6..e0d50d4a 100644
--- a/src/language/stats/npar.c
+++ b/src/language/stats/npar.c
@@ -40,6 +40,7 @@
 #include "language/stats/kruskal-wallis.h"
 #include "language/stats/mann-whitney.h"
 #include "language/stats/mcnemar.h"
+#include "language/stats/median.h"
 #include "language/stats/npar-summary.h"
 #include "language/stats/runs.h"
 #include "language/stats/sign.h"
@@ -93,6 +94,7 @@ struct cmd_npar_tests
   int kruskal_wallis;
   int mann_whitney;
   int mcnemar;
+  int median;
   int missing;
   int method;
   int statistics;
@@ -138,6 +140,8 @@ static int npar_sign (struct lexer *, struct dataset *, struct npar_specs *);
 static int npar_kruskal_wallis (struct lexer *, struct dataset *, struct npar_specs *);
 static int npar_mann_whitney (struct lexer *, struct dataset *, struct npar_specs *);
 static int npar_mcnemar (struct lexer *, struct dataset *, struct npar_specs *);
+static int npar_median (struct lexer *, struct dataset *, struct npar_specs *);
+
 static int npar_method (struct lexer *, struct npar_specs *);
 
 /* Command parsing functions.
*/ @@ -336,6 +340,23 @@ parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests default: NOT_REACHED (); } + } + else if (lex_match_phrase (lexer, "MEDIAN")) + { + npt->median++; + + switch (npar_median (lexer, ds, nps)) + { + case 0: + goto lossage; + case 1: + break; + case 2: + lex_error (lexer, NULL); + goto lossage; + default: + NOT_REACHED (); + } } else if (lex_match_id (lexer, "WILCOXON")) { @@ -1212,6 +1233,42 @@ npar_mann_whitney (struct lexer *lexer, } +static int +npar_median (struct lexer *lexer, + struct dataset *ds, + struct npar_specs *specs) +{ + struct median_test *mt = pool_alloc (specs->pool, sizeof (*mt)); + struct n_sample_test *tp = &mt->parent; + struct npar_test *nt = &tp->parent; + + mt->median = SYSMIS; + + if ( lex_match (lexer, T_LPAREN)) + { + lex_force_num (lexer); + mt->median = lex_number (lexer); + lex_get (lexer); + lex_force_match (lexer, T_RPAREN); + } + + lex_match (lexer, T_EQUALS); + + nt->insert_variables = n_sample_insert_variables; + nt->execute = median_execute; + + if (!parse_n_sample_related_test (lexer, dataset_dict (ds), + tp, specs->pool) ) + return 0; + + specs->n_tests++; + specs->test = pool_realloc (specs->pool, + specs->test, + sizeof (*specs->test) * specs->n_tests); + specs->test[specs->n_tests - 1] = nt; + + return 1; +} static int diff --git a/tests/language/stats/npar.at b/tests/language/stats/npar.at index cc1bd167..f4cec2ed 100644 --- a/tests/language/stats/npar.at +++ b/tests/language/stats/npar.at @@ -1389,3 +1389,197 @@ Asymp. Sig. (2-tailed),,.569,.552 AT_CLEANUP + + +AT_SETUP([NPAR TESTS Median Test (median imputed)]) + +AT_DATA([median1.sps], [dnl +set format F12.3. +data list notable list /ignore * animal * years * w *. +begin data +99 1 10 1 +99 4 1 1 +99 5 11 1 +99 5 10 1 +99 3 7 1 +99 6 10 1 +99 0 7 1 +99 3 14 1 +99 2 3 1 +99 1 1 1 +99 4 7 1 +99 5 12 1 +99 3 6 1 +99 4 1 1 +99 3 5 1 +99 5 7 1 +99 4 6 1 +99 3 14 1 +99 4 8 1 +99 5 13 1 +99 2 0 1 +99 4 7 1 +99 4 7 1 +99 1 0 1 +99 2 8 1 +99 4 10 1 +99 2 3 1 +99 2 0 1 +99 4 8 1 +99 1 8 1 +end data. + + +variable label years 'Years expected'. +variable label animal 'Animal Genus'. + +add value labels animal 1 'Animal 1' 2 'Animal 2' 3 'Animal 3' 4 'Animal 4' 5 'Animal 5'. + +npar tests + /median = years by animal (1, 5) + . +]) + + +AT_CHECK([pspp -O format=csv median1.sps], [0], [dnl +Table: Frequencies +,,Animal Genus,,,, +,,Animal 1,Animal 2,Animal 3,Animal 4,Animal 5 +Years expected,> Median,2,1,2,3,4 +,≤ Median,2,4,3,6,1 + +Table: Test Statistics +,N,Median,Chi-Square,df,Asymp. Sig. +Years expected,28,7.000,4.317,4,.365 +]) + +AT_CLEANUP + + +AT_SETUP([NPAR TESTS Median Test (median given)]) + +AT_DATA([median2.sps], [dnl +set format F12.3. +data list notable list /ignore * animal * years * w *. +begin data +99 1 10 1 +99 4 1 1 +99 5 11 1 +99 5 10 1 +99 3 7 1 +99 3 14 1 +99 2 3 1 +99 1 1 1 +99 4 7 1 +99 5 12 1 +99 3 6 1 +99 4 1 1 +99 3 5 1 +99 5 7 1 +99 4 6 1 +99 3 14 1 +99 4 8 1 +99 5 13 1 +99 2 0 1 +99 4 7 1 +99 4 7 1 +99 1 0 1 +99 2 8 1 +99 4 10 1 +99 2 3 1 +99 2 0 1 +99 4 8 1 +99 1 8 1 +end data. + + +variable label years 'Years expected'. +variable label animal 'Animal Genus'. + +add value labels animal 1 'Animal 1' 2 'Animal 2' 3 'Animal 3' 4 'Animal 4' 5 'Animal 5'. + +npar tests + /median (7) = years by animal (1, 5) + . 
+]) + + +AT_CHECK([pspp -O format=csv median2.sps], [0], [dnl +Table: Frequencies +,,Animal Genus,,,, +,,Animal 1,Animal 2,Animal 3,Animal 4,Animal 5 +Years expected,> Median,2,1,2,3,4 +,≤ Median,2,4,3,6,1 + +Table: Test Statistics +,N,Median,Chi-Square,df,Asymp. Sig. +Years expected,28,7.000,4.317,4,.365 +]) + +AT_CLEANUP + + +AT_SETUP([NPAR TESTS Median Test (two sample)]) + +AT_DATA([median3.sps], [dnl +set format F12.3. +data list notable list /xx * animal * years * w *. +begin data +99 1 10 1 +99 4 1 1 +99 5 11 1 +99 5 10 1 +99 3 7 1 +99 3 14 1 +99 2 3 1 +99 1 1 1 +99 4 7 1 +99 5 12 1 +99 3 6 1 +99 4 1 1 +99 3 5 1 +99 5 7 1 +99 4 6 1 +99 3 14 1 +99 4 8 1 +99 5 13 1 +99 2 0 1 +99 4 7 1 +99 4 7 1 +99 1 0 1 +99 2 8 1 +99 4 10 1 +99 2 3 1 +99 2 0 1 +99 4 8 1 +99 1 8 1 +end data. + + +variable label years 'Years expected'. +variable label animal 'Animal Genus'. + +add value labels animal 1 'Animal 1' 2 'Animal 2' 3 'Animal 3' 4 'Animal 4' 5 'Animal 5'. + +npar tests + /median (7) = xx years by animal (5, 1) + . +]) + + +AT_CHECK([pspp -O format=csv median3.sps], [0], [dnl +Table: Frequencies +,,Animal Genus, +,,Animal 1,Animal 5 +xx,> Median,4,5 +,≤ Median,0,0 +Years expected,> Median,2,4 +,≤ Median,2,1 + +Table: Test Statistics +,N,Median,Chi-Square,df,Asymp. Sig. +xx,9,7.000,NaN,1,NaN +Years expected,9,7.000,.900,1,.343 +]) + +AT_CLEANUP \ No newline at end of file
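
For reference, the "Chi-Square" figure that the new median.c reports is the ordinary Pearson statistic on the 2 x k table of weighted counts falling above and at-or-below the median in each group, with k - 1 degrees of freedom; the significance comes from gsl_cdf_chisq_Q, the same GSL call the patch uses. The sketch below is not part of the patch: it recomputes the "Test Statistics" row of the median-imputed test case directly from the frequency table in that test's expected output, so the hard-coded counts are assumptions taken from that table rather than read from a dataset.

#include <stdio.h>
#include <gsl/gsl_cdf.h>

int
main (void)
{
  /* Observed frequencies from the expected output of median1.sps:
     row 0 is "> Median", row 1 is "<= Median"; one column per group. */
  const double obs[2][5] = { { 2, 1, 2, 3, 4 },
                             { 2, 4, 3, 6, 1 } };
  const int k = 5;
  double row[2] = { 0, 0 };
  double col[5] = { 0, 0, 0, 0, 0 };
  double n = 0;
  double chisq = 0;
  int i, j;

  /* Row, column and grand totals. */
  for (i = 0; i < 2; i++)
    for (j = 0; j < k; j++)
      {
        row[i] += obs[i][j];
        col[j] += obs[i][j];
        n += obs[i][j];
      }

  /* Same accumulation as in median_execute: sum of (O - E)^2 / E,
     where E = row total * column total / grand total. */
  for (i = 0; i < 2; i++)
    for (j = 0; j < k; j++)
      {
        double e = row[i] * col[j] / n;
        chisq += (obs[i][j] - e) * (obs[i][j] - e) / e;
      }

  printf ("Chi-Square = %.3f, df = %d, Asymp. Sig. = %.3f\n",
          chisq, k - 1, gsl_cdf_chisq_Q (chisq, k - 1));
  return 0;
}

Built with something like "gcc sketch.c -lgsl -lgslcblas -lm", this should print approximately Chi-Square = 4.317, df = 4 and Asymp. Sig. = 0.365, matching the "Test Statistics" table expected by the first median test case in npar.at.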