1 /* PSPP - a program for statistical analysis. -*-c-*-
2 Copyright (C) 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/stats/npar.h>
20 #include "npar-summary.h"
27 #include <data/case.h>
28 #include <data/casegrouper.h>
29 #include <data/casereader.h>
30 #include <data/dictionary.h>
31 #include <data/procedure.h>
32 #include <data/settings.h>
33 #include <data/variable.h>
34 #include <libpspp/assertion.h>
35 #include <libpspp/cast.h>
36 #include <libpspp/hmapx.h>
37 #include <libpspp/hash-functions.h>
38 #include <libpspp/message.h>
39 #include <libpspp/pool.h>
40 #include <libpspp/str.h>
41 #include <libpspp/taint.h>
42 #include <language/command.h>
43 #include <language/lexer/lexer.h>
44 #include <language/lexer/variable-parser.h>
45 #include <language/lexer/value-parser.h>
46 #include <language/stats/binomial.h>
47 #include <language/stats/chisquare.h>
48 #include <language/stats/runs.h>
49 #include <language/stats/friedman.h>
50 #include <language/stats/kruskal-wallis.h>
51 #include <language/stats/wilcoxon.h>
52 #include <language/stats/sign.h>
53 #include <math/moments.h>
56 #define _(msgid) gettext (msgid)
58 /* Settings for subcommand specifiers. */
65 /* Array indices for STATISTICS subcommand. */
68 NPAR_ST_DESCRIPTIVES = 0,
69 NPAR_ST_QUARTILES = 1,
74 /* NPAR TESTS structure. */
77 /* Count variables indicating how many
78 of the subcommands have been given. */
90 /* How missing values should be treated */
93 /* Which statistics have been requested */
94 int a_statistics[NPAR_ST_count];
101 struct npar_test **test;
104 const struct variable **vv; /* Compendium of all variables
105 (those mentioned on ANY subcommand */
106 int n_vars; /* Number of variables in vv */
108 enum mv_class filter; /* Missing values to filter. */
110 bool descriptives; /* Descriptive statistics should be calculated */
111 bool quartiles; /* Quartiles should be calculated */
113 bool exact; /* Whether exact calculations have been requested */
114 double timer; /* Maximum time (in minutes) to wait for exact calculations */
118 /* Prototype for custom subcommands of NPAR TESTS. */
119 static int npar_chisquare (struct lexer *, struct dataset *, struct npar_specs *);
120 static int npar_binomial (struct lexer *, struct dataset *, struct npar_specs *);
121 static int npar_runs (struct lexer *, struct dataset *, struct npar_specs *);
122 static int npar_friedman (struct lexer *, struct dataset *, struct npar_specs *);
123 static int npar_wilcoxon (struct lexer *, struct dataset *, struct npar_specs *);
124 static int npar_sign (struct lexer *, struct dataset *, struct npar_specs *);
125 static int npar_kruskal_wallis (struct lexer *, struct dataset *, struct npar_specs *);
126 static int npar_method (struct lexer *, struct npar_specs *);
128 /* Command parsing functions. */
129 static int parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests *p,
130 struct npar_specs *npar_specs );
133 parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests *npt,
134 struct npar_specs *nps)
143 npt->miss = MISS_ANALYSIS;
146 memset (npt->a_statistics, 0, sizeof npt->a_statistics);
149 if (lex_match_hyphenated_word (lexer, "FRIEDMAN"))
152 switch (npar_friedman (lexer, ds, nps))
159 lex_error (lexer, NULL);
165 else if (lex_match_hyphenated_word (lexer, "RUNS"))
168 switch (npar_runs (lexer, ds, nps))
175 lex_error (lexer, NULL);
181 else if (lex_match_hyphenated_word (lexer, "CHISQUARE"))
183 lex_match (lexer, '=');
185 switch (npar_chisquare (lexer, ds, nps))
192 lex_error (lexer, NULL);
198 else if (lex_match_hyphenated_word (lexer, "BINOMIAL"))
200 lex_match (lexer, '=');
202 switch (npar_binomial (lexer, ds, nps))
209 lex_error (lexer, NULL);
215 else if (lex_match_hyphenated_word (lexer, "K-W") ||
216 lex_match_hyphenated_word (lexer, "KRUSKAL-WALLIS"))
218 lex_match (lexer, '=');
219 npt->kruskal_wallis++;
220 switch (npar_kruskal_wallis (lexer, ds, nps))
227 lex_error (lexer, NULL);
233 else if (lex_match_hyphenated_word (lexer, "WILCOXON"))
235 lex_match (lexer, '=');
237 switch (npar_wilcoxon (lexer, ds, nps))
244 lex_error (lexer, NULL);
250 else if (lex_match_hyphenated_word (lexer, "SIGN"))
252 lex_match (lexer, '=');
254 switch (npar_sign (lexer, ds, nps))
261 lex_error (lexer, NULL);
267 else if (lex_match_hyphenated_word (lexer, "MISSING"))
269 lex_match (lexer, '=');
271 if (npt->missing > 1)
273 msg (SE, _("The %s subcommand may be given only once."), "MISSING");
276 while (lex_token (lexer) != '/' && lex_token (lexer) != '.')
278 if (lex_match_hyphenated_word (lexer, "ANALYSIS"))
279 npt->miss = MISS_ANALYSIS;
280 else if (lex_match_hyphenated_word (lexer, "LISTWISE"))
281 npt->miss = MISS_LISTWISE;
282 else if (lex_match_hyphenated_word (lexer, "INCLUDE"))
283 nps->filter = MV_SYSTEM;
284 else if (lex_match_hyphenated_word (lexer, "EXCLUDE"))
285 nps->filter = MV_ANY;
288 lex_error (lexer, NULL);
291 lex_match (lexer, ',');
294 else if (lex_match_hyphenated_word (lexer, "METHOD"))
296 lex_match (lexer, '=');
300 msg (SE, _("The %s subcommand may be given only once."), "METHOD");
303 switch (npar_method (lexer, nps))
310 lex_error (lexer, NULL);
316 else if (lex_match_hyphenated_word (lexer, "STATISTICS"))
318 lex_match (lexer, '=');
320 while (lex_token (lexer) != '/' && lex_token (lexer) != '.')
322 if (lex_match_hyphenated_word (lexer, "DESCRIPTIVES"))
323 npt->a_statistics[NPAR_ST_DESCRIPTIVES] = 1;
324 else if (lex_match_hyphenated_word (lexer, "QUARTILES"))
325 npt->a_statistics[NPAR_ST_QUARTILES] = 1;
326 else if (lex_match (lexer, T_ALL))
327 npt->a_statistics[NPAR_ST_ALL] = 1;
330 lex_error (lexer, NULL);
333 lex_match (lexer, ',');
336 else if ( settings_get_syntax () != COMPATIBLE && lex_match_id (lexer, "ALGORITHM"))
338 lex_match (lexer, '=');
339 if (lex_match_id (lexer, "COMPATIBLE"))
340 settings_set_cmd_algorithm (COMPATIBLE);
341 else if (lex_match_id (lexer, "ENHANCED"))
342 settings_set_cmd_algorithm (ENHANCED);
344 if (!lex_match (lexer, '/'))
348 if (lex_token (lexer) != '.')
350 lex_error (lexer, _("expecting end of command"));
361 static void one_sample_insert_variables (const struct npar_test *test,
364 static void two_sample_insert_variables (const struct npar_test *test,
367 static void n_sample_insert_variables (const struct npar_test *test,
371 npar_execute (struct casereader *input,
372 const struct npar_specs *specs,
373 const struct dataset *ds)
376 struct descriptives *summary_descriptives = NULL;
378 for ( t = 0 ; t < specs->n_tests; ++t )
380 const struct npar_test *test = specs->test[t];
381 if ( NULL == test->execute )
383 msg (SW, _("NPAR subcommand not currently implemented."));
386 test->execute (ds, casereader_clone (input), specs->filter, test, specs->exact, specs->timer);
389 if ( specs->descriptives )
391 summary_descriptives = xnmalloc (sizeof (*summary_descriptives),
394 npar_summary_calc_descriptives (summary_descriptives,
395 casereader_clone (input),
397 specs->vv, specs->n_vars,
401 if ( (specs->descriptives || specs->quartiles)
402 && !taint_has_tainted_successor (casereader_get_taint (input)) )
403 do_summary_box (summary_descriptives, specs->vv, specs->n_vars );
405 free (summary_descriptives);
406 casereader_destroy (input);
410 cmd_npar_tests (struct lexer *lexer, struct dataset *ds)
412 struct cmd_npar_tests cmd;
415 struct npar_specs npar_specs = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
416 struct casegrouper *grouper;
417 struct casereader *input, *group;
418 struct hmapx var_map = HMAPX_INITIALIZER (var_map);
421 npar_specs.pool = pool_create ();
422 npar_specs.filter = MV_ANY;
423 npar_specs.n_vars = -1;
424 npar_specs.vv = NULL;
426 if ( ! parse_npar_tests (lexer, ds, &cmd, &npar_specs) )
428 pool_destroy (npar_specs.pool);
432 for (i = 0; i < npar_specs.n_tests; ++i )
434 const struct npar_test *test = npar_specs.test[i];
435 test->insert_variables (test, &var_map);
439 struct hmapx_node *node;
440 struct variable *var;
441 npar_specs.n_vars = 0;
443 HMAPX_FOR_EACH (var, node, &var_map)
445 npar_specs.n_vars ++;
446 npar_specs.vv = pool_nrealloc (npar_specs.pool, npar_specs.vv, npar_specs.n_vars, sizeof (*npar_specs.vv));
447 npar_specs.vv[npar_specs.n_vars - 1] = var;
451 qsort (npar_specs.vv, npar_specs.n_vars, sizeof (*npar_specs.vv),
452 compare_var_ptrs_by_name);
454 if ( cmd.statistics )
458 for ( i = 0 ; i < NPAR_ST_count; ++i )
460 if ( cmd.a_statistics[i] )
464 case NPAR_ST_DESCRIPTIVES:
465 npar_specs.descriptives = true;
467 case NPAR_ST_QUARTILES:
468 npar_specs.quartiles = true;
471 npar_specs.quartiles = true;
472 npar_specs.descriptives = true;
481 input = proc_open (ds);
482 if ( cmd.miss == MISS_LISTWISE )
484 input = casereader_create_filter_missing (input,
492 grouper = casegrouper_create_splits (input, dataset_dict (ds));
493 while (casegrouper_get_next_group (grouper, &group))
494 npar_execute (group, &npar_specs, ds);
495 ok = casegrouper_destroy (grouper);
496 ok = proc_commit (ds) && ok;
498 pool_destroy (npar_specs.pool);
499 hmapx_destroy (&var_map);
501 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
505 npar_runs (struct lexer *lexer, struct dataset *ds,
506 struct npar_specs *specs)
508 struct runs_test *rt = pool_alloc (specs->pool, sizeof (*rt));
509 struct one_sample_test *tp = &rt->parent;
510 struct npar_test *nt = &tp->parent;
512 nt->execute = runs_execute;
513 nt->insert_variables = one_sample_insert_variables;
515 if ( lex_force_match (lexer, '(') )
517 if ( lex_match_id (lexer, "MEAN"))
519 rt->cp_mode = CP_MEAN;
521 else if (lex_match_id (lexer, "MEDIAN"))
523 rt->cp_mode = CP_MEDIAN;
525 else if (lex_match_id (lexer, "MODE"))
527 rt->cp_mode = CP_MODE;
529 else if (lex_is_number (lexer))
531 rt->cutpoint = lex_number (lexer);
532 rt->cp_mode = CP_CUSTOM;
537 lex_error (lexer, _("Expecting MEAN, MEDIAN, MODE or number"));
541 lex_force_match (lexer, ')');
542 lex_force_match (lexer, '=');
543 if (!parse_variables_const_pool (lexer, specs->pool, dataset_dict (ds),
544 &tp->vars, &tp->n_vars,
545 PV_NO_SCRATCH | PV_NO_DUPLICATE | PV_NUMERIC))
552 specs->test = pool_realloc (specs->pool,
554 sizeof (*specs->test) * specs->n_tests);
556 specs->test[specs->n_tests - 1] = nt;
562 npar_friedman (struct lexer *lexer, struct dataset *ds,
563 struct npar_specs *specs)
565 struct one_sample_test *ft = pool_alloc (specs->pool, sizeof (*ft));
566 struct npar_test *nt = &ft->parent;
568 nt->execute = friedman_execute;
569 nt->insert_variables = one_sample_insert_variables;
571 lex_match (lexer, '=');
573 if (!parse_variables_const_pool (lexer, specs->pool, dataset_dict (ds),
574 &ft->vars, &ft->n_vars,
575 PV_NO_SCRATCH | PV_NO_DUPLICATE | PV_NUMERIC))
581 specs->test = pool_realloc (specs->pool,
583 sizeof (*specs->test) * specs->n_tests);
585 specs->test[specs->n_tests - 1] = nt;
592 npar_chisquare (struct lexer *lexer, struct dataset *ds,
593 struct npar_specs *specs)
595 struct chisquare_test *cstp = pool_alloc (specs->pool, sizeof (*cstp));
596 struct one_sample_test *tp = &cstp->parent;
597 struct npar_test *nt = &tp->parent;
600 nt->execute = chisquare_execute;
601 nt->insert_variables = one_sample_insert_variables;
603 if (!parse_variables_const_pool (lexer, specs->pool, dataset_dict (ds),
604 &tp->vars, &tp->n_vars,
605 PV_NO_SCRATCH | PV_NO_DUPLICATE))
610 cstp->ranged = false;
612 if ( lex_match (lexer, '('))
615 if ( ! lex_force_num (lexer)) return 0;
616 cstp->lo = lex_integer (lexer);
618 lex_force_match (lexer, ',');
619 if (! lex_force_num (lexer) ) return 0;
620 cstp->hi = lex_integer (lexer);
621 if ( cstp->lo >= cstp->hi )
624 _("The specified value of HI (%d) is "
625 "lower than the specified value of LO (%d)"),
630 if (! lex_force_match (lexer, ')')) return 0;
633 cstp->n_expected = 0;
634 cstp->expected = NULL;
635 if ( lex_match (lexer, '/') )
637 if ( lex_match_id (lexer, "EXPECTED") )
639 lex_force_match (lexer, '=');
640 if ( ! lex_match_id (lexer, "EQUAL") )
644 while ( lex_is_number (lexer) )
648 f = lex_number (lexer);
650 if ( lex_match (lexer, '*'))
653 f = lex_number (lexer);
656 lex_match (lexer, ',');
658 cstp->n_expected += n;
659 cstp->expected = pool_realloc (specs->pool,
663 for ( i = cstp->n_expected - n ;
664 i < cstp->n_expected;
666 cstp->expected[i] = f;
672 lex_put_back (lexer, '/');
675 if ( cstp->ranged && cstp->n_expected > 0 &&
676 cstp->n_expected != cstp->hi - cstp->lo + 1 )
679 _("%d expected values were given, but the specified "
680 "range (%d-%d) requires exactly %d values."),
681 cstp->n_expected, cstp->lo, cstp->hi,
682 cstp->hi - cstp->lo +1);
687 specs->test = pool_realloc (specs->pool,
689 sizeof (*specs->test) * specs->n_tests);
691 specs->test[specs->n_tests - 1] = nt;
698 npar_binomial (struct lexer *lexer, struct dataset *ds,
699 struct npar_specs *specs)
701 struct binomial_test *btp = pool_alloc (specs->pool, sizeof (*btp));
702 struct one_sample_test *tp = &btp->parent;
703 struct npar_test *nt = &tp->parent;
705 nt->execute = binomial_execute;
706 nt->insert_variables = one_sample_insert_variables;
708 btp->category1 = btp->category2 = btp->cutpoint = SYSMIS;
712 if ( lex_match (lexer, '(') )
714 if ( lex_force_num (lexer) )
716 btp->p = lex_number (lexer);
718 lex_force_match (lexer, ')');
724 /* Kludge: q2c swallows the '=' so put it back here */
725 lex_put_back (lexer, '=');
727 if (lex_match (lexer, '=') )
729 if (parse_variables_const_pool (lexer, specs->pool, dataset_dict (ds),
730 &tp->vars, &tp->n_vars,
731 PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) )
733 if (lex_match (lexer, '('))
735 lex_force_num (lexer);
736 btp->category1 = lex_number (lexer);
738 if ( lex_match (lexer, ','))
740 if ( ! lex_force_num (lexer) ) return 2;
741 btp->category2 = lex_number (lexer);
746 btp->cutpoint = btp->category1;
749 lex_force_match (lexer, ')');
758 specs->test = pool_realloc (specs->pool,
760 sizeof (*specs->test) * specs->n_tests);
762 specs->test[specs->n_tests - 1] = nt;
769 parse_two_sample_related_test (struct lexer *lexer,
770 const struct dictionary *dict,
771 struct two_sample_test *test_parameters,
777 parse_two_sample_related_test (struct lexer *lexer,
778 const struct dictionary *dict,
779 struct two_sample_test *test_parameters,
786 const struct variable **vlist1;
789 const struct variable **vlist2;
792 test_parameters->parent.insert_variables = two_sample_insert_variables;
794 if (!parse_variables_const_pool (lexer, pool,
797 PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) )
800 if ( lex_match (lexer, T_WITH))
803 if ( !parse_variables_const_pool (lexer, pool, dict,
805 PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) )
808 paired = (lex_match (lexer, '(') &&
809 lex_match_id (lexer, "PAIRED") && lex_match (lexer, ')'));
817 if ( n_vlist1 != n_vlist2)
818 msg (SE, _("PAIRED was specified but the number of variables "
819 "preceding WITH (%zu) did not match the number "
820 "following (%zu)."), n_vlist1, n_vlist2);
822 test_parameters->n_pairs = n_vlist1 ;
826 test_parameters->n_pairs = n_vlist1 * n_vlist2;
831 test_parameters->n_pairs = (n_vlist1 * (n_vlist1 - 1)) / 2 ;
834 test_parameters->pairs =
835 pool_alloc (pool, sizeof ( variable_pair) * test_parameters->n_pairs);
842 assert (n_vlist1 == n_vlist2);
843 for ( i = 0 ; i < n_vlist1; ++i )
845 test_parameters->pairs[n][1] = vlist1[i];
846 test_parameters->pairs[n][0] = vlist2[i];
853 for ( i = 0 ; i < n_vlist1; ++i )
855 for ( j = 0 ; j < n_vlist2; ++j )
857 test_parameters->pairs[n][1] = vlist1[i];
858 test_parameters->pairs[n][0] = vlist2[j];
867 for ( i = 0 ; i < n_vlist1 - 1; ++i )
869 for ( j = i + 1 ; j < n_vlist1; ++j )
871 assert ( n < test_parameters->n_pairs);
872 test_parameters->pairs[n][1] = vlist1[i];
873 test_parameters->pairs[n][0] = vlist1[j];
879 assert ( n == test_parameters->n_pairs);
886 parse_n_sample_related_test (struct lexer *lexer,
887 const struct dictionary *dict,
888 struct n_sample_test *nst,
892 if (!parse_variables_const_pool (lexer, pool,
894 &nst->vars, &nst->n_vars,
895 PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) )
898 if ( ! lex_force_match (lexer, T_BY))
901 nst->indep_var = parse_variable_const (lexer, dict);
903 if ( ! lex_force_match (lexer, '('))
906 value_init (&nst->val1, var_get_width (nst->indep_var));
907 if ( ! parse_value (lexer, &nst->val1, var_get_width (nst->indep_var)))
909 value_destroy (&nst->val1, var_get_width (nst->indep_var));
913 if ( ! lex_force_match (lexer, ','))
916 value_init (&nst->val2, var_get_width (nst->indep_var));
917 if ( ! parse_value (lexer, &nst->val2, var_get_width (nst->indep_var)))
919 value_destroy (&nst->val2, var_get_width (nst->indep_var));
923 if ( ! lex_force_match (lexer, ')'))
930 npar_wilcoxon (struct lexer *lexer,
932 struct npar_specs *specs )
936 struct two_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp));
937 struct npar_test *nt = &tp->parent;
938 nt->execute = wilcoxon_execute;
940 if (!parse_two_sample_related_test (lexer, dataset_dict (ds),
945 specs->test = pool_realloc (specs->pool,
947 sizeof (*specs->test) * specs->n_tests);
948 specs->test[specs->n_tests - 1] = nt;
954 npar_sign (struct lexer *lexer, struct dataset *ds,
955 struct npar_specs *specs)
957 struct two_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp));
958 struct npar_test *nt = &tp->parent;
960 nt->execute = sign_execute;
962 if (!parse_two_sample_related_test (lexer, dataset_dict (ds),
967 specs->test = pool_realloc (specs->pool,
969 sizeof (*specs->test) * specs->n_tests);
970 specs->test[specs->n_tests - 1] = nt;
976 npar_kruskal_wallis (struct lexer *lexer, struct dataset *ds,
977 struct npar_specs *specs)
979 struct n_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp));
980 struct npar_test *nt = &tp->parent;
982 nt->insert_variables = n_sample_insert_variables;
984 nt->execute = kruskal_wallis_execute;
986 if (!parse_n_sample_related_test (lexer, dataset_dict (ds),
991 specs->test = pool_realloc (specs->pool,
993 sizeof (*specs->test) * specs->n_tests);
994 specs->test[specs->n_tests - 1] = nt;
1000 insert_variable_into_map (struct hmapx *var_map, const struct variable *var)
1002 size_t hash = hash_pointer (var, 0);
1003 struct hmapx_node *node;
1004 const struct variable *v = NULL;
1006 HMAPX_FOR_EACH_WITH_HASH (v, node, hash, var_map)
1012 hmapx_insert (var_map, CONST_CAST (struct variable *, var), hash);
1015 /* Insert the variables for TEST into VAR_MAP */
1017 one_sample_insert_variables (const struct npar_test *test,
1018 struct hmapx *var_map)
1021 const struct one_sample_test *ost = UP_CAST (test, const struct one_sample_test, parent);
1023 for ( i = 0 ; i < ost->n_vars ; ++i )
1024 insert_variable_into_map (var_map, ost->vars[i]);
1029 two_sample_insert_variables (const struct npar_test *test,
1030 struct hmapx *var_map)
1033 const struct two_sample_test *tst = UP_CAST (test, const struct two_sample_test, parent);
1035 for ( i = 0 ; i < tst->n_pairs ; ++i )
1037 variable_pair *pair = &tst->pairs[i];
1039 insert_variable_into_map (var_map, (*pair)[0]);
1040 insert_variable_into_map (var_map, (*pair)[1]);
1045 n_sample_insert_variables (const struct npar_test *test,
1046 struct hmapx *var_map)
1049 const struct n_sample_test *tst = UP_CAST (test, const struct n_sample_test, parent);
1051 for ( i = 0 ; i < tst->n_vars ; ++i )
1052 insert_variable_into_map (var_map, tst->vars[i]);
1054 insert_variable_into_map (var_map, tst->indep_var);
1059 npar_method (struct lexer *lexer, struct npar_specs *specs)
1061 if ( lex_match_id (lexer, "EXACT") )
1063 specs->exact = true;
1065 if (lex_match_id (lexer, "TIMER"))
1069 if ( lex_match (lexer, '('))
1071 if ( lex_force_num (lexer) )
1073 specs->timer = lex_number (lexer);
1076 lex_force_match (lexer, ')');