1 /* PSPP - a program for statistical analysis. -*-c-*-
2 Copyright (C) 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/stats/npar.h>
20 #include "npar-summary.h"
26 #include <data/case.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/dictionary.h>
30 #include <data/procedure.h>
31 #include <data/settings.h>
32 #include <data/variable.h>
33 #include <language/command.h>
34 #include <language/lexer/lexer.h>
35 #include <language/lexer/variable-parser.h>
36 #include <language/lexer/value-parser.h>
37 #include <language/stats/binomial.h>
38 #include <language/stats/chisquare.h>
39 #include <language/stats/kruskal-wallis.h>
40 #include <language/stats/wilcoxon.h>
41 #include <language/stats/sign.h>
42 #include <libpspp/assertion.h>
43 #include <libpspp/cast.h>
44 #include <libpspp/hash.h>
45 #include <libpspp/message.h>
46 #include <libpspp/pool.h>
47 #include <libpspp/str.h>
48 #include <libpspp/taint.h>
49 #include <math/moments.h>
52 #define _(msgid) gettext (msgid)
54 /* Settings for subcommand specifiers. */
61 /* Array indices for STATISTICS subcommand. */
64 NPAR_ST_DESCRIPTIVES = 0,
65 NPAR_ST_QUARTILES = 1,
70 /* NPAR TESTS structure. */
73 /* Count variables indicating how many
74 of the subcommands have been given. */
84 /* How missing values should be treated */
87 /* Which statistics have been requested */
88 int a_statistics[NPAR_ST_count];
95 struct npar_test **test;
98 const struct variable **vv; /* Compendium of all variables
99 (those mentioned on ANY subcommand */
100 int n_vars; /* Number of variables in vv */
102 enum mv_class filter; /* Missing values to filter. */
104 bool descriptives; /* Descriptive statistics should be calculated */
105 bool quartiles; /* Quartiles should be calculated */
107 bool exact; /* Whether exact calculations have been requested */
108 double timer; /* Maximum time (in minutes) to wait for exact calculations */
112 /* Prototype for custom subcommands of NPAR TESTS. */
113 static int npar_chisquare (struct lexer *, struct dataset *, struct npar_specs *);
114 static int npar_binomial (struct lexer *, struct dataset *, struct npar_specs *);
115 static int npar_wilcoxon (struct lexer *, struct dataset *, struct npar_specs *);
116 static int npar_sign (struct lexer *, struct dataset *, struct npar_specs *);
117 static int npar_kruskal_wallis (struct lexer *, struct dataset *, struct npar_specs *);
118 static int npar_method (struct lexer *, struct npar_specs *);
120 /* Command parsing functions. */
121 static int parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests *p,
122 struct npar_specs *npar_specs );
125 parse_npar_tests (struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests *npt,
126 struct npar_specs *nps)
133 npt->miss = MISS_ANALYSIS;
136 memset (npt->a_statistics, 0, sizeof npt->a_statistics);
139 if (lex_match_hyphenated_word (lexer, "CHISQUARE"))
141 lex_match (lexer, '=');
143 switch (npar_chisquare (lexer, ds, nps))
150 lex_error (lexer, NULL);
156 else if (lex_match_hyphenated_word (lexer, "BINOMIAL"))
158 lex_match (lexer, '=');
160 switch (npar_binomial (lexer, ds, nps))
167 lex_error (lexer, NULL);
173 else if (lex_match_hyphenated_word (lexer, "K-W") ||
174 lex_match_hyphenated_word (lexer, "KRUSKAL-WALLIS"))
176 lex_match (lexer, '=');
177 npt->kruskal_wallis++;
178 switch (npar_kruskal_wallis (lexer, ds, nps))
185 lex_error (lexer, NULL);
191 else if (lex_match_hyphenated_word (lexer, "WILCOXON"))
193 lex_match (lexer, '=');
195 switch (npar_wilcoxon (lexer, ds, nps))
202 lex_error (lexer, NULL);
208 else if (lex_match_hyphenated_word (lexer, "SIGN"))
210 lex_match (lexer, '=');
212 switch (npar_sign (lexer, ds, nps))
219 lex_error (lexer, NULL);
225 else if (lex_match_hyphenated_word (lexer, "MISSING"))
227 lex_match (lexer, '=');
229 if (npt->missing > 1)
231 msg (SE, _ ("The %s subcommand may be given only once."), "MISSING");
234 while (lex_token (lexer) != '/' && lex_token (lexer) != '.')
236 if (lex_match_hyphenated_word (lexer, "ANALYSIS"))
237 npt->miss = MISS_ANALYSIS;
238 else if (lex_match_hyphenated_word (lexer, "LISTWISE"))
239 npt->miss = MISS_LISTWISE;
240 else if (lex_match_hyphenated_word (lexer, "INCLUDE"))
241 nps->filter = MV_SYSTEM;
242 else if (lex_match_hyphenated_word (lexer, "EXCLUDE"))
243 nps->filter = MV_ANY;
246 lex_error (lexer, NULL);
249 lex_match (lexer, ',');
252 else if (lex_match_hyphenated_word (lexer, "METHOD"))
254 lex_match (lexer, '=');
258 msg (SE, _ ("The %s subcommand may be given only once."), "METHOD");
261 switch (npar_method (lexer, nps))
268 lex_error (lexer, NULL);
274 else if (lex_match_hyphenated_word (lexer, "STATISTICS"))
276 lex_match (lexer, '=');
278 while (lex_token (lexer) != '/' && lex_token (lexer) != '.')
280 if (lex_match_hyphenated_word (lexer, "DESCRIPTIVES"))
281 npt->a_statistics[NPAR_ST_DESCRIPTIVES] = 1;
282 else if (lex_match_hyphenated_word (lexer, "QUARTILES"))
283 npt->a_statistics[NPAR_ST_QUARTILES] = 1;
284 else if (lex_match (lexer, T_ALL))
285 npt->a_statistics[NPAR_ST_ALL] = 1;
288 lex_error (lexer, NULL);
291 lex_match (lexer, ',');
294 else if ( settings_get_syntax () != COMPATIBLE && lex_match_id (lexer, "ALGORITHM"))
296 lex_match (lexer, '=');
297 if (lex_match_id (lexer, "COMPATIBLE"))
298 settings_set_cmd_algorithm (COMPATIBLE);
299 else if (lex_match_id (lexer, "ENHANCED"))
300 settings_set_cmd_algorithm (ENHANCED);
302 if (!lex_match (lexer, '/'))
306 if (lex_token (lexer) != '.')
308 lex_error (lexer, _ ("expecting end of command"));
321 static void one_sample_insert_variables (const struct npar_test *test,
322 struct const_hsh_table *variables);
324 static void two_sample_insert_variables (const struct npar_test *test,
325 struct const_hsh_table *variables);
328 static void n_sample_insert_variables (const struct npar_test *test,
329 struct const_hsh_table *variables);
334 npar_execute (struct casereader *input,
335 const struct npar_specs *specs,
336 const struct dataset *ds)
339 struct descriptives *summary_descriptives = NULL;
341 for ( t = 0 ; t < specs->n_tests; ++t )
343 const struct npar_test *test = specs->test[t];
344 if ( NULL == test->execute )
346 msg (SW, _ ("NPAR subcommand not currently implemented."));
349 test->execute (ds, casereader_clone (input), specs->filter, test, specs->exact, specs->timer);
352 if ( specs->descriptives )
354 summary_descriptives = xnmalloc (sizeof (*summary_descriptives),
357 npar_summary_calc_descriptives (summary_descriptives,
358 casereader_clone (input),
360 specs->vv, specs->n_vars,
364 if ( (specs->descriptives || specs->quartiles)
365 && !taint_has_tainted_successor (casereader_get_taint (input)) )
366 do_summary_box (summary_descriptives, specs->vv, specs->n_vars );
368 free (summary_descriptives);
369 casereader_destroy (input);
374 cmd_npar_tests (struct lexer *lexer, struct dataset *ds)
376 struct cmd_npar_tests cmd;
379 struct npar_specs npar_specs = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
380 struct const_hsh_table *var_hash;
381 struct casegrouper *grouper;
382 struct casereader *input, *group;
384 npar_specs.pool = pool_create ();
385 npar_specs.filter = MV_ANY;
387 var_hash = const_hsh_create_pool (npar_specs.pool, 0,
388 compare_vars_by_name, hash_var_by_name,
391 if ( ! parse_npar_tests (lexer, ds, &cmd, &npar_specs) )
393 pool_destroy (npar_specs.pool);
397 for (i = 0; i < npar_specs.n_tests; ++i )
399 const struct npar_test *test = npar_specs.test[i];
400 test->insert_variables (test, var_hash);
403 npar_specs.vv = (const struct variable **) const_hsh_sort (var_hash);
404 npar_specs.n_vars = const_hsh_count (var_hash);
406 if ( cmd.statistics )
410 for ( i = 0 ; i < NPAR_ST_count; ++i )
412 if ( cmd.a_statistics[i] )
416 case NPAR_ST_DESCRIPTIVES:
417 npar_specs.descriptives = true;
419 case NPAR_ST_QUARTILES:
420 npar_specs.quartiles = true;
423 npar_specs.quartiles = true;
424 npar_specs.descriptives = true;
433 input = proc_open (ds);
434 if ( cmd.miss == MISS_LISTWISE )
436 input = casereader_create_filter_missing (input,
444 grouper = casegrouper_create_splits (input, dataset_dict (ds));
445 while (casegrouper_get_next_group (grouper, &group))
446 npar_execute (group, &npar_specs, ds);
447 ok = casegrouper_destroy (grouper);
448 ok = proc_commit (ds) && ok;
450 const_hsh_destroy (var_hash);
452 pool_destroy (npar_specs.pool);
454 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
458 npar_chisquare (struct lexer *lexer, struct dataset *ds,
459 struct npar_specs *specs)
461 struct chisquare_test *cstp = pool_alloc (specs->pool, sizeof (*cstp));
462 struct one_sample_test *tp = &cstp->parent;
463 struct npar_test *nt = &tp->parent;
465 nt->execute = chisquare_execute;
466 nt->insert_variables = one_sample_insert_variables;
468 if (!parse_variables_const_pool (lexer, specs->pool, dataset_dict (ds),
469 &tp->vars, &tp->n_vars,
470 PV_NO_SCRATCH | PV_NO_DUPLICATE))
475 cstp->ranged = false;
477 if ( lex_match (lexer, '('))
480 if ( ! lex_force_num (lexer)) return 0;
481 cstp->lo = lex_integer (lexer);
483 lex_force_match (lexer, ',');
484 if (! lex_force_num (lexer) ) return 0;
485 cstp->hi = lex_integer (lexer);
486 if ( cstp->lo >= cstp->hi )
489 _ ("The specified value of HI (%d) is "
490 "lower than the specified value of LO (%d)"),
495 if (! lex_force_match (lexer, ')')) return 0;
498 cstp->n_expected = 0;
499 cstp->expected = NULL;
500 if ( lex_match (lexer, '/') )
502 if ( lex_match_id (lexer, "EXPECTED") )
504 lex_force_match (lexer, '=');
505 if ( ! lex_match_id (lexer, "EQUAL") )
509 while ( lex_is_number (lexer) )
513 f = lex_number (lexer);
515 if ( lex_match (lexer, '*'))
518 f = lex_number (lexer);
521 lex_match (lexer, ',');
523 cstp->n_expected += n;
524 cstp->expected = pool_realloc (specs->pool,
528 for ( i = cstp->n_expected - n ;
529 i < cstp->n_expected;
531 cstp->expected[i] = f;
537 lex_put_back (lexer, '/');
540 if ( cstp->ranged && cstp->n_expected > 0 &&
541 cstp->n_expected != cstp->hi - cstp->lo + 1 )
544 _ ("%d expected values were given, but the specified "
545 "range (%d-%d) requires exactly %d values."),
546 cstp->n_expected, cstp->lo, cstp->hi,
547 cstp->hi - cstp->lo +1);
552 specs->test = pool_realloc (specs->pool,
554 sizeof (*specs->test) * specs->n_tests);
556 specs->test[specs->n_tests - 1] = nt;
563 npar_binomial (struct lexer *lexer, struct dataset *ds,
564 struct npar_specs *specs)
566 struct binomial_test *btp = pool_alloc (specs->pool, sizeof (*btp));
567 struct one_sample_test *tp = &btp->parent;
568 struct npar_test *nt = &tp->parent;
570 nt->execute = binomial_execute;
571 nt->insert_variables = one_sample_insert_variables;
573 btp->category1 = btp->category2 = btp->cutpoint = SYSMIS;
577 if ( lex_match (lexer, '(') )
579 if ( lex_force_num (lexer) )
581 btp->p = lex_number (lexer);
583 lex_force_match (lexer, ')');
589 /* Kludge: q2c swallows the '=' so put it back here */
590 lex_put_back (lexer, '=');
592 if (lex_match (lexer, '=') )
594 if (parse_variables_const_pool (lexer, specs->pool, dataset_dict (ds),
595 &tp->vars, &tp->n_vars,
596 PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) )
598 if (lex_match (lexer, '('))
600 lex_force_num (lexer);
601 btp->category1 = lex_number (lexer);
603 if ( lex_match (lexer, ','))
605 if ( ! lex_force_num (lexer) ) return 2;
606 btp->category2 = lex_number (lexer);
611 btp->cutpoint = btp->category1;
614 lex_force_match (lexer, ')');
623 specs->test = pool_realloc (specs->pool,
625 sizeof (*specs->test) * specs->n_tests);
627 specs->test[specs->n_tests - 1] = nt;
634 parse_two_sample_related_test (struct lexer *lexer,
635 const struct dictionary *dict,
636 struct two_sample_test *test_parameters,
642 parse_two_sample_related_test (struct lexer *lexer,
643 const struct dictionary *dict,
644 struct two_sample_test *test_parameters,
651 const struct variable **vlist1;
654 const struct variable **vlist2;
657 test_parameters->parent.insert_variables = two_sample_insert_variables;
659 if (!parse_variables_const_pool (lexer, pool,
662 PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) )
665 if ( lex_match (lexer, T_WITH))
668 if ( !parse_variables_const_pool (lexer, pool, dict,
670 PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) )
673 paired = (lex_match (lexer, '(') &&
674 lex_match_id (lexer, "PAIRED") && lex_match (lexer, ')'));
682 if ( n_vlist1 != n_vlist2)
683 msg (SE, _ ("PAIRED was specified but the number of variables "
684 "preceding WITH (%zu) did not match the number "
685 "following (%zu)."), n_vlist1, n_vlist2);
687 test_parameters->n_pairs = n_vlist1 ;
691 test_parameters->n_pairs = n_vlist1 * n_vlist2;
696 test_parameters->n_pairs = (n_vlist1 * (n_vlist1 - 1)) / 2 ;
699 test_parameters->pairs =
700 pool_alloc (pool, sizeof ( variable_pair) * test_parameters->n_pairs);
707 assert (n_vlist1 == n_vlist2);
708 for ( i = 0 ; i < n_vlist1; ++i )
710 test_parameters->pairs[n][1] = vlist1[i];
711 test_parameters->pairs[n][0] = vlist2[i];
718 for ( i = 0 ; i < n_vlist1; ++i )
720 for ( j = 0 ; j < n_vlist2; ++j )
722 test_parameters->pairs[n][1] = vlist1[i];
723 test_parameters->pairs[n][0] = vlist2[j];
732 for ( i = 0 ; i < n_vlist1 - 1; ++i )
734 for ( j = i + 1 ; j < n_vlist1; ++j )
736 assert ( n < test_parameters->n_pairs);
737 test_parameters->pairs[n][1] = vlist1[i];
738 test_parameters->pairs[n][0] = vlist1[j];
744 assert ( n == test_parameters->n_pairs);
751 parse_n_sample_related_test (struct lexer *lexer,
752 const struct dictionary *dict,
753 struct n_sample_test *nst,
757 if (!parse_variables_const_pool (lexer, pool,
759 &nst->vars, &nst->n_vars,
760 PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) )
763 if ( ! lex_force_match (lexer, T_BY))
766 nst->indep_var = parse_variable_const (lexer, dict);
768 if ( ! lex_force_match (lexer, '('))
771 value_init (&nst->val1, var_get_width (nst->indep_var));
772 if ( ! parse_value (lexer, &nst->val1, var_get_width (nst->indep_var)))
774 value_destroy (&nst->val1, var_get_width (nst->indep_var));
778 if ( ! lex_force_match (lexer, ','))
781 value_init (&nst->val2, var_get_width (nst->indep_var));
782 if ( ! parse_value (lexer, &nst->val2, var_get_width (nst->indep_var)))
784 value_destroy (&nst->val2, var_get_width (nst->indep_var));
788 if ( ! lex_force_match (lexer, ')'))
795 npar_wilcoxon (struct lexer *lexer,
797 struct npar_specs *specs )
801 struct two_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp));
802 struct npar_test *nt = &tp->parent;
803 nt->execute = wilcoxon_execute;
805 if (!parse_two_sample_related_test (lexer, dataset_dict (ds),
810 specs->test = pool_realloc (specs->pool,
812 sizeof (*specs->test) * specs->n_tests);
813 specs->test[specs->n_tests - 1] = nt;
819 npar_sign (struct lexer *lexer, struct dataset *ds,
820 struct npar_specs *specs)
822 struct two_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp));
823 struct npar_test *nt = &tp->parent;
825 nt->execute = sign_execute;
827 if (!parse_two_sample_related_test (lexer, dataset_dict (ds),
832 specs->test = pool_realloc (specs->pool,
834 sizeof (*specs->test) * specs->n_tests);
835 specs->test[specs->n_tests - 1] = nt;
841 npar_kruskal_wallis (struct lexer *lexer, struct dataset *ds,
842 struct npar_specs *specs)
844 struct n_sample_test *tp = pool_alloc (specs->pool, sizeof (*tp));
845 struct npar_test *nt = &tp->parent;
847 nt->insert_variables = n_sample_insert_variables;
849 nt->execute = kruskal_wallis_execute;
851 if (!parse_n_sample_related_test (lexer, dataset_dict (ds),
856 specs->test = pool_realloc (specs->pool,
858 sizeof (*specs->test) * specs->n_tests);
859 specs->test[specs->n_tests - 1] = nt;
864 /* Insert the variables for TEST into VAR_HASH */
866 one_sample_insert_variables (const struct npar_test *test,
867 struct const_hsh_table *var_hash)
870 const struct one_sample_test *ost = UP_CAST (test, const struct one_sample_test, parent);
872 for ( i = 0 ; i < ost->n_vars ; ++i )
873 const_hsh_insert (var_hash, ost->vars[i]);
878 two_sample_insert_variables (const struct npar_test *test,
879 struct const_hsh_table *var_hash)
882 const struct two_sample_test *tst = UP_CAST (test, const struct two_sample_test, parent);
884 for ( i = 0 ; i < tst->n_pairs ; ++i )
886 variable_pair *pair = &tst->pairs[i];
888 const_hsh_insert (var_hash, (*pair)[0]);
889 const_hsh_insert (var_hash, (*pair)[1]);
894 n_sample_insert_variables (const struct npar_test *test,
895 struct const_hsh_table *var_hash)
898 const struct n_sample_test *tst = UP_CAST (test, const struct n_sample_test, parent);
900 for ( i = 0 ; i < tst->n_vars ; ++i )
901 const_hsh_insert (var_hash, tst->vars[i]);
903 const_hsh_insert (var_hash, tst->indep_var);
907 npar_method (struct lexer *lexer, struct npar_specs *specs)
909 if ( lex_match_id (lexer, "EXACT") )
913 if (lex_match_id (lexer, "TIMER"))
917 if ( lex_match (lexer, '('))
919 if ( lex_force_num (lexer) )
921 specs->timer = lex_number (lexer);
924 lex_force_match (lexer, ')');