From 48cf1d7de82d12cdf3c0433d49d3c66f820f1609 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sat, 13 Mar 2004 06:22:50 +0000 Subject: [PATCH] Fixed the handling of MISSING values in t-test --- doc/pspp.texi | 28 +- po/en_GB.po | 92 +++---- po/pspp.pot | 92 +++---- src/ChangeLog | 5 + src/levene.c | 91 +++++-- src/levene.h | 6 +- src/t-test.q | 245 ++++++++++++++---- src/var.h | 1 + tests/Makefile.am | 6 + tests/command/t-test-1-sample-missing-anal.sh | 101 ++++++++ tests/command/t-test-1-sample-missing-list.sh | 101 ++++++++ tests/command/t-test-indep-missing-anal.sh | 103 ++++++++ tests/command/t-test-indep-missing-list.sh | 101 ++++++++ tests/command/t-test-paired-missing-anal.sh | 101 ++++++++ tests/command/t-test-paired-missing-list.sh | 101 ++++++++ 15 files changed, 999 insertions(+), 175 deletions(-) create mode 100755 tests/command/t-test-1-sample-missing-anal.sh create mode 100755 tests/command/t-test-1-sample-missing-list.sh create mode 100755 tests/command/t-test-indep-missing-anal.sh create mode 100755 tests/command/t-test-indep-missing-list.sh create mode 100755 tests/command/t-test-paired-missing-anal.sh create mode 100755 tests/command/t-test-paired-missing-list.sh diff --git a/doc/pspp.texi b/doc/pspp.texi index 509efa6d..cda0308f 100644 --- a/doc/pspp.texi +++ b/doc/pspp.texi @@ -7907,14 +7907,16 @@ in the tests. The default value is 0.95. The @cmd{MISSING} subcommand determines the handling of missing variables. If INCLUDE is set, then user-missing values are included in the -calculations. +calculations, but system-missing values are not. If EXCLUDE is set, which is the default, user-missing -values are excluded. -If LISTWISE is set, then -the entire case is excluded whenever any value in that case has a -system-missing or, if INCLUDE is set, user-missing value. -If ANALYSIS is set, then cases are excluded only where a value used in -the analysis has a system-missing or, if INCLUDE is set, user-missing value. +values are excluded as well as system-missing values. +This is the default. + +If LISTWISE is set, then the entire case is excluded from analysis +whenever any variable specified in the @cmd{/VARIABLES}, @cmd{/PAIRS} or +@cmd{/GROUPS} subcommands contains a missing value. +If ANALYSIS is set, then missing values are excluded only in the analysis for +which they would be needed. This is the default. @menu @@ -7951,13 +7953,17 @@ The variable given in the @cmd{GROUPS} subcommand is the independent variable which determines to which group the samples belong. The values in parentheses are the specific values of the independent variable for each group. -If the parentheses are omitted, and no values are given, the default values +If the parentheses are omitted and no values are given, the default values of 1.0 and 2.0 are assumed. -If only one value is given, then cases where the independent variable is + +If the independent variable is numeric, +it is acceptable to specify only one value inside the parentheses. +If you do this, cases where the independent variable is less than or equal to this value belong to the first group, and cases greater than this value belong to the second group. -If only one value is given, then the independent variable must be -numeric. +When using this form of the @cmd{GROUPS} subcommand, missing values in +the independent variable are excluded on a listwise basis, regardless +of whether @cmd{/MISSING=LISTWISE} was specified. @node Paired Samples Mode, , Independent Samples Mode, T-TEST diff --git a/po/en_GB.po b/po/en_GB.po index 43d5fa17..82622e7b 100644 --- a/po/en_GB.po +++ b/po/en_GB.po @@ -6,8 +6,8 @@ msgid "" msgstr "" "Project-Id-Version: PSPP 0.3.1\n" -"Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2004-03-12 16:21+0800\n" +"Report-Msgid-Bugs-To: pspp-dev@gnu.org\n" +"POT-Creation-Date: 2004-03-13 14:14+0800\n" "PO-Revision-Date: 2004-01-23 13:04+0800\n" "Last-Translator: John Darrington \n" "Language-Team: John Darrington \n" @@ -4210,8 +4210,8 @@ msgstr "" msgid "Total" msgstr "" -#: src/crosstabs.q:874 src/t-test.q:622 src/t-test.q:645 src/t-test.q:736 -#: src/t-test.q:1318 +#: src/crosstabs.q:874 src/t-test.q:655 src/t-test.q:678 src/t-test.q:769 +#: src/t-test.q:1343 msgid "N" msgstr "" @@ -4257,7 +4257,7 @@ msgstr "" msgid "Statistic" msgstr "" -#: src/crosstabs.q:1159 src/t-test.q:889 src/t-test.q:1090 src/t-test.q:1202 +#: src/crosstabs.q:1159 src/t-test.q:920 src/t-test.q:1121 src/t-test.q:1227 msgid "df" msgstr "" @@ -4294,11 +4294,11 @@ msgstr "" msgid " 95%% Confidence Interval" msgstr "" -#: src/crosstabs.q:1206 src/t-test.q:893 src/t-test.q:1087 src/t-test.q:1205 +#: src/crosstabs.q:1206 src/t-test.q:924 src/t-test.q:1118 src/t-test.q:1230 msgid "Lower" msgstr "" -#: src/crosstabs.q:1207 src/t-test.q:894 src/t-test.q:1088 src/t-test.q:1206 +#: src/crosstabs.q:1207 src/t-test.q:925 src/t-test.q:1119 src/t-test.q:1231 msgid "Upper" msgstr "" @@ -4435,8 +4435,8 @@ msgstr "" msgid "%s Dependent" msgstr "" -#: src/descript.q:151 src/frequencies.q:94 src/t-test.q:623 src/t-test.q:646 -#: src/t-test.q:735 src/t-test.q:1084 +#: src/descript.q:151 src/frequencies.q:94 src/t-test.q:656 src/t-test.q:679 +#: src/t-test.q:768 src/t-test.q:1115 msgid "Mean" msgstr "" @@ -4978,157 +4978,157 @@ msgstr "Frame colour must be between 0 and 6." msgid "Drive letter expected in WORKDEV subcommand." msgstr "" -#: src/t-test.q:208 +#: src/t-test.q:237 msgid "TESTVAL, GROUPS and PAIRS subcommands are mutually exclusive." msgstr "" -#: src/t-test.q:225 +#: src/t-test.q:254 msgid "VARIABLES subcommand is not appropriate with PAIRS" msgstr "" -#: src/t-test.q:317 src/t-test.q:396 +#: src/t-test.q:352 src/t-test.q:429 #, c-format msgid "`%s' is not a variable name" msgstr "" -#: src/t-test.q:330 +#: src/t-test.q:365 #, c-format msgid "Long string variable %s is not valid here." msgstr "" -#: src/t-test.q:347 +#: src/t-test.q:382 msgid "" "When applying GROUPS to a string variable, at least one value must be " "specified." msgstr "" -#: src/t-test.q:431 +#: src/t-test.q:464 #, c-format msgid "" "PAIRED was specified but the number of variables preceding WITH (%d) did not " "match the number following (%d)." msgstr "" -#: src/t-test.q:448 +#: src/t-test.q:481 msgid "At least two variables must be specified on PAIRS." msgstr "" -#: src/t-test.q:620 +#: src/t-test.q:653 msgid "One-Sample Statistics" msgstr "" -#: src/t-test.q:624 src/t-test.q:647 src/t-test.q:737 src/t-test.q:1085 +#: src/t-test.q:657 src/t-test.q:680 src/t-test.q:770 src/t-test.q:1116 msgid "Std. Deviation" msgstr "" -#: src/t-test.q:625 src/t-test.q:648 src/t-test.q:738 +#: src/t-test.q:658 src/t-test.q:681 src/t-test.q:771 msgid "SE. Mean" msgstr "" -#: src/t-test.q:642 +#: src/t-test.q:675 msgid "Group Statistics" msgstr "" -#: src/t-test.q:732 +#: src/t-test.q:765 msgid "Paired Sample Statistics" msgstr "" -#: src/t-test.q:754 src/t-test.q:1119 src/t-test.q:1339 +#: src/t-test.q:787 src/t-test.q:1144 src/t-test.q:1364 #, c-format msgid "Pair %d" msgstr "" -#: src/t-test.q:874 +#: src/t-test.q:905 msgid "Independent Samples Test" msgstr "" -#: src/t-test.q:882 +#: src/t-test.q:913 msgid "Levene's Test for Equality of Variances" msgstr "" -#: src/t-test.q:884 +#: src/t-test.q:915 msgid "t-test for Equality of Means" msgstr "" -#: src/t-test.q:886 +#: src/t-test.q:917 msgid "F" msgstr "" -#: src/t-test.q:887 src/t-test.q:1320 +#: src/t-test.q:918 src/t-test.q:1345 msgid "Sig." msgstr "" -#: src/t-test.q:888 src/t-test.q:1089 src/t-test.q:1201 +#: src/t-test.q:919 src/t-test.q:1120 src/t-test.q:1226 msgid "t" msgstr "" -#: src/t-test.q:890 src/t-test.q:1091 src/t-test.q:1203 +#: src/t-test.q:921 src/t-test.q:1122 src/t-test.q:1228 msgid "Sig. (2-tailed)" msgstr "" -#: src/t-test.q:891 src/t-test.q:1204 +#: src/t-test.q:922 src/t-test.q:1229 msgid "Mean Difference" msgstr "" -#: src/t-test.q:892 +#: src/t-test.q:923 msgid "Std. Error Difference" msgstr "" -#: src/t-test.q:897 src/t-test.q:1081 src/t-test.q:1196 +#: src/t-test.q:928 src/t-test.q:1112 src/t-test.q:1221 #, c-format msgid "%d%% Confidence Interval of the Difference" msgstr "" -#: src/t-test.q:931 +#: src/t-test.q:962 msgid "Equal variances assumed" msgstr "" -#: src/t-test.q:946 +#: src/t-test.q:977 #, c-format msgid "Error calculating F statistic (cdff returned %d)." msgstr "" -#: src/t-test.q:969 src/t-test.q:989 src/t-test.q:1027 src/t-test.q:1039 -#: src/t-test.q:1140 src/t-test.q:1166 src/t-test.q:1246 src/t-test.q:1263 -#: src/t-test.q:1354 +#: src/t-test.q:1000 src/t-test.q:1020 src/t-test.q:1058 src/t-test.q:1070 +#: src/t-test.q:1165 src/t-test.q:1191 src/t-test.q:1271 src/t-test.q:1288 +#: src/t-test.q:1377 #, c-format msgid "Error calculating T statistic (cdft returned %d)." msgstr "" -#: src/t-test.q:1003 +#: src/t-test.q:1034 msgid "Equal variances not assumed" msgstr "" -#: src/t-test.q:1071 +#: src/t-test.q:1102 msgid "Paired Samples Test" msgstr "" -#: src/t-test.q:1074 +#: src/t-test.q:1105 msgid "Paired Differences" msgstr "" -#: src/t-test.q:1086 +#: src/t-test.q:1117 msgid "Std. Error Mean" msgstr "" -#: src/t-test.q:1185 +#: src/t-test.q:1210 msgid "One-Sample Test" msgstr "" -#: src/t-test.q:1190 +#: src/t-test.q:1215 #, c-format msgid "Test Value = %f" msgstr "" -#: src/t-test.q:1315 +#: src/t-test.q:1340 msgid "Paired Samples Correlations" msgstr "" -#: src/t-test.q:1319 +#: src/t-test.q:1344 msgid "Correlation" msgstr "" -#: src/t-test.q:1342 +#: src/t-test.q:1367 #, c-format msgid "%s & %s" msgstr "" diff --git a/po/pspp.pot b/po/pspp.pot index d2a9fa36..d1039a6c 100644 --- a/po/pspp.pot +++ b/po/pspp.pot @@ -7,8 +7,8 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" -"Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2004-03-12 16:21+0800\n" +"Report-Msgid-Bugs-To: pspp-dev@gnu.org\n" +"POT-Creation-Date: 2004-03-13 14:14+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -4211,8 +4211,8 @@ msgstr "" msgid "Total" msgstr "" -#: src/crosstabs.q:874 src/t-test.q:622 src/t-test.q:645 src/t-test.q:736 -#: src/t-test.q:1318 +#: src/crosstabs.q:874 src/t-test.q:655 src/t-test.q:678 src/t-test.q:769 +#: src/t-test.q:1343 msgid "N" msgstr "" @@ -4258,7 +4258,7 @@ msgstr "" msgid "Statistic" msgstr "" -#: src/crosstabs.q:1159 src/t-test.q:889 src/t-test.q:1090 src/t-test.q:1202 +#: src/crosstabs.q:1159 src/t-test.q:920 src/t-test.q:1121 src/t-test.q:1227 msgid "df" msgstr "" @@ -4295,11 +4295,11 @@ msgstr "" msgid " 95%% Confidence Interval" msgstr "" -#: src/crosstabs.q:1206 src/t-test.q:893 src/t-test.q:1087 src/t-test.q:1205 +#: src/crosstabs.q:1206 src/t-test.q:924 src/t-test.q:1118 src/t-test.q:1230 msgid "Lower" msgstr "" -#: src/crosstabs.q:1207 src/t-test.q:894 src/t-test.q:1088 src/t-test.q:1206 +#: src/crosstabs.q:1207 src/t-test.q:925 src/t-test.q:1119 src/t-test.q:1231 msgid "Upper" msgstr "" @@ -4436,8 +4436,8 @@ msgstr "" msgid "%s Dependent" msgstr "" -#: src/descript.q:151 src/frequencies.q:94 src/t-test.q:623 src/t-test.q:646 -#: src/t-test.q:735 src/t-test.q:1084 +#: src/descript.q:151 src/frequencies.q:94 src/t-test.q:656 src/t-test.q:679 +#: src/t-test.q:768 src/t-test.q:1115 msgid "Mean" msgstr "" @@ -4979,157 +4979,157 @@ msgstr "" msgid "Drive letter expected in WORKDEV subcommand." msgstr "" -#: src/t-test.q:208 +#: src/t-test.q:237 msgid "TESTVAL, GROUPS and PAIRS subcommands are mutually exclusive." msgstr "" -#: src/t-test.q:225 +#: src/t-test.q:254 msgid "VARIABLES subcommand is not appropriate with PAIRS" msgstr "" -#: src/t-test.q:317 src/t-test.q:396 +#: src/t-test.q:352 src/t-test.q:429 #, c-format msgid "`%s' is not a variable name" msgstr "" -#: src/t-test.q:330 +#: src/t-test.q:365 #, c-format msgid "Long string variable %s is not valid here." msgstr "" -#: src/t-test.q:347 +#: src/t-test.q:382 msgid "" "When applying GROUPS to a string variable, at least one value must be " "specified." msgstr "" -#: src/t-test.q:431 +#: src/t-test.q:464 #, c-format msgid "" "PAIRED was specified but the number of variables preceding WITH (%d) did not " "match the number following (%d)." msgstr "" -#: src/t-test.q:448 +#: src/t-test.q:481 msgid "At least two variables must be specified on PAIRS." msgstr "" -#: src/t-test.q:620 +#: src/t-test.q:653 msgid "One-Sample Statistics" msgstr "" -#: src/t-test.q:624 src/t-test.q:647 src/t-test.q:737 src/t-test.q:1085 +#: src/t-test.q:657 src/t-test.q:680 src/t-test.q:770 src/t-test.q:1116 msgid "Std. Deviation" msgstr "" -#: src/t-test.q:625 src/t-test.q:648 src/t-test.q:738 +#: src/t-test.q:658 src/t-test.q:681 src/t-test.q:771 msgid "SE. Mean" msgstr "" -#: src/t-test.q:642 +#: src/t-test.q:675 msgid "Group Statistics" msgstr "" -#: src/t-test.q:732 +#: src/t-test.q:765 msgid "Paired Sample Statistics" msgstr "" -#: src/t-test.q:754 src/t-test.q:1119 src/t-test.q:1339 +#: src/t-test.q:787 src/t-test.q:1144 src/t-test.q:1364 #, c-format msgid "Pair %d" msgstr "" -#: src/t-test.q:874 +#: src/t-test.q:905 msgid "Independent Samples Test" msgstr "" -#: src/t-test.q:882 +#: src/t-test.q:913 msgid "Levene's Test for Equality of Variances" msgstr "" -#: src/t-test.q:884 +#: src/t-test.q:915 msgid "t-test for Equality of Means" msgstr "" -#: src/t-test.q:886 +#: src/t-test.q:917 msgid "F" msgstr "" -#: src/t-test.q:887 src/t-test.q:1320 +#: src/t-test.q:918 src/t-test.q:1345 msgid "Sig." msgstr "" -#: src/t-test.q:888 src/t-test.q:1089 src/t-test.q:1201 +#: src/t-test.q:919 src/t-test.q:1120 src/t-test.q:1226 msgid "t" msgstr "" -#: src/t-test.q:890 src/t-test.q:1091 src/t-test.q:1203 +#: src/t-test.q:921 src/t-test.q:1122 src/t-test.q:1228 msgid "Sig. (2-tailed)" msgstr "" -#: src/t-test.q:891 src/t-test.q:1204 +#: src/t-test.q:922 src/t-test.q:1229 msgid "Mean Difference" msgstr "" -#: src/t-test.q:892 +#: src/t-test.q:923 msgid "Std. Error Difference" msgstr "" -#: src/t-test.q:897 src/t-test.q:1081 src/t-test.q:1196 +#: src/t-test.q:928 src/t-test.q:1112 src/t-test.q:1221 #, c-format msgid "%d%% Confidence Interval of the Difference" msgstr "" -#: src/t-test.q:931 +#: src/t-test.q:962 msgid "Equal variances assumed" msgstr "" -#: src/t-test.q:946 +#: src/t-test.q:977 #, c-format msgid "Error calculating F statistic (cdff returned %d)." msgstr "" -#: src/t-test.q:969 src/t-test.q:989 src/t-test.q:1027 src/t-test.q:1039 -#: src/t-test.q:1140 src/t-test.q:1166 src/t-test.q:1246 src/t-test.q:1263 -#: src/t-test.q:1354 +#: src/t-test.q:1000 src/t-test.q:1020 src/t-test.q:1058 src/t-test.q:1070 +#: src/t-test.q:1165 src/t-test.q:1191 src/t-test.q:1271 src/t-test.q:1288 +#: src/t-test.q:1377 #, c-format msgid "Error calculating T statistic (cdft returned %d)." msgstr "" -#: src/t-test.q:1003 +#: src/t-test.q:1034 msgid "Equal variances not assumed" msgstr "" -#: src/t-test.q:1071 +#: src/t-test.q:1102 msgid "Paired Samples Test" msgstr "" -#: src/t-test.q:1074 +#: src/t-test.q:1105 msgid "Paired Differences" msgstr "" -#: src/t-test.q:1086 +#: src/t-test.q:1117 msgid "Std. Error Mean" msgstr "" -#: src/t-test.q:1185 +#: src/t-test.q:1210 msgid "One-Sample Test" msgstr "" -#: src/t-test.q:1190 +#: src/t-test.q:1215 #, c-format msgid "Test Value = %f" msgstr "" -#: src/t-test.q:1315 +#: src/t-test.q:1340 msgid "Paired Samples Correlations" msgstr "" -#: src/t-test.q:1319 +#: src/t-test.q:1344 msgid "Correlation" msgstr "" -#: src/t-test.q:1342 +#: src/t-test.q:1367 #, c-format msgid "%s & %s" msgstr "" diff --git a/src/ChangeLog b/src/ChangeLog index 39611214..7097fc14 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,8 @@ +Sat Mar 13 14:19:52 WST 2004 John Darrington + + * t-test.q, levene.c: Fixed up the handling of MISSING values + int the T-TEST + Fri Mar 12 16:23:35 WST 2004 John Darrington * t-test.q, levene.c: Added support for T-TEST /GROUP where only diff --git a/src/levene.c b/src/levene.c index 1666e020..5617b7ba 100644 --- a/src/levene.c +++ b/src/levene.c @@ -83,18 +83,27 @@ struct levene_info /* The dependent variables */ struct variable **v_dep; + /* How to treat missing values */ + enum lev_missing missing; + + /* Function to test for missing values */ + is_missing_func is_missing; + }; void -levene(struct variable *v_indep, int n_dep, struct variable **v_dep) +levene(struct variable *v_indep, int n_dep, struct variable **v_dep, + enum lev_missing missing, is_missing_func value_is_missing) { struct levene_info l; - l.n_dep=n_dep; - l.v_indep=v_indep; - l.v_dep=v_dep; + l.n_dep = n_dep; + l.v_indep = v_indep; + l.v_dep = v_dep; + l.missing = missing; + l.is_missing = value_is_missing; procedure(levene_precalc, levene_calc, levene_postcalc, &l); procedure(levene2_precalc,levene2_calc,levene2_postcalc,&l); @@ -203,32 +212,50 @@ levene_precalc (void *_l) static int levene_calc (struct ccase *c, void *_l) { - int var; + int i; struct levene_info *l = (struct levene_info *) _l; union value *gv = &c->data[l->v_indep->fv]; struct group_statistics key; double weight = dict_get_case_weight(default_dict,c); + + + /* Skip the entire case if /MISSING=LISTWISE is set */ + if ( l->missing == LEV_LISTWISE ) + { + for (i = 0; i < l->n_dep; ++i) + { + struct variable *v = l->v_dep[i]; + union value *val = &c->data[v->fv]; + + if (l->is_missing(val,v) ) + { + return 0; + } + } + } + key.id = *gv; key.criterion = CMP_EQ; - for (var = 0; var < l->n_dep; ++var) + for (i = 0; i < l->n_dep; ++i) { + struct variable *var = l->v_dep[i]; double levene_z; - union value *v = &c->data[l->v_dep[var]->fv]; + union value *v = &c->data[var->fv]; struct group_statistics *gs; - gs = get_group(var,&key); + gs = get_group(i,&key); if ( 0 == gs ) continue ; - /* FIXME: handle SYSMIS properly */ - - levene_z= fabs(v->f - gs->mean); - lz[var].grand_total += levene_z * weight; - lz[var].total_n += weight; - - gs->lz_total += levene_z * weight; + if ( ! l->is_missing(v,var)) + { + levene_z= fabs(v->f - gs->mean); + lz[i].grand_total += levene_z * weight; + lz[i].total_n += weight; + gs->lz_total += levene_z * weight; + } } return 0; } @@ -280,7 +307,7 @@ levene2_precalc (void *_l) static int levene2_calc (struct ccase *c, void *_l) { - int var; + int i; struct levene_info *l = (struct levene_info *) _l; @@ -289,23 +316,39 @@ levene2_calc (struct ccase *c, void *_l) union value *gv = &c->data[l->v_indep->fv]; struct group_statistics key; + /* Skip the entire case if /MISSING=LISTWISE is set */ + if ( l->missing == LEV_LISTWISE ) + { + for (i = 0; i < l->n_dep; ++i) + { + struct variable *v = l->v_dep[i]; + union value *val = &c->data[v->fv]; + + if (l->is_missing(val,v) ) + { + return 0; + } + } + } + key.id = *gv; key.criterion = CMP_EQ; - for (var = 0; var < l->n_dep; ++var) + for (i = 0; i < l->n_dep; ++i) { double levene_z; - union value *v = &c->data[l->v_dep[var]->fv]; + struct variable *var = l->v_dep[i] ; + union value *v = &c->data[var->fv]; struct group_statistics *gs; - gs = get_group(var,&key); + gs = get_group(i,&key); if ( 0 == gs ) continue; - /* FIXME: handle SYSMIS properly */ - - levene_z = fabs(v->f - gs->mean); - - lz_denominator[var] += weight * sqr(levene_z - gs->lz_mean); + if ( ! l->is_missing(v,var) ) + { + levene_z = fabs(v->f - gs->mean); + lz_denominator[i] += weight * sqr(levene_z - gs->lz_mean); + } } return 0; diff --git a/src/levene.h b/src/levene.h index 8f19dead..37019524 100644 --- a/src/levene.h +++ b/src/levene.h @@ -25,6 +25,8 @@ #include "var.h" +/* What to do with missing values */ +enum lev_missing { LEV_ANALYSIS, LEV_LISTWISE }; /* Calculate the Levene statistic @@ -36,7 +38,9 @@ The dependent variables : v_dep; */ -void levene(struct variable *v_indep, int n_dep, struct variable **v_dep); + +void levene(struct variable *v_indep, int n_dep, struct variable **v_dep, + enum lev_missing, is_missing_func); diff --git a/src/t-test.q b/src/t-test.q index f9012555..2f8cd6fa 100644 --- a/src/t-test.q +++ b/src/t-test.q @@ -55,8 +55,12 @@ /* (declarations) */ /* (functions) */ + static struct cmd_t_test cmd; +/* Function to use for testing for missing values */ +static is_missing_func value_is_missing; + /* Variable for the GROUPS subcommand, if given. */ static struct variable *indep_var; @@ -68,12 +72,34 @@ static union value groups_values[2]; static enum comparison criteria[2]; + /* PAIRS: Number of pairs to be compared ; each pair. */ static int n_pairs = 0 ; struct pair { +#if 1 /* The variables comprising the pair */ struct variable *v[2]; +#endif + + /* The number of valid variable pairs */ + double n; + + /* The sum of the members */ + double sum[2]; + + /* sum of squares of the members */ + double ssq[2]; + + /* Std deviation of the members */ + double std_dev[2]; + + + /* Sample Std deviation of the members */ + double s_std_dev[2]; + + /* The means of the members */ + double mean[2]; /* The correlation coefficient between the variables */ double correlation; @@ -81,6 +107,9 @@ struct pair /* The sum of the differences */ double sum_of_diffs; + /* The sum of the products */ + double sum_of_prod; + /* The mean of the differences */ double mean_diff; @@ -258,6 +287,11 @@ cmd_t_test(void) } } + /* If /MISSING=INCLUDE is set, then user missing values are ignored */ + if (cmd.incl == TTS_INCLUDE ) + value_is_missing = is_system_missing; + else + value_is_missing = is_missing; procedure(common_precalc,common_calc,common_postcalc, NULL); @@ -271,7 +305,9 @@ cmd_t_test(void) break; case T_IND_SAMPLES: procedure(group_precalc,group_calc,group_postcalc, NULL); - levene(indep_var, cmd.n_variables, cmd.v_variables); + levene(indep_var, cmd.n_variables, cmd.v_variables, + (cmd.miss == TTS_LISTWISE)?LEV_LISTWISE:LEV_ANALYSIS , + value_is_missing); break; } @@ -290,7 +326,6 @@ cmd_t_test(void) free(pairs); pairs=0; - if ( mode == T_IND_SAMPLES) { int i; @@ -375,8 +410,6 @@ tts_custom_groups (struct cmd_t_test *cmd UNUSED) } - - static int tts_custom_pairs (struct cmd_t_test *cmd UNUSED) { @@ -764,15 +797,13 @@ ssbox_paired_populate(struct ssbox *ssb,struct cmd_t_test *cmd UNUSED) tab_text (ssb->t, 1, i*2+j+1, TAB_LEFT, pairs[i].v[j]->name); /* Values */ - tab_float (ssb->t,2, i*2+j+1, TAB_RIGHT, gs->mean, 8, 2); - tab_float (ssb->t,3, i*2+j+1, TAB_RIGHT, gs->n, 2, 0); - tab_float (ssb->t,4, i*2+j+1, TAB_RIGHT, gs->std_dev, 8, 3); - tab_float (ssb->t,5, i*2+j+1, TAB_RIGHT, gs->se_mean, 8, 3); + tab_float (ssb->t,2, i*2+j+1, TAB_RIGHT, pairs[i].mean[j], 8, 2); + tab_float (ssb->t,3, i*2+j+1, TAB_RIGHT, pairs[i].n, 2, 0); + tab_float (ssb->t,4, i*2+j+1, TAB_RIGHT, pairs[i].std_dev[j], 8, 3); + tab_float (ssb->t,5, i*2+j+1, TAB_RIGHT, pairs[i].std_dev[j]/sqrt(pairs[i].n), 8, 3); } - } - } /* Populate the one sample ssbox */ @@ -1106,13 +1137,7 @@ trbox_paired_populate(struct trbox *trb, double bound; double se_mean; - struct variable *v0 = pairs[i].v[0]; - struct variable *v1 = pairs[i].v[1]; - - struct group_statistics *gs0 = &v0->p.t_t.ugs; - struct group_statistics *gs1 = &v1->p.t_t.ugs; - - double n = gs0->n; + double n = pairs[i].n; double t; double df = n - 1; @@ -1146,12 +1171,13 @@ trbox_paired_populate(struct trbox *trb, tab_float(trb->t, 6, i+3, TAB_RIGHT, pairs[i].mean_diff + t * se_mean , 8, 4); - t = ( gs0->mean - gs1->mean) - / sqrt ( - ( sqr(gs0->s_std_dev) + sqr(gs1->s_std_dev) - - 2 * pairs[i].correlation * gs0->s_std_dev * gs1->s_std_dev ) - / (n-1) ) - ; + t = (pairs[i].mean[0] - pairs[i].mean[1]) + / sqrt ( + ( sqr (pairs[i].s_std_dev[0]) + sqr (pairs[i].s_std_dev[1]) - + 2 * pairs[i].correlation * + pairs[i].s_std_dev[0] * pairs[i].s_std_dev[1] ) + / (n - 1) + ); tab_float(trb->t, 7, i+3, TAB_RIGHT, t , 8,3 ); @@ -1160,7 +1186,6 @@ trbox_paired_populate(struct trbox *trb, which=1; cdft(&which, &p, &q, &t, &df, &status, &bound); - if ( 0 != status ) { msg( SE, _("Error calculating T statistic (cdft returned %d)."),status); @@ -1327,7 +1352,7 @@ pscbox(void) int status; double bound; - double df = pairs[i].v[0]->p.t_t.ugs.n -2; + double df = pairs[i].n -2; double correlation_t = pairs[i].correlation * sqrt(df) / @@ -1343,20 +1368,16 @@ pscbox(void) /* row data */ + tab_float(table, 2, i+1, TAB_RIGHT, pairs[i].n, 4, 0); tab_float(table, 3, i+1, TAB_RIGHT, pairs[i].correlation, 8, 3); - tab_float(table, 2, i+1, TAB_RIGHT, pairs[i].v[0]->p.t_t.ugs.n , 4, 0); - cdft(&which, &p, &q, &correlation_t, &df, &status, &bound); - if ( 0 != status ) { msg( SE, _("Error calculating T statistic (cdft returned %d)."),status); } - tab_float(table, 4, i+1, TAB_RIGHT, 2.0*(correlation_t>0?q:p), 8, 3); - } tab_submit(table); @@ -1374,6 +1395,33 @@ common_calc (struct ccase *c, void *aux UNUSED) double weight = dict_get_case_weight(default_dict,c); + + /* Skip the entire case if /MISSING=LISTWISE is set */ + if ( cmd.miss == TTS_LISTWISE ) + { + for(i=0; i< cmd.n_variables ; ++i) + { + struct variable *v = cmd.v_variables[i]; + union value *val = &c->data[v->fv]; + + if (value_is_missing(val,v) ) + { + return 0; + } + } + } + + /* Listwise has to be implicit if the independent variable is missing ?? */ + if ( cmd.sbc_groups ) + { + union value *gv = &c->data[indep_var->fv]; + if ( value_is_missing(gv,indep_var) ) + { + return 0; + } + } + + for(i=0; i< cmd.n_variables ; ++i) { struct group_statistics *gs; @@ -1382,7 +1430,7 @@ common_calc (struct ccase *c, void *aux UNUSED) gs= &cmd.v_variables[i]->p.t_t.ugs; - if (val->f != SYSMIS) + if (! value_is_missing(val,v) ) { gs->n+=weight; gs->sum+=weight * val->f; @@ -1444,6 +1492,21 @@ one_sample_calc (struct ccase *c, void *aux UNUSED) double weight = dict_get_case_weight(default_dict,c); + /* Skip the entire case if /MISSING=LISTWISE is set */ + if ( cmd.miss == TTS_LISTWISE ) + { + for(i=0; i< cmd.n_variables ; ++i) + { + struct variable *v = cmd.v_variables[i]; + union value *val = &c->data[v->fv]; + + if (value_is_missing(val,v) ) + { + return 0; + } + } + } + for(i=0; i< cmd.n_variables ; ++i) { struct group_statistics *gs; @@ -1452,7 +1515,7 @@ one_sample_calc (struct ccase *c, void *aux UNUSED) gs= &cmd.v_variables[i]->p.t_t.ugs; - if (val->f != SYSMIS) + if ( ! value_is_missing(val,v)) gs->sum_diff += weight * (val->f - cmd.n_testval); } @@ -1518,9 +1581,13 @@ paired_precalc (void *aux UNUSED) for(i=0; i < n_pairs ; ++i ) { - pairs[i].correlation=0; - pairs[i].sum_of_diffs=0; - pairs[i].ssq_diffs=0; + pairs[i].n = 0; + pairs[i].sum[0] = 0; pairs[i].sum[1] = 0; + pairs[i].ssq[0] = 0; pairs[i].ssq[1] = 0; + pairs[i].sum_of_prod = 0; + pairs[i].correlation = 0; + pairs[i].sum_of_diffs = 0; + pairs[i].ssq_diffs = 0; } } @@ -1531,6 +1598,28 @@ paired_calc (struct ccase *c, void *aux UNUSED) { int i; + double weight = dict_get_case_weight(default_dict,c); + + /* Skip the entire case if /MISSING=LISTWISE is set , + AND one member of a pair is missing */ + if ( cmd.miss == TTS_LISTWISE ) + { + for(i=0; i < n_pairs ; ++i ) + { + struct variable *v0 = pairs[i].v[0]; + struct variable *v1 = pairs[i].v[1]; + + union value *val0 = &c->data[v0->fv]; + union value *val1 = &c->data[v1->fv]; + + if ( value_is_missing(val0,v0) || + value_is_missing(val1,v1) ) + { + return 0; + } + } + } + for(i=0; i < n_pairs ; ++i ) { struct variable *v0 = pairs[i].v[0]; @@ -1539,13 +1628,28 @@ paired_calc (struct ccase *c, void *aux UNUSED) union value *val0 = &c->data[v0->fv]; union value *val1 = &c->data[v1->fv]; - pairs[i].correlation += ( val0->f - pairs[i].v[0]->p.t_t.ugs.mean ) - * - ( val1->f - pairs[i].v[1]->p.t_t.ugs.mean ); + if ( ( !value_is_missing(val0,v0) && !value_is_missing(val1,v1) ) ) + { + pairs[i].n += weight; + pairs[i].sum[0] += weight * val0->f; + pairs[i].sum[1] += weight * val1->f; + + pairs[i].ssq[0] += weight * sqr(val0->f); + pairs[i].ssq[1] += weight * sqr(val1->f); - pairs[i].sum_of_diffs += val0->f - val1->f ; - pairs[i].ssq_diffs += sqr(val0->f - val1->f); +#if 0 + pairs[i].correlation += weight * + ( val0->f - pairs[i].v[0]->p.t_t.ugs.mean ) + * + ( val1->f - pairs[i].v[1]->p.t_t.ugs.mean ); +#endif + pairs[i].sum_of_prod += weight * val0->f * val1->f ; + + + pairs[i].sum_of_diffs += weight * ( val0->f - val1->f ) ; + pairs[i].ssq_diffs += weight * sqr(val0->f - val1->f); + } } return 0; @@ -1558,11 +1662,33 @@ paired_postcalc (void *aux UNUSED) for(i=0; i < n_pairs ; ++i ) { - const double n = pairs[i].v[0]->p.t_t.ugs.n ; + int j; + const double n = pairs[i].n; + + for (j=0; j < 2 ; ++j) + { + pairs[i].mean[j] = pairs[i].sum[j] / n ; + pairs[i].s_std_dev[j] = sqrt((pairs[i].ssq[j] / n - + sqr(pairs[i].mean[j])) + ); + + pairs[i].std_dev[j] = sqrt(n/(n-1)*(pairs[i].ssq[j] / n - + sqr(pairs[i].mean[j])) + ); + } + + pairs[i].correlation = pairs[i].sum_of_prod / pairs[i].n - + pairs[i].mean[0] * pairs[i].mean[1] ; + /* correlation now actually contains the covariance */ + pairs[i].correlation /= pairs[i].std_dev[0] * pairs[i].std_dev[1]; + pairs[i].correlation *= pairs[i].n / ( pairs[i].n - 1 ); + +#if 0 pairs[i].correlation /= pairs[i].v[0]->p.t_t.ugs.std_dev * pairs[i].v[1]->p.t_t.ugs.std_dev ; - pairs[i].correlation /= pairs[i].v[0]->p.t_t.ugs.n -1; + pairs[i].correlation /= n - 1; +#endif pairs[i].mean_diff = pairs[i].sum_of_diffs / n ; @@ -1653,6 +1779,26 @@ group_calc (struct ccase *c, void *aux UNUSED) double weight = dict_get_case_weight(default_dict,c); + if ( value_is_missing(gv,indep_var) ) + { + return 0; + } + + if ( cmd.miss == TTS_LISTWISE ) + { + for(i=0; i< cmd.n_variables ; ++i) + { + struct variable *v = cmd.v_variables[i]; + union value *val = &c->data[v->fv]; + + if (value_is_missing(val,v) ) + { + return 0; + } + } + } + + gv = &c->data[indep_var->fv]; g = get_group(gv,indep_var); @@ -1664,13 +1810,18 @@ group_calc (struct ccase *c, void *aux UNUSED) for(i=0; i< cmd.n_variables ; ++i) { - struct group_statistics *gs = &cmd.v_variables[i]->p.t_t.gs[g]; + struct variable *var = cmd.v_variables[i]; - union value *val=&c->data[cmd.v_variables[i]->fv]; + struct group_statistics *gs = &var->p.t_t.gs[g]; - gs->n+=weight; - gs->sum+=weight * val->f; - gs->ssq+=weight * sqr(val->f); + union value *val=&c->data[var->fv]; + + if ( !value_is_missing(val,var) ) + { + gs->n+=weight; + gs->sum+=weight * val->f; + gs->ssq+=weight * sqr(val->f); + } } return 0; diff --git a/src/var.h b/src/var.h index 11add5b0..60298513 100644 --- a/src/var.h +++ b/src/var.h @@ -405,6 +405,7 @@ void cancel_temporary (void); /* Functions. */ void dump_split_vars (const struct ccase *); +typedef int (* is_missing_func )(const union value *, const struct variable *); int is_num_user_missing (double, const struct variable *); int is_str_user_missing (const unsigned char[], const struct variable *); diff --git a/tests/Makefile.am b/tests/Makefile.am index 229b5e06..32d838aa 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -25,6 +25,12 @@ TESTS = command/aggregate.sh \ command/t-test-pairs.sh \ command/t-test-groups.sh \ command/t-test-1-indep-val.sh \ + command/t-test-1-sample-missing-list.sh \ + command/t-test-paired-missing-list.sh \ + command/t-test-paired-missing-anal.sh \ + command/t-test-1-sample-missing-anal.sh \ + command/t-test-indep-missing-list.sh \ + command/t-test-indep-missing-anal.sh \ command/weight.sh \ bugs/alpha-freq.sh \ bugs/compute-fmt.sh \ diff --git a/tests/command/t-test-1-sample-missing-anal.sh b/tests/command/t-test-1-sample-missing-anal.sh new file mode 100755 index 00000000..bd6a2249 --- /dev/null +++ b/tests/command/t-test-1-sample-missing-anal.sh @@ -0,0 +1,101 @@ +#!/bin/sh + +# This program tests that the T-TEST /TESTVAL command works OK +# when there are per analysis missing values involved. + +TEMPDIR=/tmp/pspp-tst-$$ + +here=`pwd`; + +# ensure that top_srcdir is absolute +cd $top_srcdir; top_srcdir=`pwd` + +export STAT_CONFIG_PATH=$top_srcdir/config + + +cleanup() +{ + rm -rf $TEMPDIR +} + + +fail() +{ + echo $activity + echo FAILED + cleanup; + exit 1; +} + + +no_result() +{ + echo $activity + echo NO RESULT; + cleanup; + exit 2; +} + +pass() +{ + cleanup; + exit 0; +} + +mkdir -p $TEMPDIR + +cd $TEMPDIR + +activity="create program 1" +cat > $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat < $TEMPDIR/out.stat <