From: John Darrington Date: Fri, 29 Oct 2004 12:07:54 +0000 (+0000) Subject: Fixed the t-test model to be consistent with the anova model. X-Git-Tag: v0.4.0~249 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c33f6387da9826d640ef1f21068bf5c94055ba77;p=pspp-builds.git Fixed the t-test model to be consistent with the anova model. --- diff --git a/src/ChangeLog b/src/ChangeLog index e8850ced..9ae138ff 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,15 @@ +Fri Oct 29 17:39:03 WST 2004 John Darrington + + * group.c group.h group_proc.h levene.c oneway.q t-test.q + + Made the t-test more consistent + with the way it handles groups. That is, it now uses a hash instead + of an array of 2. Also, made the levene.c file independent of the + implementation of the t-test. So now levene should be fine for both + t-test and anova. + + * Added an oneway.q file for one way anova + Wed Jun 2 22:08:02 2004 Ben Pfaff * descript.c: (cmd_descriptives) Remove harmless but bogus test in diff --git a/src/group.c b/src/group.c index fb86295b..e394745e 100644 --- a/src/group.c +++ b/src/group.c @@ -29,16 +29,7 @@ compare_group(const struct group_statistics *a, const struct group_statistics *b, int width) { - int id_cmp = compare_values(&a->id, &b->id, width); - - if (id_cmp == 0 ) - { - int c; - c= memcmp(&a->criterion,&b->criterion,sizeof(enum comparison)); - return c; - } - else - return id_cmp; + return compare_values(&a->id, &b->id, width); } diff --git a/src/group.h b/src/group.h index a195f06a..c9050604 100644 --- a/src/group.h +++ b/src/group.h @@ -29,10 +29,7 @@ enum comparison { CMP_LE = -2, - CMP_LT = -1, CMP_EQ = 0, - CMP_GT = 1, - CMP_GE = 2 }; @@ -42,11 +39,6 @@ struct group_statistics /* The value of the independent variable for this group */ union value id; - /* The criterium matching for comparing with id - (applicable only to T-TEST) FIXME: therefore it shouldn't be here - */ - enum comparison criterion; - /* The arithmetic mean */ double mean; diff --git a/src/group_proc.h b/src/group_proc.h index 7caf95e8..af1b9758 100644 --- a/src/group_proc.h +++ b/src/group_proc.h @@ -36,9 +36,6 @@ struct group_proc /* The levene statistic */ double levene ; - /* Stats for individual groups */ - struct group_statistics *gs; - /* A hash of group statistics keyed by the value of the independent variable */ struct hsh_table *group_hash; diff --git a/src/levene.c b/src/levene.c index 67d8c374..4659f199 100644 --- a/src/levene.c +++ b/src/levene.c @@ -55,8 +55,6 @@ */ -static struct group_statistics *get_group(int v, struct group_statistics *key); - struct levene_info { @@ -132,8 +130,6 @@ levene(const struct casefile *cf, } -static struct hsh_table **hash; - /* Internal variables used in calculating the Levene statistic */ /* Per variable statistics */ @@ -155,9 +151,6 @@ struct lz_stats /* An array of lz_stats for each variable */ static struct lz_stats *lz; -/* Set to 1 if the groups require inequality comparisions */ -static int inequality_compare; - static void levene_precalc (const struct levene_info *l) @@ -166,33 +159,13 @@ levene_precalc (const struct levene_info *l) lz = xmalloc (sizeof (struct lz_stats ) * l->n_dep ) ; - hash = xmalloc (sizeof ( struct hsh_table *) * l->n_dep ); - for(i=0; i < l->n_dep ; ++i ) { struct variable *v = l->v_dep[i]; - int g; - int number_of_groups = v->p.grp_data.n_groups ; - - hash[i] = hsh_create (l->n_dep * number_of_groups, - (hsh_compare_func *) compare_group, - (hsh_hash_func *) hash_group, - 0,(void *) l->v_indep->width); lz[i].grand_total = 0; lz[i].total_n = 0; - lz[i].n_groups = number_of_groups; - - for (g = 0 ; g < v->p.grp_data.n_groups ; ++g ) - { - struct group_statistics *gs = &v->p.grp_data.gs[g]; - gs->lz_total = 0; - hsh_insert(hash[i], gs); - if ( gs->criterion != CMP_EQ ) - { - inequality_compare = 1; - } - } + lz[i].n_groups = v->p.grp_data.n_groups ; } } @@ -207,7 +180,6 @@ levene_calc (const struct ccase *c, void *_l) struct group_statistics key; double weight = dict_get_case_weight(default_dict,c,&warn); - /* Skip the entire case if /MISSING=LISTWISE is set */ if ( l->missing == LEV_LISTWISE ) { @@ -225,7 +197,6 @@ levene_calc (const struct ccase *c, void *_l) key.id = *gv; - key.criterion = CMP_EQ; for (i = 0; i < l->n_dep; ++i) { @@ -233,7 +204,9 @@ levene_calc (const struct ccase *c, void *_l) double levene_z; const union value *v = case_data (c, var->fv); struct group_statistics *gs; - gs = get_group(i,&key); + + gs = hsh_find(var->p.grp_data.group_hash,(void *) &key ); + if ( 0 == gs ) continue ; @@ -283,9 +256,14 @@ levene2_precalc (void *_l) { struct hsh_iterator hi; struct group_statistics *g; - for(g = (struct group_statistics *) hsh_first(hash[v],&hi); + + struct variable *var = l->v_dep[v] ; + struct hsh_table *hash = var->p.grp_data.group_hash; + + + for(g = (struct group_statistics *) hsh_first(hash,&hi); g != 0 ; - g = (struct group_statistics *) hsh_next(hash[v],&hi) ) + g = (struct group_statistics *) hsh_next(hash,&hi) ) { g->lz_mean = g->lz_total/g->n ; } @@ -322,7 +300,6 @@ levene2_calc (const struct ccase *c, void *_l) } key.id = *gv; - key.criterion = CMP_EQ; for (i = 0; i < l->n_dep; ++i) { @@ -330,7 +307,9 @@ levene2_calc (const struct ccase *c, void *_l) struct variable *var = l->v_dep[i] ; const union value *v = case_data (c, var->fv); struct group_statistics *gs; - gs = get_group(i,&key); + + gs = hsh_find(var->p.grp_data.group_hash,(void *) &key ); + if ( 0 == gs ) continue; @@ -357,9 +336,13 @@ levene2_postcalc (void *_l) double lz_numerator = 0; struct hsh_iterator hi; struct group_statistics *g; - for(g = (struct group_statistics *) hsh_first(hash[v],&hi); + + struct variable *var = l->v_dep[v] ; + struct hsh_table *hash = var->p.grp_data.group_hash; + + for(g = (struct group_statistics *) hsh_first(hash,&hi); g != 0 ; - g = (struct group_statistics *) hsh_next(hash[v],&hi) ) + g = (struct group_statistics *) hsh_next(hash,&hi) ) { lz_numerator += g->n * pow2(g->lz_mean - lz[v].grand_mean ); @@ -376,55 +359,6 @@ levene2_postcalc (void *_l) /* Now clear up after ourselves */ free(lz_denominator); - for (v = 0; v < l->n_dep; ++v) - { - hsh_destroy(hash[v]); - } - - free(hash); free(lz); } - -/* Return the group belonging to the v_th dependent variable - which matches the key */ -static struct group_statistics * -get_group(int v, struct group_statistics *key) -{ - struct group_statistics *gs; - gs = hsh_find(hash[v],key); - - - if ( ( !gs ) && inequality_compare) - { - /* Here we degrade to a linear search. - This would seem inefficient. However, it should only ever happen - with the T-TEST, for which there are exactly two groups */ - - struct hsh_iterator hi; - - assert( hsh_count(hash[v]) == 2 ) ; - for(gs = (struct group_statistics *) hsh_first(hash[v],&hi); - gs != 0 ; - gs = (struct group_statistics *) hsh_next(hash[v],&hi) ) - { - int cmp; - - cmp = compare_values(&gs->id, &key->id, 0); - - assert( cmp != 0 ); /* or else the hash would have found something */ - - if ( cmp == -1 && - ( gs->criterion == CMP_GT || gs->criterion == CMP_GE ) - ) - break; - - if ( cmp == 1 && - ( gs->criterion == CMP_LT || gs->criterion == CMP_LE ) - ) - break; - } - } - - return gs; -} diff --git a/src/oneway.q b/src/oneway.q index dbbd79fb..7477a3f0 100644 --- a/src/oneway.q +++ b/src/oneway.q @@ -868,7 +868,6 @@ precalc ( struct cmd_oneway *cmd UNUSED ) (void *) indep_var->width ); - totals->criterion = CMP_EQ; totals->sum=0; totals->n=0; totals->ssq=0; @@ -929,7 +928,6 @@ calculate(const struct casefile *cf, void *cmd_) gs = (struct group_statistics *) xmalloc (sizeof(struct group_statistics)); - gs->criterion = CMP_EQ; gs->id = *indep_val; gs->sum=0; gs->n=0; diff --git a/src/t-test.q b/src/t-test.q index 2993ac91..a991f29e 100644 --- a/src/t-test.q +++ b/src/t-test.q @@ -69,9 +69,29 @@ static struct variable *indep_var; /* GROUPS: Number of values specified by the user; the values specified if any. */ -static int n_group_values; -static union value groups_values[2]; -static enum comparison criteria[2]; + +struct group_properties +{ + /* The comparison criterion */ + enum comparison criterion; + + /* The width of the independent variable */ + int indep_width ; + + union { + /* The value of the independent variable at which groups are determined to + belong to one group or the other */ + double critical_value; + + + /* The values of the independent variable for each group */ + union value g_value[2]; + } v ; + +}; + + +static struct group_properties gp ; @@ -211,6 +231,17 @@ static struct cmd_t_test cmd; static int bad_weight_warn; + +static int compare_group_binary(const struct group_statistics *a, + const struct group_statistics *b, + struct group_properties *p); + + +static unsigned hash_group_binary(const struct group_statistics *g, + struct group_properties *p); + + + int cmd_t_test(void) { @@ -305,11 +336,12 @@ cmd_t_test(void) if ( mode == T_IND_SAMPLES) { - int i; + int v; /* Destroy any group statistics we created */ - for (i= 0 ; i < cmd.n_variables ; ++i ) + for (v = 0 ; v < cmd.n_variables ; ++v ) { - free(cmd.v_variables[i]->p.grp_data.gs); + struct group_proc *grpp = &cmd.v_variables[v]->p.grp_data; + free(grpp->group_hash); } } @@ -319,6 +351,7 @@ cmd_t_test(void) static int tts_custom_groups (struct cmd_t_test *cmd UNUSED) { + int n_group_values=0; lex_match('='); @@ -348,10 +381,13 @@ tts_custom_groups (struct cmd_t_test *cmd UNUSED) { if (indep_var->type == NUMERIC) { - groups_values[0].f = 1; - groups_values[1].f = 2; - criteria[0] = criteria[1] = CMP_EQ; + gp.v.g_value[0].f = 1; + gp.v.g_value[1].f = 2; + + gp.criterion = CMP_EQ; + n_group_values = 2; + return 1; } else @@ -362,27 +398,32 @@ tts_custom_groups (struct cmd_t_test *cmd UNUSED) } } - if (!parse_value (&groups_values[0],indep_var->type)) + if (!parse_value (&gp.v.g_value[0],indep_var->type)) return 0; lex_match (','); if (lex_match (')')) { - criteria[0] = CMP_LE; - criteria[1] = CMP_GT; - groups_values[1] = groups_values[0]; + gp.criterion = CMP_LE; + gp.v.critical_value = gp.v.g_value[0].f; + n_group_values = 1; return 1; } - if (!parse_value (&groups_values[1],indep_var->type)) + if (!parse_value (&gp.v.g_value[1],indep_var->type)) return 0; - + n_group_values = 2; if (!lex_force_match (')')) return 0; - criteria[0] = criteria[1] = CMP_EQ; + if ( n_group_values == 2 ) + gp.criterion = CMP_EQ ; + else + gp.criterion = CMP_LE ; + + return 1; } @@ -556,6 +597,7 @@ void ssbox_independent_samples_init(struct ssbox *this, void ssbox_paired_init(struct ssbox *this, struct cmd_t_test *cmd); + /* Factory to create an ssbox */ void ssbox_create(struct ssbox *ssb, struct cmd_t_test *cmd, int mode) @@ -577,6 +619,7 @@ ssbox_create(struct ssbox *ssb, struct cmd_t_test *cmd, int mode) } + /* Despatcher for the populate method */ void ssbox_populate(struct ssbox *ssb,struct cmd_t_test *cmd) @@ -600,6 +643,8 @@ ssbox_base_finalize(struct ssbox *ssb) tab_submit(ssb->t); } + + /* Initialize a ssbox struct */ void ssbox_base_init(struct ssbox *this, int cols,int rows) @@ -669,31 +714,41 @@ ssbox_independent_samples_populate(struct ssbox *ssb, char *val_lab0=0; char *val_lab1=0; + double indep_value[2]; char prefix[2][3]={"",""}; if ( indep_var->type == NUMERIC ) { - val_lab0 = val_labs_find( indep_var->val_labs,groups_values[0]); - val_lab1 = val_labs_find( indep_var->val_labs,groups_values[1]); + val_lab0 = val_labs_find( indep_var->val_labs,gp.v.g_value[0]); + val_lab1 = val_labs_find( indep_var->val_labs,gp.v.g_value[1]); } else { - val_lab0 = groups_values[0].s; - val_lab1 = groups_values[1].s; + val_lab0 = gp.v.g_value[0].s; + val_lab1 = gp.v.g_value[1].s; } - if (n_group_values == 1) + if (gp.criterion == CMP_LE ) { strcpy(prefix[0],"< "); strcpy(prefix[1],">="); + indep_value[0] = gp.v.critical_value; + indep_value[1] = gp.v.critical_value; + } + else + { + indep_value[0] = gp.v.g_value[0].f; + indep_value[1] = gp.v.g_value[1].f; } assert(ssb->t); for (i=0; i < cmd->n_variables; ++i) { - int g; + struct variable *var = cmd->v_variables[i]; + struct hsh_table *grp_hash = var->p.grp_data.group_hash; + int count=0; tab_text (ssb->t, 0, i*2+1, TAB_LEFT, cmd->v_variables[i]->name); @@ -701,26 +756,50 @@ ssbox_independent_samples_populate(struct ssbox *ssb, tab_text (ssb->t, 1, i*2+1, TAB_LEFT | TAT_PRINTF, "%s%s", prefix[0], val_lab0); else - tab_text (ssb->t, 1, i*2+1, TAB_LEFT | TAT_PRINTF, - "%s%g", prefix[0], groups_values[0].f); + tab_text (ssb->t, 1, i*2+1, TAB_LEFT | TAT_PRINTF, + "%s%g", prefix[0], indep_value[0]); if (val_lab1) tab_text (ssb->t, 1, i*2+1+1, TAB_LEFT | TAT_PRINTF, "%s%s", prefix[1], val_lab1); else - tab_text (ssb->t, 1, i*2+1+1, TAB_LEFT | TAT_PRINTF, - "%s%g", prefix[1], groups_values[1].f); + tab_text (ssb->t, 1, i*2+1+1, TAB_LEFT | TAT_PRINTF, + "%s%g", prefix[1], indep_value[1]); + /* Fill in the group statistics */ - for ( g=0; g < 2 ; ++g ) + for ( count = 0 ; count < 2 ; ++count ) { - struct group_statistics *gs = &cmd->v_variables[i]->p.grp_data.gs[g]; + union value search_val; + + struct group_statistics *gs; + + if ( gp.criterion == CMP_LE ) + { + if ( count == 0 ) + { + /* less than ( < ) case */ + search_val.f = gp.v.critical_value - 1.0; + } + else + { + /* >= case */ + search_val.f = gp.v.critical_value + 1.0; + } + } + else + { + search_val = gp.v.g_value[count]; + } + + gs = hsh_find(grp_hash, (void *) &search_val); + assert(gs); - tab_float(ssb->t, 2 ,i*2+g+1, TAB_RIGHT, gs->n, 2, 0); - tab_float(ssb->t, 3 ,i*2+g+1, TAB_RIGHT, gs->mean, 8, 2); - tab_float(ssb->t, 4 ,i*2+g+1, TAB_RIGHT, gs->std_dev, 8, 3); - tab_float(ssb->t, 5 ,i*2+g+1, TAB_RIGHT, gs->se_mean, 8, 3); + tab_float(ssb->t, 2 ,i*2+count+1, TAB_RIGHT, gs->n, 2, 0); + tab_float(ssb->t, 3 ,i*2+count+1, TAB_RIGHT, gs->mean, 8, 2); + tab_float(ssb->t, 4 ,i*2+count+1, TAB_RIGHT, gs->std_dev, 8, 3); + tab_float(ssb->t, 5 ,i*2+count+1, TAB_RIGHT, gs->se_mean, 8, 3); } } } @@ -929,8 +1008,31 @@ trbox_independent_samples_populate(struct trbox *self, double std_err_diff; double mean_diff; - struct group_statistics *gs0 = &cmd->v_variables[i]->p.grp_data.gs[0]; - struct group_statistics *gs1 = &cmd->v_variables[i]->p.grp_data.gs[1]; + struct variable *var = cmd->v_variables[i]; + + struct hsh_table *grp_hash = var->p.grp_data.group_hash; + + struct group_statistics *gs0 ; + struct group_statistics *gs1 ; + + union value search_val; + + if ( gp.criterion == CMP_LE ) + search_val.f = gp.v.critical_value - 1.0; + else + search_val = gp.v.g_value[0]; + + gs0 = hsh_find(grp_hash, (void *) &search_val); + assert(gs0); + + if ( gp.criterion == CMP_LE ) + search_val.f = gp.v.critical_value + 1.0; + else + search_val = gp.v.g_value[1]; + + gs1 = hsh_find(grp_hash, (void *) &search_val); + assert(gs1); + tab_text (self->t, 0, i*2+3, TAB_LEFT, cmd->v_variables[i]->name); @@ -1296,6 +1398,7 @@ pscbox(void) + /* Calculation Implementation */ /* Per case calculations common to all variants of the T test */ @@ -1602,44 +1705,6 @@ paired_postcalc (struct cmd_t_test *cmd UNUSED) } } -/* Return the group # corresponding to the - independent variable with the value val -*/ -static int -get_group(const union value *val, struct variable *indep) -{ - int i; - - for (i = 0; i < 2 ; ++i ) - { - const int cmp = compare_values(val,&groups_values[i],indep->width) ; - switch ( criteria[i]) - { - case CMP_EQ: - if ( 0 == cmp ) return i; - break; - case CMP_LT: - if ( 0 > cmp ) return i; - break; - case CMP_LE: - if ( cmp <= 0 ) return i; - break; - case CMP_GT: - if ( cmp > 0 ) return i; - break; - case CMP_GE: - if ( cmp >= 0 ) return i; - break; - default: - assert(0); - }; - } - - /* No groups matched */ - return -1; -} - - static void group_precalc (struct cmd_t_test *cmd ) { @@ -1652,19 +1717,39 @@ group_precalc (struct cmd_t_test *cmd ) /* There's always 2 groups for a T - TEST */ ttpr->n_groups = 2; - ttpr->gs = xmalloc(sizeof(struct group_statistics) * 2) ; + + gp.indep_width = indep_var->width; + + ttpr->group_hash = hsh_create(2, + (hsh_compare_func *) compare_group_binary, + (hsh_hash_func *) hash_group_binary, + (hsh_free_func *) free_group, + (void *) &gp ); for (j=0 ; j < 2 ; ++j) { - ttpr->gs[j].sum = 0; - ttpr->gs[j].n = 0; - ttpr->gs[j].ssq = 0; + + struct group_statistics *gs = (struct group_statistics *) + xmalloc (sizeof(struct group_statistics)); + + gs->sum = 0; + gs->n = 0; + gs->ssq = 0; - if ( n_group_values == 2 ) - ttpr->gs[j].id = groups_values[j]; + if ( gp.criterion == CMP_EQ ) + { + gs->id = gp.v.g_value[j]; + } else - ttpr->gs[j].id = groups_values[0]; - ttpr->gs[j].criterion = criteria[j]; + { + if ( j == 0 ) + gs->id.f = gp.v.critical_value - 1.0 ; + else + gs->id.f = gp.v.critical_value + 1.0 ; + } + + hsh_insert ( ttpr->group_hash, (void *) gs ); + } } @@ -1674,7 +1759,6 @@ static int group_calc (const struct ccase *c, struct cmd_t_test *cmd) { int i; - int g; const union value *gv = case_data (c, indep_var->fv); @@ -1699,24 +1783,21 @@ group_calc (const struct ccase *c, struct cmd_t_test *cmd) } } - gv = case_data (c, indep_var->fv); - g = get_group(gv,indep_var); - - - /* If the independent variable doesn't match either of the values - for this case then move on to the next case */ - if (g == -1 ) - return 0; - for(i=0; i< cmd->n_variables ; ++i) { struct variable *var = cmd->v_variables[i]; + const union value *val = case_data (c, var->fv); + struct hsh_table *grp_hash = var->p.grp_data.group_hash; + struct group_statistics *gs; - struct group_statistics *gs = &var->p.grp_data.gs[g]; + gs = hsh_find(grp_hash, (void *) gv); - const union value *val = case_data (c, var->fv); + /* If the independent variable doesn't match either of the values + for this case then move on to the next case */ + if ( ! gs ) + return 0; if ( !value_is_missing(val,var) ) { @@ -1734,28 +1815,34 @@ static void group_postcalc ( struct cmd_t_test *cmd ) { int i; - int j; for(i=0; i< cmd->n_variables ; ++i) { - for (j=0 ; j < 2 ; ++j) - { - struct group_statistics *gs; - gs=&cmd->v_variables[i]->p.grp_data.gs[j]; + struct variable *var = cmd->v_variables[i]; + struct hsh_table *grp_hash = var->p.grp_data.group_hash; + struct hsh_iterator g; + struct group_statistics *gs; + int count=0; + for (gs = hsh_first (grp_hash,&g); + gs != 0; + gs = hsh_next(grp_hash,&g)) + { gs->mean = gs->sum / gs->n; gs->s_std_dev= sqrt( - ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) - ) ; + ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) + ) ; gs->std_dev= sqrt( - gs->n/(gs->n-1) * - ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) - ) ; + gs->n/(gs->n-1) * + ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) + ) ; gs->se_mean = gs->std_dev / sqrt(gs->n); + count ++; } + assert(count == 2); } } @@ -1820,7 +1907,6 @@ calculate(const struct casefile *cf, void *cmd_) casereader_destroy (r); group_postcalc(cmd); - levene(cf, indep_var, cmd->n_variables, cmd->v_variables, (cmd->miss == TTS_LISTWISE)?LEV_LISTWISE:LEV_ANALYSIS , value_is_missing); @@ -1839,3 +1925,62 @@ calculate(const struct casefile *cf, void *cmd_) trbox_finalize(&test_results_box); } + + +/* Return -1 if the id of a is less than b; +1 if greater than and + 0 if equal */ +static int +compare_group_binary(const struct group_statistics *a, + const struct group_statistics *b, + struct group_properties *p) +{ + + short flag_a; + short flag_b; + + assert(p->indep_width == 0 ) ; + + if ( p->criterion == CMP_LE ) + { + flag_a = ( a->id.f < p->v.critical_value ) ; + flag_b = ( b->id.f < p->v.critical_value ) ; + } + else + { + flag_a = ( a->id.f == p->v.critical_value ) ; + flag_b = ( b->id.f == p->v.critical_value ) ; + } + + + if ( flag_a == flag_b) + return 0 ; + + return ( flag_a < flag_b); +} + +static unsigned +hash_group_binary(const struct group_statistics *g, struct group_properties *p) +{ + short flag = -1; + + assert(p->indep_width == 0 ) ; + + /* FIXME: should compare union values */ + if ( p->criterion == CMP_LE ) + { + flag = ( g->id.f < p->v.critical_value ) ; + } + else if ( p->criterion == CMP_EQ) + { + if ( g->id.f == p->v.g_value[0].f ) + flag = 0 ; + else if ( g->id.f == p->v.g_value[1].f ) + flag = 1; + else + flag = 2; + } + else + assert(0); + + return flag; +}