X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Ffactor_stats.c;h=16e1930d58188723db0ae4a7f64c3c9c9f083447;hb=f1cd7ca88d074b671844ef073b364e069672ce66;hp=cb2197ada76e1b32bd365def88d9e73c165dbb39;hpb=9c01f251cf0e5b5eb3899fc7c62cc595f3d48511;p=pspp-builds.git diff --git a/src/factor_stats.c b/src/factor_stats.c index cb2197ad..16e1930d 100644 --- a/src/factor_stats.c +++ b/src/factor_stats.c @@ -35,7 +35,10 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA void metrics_precalc(struct metrics *fs) { + assert (fs) ; + fs->n = 0; + fs->n_missing = 0; fs->ssq = 0; fs->sum = 0; fs->min = DBL_MAX; @@ -44,18 +47,29 @@ metrics_precalc(struct metrics *fs) fs->ordered_data = hsh_create(20, (hsh_compare_func *) compare_values, (hsh_hash_func *) hash_value, - 0, + (hsh_free_func *) weighted_value_free, (void *) 0); + } + +/* Include val in the calculation for the metrics. + If val is null, then treat it as MISSING +*/ void -metrics_calc(struct metrics *fs, const union value *val, double weight) +metrics_calc(struct metrics *fs, const union value *val, + double weight, int case_no) { - - struct weighted_value **wv; - const double x = val->f; + double x; + if ( ! val ) + { + fs->n_missing += weight; + return ; + } + + x = val->f; fs->n += weight; fs->ssq += x * x * weight; fs->sum += x * weight; @@ -69,23 +83,38 @@ metrics_calc(struct metrics *fs, const union value *val, double weight) if ( *wv ) { /* If this value has already been seen, then simply - increase its weight */ + increase its weight and push a new case number */ + + struct case_node *cn; assert( (*wv)->v.f == val->f ); (*wv)->w += weight; + + cn = xmalloc( sizeof (struct case_node) ) ; + cn->next = (*wv)->case_nos ; + cn->num = case_no; + + (*wv)->case_nos = cn; } else { - *wv = xmalloc( sizeof (struct weighted_value) ); + struct case_node *cn; + + *wv = weighted_value_create(); (*wv)->v = *val; (*wv)->w = weight; - hsh_insert(fs->ordered_data,(void *) *wv); + + cn = xmalloc( sizeof (struct case_node) ) ; + cn->next=0; + cn->num = case_no; + (*wv)->case_nos = cn; + } } void -metrics_postcalc(struct metrics *fs) +metrics_postcalc(struct metrics *m) { double sample_var; double cc = 0.0; @@ -99,44 +128,52 @@ metrics_postcalc(struct metrics *fs) int n_data; - fs->mean = fs->sum / fs->n; + m->mean = m->sum / m->n; - sample_var = ( fs->ssq / fs->n - fs->mean * fs->mean ); + sample_var = ( m->ssq / m->n - m->mean * m->mean ); - fs->var = fs->n * sample_var / ( fs->n - 1) ; - fs->stddev = sqrt(fs->var); + m->var = m->n * sample_var / ( m->n - 1) ; + m->stddev = sqrt(m->var); /* FIXME: Check this is correct ??? Shouldn't we use the sample variance ??? */ - fs->stderr = sqrt (fs->var / fs->n) ; + m->stderr = sqrt (m->var / m->n) ; + + data = (struct weighted_value **) hsh_data(m->ordered_data); + n_data = hsh_count(m->ordered_data); - data = (struct weighted_value **) hsh_data(fs->ordered_data); - n_data = hsh_count(fs->ordered_data); + if ( n_data == 0 ) + { + m->trimmed_mean = m->mean; + return; + } - fs->wv = xmalloc ( sizeof (struct weighted_value) * n_data); + + m->wv = xmalloc(sizeof(struct weighted_value ) * n_data); for ( i = 0 ; i < n_data ; ++i ) - fs->wv[i] = *(data[i]); + m->wv[i] = *(data[i]); - sort (fs->wv, n_data, sizeof (struct weighted_value) , + sort (m->wv, n_data, sizeof (struct weighted_value) , (algo_compare_func *) compare_values, 0); - - tc = fs->n * 0.05 ; + /* Trimmed mean calculation */ + + tc = m->n * 0.05 ; k1 = -1; k2 = -1; for ( i = 0 ; i < n_data ; ++i ) { - cc += fs->wv[i].w; - fs->wv[i].cc = cc; + cc += m->wv[i].w; + m->wv[i].cc = cc; - fs->wv[i].rank = j + (fs->wv[i].w - 1) / 2.0 ; + m->wv[i].rank = j + (m->wv[i].w - 1) / 2.0 ; - j += fs->wv[i].w; + j += m->wv[i].w; if ( cc < tc ) k1 = i; @@ -146,44 +183,127 @@ metrics_postcalc(struct metrics *fs) k2 = n_data; for ( i = n_data -1 ; i >= 0; --i ) { - if ( tc > fs->n - fs->wv[i].cc) + if ( tc > m->n - m->wv[i].cc) k2 = i; } - fs->trimmed_mean = 0; + m->trimmed_mean = 0; for ( i = k1 + 2 ; i <= k2 - 1 ; ++i ) { - fs->trimmed_mean += fs->wv[i].v.f * fs->wv[i].w; + m->trimmed_mean += m->wv[i].v.f * m->wv[i].w; } - fs->trimmed_mean += (fs->n - fs->wv[k2 - 1].cc - tc) * fs->wv[k2].v.f ; - fs->trimmed_mean += (fs->wv[k1 + 1].cc - tc) * fs->wv[k1 + 1].v.f ; - fs->trimmed_mean /= 0.9 * fs->n ; + m->trimmed_mean += (m->n - m->wv[k2 - 1].cc - tc) * m->wv[k2].v.f ; + m->trimmed_mean += (m->wv[k1 + 1].cc - tc) * m->wv[k1 + 1].v.f ; + m->trimmed_mean /= 0.9 * m->n ; } -/* Functions for hashes */ +struct weighted_value * +weighted_value_create(void) +{ + struct weighted_value *wv; + wv = xmalloc (sizeof (struct weighted_value )); + + wv->cc = 0; + wv->case_nos = 0; + + return wv; +} void -free_factor_stats(struct factor_statistics *f, int width UNUSED) +weighted_value_free(struct weighted_value *wv) { - free (f); + struct case_node *cn = wv->case_nos; + + while(cn) + { + struct case_node *next = cn->next; + + free(cn); + cn = next; + } + + free(wv); + } -int -compare_indep_values(const struct factor_statistics *f1, - const struct factor_statistics *f2, - int width) + + + + +/* Create a factor statistics object with for N dependent vars + and ID as the value of the independent variable */ +struct factor_statistics * +create_factor_statistics (int n, union value *id0, union value *id1) +{ + struct factor_statistics *f; + + f = xmalloc( sizeof ( struct factor_statistics )); + + f->id[0] = *id0; + f->id[1] = *id1; + f->m = xmalloc( sizeof ( struct metrics ) * n ) ; + + return f; +} + + +void +factor_statistics_free(struct factor_statistics *f) { - return compare_values(f1->id, f2->id, width); + free(f->m) ; + + free(f); } -unsigned -hash_indep_value(const struct factor_statistics *f, int width) + + + + +int +factor_statistics_compare(const struct factor_statistics *f0, + const struct factor_statistics *f1, void *aux) +{ + + int cmp0; + + assert(f0); + assert(f1); + + cmp0 = compare_values(&f0->id[0], &f1->id[0], aux); + + if ( cmp0 != 0 ) + return cmp0; + + + if ( ( f0->id[1].f == SYSMIS ) && (f1->id[1].f != SYSMIS) ) + return 1; + + if ( ( f0->id[1].f != SYSMIS ) && (f1->id[1].f == SYSMIS) ) + return -1; + + return compare_values(&f0->id[1], &f1->id[1], aux); + +} + +unsigned int +factor_statistics_hash(const struct factor_statistics *f, void *aux) { - return hash_value(f->id, width); + + unsigned int h; + + h = hash_value(&f->id[0], aux); + + if ( f->id[1].f != SYSMIS ) + h += hash_value(&f->id[1], aux); + + + return h; + } +