X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Ffactor_stats.c;h=7e5ac8b466d546accfec1ae9bf592d4cfa2b7f0c;hb=4239c455e7b1061b7c960b793f9080e113123845;hp=16e1930d58188723db0ae4a7f64c3c9c9f083447;hpb=cd7b08ad5e6bbec75e778acf008f84e1eb548154;p=pspp diff --git a/src/factor_stats.c b/src/factor_stats.c index 16e1930d58..7e5ac8b466 100644 --- a/src/factor_stats.c +++ b/src/factor_stats.c @@ -24,32 +24,34 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA #include "hash.h" #include "algorithm.h" #include "alloc.h" +#include "moments.h" +#include "percentiles.h" #include #include #include #include - +#include void -metrics_precalc(struct metrics *fs) +metrics_precalc(struct metrics *m) { - assert (fs) ; + assert (m) ; + + m->n_missing = 0; + + m->min = DBL_MAX; + m->max = -DBL_MAX; - fs->n = 0; - fs->n_missing = 0; - fs->ssq = 0; - fs->sum = 0; - fs->min = DBL_MAX; - fs->max = -DBL_MAX; - fs->ordered_data = hsh_create(20, + m->moments = moments1_create(MOMENT_KURTOSIS); + + m->ordered_data = hsh_create(20, (hsh_compare_func *) compare_values, (hsh_hash_func *) hash_value, (hsh_free_func *) weighted_value_free, (void *) 0); - } @@ -70,9 +72,9 @@ metrics_calc(struct metrics *fs, const union value *val, } x = val->f; - fs->n += weight; - fs->ssq += x * x * weight; - fs->sum += x * weight; + + moments1_add(fs->moments, x, weight); + if ( x < fs->min) fs->min = x; if ( x > fs->max) fs->max = x; @@ -116,87 +118,93 @@ metrics_calc(struct metrics *fs, const union value *val, void metrics_postcalc(struct metrics *m) { - double sample_var; double cc = 0.0; double tc ; int k1, k2 ; int i; int j = 1; - struct weighted_value **data; + moments1_calculate (m->moments, &m->n, &m->mean, &m->var, + &m->skewness, &m->kurtosis); - int n_data; - - m->mean = m->sum / m->n; + moments1_destroy (m->moments); - sample_var = ( m->ssq / m->n - m->mean * m->mean ); - m->var = m->n * sample_var / ( m->n - 1) ; m->stddev = sqrt(m->var); - /* FIXME: Check this is correct ??? Shouldn't we use the sample variance ??? */ - m->stderr = sqrt (m->var / m->n) ; + m->se_mean = sqrt (m->var / m->n) ; - data = (struct weighted_value **) hsh_data(m->ordered_data); - n_data = hsh_count(m->ordered_data); - if ( n_data == 0 ) - { - m->trimmed_mean = m->mean; - return; - } + m->wvp = (struct weighted_value **) hsh_sort(m->ordered_data); + m->n_data = hsh_count(m->ordered_data); - m->wv = xmalloc(sizeof(struct weighted_value ) * n_data); + m->histogram = histogram_create(10, m->min, m->max); - for ( i = 0 ; i < n_data ; ++i ) - m->wv[i] = *(data[i]); - - sort (m->wv, n_data, sizeof (struct weighted_value) , - (algo_compare_func *) compare_values, 0); + for ( i = 0 ; i < m->n_data ; ++i ) + { + struct weighted_value **wv = (m->wvp) ; + gsl_histogram_accumulate(m->histogram, wv[i]->v.f, wv[i]->w); + } - /* Trimmed mean calculation */ + if ( m->n_data <= 1 ) + { + m->trimmed_mean = m->mean; + return; + } tc = m->n * 0.05 ; k1 = -1; k2 = -1; - - for ( i = 0 ; i < n_data ; ++i ) + for ( i = 0 ; i < m->n_data ; ++i ) { - cc += m->wv[i].w; - m->wv[i].cc = cc; + cc += m->wvp[i]->w; + m->wvp[i]->cc = cc; - m->wv[i].rank = j + (m->wv[i].w - 1) / 2.0 ; + m->wvp[i]->rank = j + (m->wvp[i]->w - 1) / 2.0 ; - j += m->wv[i].w; + j += m->wvp[i]->w; if ( cc < tc ) k1 = i; - } - k2 = n_data; - for ( i = n_data -1 ; i >= 0; --i ) + + + k2 = m->n_data; + for ( i = m->n_data -1 ; i >= 0; --i ) { - if ( tc > m->n - m->wv[i].cc) + if ( tc > m->n - m->wvp[i]->cc) k2 = i; } + /* Calculate the percentiles */ + ptiles(m->ptile_hash, m->wvp, m->n_data, m->n, m->ptile_alg); + + tukey_hinges(m->wvp, m->n_data, m->n, m->hinges); + + /* Special case here */ + if ( k1 + 1 == k2 ) + { + m->trimmed_mean = m->wvp[k2]->v.f; + return; + } + m->trimmed_mean = 0; for ( i = k1 + 2 ; i <= k2 - 1 ; ++i ) { - m->trimmed_mean += m->wv[i].v.f * m->wv[i].w; + m->trimmed_mean += m->wvp[i]->v.f * m->wvp[i]->w; } - m->trimmed_mean += (m->n - m->wv[k2 - 1].cc - tc) * m->wv[k2].v.f ; - m->trimmed_mean += (m->wv[k1 + 1].cc - tc) * m->wv[k1 + 1].v.f ; + m->trimmed_mean += (m->n - m->wvp[k2 - 1]->cc - tc) * m->wvp[k2]->v.f ; + m->trimmed_mean += (m->wvp[k1 + 1]->cc - tc) * m->wvp[k1 + 1]->v.f ; m->trimmed_mean /= 0.9 * m->n ; } @@ -255,19 +263,18 @@ create_factor_statistics (int n, union value *id0, union value *id1) void factor_statistics_free(struct factor_statistics *f) { + hsh_destroy(f->m->ordered_data); + gsl_histogram_free(f->m->histogram); free(f->m) ; - free(f); } - - int factor_statistics_compare(const struct factor_statistics *f0, - const struct factor_statistics *f1, void *aux) + const struct factor_statistics *f1, int width) { int cmp0; @@ -275,7 +282,7 @@ factor_statistics_compare(const struct factor_statistics *f0, assert(f0); assert(f1); - cmp0 = compare_values(&f0->id[0], &f1->id[0], aux); + cmp0 = compare_values(&f0->id[0], &f1->id[0], width); if ( cmp0 != 0 ) return cmp0; @@ -287,20 +294,20 @@ factor_statistics_compare(const struct factor_statistics *f0, if ( ( f0->id[1].f != SYSMIS ) && (f1->id[1].f == SYSMIS) ) return -1; - return compare_values(&f0->id[1], &f1->id[1], aux); + return compare_values(&f0->id[1], &f1->id[1], width); } unsigned int -factor_statistics_hash(const struct factor_statistics *f, void *aux) +factor_statistics_hash(const struct factor_statistics *f, int width) { unsigned int h; - h = hash_value(&f->id[0], aux); + h = hash_value(&f->id[0], width); if ( f->id[1].f != SYSMIS ) - h += hash_value(&f->id[1], aux); + h += hash_value(&f->id[1], width); return h;