X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fmath%2Fhistogram.c;h=264b0785351b99c9da8c3397294a0ef1bb1f716f;hb=2031b99a3f1970a0b9a840652f0aff80ec34b433;hp=f2f75e12b3af80a26c9ac24b4ed20e83ef5d7c8c;hpb=d1a9160a893030f10c13e46dcf5bbf7501229958;p=pspp diff --git a/src/math/histogram.c b/src/math/histogram.c index f2f75e12b3..264b078535 100644 --- a/src/math/histogram.c +++ b/src/math/histogram.c @@ -21,6 +21,7 @@ #include #include +#include "data/settings.h" #include "libpspp/message.h" #include "libpspp/assertion.h" #include "libpspp/cast.h" @@ -36,16 +37,7 @@ void histogram_add (struct histogram *h, double y, double c) { - struct statistic *stat = &h->parent; - stat->accumulate (stat, NULL, c, 0, y); -} - -static void -acc (struct statistic *s, const struct ccase *cx UNUSED, double c, double cc UNUSED, double y) -{ - struct histogram *hist = UP_CAST (s, struct histogram, parent); - - gsl_histogram_accumulate (hist->gsl_hist, y, c); + gsl_histogram_accumulate (h->gsl_hist, y, c); } static void @@ -57,100 +49,83 @@ destroy (struct statistic *s) } -/* This functions adjusts the upper and lower range of the histogram to make them fit BIN_WIDTH - MIN and MAX are the lowest and highest data to be plotted in the histogram. - ADJ_MIN and ADJ_MAX are locations of the adjusted values of MIN and MAX (the range will always be - equal or slightly larger). - Returns the number of bins. - */ -static int -adjust_bin_ranges (double bin_width, double min, double max, double *adj_min, double *adj_max) -{ - const double half_bin_width = bin_width / 2.0; - - /* The lower and upper limits of the histogram, in units of half - bin widths */ - int lower_limit, upper_limit; - - /* -1 if the lower end of the range contains more unused space - than the upper end. - +1 otherwise. */ - short sparse_end = 0; +/* Find a bin width which is adapted to the scaling of the x axis +In the example here, the binwidth is half of the tick interval. - double ul, ll; - double lower_remainder = fabs (modf (min / half_bin_width, &ll)); - double upper_remainder = fabs (modf (max / half_bin_width, &ul)); + binwidth + > < + |....+....+....+. .+....| + LOWER 1 2 3 N_TICKS + ^LOWDBL ^HIGHDBL +This only works, when the min and max value for the histogram are adapted +such that (max-min) is a multiple of the binwidth. Then the location of the +first bin has to be aligned to the ticks. +*/ +static int +hist_find_pretty_no_of_bins(double bin_width_in, double min, double max, + double *adjusted_min, double *adjusted_max) +{ + double lower, interval; + int n_ticks; + double binwidth; + int nbins; - assert (max > min); - - lower_limit = ll; - - /* If the minimum value is not aligned on a half bin width, - then the lower bound must be extended so that the histogram range includes it. */ - if (lower_remainder > 0) - lower_limit--; - - /* However, the upper bound must be extended regardless, because histogram bins - span the range [lower, upper) */ - upper_limit = ul + 1; - - /* So, in the case of the maximum value coinciding with a half bin width, - the upper end will be the sparse end (because is got extended by a complete - half bin width). In other cases, it depends which got the bigger extension. */ - if (upper_remainder == 0) - sparse_end = +1; - else - sparse_end = lower_remainder < upper_remainder ? -1 : +1; + chart_get_scale (max, min, &lower, &interval, &n_ticks); - /* The range must be an EVEN number of half bin_widths */ - if ( (upper_limit - lower_limit) % 2) + if (bin_width_in >= 2 * interval) { - /* Extend the range at the end which gives the least unused space */ - if (sparse_end == +1) - lower_limit--; + binwidth = floor(bin_width_in/interval) * interval; + *adjusted_min = lower; + } + else if (bin_width_in >= 1.5 * interval) + { + binwidth = 1.5 * interval; + if (min < (lower + 0.5 * interval)) + *adjusted_min = lower; else - upper_limit++; - - /* Now the other end has more space */ - sparse_end *= -1; + *adjusted_min = lower + 0.5 * interval; } - - /* But the range should be aligned to an ODD number of - half bin widths, so that the labels are aesthetically pleasing ones. - Otherwise we are likely to get labels such as -3 -1 1 3 instead of -2 0 2 4 - */ - if ( lower_limit % 2 == 0) + else if (bin_width_in >= interval) + { + binwidth = interval; + *adjusted_min = lower; + } + else if (bin_width_in >= (2.0/3.0 * interval)) { - /* Shift the range away from the sparse end, EXCEPT if that is the upper end, - and it was extended to prevent the maximum value from getting lost */ - if (sparse_end == +1 && upper_remainder > 0) - { - lower_limit --; - upper_limit --; - } + binwidth = (2.0/3.0 * interval); + if (min >= lower + binwidth) + *adjusted_min = lower + binwidth; else - { - lower_limit ++; - upper_limit ++; - } + *adjusted_min = lower; + } + else + { + int i; + for(i = 2; bin_width_in < interval/i; i++); + binwidth = interval/i; + *adjusted_min = floor((min - lower)/binwidth)*binwidth + lower; } - *adj_min = lower_limit * half_bin_width; - *adj_max = upper_limit * half_bin_width; + nbins = ceil((max-*adjusted_min)/binwidth); + *adjusted_max = nbins*binwidth + *adjusted_min; - assert (*adj_max >= max); - assert (*adj_min <= min); + /* adjusted_max should never be smaller than max but if it is equal + then the gsl_histogram will not add the cases which have max value */ + if (*adjusted_max <= max) + { + *adjusted_max += binwidth; + nbins++; + } + assert (*adjusted_min <= min); - return (upper_limit - lower_limit) / 2.0; + return nbins; } - struct histogram * -histogram_create (double bin_width, double min, double max) +histogram_create (double bin_width_in, double min, double max) { - const int MAX_BINS = 25; struct histogram *h; struct statistic *stat; int bins; @@ -162,29 +137,37 @@ histogram_create (double bin_width, double min, double max) return NULL; } - assert (bin_width > 0); - - bins = adjust_bin_ranges (bin_width, min, max, &adjusted_min, &adjusted_max); + assert (bin_width_in > 0); - /* Force the number of bins to lie in a sensible range. */ - if (bins > MAX_BINS) - { - bins = adjust_bin_ranges ((max - min) / (double) (MAX_BINS - 1), - min, max, &adjusted_min, &adjusted_max); - } - - /* Can this ever happen? */ - if (bins < 1) - bins = 1; + bins = hist_find_pretty_no_of_bins(bin_width_in, min, max, &adjusted_min, &adjusted_max); h = xmalloc (sizeof *h); h->gsl_hist = gsl_histogram_alloc (bins); - gsl_histogram_set_ranges_uniform (h->gsl_hist, adjusted_min, adjusted_max); + /* The bin ranges could be computed with gsl_histogram_set_ranges_uniform, + but the number of bins is adapted to the ticks of the axis such that for example + data ranging from 1.0 to 7.0 with 6 bins will have bin limits at + 2.0,3.0,4.0 and 5.0. Due to numerical accuracy the computed bin limits might + be 4.99999999 for a value which is expected to be 5.0. But the limits of + the histogram bins should be that what is expected from the displayed ticks. + Therefore the bin limits are computed from the rounded values which is similar + to the procedure at the chart_get_ticks_format. Actual bin limits should be + exactly what is displayed at the ticks. + But... I cannot reproduce the problem that I see with gsl_histogram_set_ranges_uniform + with the code below without rounding... + */ + { + double *ranges = xmalloc (sizeof (double) * (bins + 1)); + double interval = (adjusted_max - adjusted_min) / bins; + for (int i = 0; i < bins; i++) + ranges[i] = adjusted_min + interval * i; + ranges[bins] = adjusted_max; + gsl_histogram_set_ranges (h->gsl_hist, ranges, bins + 1); + free (ranges); + } stat = &h->parent; - stat->accumulate = acc; stat->destroy = destroy; return h;