1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include <libpspp/alloc.h>
25 #include <libpspp/misc.h>
26 #include <data/value.h>
29 #define _(msgid) gettext (msgid)
31 /* Calculates variance, skewness, and kurtosis into *VARIANCE,
32 *SKEWNESS, and *KURTOSIS if they are non-null and not greater
33 moments than MAX_MOMENT. Accepts W as the total weight, D1 as
34 the total deviation from the estimated mean, and D2, D3, and
35 D4 as the sum of the squares, cubes, and 4th powers,
36 respectively, of the deviation from the estimated mean. */
38 calc_moments (enum moment max_moment,
39 double w, double d1, double d2, double d3, double d4,
40 double *variance, double *skewness, double *kurtosis)
44 if (max_moment >= MOMENT_VARIANCE && w > 1.)
46 double s2 = (d2 - pow2 (d1) / w) / (w - 1.);
50 /* From _SPSS Statistical Algorithms, 2nd ed.,
51 0-918469-89-9, section "DESCRIPTIVES". */
52 if (fabs (*variance) >= 1e-20)
54 if (max_moment >= MOMENT_SKEWNESS && skewness != NULL && w > 2.)
56 double s3 = s2 * sqrt (s2);
57 double g1 = (w * d3) / ((w - 1.0) * (w - 2.0) * s3);
61 if (max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && w > 3.)
63 double den = (w - 2.) * (w - 3.) * pow2 (s2);
64 double g2 = (w * (w + 1) * d4 / (w - 1.) / den
65 - 3. * pow2 (d2) / den);
73 /* Two-pass moments. */
75 /* A set of two-pass moments. */
78 enum moment max_moment; /* Highest-order moment we're computing. */
79 int pass; /* Current pass (1 or 2). */
82 double w1; /* Total weight for pass 1, so far. */
83 double sum; /* Sum of values so far. */
84 double mean; /* Mean = sum / w1. */
87 double w2; /* Total weight for pass 2, so far. */
88 double d1; /* Sum of deviations from the mean. */
89 double d2; /* Sum of squared deviations from the mean. */
90 double d3; /* Sum of cubed deviations from the mean. */
91 double d4; /* Sum of (deviations from the mean)**4. */
94 /* Initializes moments M for calculating moment MAX_MOMENT and
97 init_moments (struct moments *m, enum moment max_moment)
100 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
101 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
102 m->max_moment = max_moment;
106 /* Clears out a set of moments so that it can be reused for a new
107 set of values. The moments to be calculated are not changed. */
109 moments_clear (struct moments *m)
116 /* Creates and returns a data structure for calculating moment
117 MAX_MOMENT and lower moments on a data series. The user
118 should call moments_pass_one() for each value in the series,
119 then call moments_pass_two() for the same set of values in the
120 same order, then call moments_calculate() to obtain the
121 moments. The user may ask for the mean at any time during the
122 first pass (using moments_calculate()), but otherwise no
123 statistics may be requested until the end of the second pass.
124 Call moments_destroy() when the moments are no longer
127 moments_create (enum moment max_moment)
129 struct moments *m = xmalloc (sizeof *m);
130 init_moments (m, max_moment);
134 /* Adds VALUE with the given WEIGHT to the calculation of
135 moments for the first pass. */
137 moments_pass_one (struct moments *m, double value, double weight)
140 assert (m->pass == 1);
142 if (value != SYSMIS && weight > 0.)
144 m->sum += value * weight;
149 /* Adds VALUE with the given WEIGHT to the calculation of
150 moments for the second pass. */
152 moments_pass_two (struct moments *m, double value, double weight)
161 m->mean = m->w1 != 0. ? m->sum / m->w1 : 0.;
162 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
165 if (value != SYSMIS && weight >= 0.)
169 d = d_power = value - m->mean;
170 m->d1 += d_power * weight;
172 if (m->max_moment >= MOMENT_VARIANCE)
175 m->d2 += d_power * weight;
177 if (m->max_moment >= MOMENT_SKEWNESS)
180 m->d3 += d_power * weight;
182 if (m->max_moment >= MOMENT_KURTOSIS)
185 m->d4 += d_power * weight;
192 /* Calculates moments based on the input data. Stores the total
193 weight in *WEIGHT, the mean in *MEAN, the variance in
194 *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
195 *KURTOSIS. Any of these result parameters may be null
196 pointers, in which case the values are not calculated. If any
197 result cannot be calculated, either because they are undefined
198 based on the input data or because their moments are higher
199 than the maximum requested on moments_create(), then SYSMIS is
200 stored into that result. */
202 moments_calculate (const struct moments *m,
204 double *mean, double *variance,
205 double *skewness, double *kurtosis)
211 if (variance != NULL)
213 if (skewness != NULL)
215 if (kurtosis != NULL)
221 /* How many passes so far? */
224 /* In the first pass we can only calculate the mean. */
225 if (mean != NULL && m->w1 > 0.)
226 *mean = m->sum / m->w1;
230 /* After the second pass we can calculate any stat. We
231 don't support "online" computation during the second
232 pass, so As a simple self-check, the total weight for
233 the passes must agree. */
234 assert (m->pass == 2);
235 assert (m->w1 == m->w2);
241 calc_moments (m->max_moment,
242 m->w2, m->d1, m->d2, m->d3, m->d4,
243 variance, skewness, kurtosis);
248 /* Destroys a set of moments. */
250 moments_destroy (struct moments *m)
255 /* Calculates the requested moments on the CNT values in ARRAY.
256 Each value is given a weight of 1. The total weight is stored
257 into *WEIGHT (trivially) and the mean, variance, skewness, and
258 kurtosis are stored into *MEAN, *VARIANCE, *SKEWNESS, and
259 *KURTOSIS, respectively. Any of the result pointers may be
260 null, in which case no value is stored. */
262 moments_of_doubles (const double *array, size_t cnt,
264 double *mean, double *variance,
265 double *skewness, double *kurtosis)
267 enum moment max_moment;
271 if (kurtosis != NULL)
272 max_moment = MOMENT_KURTOSIS;
273 else if (skewness != NULL)
274 max_moment = MOMENT_SKEWNESS;
275 else if (variance != NULL)
276 max_moment = MOMENT_VARIANCE;
278 max_moment = MOMENT_MEAN;
280 init_moments (&m, max_moment);
281 for (idx = 0; idx < cnt; idx++)
282 moments_pass_one (&m, array[idx], 1.);
283 for (idx = 0; idx < cnt; idx++)
284 moments_pass_two (&m, array[idx], 1.);
285 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
288 /* Calculates the requested moments on the CNT numeric values in
289 ARRAY. Each value is given a weight of 1. The total weight
290 is stored into *WEIGHT (trivially) and the mean, variance,
291 skewness, and kurtosis are stored into *MEAN, *VARIANCE,
292 *SKEWNESS, and *KURTOSIS, respectively. Any of the result
293 pointers may be null, in which case no value is stored. */
295 moments_of_values (const union value *array, size_t cnt,
297 double *mean, double *variance,
298 double *skewness, double *kurtosis)
300 enum moment max_moment;
304 if (kurtosis != NULL)
305 max_moment = MOMENT_KURTOSIS;
306 else if (skewness != NULL)
307 max_moment = MOMENT_SKEWNESS;
308 else if (variance != NULL)
309 max_moment = MOMENT_VARIANCE;
311 max_moment = MOMENT_MEAN;
313 init_moments (&m, max_moment);
314 for (idx = 0; idx < cnt; idx++)
315 moments_pass_one (&m, array[idx].f, 1.);
316 for (idx = 0; idx < cnt; idx++)
317 moments_pass_two (&m, array[idx].f, 1.);
318 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
321 /* One-pass moments. */
323 /* A set of one-pass moments. */
326 enum moment max_moment; /* Highest-order moment we're computing. */
327 double w; /* Total weight so far. */
328 double d1; /* Sum of deviations from the mean. */
329 double d2; /* Sum of squared deviations from the mean. */
330 double d3; /* Sum of cubed deviations from the mean. */
331 double d4; /* Sum of (deviations from the mean)**4. */
334 /* Initializes one-pass moments M for calculating moment
335 MAX_MOMENT and lower moments. */
337 init_moments1 (struct moments1 *m, enum moment max_moment)
340 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
341 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
342 m->max_moment = max_moment;
346 /* Clears out a set of one-pass moments so that it can be reused
347 for a new set of values. The moments to be calculated are not
350 moments1_clear (struct moments1 *m)
353 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
356 /* Creates and returns a data structure for calculating moment
357 MAX_MOMENT and lower moments on a data series in a single
358 pass. The user should call moments1_add() for each value in
359 the series. The user may call moments1_calculate() to obtain
360 the current moments at any time. Call moments1_destroy() when
361 the moments are no longer needed.
363 One-pass moments should only be used when two passes over the
364 data are impractical. */
366 moments1_create (enum moment max_moment)
368 struct moments1 *m = xmalloc (sizeof *m);
369 init_moments1 (m, max_moment);
373 /* Adds VALUE with the given WEIGHT to the calculation of
376 moments1_add (struct moments1 *m, double value, double weight)
380 if (value != SYSMIS && weight > 0.)
386 v1 = (weight / m->w) * (value - m->d1);
389 if (m->max_moment >= MOMENT_VARIANCE)
392 double w_prev_w = m->w * prev_w;
393 double prev_m2 = m->d2;
395 m->d2 += w_prev_w / weight * v2;
396 if (m->max_moment >= MOMENT_SKEWNESS)
398 double w2 = weight * weight;
400 double prev_m3 = m->d3;
402 m->d3 += (-3. * v1 * prev_m2
403 + w_prev_w / w2 * (m->w - 2. * weight) * v3);
404 if (m->max_moment >= MOMENT_KURTOSIS)
406 double w3 = w2 * weight;
409 m->d4 += (-4. * v1 * prev_m3
411 + ((pow2 (m->w) - 3. * weight * prev_w)
412 * v4 * w_prev_w / w3));
419 /* Calculates one-pass moments based on the input data. Stores
420 the total weight in *WEIGHT, the mean in *MEAN, the variance
421 in *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
422 *KURTOSIS. Any of these result parameters may be null
423 pointers, in which case the values are not calculated. If any
424 result cannot be calculated, either because they are undefined
425 based on the input data or because their moments are higher
426 than the maximum requested on moments_create(), then SYSMIS is
427 stored into that result. */
429 moments1_calculate (const struct moments1 *m,
431 double *mean, double *variance,
432 double *skewness, double *kurtosis)
438 if (variance != NULL)
440 if (skewness != NULL)
442 if (kurtosis != NULL)
453 calc_moments (m->max_moment,
454 m->w, 0., m->d2, m->d3, m->d4,
455 variance, skewness, kurtosis);
459 /* Destroy one-pass moments M. */
461 moments1_destroy (struct moments1 *m)
466 /* Returns the standard error of the skewness for the given total
469 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
470 section "DESCRIPTIVES". */
472 calc_seskew (double W)
474 return sqrt ((6. * W * (W - 1.)) / ((W - 2.) * (W + 1.) * (W + 3.)));
477 /* Returns the standard error of the kurtosis for the given total
480 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
481 section "DESCRIPTIVES", except that the sqrt symbol is omitted
484 calc_sekurt (double W)
486 return sqrt ((4. * (pow2 (W) - 1.) * pow2 (calc_seskew (W)))
487 / ((W - 3.) * (W + 5.)));