1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include <libpspp/alloc.h>
25 #include <libpspp/misc.h>
26 #include <data/value.h>
29 #define _(msgid) gettext (msgid)
31 /* Calculates variance, skewness, and kurtosis into *VARIANCE,
32 *SKEWNESS, and *KURTOSIS if they are non-null and not greater
33 moments than MAX_MOMENT. Accepts W as the total weight, D1 as
34 the total deviation from the estimated mean, and D2, D3, and
35 D4 as the sum of the squares, cubes, and 4th powers,
36 respectively, of the deviation from the estimated mean. */
38 calc_moments (enum moment max_moment,
39 double w, double d1, double d2, double d3, double d4,
40 double *variance, double *skewness, double *kurtosis)
44 if (max_moment >= MOMENT_VARIANCE && w > 1.)
48 /* From _Numerical Recipes in C_, 2nd ed., 0-521-43108-5,
50 s2 = (d2 - pow2 (d1) / w) / (w - 1.);
54 /* From _SPSS Statistical Algorithms, 2nd ed.,
55 0-918469-89-9, section "DESCRIPTIVES". */
56 if (fabs (*variance) >= 1e-20)
58 if (max_moment >= MOMENT_SKEWNESS && skewness != NULL && w > 2.)
60 double s3 = s2 * sqrt (s2);
61 double g1 = (w * d3) / ((w - 1.0) * (w - 2.0) * s3);
65 if (max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && w > 3.)
67 double den = (w - 2.) * (w - 3.) * pow2 (s2);
68 double g2 = (w * (w + 1) * d4 / (w - 1.) / den
69 - 3. * pow2 (d2) / den);
77 /* Two-pass moments. */
79 /* A set of two-pass moments. */
82 enum moment max_moment; /* Highest-order moment we're computing. */
83 int pass; /* Current pass (1 or 2). */
86 double w1; /* Total weight for pass 1, so far. */
87 double sum; /* Sum of values so far. */
88 double mean; /* Mean = sum / w1. */
91 double w2; /* Total weight for pass 2, so far. */
92 double d1; /* Sum of deviations from the mean. */
93 double d2; /* Sum of squared deviations from the mean. */
94 double d3; /* Sum of cubed deviations from the mean. */
95 double d4; /* Sum of (deviations from the mean)**4. */
98 /* Initializes moments M for calculating moment MAX_MOMENT and
101 init_moments (struct moments *m, enum moment max_moment)
104 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
105 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
106 m->max_moment = max_moment;
110 /* Clears out a set of moments so that it can be reused for a new
111 set of values. The moments to be calculated are not changed. */
113 moments_clear (struct moments *m)
120 /* Creates and returns a data structure for calculating moment
121 MAX_MOMENT and lower moments on a data series. The user
122 should call moments_pass_one() for each value in the series,
123 then call moments_pass_two() for the same set of values in the
124 same order, then call moments_calculate() to obtain the
125 moments. The user may ask for the mean at any time during the
126 first pass (using moments_calculate()), but otherwise no
127 statistics may be requested until the end of the second pass.
128 Call moments_destroy() when the moments are no longer
131 moments_create (enum moment max_moment)
133 struct moments *m = xmalloc (sizeof *m);
134 init_moments (m, max_moment);
138 /* Adds VALUE with the given WEIGHT to the calculation of
139 moments for the first pass. */
141 moments_pass_one (struct moments *m, double value, double weight)
144 assert (m->pass == 1);
146 if (value != SYSMIS && weight > 0.)
148 m->sum += value * weight;
153 /* Adds VALUE with the given WEIGHT to the calculation of
154 moments for the second pass. */
156 moments_pass_two (struct moments *m, double value, double weight)
165 m->mean = m->w1 != 0. ? m->sum / m->w1 : 0.;
166 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
169 if (value != SYSMIS && weight >= 0.)
173 d = d_power = value - m->mean;
174 m->d1 += d_power * weight;
176 if (m->max_moment >= MOMENT_VARIANCE)
179 m->d2 += d_power * weight;
181 if (m->max_moment >= MOMENT_SKEWNESS)
184 m->d3 += d_power * weight;
186 if (m->max_moment >= MOMENT_KURTOSIS)
189 m->d4 += d_power * weight;
196 /* Calculates moments based on the input data. Stores the total
197 weight in *WEIGHT, the mean in *MEAN, the variance in
198 *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
199 *KURTOSIS. Any of these result parameters may be null
200 pointers, in which case the values are not calculated. If any
201 result cannot be calculated, either because they are undefined
202 based on the input data or because their moments are higher
203 than the maximum requested on moments_create(), then SYSMIS is
204 stored into that result. */
206 moments_calculate (const struct moments *m,
208 double *mean, double *variance,
209 double *skewness, double *kurtosis)
215 if (variance != NULL)
217 if (skewness != NULL)
219 if (kurtosis != NULL)
225 /* How many passes so far? */
228 /* In the first pass we can only calculate the mean. */
229 if (mean != NULL && m->w1 > 0.)
230 *mean = m->sum / m->w1;
234 /* After the second pass we can calculate any stat. We
235 don't support "online" computation during the second
236 pass, so As a simple self-check, the total weight for
237 the passes must agree. */
238 assert (m->pass == 2);
239 assert (m->w1 == m->w2);
245 calc_moments (m->max_moment,
246 m->w2, m->d1, m->d2, m->d3, m->d4,
247 variance, skewness, kurtosis);
252 /* Destroys a set of moments. */
254 moments_destroy (struct moments *m)
259 /* Calculates the requested moments on the CNT values in ARRAY.
260 Each value is given a weight of 1. The total weight is stored
261 into *WEIGHT (trivially) and the mean, variance, skewness, and
262 kurtosis are stored into *MEAN, *VARIANCE, *SKEWNESS, and
263 *KURTOSIS, respectively. Any of the result pointers may be
264 null, in which case no value is stored. */
266 moments_of_doubles (const double *array, size_t cnt,
268 double *mean, double *variance,
269 double *skewness, double *kurtosis)
271 enum moment max_moment;
275 if (kurtosis != NULL)
276 max_moment = MOMENT_KURTOSIS;
277 else if (skewness != NULL)
278 max_moment = MOMENT_SKEWNESS;
279 else if (variance != NULL)
280 max_moment = MOMENT_VARIANCE;
282 max_moment = MOMENT_MEAN;
284 init_moments (&m, max_moment);
285 for (idx = 0; idx < cnt; idx++)
286 moments_pass_one (&m, array[idx], 1.);
287 for (idx = 0; idx < cnt; idx++)
288 moments_pass_two (&m, array[idx], 1.);
289 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
292 /* Calculates the requested moments on the CNT numeric values in
293 ARRAY. Each value is given a weight of 1. The total weight
294 is stored into *WEIGHT (trivially) and the mean, variance,
295 skewness, and kurtosis are stored into *MEAN, *VARIANCE,
296 *SKEWNESS, and *KURTOSIS, respectively. Any of the result
297 pointers may be null, in which case no value is stored. */
299 moments_of_values (const union value *array, size_t cnt,
301 double *mean, double *variance,
302 double *skewness, double *kurtosis)
304 enum moment max_moment;
308 if (kurtosis != NULL)
309 max_moment = MOMENT_KURTOSIS;
310 else if (skewness != NULL)
311 max_moment = MOMENT_SKEWNESS;
312 else if (variance != NULL)
313 max_moment = MOMENT_VARIANCE;
315 max_moment = MOMENT_MEAN;
317 init_moments (&m, max_moment);
318 for (idx = 0; idx < cnt; idx++)
319 moments_pass_one (&m, array[idx].f, 1.);
320 for (idx = 0; idx < cnt; idx++)
321 moments_pass_two (&m, array[idx].f, 1.);
322 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
325 /* One-pass moments. */
327 /* A set of one-pass moments. */
330 enum moment max_moment; /* Highest-order moment we're computing. */
331 double w; /* Total weight so far. */
332 double d1; /* Sum of deviations from the mean. */
333 double d2; /* Sum of squared deviations from the mean. */
334 double d3; /* Sum of cubed deviations from the mean. */
335 double d4; /* Sum of (deviations from the mean)**4. */
338 /* Initializes one-pass moments M for calculating moment
339 MAX_MOMENT and lower moments. */
341 init_moments1 (struct moments1 *m, enum moment max_moment)
344 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
345 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
346 m->max_moment = max_moment;
350 /* Clears out a set of one-pass moments so that it can be reused
351 for a new set of values. The moments to be calculated are not
354 moments1_clear (struct moments1 *m)
357 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
360 /* Creates and returns a data structure for calculating moment
361 MAX_MOMENT and lower moments on a data series in a single
362 pass. The user should call moments1_add() for each value in
363 the series. The user may call moments1_calculate() to obtain
364 the current moments at any time. Call moments1_destroy() when
365 the moments are no longer needed.
367 One-pass moments should only be used when two passes over the
368 data are impractical. */
370 moments1_create (enum moment max_moment)
372 struct moments1 *m = xmalloc (sizeof *m);
373 init_moments1 (m, max_moment);
377 /* Adds VALUE with the given WEIGHT to the calculation of
380 moments1_add (struct moments1 *m, double value, double weight)
384 if (value != SYSMIS && weight > 0.)
390 v1 = (weight / m->w) * (value - m->d1);
393 if (m->max_moment >= MOMENT_VARIANCE)
396 double w_prev_w = m->w * prev_w;
397 double prev_m2 = m->d2;
399 m->d2 += w_prev_w / weight * v2;
400 if (m->max_moment >= MOMENT_SKEWNESS)
402 double w2 = weight * weight;
404 double prev_m3 = m->d3;
406 m->d3 += (-3. * v1 * prev_m2
407 + w_prev_w / w2 * (m->w - 2. * weight) * v3);
408 if (m->max_moment >= MOMENT_KURTOSIS)
410 double w3 = w2 * weight;
413 m->d4 += (-4. * v1 * prev_m3
415 + ((pow2 (m->w) - 3. * weight * prev_w)
416 * v4 * w_prev_w / w3));
423 /* Calculates one-pass moments based on the input data. Stores
424 the total weight in *WEIGHT, the mean in *MEAN, the variance
425 in *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
426 *KURTOSIS. Any of these result parameters may be null
427 pointers, in which case the values are not calculated. If any
428 result cannot be calculated, either because they are undefined
429 based on the input data or because their moments are higher
430 than the maximum requested on moments_create(), then SYSMIS is
431 stored into that result. */
433 moments1_calculate (const struct moments1 *m,
435 double *mean, double *variance,
436 double *skewness, double *kurtosis)
442 if (variance != NULL)
444 if (skewness != NULL)
446 if (kurtosis != NULL)
457 calc_moments (m->max_moment,
458 m->w, 0., m->d2, m->d3, m->d4,
459 variance, skewness, kurtosis);
463 /* Destroy one-pass moments M. */
465 moments1_destroy (struct moments1 *m)
470 /* Returns the standard error of the skewness for the given total
473 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
474 section "DESCRIPTIVES". */
476 calc_seskew (double W)
478 return sqrt ((6. * W * (W - 1.)) / ((W - 2.) * (W + 1.) * (W + 3.)));
481 /* Returns the standard error of the kurtosis for the given total
484 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
485 section "DESCRIPTIVES", except that the sqrt symbol is omitted
488 calc_sekurt (double W)
490 return sqrt ((4. * (pow2 (W) - 1.) * pow2 (calc_seskew (W)))
491 / ((W - 3.) * (W + 5.)));