1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include <gsl/gsl_math.h>
23 #include <libpspp/misc.h>
24 #include <data/val-type.h>
25 #include <data/value.h>
30 #define _(msgid) gettext (msgid)
32 /* Calculates variance, skewness, and kurtosis into *VARIANCE,
33 *SKEWNESS, and *KURTOSIS if they are non-null and not greater
34 moments than MAX_MOMENT. Accepts W as the total weight, D1 as
35 the total deviation from the estimated mean, and D2, D3, and
36 D4 as the sum of the squares, cubes, and 4th powers,
37 respectively, of the deviation from the estimated mean. */
39 calc_moments (enum moment max_moment,
40 double w, double d1, double d2, double d3, double d4,
41 double *variance, double *skewness, double *kurtosis)
45 if (max_moment >= MOMENT_VARIANCE && w > 1.)
47 double s2 = (d2 - pow2 (d1) / w) / (w - 1.);
51 /* From _SPSS Statistical Algorithms, 2nd ed.,
52 0-918469-89-9, section "DESCRIPTIVES". */
53 if (fabs (*variance) >= 1e-20)
55 if (max_moment >= MOMENT_SKEWNESS && skewness != NULL && w > 2.)
57 double s3 = s2 * sqrt (s2);
58 double g1 = (w * d3) / ((w - 1.0) * (w - 2.0) * s3);
62 if (max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && w > 3.)
64 double den = (w - 2.) * (w - 3.) * pow2 (s2);
65 double g2 = (w * (w + 1) * d4 / (w - 1.) / den
66 - 3. * pow2 (d2) / den);
74 /* Two-pass moments. */
76 /* A set of two-pass moments. */
79 enum moment max_moment; /* Highest-order moment we're computing. */
80 int pass; /* Current pass (1 or 2). */
83 double w1; /* Total weight for pass 1, so far. */
84 double sum; /* Sum of values so far. */
85 double mean; /* Mean = sum / w1. */
88 double w2; /* Total weight for pass 2, so far. */
89 double d1; /* Sum of deviations from the mean. */
90 double d2; /* Sum of squared deviations from the mean. */
91 double d3; /* Sum of cubed deviations from the mean. */
92 double d4; /* Sum of (deviations from the mean)**4. */
95 /* Initializes moments M for calculating moment MAX_MOMENT and
98 init_moments (struct moments *m, enum moment max_moment)
101 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
102 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
103 m->max_moment = max_moment;
107 /* Clears out a set of moments so that it can be reused for a new
108 set of values. The moments to be calculated are not changed. */
110 moments_clear (struct moments *m)
117 /* Creates and returns a data structure for calculating moment
118 MAX_MOMENT and lower moments on a data series. The user
119 should call moments_pass_one() for each value in the series,
120 then call moments_pass_two() for the same set of values in the
121 same order, then call moments_calculate() to obtain the
122 moments. The user may ask for the mean at any time during the
123 first pass (using moments_calculate()), but otherwise no
124 statistics may be requested until the end of the second pass.
125 Call moments_destroy() when the moments are no longer
128 moments_create (enum moment max_moment)
130 struct moments *m = xmalloc (sizeof *m);
131 init_moments (m, max_moment);
135 /* Adds VALUE with the given WEIGHT to the calculation of
136 moments for the first pass. */
138 moments_pass_one (struct moments *m, double value, double weight)
141 assert (m->pass == 1);
143 if (value != SYSMIS && weight > 0.)
145 m->sum += value * weight;
150 /* Adds VALUE with the given WEIGHT to the calculation of
151 moments for the second pass. */
153 moments_pass_two (struct moments *m, double value, double weight)
160 m->mean = m->w1 != 0. ? m->sum / m->w1 : 0.;
161 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
164 if (value != SYSMIS && weight >= 0.)
166 double d = value - m->mean;
167 double d1_delta = d * weight;
169 if (m->max_moment >= MOMENT_VARIANCE)
171 double d2_delta = d1_delta * d;
173 if (m->max_moment >= MOMENT_SKEWNESS)
175 double d3_delta = d2_delta * d;
177 if (m->max_moment >= MOMENT_KURTOSIS)
179 double d4_delta = d3_delta * d;
188 /* Calculates moments based on the input data. Stores the total
189 weight in *WEIGHT, the mean in *MEAN, the variance in
190 *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
191 *KURTOSIS. Any of these result parameters may be null
192 pointers, in which case the values are not calculated. If any
193 result cannot be calculated, either because they are undefined
194 based on the input data or because their moments are higher
195 than the maximum requested on moments_create(), then SYSMIS is
196 stored into that result. */
198 moments_calculate (const struct moments *m,
200 double *mean, double *variance,
201 double *skewness, double *kurtosis)
207 if (variance != NULL)
209 if (skewness != NULL)
211 if (kurtosis != NULL)
217 /* How many passes so far? */
220 /* In the first pass we can only calculate the mean. */
221 if (mean != NULL && m->w1 > 0.)
222 *mean = m->sum / m->w1;
226 /* After the second pass we can calculate any stat. We
227 don't support "online" computation during the second
228 pass, so As a simple self-check, the total weight for
229 the passes must agree. */
230 assert (m->pass == 2);
231 assert (m->w1 == m->w2);
237 calc_moments (m->max_moment,
238 m->w2, m->d1, m->d2, m->d3, m->d4,
239 variance, skewness, kurtosis);
244 /* Destroys a set of moments. */
246 moments_destroy (struct moments *m)
251 /* Calculates the requested moments on the CNT values in ARRAY.
252 Each value is given a weight of 1. The total weight is stored
253 into *WEIGHT (trivially) and the mean, variance, skewness, and
254 kurtosis are stored into *MEAN, *VARIANCE, *SKEWNESS, and
255 *KURTOSIS, respectively. Any of the result pointers may be
256 null, in which case no value is stored. */
258 moments_of_doubles (const double *array, size_t cnt,
260 double *mean, double *variance,
261 double *skewness, double *kurtosis)
263 enum moment max_moment;
267 if (kurtosis != NULL)
268 max_moment = MOMENT_KURTOSIS;
269 else if (skewness != NULL)
270 max_moment = MOMENT_SKEWNESS;
271 else if (variance != NULL)
272 max_moment = MOMENT_VARIANCE;
274 max_moment = MOMENT_MEAN;
276 init_moments (&m, max_moment);
277 for (idx = 0; idx < cnt; idx++)
278 moments_pass_one (&m, array[idx], 1.);
279 for (idx = 0; idx < cnt; idx++)
280 moments_pass_two (&m, array[idx], 1.);
281 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
284 /* Calculates the requested moments on the CNT numeric values in
285 ARRAY. Each value is given a weight of 1. The total weight
286 is stored into *WEIGHT (trivially) and the mean, variance,
287 skewness, and kurtosis are stored into *MEAN, *VARIANCE,
288 *SKEWNESS, and *KURTOSIS, respectively. Any of the result
289 pointers may be null, in which case no value is stored. */
291 moments_of_values (const union value *array, size_t cnt,
293 double *mean, double *variance,
294 double *skewness, double *kurtosis)
296 enum moment max_moment;
300 if (kurtosis != NULL)
301 max_moment = MOMENT_KURTOSIS;
302 else if (skewness != NULL)
303 max_moment = MOMENT_SKEWNESS;
304 else if (variance != NULL)
305 max_moment = MOMENT_VARIANCE;
307 max_moment = MOMENT_MEAN;
309 init_moments (&m, max_moment);
310 for (idx = 0; idx < cnt; idx++)
311 moments_pass_one (&m, array[idx].f, 1.);
312 for (idx = 0; idx < cnt; idx++)
313 moments_pass_two (&m, array[idx].f, 1.);
314 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
317 /* One-pass moments. */
319 /* A set of one-pass moments. */
322 enum moment max_moment; /* Highest-order moment we're computing. */
323 double w; /* Total weight so far. */
324 double d1; /* Sum of deviations from the mean. */
325 double d2; /* Sum of squared deviations from the mean. */
326 double d3; /* Sum of cubed deviations from the mean. */
327 double d4; /* Sum of (deviations from the mean)**4. */
330 /* Initializes one-pass moments M for calculating moment
331 MAX_MOMENT and lower moments. */
333 init_moments1 (struct moments1 *m, enum moment max_moment)
336 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
337 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
338 m->max_moment = max_moment;
342 /* Clears out a set of one-pass moments so that it can be reused
343 for a new set of values. The moments to be calculated are not
346 moments1_clear (struct moments1 *m)
349 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
352 /* Creates and returns a data structure for calculating moment
353 MAX_MOMENT and lower moments on a data series in a single
354 pass. The user should call moments1_add() for each value in
355 the series. The user may call moments1_calculate() to obtain
356 the current moments at any time. Call moments1_destroy() when
357 the moments are no longer needed.
359 One-pass moments should only be used when two passes over the
360 data are impractical. */
362 moments1_create (enum moment max_moment)
364 struct moments1 *m = xmalloc (sizeof *m);
365 init_moments1 (m, max_moment);
369 /* Adds VALUE with the given WEIGHT to the calculation of
372 moments1_add (struct moments1 *m, double value, double weight)
376 if (value != SYSMIS && weight > 0.)
382 v1 = (weight / m->w) * (value - m->d1);
385 if (m->max_moment >= MOMENT_VARIANCE)
388 double w_prev_w = m->w * prev_w;
389 double prev_m2 = m->d2;
391 m->d2 += w_prev_w / weight * v2;
392 if (m->max_moment >= MOMENT_SKEWNESS)
394 double w2 = weight * weight;
396 double prev_m3 = m->d3;
398 m->d3 += (-3. * v1 * prev_m2
399 + w_prev_w / w2 * (m->w - 2. * weight) * v3);
400 if (m->max_moment >= MOMENT_KURTOSIS)
402 double w3 = w2 * weight;
405 m->d4 += (-4. * v1 * prev_m3
407 + ((pow2 (m->w) - 3. * weight * prev_w)
408 * v4 * w_prev_w / w3));
415 /* Calculates one-pass moments based on the input data. Stores
416 the total weight in *WEIGHT, the mean in *MEAN, the variance
417 in *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
418 *KURTOSIS. Any of these result parameters may be null
419 pointers, in which case the values are not calculated. If any
420 result cannot be calculated, either because they are undefined
421 based on the input data or because their moments are higher
422 than the maximum requested on moments_create(), then SYSMIS is
423 stored into that result. */
425 moments1_calculate (const struct moments1 *m,
427 double *mean, double *variance,
428 double *skewness, double *kurtosis)
434 if (variance != NULL)
436 if (skewness != NULL)
438 if (kurtosis != NULL)
449 calc_moments (m->max_moment,
450 m->w, 0., m->d2, m->d3, m->d4,
451 variance, skewness, kurtosis);
455 /* Destroy one-pass moments M. */
457 moments1_destroy (struct moments1 *m)
462 /* Returns the standard error of the skewness for the given total
465 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
466 section "DESCRIPTIVES". */
468 calc_seskew (double W)
470 return sqrt ((6. * W * (W - 1.)) / ((W - 2.) * (W + 1.) * (W + 3.)));
473 /* Returns the standard error of the kurtosis for the given total
476 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
477 section "DESCRIPTIVES", except that the sqrt symbol is omitted
480 calc_sekurt (double W)
482 return sqrt ((4. * (pow2 (W) - 1.) * pow2 (calc_seskew (W)))
483 / ((W - 3.) * (W + 5.)));