1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
29 /* Calculates variance, skewness, and kurtosis into *VARIANCE,
30 *SKEWNESS, and *KURTOSIS if they are non-null and not greater
31 moments than MAX_MOMENT. Accepts W as the total weight, D1 as
32 the total deviation from the estimated mean, and D2, D3, and
33 D4 as the sum of the squares, cubes, and 4th powers,
34 respectively, of the deviation from the estimated mean. */
36 calc_moments (enum moment max_moment,
37 double w, double d1, double d2, double d3, double d4,
38 double *variance, double *skewness, double *kurtosis)
42 if (max_moment >= MOMENT_VARIANCE && w > 1.)
46 /* From _Numerical Recipes in C_, 2nd ed., 0-521-43108-5,
48 s2 = (d2 - pow2 (d1) / w) / (w - 1.);
52 /* From _SPSS Statistical Algorithms, 2nd ed.,
53 0-918469-89-9, section "DESCRIPTIVES". */
54 if (fabs (*variance) >= 1e-20)
56 if (max_moment >= MOMENT_SKEWNESS && skewness != NULL && w > 2.)
58 double s3 = s2 * sqrt (s2);
59 double g1 = (w * d3) / ((w - 1.0) * (w - 2.0) * s3);
63 if (max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && w > 3.)
65 double den = (w - 2.) * (w - 3.) * pow2 (s2);
66 double g2 = (w * (w + 1) * d4 / (w - 1.) / den
67 - 3. * pow2 (d2) / den);
75 /* Two-pass moments. */
77 /* A set of two-pass moments. */
80 enum moment max_moment; /* Highest-order moment we're computing. */
81 int pass; /* Current pass (1 or 2). */
84 double w1; /* Total weight for pass 1, so far. */
85 double sum; /* Sum of values so far. */
86 double mean; /* Mean = sum / w1. */
89 double w2; /* Total weight for pass 2, so far. */
90 double d1; /* Sum of deviations from the mean. */
91 double d2; /* Sum of squared deviations from the mean. */
92 double d3; /* Sum of cubed deviations from the mean. */
93 double d4; /* Sum of (deviations from the mean)**4. */
96 /* Initializes moments M for calculating moment MAX_MOMENT and
99 init_moments (struct moments *m, enum moment max_moment)
102 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
103 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
104 m->max_moment = max_moment;
108 /* Clears out a set of moments so that it can be reused for a new
109 set of values. The moments to be calculated are not changed. */
111 moments_clear (struct moments *m)
118 /* Creates and returns a data structure for calculating moment
119 MAX_MOMENT and lower moments on a data series. The user
120 should call moments_pass_one() for each value in the series,
121 then call moments_pass_two() for the same set of values in the
122 same order, then call moments_calculate() to obtain the
123 moments. The user may ask for the mean at any time during the
124 first pass (using moments_calculate()), but otherwise no
125 statistics may be requested until the end of the second pass.
126 Call moments_destroy() when the moments are no longer
129 moments_create (enum moment max_moment)
131 struct moments *m = xmalloc (sizeof *m);
132 init_moments (m, max_moment);
136 /* Adds VALUE with the given WEIGHT to the calculation of
137 moments for the first pass. */
139 moments_pass_one (struct moments *m, double value, double weight)
142 assert (m->pass == 1);
144 if (value != SYSMIS && weight > 0.)
146 m->sum += value * weight;
151 /* Adds VALUE with the given WEIGHT to the calculation of
152 moments for the second pass. */
154 moments_pass_two (struct moments *m, double value, double weight)
163 m->mean = m->w1 != 0. ? m->sum / m->w1 : 0.;
164 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
167 if (value != SYSMIS && weight >= 0.)
171 d = d_power = value - m->mean;
172 m->d1 += d_power * weight;
174 if (m->max_moment >= MOMENT_VARIANCE)
177 m->d2 += d_power * weight;
179 if (m->max_moment >= MOMENT_SKEWNESS)
182 m->d3 += d_power * weight;
184 if (m->max_moment >= MOMENT_KURTOSIS)
187 m->d4 += d_power * weight;
194 /* Calculates moments based on the input data. Stores the total
195 weight in *WEIGHT, the mean in *MEAN, the variance in
196 *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
197 *KURTOSIS. Any of these result parameters may be null
198 pointers, in which case the values are not calculated. If any
199 result cannot be calculated, either because they are undefined
200 based on the input data or because their moments are higher
201 than the maximum requested on moments_create(), then SYSMIS is
202 stored into that result. */
204 moments_calculate (const struct moments *m,
206 double *mean, double *variance,
207 double *skewness, double *kurtosis)
213 if (variance != NULL)
215 if (skewness != NULL)
217 if (kurtosis != NULL)
223 /* How many passes so far? */
226 /* In the first pass we can only calculate the mean. */
227 if (mean != NULL && m->w1 > 0.)
228 *mean = m->sum / m->w1;
232 /* After the second pass we can calculate any stat. We
233 don't support "online" computation during the second
234 pass, so As a simple self-check, the total weight for
235 the passes must agree. */
236 assert (m->pass == 2);
237 assert (m->w1 == m->w2);
243 calc_moments (m->max_moment,
244 m->w2, m->d1, m->d2, m->d3, m->d4,
245 variance, skewness, kurtosis);
250 /* Destroys a set of moments. */
252 moments_destroy (struct moments *m)
257 /* Calculates the requested moments on the CNT values in ARRAY.
258 Each value is given a weight of 1. The total weight is stored
259 into *WEIGHT (trivially) and the mean, variance, skewness, and
260 kurtosis are stored into *MEAN, *VARIANCE, *SKEWNESS, and
261 *KURTOSIS, respectively. Any of the result pointers may be
262 null, in which case no value is stored. */
264 moments_of_doubles (const double *array, size_t cnt,
266 double *mean, double *variance,
267 double *skewness, double *kurtosis)
269 enum moment max_moment;
273 if (kurtosis != NULL)
274 max_moment = MOMENT_KURTOSIS;
275 else if (skewness != NULL)
276 max_moment = MOMENT_SKEWNESS;
277 else if (variance != NULL)
278 max_moment = MOMENT_VARIANCE;
280 max_moment = MOMENT_MEAN;
282 init_moments (&m, max_moment);
283 for (idx = 0; idx < cnt; idx++)
284 moments_pass_one (&m, array[idx], 1.);
285 for (idx = 0; idx < cnt; idx++)
286 moments_pass_two (&m, array[idx], 1.);
287 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
290 /* Calculates the requested moments on the CNT numeric values in
291 ARRAY. Each value is given a weight of 1. The total weight
292 is stored into *WEIGHT (trivially) and the mean, variance,
293 skewness, and kurtosis are stored into *MEAN, *VARIANCE,
294 *SKEWNESS, and *KURTOSIS, respectively. Any of the result
295 pointers may be null, in which case no value is stored. */
297 moments_of_values (const union value *array, size_t cnt,
299 double *mean, double *variance,
300 double *skewness, double *kurtosis)
302 enum moment max_moment;
306 if (kurtosis != NULL)
307 max_moment = MOMENT_KURTOSIS;
308 else if (skewness != NULL)
309 max_moment = MOMENT_SKEWNESS;
310 else if (variance != NULL)
311 max_moment = MOMENT_VARIANCE;
313 max_moment = MOMENT_MEAN;
315 init_moments (&m, max_moment);
316 for (idx = 0; idx < cnt; idx++)
317 moments_pass_one (&m, array[idx].f, 1.);
318 for (idx = 0; idx < cnt; idx++)
319 moments_pass_two (&m, array[idx].f, 1.);
320 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
323 /* One-pass moments. */
325 /* A set of one-pass moments. */
328 enum moment max_moment; /* Highest-order moment we're computing. */
329 double w; /* Total weight so far. */
330 double d1; /* Sum of deviations from the mean. */
331 double d2; /* Sum of squared deviations from the mean. */
332 double d3; /* Sum of cubed deviations from the mean. */
333 double d4; /* Sum of (deviations from the mean)**4. */
336 /* Initializes one-pass moments M for calculating moment
337 MAX_MOMENT and lower moments. */
339 init_moments1 (struct moments1 *m, enum moment max_moment)
342 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
343 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
344 m->max_moment = max_moment;
348 /* Clears out a set of one-pass moments so that it can be reused
349 for a new set of values. The moments to be calculated are not
352 moments1_clear (struct moments1 *m)
355 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
358 /* Creates and returns a data structure for calculating moment
359 MAX_MOMENT and lower moments on a data series in a single
360 pass. The user should call moments1_add() for each value in
361 the series. The user may call moments1_calculate() to obtain
362 the current moments at any time. Call moments1_destroy() when
363 the moments are no longer needed.
365 One-pass moments should only be used when two passes over the
366 data are impractical. */
368 moments1_create (enum moment max_moment)
370 struct moments1 *m = xmalloc (sizeof *m);
371 init_moments1 (m, max_moment);
375 /* Adds VALUE with the given WEIGHT to the calculation of
378 moments1_add (struct moments1 *m, double value, double weight)
382 if (value != SYSMIS && weight > 0.)
388 v1 = (weight / m->w) * (value - m->d1);
391 if (m->max_moment >= MOMENT_VARIANCE)
394 double w_prev_w = m->w * prev_w;
395 double prev_m2 = m->d2;
397 m->d2 += w_prev_w / weight * v2;
398 if (m->max_moment >= MOMENT_SKEWNESS)
400 double w2 = weight * weight;
402 double prev_m3 = m->d3;
404 m->d3 += (-3. * v1 * prev_m2
405 + w_prev_w / w2 * (m->w - 2. * weight) * v3);
406 if (m->max_moment >= MOMENT_KURTOSIS)
408 double w3 = w2 * weight;
411 m->d4 += (-4. * v1 * prev_m3
413 + ((pow2 (m->w) - 3. * weight * prev_w)
414 * v4 * w_prev_w / w3));
421 /* Calculates one-pass moments based on the input data. Stores
422 the total weight in *WEIGHT, the mean in *MEAN, the variance
423 in *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
424 *KURTOSIS. Any of these result parameters may be null
425 pointers, in which case the values are not calculated. If any
426 result cannot be calculated, either because they are undefined
427 based on the input data or because their moments are higher
428 than the maximum requested on moments_create(), then SYSMIS is
429 stored into that result. */
431 moments1_calculate (const struct moments1 *m,
433 double *mean, double *variance,
434 double *skewness, double *kurtosis)
440 if (variance != NULL)
442 if (skewness != NULL)
444 if (kurtosis != NULL)
455 calc_moments (m->max_moment,
456 m->w, 0., m->d2, m->d3, m->d4,
457 variance, skewness, kurtosis);
461 /* Destroy one-pass moments M. */
463 moments1_destroy (struct moments1 *m)
468 /* Returns the standard error of the skewness for the given total
471 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
472 section "DESCRIPTIVES". */
474 calc_seskew (double W)
476 return sqrt ((6. * W * (W - 1.)) / ((W - 2.) * (W + 1.) * (W + 3.)));
479 /* Returns the standard error of the kurtosis for the given total
482 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
483 section "DESCRIPTIVES", except that the sqrt symbol is omitted
486 calc_sekurt (double W)
488 return sqrt ((4. * (pow2 (W) - 1.) * pow2 (calc_seskew (W)))
489 / ((W - 3.) * (W + 5.)));
497 read_values (double **values, double **weights, size_t *cnt)
504 while (token == T_NUM)
506 double value = tokval;
513 lex_error (_("expecting weight value"));
523 *values = xrealloc (*values, sizeof **values * cap);
524 *weights = xrealloc (*weights, sizeof **weights * cap);
527 (*values)[*cnt] = value;
528 (*weights)[*cnt] = weight;
536 cmd_debug_moments (void)
538 int retval = CMD_FAILURE;
539 double *values = NULL;
540 double *weights = NULL;
546 if (lex_match_id ("ONEPASS"))
550 lex_force_match ('/');
553 fprintf (stderr, "%s => ", lex_rest_of_line (NULL));
558 struct moments *m = NULL;
560 m = moments_create (MOMENT_KURTOSIS);
561 if (!read_values (&values, &weights, &cnt))
566 for (i = 0; i < cnt; i++)
567 moments_pass_one (m, values[i], weights[i]);
568 for (i = 0; i < cnt; i++)
569 moments_pass_two (m, values[i], weights[i]);
570 moments_calculate (m, &weight, &M[0], &M[1], &M[2], &M[3]);
575 struct moments1 *m = NULL;
577 m = moments1_create (MOMENT_KURTOSIS);
578 if (!read_values (&values, &weights, &cnt))
580 moments1_destroy (m);
583 for (i = 0; i < cnt; i++)
584 moments1_add (m, values[i], weights[i]);
585 moments1_calculate (m, &weight, &M[0], &M[1], &M[2], &M[3]);
586 moments1_destroy (m);
589 fprintf (stderr, "W=%.3f", weight);
590 for (i = 0; i < 4; i++)
592 fprintf (stderr, " M%d=", i + 1);
594 fprintf (stderr, "sysmis");
595 else if (fabs (M[i]) <= 0.0005)
596 fprintf (stderr, "0.000");
598 fprintf (stderr, "%.3f", M[i]);
600 fprintf (stderr, "\n");
602 retval = lex_end_of_command ();