1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 /* FIXME: Many possible optimizations. */
27 #include "algorithm.h"
29 #include "bitvector.h"
44 +missing=miss:!variable/listwise,incl:!noinclude/include;
45 +format=labeled:!labels/nolabels,indexed:!noindex/index,lined:!line/serial;
47 +options[op_]=1,2,3,4,5,6,7,8;
48 +statistics[st_]=all,1|mean,2|semean,5|stddev,6|variance,7|kurtosis,
49 8|skewness,9|range,10|minimum,11|maximum,12|sum,
50 13|default,14|seskewness,15|sekurtosis;
51 +sort=sortby:mean/semean/stddev/variance/kurtosis/skewness/range/
52 range/minimum/maximum/sum/name/seskewness/sekurtosis/!none,
58 /* DESCRIPTIVES private data. */
60 /* Describes properties of a distribution for the purpose of
61 calculating a Z-score. */
64 struct variable *s, *d; /* Source, destination variable. */
65 double mean; /* Distribution mean. */
66 double std_dev; /* Distribution standard deviation. */
69 /* DESCRIPTIVES transformation (for calculating Z-scores). */
70 struct descriptives_trns
73 int n; /* Number of Z-scores. */
74 struct dsc_z_score *z; /* Array of Z-scores. */
77 /* These next three vars, see comment at top of display(). */
78 /* Number of cases missing listwise, even if option 5 not selected. */
79 static double d_glob_miss_list;
81 /* Number of total *cases* valid or missing, as a double. Unless
82 option 5 is selected, d_glob_missing is 0. */
83 static double d_glob_valid, d_glob_missing;
85 /* Set when a weighting variable is missing or <=0. */
86 static int bad_weight;
88 /* Number of generic zvarnames we've generated in this execution. */
91 /* Variables specified on command. */
92 static struct variable **v_variables;
93 static int n_variables;
95 /* Command specifications. */
96 static struct cmd_descriptives cmd;
98 /* Whether z-scores are computed. */
101 /* Statistic to sort by. */
102 static int sortby_stat;
104 /* Statistics to display. */
105 static unsigned long stats;
107 /* Easier access to long-named arrays. */
108 #define stat cmd.a_statistics
109 #define opt cmd.a_options
111 /* Groups of statistics. */
114 #define dsc_default \
115 (BI (dsc_mean) | BI (dsc_stddev) | BI (dsc_min) | BI (dsc_max))
118 (BI (dsc_sum) | BI (dsc_min) | BI (dsc_max) \
119 | BI (dsc_mean) | BI (dsc_semean) | BI (dsc_stddev) \
120 | BI (dsc_variance) | BI (dsc_kurt) | BI (dsc_sekurt) \
121 | BI (dsc_skew) | BI (dsc_seskew) | BI (dsc_range) \
124 /* Table of options. */
125 #define op_incl_miss DSC_OP_1 /* Honored. */
126 #define op_no_varlabs DSC_OP_2 /* Ignored. */
127 #define op_zscores DSC_OP_3 /* Honored. */
128 #define op_index DSC_OP_4 /* FIXME. */
129 #define op_excl_miss DSC_OP_5 /* Honored. */
130 #define op_serial DSC_OP_6 /* Honored. */
131 #define op_narrow DSC_OP_7 /* Ignored. */
132 #define op_no_varnames DSC_OP_8 /* Honored. */
134 /* Describes one statistic that can be calculated. */
135 /* FIXME: Currently sm,col_width are not used. */
138 int st_indx; /* Index into st_a_statistics[]. */
139 int sb_indx; /* Sort-by index. */
140 const char *s10; /* Name, stuffed into 10 columns. */
141 const char *s8; /* Name, stuffed into 8 columns. */
142 const char *sm; /* Name, stuffed minimally. */
143 const char *s; /* Full name. */
144 int max_degree; /* Highest degree necessary to calculate this
146 int col_width; /* Column width (not incl. spacing between columns) */
149 /* Table of statistics, indexed by dsc_*. */
150 static struct dsc_info dsc_info[dsc_n_stats] =
152 {DSC_ST_MEAN, DSC_MEAN, N_("Mean"), N_("Mean"), N_("Mean"), N_("mean"), 1, 10},
153 {DSC_ST_SEMEAN, DSC_SEMEAN, N_("S.E. Mean"), N_("S E Mean"), N_("SE"),
154 N_("standard error of mean"), 2, 10},
155 {DSC_ST_STDDEV, DSC_STDDEV, N_("Std Dev"), N_("Std Dev"), N_("SD"),
156 N_("standard deviation"), 2, 11},
157 {DSC_ST_VARIANCE, DSC_VARIANCE, N_("Variance"), N_("Variance"),
158 N_("Var"), N_("variance"), 2, 12},
159 {DSC_ST_KURTOSIS, DSC_KURTOSIS, N_("Kurtosis"), N_("Kurtosis"),
160 N_("Kurt"), N_("kurtosis"), 4, 9},
161 {DSC_ST_SEKURTOSIS, DSC_SEKURTOSIS, N_("S.E. Kurt"), N_("S E Kurt"), N_("SEKurt"),
162 N_("standard error of kurtosis"), 0, 9},
163 {DSC_ST_SKEWNESS, DSC_SKEWNESS, N_("Skewness"), N_("Skewness"), N_("Skew"),
164 N_("skewness"), 3, 9},
165 {DSC_ST_SESKEWNESS, DSC_SESKEWNESS, N_("S.E. Skew"), N_("S E Skew"), N_("SESkew"),
166 N_("standard error of skewness"), 0, 9},
167 {DSC_ST_RANGE, DSC_RANGE, N_("Range"), N_("Range"), N_("Rng"), N_("range"), 0, 10},
168 {DSC_ST_MINIMUM, DSC_MINIMUM, N_("Minimum"), N_("Minimum"), N_("Min"),
169 N_("minimum"), 0, 10},
170 {DSC_ST_MAXIMUM, DSC_MAXIMUM, N_("Maximum"), N_("Maximum"), N_("Max"),
171 N_("maximum"), 0, 10},
172 {DSC_ST_SUM, DSC_SUM, N_("Sum"), N_("Sum"), N_("Sum"), N_("sum"), 1, 13},
175 /* Z-score functions. */
176 static int generate_z_varname (struct variable * v);
177 static void dump_z_table (void);
178 static void run_z_pass (void);
180 /* Procedure execution functions. */
181 static int calc (struct ccase *, void *);
182 static void precalc (void *);
183 static void postcalc (void *);
184 static void display (void);
186 /* Parser and outline. */
189 cmd_descriptives (void)
197 lex_match_id ("DESCRIPTIVES");
198 lex_match_id ("CONDESCRIPTIVES");
199 if (!parse_descriptives (&cmd))
202 if (n_variables == 0)
204 for (i = 0; i < n_variables; i++)
208 v->p.dsc.zname[0] = 0;
213 msg (SE, _("No variables specified."));
217 if (cmd.sbc_options && (cmd.sbc_save || cmd.sbc_format || cmd.sbc_missing))
219 msg (SE, _("OPTIONS may not be used with SAVE, FORMAT, or MISSING."));
223 if (!cmd.sbc_options)
225 if (cmd.incl == DSC_INCLUDE)
226 opt[op_incl_miss] = 1;
227 if (cmd.labeled == DSC_NOLABELS)
228 opt[op_no_varlabs] = 1;
231 if (cmd.miss == DSC_LISTWISE)
232 opt[op_excl_miss] = 1;
233 if (cmd.lined == DSC_SERIAL)
237 /* Construct z-score varnames, show translation table. */
241 for (i = 0; i < n_variables; i++)
247 if (v->p.dsc.zname[0] == 0)
248 if (!generate_z_varname (v))
255 /* Figure out statistics to calculate. */
257 if (stat[DSC_ST_DEFAULT] || !cmd.sbc_statistics)
258 stats |= dsc_default;
259 if (stat[DSC_ST_ALL])
261 for (i = 0; i < dsc_n_stats; i++)
262 if (stat[dsc_info[i].st_indx])
263 stats |= BIT_INDEX (i);
264 if (stats & dsc_kurt)
266 if (stats & dsc_skew)
269 /* Check the sort order. */
271 if (cmd.sortby == DSC_NONE)
273 else if (cmd.sortby != DSC_NAME)
275 for (i = 0; i < n_variables; i++)
276 if (dsc_info[i].sb_indx == cmd.sortby)
279 if (!(stats & BIT_INDEX (i)))
281 msg (SE, _("It's not possible to sort on `%s' without "
283 gettext (dsc_info[i].s), gettext (dsc_info[i].s));
287 assert (sortby_stat != -1);
292 procedure (precalc, calc, postcalc, NULL);
295 msg (SW, _("At least one case in the data file had a weight value "
296 "that was system-missing, zero, or negative. These case(s) "
313 /* Parses the VARIABLES subcommand. */
315 dsc_custom_variables (struct cmd_descriptives *cmd UNUSED)
317 if (!lex_match_id ("VARIABLES")
318 && (token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
323 while (token == T_ID || token == T_ALL)
328 if (!parse_variables (default_dict, &v_variables, &n_variables,
329 PV_DUPLICATE | PV_SINGLE | PV_APPEND | PV_NUMERIC
334 if (n_variables - n > 1)
336 msg (SE, _("Names for z-score variables must be given for "
337 "individual variables, not for groups of "
341 assert (n_variables - n <= 0);
344 msg (SE, _("Name for z-score variable expected."));
347 if (dict_lookup_var (default_dict, tokid))
349 msg (SE, _("Z-score variable name `%s' is a "
350 "duplicate variable name with a current variable."),
354 for (i = 0; i < n_variables; i++)
355 if (v_variables[i]->p.dsc.zname[0]
356 && !strcmp (v_variables[i]->p.dsc.zname, tokid))
358 msg (SE, _("Z-score variable name `%s' is "
359 "used multiple times."), tokid);
362 strcpy (v_variables[n_variables - 1]->p.dsc.zname, tokid);
366 msg (SE, _("`)' expected after z-score variable name."));
379 /* Returns 0 if NAME is a duplicate of any existing variable name or
380 of any previously-declared z-var name; otherwise returns 1. */
382 try_name (char *name)
386 if (dict_lookup_var (default_dict, name) != NULL)
388 for (i = 0; i < n_variables; i++)
390 struct variable *v = v_variables[i];
391 if (!strcmp (v->p.dsc.zname, name))
398 generate_z_varname (struct variable * v)
402 strcpy (&zname[1], v->name);
405 if (try_name (zname))
407 strcpy (v->p.dsc.zname, zname);
413 /* Generate variable name. */
417 sprintf (zname, "ZSC%03d", z_count);
418 else if (z_count <= 108)
419 sprintf (zname, "STDZ%02d", z_count - 99);
420 else if (z_count <= 117)
421 sprintf (zname, "ZZZZ%02d", z_count - 108);
422 else if (z_count <= 126)
423 sprintf (zname, "ZQZQ%02d", z_count - 117);
426 msg (SE, _("Ran out of generic names for Z-score variables. "
427 "There are only 126 generic names: ZSC001-ZSC0999, "
428 "STDZ01-STDZ09, ZZZZ01-ZZZZ09, ZQZQ01-ZQZQ09."));
432 if (try_name (zname))
434 strcpy (v->p.dsc.zname, zname);
449 for (count = i = 0; i < n_variables; i++)
450 if (v_variables[i]->p.dsc.zname)
454 t = tab_create (2, count + 1, 0);
455 tab_title (t, 0, _("Mapping of variables to corresponding Z-scores."));
456 tab_columns (t, SOM_COL_DOWN, 1);
457 tab_headers (t, 0, 0, 1, 0);
458 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, count);
459 tab_hline (t, TAL_2, 0, 1, 1);
460 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Source"));
461 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Target"));
462 tab_dim (t, tab_natural_dimensions);
467 for (i = 0, y = 1; i < n_variables; i++)
468 if (v_variables[i]->p.dsc.zname)
470 tab_text (t, 0, y, TAB_LEFT, v_variables[i]->name);
471 tab_text (t, 1, y++, TAB_LEFT, v_variables[i]->p.dsc.zname);
478 /* Transformation function to calculate Z-scores. */
480 descriptives_trns_proc (struct trns_header * trns, struct ccase * c)
482 struct descriptives_trns *t = (struct descriptives_trns *) trns;
483 struct dsc_z_score *z;
486 for (i = t->n, z = t->z; i--; z++)
488 double score = c->data[z->s->fv].f;
490 if (z->mean == SYSMIS || score == SYSMIS)
491 c->data[z->d->fv].f = SYSMIS;
493 c->data[z->d->fv].f = (score - z->mean) / z->std_dev;
498 /* Frees a descriptives_trns struct. */
500 descriptives_trns_free (struct trns_header * trns)
502 struct descriptives_trns *t = (struct descriptives_trns *) trns;
507 /* The name is a misnomer: actually this function sets up a
508 transformation by which scores can be converted into Z-scores. */
512 struct descriptives_trns *t;
515 for (i = 0; i < n_variables; i++)
516 v_variables[i]->p.dsc.dup = 0;
517 for (count = i = 0; i < n_variables; i++)
519 if (v_variables[i]->p.dsc.dup++)
521 if (v_variables[i]->p.dsc.zname)
525 t = xmalloc (sizeof *t);
526 t->h.proc = descriptives_trns_proc;
527 t->h.free = descriptives_trns_free;
529 t->z = xmalloc (count * sizeof *t->z);
531 for (i = 0; i < n_variables; i++)
532 v_variables[i]->p.dsc.dup = 0;
533 for (count = i = 0; i < n_variables; i++)
535 struct variable *v = v_variables[i];
536 if (v->p.dsc.dup++ == 0 && v->p.dsc.zname[0])
542 t->z[count].d = d = dict_create_var_assert (default_dict,
547 d->label = xmalloc (strlen (v->label) + 12);
548 cp = stpcpy (d->label, _("Z-score of "));
549 strcpy (cp, v->label);
553 d->label = xmalloc (strlen (v->name) + 12);
554 cp = stpcpy (d->label, _("Z-score of "));
555 strcpy (cp, v->name);
557 t->z[count].mean = v->p.dsc.stats[dsc_mean];
558 t->z[count].std_dev = v->p.dsc.stats[dsc_stddev];
559 if (t->z[count].std_dev == SYSMIS
560 || approx_eq (t->z[count].std_dev, 0.0))
561 t->z[count].mean = SYSMIS;
566 add_transformation ((struct trns_header *) t);
569 /* Statistical calculation. */
572 precalc (void *aux UNUSED)
576 for (i = 0; i < n_variables; i++)
577 v_variables[i]->p.dsc.dup = -2;
578 for (i = 0; i < n_variables; i++)
580 struct descriptives_proc *dsc = &v_variables[i]->p.dsc;
582 /* Don't need to initialize more than once. */
587 dsc->valid = dsc->miss = 0.0;
588 dsc->X_bar = dsc->M2 = dsc->M3 = dsc->M4 = 0.0;
592 d_glob_valid = d_glob_missing = 0.0;
596 calc (struct ccase * c, void *aux UNUSED)
600 /* Unique case identifier. */
603 /* Get the weight for this case. */
604 double weight = dict_get_case_weight (default_dict, c);
613 /* Handle missing values. */
614 for (i = 0; i < n_variables; i++)
616 struct variable *v = v_variables[i];
617 double X = c->data[v->fv].f;
619 if (X == SYSMIS || (!opt[op_incl_miss] && is_num_user_missing (X, v)))
621 if (opt[op_excl_miss])
623 d_glob_missing += weight;
628 d_glob_miss_list += weight;
633 d_glob_valid += weight;
636 for (i = 0; i < n_variables; i++)
638 struct descriptives_proc *inf = &v_variables[i]->p.dsc;
645 if (inf->dup == case_id)
649 X = c->data[v_variables[i]->fv].f;
651 || (!opt[op_incl_miss] && is_num_user_missing (X, v_variables[i])))
657 /* These formulas taken from _SPSS Statistical Algorithms_. The
658 names W_old, and W_new are used for W_j-1 and W_j,
659 respectively, and other variables simply have the subscripts
660 trimmed off, except for X_bar.
662 I am happy that mathematical formulas may not be
665 W_new = inf->valid += weight;
666 v = (weight / W_new) * (X - inf->X_bar);
670 w2 = weight * weight;
673 inf->M4 += (-4.0 * v * inf->M3 + 6.0 * v2 * inf->M2
674 + (W_new * W_new - 3 * weight * W_old / w3) * v4 * W_old * W_new);
675 inf->M3 += (-3.0 * v * inf->M2 + W_new * W_old / w2
676 * (W_new - 2 * weight) * v3);
677 inf->M2 += W_new * W_old / weight * v2;
688 postcalc (void *aux UNUSED)
692 if (opt[op_excl_miss])
693 d_glob_miss_list = d_glob_missing;
695 for (i = 0; i < n_variables; i++)
697 struct descriptives_proc *dsc = &v_variables[i]->p.dsc;
700 /* Don't duplicate our efforts. */
707 dsc->stats[dsc_mean] = dsc->X_bar;
708 dsc->stats[dsc_variance] = dsc->M2 / (W - 1);
709 dsc->stats[dsc_stddev] = sqrt (dsc->stats[dsc_variance]);
710 dsc->stats[dsc_semean] = dsc->stats[dsc_stddev] / sqrt (W);
711 dsc->stats[dsc_min] = dsc->min == DBL_MAX ? SYSMIS : dsc->min;
712 dsc->stats[dsc_max] = dsc->max == -DBL_MAX ? SYSMIS : dsc->max;
713 dsc->stats[dsc_range] = ((dsc->min == DBL_MAX || dsc->max == -DBL_MAX)
714 ? SYSMIS : dsc->max - dsc->min);
715 dsc->stats[dsc_sum] = W * dsc->X_bar;
716 if (W > 2.0 && dsc->stats[dsc_variance] >= 1e-20)
718 double S = dsc->stats[dsc_stddev];
719 dsc->stats[dsc_skew] = (W * dsc->M3 / ((W - 1.0) * (W - 2.0) * S * S * S));
720 dsc->stats[dsc_seskew] =
721 sqrt (6.0 * W * (W - 1.0) / ((W - 2.0) * (W + 1.0) * (W + 3.0)));
725 dsc->stats[dsc_skew] = dsc->stats[dsc_seskew] = SYSMIS;
727 if (W > 3.0 && dsc->stats[dsc_variance] >= 1e-20)
729 double S2 = dsc->stats[dsc_variance];
730 double SE_g1 = dsc->stats[dsc_seskew];
732 dsc->stats[dsc_kurt] =
733 (W * (W + 1.0) * dsc->M4 - 3.0 * dsc->M2 * dsc->M2 * (W - 1.0))
734 / ((W - 1.0) * (W - 2.0) * (W - 3.0) * S2 * S2);
736 /* Note that in _SPSS Statistical Algorithms_, the square
737 root symbol is missing from this formula. */
738 dsc->stats[dsc_sekurt] =
739 sqrt ((4.0 * (W * W - 1.0) * SE_g1 * SE_g1) / ((W - 3.0) * (W + 5.0)));
743 dsc->stats[dsc_kurt] = dsc->stats[dsc_sekurt] = SYSMIS;
750 /* Statistical display. */
752 static algo_compare_func descriptives_compare_variables;
762 /* If op_excl_miss is on, d_glob_valid and (potentially)
763 d_glob_missing are nonzero, and d_glob_missing equals
766 If op_excl_miss is off, d_glob_valid is nonzero. d_glob_missing
767 is zero. d_glob_miss_list is (potentially) nonzero. */
769 if (sortby_stat != -2)
770 sort (v_variables, n_variables, sizeof *v_variables,
771 descriptives_compare_variables, &cmd);
773 for (nc = i = 0; i < dsc_n_stats; i++)
774 if (stats & BIT_INDEX (i))
777 if (!opt[op_no_varnames])
779 nc += opt[op_serial] ? 2 : 1;
781 t = tab_create (nc, n_variables + 1, 0);
782 tab_headers (t, 1, 0, 1, 0);
783 tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, nc - 1, n_variables);
784 tab_box (t, -1, -1, -1, TAL_1, 1, 0, nc - 1, n_variables);
785 tab_hline (t, TAL_2, 0, nc - 1, 1);
786 tab_vline (t, TAL_2, 1, 0, n_variables);
787 tab_dim (t, tab_natural_dimensions);
790 if (!opt[op_no_varnames])
792 tab_text (t, nc++, 0, TAB_LEFT | TAT_TITLE, _("Variable"));
796 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Valid N"));
797 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Missing N"));
799 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, "N");
802 for (i = 0; i < dsc_n_stats; i++)
803 if (stats & BIT_INDEX (i))
805 const char *title = gettext (dsc_info[i].s8);
806 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, title);
809 for (i = 0; i < n_variables; i++)
811 struct variable *v = v_variables[i];
814 if (!opt[op_no_varnames])
815 tab_text (t, nc++, i + 1, TAB_LEFT, v->name);
816 tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.valid);
818 tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.miss);
819 for (j = 0; j < dsc_n_stats; j++)
820 if (stats & BIT_INDEX (j))
821 tab_float (t, nc++, i + 1, TAB_NONE, v->p.dsc.stats[j], 10, 3);
824 tab_title (t, 1, _("Valid cases = %g; cases with missing value(s) = %g."),
825 d_glob_valid, d_glob_miss_list);
830 /* Compares variables A and B according to the ordering specified
833 descriptives_compare_variables (const void *a_, const void *b_, void *cmd_)
835 struct variable *const *ap = a_;
836 struct variable *const *bp = b_;
837 struct variable *a = *ap;
838 struct variable *b = *bp;
839 struct cmd_descriptives *cmd = cmd_;
843 if (cmd->sortby == DSC_NAME)
844 result = strcmp (a->name, b->name);
847 double as = a->p.dsc.stats[sortby_stat];
848 double bs = b->p.dsc.stats[sortby_stat];
850 result = as < bs ? -1 : as > bs;
853 if (cmd->order == DSC_D)