1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 /* FIXME: Many possible optimizations. */
27 #include "algorithm.h"
29 #include "bitvector.h"
43 +missing=miss:!variable/listwise,incl:!noinclude/include;
44 +format=labeled:!labels/nolabels,indexed:!noindex/index,lined:!line/serial;
46 +options[op_]=1,2,3,4,5,6,7,8;
47 +statistics[st_]=all,1|mean,2|semean,5|stddev,6|variance,7|kurtosis,
48 8|skewness,9|range,10|minimum,11|maximum,12|sum,
49 13|default,14|seskewness,15|sekurtosis;
50 +sort=sortby:mean/semean/stddev/variance/kurtosis/skewness/range/
51 range/minimum/maximum/sum/name/seskewness/sekurtosis/!none,
57 /* DESCRIPTIVES private data. */
59 /* Describes properties of a distribution for the purpose of
60 calculating a Z-score. */
63 struct variable *s, *d; /* Source, destination variable. */
64 double mean; /* Distribution mean. */
65 double std_dev; /* Distribution standard deviation. */
68 /* DESCRIPTIVES transformation (for calculating Z-scores). */
69 struct descriptives_trns
72 int n; /* Number of Z-scores. */
73 struct dsc_z_score *z; /* Array of Z-scores. */
76 /* These next three vars, see comment at top of display(). */
77 /* Number of cases missing listwise, even if option 5 not selected. */
78 static double d_glob_miss_list;
80 /* Number of total *cases* valid or missing, as a double. Unless
81 option 5 is selected, d_glob_missing is 0. */
82 static double d_glob_valid, d_glob_missing;
84 /* Set when a weighting variable is missing or <=0. */
85 static int bad_weight;
87 /* Number of generic zvarnames we've generated in this execution. */
90 /* Variables specified on command. */
91 static struct variable **v_variables;
92 static int n_variables;
94 /* Command specifications. */
95 static struct cmd_descriptives cmd;
97 /* Whether z-scores are computed. */
100 /* Statistic to sort by. */
101 static int sortby_stat;
103 /* Statistics to display. */
104 static unsigned long stats;
106 /* Easier access to long-named arrays. */
107 #define stat cmd.a_statistics
108 #define opt cmd.a_options
110 /* Groups of statistics. */
113 #define dsc_default \
114 (BI (dsc_mean) | BI (dsc_stddev) | BI (dsc_min) | BI (dsc_max))
117 (BI (dsc_sum) | BI (dsc_min) | BI (dsc_max) \
118 | BI (dsc_mean) | BI (dsc_semean) | BI (dsc_stddev) \
119 | BI (dsc_variance) | BI (dsc_kurt) | BI (dsc_sekurt) \
120 | BI (dsc_skew) | BI (dsc_seskew) | BI (dsc_range) \
123 /* Table of options. */
124 #define op_incl_miss DSC_OP_1 /* Honored. */
125 #define op_no_varlabs DSC_OP_2 /* Ignored. */
126 #define op_zscores DSC_OP_3 /* Honored. */
127 #define op_index DSC_OP_4 /* FIXME. */
128 #define op_excl_miss DSC_OP_5 /* Honored. */
129 #define op_serial DSC_OP_6 /* Honored. */
130 #define op_narrow DSC_OP_7 /* Ignored. */
131 #define op_no_varnames DSC_OP_8 /* Honored. */
133 /* Describes one statistic that can be calculated. */
134 /* FIXME: Currently sm,col_width are not used. */
137 int st_indx; /* Index into st_a_statistics[]. */
138 int sb_indx; /* Sort-by index. */
139 const char *s10; /* Name, stuffed into 10 columns. */
140 const char *s8; /* Name, stuffed into 8 columns. */
141 const char *sm; /* Name, stuffed minimally. */
142 const char *s; /* Full name. */
143 int max_degree; /* Highest degree necessary to calculate this
145 int col_width; /* Column width (not incl. spacing between columns) */
148 /* Table of statistics, indexed by dsc_*. */
149 static struct dsc_info dsc_info[dsc_n_stats] =
151 {DSC_ST_MEAN, DSC_MEAN, N_("Mean"), N_("Mean"), N_("Mean"), N_("mean"), 1, 10},
152 {DSC_ST_SEMEAN, DSC_SEMEAN, N_("S.E. Mean"), N_("S E Mean"), N_("SE"),
153 N_("standard error of mean"), 2, 10},
154 {DSC_ST_STDDEV, DSC_STDDEV, N_("Std Dev"), N_("Std Dev"), N_("SD"),
155 N_("standard deviation"), 2, 11},
156 {DSC_ST_VARIANCE, DSC_VARIANCE, N_("Variance"), N_("Variance"),
157 N_("Var"), N_("variance"), 2, 12},
158 {DSC_ST_KURTOSIS, DSC_KURTOSIS, N_("Kurtosis"), N_("Kurtosis"),
159 N_("Kurt"), N_("kurtosis"), 4, 9},
160 {DSC_ST_SEKURTOSIS, DSC_SEKURTOSIS, N_("S.E. Kurt"), N_("S E Kurt"), N_("SEKurt"),
161 N_("standard error of kurtosis"), 0, 9},
162 {DSC_ST_SKEWNESS, DSC_SKEWNESS, N_("Skewness"), N_("Skewness"), N_("Skew"),
163 N_("skewness"), 3, 9},
164 {DSC_ST_SESKEWNESS, DSC_SESKEWNESS, N_("S.E. Skew"), N_("S E Skew"), N_("SESkew"),
165 N_("standard error of skewness"), 0, 9},
166 {DSC_ST_RANGE, DSC_RANGE, N_("Range"), N_("Range"), N_("Rng"), N_("range"), 0, 10},
167 {DSC_ST_MINIMUM, DSC_MINIMUM, N_("Minimum"), N_("Minimum"), N_("Min"),
168 N_("minimum"), 0, 10},
169 {DSC_ST_MAXIMUM, DSC_MAXIMUM, N_("Maximum"), N_("Maximum"), N_("Max"),
170 N_("maximum"), 0, 10},
171 {DSC_ST_SUM, DSC_SUM, N_("Sum"), N_("Sum"), N_("Sum"), N_("sum"), 1, 13},
174 /* Z-score functions. */
175 static int generate_z_varname (struct variable * v);
176 static void dump_z_table (void);
177 static void run_z_pass (void);
179 /* Procedure execution functions. */
180 static int calc (struct ccase *, void *);
181 static void precalc (void *);
182 static void postcalc (void *);
183 static void display (void);
185 /* Parser and outline. */
188 cmd_descriptives (void)
196 lex_match_id ("DESCRIPTIVES");
197 lex_match_id ("CONDESCRIPTIVES");
198 if (!parse_descriptives (&cmd))
201 if (n_variables == 0)
203 for (i = 0; i < n_variables; i++)
207 v->p.dsc.zname[0] = 0;
212 msg (SE, _("No variables specified."));
216 if (cmd.sbc_options && (cmd.sbc_save || cmd.sbc_format || cmd.sbc_missing))
218 msg (SE, _("OPTIONS may not be used with SAVE, FORMAT, or MISSING."));
222 if (!cmd.sbc_options)
224 if (cmd.incl == DSC_INCLUDE)
225 opt[op_incl_miss] = 1;
226 if (cmd.labeled == DSC_NOLABELS)
227 opt[op_no_varlabs] = 1;
230 if (cmd.miss == DSC_LISTWISE)
231 opt[op_excl_miss] = 1;
232 if (cmd.lined == DSC_SERIAL)
236 /* Construct z-score varnames, show translation table. */
240 for (i = 0; i < n_variables; i++)
246 if (v->p.dsc.zname[0] == 0)
247 if (!generate_z_varname (v))
254 /* Figure out statistics to calculate. */
256 if (stat[DSC_ST_DEFAULT] || !cmd.sbc_statistics)
257 stats |= dsc_default;
258 if (stat[DSC_ST_ALL])
260 for (i = 0; i < dsc_n_stats; i++)
261 if (stat[dsc_info[i].st_indx])
262 stats |= BIT_INDEX (i);
263 if (stats & dsc_kurt)
265 if (stats & dsc_skew)
268 /* Check the sort order. */
270 if (cmd.sortby == DSC_NONE)
272 else if (cmd.sortby != DSC_NAME)
274 for (i = 0; i < n_variables; i++)
275 if (dsc_info[i].sb_indx == cmd.sortby)
278 if (!(stats & BIT_INDEX (i)))
280 msg (SE, _("It's not possible to sort on `%s' without "
282 gettext (dsc_info[i].s), gettext (dsc_info[i].s));
286 assert (sortby_stat != -1);
291 procedure (precalc, calc, postcalc, NULL);
294 msg (SW, _("At least one case in the data file had a weight value "
295 "that was system-missing, zero, or negative. These case(s) "
312 /* Parses the VARIABLES subcommand. */
314 dsc_custom_variables (struct cmd_descriptives *cmd UNUSED)
316 if (!lex_match_id ("VARIABLES")
317 && (token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
322 while (token == T_ID || token == T_ALL)
327 if (!parse_variables (default_dict, &v_variables, &n_variables,
328 PV_DUPLICATE | PV_SINGLE | PV_APPEND | PV_NUMERIC
333 if (n_variables - n > 1)
335 msg (SE, _("Names for z-score variables must be given for "
336 "individual variables, not for groups of "
340 assert (n_variables - n <= 0);
343 msg (SE, _("Name for z-score variable expected."));
346 if (dict_lookup_var (default_dict, tokid))
348 msg (SE, _("Z-score variable name `%s' is a "
349 "duplicate variable name with a current variable."),
353 for (i = 0; i < n_variables; i++)
354 if (v_variables[i]->p.dsc.zname[0]
355 && !strcmp (v_variables[i]->p.dsc.zname, tokid))
357 msg (SE, _("Z-score variable name `%s' is "
358 "used multiple times."), tokid);
361 strcpy (v_variables[n_variables - 1]->p.dsc.zname, tokid);
365 msg (SE, _("`)' expected after z-score variable name."));
378 /* Returns 0 if NAME is a duplicate of any existing variable name or
379 of any previously-declared z-var name; otherwise returns 1. */
381 try_name (char *name)
385 if (dict_lookup_var (default_dict, name) != NULL)
387 for (i = 0; i < n_variables; i++)
389 struct variable *v = v_variables[i];
390 if (!strcmp (v->p.dsc.zname, name))
397 generate_z_varname (struct variable * v)
401 strcpy (&zname[1], v->name);
404 if (try_name (zname))
406 strcpy (v->p.dsc.zname, zname);
412 /* Generate variable name. */
416 sprintf (zname, "ZSC%03d", z_count);
417 else if (z_count <= 108)
418 sprintf (zname, "STDZ%02d", z_count - 99);
419 else if (z_count <= 117)
420 sprintf (zname, "ZZZZ%02d", z_count - 108);
421 else if (z_count <= 126)
422 sprintf (zname, "ZQZQ%02d", z_count - 117);
425 msg (SE, _("Ran out of generic names for Z-score variables. "
426 "There are only 126 generic names: ZSC001-ZSC0999, "
427 "STDZ01-STDZ09, ZZZZ01-ZZZZ09, ZQZQ01-ZQZQ09."));
431 if (try_name (zname))
433 strcpy (v->p.dsc.zname, zname);
448 for (count = i = 0; i < n_variables; i++)
449 if (v_variables[i]->p.dsc.zname)
453 t = tab_create (2, count + 1, 0);
454 tab_title (t, 0, _("Mapping of variables to corresponding Z-scores."));
455 tab_columns (t, SOM_COL_DOWN, 1);
456 tab_headers (t, 0, 0, 1, 0);
457 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, count);
458 tab_hline (t, TAL_2, 0, 1, 1);
459 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Source"));
460 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Target"));
461 tab_dim (t, tab_natural_dimensions);
466 for (i = 0, y = 1; i < n_variables; i++)
467 if (v_variables[i]->p.dsc.zname)
469 tab_text (t, 0, y, TAB_LEFT, v_variables[i]->name);
470 tab_text (t, 1, y++, TAB_LEFT, v_variables[i]->p.dsc.zname);
477 /* Transformation function to calculate Z-scores. */
479 descriptives_trns_proc (struct trns_header * trns, struct ccase * c)
481 struct descriptives_trns *t = (struct descriptives_trns *) trns;
482 struct dsc_z_score *z;
485 for (i = t->n, z = t->z; i--; z++)
487 double score = c->data[z->s->fv].f;
489 if (z->mean == SYSMIS || score == SYSMIS)
490 c->data[z->d->fv].f = SYSMIS;
492 c->data[z->d->fv].f = (score - z->mean) / z->std_dev;
497 /* Frees a descriptives_trns struct. */
499 descriptives_trns_free (struct trns_header * trns)
501 struct descriptives_trns *t = (struct descriptives_trns *) trns;
506 /* The name is a misnomer: actually this function sets up a
507 transformation by which scores can be converted into Z-scores. */
511 struct descriptives_trns *t;
514 for (i = 0; i < n_variables; i++)
515 v_variables[i]->p.dsc.dup = 0;
516 for (count = i = 0; i < n_variables; i++)
518 if (v_variables[i]->p.dsc.dup++)
520 if (v_variables[i]->p.dsc.zname)
524 t = xmalloc (sizeof *t);
525 t->h.proc = descriptives_trns_proc;
526 t->h.free = descriptives_trns_free;
528 t->z = xmalloc (count * sizeof *t->z);
530 for (i = 0; i < n_variables; i++)
531 v_variables[i]->p.dsc.dup = 0;
532 for (count = i = 0; i < n_variables; i++)
534 struct variable *v = v_variables[i];
535 if (v->p.dsc.dup++ == 0 && v->p.dsc.zname[0])
541 t->z[count].d = d = dict_create_var_assert (default_dict,
546 d->label = xmalloc (strlen (v->label) + 12);
547 cp = stpcpy (d->label, _("Z-score of "));
548 strcpy (cp, v->label);
552 d->label = xmalloc (strlen (v->name) + 12);
553 cp = stpcpy (d->label, _("Z-score of "));
554 strcpy (cp, v->name);
556 t->z[count].mean = v->p.dsc.stats[dsc_mean];
557 t->z[count].std_dev = v->p.dsc.stats[dsc_stddev];
558 if (t->z[count].std_dev == SYSMIS || t->z[count].std_dev == 0.0)
559 t->z[count].mean = SYSMIS;
564 add_transformation ((struct trns_header *) t);
567 /* Statistical calculation. */
570 precalc (void *aux UNUSED)
574 for (i = 0; i < n_variables; i++)
575 v_variables[i]->p.dsc.dup = -2;
576 for (i = 0; i < n_variables; i++)
578 struct descriptives_proc *dsc = &v_variables[i]->p.dsc;
580 /* Don't need to initialize more than once. */
585 dsc->valid = dsc->miss = 0.0;
586 dsc->X_bar = dsc->M2 = dsc->M3 = dsc->M4 = 0.0;
590 d_glob_valid = d_glob_missing = 0.0;
594 calc (struct ccase * c, void *aux UNUSED)
598 /* Unique case identifier. */
601 /* Get the weight for this case. */
602 double weight = dict_get_case_weight (default_dict, c);
611 /* Handle missing values. */
612 for (i = 0; i < n_variables; i++)
614 struct variable *v = v_variables[i];
615 double X = c->data[v->fv].f;
617 if (X == SYSMIS || (!opt[op_incl_miss] && is_num_user_missing (X, v)))
619 if (opt[op_excl_miss])
621 d_glob_missing += weight;
626 d_glob_miss_list += weight;
631 d_glob_valid += weight;
634 for (i = 0; i < n_variables; i++)
636 struct descriptives_proc *inf = &v_variables[i]->p.dsc;
643 if (inf->dup == case_id)
647 X = c->data[v_variables[i]->fv].f;
649 || (!opt[op_incl_miss] && is_num_user_missing (X, v_variables[i])))
655 /* These formulas taken from _SPSS Statistical Algorithms_. The
656 names W_old, and W_new are used for W_j-1 and W_j,
657 respectively, and other variables simply have the subscripts
658 trimmed off, except for X_bar.
660 I am happy that mathematical formulas may not be
663 W_new = inf->valid += weight;
664 v = (weight / W_new) * (X - inf->X_bar);
668 w2 = weight * weight;
671 inf->M4 += (-4.0 * v * inf->M3 + 6.0 * v2 * inf->M2
672 + (W_new * W_new - 3 * weight * W_old / w3) * v4 * W_old * W_new);
673 inf->M3 += (-3.0 * v * inf->M2 + W_new * W_old / w2
674 * (W_new - 2 * weight) * v3);
675 inf->M2 += W_new * W_old / weight * v2;
686 postcalc (void *aux UNUSED)
690 if (opt[op_excl_miss])
691 d_glob_miss_list = d_glob_missing;
693 for (i = 0; i < n_variables; i++)
695 struct descriptives_proc *dsc = &v_variables[i]->p.dsc;
698 /* Don't duplicate our efforts. */
705 dsc->stats[dsc_mean] = dsc->X_bar;
706 dsc->stats[dsc_variance] = dsc->M2 / (W - 1);
707 dsc->stats[dsc_stddev] = sqrt (dsc->stats[dsc_variance]);
708 dsc->stats[dsc_semean] = dsc->stats[dsc_stddev] / sqrt (W);
709 dsc->stats[dsc_min] = dsc->min == DBL_MAX ? SYSMIS : dsc->min;
710 dsc->stats[dsc_max] = dsc->max == -DBL_MAX ? SYSMIS : dsc->max;
711 dsc->stats[dsc_range] = ((dsc->min == DBL_MAX || dsc->max == -DBL_MAX)
712 ? SYSMIS : dsc->max - dsc->min);
713 dsc->stats[dsc_sum] = W * dsc->X_bar;
714 if (W > 2.0 && dsc->stats[dsc_variance] >= 1e-20)
716 double S = dsc->stats[dsc_stddev];
717 dsc->stats[dsc_skew] = (W * dsc->M3 / ((W - 1.0) * (W - 2.0) * S * S * S));
718 dsc->stats[dsc_seskew] =
719 sqrt (6.0 * W * (W - 1.0) / ((W - 2.0) * (W + 1.0) * (W + 3.0)));
723 dsc->stats[dsc_skew] = dsc->stats[dsc_seskew] = SYSMIS;
725 if (W > 3.0 && dsc->stats[dsc_variance] >= 1e-20)
727 double S2 = dsc->stats[dsc_variance];
728 double SE_g1 = dsc->stats[dsc_seskew];
730 dsc->stats[dsc_kurt] =
731 (W * (W + 1.0) * dsc->M4 - 3.0 * dsc->M2 * dsc->M2 * (W - 1.0))
732 / ((W - 1.0) * (W - 2.0) * (W - 3.0) * S2 * S2);
734 /* Note that in _SPSS Statistical Algorithms_, the square
735 root symbol is missing from this formula. */
736 dsc->stats[dsc_sekurt] =
737 sqrt ((4.0 * (W * W - 1.0) * SE_g1 * SE_g1) / ((W - 3.0) * (W + 5.0)));
741 dsc->stats[dsc_kurt] = dsc->stats[dsc_sekurt] = SYSMIS;
748 /* Statistical display. */
750 static algo_compare_func descriptives_compare_variables;
760 /* If op_excl_miss is on, d_glob_valid and (potentially)
761 d_glob_missing are nonzero, and d_glob_missing equals
764 If op_excl_miss is off, d_glob_valid is nonzero. d_glob_missing
765 is zero. d_glob_miss_list is (potentially) nonzero. */
767 if (sortby_stat != -2)
768 sort (v_variables, n_variables, sizeof *v_variables,
769 descriptives_compare_variables, &cmd);
771 for (nc = i = 0; i < dsc_n_stats; i++)
772 if (stats & BIT_INDEX (i))
775 if (!opt[op_no_varnames])
777 nc += opt[op_serial] ? 2 : 1;
779 t = tab_create (nc, n_variables + 1, 0);
780 tab_headers (t, 1, 0, 1, 0);
781 tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, nc - 1, n_variables);
782 tab_box (t, -1, -1, -1, TAL_1, 1, 0, nc - 1, n_variables);
783 tab_hline (t, TAL_2, 0, nc - 1, 1);
784 tab_vline (t, TAL_2, 1, 0, n_variables);
785 tab_dim (t, tab_natural_dimensions);
788 if (!opt[op_no_varnames])
790 tab_text (t, nc++, 0, TAB_LEFT | TAT_TITLE, _("Variable"));
794 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Valid N"));
795 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Missing N"));
797 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, "N");
800 for (i = 0; i < dsc_n_stats; i++)
801 if (stats & BIT_INDEX (i))
803 const char *title = gettext (dsc_info[i].s8);
804 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, title);
807 for (i = 0; i < n_variables; i++)
809 struct variable *v = v_variables[i];
812 if (!opt[op_no_varnames])
813 tab_text (t, nc++, i + 1, TAB_LEFT, v->name);
814 tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.valid);
816 tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.miss);
817 for (j = 0; j < dsc_n_stats; j++)
818 if (stats & BIT_INDEX (j))
819 tab_float (t, nc++, i + 1, TAB_NONE, v->p.dsc.stats[j], 10, 3);
822 tab_title (t, 1, _("Valid cases = %g; cases with missing value(s) = %g."),
823 d_glob_valid, d_glob_miss_list);
828 /* Compares variables A and B according to the ordering specified
831 descriptives_compare_variables (const void *a_, const void *b_, void *cmd_)
833 struct variable *const *ap = a_;
834 struct variable *const *bp = b_;
835 struct variable *a = *ap;
836 struct variable *b = *bp;
837 struct cmd_descriptives *cmd = cmd_;
841 if (cmd->sortby == DSC_NAME)
842 result = strcmp (a->name, b->name);
845 double as = a->p.dsc.stats[sortby_stat];
846 double bs = b->p.dsc.stats[sortby_stat];
848 result = as < bs ? -1 : as > bs;
851 if (cmd->order == DSC_D)