1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 /* FIXME: Many possible optimizations. */
27 #include "algorithm.h"
29 #include "bitvector.h"
43 +missing=miss:!variable/listwise,incl:!noinclude/include;
44 +format=labeled:!labels/nolabels,indexed:!noindex/index,lined:!line/serial;
46 +options[op_]=1,2,3,4,5,6,7,8;
47 +statistics[st_]=all,1|mean,2|semean,5|stddev,6|variance,7|kurtosis,
48 8|skewness,9|range,10|minimum,11|maximum,12|sum,
49 13|default,14|seskewness,15|sekurtosis;
50 +sort=sortby:mean/semean/stddev/variance/kurtosis/skewness/range/
51 range/minimum/maximum/sum/name/seskewness/sekurtosis/!none,
57 /* DESCRIPTIVES private data. */
59 /* Describes properties of a distribution for the purpose of
60 calculating a Z-score. */
63 struct variable *s, *d; /* Source, destination variable. */
64 double mean; /* Distribution mean. */
65 double std_dev; /* Distribution standard deviation. */
68 /* DESCRIPTIVES transformation (for calculating Z-scores). */
69 struct descriptives_trns
72 int n; /* Number of Z-scores. */
73 struct dsc_z_score *z; /* Array of Z-scores. */
76 /* These next three vars, see comment at top of display(). */
77 /* Number of cases missing listwise, even if option 5 not selected. */
78 static double d_glob_miss_list;
80 /* Number of total *cases* valid or missing, as a double. Unless
81 option 5 is selected, d_glob_missing is 0. */
82 static double d_glob_valid, d_glob_missing;
84 /* Set when a weighting variable is missing or <=0. */
85 static int bad_weight;
87 /* Number of generic zvarnames we've generated in this execution. */
90 /* Variables specified on command. */
91 static struct variable **v_variables;
92 static int n_variables;
94 /* Command specifications. */
95 static struct cmd_descriptives cmd;
97 /* Whether z-scores are computed. */
100 /* Statistic to sort by. */
101 static int sortby_stat;
103 /* Statistics to display. */
104 static unsigned long stats;
106 /* Easier access to long-named arrays. */
107 #define stat cmd.a_statistics
108 #define opt cmd.a_options
110 /* Groups of statistics. */
113 #define dsc_default \
114 (BI (dsc_mean) | BI (dsc_stddev) | BI (dsc_min) | BI (dsc_max))
117 (BI (dsc_sum) | BI (dsc_min) | BI (dsc_max) \
118 | BI (dsc_mean) | BI (dsc_semean) | BI (dsc_stddev) \
119 | BI (dsc_variance) | BI (dsc_kurt) | BI (dsc_sekurt) \
120 | BI (dsc_skew) | BI (dsc_seskew) | BI (dsc_range) \
123 /* Table of options. */
124 #define op_incl_miss DSC_OP_1 /* Honored. */
125 #define op_no_varlabs DSC_OP_2 /* Ignored. */
126 #define op_zscores DSC_OP_3 /* Honored. */
127 #define op_index DSC_OP_4 /* FIXME. */
128 #define op_excl_miss DSC_OP_5 /* Honored. */
129 #define op_serial DSC_OP_6 /* Honored. */
130 #define op_narrow DSC_OP_7 /* Ignored. */
131 #define op_no_varnames DSC_OP_8 /* Honored. */
133 /* Describes one statistic that can be calculated. */
134 /* FIXME: Currently sm,col_width are not used. */
137 int st_indx; /* Index into st_a_statistics[]. */
138 int sb_indx; /* Sort-by index. */
139 const char *s10; /* Name, stuffed into 10 columns. */
140 const char *s8; /* Name, stuffed into 8 columns. */
141 const char *sm; /* Name, stuffed minimally. */
142 const char *s; /* Full name. */
143 int max_degree; /* Highest degree necessary to calculate this
145 int col_width; /* Column width (not incl. spacing between columns) */
148 /* Table of statistics, indexed by dsc_*. */
149 static struct dsc_info dsc_info[dsc_n_stats] =
151 {DSC_ST_MEAN, DSC_MEAN, N_("Mean"), N_("Mean"), N_("Mean"), N_("mean"), 1, 10},
152 {DSC_ST_SEMEAN, DSC_SEMEAN, N_("S.E. Mean"), N_("S E Mean"), N_("SE"),
153 N_("standard error of mean"), 2, 10},
154 {DSC_ST_STDDEV, DSC_STDDEV, N_("Std Dev"), N_("Std Dev"), N_("SD"),
155 N_("standard deviation"), 2, 11},
156 {DSC_ST_VARIANCE, DSC_VARIANCE, N_("Variance"), N_("Variance"),
157 N_("Var"), N_("variance"), 2, 12},
158 {DSC_ST_KURTOSIS, DSC_KURTOSIS, N_("Kurtosis"), N_("Kurtosis"),
159 N_("Kurt"), N_("kurtosis"), 4, 9},
160 {DSC_ST_SEKURTOSIS, DSC_SEKURTOSIS, N_("S.E. Kurt"), N_("S E Kurt"), N_("SEKurt"),
161 N_("standard error of kurtosis"), 0, 9},
162 {DSC_ST_SKEWNESS, DSC_SKEWNESS, N_("Skewness"), N_("Skewness"), N_("Skew"),
163 N_("skewness"), 3, 9},
164 {DSC_ST_SESKEWNESS, DSC_SESKEWNESS, N_("S.E. Skew"), N_("S E Skew"), N_("SESkew"),
165 N_("standard error of skewness"), 0, 9},
166 {DSC_ST_RANGE, DSC_RANGE, N_("Range"), N_("Range"), N_("Rng"), N_("range"), 0, 10},
167 {DSC_ST_MINIMUM, DSC_MINIMUM, N_("Minimum"), N_("Minimum"), N_("Min"),
168 N_("minimum"), 0, 10},
169 {DSC_ST_MAXIMUM, DSC_MAXIMUM, N_("Maximum"), N_("Maximum"), N_("Max"),
170 N_("maximum"), 0, 10},
171 {DSC_ST_SUM, DSC_SUM, N_("Sum"), N_("Sum"), N_("Sum"), N_("sum"), 1, 13},
174 /* Z-score functions. */
175 static int generate_z_varname (struct variable * v);
176 static void dump_z_table (void);
177 static void run_z_pass (void);
179 /* Procedure execution functions. */
180 static int calc (struct ccase *, void *);
181 static void precalc (void *);
182 static void postcalc (void *);
183 static void display (void);
185 /* Parser and outline. */
188 cmd_descriptives (void)
196 if (!parse_descriptives (&cmd))
199 if (n_variables == 0)
201 for (i = 0; i < n_variables; i++)
205 v->p.dsc.zname[0] = 0;
210 msg (SE, _("No variables specified."));
214 if (cmd.sbc_options && (cmd.sbc_save || cmd.sbc_format || cmd.sbc_missing))
216 msg (SE, _("OPTIONS may not be used with SAVE, FORMAT, or MISSING."));
220 if (!cmd.sbc_options)
222 if (cmd.incl == DSC_INCLUDE)
223 opt[op_incl_miss] = 1;
224 if (cmd.labeled == DSC_NOLABELS)
225 opt[op_no_varlabs] = 1;
228 if (cmd.miss == DSC_LISTWISE)
229 opt[op_excl_miss] = 1;
230 if (cmd.lined == DSC_SERIAL)
234 /* Construct z-score varnames, show translation table. */
238 for (i = 0; i < n_variables; i++)
244 if (v->p.dsc.zname[0] == 0)
245 if (!generate_z_varname (v))
252 /* Figure out statistics to calculate. */
254 if (stat[DSC_ST_DEFAULT] || !cmd.sbc_statistics)
255 stats |= dsc_default;
256 if (stat[DSC_ST_ALL])
258 for (i = 0; i < dsc_n_stats; i++)
259 if (stat[dsc_info[i].st_indx])
260 stats |= BIT_INDEX (i);
261 if (stats & dsc_kurt)
263 if (stats & dsc_skew)
266 /* Check the sort order. */
268 if (cmd.sortby == DSC_NONE)
270 else if (cmd.sortby != DSC_NAME)
272 for (i = 0; i < n_variables; i++)
273 if (dsc_info[i].sb_indx == cmd.sortby)
276 if (!(stats & BIT_INDEX (i)))
278 msg (SE, _("It's not possible to sort on `%s' without "
280 gettext (dsc_info[i].s), gettext (dsc_info[i].s));
284 assert (sortby_stat != -1);
289 procedure_with_splits (precalc, calc, postcalc, NULL);
292 msg (SW, _("At least one case in the data file had a weight value "
293 "that was system-missing, zero, or negative. These case(s) "
310 /* Parses the VARIABLES subcommand. */
312 dsc_custom_variables (struct cmd_descriptives *cmd UNUSED)
314 if (!lex_match_id ("VARIABLES")
315 && (token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
320 while (token == T_ID || token == T_ALL)
325 if (!parse_variables (default_dict, &v_variables, &n_variables,
326 PV_DUPLICATE | PV_SINGLE | PV_APPEND | PV_NUMERIC
331 if (n_variables - n > 1)
333 msg (SE, _("Names for z-score variables must be given for "
334 "individual variables, not for groups of "
338 assert (n_variables - n <= 0);
341 msg (SE, _("Name for z-score variable expected."));
344 if (dict_lookup_var (default_dict, tokid))
346 msg (SE, _("Z-score variable name `%s' is a "
347 "duplicate variable name with a current variable."),
351 for (i = 0; i < n_variables; i++)
352 if (v_variables[i]->p.dsc.zname[0]
353 && !strcmp (v_variables[i]->p.dsc.zname, tokid))
355 msg (SE, _("Z-score variable name `%s' is "
356 "used multiple times."), tokid);
359 strcpy (v_variables[n_variables - 1]->p.dsc.zname, tokid);
363 msg (SE, _("`)' expected after z-score variable name."));
376 /* Returns 0 if NAME is a duplicate of any existing variable name or
377 of any previously-declared z-var name; otherwise returns 1. */
379 try_name (char *name)
383 if (dict_lookup_var (default_dict, name) != NULL)
385 for (i = 0; i < n_variables; i++)
387 struct variable *v = v_variables[i];
388 if (!strcmp (v->p.dsc.zname, name))
395 generate_z_varname (struct variable * v)
399 strcpy (&zname[1], v->name);
402 if (try_name (zname))
404 strcpy (v->p.dsc.zname, zname);
410 /* Generate variable name. */
414 sprintf (zname, "ZSC%03d", z_count);
415 else if (z_count <= 108)
416 sprintf (zname, "STDZ%02d", z_count - 99);
417 else if (z_count <= 117)
418 sprintf (zname, "ZZZZ%02d", z_count - 108);
419 else if (z_count <= 126)
420 sprintf (zname, "ZQZQ%02d", z_count - 117);
423 msg (SE, _("Ran out of generic names for Z-score variables. "
424 "There are only 126 generic names: ZSC001-ZSC0999, "
425 "STDZ01-STDZ09, ZZZZ01-ZZZZ09, ZQZQ01-ZQZQ09."));
429 if (try_name (zname))
431 strcpy (v->p.dsc.zname, zname);
446 for (count = i = 0; i < n_variables; i++)
447 if (v_variables[i]->p.dsc.zname)
451 t = tab_create (2, count + 1, 0);
452 tab_title (t, 0, _("Mapping of variables to corresponding Z-scores."));
453 tab_columns (t, SOM_COL_DOWN, 1);
454 tab_headers (t, 0, 0, 1, 0);
455 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, count);
456 tab_hline (t, TAL_2, 0, 1, 1);
457 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Source"));
458 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Target"));
459 tab_dim (t, tab_natural_dimensions);
464 for (i = 0, y = 1; i < n_variables; i++)
465 if (v_variables[i]->p.dsc.zname)
467 tab_text (t, 0, y, TAB_LEFT, v_variables[i]->name);
468 tab_text (t, 1, y++, TAB_LEFT, v_variables[i]->p.dsc.zname);
475 /* Transformation function to calculate Z-scores. */
477 descriptives_trns_proc (struct trns_header * trns, struct ccase * c,
480 struct descriptives_trns *t = (struct descriptives_trns *) trns;
481 struct dsc_z_score *z;
484 for (i = t->n, z = t->z; i--; z++)
486 double score = c->data[z->s->fv].f;
488 if (z->mean == SYSMIS || score == SYSMIS)
489 c->data[z->d->fv].f = SYSMIS;
491 c->data[z->d->fv].f = (score - z->mean) / z->std_dev;
496 /* Frees a descriptives_trns struct. */
498 descriptives_trns_free (struct trns_header * trns)
500 struct descriptives_trns *t = (struct descriptives_trns *) trns;
505 /* The name is a misnomer: actually this function sets up a
506 transformation by which scores can be converted into Z-scores. */
510 struct descriptives_trns *t;
513 for (i = 0; i < n_variables; i++)
514 v_variables[i]->p.dsc.dup = 0;
515 for (count = i = 0; i < n_variables; i++)
517 if (v_variables[i]->p.dsc.dup++)
519 if (v_variables[i]->p.dsc.zname)
523 t = xmalloc (sizeof *t);
524 t->h.proc = descriptives_trns_proc;
525 t->h.free = descriptives_trns_free;
527 t->z = xmalloc (count * sizeof *t->z);
529 for (i = 0; i < n_variables; i++)
530 v_variables[i]->p.dsc.dup = 0;
531 for (count = i = 0; i < n_variables; i++)
533 struct variable *v = v_variables[i];
534 if (v->p.dsc.dup++ == 0 && v->p.dsc.zname[0])
540 t->z[count].d = d = dict_create_var_assert (default_dict,
545 d->label = xmalloc (strlen (v->label) + 12);
546 cp = stpcpy (d->label, _("Z-score of "));
547 strcpy (cp, v->label);
551 d->label = xmalloc (strlen (v->name) + 12);
552 cp = stpcpy (d->label, _("Z-score of "));
553 strcpy (cp, v->name);
555 t->z[count].mean = v->p.dsc.stats[dsc_mean];
556 t->z[count].std_dev = v->p.dsc.stats[dsc_stddev];
557 if (t->z[count].std_dev == SYSMIS || t->z[count].std_dev == 0.0)
558 t->z[count].mean = SYSMIS;
563 add_transformation ((struct trns_header *) t);
566 /* Statistical calculation. */
569 precalc (void *aux UNUSED)
573 for (i = 0; i < n_variables; i++)
574 v_variables[i]->p.dsc.dup = -2;
575 for (i = 0; i < n_variables; i++)
577 struct descriptives_proc *dsc = &v_variables[i]->p.dsc;
579 /* Don't need to initialize more than once. */
584 dsc->valid = dsc->miss = 0.0;
585 dsc->X_bar = dsc->M2 = dsc->M3 = dsc->M4 = 0.0;
589 d_glob_valid = d_glob_missing = 0.0;
593 calc (struct ccase * c, void *aux UNUSED)
597 /* Unique case identifier. */
600 /* Get the weight for this case. */
601 double weight = dict_get_case_weight (default_dict, c);
610 /* Handle missing values. */
611 for (i = 0; i < n_variables; i++)
613 struct variable *v = v_variables[i];
614 double X = c->data[v->fv].f;
616 if (X == SYSMIS || (!opt[op_incl_miss] && is_num_user_missing (X, v)))
618 if (opt[op_excl_miss])
620 d_glob_missing += weight;
625 d_glob_miss_list += weight;
630 d_glob_valid += weight;
633 for (i = 0; i < n_variables; i++)
635 struct descriptives_proc *inf = &v_variables[i]->p.dsc;
642 if (inf->dup == case_id)
646 X = c->data[v_variables[i]->fv].f;
648 || (!opt[op_incl_miss] && is_num_user_missing (X, v_variables[i])))
654 /* These formulas taken from _SPSS Statistical Algorithms_. The
655 names W_old, and W_new are used for W_j-1 and W_j,
656 respectively, and other variables simply have the subscripts
657 trimmed off, except for X_bar.
659 I am happy that mathematical formulas may not be
662 W_new = inf->valid += weight;
663 v = (weight / W_new) * (X - inf->X_bar);
667 w2 = weight * weight;
670 inf->M4 += (-4.0 * v * inf->M3 + 6.0 * v2 * inf->M2
671 + (W_new * W_new - 3 * weight * W_old / w3) * v4 * W_old * W_new);
672 inf->M3 += (-3.0 * v * inf->M2 + W_new * W_old / w2
673 * (W_new - 2 * weight) * v3);
674 inf->M2 += W_new * W_old / weight * v2;
685 postcalc (void *aux UNUSED)
689 if (opt[op_excl_miss])
690 d_glob_miss_list = d_glob_missing;
692 for (i = 0; i < n_variables; i++)
694 struct descriptives_proc *dsc = &v_variables[i]->p.dsc;
697 /* Don't duplicate our efforts. */
704 dsc->stats[dsc_mean] = dsc->X_bar;
705 dsc->stats[dsc_variance] = dsc->M2 / (W - 1);
706 dsc->stats[dsc_stddev] = sqrt (dsc->stats[dsc_variance]);
707 dsc->stats[dsc_semean] = dsc->stats[dsc_stddev] / sqrt (W);
708 dsc->stats[dsc_min] = dsc->min == DBL_MAX ? SYSMIS : dsc->min;
709 dsc->stats[dsc_max] = dsc->max == -DBL_MAX ? SYSMIS : dsc->max;
710 dsc->stats[dsc_range] = ((dsc->min == DBL_MAX || dsc->max == -DBL_MAX)
711 ? SYSMIS : dsc->max - dsc->min);
712 dsc->stats[dsc_sum] = W * dsc->X_bar;
713 if (W > 2.0 && dsc->stats[dsc_variance] >= 1e-20)
715 double S = dsc->stats[dsc_stddev];
716 dsc->stats[dsc_skew] = (W * dsc->M3 / ((W - 1.0) * (W - 2.0) * S * S * S));
717 dsc->stats[dsc_seskew] =
718 sqrt (6.0 * W * (W - 1.0) / ((W - 2.0) * (W + 1.0) * (W + 3.0)));
722 dsc->stats[dsc_skew] = dsc->stats[dsc_seskew] = SYSMIS;
724 if (W > 3.0 && dsc->stats[dsc_variance] >= 1e-20)
726 double S2 = dsc->stats[dsc_variance];
727 double SE_g1 = dsc->stats[dsc_seskew];
729 dsc->stats[dsc_kurt] =
730 (W * (W + 1.0) * dsc->M4 - 3.0 * dsc->M2 * dsc->M2 * (W - 1.0))
731 / ((W - 1.0) * (W - 2.0) * (W - 3.0) * S2 * S2);
733 /* Note that in _SPSS Statistical Algorithms_, the square
734 root symbol is missing from this formula. */
735 dsc->stats[dsc_sekurt] =
736 sqrt ((4.0 * (W * W - 1.0) * SE_g1 * SE_g1) / ((W - 3.0) * (W + 5.0)));
740 dsc->stats[dsc_kurt] = dsc->stats[dsc_sekurt] = SYSMIS;
747 /* Statistical display. */
749 static algo_compare_func descriptives_compare_variables;
759 /* If op_excl_miss is on, d_glob_valid and (potentially)
760 d_glob_missing are nonzero, and d_glob_missing equals
763 If op_excl_miss is off, d_glob_valid is nonzero. d_glob_missing
764 is zero. d_glob_miss_list is (potentially) nonzero. */
766 if (sortby_stat != -2)
767 sort (v_variables, n_variables, sizeof *v_variables,
768 descriptives_compare_variables, &cmd);
770 for (nc = i = 0; i < dsc_n_stats; i++)
771 if (stats & BIT_INDEX (i))
774 if (!opt[op_no_varnames])
776 nc += opt[op_serial] ? 2 : 1;
778 t = tab_create (nc, n_variables + 1, 0);
779 tab_headers (t, 1, 0, 1, 0);
780 tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, nc - 1, n_variables);
781 tab_box (t, -1, -1, -1, TAL_1, 1, 0, nc - 1, n_variables);
782 tab_hline (t, TAL_2, 0, nc - 1, 1);
783 tab_vline (t, TAL_2, 1, 0, n_variables);
784 tab_dim (t, tab_natural_dimensions);
787 if (!opt[op_no_varnames])
789 tab_text (t, nc++, 0, TAB_LEFT | TAT_TITLE, _("Variable"));
793 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Valid N"));
794 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Missing N"));
796 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, "N");
799 for (i = 0; i < dsc_n_stats; i++)
800 if (stats & BIT_INDEX (i))
802 const char *title = gettext (dsc_info[i].s8);
803 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, title);
806 for (i = 0; i < n_variables; i++)
808 struct variable *v = v_variables[i];
811 if (!opt[op_no_varnames])
812 tab_text (t, nc++, i + 1, TAB_LEFT, v->name);
813 tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.valid);
815 tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.miss);
816 for (j = 0; j < dsc_n_stats; j++)
817 if (stats & BIT_INDEX (j))
818 tab_float (t, nc++, i + 1, TAB_NONE, v->p.dsc.stats[j], 10, 3);
821 tab_title (t, 1, _("Valid cases = %g; cases with missing value(s) = %g."),
822 d_glob_valid, d_glob_miss_list);
827 /* Compares variables A and B according to the ordering specified
830 descriptives_compare_variables (const void *a_, const void *b_, void *cmd_)
832 struct variable *const *ap = a_;
833 struct variable *const *bp = b_;
834 struct variable *a = *ap;
835 struct variable *b = *bp;
836 struct cmd_descriptives *cmd = cmd_;
840 if (cmd->sortby == DSC_NAME)
841 result = strcmp (a->name, b->name);
844 double as = a->p.dsc.stats[sortby_stat];
845 double bs = b->p.dsc.stats[sortby_stat];
847 result = as < bs ? -1 : as > bs;
850 if (cmd->order == DSC_D)