1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 /* FIXME: Many possible optimizations. */
27 #include "algorithm.h"
29 #include "bitvector.h"
43 +missing=miss:!variable/listwise,incl:!noinclude/include;
44 +format=labeled:!labels/nolabels,indexed:!noindex/index,lined:!line/serial;
46 +options[op_]=1,2,3,4,5,6,7,8;
47 +statistics[st_]=all,1|mean,2|semean,5|stddev,6|variance,7|kurtosis,
48 8|skewness,9|range,10|minimum,11|maximum,12|sum,
49 13|default,14|seskewness,15|sekurtosis;
50 +sort=sortby:mean/semean/stddev/variance/kurtosis/skewness/range/
51 range/minimum/maximum/sum/name/seskewness/sekurtosis/!none,
57 /* DESCRIPTIVES private data. */
59 /* Describes properties of a distribution for the purpose of
60 calculating a Z-score. */
63 struct variable *s, *d; /* Source, destination variable. */
64 double mean; /* Distribution mean. */
65 double std_dev; /* Distribution standard deviation. */
68 /* DESCRIPTIVES transformation (for calculating Z-scores). */
69 struct descriptives_trns
72 int n; /* Number of Z-scores. */
73 struct dsc_z_score *z; /* Array of Z-scores. */
76 /* These next three vars, see comment at top of display(). */
77 /* Number of cases missing listwise, even if option 5 not selected. */
78 static double d_glob_miss_list;
80 /* Number of total *cases* valid or missing, as a double. Unless
81 option 5 is selected, d_glob_missing is 0. */
82 static double d_glob_valid, d_glob_missing;
84 /* Set when a weighting variable is missing or <=0. */
85 static int bad_weight;
87 /* Number of generic zvarnames we've generated in this execution. */
90 /* Variables specified on command. */
91 static struct variable **v_variables;
92 static int n_variables;
94 /* Command specifications. */
95 static struct cmd_descriptives cmd;
97 /* Whether z-scores are computed. */
100 /* Statistic to sort by. */
101 static int sortby_stat;
103 /* Statistics to display. */
104 static unsigned long stats;
106 /* Easier access to long-named arrays. */
107 #define stat cmd.a_statistics
108 #define opt cmd.a_options
110 /* Groups of statistics. */
113 #define dsc_default \
114 (BI (dsc_mean) | BI (dsc_stddev) | BI (dsc_min) | BI (dsc_max))
117 (BI (dsc_sum) | BI (dsc_min) | BI (dsc_max) \
118 | BI (dsc_mean) | BI (dsc_semean) | BI (dsc_stddev) \
119 | BI (dsc_variance) | BI (dsc_kurt) | BI (dsc_sekurt) \
120 | BI (dsc_skew) | BI (dsc_seskew) | BI (dsc_range) \
123 /* Table of options. */
124 #define op_incl_miss DSC_OP_1 /* Honored. */
125 #define op_no_varlabs DSC_OP_2 /* Ignored. */
126 #define op_zscores DSC_OP_3 /* Honored. */
127 #define op_index DSC_OP_4 /* FIXME. */
128 #define op_excl_miss DSC_OP_5 /* Honored. */
129 #define op_serial DSC_OP_6 /* Honored. */
130 #define op_narrow DSC_OP_7 /* Ignored. */
131 #define op_no_varnames DSC_OP_8 /* Honored. */
133 /* Describes one statistic that can be calculated. */
134 /* FIXME: Currently sm,col_width are not used. */
137 int st_indx; /* Index into st_a_statistics[]. */
138 int sb_indx; /* Sort-by index. */
139 const char *s10; /* Name, stuffed into 10 columns. */
140 const char *s8; /* Name, stuffed into 8 columns. */
141 const char *sm; /* Name, stuffed minimally. */
142 const char *s; /* Full name. */
143 int max_degree; /* Highest degree necessary to calculate this
145 int col_width; /* Column width (not incl. spacing between columns) */
148 /* Table of statistics, indexed by dsc_*. */
149 static struct dsc_info dsc_info[dsc_n_stats] =
151 {DSC_ST_MEAN, DSC_MEAN, N_("Mean"), N_("Mean"), N_("Mean"), N_("mean"), 1, 10},
152 {DSC_ST_SEMEAN, DSC_SEMEAN, N_("S.E. Mean"), N_("S E Mean"), N_("SE"),
153 N_("standard error of mean"), 2, 10},
154 {DSC_ST_STDDEV, DSC_STDDEV, N_("Std Dev"), N_("Std Dev"), N_("SD"),
155 N_("standard deviation"), 2, 11},
156 {DSC_ST_VARIANCE, DSC_VARIANCE, N_("Variance"), N_("Variance"),
157 N_("Var"), N_("variance"), 2, 12},
158 {DSC_ST_KURTOSIS, DSC_KURTOSIS, N_("Kurtosis"), N_("Kurtosis"),
159 N_("Kurt"), N_("kurtosis"), 4, 9},
160 {DSC_ST_SEKURTOSIS, DSC_SEKURTOSIS, N_("S.E. Kurt"), N_("S E Kurt"), N_("SEKurt"),
161 N_("standard error of kurtosis"), 0, 9},
162 {DSC_ST_SKEWNESS, DSC_SKEWNESS, N_("Skewness"), N_("Skewness"), N_("Skew"),
163 N_("skewness"), 3, 9},
164 {DSC_ST_SESKEWNESS, DSC_SESKEWNESS, N_("S.E. Skew"), N_("S E Skew"), N_("SESkew"),
165 N_("standard error of skewness"), 0, 9},
166 {DSC_ST_RANGE, DSC_RANGE, N_("Range"), N_("Range"), N_("Rng"), N_("range"), 0, 10},
167 {DSC_ST_MINIMUM, DSC_MINIMUM, N_("Minimum"), N_("Minimum"), N_("Min"),
168 N_("minimum"), 0, 10},
169 {DSC_ST_MAXIMUM, DSC_MAXIMUM, N_("Maximum"), N_("Maximum"), N_("Max"),
170 N_("maximum"), 0, 10},
171 {DSC_ST_SUM, DSC_SUM, N_("Sum"), N_("Sum"), N_("Sum"), N_("sum"), 1, 13},
174 /* Z-score functions. */
175 static int generate_z_varname (struct variable * v);
176 static void dump_z_table (void);
177 static void run_z_pass (void);
179 /* Procedure execution functions. */
180 static int calc (struct ccase *, void *);
181 static void precalc (void *);
182 static void postcalc (void *);
183 static void display (void);
185 /* Parser and outline. */
188 cmd_descriptives (void)
196 lex_match_id ("DESCRIPTIVES");
197 lex_match_id ("CONDESCRIPTIVES");
198 if (!parse_descriptives (&cmd))
201 if (n_variables == 0)
203 for (i = 0; i < n_variables; i++)
207 v->p.dsc.zname[0] = 0;
212 msg (SE, _("No variables specified."));
216 if (cmd.sbc_options && (cmd.sbc_save || cmd.sbc_format || cmd.sbc_missing))
218 msg (SE, _("OPTIONS may not be used with SAVE, FORMAT, or MISSING."));
222 if (!cmd.sbc_options)
224 if (cmd.incl == DSC_INCLUDE)
225 opt[op_incl_miss] = 1;
226 if (cmd.labeled == DSC_NOLABELS)
227 opt[op_no_varlabs] = 1;
230 if (cmd.miss == DSC_LISTWISE)
231 opt[op_excl_miss] = 1;
232 if (cmd.lined == DSC_SERIAL)
236 /* Construct z-score varnames, show translation table. */
240 for (i = 0; i < n_variables; i++)
246 if (v->p.dsc.zname[0] == 0)
247 if (!generate_z_varname (v))
254 /* Figure out statistics to calculate. */
256 if (stat[DSC_ST_DEFAULT] || !cmd.sbc_statistics)
257 stats |= dsc_default;
258 if (stat[DSC_ST_ALL])
260 for (i = 0; i < dsc_n_stats; i++)
261 if (stat[dsc_info[i].st_indx])
262 stats |= BIT_INDEX (i);
263 if (stats & dsc_kurt)
265 if (stats & dsc_skew)
268 /* Check the sort order. */
270 if (cmd.sortby == DSC_NONE)
272 else if (cmd.sortby != DSC_NAME)
274 for (i = 0; i < n_variables; i++)
275 if (dsc_info[i].sb_indx == cmd.sortby)
278 if (!(stats & BIT_INDEX (i)))
280 msg (SE, _("It's not possible to sort on `%s' without "
282 gettext (dsc_info[i].s), gettext (dsc_info[i].s));
286 assert (sortby_stat != -1);
291 procedure (precalc, calc, postcalc, NULL);
294 msg (SW, _("At least one case in the data file had a weight value "
295 "that was system-missing, zero, or negative. These case(s) "
312 /* Parses the VARIABLES subcommand. */
314 dsc_custom_variables (struct cmd_descriptives *cmd UNUSED)
316 if (!lex_match_id ("VARIABLES")
317 && (token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
322 while (token == T_ID || token == T_ALL)
327 if (!parse_variables (default_dict, &v_variables, &n_variables,
328 PV_DUPLICATE | PV_SINGLE | PV_APPEND | PV_NUMERIC
333 if (n_variables - n > 1)
335 msg (SE, _("Names for z-score variables must be given for "
336 "individual variables, not for groups of "
340 assert (n_variables - n <= 0);
343 msg (SE, _("Name for z-score variable expected."));
346 if (dict_lookup_var (default_dict, tokid))
348 msg (SE, _("Z-score variable name `%s' is a "
349 "duplicate variable name with a current variable."),
353 for (i = 0; i < n_variables; i++)
354 if (v_variables[i]->p.dsc.zname[0]
355 && !strcmp (v_variables[i]->p.dsc.zname, tokid))
357 msg (SE, _("Z-score variable name `%s' is "
358 "used multiple times."), tokid);
361 strcpy (v_variables[n_variables - 1]->p.dsc.zname, tokid);
365 msg (SE, _("`)' expected after z-score variable name."));
378 /* Returns 0 if NAME is a duplicate of any existing variable name or
379 of any previously-declared z-var name; otherwise returns 1. */
381 try_name (char *name)
385 if (dict_lookup_var (default_dict, name) != NULL)
387 for (i = 0; i < n_variables; i++)
389 struct variable *v = v_variables[i];
390 if (!strcmp (v->p.dsc.zname, name))
397 generate_z_varname (struct variable * v)
401 strcpy (&zname[1], v->name);
404 if (try_name (zname))
406 strcpy (v->p.dsc.zname, zname);
412 /* Generate variable name. */
416 sprintf (zname, "ZSC%03d", z_count);
417 else if (z_count <= 108)
418 sprintf (zname, "STDZ%02d", z_count - 99);
419 else if (z_count <= 117)
420 sprintf (zname, "ZZZZ%02d", z_count - 108);
421 else if (z_count <= 126)
422 sprintf (zname, "ZQZQ%02d", z_count - 117);
425 msg (SE, _("Ran out of generic names for Z-score variables. "
426 "There are only 126 generic names: ZSC001-ZSC0999, "
427 "STDZ01-STDZ09, ZZZZ01-ZZZZ09, ZQZQ01-ZQZQ09."));
431 if (try_name (zname))
433 strcpy (v->p.dsc.zname, zname);
448 for (count = i = 0; i < n_variables; i++)
449 if (v_variables[i]->p.dsc.zname)
453 t = tab_create (2, count + 1, 0);
454 tab_title (t, 0, _("Mapping of variables to corresponding Z-scores."));
455 tab_columns (t, SOM_COL_DOWN, 1);
456 tab_headers (t, 0, 0, 1, 0);
457 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, count);
458 tab_hline (t, TAL_2, 0, 1, 1);
459 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Source"));
460 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Target"));
461 tab_dim (t, tab_natural_dimensions);
466 for (i = 0, y = 1; i < n_variables; i++)
467 if (v_variables[i]->p.dsc.zname)
469 tab_text (t, 0, y, TAB_LEFT, v_variables[i]->name);
470 tab_text (t, 1, y++, TAB_LEFT, v_variables[i]->p.dsc.zname);
477 /* Transformation function to calculate Z-scores. */
479 descriptives_trns_proc (struct trns_header * trns, struct ccase * c,
482 struct descriptives_trns *t = (struct descriptives_trns *) trns;
483 struct dsc_z_score *z;
486 for (i = t->n, z = t->z; i--; z++)
488 double score = c->data[z->s->fv].f;
490 if (z->mean == SYSMIS || score == SYSMIS)
491 c->data[z->d->fv].f = SYSMIS;
493 c->data[z->d->fv].f = (score - z->mean) / z->std_dev;
498 /* Frees a descriptives_trns struct. */
500 descriptives_trns_free (struct trns_header * trns)
502 struct descriptives_trns *t = (struct descriptives_trns *) trns;
507 /* The name is a misnomer: actually this function sets up a
508 transformation by which scores can be converted into Z-scores. */
512 struct descriptives_trns *t;
515 for (i = 0; i < n_variables; i++)
516 v_variables[i]->p.dsc.dup = 0;
517 for (count = i = 0; i < n_variables; i++)
519 if (v_variables[i]->p.dsc.dup++)
521 if (v_variables[i]->p.dsc.zname)
525 t = xmalloc (sizeof *t);
526 t->h.proc = descriptives_trns_proc;
527 t->h.free = descriptives_trns_free;
529 t->z = xmalloc (count * sizeof *t->z);
531 for (i = 0; i < n_variables; i++)
532 v_variables[i]->p.dsc.dup = 0;
533 for (count = i = 0; i < n_variables; i++)
535 struct variable *v = v_variables[i];
536 if (v->p.dsc.dup++ == 0 && v->p.dsc.zname[0])
542 t->z[count].d = d = dict_create_var_assert (default_dict,
547 d->label = xmalloc (strlen (v->label) + 12);
548 cp = stpcpy (d->label, _("Z-score of "));
549 strcpy (cp, v->label);
553 d->label = xmalloc (strlen (v->name) + 12);
554 cp = stpcpy (d->label, _("Z-score of "));
555 strcpy (cp, v->name);
557 t->z[count].mean = v->p.dsc.stats[dsc_mean];
558 t->z[count].std_dev = v->p.dsc.stats[dsc_stddev];
559 if (t->z[count].std_dev == SYSMIS || t->z[count].std_dev == 0.0)
560 t->z[count].mean = SYSMIS;
565 add_transformation ((struct trns_header *) t);
568 /* Statistical calculation. */
571 precalc (void *aux UNUSED)
575 for (i = 0; i < n_variables; i++)
576 v_variables[i]->p.dsc.dup = -2;
577 for (i = 0; i < n_variables; i++)
579 struct descriptives_proc *dsc = &v_variables[i]->p.dsc;
581 /* Don't need to initialize more than once. */
586 dsc->valid = dsc->miss = 0.0;
587 dsc->X_bar = dsc->M2 = dsc->M3 = dsc->M4 = 0.0;
591 d_glob_valid = d_glob_missing = 0.0;
595 calc (struct ccase * c, void *aux UNUSED)
599 /* Unique case identifier. */
602 /* Get the weight for this case. */
603 double weight = dict_get_case_weight (default_dict, c);
612 /* Handle missing values. */
613 for (i = 0; i < n_variables; i++)
615 struct variable *v = v_variables[i];
616 double X = c->data[v->fv].f;
618 if (X == SYSMIS || (!opt[op_incl_miss] && is_num_user_missing (X, v)))
620 if (opt[op_excl_miss])
622 d_glob_missing += weight;
627 d_glob_miss_list += weight;
632 d_glob_valid += weight;
635 for (i = 0; i < n_variables; i++)
637 struct descriptives_proc *inf = &v_variables[i]->p.dsc;
644 if (inf->dup == case_id)
648 X = c->data[v_variables[i]->fv].f;
650 || (!opt[op_incl_miss] && is_num_user_missing (X, v_variables[i])))
656 /* These formulas taken from _SPSS Statistical Algorithms_. The
657 names W_old, and W_new are used for W_j-1 and W_j,
658 respectively, and other variables simply have the subscripts
659 trimmed off, except for X_bar.
661 I am happy that mathematical formulas may not be
664 W_new = inf->valid += weight;
665 v = (weight / W_new) * (X - inf->X_bar);
669 w2 = weight * weight;
672 inf->M4 += (-4.0 * v * inf->M3 + 6.0 * v2 * inf->M2
673 + (W_new * W_new - 3 * weight * W_old / w3) * v4 * W_old * W_new);
674 inf->M3 += (-3.0 * v * inf->M2 + W_new * W_old / w2
675 * (W_new - 2 * weight) * v3);
676 inf->M2 += W_new * W_old / weight * v2;
687 postcalc (void *aux UNUSED)
691 if (opt[op_excl_miss])
692 d_glob_miss_list = d_glob_missing;
694 for (i = 0; i < n_variables; i++)
696 struct descriptives_proc *dsc = &v_variables[i]->p.dsc;
699 /* Don't duplicate our efforts. */
706 dsc->stats[dsc_mean] = dsc->X_bar;
707 dsc->stats[dsc_variance] = dsc->M2 / (W - 1);
708 dsc->stats[dsc_stddev] = sqrt (dsc->stats[dsc_variance]);
709 dsc->stats[dsc_semean] = dsc->stats[dsc_stddev] / sqrt (W);
710 dsc->stats[dsc_min] = dsc->min == DBL_MAX ? SYSMIS : dsc->min;
711 dsc->stats[dsc_max] = dsc->max == -DBL_MAX ? SYSMIS : dsc->max;
712 dsc->stats[dsc_range] = ((dsc->min == DBL_MAX || dsc->max == -DBL_MAX)
713 ? SYSMIS : dsc->max - dsc->min);
714 dsc->stats[dsc_sum] = W * dsc->X_bar;
715 if (W > 2.0 && dsc->stats[dsc_variance] >= 1e-20)
717 double S = dsc->stats[dsc_stddev];
718 dsc->stats[dsc_skew] = (W * dsc->M3 / ((W - 1.0) * (W - 2.0) * S * S * S));
719 dsc->stats[dsc_seskew] =
720 sqrt (6.0 * W * (W - 1.0) / ((W - 2.0) * (W + 1.0) * (W + 3.0)));
724 dsc->stats[dsc_skew] = dsc->stats[dsc_seskew] = SYSMIS;
726 if (W > 3.0 && dsc->stats[dsc_variance] >= 1e-20)
728 double S2 = dsc->stats[dsc_variance];
729 double SE_g1 = dsc->stats[dsc_seskew];
731 dsc->stats[dsc_kurt] =
732 (W * (W + 1.0) * dsc->M4 - 3.0 * dsc->M2 * dsc->M2 * (W - 1.0))
733 / ((W - 1.0) * (W - 2.0) * (W - 3.0) * S2 * S2);
735 /* Note that in _SPSS Statistical Algorithms_, the square
736 root symbol is missing from this formula. */
737 dsc->stats[dsc_sekurt] =
738 sqrt ((4.0 * (W * W - 1.0) * SE_g1 * SE_g1) / ((W - 3.0) * (W + 5.0)));
742 dsc->stats[dsc_kurt] = dsc->stats[dsc_sekurt] = SYSMIS;
749 /* Statistical display. */
751 static algo_compare_func descriptives_compare_variables;
761 /* If op_excl_miss is on, d_glob_valid and (potentially)
762 d_glob_missing are nonzero, and d_glob_missing equals
765 If op_excl_miss is off, d_glob_valid is nonzero. d_glob_missing
766 is zero. d_glob_miss_list is (potentially) nonzero. */
768 if (sortby_stat != -2)
769 sort (v_variables, n_variables, sizeof *v_variables,
770 descriptives_compare_variables, &cmd);
772 for (nc = i = 0; i < dsc_n_stats; i++)
773 if (stats & BIT_INDEX (i))
776 if (!opt[op_no_varnames])
778 nc += opt[op_serial] ? 2 : 1;
780 t = tab_create (nc, n_variables + 1, 0);
781 tab_headers (t, 1, 0, 1, 0);
782 tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, nc - 1, n_variables);
783 tab_box (t, -1, -1, -1, TAL_1, 1, 0, nc - 1, n_variables);
784 tab_hline (t, TAL_2, 0, nc - 1, 1);
785 tab_vline (t, TAL_2, 1, 0, n_variables);
786 tab_dim (t, tab_natural_dimensions);
789 if (!opt[op_no_varnames])
791 tab_text (t, nc++, 0, TAB_LEFT | TAT_TITLE, _("Variable"));
795 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Valid N"));
796 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Missing N"));
798 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, "N");
801 for (i = 0; i < dsc_n_stats; i++)
802 if (stats & BIT_INDEX (i))
804 const char *title = gettext (dsc_info[i].s8);
805 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, title);
808 for (i = 0; i < n_variables; i++)
810 struct variable *v = v_variables[i];
813 if (!opt[op_no_varnames])
814 tab_text (t, nc++, i + 1, TAB_LEFT, v->name);
815 tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.valid);
817 tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.miss);
818 for (j = 0; j < dsc_n_stats; j++)
819 if (stats & BIT_INDEX (j))
820 tab_float (t, nc++, i + 1, TAB_NONE, v->p.dsc.stats[j], 10, 3);
823 tab_title (t, 1, _("Valid cases = %g; cases with missing value(s) = %g."),
824 d_glob_valid, d_glob_miss_list);
829 /* Compares variables A and B according to the ordering specified
832 descriptives_compare_variables (const void *a_, const void *b_, void *cmd_)
834 struct variable *const *ap = a_;
835 struct variable *const *bp = b_;
836 struct variable *a = *ap;
837 struct variable *b = *bp;
838 struct cmd_descriptives *cmd = cmd_;
842 if (cmd->sortby == DSC_NAME)
843 result = strcmp (a->name, b->name);
846 double as = a->p.dsc.stats[sortby_stat];
847 double bs = b->p.dsc.stats[sortby_stat];
849 result = as < bs ? -1 : as > bs;
852 if (cmd->order == DSC_D)