1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2009-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
24 #include "data/casegrouper.h"
25 #include "data/casereader.h"
26 #include "data/casewriter.h"
27 #include "data/dataset.h"
28 #include "data/dictionary.h"
29 #include "data/transformations.h"
30 #include "data/variable.h"
31 #include "language/command.h"
32 #include "language/dictionary/split-file.h"
33 #include "language/lexer/lexer.h"
34 #include "language/lexer/variable-parser.h"
35 #include "libpspp/array.h"
36 #include "libpspp/assertion.h"
37 #include "libpspp/compiler.h"
38 #include "libpspp/i18n.h"
39 #include "libpspp/message.h"
40 #include "math/moments.h"
41 #include "output/tab.h"
43 #include "gl/xalloc.h"
46 #define _(msgid) gettext (msgid)
47 #define N_(msgid) msgid
49 /* DESCRIPTIVES private data. */
53 /* Handling of missing values. */
/* NOTE(review): the enum's opening and closing lines are not present in this
   listing.  These two enumerators are the values stored in the `missing_type'
   fields that are declared as `enum dsc_missing_type' further down. */
56 DSC_VARIABLE, /* Handle missing values on a per-variable basis. */
57 DSC_LISTWISE /* Discard entire case if any variable is missing. */
60 /* Describes properties of a distribution for the purpose of
61 calculating a Z-score. */
/* One element per Z-score variable, held in the `z_scores' array of the
   Z-score transformation.  `mean' and `std_dev' are overwritten by
   descriptives_trns_proc() from the cases it reads off the transformation's
   z_reader. */
64 const struct variable *src_var; /* Variable on which z-score is based. */
65 struct variable *z_var; /* New z-score variable. */
66 double mean; /* Distribution mean. */
67 double std_dev; /* Distribution standard deviation. */
70 /* DESCRIPTIVES transformation (for calculating Z-scores). */
/* Built by setup_z_trns() and released by descriptives_trns_free().
   NOTE(review): descriptives_trns_free() also reads a `t->ok' member whose
   declaration is not visible in this listing. */
73 struct dsc_z_score *z_scores; /* Array of Z-scores. */
74 int z_score_cnt; /* Number of Z-scores. */
/* setup_z_trns() allocates `vars' only when missing_type is DSC_LISTWISE;
   descriptives_trns_free() asserts that it is non-NULL exactly in that
   case. */
75 const struct variable **vars; /* Variables for listwise missing checks. */
76 size_t var_cnt; /* Number of variables. */
77 enum dsc_missing_type missing_type; /* Treatment of missing values. */
78 enum mv_class exclude; /* Classes of missing values to exclude. */
79 struct variable *filter; /* Dictionary FILTER BY variable. */
80 struct casereader *z_reader; /* Reader for count, mean, stddev. */
81 casenumber count; /* Number left in this SPLIT FILE group.*/
85 /* Statistics. Used as bit indexes, so must be 32 or fewer. */
/* The non-negative values index dsc_info[] and the per-variable stats[]
   array, and are used as shift counts (1ul << DSC_x) in the show_stats and
   calc_stats masks.  DSC_N_STATS is the number of real statistics. */
88 DSC_MEAN = 0, DSC_SEMEAN, DSC_STDDEV, DSC_VARIANCE, DSC_KURTOSIS,
89 DSC_SEKURT, DSC_SKEWNESS, DSC_SESKEW, DSC_RANGE, DSC_MIN,
90 DSC_MAX, DSC_SUM, DSC_N_STATS,
/* Both pseudo-values are negative, so a `sort_by_stat >= 0' test tells a real
   statistic apart from them. */
92 /* Only valid as sort criteria. */
93 DSC_NAME = -2, /* Sort by name. */
94 DSC_NONE = -1 /* Unsorted. */
97 /* Describes one statistic. */
98 struct dsc_statistic_info
/* `identifier' is the keyword that match_statistic() compares against the
   current token. */
100 const char *identifier; /* Identifier. */
/* `name' is stored untranslated (the table marks it with N_()) and is passed
   through gettext() when display() writes the column titles. */
101 const char *name; /* Full name. */
102 enum moment moment; /* Highest moment needed to calculate. */
105 /* Table of statistics, indexed by DSC_*. */
/* The row order must stay in step with the DSC_* enumerators, because the
   rows are looked up by enumerator value.  The two standard-error rows
   (SEKURTOSIS, SESKEWNESS) carry MOMENT_NONE themselves; cmd_descriptives()
   adds DSC_KURTOSIS / DSC_SKEWNESS to calc_stats whenever they are shown, and
   those rows supply the moment that is needed. */
106 static const struct dsc_statistic_info dsc_info[DSC_N_STATS] =
108 {"MEAN", N_("Mean"), MOMENT_MEAN},
109 {"SEMEAN", N_("S.E. Mean"), MOMENT_VARIANCE},
110 {"STDDEV", N_("Std Dev"), MOMENT_VARIANCE},
111 {"VARIANCE", N_("Variance"), MOMENT_VARIANCE},
112 {"KURTOSIS", N_("Kurtosis"), MOMENT_KURTOSIS},
113 {"SEKURTOSIS", N_("S.E. Kurt"), MOMENT_NONE},
114 {"SKEWNESS", N_("Skewness"), MOMENT_SKEWNESS},
115 {"SESKEWNESS", N_("S.E. Skew"), MOMENT_NONE},
116 {"RANGE", N_("Range"), MOMENT_NONE},
117 {"MINIMUM", N_("Minimum"), MOMENT_NONE},
118 {"MAXIMUM", N_("Maximum"), MOMENT_NONE},
119 {"SUM", N_("Sum"), MOMENT_MEAN},
122 /* Statistics calculated by default if none are explicitly requested. */
/* Bit mask built from 1ul << DSC_* values, the same encoding that is stored
   in the show_stats and calc_stats fields.
   NOTE(review): the definition is cut off in this listing -- the visible text
   ends with a line-continuation backslash and an unclosed parenthesis. */
124 #define DEFAULT_STATS \
125 ((1ul << DSC_MEAN) | (1ul << DSC_STDDEV) | (1ul << DSC_MIN) \
128 /* A variable specified on DESCRIPTIVES. */
131 const struct variable *v; /* Variable to calculate on. */
/* NULL when no Z-score variable is wanted for this variable; the code tests
   `z_name != NULL' to decide. */
132 char *z_name; /* Name for z-score variable. */
133 double valid, missing; /* Valid, missing counts. */
/* May be NULL: the users of this field check `moments != NULL' before
   touching it, and cmd_descriptives() only creates it when max_moment is not
   MOMENT_NONE. */
134 struct moments *moments; /* Moments. */
135 double min, max; /* Minimum and maximum values. */
136 double stats[DSC_N_STATS]; /* All the stats' values. */
/* Output format enumerators, selected by the LINE and SERIAL keywords of the
   FORMAT subcommand and stored in a field of type `enum dsc_format'.
   NOTE(review): the enum's opening and closing lines are not present in this
   listing. */
142 DSC_LINE, /* Abbreviated format. */
143 DSC_SERIAL /* Long format. */
146 /* A DESCRIPTIVES procedure. */
/* Allocated and filled in by cmd_descriptives(); released by
   free_dsc_proc(). */
149 /* Per-variable info. */
150 struct dsc_var *vars; /* Variables. */
151 size_t var_cnt; /* Number of variables. */
154 enum dsc_missing_type missing_type; /* Treatment of missing values. */
155 enum mv_class exclude; /* Classes of missing values to exclude. */
156 int show_var_labels; /* Nonzero to show variable labels. */
157 int show_index; /* Nonzero to show variable index. */
158 enum dsc_format format; /* Output format. */
160 /* Accumulated results. */
161 double missing_listwise; /* Sum of weights of cases missing listwise. */
162 double valid; /* Sum of weights of valid cases. */
163 bool bad_warn; /* Warn if bad weight found. */
164 enum dsc_statistic sort_by_stat; /* Statistic to sort by, or DSC_NAME / DSC_NONE. */
165 int sort_ascending; /* !0: ascending order; 0: descending. */
/* Both masks use one bit per statistic: bit (1ul << DSC_x). */
166 unsigned long show_stats; /* Statistics to display. */
167 unsigned long calc_stats; /* Statistics to calculate. */
168 enum moment max_moment; /* Highest moment needed for stats. */
/* NULL unless Z scores were requested.  setup_z_trns() turns the writer into
   the transformation's reader and resets this field to NULL. */
171 struct casewriter *z_writer; /* Mean and stddev per SPLIT FILE group. */
/* Forward declarations of file-local helpers that are defined further down.
   NOTE(review): the calc_descriptives prototype is cut off in this listing
   (its final parameter line is missing). */
175 static enum dsc_statistic match_statistic (struct lexer *);
176 static void free_dsc_proc (struct dsc_proc *);
178 /* Z-score functions. */
179 static bool try_name (const struct dictionary *dict,
180 struct dsc_proc *dsc, const char *name);
181 static char *generate_z_varname (const struct dictionary *dict,
182 struct dsc_proc *dsc,
183 const char *name, int *z_cnt);
184 static void dump_z_table (struct dsc_proc *);
185 static void setup_z_trns (struct dsc_proc *, struct dataset *);
187 /* Procedure execution functions. */
188 static void calc_descriptives (struct dsc_proc *, struct casereader *,
190 static void display (struct dsc_proc *dsc);
192 /* Parser and outline. */
194 /* Handles DESCRIPTIVES. */
/* Parses the subcommands into a freshly allocated struct dsc_proc, works out
   which statistics must be shown and calculated, runs calc_descriptives() on
   every group produced by the split-file grouper, and returns CMD_SUCCESS or
   CMD_CASCADING_FAILURE according to `ok'.
   NOTE(review): a number of lines of this function (braces, several
   declarations, error exits, some conditions) are absent from this listing,
   so the comments below describe only the statements that are visible. */
196 cmd_descriptives (struct lexer *lexer, struct dataset *ds)
198 struct dictionary *dict = dataset_dict (ds);
199 struct dsc_proc *dsc;
200 const struct variable **vars = NULL;
202 int save_z_scores = 0;
207 struct casegrouper *grouper;
208 struct casereader *group;
210 /* Create and initialize dsc. */
211 dsc = xmalloc (sizeof *dsc);
214 dsc->missing_type = DSC_VARIABLE;
215 dsc->exclude = MV_ANY;
216 dsc->show_var_labels = 1;
218 dsc->format = DSC_LINE;
219 dsc->missing_listwise = 0.;
222 dsc->sort_by_stat = DSC_NONE;
223 dsc->sort_ascending = 1;
224 dsc->show_stats = dsc->calc_stats = DEFAULT_STATS;
225 dsc->z_writer = NULL;
227 /* Parse DESCRIPTIVES. */
228 while (lex_token (lexer) != T_ENDCMD)
/* MISSING: VARIABLE and LISTWISE choose the missing-value policy; INCLUDE
   changes `exclude' from its MV_ANY default to MV_SYSTEM. */
230 if (lex_match_id (lexer, "MISSING"))
232 lex_match (lexer, T_EQUALS);
233 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
235 if (lex_match_id (lexer, "VARIABLE"))
236 dsc->missing_type = DSC_VARIABLE;
237 else if (lex_match_id (lexer, "LISTWISE"))
238 dsc->missing_type = DSC_LISTWISE;
239 else if (lex_match_id (lexer, "INCLUDE"))
240 dsc->exclude = MV_SYSTEM;
243 lex_error (lexer, NULL);
246 lex_match (lexer, T_COMMA);
/* SAVE: the statement executed for this subcommand is not visible in this
   listing (presumably it sets save_z_scores -- TODO confirm). */
249 else if (lex_match_id (lexer, "SAVE"))
/* FORMAT: label, index and layout switches. */
251 else if (lex_match_id (lexer, "FORMAT"))
253 lex_match (lexer, T_EQUALS);
254 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
256 if (lex_match_id (lexer, "LABELS"))
257 dsc->show_var_labels = 1;
258 else if (lex_match_id (lexer, "NOLABELS"))
259 dsc->show_var_labels = 0;
260 else if (lex_match_id (lexer, "INDEX"))
262 else if (lex_match_id (lexer, "NOINDEX"))
264 else if (lex_match_id (lexer, "LINE"))
265 dsc->format = DSC_LINE;
266 else if (lex_match_id (lexer, "SERIAL"))
267 dsc->format = DSC_SERIAL;
270 lex_error (lexer, NULL);
273 lex_match (lexer, T_COMMA);
/* STATISTICS: ORs the requested statistics into show_stats; ALL sets every
   bit below DSC_N_STATS.  If no bit ends up set, DEFAULT_STATS is used. */
276 else if (lex_match_id (lexer, "STATISTICS"))
278 lex_match (lexer, T_EQUALS);
280 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
282 if (lex_match (lexer, T_ALL))
283 dsc->show_stats |= (1ul << DSC_N_STATS) - 1;
284 else if (lex_match_id (lexer, "DEFAULT"))
285 dsc->show_stats |= DEFAULT_STATS;
288 enum dsc_statistic s = match_statistic (lexer);
291 lex_error (lexer, NULL);
294 dsc->show_stats |= 1ul << s;
296 lex_match (lexer, T_COMMA);
298 if (dsc->show_stats == 0)
299 dsc->show_stats = DEFAULT_STATS;
/* SORT: by NAME, or by a statistic; when match_statistic() yields DSC_NONE
   the sort key falls back to DSC_MEAN.  An optional (A) or (D) selects
   ascending or descending order. */
301 else if (lex_match_id (lexer, "SORT"))
303 lex_match (lexer, T_EQUALS);
304 if (lex_match_id (lexer, "NAME"))
305 dsc->sort_by_stat = DSC_NAME;
308 dsc->sort_by_stat = match_statistic (lexer);
309 if (dsc->sort_by_stat == DSC_NONE )
310 dsc->sort_by_stat = DSC_MEAN;
312 if (lex_match (lexer, T_LPAREN))
314 if (lex_match_id (lexer, "A"))
315 dsc->sort_ascending = 1;
316 else if (lex_match_id (lexer, "D"))
317 dsc->sort_ascending = 0;
319 lex_error (lexer, NULL);
320 if (! lex_force_match (lexer, T_RPAREN))
/* Otherwise, as long as no variables have been parsed yet, the input is taken
   as the variable list, optionally introduced by VARIABLES=.  A parenthesized
   identifier after a group of variables names the Z-score variable for the
   most recently parsed variable, provided try_name() accepts it. */
324 else if (var_cnt == 0)
326 if (lex_next_token (lexer, 1) == T_EQUALS)
328 lex_match_id (lexer, "VARIABLES");
329 lex_match (lexer, T_EQUALS);
332 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
336 if (!parse_variables_const (lexer, dict, &vars, &var_cnt,
337 PV_APPEND | PV_NO_DUPLICATE | PV_NUMERIC))
/* Grow dsc->vars to the new total; only the entries from the old count
   onwards are new and need setting up. */
340 dsc->vars = xnrealloc ((void *)dsc->vars, var_cnt, sizeof *dsc->vars);
341 for (i = dsc->var_cnt; i < var_cnt; i++)
343 struct dsc_var *dv = &dsc->vars[i];
348 dsc->var_cnt = var_cnt;
350 if (lex_match (lexer, T_LPAREN))
352 if (lex_token (lexer) != T_ID)
354 lex_error (lexer, NULL);
357 if (try_name (dict, dsc, lex_tokcstr (lexer)))
359 struct dsc_var *dsc_var = &dsc->vars[dsc->var_cnt - 1];
360 dsc_var->z_name = xstrdup (lex_tokcstr (lexer));
364 msg (SE, _("Z-score variable name %s would be"
365 " a duplicate variable name."), lex_tokcstr (lexer));
367 if (!lex_force_match (lexer, T_RPAREN))
374 lex_error (lexer, NULL);
378 lex_match (lexer, T_SLASH);
382 msg (SE, _("No variables specified."));
386 /* Construct z-score varnames, show translation table. */
387 if (z_cnt || save_z_scores)
389 struct caseproto *proto;
395 for (i = 0; i < dsc->var_cnt; i++)
397 struct dsc_var *dsc_var = &dsc->vars[i];
398 if (dsc_var->z_name == NULL)
400 const char *name = var_get_name (dsc_var->v);
401 dsc_var->z_name = generate_z_varname (dict, dsc, name,
403 if (dsc_var->z_name == NULL)
411 /* It would be better to handle Z scores correctly (however we define
412 that) when TEMPORARY is in effect, but in the meantime this at least
413 prevents a use-after-free error. See bug #38786. */
414 if (proc_make_temporary_transformations_permanent (ds))
415 msg (SW, _("DESCRIPTIVES with Z scores ignores TEMPORARY. "
416 "Temporary transformations will be made permanent."));
/* Each case written to z_writer holds 1 + 2 * z_cnt numeric values:
   calc_descriptives() stores a case count first and then a (mean, std dev)
   pair for every variable that has a z_name. */
418 proto = caseproto_create ();
419 for (i = 0; i < 1 + 2 * z_cnt; i++)
420 proto = caseproto_add_width (proto, 0);
421 dsc->z_writer = autopaging_writer_create (proto);
422 caseproto_unref (proto);
427 /* Figure out statistics to display. */
428 if (dsc->show_stats & (1ul << DSC_SKEWNESS))
429 dsc->show_stats |= 1ul << DSC_SESKEW;
430 if (dsc->show_stats & (1ul << DSC_KURTOSIS))
431 dsc->show_stats |= 1ul << DSC_SEKURT;
433 /* Figure out which statistics to calculate. */
434 dsc->calc_stats = dsc->show_stats;
436 dsc->calc_stats |= (1ul << DSC_MEAN) | (1ul << DSC_STDDEV);
/* sort_by_stat is non-negative only for a real statistic; DSC_NAME and
   DSC_NONE are negative and must not be used as a shift count. */
437 if (dsc->sort_by_stat >= 0)
438 dsc->calc_stats |= 1ul << dsc->sort_by_stat;
/* A standard error is only filled in when the underlying statistic has been
   computed, so showing one forces calculation of the other. */
439 if (dsc->show_stats & (1ul << DSC_SESKEW))
440 dsc->calc_stats |= 1ul << DSC_SKEWNESS;
441 if (dsc->show_stats & (1ul << DSC_SEKURT))
442 dsc->calc_stats |= 1ul << DSC_KURTOSIS;
444 /* Figure out maximum moment needed and allocate moments for
446 dsc->max_moment = MOMENT_NONE;
447 for (i = 0; i < DSC_N_STATS; i++)
448 if (dsc->calc_stats & (1ul << i) && dsc_info[i].moment > dsc->max_moment)
449 dsc->max_moment = dsc_info[i].moment;
450 if (dsc->max_moment != MOMENT_NONE)
451 for (i = 0; i < dsc->var_cnt; i++)
452 dsc->vars[i].moments = moments_create (dsc->max_moment);
455 grouper = casegrouper_create_splits (proc_open_filtering (ds, false), dict);
456 while (casegrouper_get_next_group (grouper, &group))
457 calc_descriptives (dsc, group, ds);
458 ok = casegrouper_destroy (grouper);
459 ok = proc_commit (ds) && ok;
463 setup_z_trns (dsc, ds);
468 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
476 /* Returns the statistic named by the current token and skips past the token.
477 Returns DSC_NONE if no statistic is given (e.g., subcommand with no
478 specifiers). Emits an error if the current token ID does not name a statistic. */
480 static enum dsc_statistic
481 match_statistic (struct lexer *lexer)
/* Only an identifier token can name a statistic. */
483 if (lex_token (lexer) == T_ID)
485 enum dsc_statistic stat;
/* Try each dsc_info[] identifier in turn against the current token.
   NOTE(review): the statement run on a successful match is not visible in
   this listing (presumably `return stat;'). */
487 for (stat = 0; stat < DSC_N_STATS; stat++)
488 if (lex_match_id (lexer, dsc_info[stat].identifier))
/* Reached when the identifier matched none of the table entries. */
492 lex_error (lexer, _("expecting statistic name: reverting to default"));
/* Releases the resources owned by DSC that are visible here: each variable's
   z_name string and moments object, and the Z-score writer.
   NOTE(review): the remaining lines of this function are absent from this
   listing. */
500 free_dsc_proc (struct dsc_proc *dsc)
507 for (i = 0; i < dsc->var_cnt; i++)
509 struct dsc_var *dsc_var = &dsc->vars[i];
510 free (dsc_var->z_name);
511 moments_destroy (dsc_var->moments);
/* z_writer is NULL here if setup_z_trns() already converted it into a reader
   (presumably casewriter_destroy accepts NULL -- TODO confirm). */
513 casewriter_destroy (dsc->z_writer);
520 /* Returns false if NAME is a duplicate of any existing variable name or
521 of any previously-declared z-var name; otherwise returns true. */
523 try_name (const struct dictionary *dict, struct dsc_proc *dsc,
/* First check: NAME must not already exist in the dictionary. */
528 if (dict_lookup_var (dict, name) != NULL)
/* Second check: NAME must not equal a z_name already assigned to one of DSC's
   variables; the comparison uses utf8_strcasecmp(). */
530 for (i = 0; i < dsc->var_cnt; i++)
532 struct dsc_var *dsc_var = &dsc->vars[i];
533 if (dsc_var->z_name != NULL && !utf8_strcasecmp (dsc_var->z_name, name))
539 /* Generates a name for a Z-score variable based on a variable
540 named VAR_NAME, given that *Z_CNT generated variable names are
541 known to already exist. If successful, returns the new name
542 as a dynamically allocated string. On failure, returns NULL. */
/* NOTE(review): several lines of this function (braces, the loop header, the
   declaration of `name', the first `if' of the chain below) are absent from
   this listing; only the visible statements are reproduced. */
544 generate_z_varname (const struct dictionary *dict, struct dsc_proc *dsc,
545 const char *var_name, int *z_cnt)
547 char *z_name, *trunc_name;
549 /* Try a name based on the original variable name. */
550 z_name = xasprintf ("Z%s", var_name);
551 trunc_name = utf8_encoding_trunc (z_name, dict_get_encoding (dict),
554 if (try_name (dict, dsc, trunc_name))
558 /* Generate a synthetic name. */
/* Four families of generic names are tried in order as *z_cnt grows:
   ZSCnnn first, then STDZ01-09 for counts up to 108, ZZZZ01-09 up to 117 and
   ZQZQ01-09 up to 126.  The offsets subtracted below (99, 108, 117) imply
   that the ZSC family covers counts 1 to 99, which gives the 126 names quoted
   in the error message. */
566 sprintf (name, "ZSC%03d", *z_cnt);
567 else if (*z_cnt <= 108)
568 sprintf (name, "STDZ%02d", *z_cnt - 99);
569 else if (*z_cnt <= 117)
570 sprintf (name, "ZZZZ%02d", *z_cnt - 108);
571 else if (*z_cnt <= 126)
572 sprintf (name, "ZQZQ%02d", *z_cnt - 117);
/* The message lists ZSC001-ZSC099: "ZSC%03d" never yields a four-digit
   suffix for this family, and 99 + 9 + 9 + 9 is the stated total of 126. */
575 msg (SE, _("Ran out of generic names for Z-score variables. "
576 "There are only 126 generic names: ZSC001-ZSC099, "
577 "STDZ01-STDZ09, ZZZZ01-ZZZZ09, ZQZQ01-ZQZQ09."));
/* A generated name is only returned once try_name() confirms it is unused. */
581 if (try_name (dict, dsc, name))
582 return xstrdup (name);
587 /* Outputs a table describing the mapping between source
588 variables and Z-score variables. */
590 dump_z_table (struct dsc_proc *dsc)
/* First loop: visits the variables that have a Z-score name.
   NOTE(review): the loop body is not visible in this listing (presumably it
   counts them into `cnt'). */
598 for (i = 0; i < dsc->var_cnt; i++)
599 if (dsc->vars[i].z_name != NULL)
/* Two columns (source, target) and cnt + 1 rows; row 0 holds the headings. */
603 t = tab_create (2, cnt + 1);
604 tab_title (t, _("Mapping of variables to corresponding Z-scores."));
605 tab_headers (t, 0, 0, 1, 0);
606 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, cnt);
607 tab_hline (t, TAL_2, 0, 1, 1);
608 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Source"));
609 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Target"));
/* Second loop: one row per variable that has a Z-score name; `y' is the
   output row and advances only when a row is written. */
614 for (i = 0, y = 1; i < dsc->var_cnt; i++)
615 if (dsc->vars[i].z_name != NULL)
617 tab_text (t, 0, y, TAB_LEFT, var_to_string (dsc->vars[i].v));
618 tab_text (t, 1, y++, TAB_LEFT, dsc->vars[i].z_name);
/* Stores SYSMIS into every Z-score destination variable of transformation T
   in case C. */
626 descriptives_set_all_sysmis_zscores (const struct dsc_trns *t, struct ccase *c)
628 const struct dsc_z_score *z;
630 for (z = t->z_scores; z < t->z_scores + t->z_score_cnt; z++)
631 case_data_rw (c, z->z_var)->f = SYSMIS;
634 /* Transformation function to calculate Z-scores. Will return SYSMIS if any of
635 the following are true: 1) mean or standard deviation is SYSMIS 2) score is
636 SYSMIS 3) score is user missing and they were not included in the original
637 analysis. 4) with LISTWISE, any of the variables in the original analysis were
638 missing (either system or user-missing values that weren't included). */
641 descriptives_trns_proc (void *trns_, struct ccase **c,
642 casenumber case_idx UNUSED)
644 struct dsc_trns *t = trns_;
645 struct dsc_z_score *z;
646 const struct variable **vars;
/* NOTE(review): several guarding conditions and braces of this function are
   absent from this listing; the comments describe the visible statements. */
648 *c = case_unshare (*c);
/* Filter check: a case whose filter value is zero or missing gets SYSMIS in
   every Z-score variable. */
652 double f = case_num (*c, t->filter);
653 if (f == 0.0 || var_is_num_missing (t->filter, f, MV_ANY))
655 descriptives_set_all_sysmis_zscores (t, *c);
656 return TRNS_CONTINUE;
/* Load the next record from z_reader: value 0 is the case count, followed by
   a (mean, std dev) pair for each Z-score in z_scores order. */
662 struct ccase *z_case;
664 z_case = casereader_read (t->z_reader);
669 t->count = case_num_idx (z_case, z_idx++);
670 for (z = t->z_scores; z < t->z_scores + t->z_score_cnt; z++)
672 z->mean = case_num_idx (z_case, z_idx++);
673 z->std_dev = case_num_idx (z_case, z_idx++);
/* Error path: the visible statements report an internal error and set every
   Z-score in this case to SYSMIS. */
681 msg (SE, _("Internal error processing Z scores"));
684 descriptives_set_all_sysmis_zscores (t, *c);
685 return TRNS_CONTINUE;
/* Listwise policy: one missing value among the analysis variables blanks
   every Z-score in the case. */
690 if (t->missing_type == DSC_LISTWISE)
693 for (vars = t->vars; vars < t->vars + t->var_cnt; vars++)
695 double score = case_num (*c, *vars);
696 if (var_is_num_missing (*vars, score, t->exclude))
698 descriptives_set_all_sysmis_zscores (t, *c);
699 return TRNS_CONTINUE;
/* Per-variable computation: z = (input - mean) / std_dev.
   NOTE(review): the statement taken when the test below is true is not
   visible here (presumably it stores SYSMIS). */
704 for (z = t->z_scores; z < t->z_scores + t->z_score_cnt; z++)
706 double input = case_num (*c, z->src_var);
707 double *output = &case_data_rw (*c, z->z_var)->f;
709 if (z->mean == SYSMIS || z->std_dev == SYSMIS
710 || var_is_num_missing (z->src_var, input, t->exclude))
713 *output = (input - z->mean) / z->std_dev;
715 return TRNS_CONTINUE;
718 /* Frees a descriptives_trns struct. */
720 descriptives_trns_free (void *trns_)
722 struct dsc_trns *t = trns_;
/* The result must be computed before the reader is destroyed, because it
   queries the reader's error state. */
723 bool ok = t->ok && !casereader_error (t->z_reader);
726 casereader_destroy (t->z_reader);
/* Invariant established by setup_z_trns(): `vars' is non-NULL exactly when
   the missing-value policy is DSC_LISTWISE. */
727 assert((t->missing_type != DSC_LISTWISE) ^ (t->vars != NULL));
734 /* Sets up a transformation to calculate Z scores. */
736 setup_z_trns (struct dsc_proc *dsc, struct dataset *ds)
/* Visits the variables that have a Z-score name.
   NOTE(review): the loop body is not visible in this listing (presumably it
   counts them into `cnt'). */
741 for (cnt = i = 0; i < dsc->var_cnt; i++)
742 if (dsc->vars[i].z_name != NULL)
745 t = xmalloc (sizeof *t);
746 t->z_scores = xnmalloc (cnt, sizeof *t->z_scores);
747 t->z_score_cnt = cnt;
748 t->missing_type = dsc->missing_type;
749 t->exclude = dsc->exclude;
/* The listwise check in the transformation needs every analysis variable, not
   only those with Z scores, so the full list is copied. */
750 if ( t->missing_type == DSC_LISTWISE )
752 t->var_cnt = dsc->var_cnt;
753 t->vars = xnmalloc (t->var_cnt, sizeof *t->vars);
754 for (i = 0; i < t->var_cnt; i++)
755 t->vars[i] = dsc->vars[i].v;
762 t->filter = dict_get_filter (dataset_dict (ds));
/* The writer filled by calc_descriptives() becomes the transformation's
   reader; the pointer in DSC is cleared so that free_dsc_proc() does not see
   the old writer. */
763 t->z_reader = casewriter_make_reader (dsc->z_writer);
766 dsc->z_writer = NULL;
/* Create one numeric destination variable per Z-score name and label it after
   its source variable. */
768 for (cnt = i = 0; i < dsc->var_cnt; i++)
770 struct dsc_var *dv = &dsc->vars[i];
771 if (dv->z_name != NULL)
773 struct dsc_z_score *z;
774 struct variable *dst_var;
777 dst_var = dict_create_var_assert (dataset_dict (ds), dv->z_name, 0);
779 label = xasprintf (_("Z-score of %s"),var_to_string (dv->v));
780 var_set_label (dst_var, label);
783 z = &t->z_scores[cnt++];
/* Register the transformation with its per-case and cleanup callbacks. */
789 add_transformation (ds,
790 descriptives_trns_proc, descriptives_trns_free, t);
793 /* Statistical calculation. */
/* Forward declaration: calc_descriptives() calls listwise_missing(), which is
   defined after it. */
795 static bool listwise_missing (struct dsc_proc *dsc, const struct ccase *c);
797 /* Calculates and displays descriptive statistics for the cases in GROUP. */
/* Processes one group of cases: a first pass accumulates weights, missing
   counts and first-pass moments; a second pass, made only when max_moment
   exceeds MOMENT_MEAN, feeds moments_pass_two(); the statistics are then
   derived for each variable and, when Z scores are being saved, one record
   is written to dsc->z_writer.
   NOTE(review): a number of lines (braces, declarations, `continue's, the
   min/max and `count' updates, the display call) are absent from this
   listing; the comments describe only what is visible. */
800 calc_descriptives (struct dsc_proc *dsc, struct casereader *group,
803 struct variable *filter = dict_get_filter (dataset_dict (ds));
804 struct casereader *pass1, *pass2;
810 c = casereader_peek (group, 0);
813 casereader_destroy (group);
816 output_split_file_values (ds, c);
819 group = casereader_create_filter_weight (group, dataset_dict (ds),
823 pass2 = dsc->max_moment <= MOMENT_MEAN ? NULL : casereader_clone (pass1);
825 for (i = 0; i < dsc->var_cnt; i++)
827 struct dsc_var *dv = &dsc->vars[i];
829 dv->valid = dv->missing = 0.0;
830 if (dv->moments != NULL)
831 moments_clear (dv->moments);
835 dsc->missing_listwise = 0.;
838 /* First pass to handle most of the work. */
840 for (; (c = casereader_read (pass1)) != NULL; case_unref (c))
842 double weight = dict_get_case_weight (dataset_dict (ds), c, NULL);
846 double f = case_num (c, filter);
847 if (f == 0.0 || var_is_num_missing (filter, f, MV_ANY))
851 /* Check for missing values. */
/* missing_listwise is accumulated under either policy; only the action that
   follows depends on DSC_LISTWISE. */
852 if (listwise_missing (dsc, c))
854 dsc->missing_listwise += weight;
855 if (dsc->missing_type == DSC_LISTWISE)
858 dsc->valid += weight;
860 for (i = 0; i < dsc->var_cnt; i++)
862 struct dsc_var *dv = &dsc->vars[i];
863 double x = case_num (c, dv->v);
865 if (var_is_num_missing (dv->v, x, dsc->exclude))
867 dv->missing += weight;
871 if (dv->moments != NULL)
872 moments_pass_one (dv->moments, x, weight);
882 if (!casereader_destroy (pass1))
884 casereader_destroy (pass2);
888 /* Second pass for higher-order moments. */
889 if (dsc->max_moment > MOMENT_MEAN)
891 for (; (c = casereader_read (pass2)) != NULL; case_unref (c))
893 double weight = dict_get_case_weight (dataset_dict (ds), c, NULL);
897 double f = case_num (c, filter);
898 if (f == 0.0 || var_is_num_missing (filter, f, MV_ANY))
902 /* Check for missing values. */
903 if (dsc->missing_type == DSC_LISTWISE && listwise_missing (dsc, c))
906 for (i = 0; i < dsc->var_cnt; i++)
908 struct dsc_var *dv = &dsc->vars[i];
909 double x = case_num (c, dv->v);
911 if (var_is_num_missing (dv->v, x, dsc->exclude))
914 if (dv->moments != NULL)
915 moments_pass_two (dv->moments, x, weight);
918 if (!casereader_destroy (pass2))
922 /* Calculate results. */
/* The Z-score record starts with the case count; the per-variable loop below
   appends each (mean, std dev) pair. */
923 if (dsc->z_writer && count > 0)
925 c = case_create (casewriter_get_proto (dsc->z_writer));
927 case_data_rw_idx (c, z_idx++)->f = count;
932 for (i = 0; i < dsc->var_cnt; i++)
934 struct dsc_var *dv = &dsc->vars[i];
/* Every statistic starts as SYSMIS and is overwritten only when it can be
   computed. */
938 for (j = 0; j < DSC_N_STATS; j++)
939 dv->stats[j] = SYSMIS;
/* W is this variable's valid weight: the group's total valid weight less the
   weight of the cases in which this variable was missing. */
941 dv->valid = W = dsc->valid - dv->missing;
943 if (dv->moments != NULL)
944 moments_calculate (dv->moments, NULL,
945 &dv->stats[DSC_MEAN], &dv->stats[DSC_VARIANCE],
946 &dv->stats[DSC_SKEWNESS], &dv->stats[DSC_KURTOSIS]);
/* Standard error of the mean: sqrt (variance) / sqrt (W). */
947 if (dsc->calc_stats & (1ul << DSC_SEMEAN)
948 && dv->stats[DSC_VARIANCE] != SYSMIS && W > 0.)
949 dv->stats[DSC_SEMEAN] = sqrt (dv->stats[DSC_VARIANCE]) / sqrt (W);
950 if (dsc->calc_stats & (1ul << DSC_STDDEV)
951 && dv->stats[DSC_VARIANCE] != SYSMIS)
952 dv->stats[DSC_STDDEV] = sqrt (dv->stats[DSC_VARIANCE]);
953 if (dsc->calc_stats & (1ul << DSC_SEKURT))
954 if (dv->stats[DSC_KURTOSIS] != SYSMIS)
955 dv->stats[DSC_SEKURT] = calc_sekurt (W);
956 if (dsc->calc_stats & (1ul << DSC_SESKEW)
957 && dv->stats[DSC_SKEWNESS] != SYSMIS)
958 dv->stats[DSC_SESKEW] = calc_seskew (W);
/* DBL_MAX and -DBL_MAX are treated as "no value recorded" for min and max
   respectively (presumably their initial values -- the initialization is not
   visible in this listing). */
959 dv->stats[DSC_RANGE] = ((dv->min == DBL_MAX || dv->max == -DBL_MAX)
960 ? SYSMIS : dv->max - dv->min);
961 dv->stats[DSC_MIN] = dv->min == DBL_MAX ? SYSMIS : dv->min;
962 dv->stats[DSC_MAX] = dv->max == -DBL_MAX ? SYSMIS : dv->max;
/* The sum is recovered as valid weight times mean. */
963 if (dsc->calc_stats & (1ul << DSC_SUM))
964 dv->stats[DSC_SUM] = W * dv->stats[DSC_MEAN];
966 if (dv->z_name && c != NULL)
968 case_data_rw_idx (c, z_idx++)->f = dv->stats[DSC_MEAN];
969 case_data_rw_idx (c, z_idx++)->f = dv->stats[DSC_STDDEV];
974 casewriter_write (dsc->z_writer, c);
976 /* Output results. */
980 /* Returns true if any of the descriptives variables in DSC's
981 variable list have missing values in case C, false otherwise. */
983 listwise_missing (struct dsc_proc *dsc, const struct ccase *c)
/* "Missing" is judged against dsc->exclude, the same class of missing values
   that the per-variable checks in calc_descriptives() use. */
987 for (i = 0; i < dsc->var_cnt; i++)
989 struct dsc_var *dv = &dsc->vars[i];
990 double x = case_num (c, dv->v);
992 if (var_is_num_missing (dv->v, x, dsc->exclude))
998 /* Statistical display. */
/* Forward declaration of the comparison callback that display() passes to
   sort(). */
1000 static algo_compare_func descriptives_compare_dsc_vars;
1002 /* Displays a table of descriptive statistics for DSC. */
1004 display (struct dsc_proc *dsc)
1008 struct tab_table *t;
/* Column count: the variable column, then either Valid N and Missing N
   (SERIAL) or a single N column, then the shown statistics.
   NOTE(review): the body of the loop below is not visible in this listing
   (presumably it increments `nc' for each shown statistic). */
1010 nc = 1 + (dsc->format == DSC_SERIAL ? 2 : 1);
1011 for (i = 0; i < DSC_N_STATS; i++)
1012 if (dsc->show_stats & (1ul << i))
/* The variable array is reordered in place unless no sort was requested. */
1015 if (dsc->sort_by_stat != DSC_NONE)
1016 sort (dsc->vars, dsc->var_cnt, sizeof *dsc->vars,
1017 descriptives_compare_dsc_vars, dsc);
1019 t = tab_create (nc, dsc->var_cnt + 1);
1020 tab_headers (t, 1, 0, 1, 0);
1021 tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, nc - 1, dsc->var_cnt);
1022 tab_box (t, -1, -1, -1, TAL_1, 1, 0, nc - 1, dsc->var_cnt);
1023 tab_hline (t, TAL_2, 0, nc - 1, 1);
1024 tab_vline (t, TAL_2, 1, 0, dsc->var_cnt);
/* Header row.  From here on `nc' is used as a running column cursor
   (NOTE(review): its reset to the first column is not visible in this
   listing). */
1027 tab_text (t, nc++, 0, TAB_LEFT | TAT_TITLE, _("Variable"));
1028 if (dsc->format == DSC_SERIAL)
1030 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Valid N"));
1031 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Missing N"));
1034 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, "N");
1036 for (i = 0; i < DSC_N_STATS; i++)
1037 if (dsc->show_stats & (1ul << i))
1039 const char *title = gettext (dsc_info[i].name);
1040 tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, title);
/* One body row per variable, with the cells in the same column order as the
   header row. */
1043 for (i = 0; i < dsc->var_cnt; i++)
1045 struct dsc_var *dv = &dsc->vars[i];
1049 tab_text (t, nc++, i + 1, TAB_LEFT, var_to_string (dv->v));
1050 tab_text_format (t, nc++, i + 1, 0, "%.*g", DBL_DIG + 1, dv->valid);
1051 if (dsc->format == DSC_SERIAL)
1052 tab_text_format (t, nc++, i + 1, 0, "%.*g", DBL_DIG + 1, dv->missing);
1054 for (j = 0; j < DSC_N_STATS; j++)
1055 if (dsc->show_stats & (1ul << j))
1056 tab_double (t, nc++, i + 1, TAB_NONE, dv->stats[j], NULL, RC_OTHER);
1059 tab_title (t, _("Valid cases = %.*g; cases with missing value(s) = %.*g."),
1060 DBL_DIG + 1, dsc->valid,
1061 DBL_DIG + 1, dsc->missing_listwise);
1066 /* Compares `struct dsc_var's A and B according to the ordering
1067 specified by DSC_. */
1069 descriptives_compare_dsc_vars (const void *a_, const void *b_, const void *dsc_)
1071 const struct dsc_var *a = a_;
1072 const struct dsc_var *b = b_;
1073 const struct dsc_proc *dsc = dsc_;
1077 if (dsc->sort_by_stat == DSC_NAME)
1078 result = utf8_strcasecmp (var_get_name (a->v), var_get_name (b->v));
1081 double as = a->stats[dsc->sort_by_stat];
1082 double bs = b->stats[dsc->sort_by_stat];
1084 result = as < bs ? -1 : as > bs;
1087 if (!dsc->sort_ascending)