1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include <data/any-writer.h>
24 #include <data/case-ordering.h>
25 #include <data/case.h>
26 #include <data/casegrouper.h>
27 #include <data/casereader.h>
28 #include <data/casewriter.h>
29 #include <data/dictionary.h>
30 #include <data/file-handle-def.h>
31 #include <data/format.h>
32 #include <data/procedure.h>
33 #include <data/settings.h>
34 #include <data/sys-file-writer.h>
35 #include <data/variable.h>
36 #include <language/command.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <language/lexer/variable-parser.h>
40 #include <language/stats/sort-criteria.h>
41 #include <libpspp/alloc.h>
42 #include <libpspp/assertion.h>
43 #include <libpspp/message.h>
44 #include <libpspp/misc.h>
45 #include <libpspp/pool.h>
46 #include <libpspp/str.h>
47 #include <math/moments.h>
48 #include <math/sort.h>
53 #define _(msgid) gettext (msgid)
55 /* Argument for AGGREGATE function.  parse_aggregate_functions fills
   in f from a numeric token and c from a string token. */
58 double f; /* Numeric. */
59 char *c; /* Short or long string.  Heap-allocated copy; agr_destroy frees it for string functions. */
62 /* Specifies how to make an aggregate variable. */
65 struct agr_var *next; /* Next in list. */
67 /* Collected during parsing. */
68 const struct variable *src; /* Source variable. */
69 struct variable *dest; /* Target variable. */
70 int function; /* Function code; parse_aggregate_functions ORs in FSTRING when the source variable is alpha. */
71 enum mv_class exclude; /* Classes of missing values to exclude. */
72 union agr_argument arg[2]; /* Arguments. */
74 /* Accumulated during AGGREGATE execution. */
79 struct moments1 *moments; /* Created on demand by initialize_aggregate_info; agr_destroy releases it when function is SD. */
82 /* Aggregation functions. */
85 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
86 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
87 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS, /* The *_NO_VARS codes are substituted by parse_aggregate_functions when no left parenthesis follows the function name. */
88 FUNC = 0x1f, /* Function mask. */
89 FSTRING = 1<<5, /* String function bit. */
92 /* Attributes of an aggregation function. */
95 const char *name; /* Aggregation function name. */
96 size_t n_args; /* Number of arguments parsed after the source variable list. */
97 enum var_type alpha_type; /* When given ALPHA arguments, output type. */
98 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
101 /* Attributes of aggregation functions.  parse_aggregate_functions
    searches this table by name and uses the offset of the matching
    entry as the function code, so the entry order has to follow the
    order of the aggregation function codes. */
102 static const struct agr_func agr_func_tab[] =
104 {"<NONE>", 0, -1, {0, 0, 0}},
105 {"SUM", 0, -1, {FMT_F, 8, 2}},
106 {"MEAN", 0, -1, {FMT_F, 8, 2}},
107 {"SD", 0, -1, {FMT_F, 8, 2}},
108 {"MAX", 0, VAR_STRING, {-1, -1, -1}},
109 {"MIN", 0, VAR_STRING, {-1, -1, -1}},
110 {"PGT", 1, VAR_NUMERIC, {FMT_F, 5, 1}},
111 {"PLT", 1, VAR_NUMERIC, {FMT_F, 5, 1}},
112 {"PIN", 2, VAR_NUMERIC, {FMT_F, 5, 1}},
113 {"POUT", 2, VAR_NUMERIC, {FMT_F, 5, 1}},
114 {"FGT", 1, VAR_NUMERIC, {FMT_F, 5, 3}},
115 {"FLT", 1, VAR_NUMERIC, {FMT_F, 5, 3}},
116 {"FIN", 2, VAR_NUMERIC, {FMT_F, 5, 3}},
117 {"FOUT", 2, VAR_NUMERIC, {FMT_F, 5, 3}},
118 {"N", 0, VAR_NUMERIC, {FMT_F, 7, 0}},
119 {"NU", 0, VAR_NUMERIC, {FMT_F, 7, 0}},
120 {"NMISS", 0, VAR_NUMERIC, {FMT_F, 7, 0}},
121 {"NUMISS", 0, VAR_NUMERIC, {FMT_F, 7, 0}},
122 {"FIRST", 0, VAR_STRING, {-1, -1, -1}},
123 {"LAST", 0, VAR_STRING, {-1, -1, -1}},
124 {NULL, 0, -1, {-1, -1, -1}}, /* NULL name stops the by-name search in parse_aggregate_functions. */
125 {"N", 0, VAR_NUMERIC, {FMT_F, 7, 0}}, /* Past the terminator: position lines up with N_NO_VARS. */
126 {"NU", 0, VAR_NUMERIC, {FMT_F, 7, 0}}, /* Past the terminator: position lines up with NU_NO_VARS. */
129 /* Missing value types. */
130 enum missing_treatment
132 ITEMWISE, /* Missing values item by item.  cmd_aggregate starts with this setting. */
133 COLUMNWISE /* Missing values column by column.  Selected by the MISSING subcommand. */
136 /* An entire AGGREGATE procedure. */
139 /* Break variables. */
140 struct case_ordering *sort; /* Sort criteria. */
141 const struct variable **break_vars; /* Break variables. */
142 size_t break_var_cnt; /* Number of break variables. */
143 struct ccase break_case; /* Last values of break variables.  Re-cloned by initialize_aggregate_info from the case it is given. */
145 enum missing_treatment missing; /* How to treat missing values. */
146 struct agr_var *agr_vars; /* First aggregate variable. */
147 struct dictionary *dict; /* Aggregate dictionary. */
148 const struct dictionary *src_dict; /* Dict of the source; accumulate_aggregate_info reads case weights through it. */
149 int case_cnt; /* Counts aggregated cases. */
152 static void initialize_aggregate_info (struct agr_proc *,
153 const struct ccase *);
154 static void accumulate_aggregate_info (struct agr_proc *,
155 const struct ccase *);
157 static bool parse_aggregate_functions (struct lexer *, const struct dictionary *,
159 static void agr_destroy (struct agr_proc *);
160 static void dump_aggregate_info (struct agr_proc *agr,
161 struct casewriter *output);
165 /* Parses and executes the AGGREGATE procedure.
    LEXER supplies the command syntax and DS the dataset whose
    dictionary and cases are aggregated.  The aggregated cases are
    written to the file named on OUTFILE, or become the new active
    file when OUTFILE is given as `*'.  The failure exit shown at the
    end of the function returns CMD_CASCADING_FAILURE. */
167 cmd_aggregate (struct lexer *lexer, struct dataset *ds)
169 struct dictionary *dict = dataset_dict (ds);
171 struct file_handle *out_file = NULL;
172 struct casereader *input = NULL, *group;
173 struct casegrouper *grouper;
174 struct casewriter *output = NULL;
176 bool copy_documents = false;
177 bool presorted = false;
181 memset(&agr, 0 , sizeof (agr));
182 agr.missing = ITEMWISE;
183 case_nullify (&agr.break_case);
185 agr.dict = dict_create ();
187 dict_set_label (agr.dict, dict_get_label (dict));
188 dict_set_documents (agr.dict, dict_get_documents (dict));
190 /* OUTFILE subcommand must be first. */
191 if (!lex_force_match_id (lexer, "OUTFILE"))
193 lex_match (lexer, '=');
194 if (!lex_match (lexer, '*'))
196 out_file = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
197 if (out_file == NULL)
201 /* Read most of the subcommands. */
204 lex_match (lexer, '/');
206 if (lex_match_id (lexer, "MISSING"))
208 lex_match (lexer, '=');
209 if (!lex_match_id (lexer, "COLUMNWISE"))
211 lex_error (lexer, _("while expecting COLUMNWISE"));
214 agr.missing = COLUMNWISE;
216 else if (lex_match_id (lexer, "DOCUMENT"))
217 copy_documents = true;
218 else if (lex_match_id (lexer, "PRESORTED"))
220 else if (lex_match_id (lexer, "BREAK"))
224 lex_match (lexer, '=');
225 agr.sort = parse_case_ordering (lexer, dict,
228 if (agr.sort == NULL)
230 case_ordering_get_vars (agr.sort,
231 &agr.break_vars, &agr.break_var_cnt);
233 for (i = 0; i < agr.break_var_cnt; i++) /* Every break variable is cloned into the aggregate dictionary under its own name. */
234 dict_clone_var_assert (agr.dict, agr.break_vars[i],
235 var_get_name (agr.break_vars[i]));
237 /* BREAK must follow the options. */
242 lex_error (lexer, _("expecting BREAK"));
246 if (presorted && saw_direction)
247 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
248 "with (A) or (D) has no effect. Output data will be sorted "
249 "the same way as the input data."));
251 /* Read in the aggregate functions. */
252 lex_match (lexer, '/');
253 if (!parse_aggregate_functions (lexer, dict, &agr))
256 /* Delete documents. */
258 dict_clear_documents (agr.dict);
260 /* Cancel SPLIT FILE. */
261 dict_set_split_vars (agr.dict, NULL, 0);
266 if (out_file == NULL)
268 /* The active file will be replaced by the aggregated data,
269 so TEMPORARY is moot. */
270 proc_cancel_temporary_transformations (ds);
271 proc_discard_output (ds);
272 output = autopaging_writer_create (dict_get_next_value_idx (agr.dict));
276 output = any_writer_open (out_file, agr.dict);
281 input = proc_open (ds);
282 if (agr.sort != NULL && !presorted) /* The sort is skipped when PRESORTED was given. */
284 input = sort_execute (input, agr.sort);
288 for (grouper = casegrouper_create_vars (input, agr.break_vars,
290 casegrouper_get_next_group (grouper, &group);
291 casereader_destroy (group))
295 if (!casereader_peek (group, 0, &c))
297 initialize_aggregate_info (&agr, &c); /* The peeked case at index 0 of the group seeds the per-group state. */
300 for (; casereader_read (group, &c); case_destroy (&c)) /* Each case is destroyed by the loop step after it has been accumulated. */
301 accumulate_aggregate_info (&agr, &c);
302 dump_aggregate_info (&agr, output);
304 if (!casegrouper_destroy (grouper))
307 if (!proc_commit (ds))
314 if (out_file == NULL)
316 struct casereader *next_input = casewriter_make_reader (output); /* No output file: the written cases are read back as the new active file. */
317 if (next_input == NULL)
320 proc_set_active_file (ds, next_input, agr.dict);
325 ok = casewriter_destroy (output);
337 casewriter_destroy (output);
339 return CMD_CASCADING_FAILURE;
342 /* Parse all the aggregate functions.
    Reads aggregate function specifications from LEXER, looks up
    source variables in DICT, creates the target variables in AGR's
    dictionary and links one agr_var per target into AGR's agr_vars
    list.  cmd_aggregate tests the negated result, so a false return
    is presumably the failure indication (the return statements are
    not visible here). */
344 parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, struct agr_proc *agr)
346 struct agr_var *tail; /* Tail of linked list starting at agr->agr_vars. */
348 /* Parse everything. */
355 struct string function_name;
357 enum mv_class exclude;
358 const struct agr_func *function;
361 union agr_argument arg[2];
363 const struct variable **src;
376 ds_init_empty (&function_name);
378 /* Parse the list of target variables. */
379 while (!lex_match (lexer, '='))
381 size_t n_dest_prev = n_dest;
383 if (!parse_DATA_LIST_vars (lexer, &dest, &n_dest,
384 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
387 /* Assign empty labels. */
391 dest_label = xnrealloc (dest_label, n_dest, sizeof *dest_label);
392 for (j = n_dest_prev; j < n_dest; j++)
393 dest_label[j] = NULL;
398 if (lex_token (lexer) == T_STRING)
401 ds_init_string (&label, lex_tokstr (lexer));
403 ds_truncate (&label, 255);
404 dest_label[n_dest - 1] = ds_xstrdup (&label); /* A string token labels the most recently parsed target only. */
410 /* Get the name of the aggregation function. */
411 if (lex_token (lexer) != T_ID)
413 lex_error (lexer, _("expecting aggregation function"));
419 ds_assign_string (&function_name, lex_tokstr (lexer));
421 ds_chomp (&function_name, '.');
423 if (lex_tokid(lexer)[strlen (lex_tokid (lexer)) - 1] == '.')
426 for (function = agr_func_tab; function->name; function++) /* The search stops at the table entry whose name is NULL. */
427 if (!strcasecmp (function->name, ds_cstr (&function_name)))
429 if (NULL == function->name)
431 msg (SE, _("Unknown aggregation function %s."),
432 ds_cstr (&function_name));
435 ds_destroy (&function_name);
436 func_index = function - agr_func_tab; /* The table offset doubles as the function code. */
439 /* Check for leading lparen. */
440 if (!lex_match (lexer, '('))
443 func_index = N_NO_VARS;
444 else if (func_index == NU)
445 func_index = NU_NO_VARS;
448 lex_error (lexer, _("expecting `('"));
454 /* Parse list of source variables. */
456 int pv_opts = PV_NO_SCRATCH;
458 if (func_index == SUM || func_index == MEAN || func_index == SD)
459 pv_opts |= PV_NUMERIC;
460 else if (function->n_args)
461 pv_opts |= PV_SAME_TYPE;
463 if (!parse_variables_const (lexer, dict, &src, &n_src, pv_opts))
467 /* Parse function arguments, for those functions that
468 require arguments. */
469 if (function->n_args != 0)
470 for (i = 0; i < function->n_args; i++)
474 lex_match (lexer, ',');
475 if (lex_token (lexer) == T_STRING)
477 arg[i].c = ds_xstrdup (lex_tokstr (lexer));
480 else if (lex_is_number (lexer))
482 arg[i].f = lex_tokval (lexer);
487 msg (SE, _("Missing argument %d to %s."),
488 (int) i + 1, function->name);
494 if (type != var_get_type (src[0]))
496 msg (SE, _("Arguments to %s must be of same type as "
497 "source variables."),
503 /* Trailing rparen. */
504 if (!lex_match (lexer, ')'))
506 lex_error (lexer, _("expecting `)'"));
510 /* Now check that the number of source variables matches
511 the number of target variables. If we check earlier
512 than this, the user can get a very misleading error
513 message, e.g. `AGGREGATE x=SUM(y t).' will get this
514 error message when a proper message would be more
515 like `unknown variable t'. */
518 msg (SE, _("Number of source variables (%u) does not match "
519 "number of target variables (%u)."),
520 (unsigned) n_src, (unsigned) n_dest);
524 if ((func_index == PIN || func_index == POUT
525 || func_index == FIN || func_index == FOUT)
526 && (var_is_numeric (src[0])
527 ? arg[0].f > arg[1].f
528 : str_compare_rpad (arg[0].c, arg[1].c) > 0))
530 union agr_argument t = arg[0];
534 msg (SW, _("The value arguments passed to the %s function "
535 "are out-of-order. They will be treated as if "
536 "they had been specified in the correct order."),
541 /* Finally add these to the linked list of aggregation
543 for (i = 0; i < n_dest; i++)
545 struct agr_var *v = xmalloc (sizeof *v);
547 /* Add variable to chain. */
548 if (agr->agr_vars != NULL)
556 /* Create the target variable in the aggregate
559 struct variable *destvar;
561 v->function = func_index;
567 if (var_is_alpha (src[i]))
569 v->function |= FSTRING;
570 v->string = xmalloc (var_get_width (src[i]));
573 if (function->alpha_type == VAR_STRING)
574 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
577 assert (var_is_numeric (v->src)
578 || function->alpha_type == VAR_NUMERIC);
579 destvar = dict_create_var (agr->dict, dest[i], 0);
583 if ((func_index == N || func_index == NMISS)
584 && dict_get_weight (dict) != NULL)
585 f = fmt_for_output (FMT_F, 8, 2);
587 f = function->format;
588 var_set_both_formats (destvar, &f);
594 destvar = dict_create_var (agr->dict, dest[i], 0);
595 if (func_index == N_NO_VARS && dict_get_weight (dict) != NULL)
596 f = fmt_for_output (FMT_F, 8, 2);
598 f = function->format;
599 var_set_both_formats (destvar, &f);
604 msg (SE, _("Variable name %s is not unique within the "
605 "aggregate file dictionary, which contains "
606 "the aggregate variables and the break "
614 var_set_label (destvar, dest_label[i]);
619 v->exclude = exclude;
625 if (var_is_numeric (v->src))
626 for (j = 0; j < function->n_args; j++)
627 v->arg[j].f = arg[j].f;
629 for (j = 0; j < function->n_args; j++)
630 v->arg[j].c = xstrdup (arg[j].c);
634 if (src != NULL && var_is_alpha (src[0]))
635 for (i = 0; i < function->n_args; i++)
645 if (!lex_match (lexer, '/'))
647 if (lex_token (lexer) == '.')
650 lex_error (lexer, "expecting end of command");
656 ds_destroy (&function_name);
657 for (i = 0; i < n_dest; i++)
660 free (dest_label[i]);
666 if (src && n_src && var_is_alpha (src[0]))
667 for (i = 0; i < function->n_args; i++)
/* Releases resources held by AGR: the sort criteria, the break
   variable array, the saved break case, per-variable state reachable
   from the agr_vars list, and the aggregate dictionary when one
   exists. */
680 agr_destroy (struct agr_proc *agr)
682 struct agr_var *iter, *next;
684 case_ordering_destroy (agr->sort);
685 free (agr->break_vars);
686 case_destroy (&agr->break_case);
687 for (iter = agr->agr_vars; iter; iter = next)
691 if (iter->function & FSTRING)
696 n_args = agr_func_tab[iter->function & FUNC].n_args; /* FUNC masks off the FSTRING bit to recover the table index. */
697 for (i = 0; i < n_args; i++)
698 free (iter->arg[i].c); /* String arguments are heap copies made during parsing. */
701 else if (iter->function == SD)
702 moments1_destroy (iter->moments);
705 if (agr->dict != NULL)
706 dict_destroy (agr->dict);
711 /* Accumulates aggregation data from the case INPUT.
    Walks every aggregate variable in AGR's list and folds the value
    of its source variable in INPUT into that variable's running
    state, using the case weight obtained through AGR's source
    dictionary. */
713 accumulate_aggregate_info (struct agr_proc *agr, const struct ccase *input)
715 struct agr_var *iter;
717 bool bad_warn = true;
719 weight = dict_get_case_weight (agr->src_dict, input, &bad_warn);
721 for (iter = agr->agr_vars; iter; iter = iter->next)
724 const union value *v = case_data (input, iter->src);
725 int src_width = var_get_width (iter->src);
727 if (var_is_value_missing (iter->src, v, iter->exclude)) /* Missing under this variable's exclusion classes. */
729 switch (iter->function)
732 case NMISS | FSTRING:
733 iter->dbl[0] += weight;
736 case NUMISS | FSTRING:
740 iter->saw_missing = true; /* dump_aggregate_info consults this flag under COLUMNWISE treatment. */
744 /* This is horrible. There are too many possibilities. */
745 switch (iter->function)
748 iter->dbl[0] += v->f * weight;
752 iter->dbl[0] += v->f * weight; /* dbl[0] holds the weighted sum of values... */
753 iter->dbl[1] += weight; /* ...and dbl[1] the sum of weights. */
756 moments1_add (iter->moments, v->f, weight);
759 iter->dbl[0] = MAX (iter->dbl[0], v->f);
763 if (memcmp (iter->string, v->s, src_width) < 0) /* Stored string compares below the new value: replace it. */
764 memcpy (iter->string, v->s, src_width);
768 iter->dbl[0] = MIN (iter->dbl[0], v->f);
772 if (memcmp (iter->string, v->s, src_width) > 0) /* Stored string compares above the new value: replace it. */
773 memcpy (iter->string, v->s, src_width);
778 if (v->f > iter->arg[0].f) /* In the threshold tests below, dbl[0] counts matching weight and dbl[1] total weight. */
779 iter->dbl[0] += weight;
780 iter->dbl[1] += weight;
784 if (memcmp (iter->arg[0].c, v->s, src_width) < 0)
785 iter->dbl[0] += weight;
786 iter->dbl[1] += weight;
790 if (v->f < iter->arg[0].f)
791 iter->dbl[0] += weight;
792 iter->dbl[1] += weight;
796 if (memcmp (iter->arg[0].c, v->s, src_width) > 0)
797 iter->dbl[0] += weight;
798 iter->dbl[1] += weight;
802 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f) /* Inclusive range test. */
803 iter->dbl[0] += weight;
804 iter->dbl[1] += weight;
808 if (memcmp (iter->arg[0].c, v->s, src_width) <= 0
809 && memcmp (iter->arg[1].c, v->s, src_width) >= 0)
810 iter->dbl[0] += weight;
811 iter->dbl[1] += weight;
815 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f) /* Strictly outside the range. */
816 iter->dbl[0] += weight;
817 iter->dbl[1] += weight;
821 if (memcmp (iter->arg[0].c, v->s, src_width) > 0
822 || memcmp (iter->arg[1].c, v->s, src_width) < 0)
823 iter->dbl[0] += weight;
824 iter->dbl[1] += weight;
828 iter->dbl[0] += weight;
841 case FIRST | FSTRING:
844 memcpy (iter->string, v->s, src_width);
853 memcpy (iter->string, v->s, src_width);
857 case NMISS | FSTRING:
859 case NUMISS | FSTRING:
860 /* Our value is not missing or it would have been
861 caught earlier. Nothing to do. */
867 switch (iter->function)
870 iter->dbl[0] += weight;
881 /* Writes an aggregated record to OUTPUT.
    Creates a case sized for AGR's dictionary, copies the break
    variable values out of AGR's saved break case, stores the result
    of every aggregate variable into its target variable, and hands
    the case to OUTPUT. */
883 dump_aggregate_info (struct agr_proc *agr, struct casewriter *output)
887 case_create (&c, dict_get_next_value_idx (agr->dict));
893 for (i = 0; i < agr->break_var_cnt; i++)
895 const struct variable *v = agr->break_vars[i];
896 size_t value_cnt = var_get_value_cnt (v);
897 memcpy (case_data_rw_idx (&c, value_idx),
898 case_data (&agr->break_case, v),
899 sizeof (union value) * value_cnt);
900 value_idx += value_cnt; /* Break values are packed one after another in the output case. */
907 for (i = agr->agr_vars; i; i = i->next)
909 union value *v = case_data_rw (&c, i->dest);
911 if (agr->missing == COLUMNWISE && i->saw_missing /* COLUMNWISE: a missing source value in the group overrides the result, */
912 && (i->function & FUNC) != N && (i->function & FUNC) != NU /* except for the counting functions listed here. */
913 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
915 if (var_is_alpha (i->dest))
916 memset (v->s, ' ', var_get_width (i->dest)); /* String targets are filled with spaces. */
925 v->f = i->int1 ? i->dbl[0] : SYSMIS;
928 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS; /* SYSMIS instead of dividing by a zero weight total. */
934 /* FIXME: we should use two passes. */
935 moments1_calculate (i->moments, NULL, NULL, &variance,
937 if (variance != SYSMIS)
938 v->f = sqrt (variance);
945 v->f = i->int1 ? i->dbl[0] : SYSMIS;
950 memcpy (v->s, i->string, var_get_width (i->dest));
952 memset (v->s, ' ', var_get_width (i->dest));
962 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
972 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS; /* Same ratio scaled by 100. */
984 v->f = i->int1 ? i->dbl[0] : SYSMIS;
986 case FIRST | FSTRING:
989 memcpy (v->s, i->string, var_get_width (i->dest));
991 memset (v->s, ' ', var_get_width (i->dest));
1000 case NMISS | FSTRING:
1004 case NUMISS | FSTRING:
1013 casewriter_write (output, &c);
1016 /* Resets the state for all the aggregate functions.
     Also replaces AGR's saved break case with a clone of INPUT. */
1018 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1020 struct agr_var *iter;
1022 case_destroy (&agr->break_case);
1023 case_clone (&agr->break_case, input);
1025 for (iter = agr->agr_vars; iter; iter = iter->next)
1027 iter->saw_missing = false;
1028 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1029 iter->int1 = iter->int2 = 0;
1030 switch (iter->function)
1033 iter->dbl[0] = DBL_MAX; /* Presumably the seed for a running minimum (case label not shown). */
1036 memset (iter->string, 255, var_get_width (iter->src)); /* Every byte set to 255 over the source width. */
1039 iter->dbl[0] = -DBL_MAX; /* Presumably the seed for a running maximum (case label not shown). */
1042 memset (iter->string, 0, var_get_width (iter->src)); /* Every byte set to 0 over the source width. */
1045 if (iter->moments == NULL) /* Allocated once, then cleared for each later group. */
1046 iter->moments = moments1_create (MOMENT_VARIANCE);
1048 moments1_clear (iter->moments);