1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
42 /* Specifies how to make one aggregate variable: a source variable,
   an aggregation function, and a target variable.  Instances form a
   singly linked list through NEXT; struct agr_proc's agr_vars member
   points at the first one. */
45 struct agr_var *next; /* Next in list; null ends the list. */
47 /* Collected during parsing. */
48 struct variable *src; /* Source variable. */
49 struct variable *dest; /* Target variable; the result is stored at dest->fv. */
50 int function; /* Function code, ORed with FSTRING for ALPHA sources. */
51 int include_missing; /* 1=Include user-missing values. */
52 union value arg[2]; /* Arguments: .f if the source is NUMERIC, else .c. */
54 /* Accumulated during AGGREGATE execution. */
59 struct moments1 *moments; /* Moments accumulator used by the SD function. */
62 /* Aggregation functions.  The function values are also used as
   indexes into agr_func_tab.  N_AGR_FUNCS is the index of that
   table's null-name terminator row.  N_NO_VARS and NU_NO_VARS are
   substituted by the parser (presumably for N and NU respectively)
   when no left parenthesis follows the function name. */
65 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
66 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
67 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
68 FUNC = 0x1f, /* Function mask: extracts the function code. */
69 FSTRING = 1<<5, /* String function bit, set for ALPHA sources. */
72 /* Attributes of an aggregation function: one row of agr_func_tab. */
75 const char *name; /* Aggregation function name; null in the terminator row. */
76 int n_args; /* Number of arguments. */
77 int alpha_type; /* When given ALPHA arguments, output type
                     (ALPHA, NUMERIC, or -1 in the table). */
78 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
81 /* Attributes of aggregation functions, indexed by the aggregation
   function codes.  The by-name search in parse_aggregate_functions
   stops at the row whose name is null, so the two rows after it can
   only be reached by index (N_NO_VARS and NU_NO_VARS). */
82 static const struct agr_func agr_func_tab[] =
84 {"<NONE>", 0, -1, {0, 0, 0}},
85 {"SUM", 0, -1, {FMT_F, 8, 2}},
86 {"MEAN", 0, -1, {FMT_F, 8, 2}},
87 {"SD", 0, -1, {FMT_F, 8, 2}},
88 {"MAX", 0, ALPHA, {-1, -1, -1}},
89 {"MIN", 0, ALPHA, {-1, -1, -1}},
90 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
91 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
93 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
95 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
97 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
99 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"FIRST", 0, ALPHA, {-1, -1, -1}},
103 {"LAST", 0, ALPHA, {-1, -1, -1}},
104 {NULL, 0, -1, {-1, -1, -1}}, /* Index N_AGR_FUNCS: ends name lookup. */
105 {"N", 0, NUMERIC, {FMT_F, 7, 0}}, /* Index N_NO_VARS. */
106 {"NU", 0, NUMERIC, {FMT_F, 7, 0}}, /* Index NU_NO_VARS. */
109 /* Missing value types.  The AGGREGATE parser starts with ITEMWISE
   and switches to COLUMNWISE when the MISSING subcommand names it. */
110 enum missing_treatment
112 ITEMWISE, /* Missing values item by item. */
113 COLUMNWISE /* Missing values column by column. */
116 /* An entire AGGREGATE procedure: parsed options, the aggregate
   dictionary, and the running state used while cases are processed. */
119 /* We have either an output file or a sink. */
120 struct sfm_writer *writer; /* Output file, or null if none. */
121 struct case_sink *sink; /* Sink, or null if none. */
123 /* Break variables. */
124 struct sort_criteria *sort; /* Sort criteria. */
125 struct variable **break_vars; /* Break variables. */
126 size_t break_var_cnt; /* Number of break variables. */
127 struct ccase break_case; /* Last values of break variables: a clone of
                             the case that started the current group. */
129 enum missing_treatment missing; /* How to treat missing values. */
130 struct agr_var *agr_vars; /* First aggregate variable. */
131 struct dictionary *dict; /* Aggregate dictionary. */
132 int case_cnt; /* Counts aggregated cases; incremented once per input case. */
133 struct ccase agr_case; /* Aggregate case for output. */
/* Forward declarations of static helpers defined later in this file. */
136 static void initialize_aggregate_info (struct agr_proc *,
137 const struct ccase *);
140 static int parse_aggregate_functions (struct agr_proc *);
141 static void agr_destroy (struct agr_proc *);
142 static int aggregate_single_case (struct agr_proc *agr,
143 const struct ccase *input,
144 struct ccase *output);
145 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
147 /* Aggregating to the active file: case callback passed to procedure(). */
148 static int agr_to_active_file (struct ccase *, void *aux);
150 /* Aggregating to a system file: case callback passed to procedure()
   when the active file is already sorted. */
151 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
155 /* Parses and executes the AGGREGATE procedure.

   Parsing order enforced below: OUTFILE first (an asterisk selects
   the active file, otherwise a file handle is parsed), then any of
   MISSING (only COLUMNWISE is accepted), DOCUMENT, and PRESORTED,
   then BREAK, then the aggregate functions.

   Execution then takes one of two routes: with no output file the
   aggregated cases go to a storage sink that becomes the new active
   file source; otherwise they are written to a system file, sorting
   the active file first unless PRESORTED was given. */
160 struct file_handle *out_file = NULL;
162 bool copy_documents = false;
163 bool presorted = false;
166 memset(&agr, 0 , sizeof (agr));
167 agr.missing = ITEMWISE;
168 case_nullify (&agr.break_case);
/* The aggregate dictionary starts with the active file's label and
   documents. */
170 agr.dict = dict_create ();
171 dict_set_label (agr.dict, dict_get_label (default_dict));
172 dict_set_documents (agr.dict, dict_get_documents (default_dict));
174 /* OUTFILE subcommand must be first. */
175 if (!lex_force_match_id ("OUTFILE"))
178 if (!lex_match ('*'))
180 out_file = fh_parse ();
181 if (out_file == NULL)
185 /* Read most of the subcommands. */
190 if (lex_match_id ("MISSING"))
193 if (!lex_match_id ("COLUMNWISE"))
195 lex_error (_("while expecting COLUMNWISE"));
198 agr.missing = COLUMNWISE;
200 else if (lex_match_id ("DOCUMENT"))
201 copy_documents = true;
202 else if (lex_match_id ("PRESORTED"))
204 else if (lex_match_id ("BREAK"))
/* Parse the break variables as sort criteria, then clone each break
   variable into the aggregate dictionary. */
209 agr.sort = sort_parse_criteria (default_dict,
210 &agr.break_vars, &agr.break_var_cnt,
212 if (agr.sort == NULL)
215 for (i = 0; i < agr.break_var_cnt; i++)
217 struct variable *v = dict_clone_var (agr.dict, agr.break_vars[i],
218 agr.break_vars[i]->name,
219 agr.break_vars[i]->longname
224 /* BREAK must follow the options. */
229 lex_error (_("expecting BREAK"));
233 if (presorted && saw_direction)
234 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
235 "with (A) or (D) has no effect. Output data will be sorted "
236 "the same way as the input data."));
238 /* Read in the aggregate functions. */
240 if (!parse_aggregate_functions (&agr))
243 /* Delete documents. */
245 dict_set_documents (agr.dict, NULL);
247 /* Cancel SPLIT FILE. */
248 dict_set_split_vars (agr.dict, NULL, 0);
/* Create the output case, sized by dict_get_next_value_idx (agr.dict). */
252 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
254 /* Output to active file or external file? */
255 if (out_file == NULL)
257 /* The active file will be replaced by the aggregated data,
258 so TEMPORARY is moot. */
261 if (agr.sort != NULL && !presorted)
262 sort_active_file_in_place (agr.sort);
/* Aggregated cases are collected in a storage sink; the procedure's
   own output goes to a null sink. */
264 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
265 if (agr.sink->class->open != NULL)
266 agr.sink->class->open (agr.sink);
267 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
268 procedure (agr_to_active_file, &agr);
/* aggregate_single_case only emits a group when the next group
   begins, so the final group is written out here. */
269 if (agr.case_cnt > 0)
271 dump_aggregate_info (&agr, &agr.agr_case);
272 agr.sink->class->write (agr.sink, &agr.agr_case);
/* Replace the default dictionary with the aggregate dictionary and
   turn the sink into the new active file source. */
274 dict_destroy (default_dict);
275 default_dict = agr.dict;
277 vfm_source = agr.sink->class->make_source (agr.sink);
278 free_case_sink (agr.sink);
282 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression (), 0);
283 if (agr.writer == NULL)
286 if (agr.sort != NULL && !presorted)
288 /* Sorting is needed. */
289 struct casefile *dst;
290 struct casereader *reader;
293 dst = sort_active_file_to_casefile (agr.sort);
/* Feed the sorted cases through the aggregator one at a time. */
296 reader = casefile_get_destructive_reader (dst);
297 while (casereader_read_xfer (reader, &c))
299 if (aggregate_single_case (&agr, &c, &agr.agr_case))
300 sfm_write_case (agr.writer, &agr.agr_case);
303 casereader_destroy (reader);
304 casefile_destroy (dst);
308 /* Active file is already sorted. */
309 procedure (presorted_agr_to_sysfile, &agr);
/* Write out the final break group. */
312 if (agr.case_cnt > 0)
314 dump_aggregate_info (&agr, &agr.agr_case);
315 sfm_write_case (agr.writer, &agr.agr_case);
327 /* Parse all the aggregate functions into AGR.

   Each specification consists of a list of target variables (a
   string token after a target becomes that target's label), an
   equals sign, an aggregation function name looked up in
   agr_func_tab, and normally a parenthesized list of source
   variables followed by the function's arguments.  For every target
   variable one struct agr_var is allocated and the target variable is
   created in AGR's dictionary.  Specifications are separated by
   slashes.

   The caller treats a zero return value as failure.

   NOTE(review): the lex_error call for "expecting end of command"
   passes a bare string, while the other messages in this function
   are wrapped in _() -- probably a missing translation marker. */
329 parse_aggregate_functions (struct agr_proc *agr)
331 struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
333 /* Parse everything. */
342 const struct agr_func *function;
347 struct variable **src;
361 /* Parse the list of target variables. */
362 while (!lex_match ('='))
364 int n_dest_prev = n_dest;
366 if (!parse_DATA_LIST_vars (&dest, &n_dest,
367 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
370 /* Assign empty labels. */
374 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
375 for (j = n_dest_prev; j < n_dest; j++)
376 dest_label[j] = NULL;
/* A string token here is the label for the most recently parsed
   target; it is truncated to 255 characters. */
379 if (token == T_STRING)
381 ds_truncate (&tokstr, 255);
382 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
387 /* Get the name of the aggregation function. */
390 lex_error (_("expecting aggregation function"));
/* A trailing period on the function name is stripped before the
   table lookup. */
395 if (tokid[strlen (tokid) - 1] == '.')
398 tokid[strlen (tokid) - 1] = 0;
/* Case-insensitive search of agr_func_tab; the row with a null name
   ends the search. */
401 for (function = agr_func_tab; function->name; function++)
402 if (!strcasecmp (function->name, tokid))
404 if (NULL == function->name)
406 msg (SE, _("Unknown aggregation function %s."), tokid);
409 func_index = function - agr_func_tab;
412 /* Check for leading lparen. */
413 if (!lex_match ('('))
416 func_index = N_NO_VARS;
417 else if (func_index == NU)
418 func_index = NU_NO_VARS;
421 lex_error (_("expecting `('"));
427 /* Parse list of source variables. */
429 int pv_opts = PV_NO_SCRATCH;
/* SUM, MEAN, and SD are parsed with PV_NUMERIC; any function that
   takes arguments is parsed with PV_SAME_TYPE. */
431 if (func_index == SUM || func_index == MEAN || func_index == SD)
432 pv_opts |= PV_NUMERIC;
433 else if (function->n_args)
434 pv_opts |= PV_SAME_TYPE;
436 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
440 /* Parse function arguments, for those functions that
441 require arguments. */
442 if (function->n_args != 0)
443 for (i = 0; i < function->n_args; i++)
/* String arguments are stored as heap copies of the token text. */
448 if (token == T_STRING)
450 arg[i].c = xstrdup (ds_c_str (&tokstr));
453 else if (lex_is_number ())
458 msg (SE, _("Missing argument %d to %s."), i + 1, function->name);
464 if (type != src[0]->type)
466 msg (SE, _("Arguments to %s must be of same type as "
467 "source variables."),
473 /* Trailing rparen. */
476 lex_error (_("expecting `)'"));
480 /* Now check that the number of source variables match
481 the number of target variables. If we check earlier
482 than this, the user can get very misleading error
483 message, i.e. `AGGREGATE x=SUM(y t).' will get this
484 error message when a proper message would be more
485 like `unknown variable t'. */
488 msg (SE, _("Number of source variables (%d) does not match "
489 "number of target variables (%d)."),
/* The two-argument range functions accept their bounds in either
   order; reversed bounds are detected here and a warning is issued. */
494 if ((func_index == PIN || func_index == POUT
495 || func_index == FIN || func_index == FOUT)
496 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
497 || (src[0]->type == ALPHA
498 && st_compare_pad (arg[0].c, strlen (arg[0].c),
499 arg[1].c, strlen (arg[1].c)) > 0)))
501 union value t = arg[0];
505 msg (SW, _("The value arguments passed to the %s function "
506 "are out-of-order. They will be treated as if "
507 "they had been specified in the correct order."),
512 /* Finally add these to the linked list of aggregation
514 for (i = 0; i < n_dest; i++)
516 struct agr_var *v = xmalloc (sizeof *v);
518 /* Add variable to chain. */
519 if (agr->agr_vars != NULL)
527 /* Create the target variable in the aggregate
530 static const struct fmt_spec f8_2 = {FMT_F, 8, 2};
531 struct variable *destvar;
533 v->function = func_index;
539 if (src[i]->type == ALPHA)
541 v->function |= FSTRING;
542 v->string = xmalloc (src[i]->width);
545 if (function->alpha_type == ALPHA)
546 destvar = dict_clone_var (agr->dict, v->src, 0, dest[i] );
547 else if (v->src->type == NUMERIC
548 || function->alpha_type == NUMERIC)
550 destvar = dict_create_var (agr->dict, dest[i], 0);
553 if ((func_index == N || func_index == NMISS)
554 && dict_get_weight (default_dict) != NULL)
555 destvar->print = destvar->write = f8_2;
557 destvar->print = destvar->write = function->format;
562 destvar = dict_create_var (agr->dict, dest[i], 0);
563 if (func_index == N_NO_VARS
564 && dict_get_weight (default_dict) != NULL)
565 destvar->print = destvar->write = f8_2;
567 destvar->print = destvar->write = function->format;
572 msg (SE, _("Variable name %s is not unique within the "
573 "aggregate file dictionary, which contains "
574 "the aggregate variables and the break "
584 destvar->label = dest_label[i];
585 dest_label[i] = NULL;
591 v->include_missing = include_missing;
597 if (v->src->type == NUMERIC)
598 for (j = 0; j < function->n_args; j++)
599 v->arg[j].f = arg[j].f;
601 for (j = 0; j < function->n_args; j++)
602 v->arg[j].c = xstrdup (arg[j].c);
606 if (src != NULL && src[0]->type == ALPHA)
607 for (i = 0; i < function->n_args; i++)
617 if (!lex_match ('/'))
622 lex_error ("expecting end of command");
628 for (i = 0; i < n_dest; i++)
631 free (dest_label[i]);
637 if (src && n_src && src[0]->type == ALPHA)
638 for (i = 0; i < function->n_args; i++)
/* Releases the resources held by AGR: the system file writer, the
   sort criteria (when non-null), the break variable array, the break
   case, the per-variable state in the agr_vars list, the aggregate
   dictionary (when non-null), and the aggregate output case. */
651 agr_destroy (struct agr_proc *agr)
653 struct agr_var *iter, *next;
655 sfm_close_writer (agr->writer);
656 if (agr->sort != NULL)
657 sort_destroy_criteria (agr->sort);
658 free (agr->break_vars);
659 case_destroy (&agr->break_case);
660 for (iter = agr->agr_vars; iter; iter = next)
/* String functions own heap copies of their string arguments; the
   argument count comes from the function's agr_func_tab row. */
664 if (iter->function & FSTRING)
669 n_args = agr_func_tab[iter->function & FUNC].n_args;
670 for (i = 0; i < n_args; i++)
671 free (iter->arg[i].c);
/* SD is the function that owns a moments accumulator. */
674 else if (iter->function == SD)
675 moments1_destroy (iter->moments);
678 if (agr->dict != NULL)
679 dict_destroy (agr->dict);
681 case_destroy (&agr->agr_case);
/* Forward declarations used by aggregate_single_case.
   NOTE(review): dump_aggregate_info is also declared earlier in this
   file; the repeated declaration is compatible and harmless. */
686 static void accumulate_aggregate_info (struct agr_proc *,
687 const struct ccase *);
688 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
690 /* Processes a single case INPUT for aggregation. If output is
691 warranted, writes it to OUTPUT and returns nonzero.
692 Otherwise, returns zero and OUTPUT is unmodified.
   Output is warranted when INPUT's break variable values differ from
   those of the saved break case, i.e. when INPUT starts a new group;
   the finished group is dumped before INPUT is accumulated.  The last
   group is therefore never emitted here and must be dumped by the
   caller after the final case. */
694 aggregate_single_case (struct agr_proc *agr,
695 const struct ccase *input, struct ccase *output)
697 bool finished_group = false;
/* Very first case: just start the first group. */
699 if (agr->case_cnt++ == 0)
700 initialize_aggregate_info (agr, input);
/* Break variables changed: emit the finished group and start anew. */
701 else if (case_compare (&agr->break_case, input,
702 agr->break_vars, agr->break_var_cnt))
704 dump_aggregate_info (agr, output);
705 finished_group = true;
707 initialize_aggregate_info (agr, input);
710 accumulate_aggregate_info (agr, input);
711 return finished_group;
714 /* Accumulates aggregation data from the case INPUT.
   Every aggregate variable in AGR's list is updated from its source
   value in INPUT, using the case weight obtained from
   dict_get_case_weight.  Running totals are kept in dbl[], strings in
   the string buffer, and SD data in the moments accumulator.

   NOTE(review): the string comparisons below call memcmp on
   iter->arg[N].c for iter->src->width bytes.  The parser stores those
   arguments with xstrdup, so an argument shorter than the source
   width would be read past its terminator unless it is padded
   somewhere not shown in this function -- confirm. */
716 accumulate_aggregate_info (struct agr_proc *agr,
717 const struct ccase *input)
719 struct agr_var *iter;
723 weight = dict_get_case_weight (default_dict, input, &bad_warn);
725 for (iter = agr->agr_vars; iter; iter = iter->next)
728 const union value *v = case_data (input, iter->src->fv);
/* Missing source value: handled by the first switch below. */
730 if ((!iter->include_missing && is_missing (v, iter->src))
731 || (iter->include_missing && iter->src->type == NUMERIC
734 switch (iter->function)
737 case NMISS | FSTRING:
738 iter->dbl[0] += weight;
741 case NUMISS | FSTRING:
749 /* This is horrible. There are too many possibilities. */
750 switch (iter->function)
753 iter->dbl[0] += v->f * weight;
757 iter->dbl[0] += v->f * weight;
758 iter->dbl[1] += weight;
761 moments1_add (iter->moments, v->f, weight);
764 iter->dbl[0] = max (iter->dbl[0], v->f);
768 if (memcmp (iter->string, v->s, iter->src->width) < 0)
769 memcpy (iter->string, v->s, iter->src->width);
773 iter->dbl[0] = min (iter->dbl[0], v->f);
777 if (memcmp (iter->string, v->s, iter->src->width) > 0)
778 memcpy (iter->string, v->s, iter->src->width);
/* In the comparison functions that follow, dbl[0] accumulates the
   weight of the cases that satisfy the condition and dbl[1] the
   weight of all cases examined. */
783 if (v->f > iter->arg[0].f)
784 iter->dbl[0] += weight;
785 iter->dbl[1] += weight;
789 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
790 iter->dbl[0] += weight;
791 iter->dbl[1] += weight;
795 if (v->f < iter->arg[0].f)
796 iter->dbl[0] += weight;
797 iter->dbl[1] += weight;
801 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
802 iter->dbl[0] += weight;
803 iter->dbl[1] += weight;
807 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
808 iter->dbl[0] += weight;
809 iter->dbl[1] += weight;
813 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
814 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
815 iter->dbl[0] += weight;
816 iter->dbl[1] += weight;
820 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
821 iter->dbl[0] += weight;
822 iter->dbl[1] += weight;
826 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
827 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
828 iter->dbl[0] += weight;
829 iter->dbl[1] += weight;
833 iter->dbl[0] += weight;
846 case FIRST | FSTRING:
849 memcpy (iter->string, v->s, iter->src->width);
858 memcpy (iter->string, v->s, iter->src->width);
862 case NMISS | FSTRING:
864 case NUMISS | FSTRING:
865 /* Our value is not missing or it would have been
866 caught earlier. Nothing to do. */
872 switch (iter->function)
875 iter->dbl[0] += weight;
886 /* We've come to a record that differs from the previous in one or
887 more of the break variables. Make an output record from the
888 accumulated statistics in the OUTPUT case.
   The break variable values are copied from AGR's saved break case,
   then each aggregate variable's result is stored at its destination
   variable's position in OUTPUT. */
890 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
/* Copy the break variable values; each occupies v->nv union values. */
896 for (i = 0; i < agr->break_var_cnt; i++)
898 struct variable *v = agr->break_vars[i];
899 memcpy (case_data_rw (output, value_idx),
900 case_data (&agr->break_case, v->fv),
901 sizeof (union value) * v->nv);
/* Compute the result for each aggregate variable. */
909 for (i = agr->agr_vars; i; i = i->next)
911 union value *v = case_data_rw (output, i->dest->fv);
/* Under COLUMNWISE treatment, a variable whose missing flag is set
   gets a blank result (spaces for ALPHA destinations) unless the
   function is one of N, NU, NMISS, or NUMISS. */
913 if (agr->missing == COLUMNWISE && i->missing != 0
914 && (i->function & FUNC) != N && (i->function & FUNC) != NU
915 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
917 if (i->dest->type == ALPHA)
918 memset (v->s, ' ', i->dest->width);
927 v->f = i->int1 ? i->dbl[0] : SYSMIS;
/* Ratios yield SYSMIS when the accumulated weight in dbl[1] is zero. */
930 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
936 /* FIXME: we should use two passes. */
937 moments1_calculate (i->moments, NULL, NULL, &variance,
939 if (variance != SYSMIS)
940 v->f = sqrt (variance);
947 v->f = i->int1 ? i->dbl[0] : SYSMIS;
952 memcpy (v->s, i->string, i->dest->width);
954 memset (v->s, ' ', i->dest->width);
964 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
974 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
986 v->f = i->int1 ? i->dbl[0] : SYSMIS;
988 case FIRST | FSTRING:
991 memcpy (v->s, i->string, i->dest->width);
993 memset (v->s, ' ', i->dest->width);
1002 case NMISS | FSTRING:
1006 case NUMISS | FSTRING:
1016 /* Resets the state for all the aggregate functions and saves a
   clone of INPUT as the break case of the group that is starting. */
1018 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1020 struct agr_var *iter;
/* Replace the previous break case with a clone of INPUT. */
1022 case_destroy (&agr->break_case);
1023 case_clone (&agr->break_case, input);
1025 for (iter = agr->agr_vars; iter; iter = iter->next)
/* Zero the accumulators, then apply function-specific starting
   values. */
1028 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1029 iter->int1 = iter->int2 = 0;
1030 switch (iter->function)
1033 iter->dbl[0] = DBL_MAX;
1036 memset (iter->string, 255, iter->src->width);
1039 iter->dbl[0] = -DBL_MAX;
1042 memset (iter->string, 0, iter->src->width);
/* The moments accumulator is created on first use; an existing one
   is reset with moments1_clear. */
1045 if (iter->moments == NULL)
1046 iter->moments = moments1_create (MOMENT_VARIANCE);
1048 moments1_clear (iter->moments);
/* Case callback passed to procedure() when aggregating to the active
   file.  AGR_ is the struct agr_proc.  Case C is fed to
   aggregate_single_case; when that reports a finished break group,
   the aggregate case is written to AGR's sink. */
1056 /* Aggregate each case as it comes through. Cases which aren't needed
1059 agr_to_active_file (struct ccase *c, void *agr_)
1061 struct agr_proc *agr = agr_;
1063 if (aggregate_single_case (agr, c, &agr->agr_case))
1064 agr->sink->class->write (agr->sink, &agr->agr_case);
/* Case callback passed to procedure() when the active file is already
   sorted and output goes to a system file.  AGR_ is the struct
   agr_proc.  Case C is fed to aggregate_single_case; when that
   reports a finished break group, the aggregate case is written with
   sfm_write_case. */
1069 /* Aggregate the current case and output it if we passed a
1072 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1074 struct agr_proc *agr = agr_;
1076 if (aggregate_single_case (agr, c, &agr->agr_case))
1077 sfm_write_case (agr->writer, &agr->agr_case);