1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
42 /* Specifies how to make an aggregate variable. */
45 struct agr_var *next; /* Next in list. */
47 /* Collected during parsing. */
48 struct variable *src; /* Source variable. */
49 struct variable *dest; /* Target variable. */
50 int function; /* Function. */
51 int include_missing; /* 1=Include user-missing values. */
52 union value arg[2]; /* Arguments. */
54 /* Accumulated during AGGREGATE execution. */
59 struct moments1 *moments;
62 /* Aggregation functions. */
65 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
66 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
67 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
68 FUNC = 0x1f, /* Function mask. */
69 FSTRING = 1<<5, /* String function bit. */
72 /* Attributes of an aggregation function. */
75 const char *name; /* Aggregation function name. */
76 int n_args; /* Number of arguments. */
77 int alpha_type; /* When given ALPHA arguments, output type. */
78 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
81 /* Attributes of aggregation functions. */
82 static const struct agr_func agr_func_tab[] =
84 {"<NONE>", 0, -1, {0, 0, 0}},
85 {"SUM", 0, -1, {FMT_F, 8, 2}},
86 {"MEAN", 0, -1, {FMT_F, 8, 2}},
87 {"SD", 0, -1, {FMT_F, 8, 2}},
88 {"MAX", 0, ALPHA, {-1, -1, -1}},
89 {"MIN", 0, ALPHA, {-1, -1, -1}},
90 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
91 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
93 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
95 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
97 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
99 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"FIRST", 0, ALPHA, {-1, -1, -1}},
103 {"LAST", 0, ALPHA, {-1, -1, -1}},
104 {NULL, 0, -1, {-1, -1, -1}},
105 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
109 /* Missing value types. */
110 enum missing_treatment
112 ITEMWISE, /* Missing values item by item. */
113 COLUMNWISE /* Missing values column by column. */
116 /* An entire AGGREGATE procedure. */
119 /* We have either an output file or a sink. */
120 struct sfm_writer *writer; /* Output file, or null if none. */
121 struct case_sink *sink; /* Sink, or null if none. */
123 /* Break variables. */
124 struct sort_criteria *sort; /* Sort criteria. */
125 struct variable **break_vars; /* Break variables. */
126 size_t break_var_cnt; /* Number of break variables. */
127 struct ccase break_case; /* Last values of break variables. */
129 enum missing_treatment missing; /* How to treat missing values. */
130 struct agr_var *agr_vars; /* First aggregate variable. */
131 struct dictionary *dict; /* Aggregate dictionary. */
132 int case_cnt; /* Counts aggregated cases. */
133 struct ccase agr_case; /* Aggregate case for output. */
136 static void initialize_aggregate_info (struct agr_proc *,
137 const struct ccase *);
140 static int parse_aggregate_functions (struct agr_proc *);
141 static void agr_destroy (struct agr_proc *);
142 static int aggregate_single_case (struct agr_proc *agr,
143 const struct ccase *input,
144 struct ccase *output);
145 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
147 /* Aggregating to the active file. */
148 static int agr_to_active_file (struct ccase *, void *aux);
150 /* Aggregating to a system file. */
151 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
155 /* Parses and executes the AGGREGATE procedure. */
160 struct file_handle *out_file = NULL;
162 bool copy_documents = false;
163 bool presorted = false;
166 memset(&agr, 0 , sizeof (agr));
167 agr.missing = ITEMWISE;
168 case_nullify (&agr.break_case);
170 agr.dict = dict_create ();
171 dict_set_label (agr.dict, dict_get_label (default_dict));
172 dict_set_documents (agr.dict, dict_get_documents (default_dict));
174 /* OUTFILE subcommand must be first. */
175 if (!lex_force_match_id ("OUTFILE"))
178 if (!lex_match ('*'))
180 out_file = fh_parse ();
181 if (out_file == NULL)
185 /* Read most of the subcommands. */
190 if (lex_match_id ("MISSING"))
193 if (!lex_match_id ("COLUMNWISE"))
195 lex_error (_("while expecting COLUMNWISE"));
198 agr.missing = COLUMNWISE;
200 else if (lex_match_id ("DOCUMENT"))
201 copy_documents = true;
202 else if (lex_match_id ("PRESORTED"))
204 else if (lex_match_id ("BREAK"))
209 agr.sort = sort_parse_criteria (default_dict,
210 &agr.break_vars, &agr.break_var_cnt,
212 if (agr.sort == NULL)
215 for (i = 0; i < agr.break_var_cnt; i++)
216 dict_clone_var_assert (agr.dict, agr.break_vars[i],
217 agr.break_vars[i]->name);
219 /* BREAK must follow the options. */
224 lex_error (_("expecting BREAK"));
228 if (presorted && saw_direction)
229 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
230 "with (A) or (D) has no effect. Output data will be sorted "
231 "the same way as the input data."));
233 /* Read in the aggregate functions. */
235 if (!parse_aggregate_functions (&agr))
238 /* Delete documents. */
240 dict_set_documents (agr.dict, NULL);
242 /* Cancel SPLIT FILE. */
243 dict_set_split_vars (agr.dict, NULL, 0);
247 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
249 /* Output to active file or external file? */
250 if (out_file == NULL)
252 /* The active file will be replaced by the aggregated data,
253 so TEMPORARY is moot. */
256 if (agr.sort != NULL && !presorted)
257 sort_active_file_in_place (agr.sort);
259 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
260 if (agr.sink->class->open != NULL)
261 agr.sink->class->open (agr.sink);
262 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
263 procedure (agr_to_active_file, &agr);
264 if (agr.case_cnt > 0)
266 dump_aggregate_info (&agr, &agr.agr_case);
267 agr.sink->class->write (agr.sink, &agr.agr_case);
269 dict_destroy (default_dict);
270 default_dict = agr.dict;
272 vfm_source = agr.sink->class->make_source (agr.sink);
273 free_case_sink (agr.sink);
277 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression (), 0);
278 if (agr.writer == NULL)
281 if (agr.sort != NULL && !presorted)
283 /* Sorting is needed. */
284 struct casefile *dst;
285 struct casereader *reader;
288 dst = sort_active_file_to_casefile (agr.sort);
291 reader = casefile_get_destructive_reader (dst);
292 while (casereader_read_xfer (reader, &c))
294 if (aggregate_single_case (&agr, &c, &agr.agr_case))
295 sfm_write_case (agr.writer, &agr.agr_case);
298 casereader_destroy (reader);
299 casefile_destroy (dst);
303 /* Active file is already sorted. */
304 procedure (presorted_agr_to_sysfile, &agr);
307 if (agr.case_cnt > 0)
309 dump_aggregate_info (&agr, &agr.agr_case);
310 sfm_write_case (agr.writer, &agr.agr_case);
322 /* Parse all the aggregate functions. */
324 parse_aggregate_functions (struct agr_proc *agr)
326 struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
328 /* Parse everything. */
337 const struct agr_func *function;
342 struct variable **src;
356 /* Parse the list of target variables. */
357 while (!lex_match ('='))
359 int n_dest_prev = n_dest;
361 if (!parse_DATA_LIST_vars (&dest, &n_dest,
362 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
365 /* Assign empty labels. */
369 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
370 for (j = n_dest_prev; j < n_dest; j++)
371 dest_label[j] = NULL;
374 if (token == T_STRING)
376 ds_truncate (&tokstr, 255);
377 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
382 /* Get the name of the aggregation function. */
385 lex_error (_("expecting aggregation function"));
390 if (tokid[strlen (tokid) - 1] == '.')
393 tokid[strlen (tokid) - 1] = 0;
396 for (function = agr_func_tab; function->name; function++)
397 if (!strcasecmp (function->name, tokid))
399 if (NULL == function->name)
401 msg (SE, _("Unknown aggregation function %s."), tokid);
404 func_index = function - agr_func_tab;
407 /* Check for leading lparen. */
408 if (!lex_match ('('))
411 func_index = N_NO_VARS;
412 else if (func_index == NU)
413 func_index = NU_NO_VARS;
416 lex_error (_("expecting `('"));
422 /* Parse list of source variables. */
424 int pv_opts = PV_NO_SCRATCH;
426 if (func_index == SUM || func_index == MEAN || func_index == SD)
427 pv_opts |= PV_NUMERIC;
428 else if (function->n_args)
429 pv_opts |= PV_SAME_TYPE;
431 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
435 /* Parse function arguments, for those functions that
436 require arguments. */
437 if (function->n_args != 0)
438 for (i = 0; i < function->n_args; i++)
443 if (token == T_STRING)
445 arg[i].c = xstrdup (ds_c_str (&tokstr));
448 else if (lex_is_number ())
453 msg (SE, _("Missing argument %d to %s."), i + 1,
460 if (type != src[0]->type)
462 msg (SE, _("Arguments to %s must be of same type as "
463 "source variables."),
469 /* Trailing rparen. */
472 lex_error (_("expecting `)'"));
476 /* Now check that the number of source variables match
477 the number of target variables. If we check earlier
478 than this, the user can get very misleading error
479 message, i.e. `AGGREGATE x=SUM(y t).' will get this
480 error message when a proper message would be more
481 like `unknown variable t'. */
484 msg (SE, _("Number of source variables (%d) does not match "
485 "number of target variables (%d)."),
490 if ((func_index == PIN || func_index == POUT
491 || func_index == FIN || func_index == FOUT)
492 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
493 || (src[0]->type == ALPHA
494 && st_compare_pad (arg[0].c, strlen (arg[0].c),
495 arg[1].c, strlen (arg[1].c)) > 0)))
497 union value t = arg[0];
501 msg (SW, _("The value arguments passed to the %s function "
502 "are out-of-order. They will be treated as if "
503 "they had been specified in the correct order."),
508 /* Finally add these to the linked list of aggregation
510 for (i = 0; i < n_dest; i++)
512 struct agr_var *v = xmalloc (sizeof *v);
514 /* Add variable to chain. */
515 if (agr->agr_vars != NULL)
523 /* Create the target variable in the aggregate
526 static const struct fmt_spec f8_2 = {FMT_F, 8, 2};
527 struct variable *destvar;
529 v->function = func_index;
535 if (src[i]->type == ALPHA)
537 v->function |= FSTRING;
538 v->string = xmalloc (src[i]->width);
541 if (function->alpha_type == ALPHA)
542 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
545 assert (v->src->type == NUMERIC
546 || function->alpha_type == NUMERIC);
547 destvar = dict_create_var (agr->dict, dest[i], 0);
550 if ((func_index == N || func_index == NMISS)
551 && dict_get_weight (default_dict) != NULL)
552 destvar->print = destvar->write = f8_2;
554 destvar->print = destvar->write = function->format;
559 destvar = dict_create_var (agr->dict, dest[i], 0);
560 if (func_index == N_NO_VARS
561 && dict_get_weight (default_dict) != NULL)
562 destvar->print = destvar->write = f8_2;
564 destvar->print = destvar->write = function->format;
569 msg (SE, _("Variable name %s is not unique within the "
570 "aggregate file dictionary, which contains "
571 "the aggregate variables and the break "
581 destvar->label = dest_label[i];
582 dest_label[i] = NULL;
588 v->include_missing = include_missing;
594 if (v->src->type == NUMERIC)
595 for (j = 0; j < function->n_args; j++)
596 v->arg[j].f = arg[j].f;
598 for (j = 0; j < function->n_args; j++)
599 v->arg[j].c = xstrdup (arg[j].c);
603 if (src != NULL && src[0]->type == ALPHA)
604 for (i = 0; i < function->n_args; i++)
614 if (!lex_match ('/'))
619 lex_error ("expecting end of command");
625 for (i = 0; i < n_dest; i++)
628 free (dest_label[i]);
634 if (src && n_src && src[0]->type == ALPHA)
635 for (i = 0; i < function->n_args; i++)
648 agr_destroy (struct agr_proc *agr)
650 struct agr_var *iter, *next;
652 sfm_close_writer (agr->writer);
653 if (agr->sort != NULL)
654 sort_destroy_criteria (agr->sort);
655 free (agr->break_vars);
656 case_destroy (&agr->break_case);
657 for (iter = agr->agr_vars; iter; iter = next)
661 if (iter->function & FSTRING)
666 n_args = agr_func_tab[iter->function & FUNC].n_args;
667 for (i = 0; i < n_args; i++)
668 free (iter->arg[i].c);
671 else if (iter->function == SD)
672 moments1_destroy (iter->moments);
675 if (agr->dict != NULL)
676 dict_destroy (agr->dict);
678 case_destroy (&agr->agr_case);
683 static void accumulate_aggregate_info (struct agr_proc *,
684 const struct ccase *);
685 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
687 /* Processes a single case INPUT for aggregation. If output is
688 warranted, writes it to OUTPUT and returns nonzero.
689 Otherwise, returns zero and OUTPUT is unmodified. */
691 aggregate_single_case (struct agr_proc *agr,
692 const struct ccase *input, struct ccase *output)
694 bool finished_group = false;
696 if (agr->case_cnt++ == 0)
697 initialize_aggregate_info (agr, input);
698 else if (case_compare (&agr->break_case, input,
699 agr->break_vars, agr->break_var_cnt))
701 dump_aggregate_info (agr, output);
702 finished_group = true;
704 initialize_aggregate_info (agr, input);
707 accumulate_aggregate_info (agr, input);
708 return finished_group;
711 /* Accumulates aggregation data from the case INPUT. */
713 accumulate_aggregate_info (struct agr_proc *agr,
714 const struct ccase *input)
716 struct agr_var *iter;
720 weight = dict_get_case_weight (default_dict, input, &bad_warn);
722 for (iter = agr->agr_vars; iter; iter = iter->next)
725 const union value *v = case_data (input, iter->src->fv);
727 if ((!iter->include_missing && is_missing (v, iter->src))
728 || (iter->include_missing && iter->src->type == NUMERIC
731 switch (iter->function)
734 case NMISS | FSTRING:
735 iter->dbl[0] += weight;
738 case NUMISS | FSTRING:
746 /* This is horrible. There are too many possibilities. */
747 switch (iter->function)
750 iter->dbl[0] += v->f * weight;
754 iter->dbl[0] += v->f * weight;
755 iter->dbl[1] += weight;
758 moments1_add (iter->moments, v->f, weight);
761 iter->dbl[0] = max (iter->dbl[0], v->f);
765 if (memcmp (iter->string, v->s, iter->src->width) < 0)
766 memcpy (iter->string, v->s, iter->src->width);
770 iter->dbl[0] = min (iter->dbl[0], v->f);
774 if (memcmp (iter->string, v->s, iter->src->width) > 0)
775 memcpy (iter->string, v->s, iter->src->width);
780 if (v->f > iter->arg[0].f)
781 iter->dbl[0] += weight;
782 iter->dbl[1] += weight;
786 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
787 iter->dbl[0] += weight;
788 iter->dbl[1] += weight;
792 if (v->f < iter->arg[0].f)
793 iter->dbl[0] += weight;
794 iter->dbl[1] += weight;
798 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
799 iter->dbl[0] += weight;
800 iter->dbl[1] += weight;
804 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
805 iter->dbl[0] += weight;
806 iter->dbl[1] += weight;
810 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
811 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
812 iter->dbl[0] += weight;
813 iter->dbl[1] += weight;
817 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
818 iter->dbl[0] += weight;
819 iter->dbl[1] += weight;
823 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
824 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
825 iter->dbl[0] += weight;
826 iter->dbl[1] += weight;
830 iter->dbl[0] += weight;
843 case FIRST | FSTRING:
846 memcpy (iter->string, v->s, iter->src->width);
855 memcpy (iter->string, v->s, iter->src->width);
859 case NMISS | FSTRING:
861 case NUMISS | FSTRING:
862 /* Our value is not missing or it would have been
863 caught earlier. Nothing to do. */
869 switch (iter->function)
872 iter->dbl[0] += weight;
883 /* We've come to a record that differs from the previous in one or
884 more of the break variables. Make an output record from the
885 accumulated statistics in the OUTPUT case. */
887 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
893 for (i = 0; i < agr->break_var_cnt; i++)
895 struct variable *v = agr->break_vars[i];
896 memcpy (case_data_rw (output, value_idx),
897 case_data (&agr->break_case, v->fv),
898 sizeof (union value) * v->nv);
906 for (i = agr->agr_vars; i; i = i->next)
908 union value *v = case_data_rw (output, i->dest->fv);
910 if (agr->missing == COLUMNWISE && i->missing != 0
911 && (i->function & FUNC) != N && (i->function & FUNC) != NU
912 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
914 if (i->dest->type == ALPHA)
915 memset (v->s, ' ', i->dest->width);
924 v->f = i->int1 ? i->dbl[0] : SYSMIS;
927 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
933 /* FIXME: we should use two passes. */
934 moments1_calculate (i->moments, NULL, NULL, &variance,
936 if (variance != SYSMIS)
937 v->f = sqrt (variance);
944 v->f = i->int1 ? i->dbl[0] : SYSMIS;
949 memcpy (v->s, i->string, i->dest->width);
951 memset (v->s, ' ', i->dest->width);
961 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
971 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
983 v->f = i->int1 ? i->dbl[0] : SYSMIS;
985 case FIRST | FSTRING:
988 memcpy (v->s, i->string, i->dest->width);
990 memset (v->s, ' ', i->dest->width);
999 case NMISS | FSTRING:
1003 case NUMISS | FSTRING:
1013 /* Resets the state for all the aggregate functions. */
1015 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1017 struct agr_var *iter;
1019 case_destroy (&agr->break_case);
1020 case_clone (&agr->break_case, input);
1022 for (iter = agr->agr_vars; iter; iter = iter->next)
1025 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1026 iter->int1 = iter->int2 = 0;
1027 switch (iter->function)
1030 iter->dbl[0] = DBL_MAX;
1033 memset (iter->string, 255, iter->src->width);
1036 iter->dbl[0] = -DBL_MAX;
1039 memset (iter->string, 0, iter->src->width);
1042 if (iter->moments == NULL)
1043 iter->moments = moments1_create (MOMENT_VARIANCE);
1045 moments1_clear (iter->moments);
1053 /* Aggregate each case as it comes through. Cases which aren't needed
1056 agr_to_active_file (struct ccase *c, void *agr_)
1058 struct agr_proc *agr = agr_;
1060 if (aggregate_single_case (agr, c, &agr->agr_case))
1061 agr->sink->class->write (agr->sink, &agr->agr_case);
1066 /* Aggregate the current case and output it if we passed a
1069 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1071 struct agr_proc *agr = agr_;
1073 if (aggregate_single_case (agr, c, &agr->agr_case))
1074 sfm_write_case (agr->writer, &agr->agr_case);