1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
42 /* Specifies how to make an aggregate variable. */
45 struct agr_var *next; /* Next in list. */
47 /* Collected during parsing. */
48 struct variable *src; /* Source variable. */
49 struct variable *dest; /* Target variable. */
50 int function; /* Function. */
51 int include_missing; /* 1=Include user-missing values. */
52 union value arg[2]; /* Arguments. */
54 /* Accumulated during AGGREGATE execution. */
59 struct moments1 *moments;
62 /* Aggregation functions. */
65 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
66 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
67 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
68 FUNC = 0x1f, /* Function mask. */
69 FSTRING = 1<<5, /* String function bit. */
72 /* Attributes of an aggregation function. */
75 const char *name; /* Aggregation function name. */
76 int n_args; /* Number of arguments. */
77 int alpha_type; /* When given ALPHA arguments, output type. */
78 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
81 /* Attributes of aggregation functions. */
82 static const struct agr_func agr_func_tab[] =
84 {"<NONE>", 0, -1, {0, 0, 0}},
85 {"SUM", 0, -1, {FMT_F, 8, 2}},
86 {"MEAN", 0, -1, {FMT_F, 8, 2}},
87 {"SD", 0, -1, {FMT_F, 8, 2}},
88 {"MAX", 0, ALPHA, {-1, -1, -1}},
89 {"MIN", 0, ALPHA, {-1, -1, -1}},
90 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
91 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
93 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
95 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
97 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
99 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"FIRST", 0, ALPHA, {-1, -1, -1}},
103 {"LAST", 0, ALPHA, {-1, -1, -1}},
104 {NULL, 0, -1, {-1, -1, -1}},
105 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
109 /* Missing value types. */
110 enum missing_treatment
112 ITEMWISE, /* Missing values item by item. */
113 COLUMNWISE /* Missing values column by column. */
116 /* An entire AGGREGATE procedure. */
119 /* We have either an output file or a sink. */
120 struct sfm_writer *writer; /* Output file, or null if none. */
121 struct case_sink *sink; /* Sink, or null if none. */
123 /* Break variables. */
124 struct sort_criteria *sort; /* Sort criteria. */
125 struct variable **break_vars; /* Break variables. */
126 size_t break_var_cnt; /* Number of break variables. */
127 union value *prev_break; /* Last values of break variables. */
129 enum missing_treatment missing; /* How to treat missing values. */
130 struct agr_var *agr_vars; /* First aggregate variable. */
131 struct dictionary *dict; /* Aggregate dictionary. */
132 int case_cnt; /* Counts aggregated cases. */
133 struct ccase agr_case; /* Aggregate case for output. */
136 static void initialize_aggregate_info (struct agr_proc *);
139 static int parse_aggregate_functions (struct agr_proc *);
140 static void agr_destroy (struct agr_proc *);
141 static int aggregate_single_case (struct agr_proc *agr,
142 const struct ccase *input,
143 struct ccase *output);
144 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
146 /* Aggregating to the active file. */
147 static int agr_to_active_file (struct ccase *, void *aux);
149 /* Aggregating to a system file. */
150 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
154 /* Parses and executes the AGGREGATE procedure. */
159 struct file_handle *out_file = NULL;
161 bool copy_documents = false;
162 bool presorted = false;
165 memset(&agr, 0 , sizeof (agr));
166 agr.missing = ITEMWISE;
168 agr.dict = dict_create ();
169 dict_set_label (agr.dict, dict_get_label (default_dict));
170 dict_set_documents (agr.dict, dict_get_documents (default_dict));
172 /* OUTFILE subcommand must be first. */
173 if (!lex_force_match_id ("OUTFILE"))
176 if (!lex_match ('*'))
178 out_file = fh_parse ();
179 if (out_file == NULL)
183 /* Read most of the subcommands. */
188 if (lex_match_id ("MISSING"))
191 if (!lex_match_id ("COLUMNWISE"))
193 lex_error (_("while expecting COLUMNWISE"));
196 agr.missing = COLUMNWISE;
198 else if (lex_match_id ("DOCUMENT"))
199 copy_documents = true;
200 else if (lex_match_id ("PRESORTED"))
202 else if (lex_match_id ("BREAK"))
207 agr.sort = sort_parse_criteria (default_dict,
208 &agr.break_vars, &agr.break_var_cnt,
210 if (agr.sort == NULL)
213 for (i = 0; i < agr.break_var_cnt; i++)
215 struct variable *v = dict_clone_var (agr.dict, agr.break_vars[i],
216 agr.break_vars[i]->name);
220 /* BREAK must follow the options. */
225 lex_error (_("expecting BREAK"));
229 if (presorted && saw_direction)
230 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
231 "with (A) or (D) has no effect. Output data will be sorted "
232 "the same way as the input data."));
234 /* Read in the aggregate functions. */
236 if (!parse_aggregate_functions (&agr))
239 /* Delete documents. */
241 dict_set_documents (agr.dict, NULL);
243 /* Cancel SPLIT FILE. */
244 dict_set_split_vars (agr.dict, NULL, 0);
248 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
249 initialize_aggregate_info (&agr);
251 /* Output to active file or external file? */
252 if (out_file == NULL)
254 /* The active file will be replaced by the aggregated data,
255 so TEMPORARY is moot. */
258 if (agr.sort != NULL && !presorted)
259 sort_active_file_in_place (agr.sort);
261 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
262 if (agr.sink->class->open != NULL)
263 agr.sink->class->open (agr.sink);
264 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
265 procedure (agr_to_active_file, &agr);
266 if (agr.case_cnt > 0)
268 dump_aggregate_info (&agr, &agr.agr_case);
269 agr.sink->class->write (agr.sink, &agr.agr_case);
271 dict_destroy (default_dict);
272 default_dict = agr.dict;
274 vfm_source = agr.sink->class->make_source (agr.sink);
275 free_case_sink (agr.sink);
279 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression ());
280 if (agr.writer == NULL)
283 if (agr.sort != NULL && !presorted)
285 /* Sorting is needed. */
286 struct casefile *dst;
287 struct casereader *reader;
290 dst = sort_active_file_to_casefile (agr.sort);
293 reader = casefile_get_destructive_reader (dst);
294 while (casereader_read_xfer (reader, &c))
296 if (aggregate_single_case (&agr, &c, &agr.agr_case))
297 sfm_write_case (agr.writer, &agr.agr_case);
300 casereader_destroy (reader);
301 casefile_destroy (dst);
305 /* Active file is already sorted. */
306 procedure (presorted_agr_to_sysfile, &agr);
309 if (agr.case_cnt > 0)
311 dump_aggregate_info (&agr, &agr.agr_case);
312 sfm_write_case (agr.writer, &agr.agr_case);
324 /* Parse all the aggregate functions. */
326 parse_aggregate_functions (struct agr_proc *agr)
328 struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
330 /* Parse everything. */
339 const struct agr_func *function;
344 struct variable **src;
358 /* Parse the list of target variables. */
359 while (!lex_match ('='))
361 int n_dest_prev = n_dest;
363 if (!parse_DATA_LIST_vars (&dest, &n_dest,
364 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
367 /* Assign empty labels. */
371 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
372 for (j = n_dest_prev; j < n_dest; j++)
373 dest_label[j] = NULL;
376 if (token == T_STRING)
378 ds_truncate (&tokstr, 255);
379 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
384 /* Get the name of the aggregation function. */
387 lex_error (_("expecting aggregation function"));
392 if (tokid[strlen (tokid) - 1] == '.')
395 tokid[strlen (tokid) - 1] = 0;
398 for (function = agr_func_tab; function->name; function++)
399 if (!strcmp (function->name, tokid))
401 if (NULL == function->name)
403 msg (SE, _("Unknown aggregation function %s."), tokid);
406 func_index = function - agr_func_tab;
409 /* Check for leading lparen. */
410 if (!lex_match ('('))
413 func_index = N_NO_VARS;
414 else if (func_index == NU)
415 func_index = NU_NO_VARS;
418 lex_error (_("expecting `('"));
424 /* Parse list of source variables. */
426 int pv_opts = PV_NO_SCRATCH;
428 if (func_index == SUM || func_index == MEAN || func_index == SD)
429 pv_opts |= PV_NUMERIC;
430 else if (function->n_args)
431 pv_opts |= PV_SAME_TYPE;
433 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
437 /* Parse function arguments, for those functions that
438 require arguments. */
439 if (function->n_args != 0)
440 for (i = 0; i < function->n_args; i++)
445 if (token == T_STRING)
447 arg[i].c = xstrdup (ds_c_str (&tokstr));
450 else if (lex_is_number ())
455 msg (SE, _("Missing argument %d to %s."), i + 1, function->name);
461 if (type != src[0]->type)
463 msg (SE, _("Arguments to %s must be of same type as "
464 "source variables."),
470 /* Trailing rparen. */
473 lex_error (_("expecting `)'"));
477 /* Now check that the number of source variables match
478 the number of target variables. If we check earlier
479 than this, the user can get very misleading error
480 message, i.e. `AGGREGATE x=SUM(y t).' will get this
481 error message when a proper message would be more
482 like `unknown variable t'. */
485 msg (SE, _("Number of source variables (%d) does not match "
486 "number of target variables (%d)."),
491 if ((func_index == PIN || func_index == POUT
492 || func_index == FIN || func_index == FOUT)
493 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
494 || (src[0]->type == ALPHA
495 && st_compare_pad (arg[0].c, strlen (arg[0].c),
496 arg[1].c, strlen (arg[1].c)) > 0)))
498 union value t = arg[0];
502 msg (SW, _("The value arguments passed to the %s function "
503 "are out-of-order. They will be treated as if "
504 "they had been specified in the correct order."),
509 /* Finally add these to the linked list of aggregation
511 for (i = 0; i < n_dest; i++)
513 struct agr_var *v = xmalloc (sizeof *v);
515 /* Add variable to chain. */
516 if (agr->agr_vars != NULL)
524 /* Create the target variable in the aggregate
527 static const struct fmt_spec f8_2 = {FMT_F, 8, 2};
528 struct variable *destvar;
530 v->function = func_index;
536 if (src[i]->type == ALPHA)
538 v->function |= FSTRING;
539 v->string = xmalloc (src[i]->width);
542 if (function->alpha_type == ALPHA)
543 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
544 else if (v->src->type == NUMERIC
545 || function->alpha_type == NUMERIC)
547 destvar = dict_create_var (agr->dict, dest[i], 0);
550 if ((func_index == N || func_index == NMISS)
551 && dict_get_weight (default_dict) != NULL)
552 destvar->print = destvar->write = f8_2;
554 destvar->print = destvar->write = function->format;
559 destvar = dict_create_var (agr->dict, dest[i], 0);
560 if (func_index == N_NO_VARS
561 && dict_get_weight (default_dict) != NULL)
562 destvar->print = destvar->write = f8_2;
564 destvar->print = destvar->write = function->format;
569 msg (SE, _("Variable name %s is not unique within the "
570 "aggregate file dictionary, which contains "
571 "the aggregate variables and the break "
581 destvar->label = dest_label[i];
582 dest_label[i] = NULL;
588 v->include_missing = include_missing;
594 if (v->src->type == NUMERIC)
595 for (j = 0; j < function->n_args; j++)
596 v->arg[j].f = arg[j].f;
598 for (j = 0; j < function->n_args; j++)
599 v->arg[j].c = xstrdup (arg[j].c);
603 if (src != NULL && src[0]->type == ALPHA)
604 for (i = 0; i < function->n_args; i++)
614 if (!lex_match ('/'))
619 lex_error ("expecting end of command");
625 for (i = 0; i < n_dest; i++)
628 free (dest_label[i]);
634 if (src && n_src && src[0]->type == ALPHA)
635 for (i = 0; i < function->n_args; i++)
648 agr_destroy (struct agr_proc *agr)
650 struct agr_var *iter, *next;
652 sfm_close_writer (agr->writer);
653 if (agr->sort != NULL)
654 sort_destroy_criteria (agr->sort);
655 free (agr->break_vars);
656 free (agr->prev_break);
657 for (iter = agr->agr_vars; iter; iter = next)
661 if (iter->function & FSTRING)
666 n_args = agr_func_tab[iter->function & FUNC].n_args;
667 for (i = 0; i < n_args; i++)
668 free (iter->arg[i].c);
671 else if (iter->function == SD)
672 moments1_destroy (iter->moments);
675 if (agr->dict != NULL)
676 dict_destroy (agr->dict);
678 case_destroy (&agr->agr_case);
683 static void accumulate_aggregate_info (struct agr_proc *,
684 const struct ccase *);
685 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
687 /* Processes a single case INPUT for aggregation. If output is
688 warranted, writes it to OUTPUT and returns nonzero.
689 Otherwise, returns zero and OUTPUT is unmodified. */
691 aggregate_single_case (struct agr_proc *agr,
692 const struct ccase *input, struct ccase *output)
694 /* The first case always begins a new break group. We also need to
695 preserve the values of the case for later comparison. */
696 if (agr->case_cnt++ == 0)
703 for (i = 0; i < agr->break_var_cnt; i++)
704 n_elem += agr->break_vars[i]->nv;
707 agr->prev_break = xmalloc (sizeof *agr->prev_break * n_elem);
709 /* Copy INPUT into prev_break. */
711 union value *iter = agr->prev_break;
714 for (i = 0; i < agr->break_var_cnt; i++)
716 struct variable *v = agr->break_vars[i];
718 if (v->type == NUMERIC)
719 (iter++)->f = case_num (input, v->fv);
722 memcpy (iter->s, case_str (input, v->fv), v->width);
728 accumulate_aggregate_info (agr, input);
733 /* Compare the value of each break variable to the values on the
736 union value *iter = agr->prev_break;
739 for (i = 0; i < agr->break_var_cnt; i++)
741 struct variable *v = agr->break_vars[i];
746 if (case_num (input, v->fv) != iter->f)
751 if (memcmp (case_str (input, v->fv), iter->s, v->width))
761 accumulate_aggregate_info (agr, input);
766 /* The values of the break variable are different from the values on
767 the previous case. That means that it's time to dump aggregate
769 dump_aggregate_info (agr, output);
770 initialize_aggregate_info (agr);
771 accumulate_aggregate_info (agr, input);
773 /* Copy INPUT into prev_break. */
775 union value *iter = agr->prev_break;
778 for (i = 0; i < agr->break_var_cnt; i++)
780 struct variable *v = agr->break_vars[i];
782 if (v->type == NUMERIC)
783 (iter++)->f = case_num (input, v->fv);
786 memcpy (iter->s, case_str (input, v->fv), v->width);
795 /* Accumulates aggregation data from the case INPUT. */
797 accumulate_aggregate_info (struct agr_proc *agr,
798 const struct ccase *input)
800 struct agr_var *iter;
804 weight = dict_get_case_weight (default_dict, input, &bad_warn);
806 for (iter = agr->agr_vars; iter; iter = iter->next)
809 const union value *v = case_data (input, iter->src->fv);
811 if ((!iter->include_missing && is_missing (v, iter->src))
812 || (iter->include_missing && iter->src->type == NUMERIC
815 switch (iter->function)
818 case NMISS | FSTRING:
819 iter->dbl[0] += weight;
822 case NUMISS | FSTRING:
830 /* This is horrible. There are too many possibilities. */
831 switch (iter->function)
834 iter->dbl[0] += v->f * weight;
838 iter->dbl[0] += v->f * weight;
839 iter->dbl[1] += weight;
842 moments1_add (iter->moments, v->f, weight);
845 iter->dbl[0] = max (iter->dbl[0], v->f);
849 if (memcmp (iter->string, v->s, iter->src->width) < 0)
850 memcpy (iter->string, v->s, iter->src->width);
854 iter->dbl[0] = min (iter->dbl[0], v->f);
858 if (memcmp (iter->string, v->s, iter->src->width) > 0)
859 memcpy (iter->string, v->s, iter->src->width);
864 if (v->f > iter->arg[0].f)
865 iter->dbl[0] += weight;
866 iter->dbl[1] += weight;
870 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
871 iter->dbl[0] += weight;
872 iter->dbl[1] += weight;
876 if (v->f < iter->arg[0].f)
877 iter->dbl[0] += weight;
878 iter->dbl[1] += weight;
882 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
883 iter->dbl[0] += weight;
884 iter->dbl[1] += weight;
888 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
889 iter->dbl[0] += weight;
890 iter->dbl[1] += weight;
894 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
895 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
896 iter->dbl[0] += weight;
897 iter->dbl[1] += weight;
901 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
902 iter->dbl[0] += weight;
903 iter->dbl[1] += weight;
907 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
908 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
909 iter->dbl[0] += weight;
910 iter->dbl[1] += weight;
914 iter->dbl[0] += weight;
927 case FIRST | FSTRING:
930 memcpy (iter->string, v->s, iter->src->width);
939 memcpy (iter->string, v->s, iter->src->width);
943 case NMISS | FSTRING:
945 case NUMISS | FSTRING:
946 /* Our value is not missing or it would have been
947 caught earlier. Nothing to do. */
953 switch (iter->function)
956 iter->dbl[0] += weight;
967 /* We've come to a record that differs from the previous in one or
968 more of the break variables. Make an output record from the
969 accumulated statistics in the OUTPUT case. */
971 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
977 for (i = 0; i < agr->break_var_cnt; i++)
979 int nv = agr->break_vars[i]->nv;
980 memcpy (case_data_rw (output, value_idx),
981 &agr->prev_break[value_idx],
982 sizeof (union value) * nv);
990 for (i = agr->agr_vars; i; i = i->next)
992 union value *v = case_data_rw (output, i->dest->fv);
994 if (agr->missing == COLUMNWISE && i->missing != 0
995 && (i->function & FUNC) != N && (i->function & FUNC) != NU
996 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
998 if (i->dest->type == ALPHA)
999 memset (v->s, ' ', i->dest->width);
1005 switch (i->function)
1008 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1011 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
1017 /* FIXME: we should use two passes. */
1018 moments1_calculate (i->moments, NULL, NULL, &variance,
1020 if (variance != SYSMIS)
1021 v->f = sqrt (variance);
1028 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1033 memcpy (v->s, i->string, i->dest->width);
1035 memset (v->s, ' ', i->dest->width);
1044 case FOUT | FSTRING:
1045 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
1054 case POUT | FSTRING:
1055 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
1067 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1069 case FIRST | FSTRING:
1070 case LAST | FSTRING:
1072 memcpy (v->s, i->string, i->dest->width);
1074 memset (v->s, ' ', i->dest->width);
1083 case NMISS | FSTRING:
1087 case NUMISS | FSTRING:
1097 /* Resets the state for all the aggregate functions. */
1099 initialize_aggregate_info (struct agr_proc *agr)
1101 struct agr_var *iter;
1103 for (iter = agr->agr_vars; iter; iter = iter->next)
1106 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1107 iter->int1 = iter->int2 = 0;
1108 switch (iter->function)
1111 iter->dbl[0] = DBL_MAX;
1114 memset (iter->string, 255, iter->src->width);
1117 iter->dbl[0] = -DBL_MAX;
1120 memset (iter->string, 0, iter->src->width);
1123 if (iter->moments == NULL)
1124 iter->moments = moments1_create (MOMENT_VARIANCE);
1126 moments1_clear (iter->moments);
1134 /* Aggregate each case as it comes through. Cases which aren't needed
1137 agr_to_active_file (struct ccase *c, void *agr_)
1139 struct agr_proc *agr = agr_;
1141 if (aggregate_single_case (agr, c, &agr->agr_case))
1142 agr->sink->class->write (agr->sink, &agr->agr_case);
1147 /* Aggregate the current case and output it if we passed a
1150 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1152 struct agr_proc *agr = agr_;
1154 if (aggregate_single_case (agr, c, &agr->agr_case))
1155 sfm_write_case (agr->writer, &agr->agr_case);