1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
42 /* Specifies how to make an aggregate variable. */
45 struct agr_var *next; /* Next in list. */
47 /* Collected during parsing. */
48 struct variable *src; /* Source variable. */
49 struct variable *dest; /* Target variable. */
50 int function; /* Function. */
51 int include_missing; /* 1=Include user-missing values. */
52 union value arg[2]; /* Arguments. */
54 /* Accumulated during AGGREGATE execution. */
59 struct moments1 *moments;
62 /* Aggregation functions. */
65 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
66 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
67 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
68 FUNC = 0x1f, /* Function mask. */
69 FSTRING = 1<<5, /* String function bit. */
72 /* Attributes of an aggregation function. */
75 const char *name; /* Aggregation function name. */
76 int n_args; /* Number of arguments. */
77 int alpha_type; /* When given ALPHA arguments, output type. */
78 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
81 /* Attributes of aggregation functions. */
82 static const struct agr_func agr_func_tab[] =
84 {"<NONE>", 0, -1, {0, 0, 0}},
85 {"SUM", 0, -1, {FMT_F, 8, 2}},
86 {"MEAN", 0, -1, {FMT_F, 8, 2}},
87 {"SD", 0, -1, {FMT_F, 8, 2}},
88 {"MAX", 0, ALPHA, {-1, -1, -1}},
89 {"MIN", 0, ALPHA, {-1, -1, -1}},
90 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
91 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
93 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
95 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
97 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
99 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"FIRST", 0, ALPHA, {-1, -1, -1}},
103 {"LAST", 0, ALPHA, {-1, -1, -1}},
104 {NULL, 0, -1, {-1, -1, -1}},
105 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
109 /* Missing value types. */
110 enum missing_treatment
112 ITEMWISE, /* Missing values item by item. */
113 COLUMNWISE /* Missing values column by column. */
116 /* An entire AGGREGATE procedure. */
119 /* We have either an output file or a sink. */
120 struct sfm_writer *writer; /* Output file, or null if none. */
121 struct case_sink *sink; /* Sink, or null if none. */
123 /* Break variables. */
124 struct sort_criteria *sort; /* Sort criteria. */
125 struct variable **break_vars; /* Break variables. */
126 size_t break_var_cnt; /* Number of break variables. */
127 union value *prev_break; /* Last values of break variables. */
129 enum missing_treatment missing; /* How to treat missing values. */
130 struct agr_var *agr_vars; /* First aggregate variable. */
131 struct dictionary *dict; /* Aggregate dictionary. */
132 int case_cnt; /* Counts aggregated cases. */
133 struct ccase agr_case; /* Aggregate case for output. */
136 static void initialize_aggregate_info (struct agr_proc *);
139 static int parse_aggregate_functions (struct agr_proc *);
140 static void agr_destroy (struct agr_proc *);
141 static int aggregate_single_case (struct agr_proc *agr,
142 const struct ccase *input,
143 struct ccase *output);
144 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
146 /* Aggregating to the active file. */
147 static int agr_to_active_file (struct ccase *, void *aux);
149 /* Aggregating to a system file. */
150 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
154 /* Parses and executes the AGGREGATE procedure. */
159 struct file_handle *out_file = NULL;
161 bool copy_documents = false;
162 bool presorted = false;
165 memset(&agr, 0 , sizeof (agr));
166 agr.missing = ITEMWISE;
168 agr.dict = dict_create ();
169 dict_set_label (agr.dict, dict_get_label (default_dict));
170 dict_set_documents (agr.dict, dict_get_documents (default_dict));
172 /* OUTFILE subcommand must be first. */
173 if (!lex_force_match_id ("OUTFILE"))
176 if (!lex_match ('*'))
178 out_file = fh_parse ();
179 if (out_file == NULL)
183 /* Read most of the subcommands. */
188 if (lex_match_id ("MISSING"))
191 if (!lex_match_id ("COLUMNWISE"))
193 lex_error (_("while expecting COLUMNWISE"));
196 agr.missing = COLUMNWISE;
198 else if (lex_match_id ("DOCUMENT"))
199 copy_documents = true;
200 else if (lex_match_id ("PRESORTED"))
202 else if (lex_match_id ("BREAK"))
207 agr.sort = sort_parse_criteria (default_dict,
208 &agr.break_vars, &agr.break_var_cnt,
210 if (agr.sort == NULL)
213 for (i = 0; i < agr.break_var_cnt; i++)
215 struct variable *v = dict_clone_var (agr.dict, agr.break_vars[i],
216 agr.break_vars[i]->name);
220 /* BREAK must follow the options. */
225 lex_error (_("expecting BREAK"));
229 if (presorted && saw_direction)
230 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
231 "with (A) or (D) has no effect. Output data will be sorted "
232 "the same way as the input data."));
234 /* Read in the aggregate functions. */
236 if (!parse_aggregate_functions (&agr))
239 /* Delete documents. */
241 dict_set_documents (agr.dict, NULL);
243 /* Cancel SPLIT FILE. */
244 dict_set_split_vars (agr.dict, NULL, 0);
248 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
249 initialize_aggregate_info (&agr);
251 /* Output to active file or external file? */
252 if (out_file == NULL)
254 /* The active file will be replaced by the aggregated data,
255 so TEMPORARY is moot. */
258 if (agr.sort != NULL && !presorted)
259 sort_active_file_in_place (agr.sort);
261 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
262 if (agr.sink->class->open != NULL)
263 agr.sink->class->open (agr.sink);
264 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
265 procedure (agr_to_active_file, &agr);
266 if (agr.case_cnt > 0)
268 dump_aggregate_info (&agr, &agr.agr_case);
269 agr.sink->class->write (agr.sink, &agr.agr_case);
271 dict_destroy (default_dict);
272 default_dict = agr.dict;
274 vfm_source = agr.sink->class->make_source (agr.sink);
275 free_case_sink (agr.sink);
279 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression ());
280 if (agr.writer == NULL)
283 if (agr.sort != NULL && !presorted)
285 /* Sorting is needed. */
286 struct casefile *dst;
287 struct casereader *reader;
290 dst = sort_active_file_to_casefile (agr.sort);
293 reader = casefile_get_destructive_reader (dst);
294 while (casereader_read_xfer (reader, &c))
296 if (aggregate_single_case (&agr, &c, &agr.agr_case))
297 sfm_write_case (agr.writer, &agr.agr_case);
300 casereader_destroy (reader);
301 casefile_destroy (dst);
305 /* Active file is already sorted. */
306 procedure (presorted_agr_to_sysfile, &agr);
309 if (agr.case_cnt > 0)
311 dump_aggregate_info (&agr, &agr.agr_case);
312 sfm_write_case (agr.writer, &agr.agr_case);
324 /* Parse all the aggregate functions. */
326 parse_aggregate_functions (struct agr_proc *agr)
328 struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
330 /* Parse everything. */
339 const struct agr_func *function;
344 struct variable **src;
358 /* Parse the list of target variables. */
359 while (!lex_match ('='))
361 int n_dest_prev = n_dest;
363 if (!parse_DATA_LIST_vars (&dest, &n_dest,
364 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
367 /* Assign empty labels. */
371 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
372 for (j = n_dest_prev; j < n_dest; j++)
373 dest_label[j] = NULL;
376 if (token == T_STRING)
378 ds_truncate (&tokstr, 255);
379 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
384 /* Get the name of the aggregation function. */
387 lex_error (_("expecting aggregation function"));
392 if (tokid[strlen (tokid) - 1] == '.')
395 tokid[strlen (tokid) - 1] = 0;
398 for (function = agr_func_tab; function->name; function++)
399 if (!strcmp (function->name, tokid))
401 if (NULL == function->name)
403 msg (SE, _("Unknown aggregation function %s."), tokid);
406 func_index = function - agr_func_tab;
409 /* Check for leading lparen. */
410 if (!lex_match ('('))
413 func_index = N_NO_VARS;
414 else if (func_index == NU)
415 func_index = NU_NO_VARS;
418 lex_error (_("expecting `('"));
424 /* Parse list of source variables. */
426 int pv_opts = PV_NO_SCRATCH;
428 if (func_index == SUM || func_index == MEAN || func_index == SD)
429 pv_opts |= PV_NUMERIC;
430 else if (function->n_args)
431 pv_opts |= PV_SAME_TYPE;
433 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
437 /* Parse function arguments, for those functions that
438 require arguments. */
439 if (function->n_args != 0)
440 for (i = 0; i < function->n_args; i++)
445 if (token == T_STRING)
447 arg[i].c = xstrdup (ds_c_str (&tokstr));
450 else if (lex_is_number ())
455 msg (SE, _("Missing argument %d to %s."), i + 1, function->name);
461 if (type != src[0]->type)
463 msg (SE, _("Arguments to %s must be of same type as "
464 "source variables."),
470 /* Trailing rparen. */
473 lex_error (_("expecting `)'"));
477 /* Now check that the number of source variables match
478 the number of target variables. If we check earlier
479 than this, the user can get very misleading error
480 message, i.e. `AGGREGATE x=SUM(y t).' will get this
481 error message when a proper message would be more
482 like `unknown variable t'. */
485 msg (SE, _("Number of source variables (%d) does not match "
486 "number of target variables (%d)."),
491 if ((func_index == PIN || func_index == POUT
492 || func_index == FIN || func_index == FOUT)
493 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
494 || (src[0]->type == ALPHA
495 && st_compare_pad (arg[0].c, strlen (arg[0].c),
496 arg[1].c, strlen (arg[1].c)) > 0)))
498 union value t = arg[0];
502 msg (SW, _("The value arguments passed to the %s function "
503 "are out-of-order. They will be treated as if "
504 "they had been specified in the correct order."),
509 /* Finally add these to the linked list of aggregation
511 for (i = 0; i < n_dest; i++)
513 struct agr_var *v = xmalloc (sizeof *v);
515 /* Add variable to chain. */
516 if (agr->agr_vars != NULL)
524 /* Create the target variable in the aggregate
527 struct variable *destvar;
529 v->function = func_index;
535 if (src[i]->type == ALPHA)
537 v->function |= FSTRING;
538 v->string = xmalloc (src[i]->width);
541 if (function->alpha_type == ALPHA)
542 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
543 else if (v->src->type == NUMERIC
544 || function->alpha_type == NUMERIC)
546 destvar = dict_create_var (agr->dict, dest[i], 0);
549 || func_index == N_NO_VARS
550 || func_index == NMISS)
551 && dict_get_weight (default_dict) != NULL)
553 static const struct fmt_spec f8_2 = {FMT_F, 8, 2};
555 destvar->print = destvar->write = f8_2;
558 destvar->print = destvar->write = function->format;
561 destvar = dict_create_var (agr->dict, dest[i],
565 destvar = dict_create_var (agr->dict, dest[i], 0);
570 msg (SE, _("Variable name %s is not unique within the "
571 "aggregate file dictionary, which contains "
572 "the aggregate variables and the break "
583 destvar->label = dest_label[i];
584 dest_label[i] = NULL;
590 v->include_missing = include_missing;
596 if (v->src->type == NUMERIC)
597 for (j = 0; j < function->n_args; j++)
598 v->arg[j].f = arg[j].f;
600 for (j = 0; j < function->n_args; j++)
601 v->arg[j].c = xstrdup (arg[j].c);
605 if (src != NULL && src[0]->type == ALPHA)
606 for (i = 0; i < function->n_args; i++)
616 if (!lex_match ('/'))
621 lex_error ("expecting end of command");
627 for (i = 0; i < n_dest; i++)
630 free (dest_label[i]);
636 if (src && n_src && src[0]->type == ALPHA)
637 for (i = 0; i < function->n_args; i++)
650 agr_destroy (struct agr_proc *agr)
652 struct agr_var *iter, *next;
654 sfm_close_writer (agr->writer);
655 if (agr->sort != NULL)
656 sort_destroy_criteria (agr->sort);
657 free (agr->break_vars);
658 free (agr->prev_break);
659 for (iter = agr->agr_vars; iter; iter = next)
663 if (iter->function & FSTRING)
668 n_args = agr_func_tab[iter->function & FUNC].n_args;
669 for (i = 0; i < n_args; i++)
670 free (iter->arg[i].c);
673 else if (iter->function == SD)
674 moments1_destroy (iter->moments);
677 if (agr->dict != NULL)
678 dict_destroy (agr->dict);
680 case_destroy (&agr->agr_case);
685 static void accumulate_aggregate_info (struct agr_proc *,
686 const struct ccase *);
687 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
689 /* Processes a single case INPUT for aggregation. If output is
690 warranted, writes it to OUTPUT and returns nonzero.
691 Otherwise, returns zero and OUTPUT is unmodified. */
693 aggregate_single_case (struct agr_proc *agr,
694 const struct ccase *input, struct ccase *output)
696 /* The first case always begins a new break group. We also need to
697 preserve the values of the case for later comparison. */
698 if (agr->case_cnt++ == 0)
705 for (i = 0; i < agr->break_var_cnt; i++)
706 n_elem += agr->break_vars[i]->nv;
709 agr->prev_break = xmalloc (sizeof *agr->prev_break * n_elem);
711 /* Copy INPUT into prev_break. */
713 union value *iter = agr->prev_break;
716 for (i = 0; i < agr->break_var_cnt; i++)
718 struct variable *v = agr->break_vars[i];
720 if (v->type == NUMERIC)
721 (iter++)->f = case_num (input, v->fv);
724 memcpy (iter->s, case_str (input, v->fv), v->width);
730 accumulate_aggregate_info (agr, input);
735 /* Compare the value of each break variable to the values on the
738 union value *iter = agr->prev_break;
741 for (i = 0; i < agr->break_var_cnt; i++)
743 struct variable *v = agr->break_vars[i];
748 if (case_num (input, v->fv) != iter->f)
753 if (memcmp (case_str (input, v->fv), iter->s, v->width))
763 accumulate_aggregate_info (agr, input);
768 /* The values of the break variable are different from the values on
769 the previous case. That means that it's time to dump aggregate
771 dump_aggregate_info (agr, output);
772 initialize_aggregate_info (agr);
773 accumulate_aggregate_info (agr, input);
775 /* Copy INPUT into prev_break. */
777 union value *iter = agr->prev_break;
780 for (i = 0; i < agr->break_var_cnt; i++)
782 struct variable *v = agr->break_vars[i];
784 if (v->type == NUMERIC)
785 (iter++)->f = case_num (input, v->fv);
788 memcpy (iter->s, case_str (input, v->fv), v->width);
797 /* Accumulates aggregation data from the case INPUT. */
799 accumulate_aggregate_info (struct agr_proc *agr,
800 const struct ccase *input)
802 struct agr_var *iter;
806 weight = dict_get_case_weight (default_dict, input, &bad_warn);
808 for (iter = agr->agr_vars; iter; iter = iter->next)
811 const union value *v = case_data (input, iter->src->fv);
813 if ((!iter->include_missing && is_missing (v, iter->src))
814 || (iter->include_missing && iter->src->type == NUMERIC
817 switch (iter->function)
820 case NMISS | FSTRING:
821 iter->dbl[0] += weight;
824 case NUMISS | FSTRING:
832 /* This is horrible. There are too many possibilities. */
833 switch (iter->function)
836 iter->dbl[0] += v->f * weight;
839 iter->dbl[0] += v->f * weight;
840 iter->dbl[1] += weight;
843 moments1_add (iter->moments, v->f, weight);
846 iter->dbl[0] = max (iter->dbl[0], v->f);
850 if (memcmp (iter->string, v->s, iter->src->width) < 0)
851 memcpy (iter->string, v->s, iter->src->width);
855 iter->dbl[0] = min (iter->dbl[0], v->f);
859 if (memcmp (iter->string, v->s, iter->src->width) > 0)
860 memcpy (iter->string, v->s, iter->src->width);
865 if (v->f > iter->arg[0].f)
866 iter->dbl[0] += weight;
867 iter->dbl[1] += weight;
871 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
872 iter->dbl[0] += weight;
873 iter->dbl[1] += weight;
877 if (v->f < iter->arg[0].f)
878 iter->dbl[0] += weight;
879 iter->dbl[1] += weight;
883 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
884 iter->dbl[0] += weight;
885 iter->dbl[1] += weight;
889 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
890 iter->dbl[0] += weight;
891 iter->dbl[1] += weight;
895 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
896 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
897 iter->dbl[0] += weight;
898 iter->dbl[1] += weight;
902 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
903 iter->dbl[0] += weight;
904 iter->dbl[1] += weight;
908 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
909 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
910 iter->dbl[0] += weight;
911 iter->dbl[1] += weight;
915 iter->dbl[0] += weight;
928 case FIRST | FSTRING:
931 memcpy (iter->string, v->s, iter->src->width);
940 memcpy (iter->string, v->s, iter->src->width);
944 case NMISS | FSTRING:
946 case NUMISS | FSTRING:
947 /* Our value is not missing or it would have been
948 caught earlier. Nothing to do. */
954 switch (iter->function)
957 iter->dbl[0] += weight;
968 /* We've come to a record that differs from the previous in one or
969 more of the break variables. Make an output record from the
970 accumulated statistics in the OUTPUT case. */
972 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
978 for (i = 0; i < agr->break_var_cnt; i++)
980 int nv = agr->break_vars[i]->nv;
981 memcpy (case_data_rw (output, value_idx),
982 &agr->prev_break[value_idx],
983 sizeof (union value) * nv);
991 for (i = agr->agr_vars; i; i = i->next)
993 union value *v = case_data_rw (output, i->dest->fv);
995 if (agr->missing == COLUMNWISE && i->missing != 0
996 && (i->function & FUNC) != N && (i->function & FUNC) != NU
997 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
999 if (i->dest->type == ALPHA)
1000 memset (v->s, ' ', i->dest->width);
1006 switch (i->function)
1012 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
1018 /* FIXME: we should use two passes. */
1019 moments1_calculate (i->moments, NULL, NULL, &variance,
1021 if (variance != SYSMIS)
1022 v->f = sqrt (variance);
1029 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1034 memcpy (v->s, i->string, i->dest->width);
1036 memset (v->s, ' ', i->dest->width);
1045 case FOUT | FSTRING:
1046 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
1055 case POUT | FSTRING:
1056 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
1068 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1070 case FIRST | FSTRING:
1071 case LAST | FSTRING:
1073 memcpy (v->s, i->string, i->dest->width);
1075 memset (v->s, ' ', i->dest->width);
1084 case NMISS | FSTRING:
1088 case NUMISS | FSTRING:
1098 /* Resets the state for all the aggregate functions. */
1100 initialize_aggregate_info (struct agr_proc *agr)
1102 struct agr_var *iter;
1104 for (iter = agr->agr_vars; iter; iter = iter->next)
1107 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1108 iter->int1 = iter->int2 = 0;
1109 switch (iter->function)
1112 iter->dbl[0] = DBL_MAX;
1115 memset (iter->string, 255, iter->src->width);
1118 iter->dbl[0] = -DBL_MAX;
1121 memset (iter->string, 0, iter->src->width);
1124 if (iter->moments == NULL)
1125 iter->moments = moments1_create (MOMENT_VARIANCE);
1127 moments1_clear (iter->moments);
1135 /* Aggregate each case as it comes through. Cases which aren't needed
1138 agr_to_active_file (struct ccase *c, void *agr_)
1140 struct agr_proc *agr = agr_;
1142 if (aggregate_single_case (agr, c, &agr->agr_case))
1143 agr->sink->class->write (agr->sink, &agr->agr_case);
1148 /* Aggregate the current case and output it if we passed a
1151 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1153 struct agr_proc *agr = agr_;
1155 if (aggregate_single_case (agr, c, &agr->agr_case))
1156 sfm_write_case (agr->writer, &agr->agr_case);