1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
44 #define _(msgid) gettext (msgid)
46 /* Specifies how to make an aggregate variable. */
/* One record per target variable.  Records form a singly linked list
   through NEXT, headed by agr_proc.agr_vars.
   NOTE(review): this view shows only part of the member list; other
   code in this file also reads and writes dbl[], int1, int2, string
   and missing on these records. */
49 struct agr_var *next; /* Next in list. */
51 /* Collected during parsing. */
52 struct variable *src; /* Source variable. */
53 struct variable *dest; /* Target variable. */
54 int function; /* Function index, with FSTRING OR-ed in for string sources. */
55 int include_missing; /* 1=Include user-missing values. */
56 union value arg[2]; /* Arguments: .f for numeric sources, owned .c strings otherwise. */
58 /* Accumulated during AGGREGATE execution. */
63 struct moments1 *moments; /* Moment accumulator; agr_destroy frees it when function is SD. */
66 /* Aggregation functions. */
/* Codes kept in agr_var.function.  Each code from NONE through
   NU_NO_VARS is also the row index of the matching entry in
   agr_func_tab, so the two lists must stay in the same order.
   N_AGR_FUNCS lines up with the NULL-named sentinel row of that table. */
69 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
70 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
71 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
72 FUNC = 0x1f, /* Function mask: strips FSTRING, leaving the table index. */
73 FSTRING = 1<<5, /* String function bit, set when the source variable is ALPHA. */
76 /* Attributes of an aggregation function. */
/* Describes one row of agr_func_tab. */
79 const char *name; /* Aggregation function name; NULL in the sentinel row. */
80 int n_args; /* Number of value arguments parsed after the source variables. */
81 int alpha_type; /* When given ALPHA arguments, output type (ALPHA: target is cloned from the source). */
82 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
85 /* Attributes of aggregation functions. */
/* Rows are indexed by the aggregation function codes (NONE, SUM, ...),
   so row order must match the enumerator order.  Each row gives the
   name, the argument count, the output type for ALPHA sources and the
   print/write format for numeric targets.
   The parser looks a function up by scanning names from the top until
   it reaches the row whose name is NULL, so the two rows after that
   sentinel are never found by name; they are selected explicitly via
   N_NO_VARS and NU_NO_VARS. */
86 static const struct agr_func agr_func_tab[] =
88 {"<NONE>", 0, -1, {0, 0, 0}},
89 {"SUM", 0, -1, {FMT_F, 8, 2}},
90 {"MEAN", 0, -1, {FMT_F, 8, 2}},
91 {"SD", 0, -1, {FMT_F, 8, 2}},
92 {"MAX", 0, ALPHA, {-1, -1, -1}},
93 {"MIN", 0, ALPHA, {-1, -1, -1}},
94 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
95 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
96 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
97 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
98 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
99 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
100 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
101 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
102 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
103 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
104 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
105 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"FIRST", 0, ALPHA, {-1, -1, -1}},
107 {"LAST", 0, ALPHA, {-1, -1, -1}},
108 {NULL, 0, -1, {-1, -1, -1}}, /* N_AGR_FUNCS: sentinel that ends the name search. */
109 {"N", 0, NUMERIC, {FMT_F, 7, 0}}, /* N_NO_VARS */
110 {"NU", 0, NUMERIC, {FMT_F, 7, 0}}, /* NU_NO_VARS */
113 /* Missing value types. */
/* Stored in agr_proc.missing.  ITEMWISE is the default installed by the
   command parser; COLUMNWISE is selected by the MISSING subcommand and
   is checked when a finished group is written out. */
114 enum missing_treatment
116 ITEMWISE, /* Missing values item by item. */
117 COLUMNWISE /* Missing values column by column. */
120 /* An entire AGGREGATE procedure. */
/* All state for one run of the command.  The command entry point zeroes
   the whole record before filling it in, and agr_destroy releases what
   it owns. */
123 /* We have either an output file or a sink. */
124 struct sfm_writer *writer; /* Output file, or null if none. */
125 struct case_sink *sink; /* Sink, or null if none. */
127 /* Break variables. */
128 struct sort_criteria *sort; /* Sort criteria. */
129 struct variable **break_vars; /* Break variables. */
130 size_t break_var_cnt; /* Number of break variables. */
131 struct ccase break_case; /* Last values of break variables. */
133 enum missing_treatment missing; /* How to treat missing values. */
134 struct agr_var *agr_vars; /* First aggregate variable. */
135 struct dictionary *dict; /* Aggregate dictionary. */
136 int case_cnt; /* Counts aggregated cases; nonzero means a group is pending. */
137 struct ccase agr_case; /* Aggregate case for output. */
/* Forward declarations for helpers defined later in this file. */
140 static void initialize_aggregate_info (struct agr_proc *,
141 const struct ccase *);
144 static int parse_aggregate_functions (struct agr_proc *);
145 static void agr_destroy (struct agr_proc *);
146 static int aggregate_single_case (struct agr_proc *agr,
147 const struct ccase *input,
148 struct ccase *output);
149 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
151 /* Aggregating to the active file. */
152 static int agr_to_active_file (struct ccase *, void *aux);
154 /* Aggregating to a system file. */
155 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
159 /* Parses and executes the AGGREGATE procedure. */
/* Outline of the visible steps:
   1. Zero the agr_proc record and build a fresh aggregate dictionary
      that starts with the label and documents of default_dict.
   2. Parse OUTFILE (an asterisk means the active file, leaving out_file
      NULL), then the MISSING, DOCUMENT, PRESORTED and BREAK subcommands.
   3. Parse the aggregate function specifications.
   4. Run the aggregation, writing groups either to a storage sink that
      then replaces the active file, or to a system file writer.
   NOTE(review): the function header, the error exits and the final
   cleanup are not visible in this view. */
164 struct file_handle *out_file = NULL;
166 bool copy_documents = false;
167 bool presorted = false;
170 memset(&agr, 0 , sizeof (agr));
171 agr.missing = ITEMWISE;
172 case_nullify (&agr.break_case);
174 agr.dict = dict_create ();
175 dict_set_label (agr.dict, dict_get_label (default_dict));
176 dict_set_documents (agr.dict, dict_get_documents (default_dict));
178 /* OUTFILE subcommand must be first. */
179 if (!lex_force_match_id ("OUTFILE"))
182 if (!lex_match ('*'))
184 out_file = fh_parse ();
185 if (out_file == NULL)
189 /* Read most of the subcommands. */
194 if (lex_match_id ("MISSING"))
197 if (!lex_match_id ("COLUMNWISE"))
199 lex_error (_("while expecting COLUMNWISE"));
202 agr.missing = COLUMNWISE;
204 else if (lex_match_id ("DOCUMENT"))
205 copy_documents = true;
206 else if (lex_match_id ("PRESORTED"))
208 else if (lex_match_id ("BREAK"))
213 agr.sort = sort_parse_criteria (default_dict,
214 &agr.break_vars, &agr.break_var_cnt,
215 &saw_direction, NULL);
216 if (agr.sort == NULL)
/* Each break variable is copied into the aggregate dictionary under
   its own name. */
219 for (i = 0; i < agr.break_var_cnt; i++)
220 dict_clone_var_assert (agr.dict, agr.break_vars[i],
221 agr.break_vars[i]->name);
223 /* BREAK must follow the options. */
228 lex_error (_("expecting BREAK"));
232 if (presorted && saw_direction)
233 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
234 "with (A) or (D) has no effect. Output data will be sorted "
235 "the same way as the input data."));
237 /* Read in the aggregate functions. */
239 if (!parse_aggregate_functions (&agr))
242 /* Delete documents. */
/* NOTE(review): presumably guarded by copy_documents; the condition is
   not visible here. */
244 dict_set_documents (agr.dict, NULL);
246 /* Cancel SPLIT FILE. */
247 dict_set_split_vars (agr.dict, NULL, 0);
/* The output case is sized to hold every value in the aggregate
   dictionary. */
251 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
253 /* Output to active file or external file? */
254 if (out_file == NULL)
256 /* The active file will be replaced by the aggregated data,
257 so TEMPORARY is moot. */
260 if (agr.sort != NULL && !presorted)
261 sort_active_file_in_place (agr.sort);
263 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
264 if (agr.sink->class->open != NULL)
265 agr.sink->class->open (agr.sink);
266 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
267 procedure (agr_to_active_file, &agr);
/* The per-case callback only emits a group when the next group starts,
   so the last group is flushed here. */
268 if (agr.case_cnt > 0)
270 dump_aggregate_info (&agr, &agr.agr_case);
271 agr.sink->class->write (agr.sink, &agr.agr_case);
/* The aggregate dictionary takes over as the default dictionary. */
273 dict_destroy (default_dict);
274 default_dict = agr.dict;
276 vfm_source = agr.sink->class->make_source (agr.sink);
277 free_case_sink (agr.sink);
281 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression (), 0);
282 if (agr.writer == NULL)
285 if (agr.sort != NULL && !presorted)
287 /* Sorting is needed. */
288 struct casefile *dst;
289 struct casereader *reader;
292 dst = sort_active_file_to_casefile (agr.sort);
295 reader = casefile_get_destructive_reader (dst);
296 while (casereader_read_xfer (reader, &c))
298 if (aggregate_single_case (&agr, &c, &agr.agr_case))
299 sfm_write_case (agr.writer, &agr.agr_case);
302 casereader_destroy (reader);
303 casefile_destroy (dst);
307 /* Active file is already sorted. */
308 procedure (presorted_agr_to_sysfile, &agr);
/* Flush the final group, as in the active-file branch. */
311 if (agr.case_cnt > 0)
313 dump_aggregate_info (&agr, &agr.agr_case);
314 sfm_write_case (agr.writer, &agr.agr_case);
326 /* Parse all the aggregate functions. */
/* For each specification this reads the target variable names (each
   optionally followed by a string label), the function name (a trailing
   period on the name is stripped; the include_missing setting is
   presumably derived from it), the parenthesized source variables and
   any value arguments, and then appends one agr_var per target to
   agr->agr_vars, creating the target variable in agr->dict.
   The caller treats a zero result as failure.

   NOTE(review), from the lines visible in this view:
   - The call lex_error ("expecting end of command") near the end is the
     only diagnostic here not wrapped in the gettext marker.
   - tokid[strlen (tokid) - 1] assumes tokid is non-empty; confirm that
     the preceding token check guarantees this.
   - The two cleanup loops over the string arguments use different
     guards (one tests n_src, the other does not); confirm both are safe
     when no source variables were parsed.
   - String arguments are duplicated with xstrdup at the length of the
     token, while later comparisons use the source variable width. */
328 parse_aggregate_functions (struct agr_proc *agr)
330 struct agr_var *tail; /* Tail of linked list starting at agr->agr_vars. */
332 /* Parse everything. */
341 const struct agr_func *function;
346 struct variable **src;
360 /* Parse the list of target variables. */
361 while (!lex_match ('='))
363 int n_dest_prev = n_dest;
365 if (!parse_DATA_LIST_vars (&dest, &n_dest,
366 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
369 /* Assign empty labels. */
373 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
374 for (j = n_dest_prev; j < n_dest; j++)
375 dest_label[j] = NULL;
378 if (token == T_STRING)
380 ds_truncate (&tokstr, 255);
381 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
386 /* Get the name of the aggregation function. */
389 lex_error (_("expecting aggregation function"));
394 if (tokid[strlen (tokid) - 1] == '.')
397 tokid[strlen (tokid) - 1] = 0;
400 for (function = agr_func_tab; function->name; function++)
401 if (!strcasecmp (function->name, tokid))
403 if (NULL == function->name)
405 msg (SE, _("Unknown aggregation function %s."), tokid);
408 func_index = function - agr_func_tab;
411 /* Check for leading lparen. */
412 if (!lex_match ('('))
415 func_index = N_NO_VARS;
416 else if (func_index == NU)
417 func_index = NU_NO_VARS;
420 lex_error (_("expecting `('"));
426 /* Parse list of source variables. */
428 int pv_opts = PV_NO_SCRATCH;
430 if (func_index == SUM || func_index == MEAN || func_index == SD)
431 pv_opts |= PV_NUMERIC;
432 else if (function->n_args)
433 pv_opts |= PV_SAME_TYPE;
435 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
439 /* Parse function arguments, for those functions that
440 require arguments. */
441 if (function->n_args != 0)
442 for (i = 0; i < function->n_args; i++)
447 if (token == T_STRING)
449 arg[i].c = xstrdup (ds_c_str (&tokstr));
452 else if (lex_is_number ())
457 msg (SE, _("Missing argument %d to %s."), i + 1,
464 if (type != src[0]->type)
466 msg (SE, _("Arguments to %s must be of same type as "
467 "source variables."),
473 /* Trailing rparen. */
476 lex_error (_("expecting `)'"));
480 /* Now check that the number of source variables match
481 the number of target variables. If we check earlier
482 than this, the user can get very misleading error
483 message, i.e. `AGGREGATE x=SUM(y t).' will get this
484 error message when a proper message would be more
485 like `unknown variable t'. */
488 msg (SE, _("Number of source variables (%d) does not match "
489 "number of target variables (%d)."),
494 if ((func_index == PIN || func_index == POUT
495 || func_index == FIN || func_index == FOUT)
496 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
497 || (src[0]->type == ALPHA
498 && str_compare_rpad (arg[0].c, arg[1].c) > 0)))
500 union value t = arg[0];
504 msg (SW, _("The value arguments passed to the %s function "
505 "are out-of-order. They will be treated as if "
506 "they had been specified in the correct order."),
511 /* Finally add these to the linked list of aggregation
513 for (i = 0; i < n_dest; i++)
515 struct agr_var *v = xmalloc (sizeof *v);
517 /* Add variable to chain. */
518 if (agr->agr_vars != NULL)
526 /* Create the target variable in the aggregate
529 struct variable *destvar;
531 v->function = func_index;
537 if (src[i]->type == ALPHA)
539 v->function |= FSTRING;
540 v->string = xmalloc (src[i]->width);
543 if (function->alpha_type == ALPHA)
544 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
547 assert (v->src->type == NUMERIC
548 || function->alpha_type == NUMERIC);
549 destvar = dict_create_var (agr->dict, dest[i], 0);
552 if ((func_index == N || func_index == NMISS)
553 && dict_get_weight (default_dict) != NULL)
554 destvar->print = destvar->write = f8_2;
556 destvar->print = destvar->write = function->format;
561 destvar = dict_create_var (agr->dict, dest[i], 0);
562 if (func_index == N_NO_VARS
563 && dict_get_weight (default_dict) != NULL)
564 destvar->print = destvar->write = f8_2;
566 destvar->print = destvar->write = function->format;
571 msg (SE, _("Variable name %s is not unique within the "
572 "aggregate file dictionary, which contains "
573 "the aggregate variables and the break "
583 destvar->label = dest_label[i];
584 dest_label[i] = NULL;
590 v->include_missing = include_missing;
596 if (v->src->type == NUMERIC)
597 for (j = 0; j < function->n_args; j++)
598 v->arg[j].f = arg[j].f;
600 for (j = 0; j < function->n_args; j++)
601 v->arg[j].c = xstrdup (arg[j].c);
605 if (src != NULL && src[0]->type == ALPHA)
606 for (i = 0; i < function->n_args; i++)
616 if (!lex_match ('/'))
621 lex_error ("expecting end of command");
627 for (i = 0; i < n_dest; i++)
630 free (dest_label[i]);
636 if (src && n_src && src[0]->type == ALPHA)
637 for (i = 0; i < function->n_args; i++)
/* Releases the resources held by AGR: the system file writer, the sort
   criteria, the break variable array, the saved break case, every
   agr_var on the list, the aggregate dictionary and the output case.
   For string functions the per-variable string arguments are freed,
   using the argument count from agr_func_tab; for SD the moment
   accumulator is destroyed.
   NOTE(review): sfm_close_writer is called even when agr->writer is
   null (the active-file path never opens a writer); confirm that it
   accepts a null pointer.
   NOTE(review): the active-file path of the command stores agr.dict in
   default_dict; whether agr->dict is cleared before this function runs
   is not visible here, so confirm the dictionary is not destroyed while
   still in use. */
650 agr_destroy (struct agr_proc *agr)
652 struct agr_var *iter, *next;
654 sfm_close_writer (agr->writer);
655 if (agr->sort != NULL)
656 sort_destroy_criteria (agr->sort);
657 free (agr->break_vars);
658 case_destroy (&agr->break_case);
659 for (iter = agr->agr_vars; iter; iter = next)
663 if (iter->function & FSTRING)
/* Mask off FSTRING to recover the table row for this function. */
668 n_args = agr_func_tab[iter->function & FUNC].n_args;
669 for (i = 0; i < n_args; i++)
670 free (iter->arg[i].c);
673 else if (iter->function == SD)
674 moments1_destroy (iter->moments);
677 if (agr->dict != NULL)
678 dict_destroy (agr->dict);
680 case_destroy (&agr->agr_case);
/* Forward declarations for the execution helpers below.
   NOTE(review): dump_aggregate_info is already declared with the
   earlier group of prototypes in this file; the repeat is harmless but
   redundant. */
685 static void accumulate_aggregate_info (struct agr_proc *,
686 const struct ccase *);
687 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
689 /* Processes a single case INPUT for aggregation. If output is
690 warranted, writes it to OUTPUT and returns nonzero.
691 Otherwise, returns zero and OUTPUT is unmodified. */
/* The first case only starts a group.  A later case whose break
   variables differ from the saved break case closes the current group:
   its statistics are dumped to OUTPUT and a new group is started from
   INPUT.  In every path INPUT is then accumulated into the current
   group.  The final group is never emitted here; callers flush it with
   dump_aggregate_info once all cases have been read. */
693 aggregate_single_case (struct agr_proc *agr,
694 const struct ccase *input, struct ccase *output)
696 bool finished_group = false;
698 if (agr->case_cnt++ == 0)
699 initialize_aggregate_info (agr, input);
700 else if (case_compare (&agr->break_case, input,
701 agr->break_vars, agr->break_var_cnt))
703 dump_aggregate_info (agr, output);
704 finished_group = true;
706 initialize_aggregate_info (agr, input);
709 accumulate_aggregate_info (agr, input);
710 return finished_group;
713 /* Accumulates aggregation data from the case INPUT. */
/* Walks every agr_var on agr->agr_vars and folds the source value of
   INPUT into its running totals, scaled by the case weight obtained
   from default_dict.  Values that count as missing are handled by the
   first switch (the NMISS variants add the weight to dbl[0]); all other
   values go through the second switch, which dispatches on the function
   code including the FSTRING bit.  Records without a source variable
   are handled by the last switch.
   NOTE(review): the case labels are not visible in this view, so the
   statements below cannot be matched to functions with certainty.
   NOTE(review): the string comparisons call memcmp on iter->arg[].c for
   iter->src->width bytes.  Those argument strings are xstrdup copies of
   the parsed token, so unless they are padded to the variable width
   somewhere not visible here, an argument shorter than the variable
   would be read past its end.  The parser compares the two arguments
   with str_compare_rpad, which suggests padding is not done. */
715 accumulate_aggregate_info (struct agr_proc *agr,
716 const struct ccase *input)
718 struct agr_var *iter;
722 weight = dict_get_case_weight (default_dict, input, &bad_warn);
724 for (iter = agr->agr_vars; iter; iter = iter->next)
727 const union value *v = case_data (input, iter->src->fv);
729 if ((!iter->include_missing && is_missing (v, iter->src))
730 || (iter->include_missing && iter->src->type == NUMERIC
733 switch (iter->function)
736 case NMISS | FSTRING:
737 iter->dbl[0] += weight;
740 case NUMISS | FSTRING:
748 /* This is horrible. There are too many possibilities. */
749 switch (iter->function)
752 iter->dbl[0] += v->f * weight;
756 iter->dbl[0] += v->f * weight;
757 iter->dbl[1] += weight;
760 moments1_add (iter->moments, v->f, weight);
763 iter->dbl[0] = max (iter->dbl[0], v->f);
767 if (memcmp (iter->string, v->s, iter->src->width) < 0)
768 memcpy (iter->string, v->s, iter->src->width);
772 iter->dbl[0] = min (iter->dbl[0], v->f);
776 if (memcmp (iter->string, v->s, iter->src->width) > 0)
777 memcpy (iter->string, v->s, iter->src->width);
782 if (v->f > iter->arg[0].f)
783 iter->dbl[0] += weight;
784 iter->dbl[1] += weight;
788 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
789 iter->dbl[0] += weight;
790 iter->dbl[1] += weight;
794 if (v->f < iter->arg[0].f)
795 iter->dbl[0] += weight;
796 iter->dbl[1] += weight;
800 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
801 iter->dbl[0] += weight;
802 iter->dbl[1] += weight;
806 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
807 iter->dbl[0] += weight;
808 iter->dbl[1] += weight;
812 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
813 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
814 iter->dbl[0] += weight;
815 iter->dbl[1] += weight;
819 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
820 iter->dbl[0] += weight;
821 iter->dbl[1] += weight;
825 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
826 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
827 iter->dbl[0] += weight;
828 iter->dbl[1] += weight;
832 iter->dbl[0] += weight;
845 case FIRST | FSTRING:
848 memcpy (iter->string, v->s, iter->src->width);
857 memcpy (iter->string, v->s, iter->src->width);
861 case NMISS | FSTRING:
863 case NUMISS | FSTRING:
864 /* Our value is not missing or it would have been
865 caught earlier. Nothing to do. */
871 switch (iter->function)
874 iter->dbl[0] += weight;
885 /* We've come to a record that differs from the previous in one or
886 more of the break variables. Make an output record from the
887 accumulated statistics in the OUTPUT case. */
/* First copies the values of each break variable from the saved break
   case into OUTPUT, then fills in one value per agr_var at the position
   of its target variable.  Under COLUMNWISE treatment, a variable whose
   missing flag is set gets a missing result for every function except
   the N, NU, NMISS and NUMISS families (string targets are blanked).
   Ratios whose denominator dbl[1] is zero yield SYSMIS.
   NOTE(review): the case labels and the updates of value_idx are not
   visible in this view. */
889 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
895 for (i = 0; i < agr->break_var_cnt; i++)
897 struct variable *v = agr->break_vars[i];
898 memcpy (case_data_rw (output, value_idx),
899 case_data (&agr->break_case, v->fv),
900 sizeof (union value) * v->nv);
908 for (i = agr->agr_vars; i; i = i->next)
910 union value *v = case_data_rw (output, i->dest->fv);
912 if (agr->missing == COLUMNWISE && i->missing != 0
913 && (i->function & FUNC) != N && (i->function & FUNC) != NU
914 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
916 if (i->dest->type == ALPHA)
917 memset (v->s, ' ', i->dest->width);
926 v->f = i->int1 ? i->dbl[0] : SYSMIS;
929 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
935 /* FIXME: we should use two passes. */
936 moments1_calculate (i->moments, NULL, NULL, &variance,
938 if (variance != SYSMIS)
939 v->f = sqrt (variance);
946 v->f = i->int1 ? i->dbl[0] : SYSMIS;
951 memcpy (v->s, i->string, i->dest->width);
953 memset (v->s, ' ', i->dest->width);
963 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
973 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
985 v->f = i->int1 ? i->dbl[0] : SYSMIS;
987 case FIRST | FSTRING:
990 memcpy (v->s, i->string, i->dest->width);
992 memset (v->s, ' ', i->dest->width);
1001 case NMISS | FSTRING:
1005 case NUMISS | FSTRING:
1015 /* Resets the state for all the aggregate functions. */
1015 /* Resets the state for all the aggregate functions. */
1017 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1019 struct agr_var *iter;
1021 case_destroy (&agr->break_case);
1022 case_clone (&agr->break_case, input);
1024 for (iter = agr->agr_vars; iter; iter = iter->next)
1027 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1028 iter->int1 = iter->int2 = 0;
1029 switch (iter->function)
1032 iter->dbl[0] = DBL_MAX;
1035 memset (iter->string, 255, iter->src->width);
1038 iter->dbl[0] = -DBL_MAX;
1041 memset (iter->string, 0, iter->src->width);
1044 if (iter->moments == NULL)
1045 iter->moments = moments1_create (MOMENT_VARIANCE);
1047 moments1_clear (iter->moments);
/* Per-case callback passed to procedure () when the aggregated data
   replaces the active file.  AGR_ is the struct agr_proc for the run.
   When the case C closes a group, the finished aggregate case is
   written to the storage sink.
   NOTE(review): the return statement is not visible in this view. */
1055 /* Aggregate each case as it comes through. Cases which aren't needed
1058 agr_to_active_file (struct ccase *c, void *agr_)
1060 struct agr_proc *agr = agr_;
1062 if (aggregate_single_case (agr, c, &agr->agr_case))
1063 agr->sink->class->write (agr->sink, &agr->agr_case);
/* Per-case callback passed to procedure () when the active file is
   already sorted and the output goes to a system file.  AGR_ is the
   struct agr_proc for the run.  When the case C closes a group, the
   finished aggregate case is written with the system file writer.
   NOTE(review): the return statement is not visible in this view. */
1068 /* Aggregate the current case and output it if we passed a
1071 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1073 struct agr_proc *agr = agr_;
1075 if (aggregate_single_case (agr, c, &agr->agr_case))
1076 sfm_write_case (agr->writer, &agr->agr_case);