1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
42 /* Specifies how to make an aggregate variable. */
45 struct agr_var *next; /* Next in list. */
47 /* Collected during parsing. */
48 struct variable *src; /* Source variable. */
49 struct variable *dest; /* Target variable. */
50 int function; /* Function. */
51 int include_missing; /* 1=Include user-missing values. */
52 union value arg[2]; /* Arguments. */
54 /* Accumulated during AGGREGATE execution. */
59 struct moments1 *moments;
62 /* Aggregation functions. */
65 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
66 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
67 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
68 FUNC = 0x1f, /* Function mask. */
69 FSTRING = 1<<5, /* String function bit. */
72 /* Attributes of an aggregation function. */
75 const char *name; /* Aggregation function name. */
76 int n_args; /* Number of arguments. */
77 int alpha_type; /* When given ALPHA arguments, output type. */
78 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
81 /* Attributes of aggregation functions. */
82 static const struct agr_func agr_func_tab[] =
84 {"<NONE>", 0, -1, {0, 0, 0}},
85 {"SUM", 0, -1, {FMT_F, 8, 2}},
86 {"MEAN", 0, -1, {FMT_F, 8, 2}},
87 {"SD", 0, -1, {FMT_F, 8, 2}},
88 {"MAX", 0, ALPHA, {-1, -1, -1}},
89 {"MIN", 0, ALPHA, {-1, -1, -1}},
90 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
91 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
93 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
95 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
97 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
99 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"FIRST", 0, ALPHA, {-1, -1, -1}},
103 {"LAST", 0, ALPHA, {-1, -1, -1}},
104 {NULL, 0, -1, {-1, -1, -1}},
105 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
109 /* Missing value types. */
110 enum missing_treatment
112 ITEMWISE, /* Missing values item by item. */
113 COLUMNWISE /* Missing values column by column. */
116 /* An entire AGGREGATE procedure. */
119 /* We have either an output file or a sink. */
120 struct sfm_writer *writer; /* Output file, or null if none. */
121 struct case_sink *sink; /* Sink, or null if none. */
123 /* Break variables. */
124 struct sort_criteria *sort; /* Sort criteria. */
125 struct variable **break_vars; /* Break variables. */
126 size_t break_var_cnt; /* Number of break variables. */
127 struct ccase break_case; /* Last values of break variables. */
129 enum missing_treatment missing; /* How to treat missing values. */
130 struct agr_var *agr_vars; /* First aggregate variable. */
131 struct dictionary *dict; /* Aggregate dictionary. */
132 int case_cnt; /* Counts aggregated cases. */
133 struct ccase agr_case; /* Aggregate case for output. */
136 static void initialize_aggregate_info (struct agr_proc *,
137 const struct ccase *);
140 static int parse_aggregate_functions (struct agr_proc *);
141 static void agr_destroy (struct agr_proc *);
142 static int aggregate_single_case (struct agr_proc *agr,
143 const struct ccase *input,
144 struct ccase *output);
145 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
147 /* Aggregating to the active file. */
148 static int agr_to_active_file (struct ccase *, void *aux);
150 /* Aggregating to a system file. */
151 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
155 /* Parses and executes the AGGREGATE procedure. */
160 struct file_handle *out_file = NULL;
162 bool copy_documents = false;
163 bool presorted = false;
166 memset(&agr, 0 , sizeof (agr));
167 agr.missing = ITEMWISE;
168 case_nullify (&agr.break_case);
170 agr.dict = dict_create ();
171 dict_set_label (agr.dict, dict_get_label (default_dict));
172 dict_set_documents (agr.dict, dict_get_documents (default_dict));
174 /* OUTFILE subcommand must be first. */
175 if (!lex_force_match_id ("OUTFILE"))
178 if (!lex_match ('*'))
180 out_file = fh_parse ();
181 if (out_file == NULL)
185 /* Read most of the subcommands. */
190 if (lex_match_id ("MISSING"))
193 if (!lex_match_id ("COLUMNWISE"))
195 lex_error (_("while expecting COLUMNWISE"));
198 agr.missing = COLUMNWISE;
200 else if (lex_match_id ("DOCUMENT"))
201 copy_documents = true;
202 else if (lex_match_id ("PRESORTED"))
204 else if (lex_match_id ("BREAK"))
209 agr.sort = sort_parse_criteria (default_dict,
210 &agr.break_vars, &agr.break_var_cnt,
212 if (agr.sort == NULL)
215 for (i = 0; i < agr.break_var_cnt; i++)
216 dict_clone_var_assert (agr.dict, agr.break_vars[i],
217 agr.break_vars[i]->name);
219 /* BREAK must follow the options. */
224 lex_error (_("expecting BREAK"));
228 if (presorted && saw_direction)
229 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
230 "with (A) or (D) has no effect. Output data will be sorted "
231 "the same way as the input data."));
233 /* Read in the aggregate functions. */
235 if (!parse_aggregate_functions (&agr))
238 /* Delete documents. */
240 dict_set_documents (agr.dict, NULL);
242 /* Cancel SPLIT FILE. */
243 dict_set_split_vars (agr.dict, NULL, 0);
247 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
249 /* Output to active file or external file? */
250 if (out_file == NULL)
252 /* The active file will be replaced by the aggregated data,
253 so TEMPORARY is moot. */
256 if (agr.sort != NULL && !presorted)
257 sort_active_file_in_place (agr.sort);
259 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
260 if (agr.sink->class->open != NULL)
261 agr.sink->class->open (agr.sink);
262 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
263 procedure (agr_to_active_file, &agr);
264 if (agr.case_cnt > 0)
266 dump_aggregate_info (&agr, &agr.agr_case);
267 agr.sink->class->write (agr.sink, &agr.agr_case);
269 dict_destroy (default_dict);
270 default_dict = agr.dict;
272 vfm_source = agr.sink->class->make_source (agr.sink);
273 free_case_sink (agr.sink);
277 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression (), 0);
278 if (agr.writer == NULL)
281 if (agr.sort != NULL && !presorted)
283 /* Sorting is needed. */
284 struct casefile *dst;
285 struct casereader *reader;
288 dst = sort_active_file_to_casefile (agr.sort);
291 reader = casefile_get_destructive_reader (dst);
292 while (casereader_read_xfer (reader, &c))
294 if (aggregate_single_case (&agr, &c, &agr.agr_case))
295 sfm_write_case (agr.writer, &agr.agr_case);
298 casereader_destroy (reader);
299 casefile_destroy (dst);
303 /* Active file is already sorted. */
304 procedure (presorted_agr_to_sysfile, &agr);
307 if (agr.case_cnt > 0)
309 dump_aggregate_info (&agr, &agr.agr_case);
310 sfm_write_case (agr.writer, &agr.agr_case);
322 /* Parse all the aggregate functions. */
324 parse_aggregate_functions (struct agr_proc *agr)
326 struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
328 /* Parse everything. */
337 const struct agr_func *function;
342 struct variable **src;
356 /* Parse the list of target variables. */
357 while (!lex_match ('='))
359 int n_dest_prev = n_dest;
361 if (!parse_DATA_LIST_vars (&dest, &n_dest,
362 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
365 /* Assign empty labels. */
369 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
370 for (j = n_dest_prev; j < n_dest; j++)
371 dest_label[j] = NULL;
374 if (token == T_STRING)
376 ds_truncate (&tokstr, 255);
377 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
382 /* Get the name of the aggregation function. */
385 lex_error (_("expecting aggregation function"));
390 if (tokid[strlen (tokid) - 1] == '.')
393 tokid[strlen (tokid) - 1] = 0;
396 for (function = agr_func_tab; function->name; function++)
397 if (!strcasecmp (function->name, tokid))
399 if (NULL == function->name)
401 msg (SE, _("Unknown aggregation function %s."), tokid);
404 func_index = function - agr_func_tab;
407 /* Check for leading lparen. */
408 if (!lex_match ('('))
411 func_index = N_NO_VARS;
412 else if (func_index == NU)
413 func_index = NU_NO_VARS;
416 lex_error (_("expecting `('"));
422 /* Parse list of source variables. */
424 int pv_opts = PV_NO_SCRATCH;
426 if (func_index == SUM || func_index == MEAN || func_index == SD)
427 pv_opts |= PV_NUMERIC;
428 else if (function->n_args)
429 pv_opts |= PV_SAME_TYPE;
431 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
435 /* Parse function arguments, for those functions that
436 require arguments. */
437 if (function->n_args != 0)
438 for (i = 0; i < function->n_args; i++)
443 if (token == T_STRING)
445 arg[i].c = xstrdup (ds_c_str (&tokstr));
448 else if (lex_is_number ())
453 msg (SE, _("Missing argument %d to %s."), i + 1,
460 if (type != src[0]->type)
462 msg (SE, _("Arguments to %s must be of same type as "
463 "source variables."),
469 /* Trailing rparen. */
472 lex_error (_("expecting `)'"));
476 /* Now check that the number of source variables match
477 the number of target variables. If we check earlier
478 than this, the user can get very misleading error
479 message, i.e. `AGGREGATE x=SUM(y t).' will get this
480 error message when a proper message would be more
481 like `unknown variable t'. */
484 msg (SE, _("Number of source variables (%d) does not match "
485 "number of target variables (%d)."),
490 if ((func_index == PIN || func_index == POUT
491 || func_index == FIN || func_index == FOUT)
492 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
493 || (src[0]->type == ALPHA
494 && str_compare_rpad (arg[0].c, arg[1].c) > 0)))
496 union value t = arg[0];
500 msg (SW, _("The value arguments passed to the %s function "
501 "are out-of-order. They will be treated as if "
502 "they had been specified in the correct order."),
507 /* Finally add these to the linked list of aggregation
509 for (i = 0; i < n_dest; i++)
511 struct agr_var *v = xmalloc (sizeof *v);
513 /* Add variable to chain. */
514 if (agr->agr_vars != NULL)
522 /* Create the target variable in the aggregate
525 static const struct fmt_spec f8_2 = {FMT_F, 8, 2};
526 struct variable *destvar;
528 v->function = func_index;
534 if (src[i]->type == ALPHA)
536 v->function |= FSTRING;
537 v->string = xmalloc (src[i]->width);
540 if (function->alpha_type == ALPHA)
541 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
544 assert (v->src->type == NUMERIC
545 || function->alpha_type == NUMERIC);
546 destvar = dict_create_var (agr->dict, dest[i], 0);
549 if ((func_index == N || func_index == NMISS)
550 && dict_get_weight (default_dict) != NULL)
551 destvar->print = destvar->write = f8_2;
553 destvar->print = destvar->write = function->format;
558 destvar = dict_create_var (agr->dict, dest[i], 0);
559 if (func_index == N_NO_VARS
560 && dict_get_weight (default_dict) != NULL)
561 destvar->print = destvar->write = f8_2;
563 destvar->print = destvar->write = function->format;
568 msg (SE, _("Variable name %s is not unique within the "
569 "aggregate file dictionary, which contains "
570 "the aggregate variables and the break "
580 destvar->label = dest_label[i];
581 dest_label[i] = NULL;
587 v->include_missing = include_missing;
593 if (v->src->type == NUMERIC)
594 for (j = 0; j < function->n_args; j++)
595 v->arg[j].f = arg[j].f;
597 for (j = 0; j < function->n_args; j++)
598 v->arg[j].c = xstrdup (arg[j].c);
602 if (src != NULL && src[0]->type == ALPHA)
603 for (i = 0; i < function->n_args; i++)
613 if (!lex_match ('/'))
618 lex_error ("expecting end of command");
624 for (i = 0; i < n_dest; i++)
627 free (dest_label[i]);
633 if (src && n_src && src[0]->type == ALPHA)
634 for (i = 0; i < function->n_args; i++)
647 agr_destroy (struct agr_proc *agr)
649 struct agr_var *iter, *next;
651 sfm_close_writer (agr->writer);
652 if (agr->sort != NULL)
653 sort_destroy_criteria (agr->sort);
654 free (agr->break_vars);
655 case_destroy (&agr->break_case);
656 for (iter = agr->agr_vars; iter; iter = next)
660 if (iter->function & FSTRING)
665 n_args = agr_func_tab[iter->function & FUNC].n_args;
666 for (i = 0; i < n_args; i++)
667 free (iter->arg[i].c);
670 else if (iter->function == SD)
671 moments1_destroy (iter->moments);
674 if (agr->dict != NULL)
675 dict_destroy (agr->dict);
677 case_destroy (&agr->agr_case);
682 static void accumulate_aggregate_info (struct agr_proc *,
683 const struct ccase *);
684 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
686 /* Processes a single case INPUT for aggregation. If output is
687 warranted, writes it to OUTPUT and returns nonzero.
688 Otherwise, returns zero and OUTPUT is unmodified. */
690 aggregate_single_case (struct agr_proc *agr,
691 const struct ccase *input, struct ccase *output)
693 bool finished_group = false;
695 if (agr->case_cnt++ == 0)
696 initialize_aggregate_info (agr, input);
697 else if (case_compare (&agr->break_case, input,
698 agr->break_vars, agr->break_var_cnt))
700 dump_aggregate_info (agr, output);
701 finished_group = true;
703 initialize_aggregate_info (agr, input);
706 accumulate_aggregate_info (agr, input);
707 return finished_group;
710 /* Accumulates aggregation data from the case INPUT. */
712 accumulate_aggregate_info (struct agr_proc *agr,
713 const struct ccase *input)
715 struct agr_var *iter;
719 weight = dict_get_case_weight (default_dict, input, &bad_warn);
721 for (iter = agr->agr_vars; iter; iter = iter->next)
724 const union value *v = case_data (input, iter->src->fv);
726 if ((!iter->include_missing && is_missing (v, iter->src))
727 || (iter->include_missing && iter->src->type == NUMERIC
730 switch (iter->function)
733 case NMISS | FSTRING:
734 iter->dbl[0] += weight;
737 case NUMISS | FSTRING:
745 /* This is horrible. There are too many possibilities. */
746 switch (iter->function)
749 iter->dbl[0] += v->f * weight;
753 iter->dbl[0] += v->f * weight;
754 iter->dbl[1] += weight;
757 moments1_add (iter->moments, v->f, weight);
760 iter->dbl[0] = max (iter->dbl[0], v->f);
764 if (memcmp (iter->string, v->s, iter->src->width) < 0)
765 memcpy (iter->string, v->s, iter->src->width);
769 iter->dbl[0] = min (iter->dbl[0], v->f);
773 if (memcmp (iter->string, v->s, iter->src->width) > 0)
774 memcpy (iter->string, v->s, iter->src->width);
779 if (v->f > iter->arg[0].f)
780 iter->dbl[0] += weight;
781 iter->dbl[1] += weight;
785 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
786 iter->dbl[0] += weight;
787 iter->dbl[1] += weight;
791 if (v->f < iter->arg[0].f)
792 iter->dbl[0] += weight;
793 iter->dbl[1] += weight;
797 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
798 iter->dbl[0] += weight;
799 iter->dbl[1] += weight;
803 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
804 iter->dbl[0] += weight;
805 iter->dbl[1] += weight;
809 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
810 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
811 iter->dbl[0] += weight;
812 iter->dbl[1] += weight;
816 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
817 iter->dbl[0] += weight;
818 iter->dbl[1] += weight;
822 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
823 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
824 iter->dbl[0] += weight;
825 iter->dbl[1] += weight;
829 iter->dbl[0] += weight;
842 case FIRST | FSTRING:
845 memcpy (iter->string, v->s, iter->src->width);
854 memcpy (iter->string, v->s, iter->src->width);
858 case NMISS | FSTRING:
860 case NUMISS | FSTRING:
861 /* Our value is not missing or it would have been
862 caught earlier. Nothing to do. */
868 switch (iter->function)
871 iter->dbl[0] += weight;
882 /* We've come to a record that differs from the previous in one or
883 more of the break variables. Make an output record from the
884 accumulated statistics in the OUTPUT case. */
886 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
892 for (i = 0; i < agr->break_var_cnt; i++)
894 struct variable *v = agr->break_vars[i];
895 memcpy (case_data_rw (output, value_idx),
896 case_data (&agr->break_case, v->fv),
897 sizeof (union value) * v->nv);
905 for (i = agr->agr_vars; i; i = i->next)
907 union value *v = case_data_rw (output, i->dest->fv);
909 if (agr->missing == COLUMNWISE && i->missing != 0
910 && (i->function & FUNC) != N && (i->function & FUNC) != NU
911 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
913 if (i->dest->type == ALPHA)
914 memset (v->s, ' ', i->dest->width);
923 v->f = i->int1 ? i->dbl[0] : SYSMIS;
926 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
932 /* FIXME: we should use two passes. */
933 moments1_calculate (i->moments, NULL, NULL, &variance,
935 if (variance != SYSMIS)
936 v->f = sqrt (variance);
943 v->f = i->int1 ? i->dbl[0] : SYSMIS;
948 memcpy (v->s, i->string, i->dest->width);
950 memset (v->s, ' ', i->dest->width);
960 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
970 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
982 v->f = i->int1 ? i->dbl[0] : SYSMIS;
984 case FIRST | FSTRING:
987 memcpy (v->s, i->string, i->dest->width);
989 memset (v->s, ' ', i->dest->width);
998 case NMISS | FSTRING:
1002 case NUMISS | FSTRING:
1012 /* Resets the state for all the aggregate functions. */
1014 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1016 struct agr_var *iter;
1018 case_destroy (&agr->break_case);
1019 case_clone (&agr->break_case, input);
1021 for (iter = agr->agr_vars; iter; iter = iter->next)
1024 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1025 iter->int1 = iter->int2 = 0;
1026 switch (iter->function)
1029 iter->dbl[0] = DBL_MAX;
1032 memset (iter->string, 255, iter->src->width);
1035 iter->dbl[0] = -DBL_MAX;
1038 memset (iter->string, 0, iter->src->width);
1041 if (iter->moments == NULL)
1042 iter->moments = moments1_create (MOMENT_VARIANCE);
1044 moments1_clear (iter->moments);
1052 /* Aggregate each case as it comes through. Cases which aren't needed
1055 agr_to_active_file (struct ccase *c, void *agr_)
1057 struct agr_proc *agr = agr_;
1059 if (aggregate_single_case (agr, c, &agr->agr_case))
1060 agr->sink->class->write (agr->sink, &agr->agr_case);
1065 /* Aggregate the current case and output it if we passed a
1068 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1070 struct agr_proc *agr = agr_;
1072 if (aggregate_single_case (agr, c, &agr->agr_case))
1073 sfm_write_case (agr->writer, &agr->agr_case);