1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
43 /* Specifies how to make an aggregate variable. */
/* NOTE(review): the `struct agr_var' header line and some members are not
   visible in this excerpt; code elsewhere in this file also reads and
   writes dbl[], int1, int2, string and missing on objects of this type. */
46 struct agr_var *next; /* Next in list. */
48 /* Collected during parsing. */
49 struct variable *src; /* Source variable. */
50 struct variable *dest; /* Target variable. */
/* Holds an aggregation function index, OR'ed with FSTRING when the source
   variable is ALPHA; mask with FUNC to recover the plain index. */
51 int function; /* Function. */
52 int include_missing; /* 1=Include user-missing values. */
/* Two slots because no row of agr_func_tab asks for more than two
   arguments.  Numeric arguments live in .f; string arguments live in .c
   as heap copies that agr_destroy frees. */
53 union value arg[2]; /* Arguments. */
55 /* Accumulated during AGGREGATE execution. */
/* Created on demand by initialize_aggregate_info and destroyed by
   agr_destroy for variables whose function is SD. */
60 struct moments1 *moments;
63 /* Aggregation functions. */
/* The enumerators from NONE through NU_NO_VARS appear in the same order as
   the rows of agr_func_tab, so each value doubles as that function's table
   index.  N_AGR_FUNCS lines up with the table's NULL-name row, and
   N_NO_VARS and NU_NO_VARS with the two rows that follow it. */
66 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
67 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
68 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
/* Every index above fits in the low five bits, which leaves bit 5 free to
   be OR'ed into a stored function code as a flag. */
69 FUNC = 0x1f, /* Function mask. */
70 FSTRING = 1<<5, /* String function bit. */
73 /* Attributes of an aggregation function. */
/* One row of agr_func_tab.  A NULL name marks the row that ends lookup by
   name. */
76 const char *name; /* Aggregation function name. */
/* The values used in agr_func_tab are 0, 1 and 2. */
77 int n_args; /* Number of arguments. */
/* agr_func_tab stores ALPHA, NUMERIC or -1 here. */
78 int alpha_type; /* When given ALPHA arguments, output type. */
/* Rows whose alpha_type is ALPHA carry {-1, -1, -1} in this member. */
79 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
82 /* Attributes of aggregation functions. */
/* Each row is {name, n_args, alpha_type, format}.  The table is consulted
   in two ways in this file: by name, scanning from the first row until a
   NULL name is reached, and by index, using an aggregation function code
   masked with FUNC.  Row order therefore has to stay in step with the
   aggregation function enumerators. */
83 static const struct agr_func agr_func_tab[] =
85 {"<NONE>", 0, -1, {0, 0, 0}},
86 {"SUM", 0, -1, {FMT_F, 8, 2}},
87 {"MEAN", 0, -1, {FMT_F, 8, 2}},
88 {"SD", 0, -1, {FMT_F, 8, 2}},
89 {"MAX", 0, ALPHA, {-1, -1, -1}},
90 {"MIN", 0, ALPHA, {-1, -1, -1}},
91 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
93 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
95 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
97 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
99 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
103 {"FIRST", 0, ALPHA, {-1, -1, -1}},
104 {"LAST", 0, ALPHA, {-1, -1, -1}},
/* NULL name: ends the by-name scan.  The two rows after it are never
   reached by that scan; they sit at indices N_NO_VARS and NU_NO_VARS. */
105 {NULL, 0, -1, {-1, -1, -1}},
106 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
107 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
110 /* Missing value types. */
/* The command starts out with ITEMWISE and switches to COLUMNWISE when the
   MISSING subcommand is given.  dump_aggregate_info tests for COLUMNWISE
   together with a nonzero `missing' member of the aggregate variable,
   leaving N, NU, NMISS and NUMISS out of that test. */
111 enum missing_treatment
113 ITEMWISE, /* Missing values item by item. */
114 COLUMNWISE /* Missing values column by column. */
117 /* An entire AGGREGATE procedure. */
/* NOTE(review): the `struct agr_proc' header line is not visible in this
   excerpt.  The command function zero-fills one of these with memset
   before filling it in. */
120 /* We have either an output file or a sink. */
/* writer comes from sfm_open_writer when an output file handle was given;
   sink comes from create_case_sink when aggregating into the active file. */
121 struct sfm_writer *writer; /* Output file, or null if none. */
122 struct case_sink *sink; /* Sink, or null if none. */
124 /* Break variables. */
/* sort, break_vars and break_var_cnt are all filled in by one call to
   sort_parse_criteria while parsing BREAK. */
125 struct sort_criteria *sort; /* Sort criteria. */
126 struct variable **break_vars; /* Break variables. */
127 size_t break_var_cnt; /* Number of break variables. */
/* Re-cloned from the input case by initialize_aggregate_info each time a
   group starts, then compared against later cases with case_compare. */
128 struct ccase break_case; /* Last values of break variables. */
130 enum missing_treatment missing; /* How to treat missing values. */
131 struct agr_var *agr_vars; /* First aggregate variable. */
132 struct dictionary *dict; /* Aggregate dictionary. */
/* Incremented once per input case by aggregate_single_case; callers test
   it for > 0 afterwards to decide whether a last group must be flushed. */
133 int case_cnt; /* Counts aggregated cases. */
134 struct ccase agr_case; /* Aggregate case for output. */
/* Forward declarations for static helpers that are defined later in this
   file. */
137 static void initialize_aggregate_info (struct agr_proc *,
138 const struct ccase *);
141 static int parse_aggregate_functions (struct agr_proc *);
142 static void agr_destroy (struct agr_proc *);
143 static int aggregate_single_case (struct agr_proc *agr,
144 const struct ccase *input,
145 struct ccase *output);
/* This function is declared a second time further down, next to
   accumulate_aggregate_info. */
146 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
148 /* Aggregating to the active file. */
/* Handed to procedure() as the per-case callback; AUX is the agr_proc. */
149 static int agr_to_active_file (struct ccase *, void *aux);
151 /* Aggregating to a system file. */
/* Handed to procedure() on the system-file path when no sort is performed;
   AUX is the agr_proc. */
152 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
156 /* Parses and executes the AGGREGATE procedure. */
/* NOTE(review): this excerpt omits some original lines (the function
   header, braces and a number of statements), so the comments below
   describe only the statements that are visible.

   Visible flow: zero an agr_proc and give it a fresh dictionary that takes
   its label and documents from default_dict; require OUTFILE first; accept
   MISSING, DOCUMENT, PRESORTED and BREAK; parse the aggregate functions;
   then aggregate either into the active file (out_file == NULL) or into a
   system file opened with sfm_open_writer. */
161 struct file_handle *out_file = NULL;
163 bool copy_documents = false;
164 bool presorted = false;
167 memset(&agr, 0 , sizeof (agr));
168 agr.missing = ITEMWISE;
169 case_nullify (&agr.break_case);
171 agr.dict = dict_create ();
172 dict_set_label (agr.dict, dict_get_label (default_dict));
173 dict_set_documents (agr.dict, dict_get_documents (default_dict));
175 /* OUTFILE subcommand must be first. */
176 if (!lex_force_match_id ("OUTFILE"))
/* A `*' leaves out_file NULL, which later selects the active-file path;
   anything else has to parse as a file handle. */
179 if (!lex_match ('*'))
181 out_file = fh_parse ();
182 if (out_file == NULL)
186 /* Read most of the subcommands. */
191 if (lex_match_id ("MISSING"))
194 if (!lex_match_id ("COLUMNWISE"))
196 lex_error (_("while expecting COLUMNWISE"));
199 agr.missing = COLUMNWISE;
201 else if (lex_match_id ("DOCUMENT"))
202 copy_documents = true;
203 else if (lex_match_id ("PRESORTED"))
205 else if (lex_match_id ("BREAK"))
/* One call yields the sort criteria, the break variable array and its
   length, and reports whether any sort direction was written out. */
210 agr.sort = sort_parse_criteria (default_dict,
211 &agr.break_vars, &agr.break_var_cnt,
212 &saw_direction, NULL);
213 if (agr.sort == NULL)
/* Every break variable is cloned into the aggregate dictionary under its
   own name. */
216 for (i = 0; i < agr.break_var_cnt; i++)
217 dict_clone_var_assert (agr.dict, agr.break_vars[i],
218 agr.break_vars[i]->name);
220 /* BREAK must follow the options. */
225 lex_error (_("expecting BREAK"));
229 if (presorted && saw_direction)
230 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
231 "with (A) or (D) has no effect. Output data will be sorted "
232 "the same way as the input data."));
234 /* Read in the aggregate functions. */
236 if (!parse_aggregate_functions (&agr))
239 /* Delete documents. */
241 dict_set_documents (agr.dict, NULL);
243 /* Cancel SPLIT FILE. */
244 dict_set_split_vars (agr.dict, NULL, 0);
/* The output case is sized from the finished aggregate dictionary. */
248 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
250 /* Output to active file or external file? */
251 if (out_file == NULL)
253 /* The active file will be replaced by the aggregated data,
254 so TEMPORARY is moot. */
/* Sort only when BREAK produced criteria and PRESORTED was not given. */
257 if (agr.sort != NULL && !presorted)
258 sort_active_file_in_place (agr.sort);
260 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
261 if (agr.sink->class->open != NULL)
262 agr.sink->class->open (agr.sink);
263 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
264 procedure (agr_to_active_file, &agr);
/* aggregate_single_case emits a group only when the first case of the
   next group arrives, so the last group is still pending here and is
   flushed by hand. */
265 if (agr.case_cnt > 0)
267 dump_aggregate_info (&agr, &agr.agr_case);
268 agr.sink->class->write (agr.sink, &agr.agr_case);
/* The aggregate dictionary takes over as default_dict, and the sink is
   converted into the new source before being freed. */
270 dict_destroy (default_dict);
271 default_dict = agr.dict;
273 vfm_source = agr.sink->class->make_source (agr.sink);
274 free_case_sink (agr.sink);
278 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression (), 0);
279 if (agr.writer == NULL)
282 if (agr.sort != NULL && !presorted)
284 /* Sorting is needed. */
285 struct casefile *dst;
286 struct casereader *reader;
/* The sorted cases go to a separate casefile, which is then read back and
   aggregated case by case. */
289 dst = sort_active_file_to_casefile (agr.sort);
292 reader = casefile_get_destructive_reader (dst);
293 while (casereader_read_xfer (reader, &c))
295 if (aggregate_single_case (&agr, &c, &agr.agr_case))
296 sfm_write_case (agr.writer, &agr.agr_case);
299 casereader_destroy (reader);
300 casefile_destroy (dst);
304 /* Active file is already sorted. */
305 procedure (presorted_agr_to_sysfile, &agr);
/* As on the active-file path, the last group is still pending and is
   written out here. */
308 if (agr.case_cnt > 0)
310 dump_aggregate_info (&agr, &agr.agr_case);
311 sfm_write_case (agr.writer, &agr.agr_case);
323 /* Parse all the aggregate functions. */
/* NOTE(review): this excerpt omits many original lines (return type,
   braces, local declarations, error exits), so the added comments describe
   only the statements that are visible.

   For each specification the visible code reads target names with
   optional string labels up to `=', looks the function name up in
   agr_func_tab, reads the source variables and any function arguments,
   and then appends one agr_var per target to AGR's list. */
325 parse_aggregate_functions (struct agr_proc *agr)
327 struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
329 /* Parse everything. */
338 const struct agr_func *function;
343 struct variable **src;
357 /* Parse the list of target variables. */
358 while (!lex_match ('='))
360 int n_dest_prev = n_dest;
362 if (!parse_DATA_LIST_vars (&dest, &n_dest,
363 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
366 /* Assign empty labels. */
/* Newly added slots start as NULL; a slot is filled only when a string
   token follows the target name. */
370 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
371 for (j = n_dest_prev; j < n_dest; j++)
372 dest_label[j] = NULL;
375 if (token == T_STRING)
/* The label text is cut to 255 characters before it is copied. */
377 ds_truncate (&tokstr, 255);
378 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
383 /* Get the name of the aggregation function. */
386 lex_error (_("expecting aggregation function"));
/* A trailing `.' is stripped from the identifier before lookup.
   NOTE(review): presumably this is also where include_missing is set; the
   assignment is not visible in this excerpt. */
391 if (tokid[strlen (tokid) - 1] == '.')
394 tokid[strlen (tokid) - 1] = 0;
/* Case-insensitive scan of agr_func_tab that stops at the NULL-name row,
   so the rows after that row are never matched by name. */
397 for (function = agr_func_tab; function->name; function++)
398 if (!strcasecmp (function->name, tokid))
400 if (NULL == function->name)
402 msg (SE, _("Unknown aggregation function %s."), tokid);
/* The row's offset in the table is the function's enumerator value. */
405 func_index = function - agr_func_tab;
408 /* Check for leading lparen. */
/* With no `(' the visible code switches to the no-variable codes and
   otherwise reports an error.  NOTE(review): the test guarding the
   N_NO_VARS assignment is not visible; presumably it is func_index == N. */
409 if (!lex_match ('('))
412 func_index = N_NO_VARS;
413 else if (func_index == NU)
414 func_index = NU_NO_VARS;
417 lex_error (_("expecting `('"));
423 /* Parse list of source variables. */
425 int pv_opts = PV_NO_SCRATCH;
/* SUM, MEAN and SD accept numeric sources only; functions that take
   arguments need all of their sources to share one type. */
427 if (func_index == SUM || func_index == MEAN || func_index == SD)
428 pv_opts |= PV_NUMERIC;
429 else if (function->n_args)
430 pv_opts |= PV_SAME_TYPE;
432 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
436 /* Parse function arguments, for those functions that
437 require arguments. */
438 if (function->n_args != 0)
439 for (i = 0; i < function->n_args; i++)
444 if (token == T_STRING)
/* String arguments are copied to the heap here. */
446 arg[i].c = xstrdup (ds_c_str (&tokstr));
449 else if (lex_is_number ())
454 msg (SE, _("Missing argument %d to %s."), i + 1,
/* The argument's type is checked against the first source variable only. */
461 if (type != src[0]->type)
463 msg (SE, _("Arguments to %s must be of same type as "
464 "source variables."),
470 /* Trailing rparen. */
473 lex_error (_("expecting `)'"));
477 /* Now check that the number of source variables match
478 the number of target variables. If we check earlier
479 than this, the user can get very misleading error
480 message, i.e. `AGGREGATE x=SUM(y t).' will get this
481 error message when a proper message would be more
482 like `unknown variable t'. */
485 msg (SE, _("Number of source variables (%d) does not match "
486 "number of target variables (%d)."),
/* For the two-argument range functions, a pair given in descending order
   is detected here (numeric comparison, or str_compare_rpad for strings)
   and a warning is issued. */
491 if ((func_index == PIN || func_index == POUT
492 || func_index == FIN || func_index == FOUT)
493 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
494 || (src[0]->type == ALPHA
495 && str_compare_rpad (arg[0].c, arg[1].c) > 0)))
497 union value t = arg[0];
501 msg (SW, _("The value arguments passed to the %s function "
502 "are out-of-order. They will be treated as if "
503 "they had been specified in the correct order."),
508 /* Finally add these to the linked list of aggregation
510 for (i = 0; i < n_dest; i++)
512 struct agr_var *v = xmalloc (sizeof *v);
514 /* Add variable to chain. */
515 if (agr->agr_vars != NULL)
523 /* Create the target variable in the aggregate
526 struct variable *destvar;
528 v->function = func_index;
534 if (src[i]->type == ALPHA)
536 v->function |= FSTRING;
537 v->string = xmalloc (src[i]->width);
540 if (function->alpha_type == ALPHA)
541 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
544 assert (v->src->type == NUMERIC
545 || function->alpha_type == NUMERIC);
546 destvar = dict_create_var (agr->dict, dest[i], 0);
549 if ((func_index == N || func_index == NMISS)
550 && dict_get_weight (default_dict) != NULL)
551 destvar->print = destvar->write = f8_2;
553 destvar->print = destvar->write = function->format;
558 destvar = dict_create_var (agr->dict, dest[i], 0);
559 if (func_index == N_NO_VARS
560 && dict_get_weight (default_dict) != NULL)
561 destvar->print = destvar->write = f8_2;
563 destvar->print = destvar->write = function->format;
568 msg (SE, _("Variable name %s is not unique within the "
569 "aggregate file dictionary, which contains "
570 "the aggregate variables and the break "
580 destvar->label = dest_label[i];
581 dest_label[i] = NULL;
587 v->include_missing = include_missing;
593 if (v->src->type == NUMERIC)
594 for (j = 0; j < function->n_args; j++)
595 v->arg[j].f = arg[j].f;
597 for (j = 0; j < function->n_args; j++)
598 v->arg[j].c = xstrdup (arg[j].c);
602 if (src != NULL && src[0]->type == ALPHA)
603 for (i = 0; i < function->n_args; i++)
613 if (!lex_match ('/'))
618 lex_error ("expecting end of command");
624 for (i = 0; i < n_dest; i++)
627 free (dest_label[i]);
633 if (src && n_src && src[0]->type == ALPHA)
634 for (i = 0; i < function->n_args; i++)
/* Releases what AGR owns: closes the writer, destroys the sort criteria
   when present, frees the break variable array, destroys both cases,
   walks the aggregate variable list, and destroys the dictionary when
   agr->dict is non-null.
   NOTE(review): this excerpt omits some original lines of the function
   (return type, braces, and parts of the list loop). */
647 agr_destroy (struct agr_proc *agr)
649 struct agr_var *iter, *next;
651 sfm_close_writer (agr->writer);
652 if (agr->sort != NULL)
653 sort_destroy_criteria (agr->sort);
654 free (agr->break_vars);
655 case_destroy (&agr->break_case);
/* `next' carries the loop forward; its assignment is not visible here. */
656 for (iter = agr->agr_vars; iter; iter = next)
/* String-typed aggregates own heap copies of their arguments; the number
   to free comes from the function's row in agr_func_tab. */
660 if (iter->function & FSTRING)
665 n_args = agr_func_tab[iter->function & FUNC].n_args;
666 for (i = 0; i < n_args; i++)
667 free (iter->arg[i].c);
/* Exact comparison: reached only when the FSTRING bit is clear. */
670 else if (iter->function == SD)
671 moments1_destroy (iter->moments);
674 if (agr->dict != NULL)
675 dict_destroy (agr->dict);
677 case_destroy (&agr->agr_case);
/* Forward declarations for the two helpers that aggregate_single_case
   calls.  dump_aggregate_info is also declared earlier in this file. */
682 static void accumulate_aggregate_info (struct agr_proc *,
683 const struct ccase *);
684 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
686 /* Processes a single case INPUT for aggregation. If output is
687 warranted, writes it to OUTPUT and returns nonzero.
688 Otherwise, returns zero and OUTPUT is unmodified. */
/* Output describes the group that just ended, not the group INPUT belongs
   to; the last group is therefore never emitted from here, and callers
   flush it themselves when agr->case_cnt > 0. */
690 aggregate_single_case (struct agr_proc *agr,
691 const struct ccase *input, struct ccase *output)
693 bool finished_group = false;
/* First case ever seen: nothing to compare against, just start a group.
   The post-increment makes case_cnt count every case passed in. */
695 if (agr->case_cnt++ == 0)
696 initialize_aggregate_info (agr, input);
/* A nonzero case_compare result on the break variables is treated as the
   start of a new group: emit the finished one, then reset for INPUT. */
697 else if (case_compare (&agr->break_case, input,
698 agr->break_vars, agr->break_var_cnt))
700 dump_aggregate_info (agr, output);
701 finished_group = true;
703 initialize_aggregate_info (agr, input);
/* INPUT is always accumulated, after any reset above. */
706 accumulate_aggregate_info (agr, input);
707 return finished_group;
710 /* Accumulates aggregation data from the case INPUT. */
/* NOTE(review): this excerpt omits most `case' labels, braces and `break'
   statements of the function, so the comments below name a function only
   where the visible text does. */
712 accumulate_aggregate_info (struct agr_proc *agr,
713 const struct ccase *input)
715 struct agr_var *iter;
/* The case weight is fetched once and used for every aggregate variable. */
719 weight = dict_get_case_weight (default_dict, input, &bad_warn);
721 for (iter = agr->agr_vars; iter; iter = iter->next)
724 const union value *v = case_data (input, iter->src->fv);
/* Missing-value test.  Without include_missing, is_missing decides; with
   include_missing there is a separate numeric-only test whose last line
   is not visible in this excerpt. */
726 if ((!iter->include_missing && is_missing (v, iter->src))
727 || (iter->include_missing && iter->src->type == NUMERIC
730 switch (iter->function)
733 case NMISS | FSTRING:
734 iter->dbl[0] += weight;
737 case NUMISS | FSTRING:
745 /* This is horrible. There are too many possibilities. */
746 switch (iter->function)
/* Weighted sums: dbl[0] collects value times weight and, where a second
   accumulator is kept, dbl[1] collects the weight itself. */
749 iter->dbl[0] += v->f * weight;
753 iter->dbl[0] += v->f * weight;
754 iter->dbl[1] += weight;
757 moments1_add (iter->moments, v->f, weight);
/* Running extremes.  Numeric values use max and min; strings are compared
   with memcmp over the source variable's width and copied into
   iter->string when they win. */
760 iter->dbl[0] = max (iter->dbl[0], v->f);
764 if (memcmp (iter->string, v->s, iter->src->width) < 0)
765 memcpy (iter->string, v->s, iter->src->width);
769 iter->dbl[0] = min (iter->dbl[0], v->f);
773 if (memcmp (iter->string, v->s, iter->src->width) > 0)
774 memcpy (iter->string, v->s, iter->src->width);
/* Threshold and range tests.  In each group below dbl[0] gains the weight
   only when the test passes, while dbl[1] always gains it, so dbl[0] over
   dbl[1] is the weighted share of cases that passed. */
779 if (v->f > iter->arg[0].f)
780 iter->dbl[0] += weight;
781 iter->dbl[1] += weight;
785 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
786 iter->dbl[0] += weight;
787 iter->dbl[1] += weight;
791 if (v->f < iter->arg[0].f)
792 iter->dbl[0] += weight;
793 iter->dbl[1] += weight;
797 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
798 iter->dbl[0] += weight;
799 iter->dbl[1] += weight;
/* Inside the closed range arg[0] to arg[1]. */
803 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
804 iter->dbl[0] += weight;
805 iter->dbl[1] += weight;
809 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
810 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
811 iter->dbl[0] += weight;
812 iter->dbl[1] += weight;
/* Outside the closed range arg[0] to arg[1]. */
816 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
817 iter->dbl[0] += weight;
818 iter->dbl[1] += weight;
822 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
823 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
824 iter->dbl[0] += weight;
825 iter->dbl[1] += weight;
829 iter->dbl[0] += weight;
842 case FIRST | FSTRING:
845 memcpy (iter->string, v->s, iter->src->width);
854 memcpy (iter->string, v->s, iter->src->width);
858 case NMISS | FSTRING:
860 case NUMISS | FSTRING:
861 /* Our value is not missing or it would have been
862 caught earlier. Nothing to do. */
/* NOTE(review): presumably the branch for aggregate variables that have no
   source variable; the enclosing `else' is not visible here. */
868 switch (iter->function)
871 iter->dbl[0] += weight;
882 /* We've come to a record that differs from the previous in one or
883 more of the break variables. Make an output record from the
884 accumulated statistics in the OUTPUT case. */
/* NOTE(review): this excerpt omits most `case' labels, braces and local
   declarations of the function, so the comments below name a function
   only where the visible text does. */
886 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
/* Break variables first: each one's v->nv values are copied from the saved
   break case into OUTPUT at value_idx (its update is not visible here). */
892 for (i = 0; i < agr->break_var_cnt; i++)
894 struct variable *v = agr->break_vars[i];
895 memcpy (case_data_rw (output, value_idx),
896 case_data (&agr->break_case, v->fv),
897 sizeof (union value) * v->nv);
/* Then one result per aggregate variable, stored at its target's fv. */
905 for (i = agr->agr_vars; i; i = i->next)
907 union value *v = case_data_rw (output, i->dest->fv);
/* Under COLUMNWISE, a variable with a nonzero `missing' member takes this
   separate path unless its function is N, NU, NMISS or NUMISS; ALPHA
   targets are filled with spaces. */
909 if (agr->missing == COLUMNWISE && i->missing != 0
910 && (i->function & FUNC) != N && (i->function & FUNC) != NU
911 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
913 if (i->dest->type == ALPHA)
914 memset (v->s, ' ', i->dest->width);
/* NOTE(review): int1 is tested as a flag that selects between the
   accumulated value and SYSMIS; where it is set is not visible here. */
923 v->f = i->int1 ? i->dbl[0] : SYSMIS;
/* Quotient of the two accumulators, with SYSMIS standing in when the
   divisor is zero. */
926 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
932 /* FIXME: we should use two passes. */
933 moments1_calculate (i->moments, NULL, NULL, &variance,
/* The square root is taken only when a variance was produced. */
935 if (variance != SYSMIS)
936 v->f = sqrt (variance);
943 v->f = i->int1 ? i->dbl[0] : SYSMIS;
948 memcpy (v->s, i->string, i->dest->width);
950 memset (v->s, ' ', i->dest->width);
/* The same quotient, reported as a fraction here and scaled by 100.0
   just below. */
960 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
970 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
982 v->f = i->int1 ? i->dbl[0] : SYSMIS;
984 case FIRST | FSTRING:
987 memcpy (v->s, i->string, i->dest->width);
989 memset (v->s, ' ', i->dest->width);
998 case NMISS | FSTRING:
1002 case NUMISS | FSTRING:
1012 /* Resets the state for all the aggregate functions. */
/* Also replaces agr->break_case with a clone of INPUT, which becomes the
   reference for later break comparisons.
   NOTE(review): the `case' labels of the switch are not visible in this
   excerpt. */
1014 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1016 struct agr_var *iter;
1018 case_destroy (&agr->break_case);
1019 case_clone (&agr->break_case, input);
1021 for (iter = agr->agr_vars; iter; iter = iter->next)
/* All accumulators start from zero; the switch below then overrides the
   ones that need a different starting value. */
1024 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1025 iter->int1 = iter->int2 = 0;
1026 switch (iter->function)
/* NOTE(review): presumably the starting values for running minima (largest
   double, all-255 bytes) and running maxima (most negative double,
   all-zero bytes); confirm against the missing labels. */
1029 iter->dbl[0] = DBL_MAX;
1032 memset (iter->string, 255, iter->src->width);
1035 iter->dbl[0] = -DBL_MAX;
1038 memset (iter->string, 0, iter->src->width);
/* The moments object is created the first time it is needed; the clear
   call is presumably the `else' arm used on later resets. */
1041 if (iter->moments == NULL)
1042 iter->moments = moments1_create (MOMENT_VARIANCE);
1044 moments1_clear (iter->moments);
/* Per-case callback that the active-file path hands to procedure().
   AGR_ is the struct agr_proc.  Each case C goes to
   aggregate_single_case; when that reports a finished group, the
   aggregate case is written to agr->sink.
   NOTE(review): the end of the original comment below, the return type
   and the return statement are not visible in this excerpt. */
1052 /* Aggregate each case as it comes through. Cases which aren't needed
1055 agr_to_active_file (struct ccase *c, void *agr_)
1057 struct agr_proc *agr = agr_;
1059 if (aggregate_single_case (agr, c, &agr->agr_case))
1060 agr->sink->class->write (agr->sink, &agr->agr_case);
/* Per-case callback that the system-file path hands to procedure() when
   no sort is performed.  AGR_ is the struct agr_proc.  Each case C goes
   to aggregate_single_case; when that reports a finished group, the
   aggregate case is written with sfm_write_case to agr->writer.
   NOTE(review): the end of the original comment below, the return type
   and the return statement are not visible in this excerpt. */
1065 /* Aggregate the current case and output it if we passed a
1068 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1070 struct agr_proc *agr = agr_;
1072 if (aggregate_single_case (agr, c, &agr->agr_case))
1073 sfm_write_case (agr->writer, &agr->agr_case);