1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA. */
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
42 /* Specifies how to make an aggregate variable.  The parser allocates
   one of these per target variable and links them through `next'. */
45 struct agr_var *next; /* Next in list. */
47 /* Collected during parsing. */
48 struct variable *src; /* Source variable. */
49 struct variable *dest; /* Target variable. */
50 int function; /* Function index, ORed with FSTRING for string sources. */
51 int include_missing; /* 1=Include user-missing values. */
52 union value arg[2]; /* Function arguments (numeric .f or allocated .c). */
54 /* Accumulated during AGGREGATE execution. */
59 struct moments1 *moments; /* Moments accumulator, used when function is SD. */
62 /* Aggregation functions.  The values up to NU_NO_VARS double as
   indexes into agr_func_tab.  N_NO_VARS and NU_NO_VARS replace N and
   NU when the function name is not followed by `('. */
65 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
66 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
67 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
68 FUNC = 0x1f, /* Function mask: extracts the index from agr_var.function. */
69 FSTRING = 1<<5, /* String function bit, set when the source is ALPHA. */
72 /* Attributes of an aggregation function; one row of agr_func_tab. */
75 const char *name; /* Aggregation function name; null in the sentinel row. */
76 int n_args; /* Number of arguments parsed after the source variables. */
77 int alpha_type; /* When given ALPHA arguments, output type. */
78 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
81 /* Attributes of aggregation functions.  Rows follow the order of the
   aggregation function enumerators, because the parser turns a row's
   offset in this table into the function index. */
82 static const struct agr_func agr_func_tab[] =
84 {"<NONE>", 0, -1, {0, 0, 0}},
85 {"SUM", 0, -1, {FMT_F, 8, 2}},
86 {"MEAN", 0, -1, {FMT_F, 8, 2}},
87 {"SD", 0, -1, {FMT_F, 8, 2}},
88 {"MAX", 0, ALPHA, {-1, -1, -1}},
89 {"MIN", 0, ALPHA, {-1, -1, -1}},
90 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
91 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
93 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
95 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
97 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
99 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"FIRST", 0, ALPHA, {-1, -1, -1}},
103 {"LAST", 0, ALPHA, {-1, -1, -1}},
/* Null name: this row stops the parser's search by function name. */
104 {NULL, 0, -1, {-1, -1, -1}},
/* The rows below sit past the sentinel, so the name search never
   reaches them; they line up with N_NO_VARS and NU_NO_VARS. */
105 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
109 /* Missing value types.  cmd_aggregate starts with ITEMWISE and
   switches to COLUMNWISE when MISSING=COLUMNWISE is given. */
110 enum missing_treatment
112 ITEMWISE, /* Missing values item by item. */
113 COLUMNWISE /* Missing values column by column. */
116 /* An entire AGGREGATE procedure: parsed settings plus the running
   state shared by the per-case callbacks. */
119 /* We have either an output file or a sink. */
120 struct sfm_writer *writer; /* Output file, or null if none. */
121 struct case_sink *sink; /* Sink, or null if none. */
123 /* Break variables. */
124 struct sort_criteria *sort; /* Sort criteria. */
125 struct variable **break_vars; /* Break variables. */
126 size_t break_var_cnt; /* Number of break variables. */
127 union value *prev_break; /* Break values of the previous case; allocated on the first case. */
129 enum missing_treatment missing; /* How to treat missing values. */
130 struct agr_var *agr_vars; /* First aggregate variable. */
131 struct dictionary *dict; /* Aggregate dictionary. */
132 int case_cnt; /* Counts aggregated cases; zero until the first case arrives. */
133 struct ccase agr_case; /* Aggregate case for output. */
/* Forward declarations of the helpers used by cmd_aggregate. */
136 static void initialize_aggregate_info (struct agr_proc *);
139 static int parse_aggregate_functions (struct agr_proc *);
140 static void agr_destroy (struct agr_proc *);
141 static int aggregate_single_case (struct agr_proc *agr,
142 const struct ccase *input,
143 struct ccase *output);
144 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
146 /* Aggregating to the active file.  Passed to procedure() as the
   per-case callback, with the agr_proc as AUX. */
147 static int agr_to_active_file (struct ccase *, void *aux);
149 /* Aggregating to a system file.  Passed to procedure() when the
   active file does not have to be sorted first. */
150 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
154 /* Parses and executes the AGGREGATE procedure.
   The subcommands are read first, then the aggregate function
   specifications.  The aggregated cases then replace the active file
   (when out_file stays null) or are written to a system file. */
159 struct file_handle *out_file = NULL;
161 /* Have we seen these subcommands? */
/* Defaults that stay in effect unless a subcommand overrides them. */
166 agr.missing = ITEMWISE;
168 agr.break_vars = NULL;
172 agr.prev_break = NULL;
/* The aggregate dictionary is created fresh and takes the label and
   documents of the active file's dictionary. */
174 agr.dict = dict_create ();
175 dict_set_label (agr.dict, dict_get_label (default_dict));
176 dict_set_documents (agr.dict, dict_get_documents (default_dict));
178 /* Read most of the subcommands. */
183 if (lex_match_id ("OUTFILE"))
187 msg (SE, _("%s subcommand given multiple times."),"OUTFILE");
/* An asterisk leaves out_file null; anything else must parse as a
   file handle. */
193 if (!lex_match ('*'))
195 out_file = fh_parse ();
196 if (out_file == NULL)
200 else if (lex_match_id ("MISSING"))
203 if (!lex_match_id ("COLUMNWISE"))
205 lex_error (_("while expecting COLUMNWISE"));
208 agr.missing = COLUMNWISE;
210 else if (lex_match_id ("DOCUMENT"))
212 else if (lex_match_id ("PRESORTED"))
214 else if (lex_match_id ("BREAK"))
220 msg (SE, _("%s subcommand given multiple times."),"BREAK");
/* The BREAK list is parsed as sort criteria, which also yields the
   array of break variables and its length. */
226 agr.sort = sort_parse_criteria (default_dict,
227 &agr.break_vars, &agr.break_var_cnt);
228 if (agr.sort == NULL)
/* Clone every break variable into the aggregate dictionary under its
   existing name. */
231 for (i = 0; i < agr.break_var_cnt; i++)
233 struct variable *v = dict_clone_var (agr.dict, agr.break_vars[i],
234 agr.break_vars[i]->name);
241 /* Check for proper syntax. */
243 msg (SW, _("BREAK subcommand not specified."));
245 /* Read in the aggregate functions. */
246 if (!parse_aggregate_functions (&agr))
249 /* Delete documents. */
251 dict_set_documents (agr.dict, NULL);
253 /* Cancel SPLIT FILE. */
254 dict_set_split_vars (agr.dict, NULL, 0);
/* Create the output case, sized from the aggregate dictionary, and
   reset every accumulator before the first case arrives. */
258 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
259 initialize_aggregate_info (&agr);
261 /* Output to active file or external file? */
262 if (out_file == NULL)
264 /* The active file will be replaced by the aggregated data,
265 so TEMPORARY is moot. */
/* NOTE(review): bit 4 of `seen' is presumably the PRESORTED flag; the
   bit assignments are not visible here -- confirm. */
268 if (agr.sort != NULL && (seen & 4) == 0)
269 sort_active_file_in_place (agr.sort);
/* Aggregated cases are collected in a storage sink, while the
   procedure's regular output goes to a null sink. */
271 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
272 if (agr.sink->class->open != NULL)
273 agr.sink->class->open (agr.sink);
274 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
275 procedure (agr_to_active_file, &agr);
/* The last break group is still pending once the final case has been
   read, so emit it here. */
276 if (agr.case_cnt > 0)
278 dump_aggregate_info (&agr, &agr.agr_case);
279 agr.sink->class->write (agr.sink, &agr.agr_case);
/* The aggregate dictionary becomes the default dictionary and the
   sink is converted into the new case source. */
281 dict_destroy (default_dict);
282 default_dict = agr.dict;
284 vfm_source = agr.sink->class->make_source (agr.sink);
285 free_case_sink (agr.sink);
289 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression ());
290 if (agr.writer == NULL)
293 if (agr.sort != NULL && (seen & 4) == 0)
295 /* Sorting is needed. */
296 struct casefile *dst;
297 struct casereader *reader;
300 dst = sort_active_file_to_casefile (agr.sort);
/* Feed the sorted cases through the aggregator one at a time,
   writing a case each time a break group is completed. */
303 reader = casefile_get_destructive_reader (dst);
304 while (casereader_read_xfer (reader, &c))
306 if (aggregate_single_case (&agr, &c, &agr.agr_case))
307 sfm_write_case (agr.writer, &agr.agr_case);
310 casereader_destroy (reader);
311 casefile_destroy (dst);
315 /* Active file is already sorted. */
316 procedure (presorted_agr_to_sysfile, &agr);
/* Flush the final break group if any case was seen. */
319 if (agr.case_cnt > 0)
321 dump_aggregate_info (&agr, &agr.agr_case);
322 sfm_write_case (agr.writer, &agr.agr_case);
334 /* Parse all the aggregate functions.
   Each specification is a list of target variables (each optionally
   followed by a label string), `=', a function name, and for most
   functions a parenthesized list of source variables and arguments.
   One agr_var is appended to AGR's list per target variable.
   cmd_aggregate treats a zero return value as failure. */
336 parse_aggregate_functions (struct agr_proc *agr)
338 struct agr_var *tail; /* Tail of linked list starting at agr->agr_vars. */
340 /* Parse everything. */
349 const struct agr_func *function;
354 struct variable **src;
368 /* Parse the list of target variables. */
369 while (!lex_match ('='))
371 int n_dest_prev = n_dest;
373 if (!parse_DATA_LIST_vars (&dest, &n_dest,
374 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
377 /* Assign empty labels. */
381 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
382 for (j = n_dest_prev; j < n_dest; j++)
383 dest_label[j] = NULL;
/* A string token directly after a target name is its variable label,
   limited to 255 characters. */
386 if (token == T_STRING)
388 ds_truncate (&tokstr, 255);
389 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
394 /* Get the name of the aggregation function. */
397 lex_error (_("expecting aggregation function"));
/* A trailing `.' on the function name is stripped before the lookup. */
402 if (tokid[strlen (tokid) - 1] == '.')
405 tokid[strlen (tokid) - 1] = 0;
/* Linear search by name; the row with a null name ends the search. */
408 for (function = agr_func_tab; function->name; function++)
409 if (!strcmp (function->name, tokid))
411 if (NULL == function->name)
413 msg (SE, _("Unknown aggregation function %s."), tokid);
416 func_index = function - agr_func_tab;
419 /* Check for leading lparen. */
420 if (!lex_match ('('))
423 func_index = N_NO_VARS;
424 else if (func_index == NU)
425 func_index = NU_NO_VARS;
428 lex_error (_("expecting `('"));
432 /* Parse list of source variables. */
434 int pv_opts = PV_NO_SCRATCH;
/* SUM, MEAN and SD accept only numeric sources; functions that take
   arguments need all sources to be of one type. */
436 if (func_index == SUM || func_index == MEAN || func_index == SD)
437 pv_opts |= PV_NUMERIC;
438 else if (function->n_args)
439 pv_opts |= PV_SAME_TYPE;
441 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
445 /* Parse function arguments, for those functions that
446 require arguments. */
447 if (function->n_args != 0)
448 for (i = 0; i < function->n_args; i++)
453 if (token == T_STRING)
455 arg[i].c = xstrdup (ds_c_str (&tokstr));
458 else if (token == T_NUM)
463 msg (SE, _("Missing argument %d to %s."), i + 1, function->name);
469 if (type != src[0]->type)
471 msg (SE, _("Arguments to %s must be of same type as "
472 "source variables."),
478 /* Trailing rparen. */
481 lex_error (_("expecting `)'"));
485 /* Now check that the number of source variables match the
486 number of target variables. Do this here because if we
487 do it earlier then the user can get very misleading error
488 messages; i.e., `AGGREGATE x=SUM(y t).' will get this
489 error message when a proper message would be more like
490 `unknown variable t'. */
493 msg (SE, _("Number of source variables (%d) does not match "
494 "number of target variables (%d)."),
500 /* Finally add these to the linked list of aggregation variables. */
502 for (i = 0; i < n_dest; i++)
504 struct agr_var *v = xmalloc (sizeof *v);
506 /* Add variable to chain. */
507 if (agr->agr_vars != NULL)
515 /* Create the target variable in the aggregate dictionary. */
518 struct variable *destvar;
520 v->function = func_index;
/* String sources set the FSTRING bit and get a scratch buffer as wide
   as the source variable. */
528 if (src[i]->type == ALPHA)
530 v->function |= FSTRING;
531 v->string = xmalloc (src[i]->width);
534 if (v->src->type == NUMERIC || function->alpha_type == NUMERIC)
537 output_width = v->src->width;
/* Functions whose output type is ALPHA clone the source variable;
   the others get a new variable of width output_width. */
539 if (function->alpha_type == ALPHA)
540 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
543 destvar = dict_create_var (agr->dict, dest[i], output_width);
544 if (output_width == 0)
545 destvar->print = destvar->write = function->format;
/* With a weighting variable in the default dictionary, the N and NU
   results are given an F8.2 format instead. */
546 if (output_width == 0 && dict_get_weight (default_dict) != NULL
547 && (func_index == N || func_index == N_NO_VARS
548 || func_index == NU || func_index == NU_NO_VARS))
550 struct fmt_spec f = {FMT_F, 8, 2};
552 destvar->print = destvar->write = f;
557 destvar = dict_create_var (agr->dict, dest[i], 0);
562 msg (SE, _("Variable name %s is not unique within the "
563 "aggregate file dictionary, which contains "
564 "the aggregate variables and the break "
/* The label string changes owner here; clearing the slot keeps the
   cleanup loop from freeing it. */
575 destvar->label = dest_label[i];
576 dest_label[i] = NULL;
/* NOTE(review): for alpha_type == ALPHA the table format is
   {-1, -1, -1}; confirm that overwriting the cloned variable's
   formats with it is intended. */
578 else if (function->alpha_type == ALPHA)
579 destvar->print = destvar->write = function->format;
584 v->include_missing = include_missing;
/* Numeric arguments are copied by value; string arguments are
   duplicated so that every agr_var owns its own copy. */
590 if (v->src->type == NUMERIC)
591 for (j = 0; j < function->n_args; j++)
592 v->arg[j].f = arg[j].f;
594 for (j = 0; j < function->n_args; j++)
595 v->arg[j].c = xstrdup (arg[j].c);
599 if (src != NULL && src[0]->type == ALPHA)
600 for (i = 0; i < function->n_args; i++)
610 if (!lex_match ('/'))
/* This message is now marked for translation like every other
   diagnostic in this function. */
615 lex_error (_("expecting end of command"));
621 for (i = 0; i < n_dest; i++)
624 free (dest_label[i]);
630 if (src && n_src && src[0]->type == ALPHA)
631 for (i = 0; i < function->n_args; i++)
/* Releases the resources held by AGR: the system file writer, the
   sort criteria, the break variable arrays, the per-variable
   accumulators, the aggregate dictionary and the output case. */
644 agr_destroy (struct agr_proc *agr)
646 struct agr_var *iter, *next;
648 sfm_close_writer (agr->writer);
649 if (agr->sort != NULL)
650 sort_destroy_criteria (agr->sort);
651 free (agr->break_vars);
652 free (agr->prev_break);
653 for (iter = agr->agr_vars; iter; iter = next)
/* String functions own allocated argument strings; the argument
   count comes from the function table, with the FSTRING bit masked
   off by FUNC. */
657 if (iter->function & FSTRING)
662 n_args = agr_func_tab[iter->function & FUNC].n_args;
663 for (i = 0; i < n_args; i++)
664 free (iter->arg[i].c);
/* SD is the function that owns a moments accumulator. */
667 else if (iter->function == SD)
668 moments1_destroy (iter->moments);
671 if (agr->dict != NULL)
672 dict_destroy (agr->dict);
673 case_destroy (&agr->agr_case);
/* Helpers used by aggregate_single_case.  dump_aggregate_info is also
   declared near the top of this file. */
678 static void accumulate_aggregate_info (struct agr_proc *,
679 const struct ccase *);
680 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
682 /* Processes a single case INPUT for aggregation. If output is
683 warranted, writes it to OUTPUT and returns nonzero.
684 Otherwise, returns zero and OUTPUT is unmodified. */
686 aggregate_single_case (struct agr_proc *agr,
687 const struct ccase *input, struct ccase *output)
689 /* The first case always begins a new break group. We also need to
690 preserve the values of the case for later comparison. */
691 if (agr->case_cnt++ == 0)
/* Size prev_break by the total number of values (nv) taken up by
   the break variables. */
698 for (i = 0; i < agr->break_var_cnt; i++)
699 n_elem += agr->break_vars[i]->nv;
702 agr->prev_break = xmalloc (sizeof *agr->prev_break * n_elem);
704 /* Copy INPUT into prev_break. */
706 union value *iter = agr->prev_break;
709 for (i = 0; i < agr->break_var_cnt; i++)
711 struct variable *v = agr->break_vars[i];
/* A numeric break value occupies a single element; a string break
   value is copied byte for byte over its whole width. */
713 if (v->type == NUMERIC)
714 (iter++)->f = case_num (input, v->fv);
717 memcpy (iter->s, case_str (input, v->fv), v->width);
723 accumulate_aggregate_info (agr, input);
728 /* Compare the value of each break variable to the values on the
   previous case. */
731 union value *iter = agr->prev_break;
734 for (i = 0; i < agr->break_var_cnt; i++)
736 struct variable *v = agr->break_vars[i];
/* Numbers are compared by value, strings over the variable's width. */
741 if (case_num (input, v->fv) != iter->f)
746 if (memcmp (case_str (input, v->fv), iter->s, v->width))
756 accumulate_aggregate_info (agr, input);
761 /* The values of the break variable are different from the values on
762 the previous case. That means that it's time to dump aggregate
   data for the finished group and start accumulating a new one. */
764 dump_aggregate_info (agr, output);
765 initialize_aggregate_info (agr);
766 accumulate_aggregate_info (agr, input);
768 /* Copy INPUT into prev_break. */
770 union value *iter = agr->prev_break;
773 for (i = 0; i < agr->break_var_cnt; i++)
775 struct variable *v = agr->break_vars[i];
777 if (v->type == NUMERIC)
778 (iter++)->f = case_num (input, v->fv);
781 memcpy (iter->s, case_str (input, v->fv), v->width);
790 /* Accumulates aggregation data from the case INPUT.
   Every aggregate variable of AGR is updated from its source value in
   INPUT, using the case weight taken from the default dictionary. */
792 accumulate_aggregate_info (struct agr_proc *agr,
793 const struct ccase *input)
795 struct agr_var *iter;
799 weight = dict_get_case_weight (default_dict, input, &bad_warn);
801 for (iter = agr->agr_vars; iter; iter = iter->next)
804 const union value *v = case_data (input, iter->src->fv);
/* Missing source values are handled by a separate switch and do not
   reach the per-function accumulation below. */
806 if ((!iter->include_missing && is_missing (v, iter->src))
807 || (iter->include_missing && iter->src->type == NUMERIC
810 switch (iter->function)
813 iter->dbl[0] += weight;
823 /* This is horrible. There are too many possibilities. */
824 switch (iter->function)
/* NOTE(review): this sum ignores the case weight, while the mean
   just below is weighted -- confirm that is intended. */
827 iter->dbl[0] += v->f;
/* Weighted sum in dbl[0], total weight in dbl[1]. */
830 iter->dbl[0] += v->f * weight;
831 iter->dbl[1] += weight;
834 moments1_add (iter->moments, v->f, weight);
/* Running maximum, numeric and then string. */
837 iter->dbl[0] = max (iter->dbl[0], v->f);
841 if (memcmp (iter->string, v->s, iter->src->width) < 0)
842 memcpy (iter->string, v->s, iter->src->width);
/* Running minimum, numeric and then string. */
846 iter->dbl[0] = min (iter->dbl[0], v->f);
850 if (memcmp (iter->string, v->s, iter->src->width) > 0)
851 memcpy (iter->string, v->s, iter->src->width);
/* In the comparison tests below, dbl[0] collects the weight of the
   cases that pass the test and dbl[1] the weight of all cases.
   NOTE(review): the string arguments come from xstrdup of the token
   text; confirm they are at least src->width bytes long before they
   are handed to memcmp. */
/* Greater than arg[0], numeric and then string. */
856 if (v->f > iter->arg[0].f)
857 iter->dbl[0] += weight;
858 iter->dbl[1] += weight;
862 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
863 iter->dbl[0] += weight;
864 iter->dbl[1] += weight;
/* Less than arg[0], numeric and then string. */
868 if (v->f < iter->arg[0].f)
869 iter->dbl[0] += weight;
870 iter->dbl[1] += weight;
874 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
875 iter->dbl[0] += weight;
876 iter->dbl[1] += weight;
/* Inside the closed range arg[0]..arg[1], numeric and then string. */
880 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
881 iter->dbl[0] += weight;
882 iter->dbl[1] += weight;
886 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
887 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
888 iter->dbl[0] += weight;
889 iter->dbl[1] += weight;
/* Outside the range arg[0]..arg[1], numeric and then string. */
893 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
894 iter->dbl[0] += weight;
895 iter->dbl[1] += weight;
/* A string is outside the range when it sorts below arg[0] OR above
   arg[1], the same as the numeric test above.  Joining the two
   comparisons with a logical AND could never be true for
   arg[0] <= arg[1]. */
899 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
900 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
901 iter->dbl[0] += weight;
902 iter->dbl[1] += weight;
905 iter->dbl[0] += weight;
917 case FIRST | FSTRING:
920 memcpy (iter->string, v->s, iter->src->width);
929 memcpy (iter->string, v->s, iter->src->width);
936 switch (iter->function)
939 iter->dbl[0] += weight;
950 /* We've come to a record that differs from the previous in one or
951 more of the break variables. Make an output record from the
952 accumulated statistics in the OUTPUT case. */
954 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
/* The break values saved in prev_break are copied to OUTPUT first. */
960 for (i = 0; i < agr->break_var_cnt; i++)
962 int nv = agr->break_vars[i]->nv;
963 memcpy (case_data_rw (output, value_idx),
964 &agr->prev_break[value_idx],
965 sizeof (union value) * nv);
/* Then every aggregate variable stores its result in its target
   variable's slot of OUTPUT. */
973 for (i = agr->agr_vars; i; i = i->next)
975 union value *v = case_data_rw (output, i->dest->fv);
/* Under COLUMNWISE treatment, a variable flagged as missing gets a
   special result, except for the N, NU, NMISS and NUMISS functions.
   String results are filled with spaces. */
977 if (agr->missing == COLUMNWISE && i->missing != 0
978 && (i->function & FUNC) != N && (i->function & FUNC) != NU
979 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
981 if (i->function & FSTRING)
982 memset (v->s, ' ', i->dest->width);
/* Mean: weighted sum over total weight, SYSMIS when the weight is 0. */
994 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
1000 /* FIXME: we should use two passes. */
1001 moments1_calculate (i->moments, NULL, NULL, &variance,
1003 if (variance != SYSMIS)
1004 v->f = sqrt (variance);
/* A nonzero int1 selects the accumulated value; otherwise the result
   is SYSMIS, or spaces for strings. */
1011 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1016 memcpy (v->s, i->string, i->dest->width);
1018 memset (v->s, ' ', i->dest->width);
1023 case FOUT | FSTRING:
/* The string comparison tests accumulate their weights in dbl[0] and
   dbl[1], so the fraction has to be computed from those fields.
   Dividing int1 by int2 would always give SYSMIS, because the string
   tests never update either counter. */
1024 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
/* Fraction of the total weight that passed the test. */
1030 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
1039 case POUT | FSTRING:
/* The same ratio expressed as a percentage. */
1040 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
1050 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1052 case FIRST | FSTRING:
1053 case LAST | FSTRING:
1055 memcpy (v->s, i->string, i->dest->width);
1057 memset (v->s, ' ', i->dest->width);
1078 /* Resets the state for all the aggregate functions.  Called before
   the first case and again each time a break group has been dumped. */
1080 initialize_aggregate_info (struct agr_proc *agr)
1082 struct agr_var *iter;
1084 for (iter = agr->agr_vars; iter; iter = iter->next)
1087 switch (iter->function)
/* Start from the largest double so that min() takes the first value. */
1090 iter->dbl[0] = DBL_MAX;
/* All bytes 255: the first string compared against this buffer with
   memcmp() does not sort above it. */
1093 memset (iter->string, 255, iter->src->width);
/* Start from the most negative double so that max() takes the first
   value. */
1096 iter->dbl[0] = -DBL_MAX;
/* All bytes 0: the first string compared against this buffer with
   memcmp() does not sort below it. */
1099 memset (iter->string, 0, iter->src->width);
/* The moments accumulator is created on first use and cleared
   after that. */
1102 if (iter->moments == NULL)
1103 iter->moments = moments1_create (MOMENT_VARIANCE);
1105 moments1_clear (iter->moments);
/* NOTE(review): these are the only visible resets of int1 and int2;
   confirm that the branches above do not need int1 cleared too. */
1108 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1109 iter->int1 = iter->int2 = 0;
1115 /* Aggregate each case as it comes through.  Whenever
   aggregate_single_case reports that output is warranted, the
   aggregate case is written to the sink.  AGR_ is the agr_proc
   passed through procedure(). */
1118 agr_to_active_file (struct ccase *c, void *agr_)
1120 struct agr_proc *agr = agr_;
1122 if (aggregate_single_case (agr, c, &agr->agr_case))
1123 agr->sink->class->write (agr->sink, &agr->agr_case);
1128 /* Aggregate the current case and write the aggregate case to the
   system file if aggregate_single_case reports that output is
   warranted.  AGR_ is the agr_proc passed through procedure(). */
1131 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1133 struct agr_proc *agr = agr_;
1135 if (aggregate_single_case (agr, c, &agr->agr_case))
1136 sfm_write_case (agr->writer, &agr->agr_case);