1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
/* Per-variable aggregation record.
   NOTE(review): the line that opens this declaration is not visible in this
   excerpt; the members are reached elsewhere in the file through
   struct agr_var pointers, so this is presumably the body of struct agr_var. */
42 /* Specifies how to make an aggregate variable. */
45 struct agr_var *next; /* Next in list. */
47 /* Collected during parsing. */
48 struct variable *src; /* Source variable. */
49 struct variable *dest; /* Target variable. */
/* The function code may carry the FSTRING bit: other code in this file tests
   function & FSTRING and masks the code with function & FUNC. */
50 int function; /* Function. */
51 int include_missing; /* 1=Include user-missing values. */
/* The parser stores arg[j].f when the source variable is NUMERIC and a
   freshly duplicated string in arg[j].c otherwise. */
52 union value arg[2]; /* Arguments. */
54 /* Accumulated during AGGREGATE execution. */
/* Accumulator handed to moments1_add, moments1_calculate, moments1_clear and
   moments1_destroy elsewhere in this file. */
59 struct moments1 *moments;
/* NOTE(review): the enum opener is not visible in this excerpt.
   The parser computes func_index as (function - agr_func_tab), so the order
   of the enumerators must stay in step with the rows of agr_func_tab. */
62 /* Aggregation functions. */
65 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
66 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
/* N_AGR_FUNCS lines up with the null-name row that ends agr_func_tab.
   N_NO_VARS and NU_NO_VARS are substituted by the parser when N or NU is
   written without an opening parenthesis; they line up with the two rows
   that follow the null-name row. */
67 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
/* 0x1f is wide enough for every enumerator above (the largest is 22) and
   leaves bit 5 free for FSTRING. */
68 FUNC = 0x1f, /* Function mask. */
69 FSTRING = 1<<5, /* String function bit. */
/* One row of static metadata per aggregation function.
   NOTE(review): the struct opener is not visible in this excerpt; the table
   agr_func_tab is declared as an array of struct agr_func. */
72 /* Attributes of an aggregation function. */
75 const char *name; /* Aggregation function name. */
/* The parser loops n_args times, reading one string or numeric argument per
   iteration. */
76 int n_args; /* Number of arguments. */
/* agr_func_tab uses ALPHA, NUMERIC or -1 here; the -1 rows include SUM, MEAN
   and SD, which the parser restricts to numeric sources via PV_NUMERIC. */
77 int alpha_type; /* When given ALPHA arguments, output type. */
78 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
81 /* Attributes of aggregation functions. */
/* Row order mirrors the aggregation function enumerators (NONE first), because
   the parser derives the function index by pointer subtraction from the start
   of this table.  Columns: name, n_args, alpha_type, format. */
82 static const struct agr_func agr_func_tab[] =
84 {"<NONE>", 0, -1, {0, 0, 0}},
85 {"SUM", 0, -1, {FMT_F, 8, 2}},
86 {"MEAN", 0, -1, {FMT_F, 8, 2}},
87 {"SD", 0, -1, {FMT_F, 8, 2}},
88 {"MAX", 0, ALPHA, {-1, -1, -1}},
89 {"MIN", 0, ALPHA, {-1, -1, -1}},
90 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
91 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
93 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
95 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
97 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
99 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"FIRST", 0, ALPHA, {-1, -1, -1}},
103 {"LAST", 0, ALPHA, {-1, -1, -1}},
/* The null name stops the name lookup loop in the parser.  The two rows after
   it are never found by name; they are reached only by index, through
   N_NO_VARS and NU_NO_VARS. */
104 {NULL, 0, -1, {-1, -1, -1}},
105 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
109 /* Missing value types. */
/* The command code starts with ITEMWISE and switches to COLUMNWISE only when
   the MISSING subcommand names COLUMNWISE.  When COLUMNWISE is in effect the
   output step blanks or skips the result of an aggregate variable whose
   missing counter is nonzero, except for the N, NU, NMISS and NUMISS
   functions. */
110 enum missing_treatment
112 ITEMWISE, /* Missing values item by item. */
113 COLUMNWISE /* Missing values column by column. */
/* State shared by the parsing and execution phases of one AGGREGATE run.
   NOTE(review): the struct opener is not visible in this excerpt; the command
   code zero-fills a variable named agr with memset and passes its address
   around as a struct agr_proc pointer. */
116 /* An entire AGGREGATE procedure. */
119 /* We have either an output file or a sink. */
/* writer is set from sfm_open_writer when an output file handle was parsed;
   sink is created from storage_sink_class when aggregating into the active
   file. */
120 struct sfm_writer *writer; /* Output file, or null if none. */
121 struct case_sink *sink; /* Sink, or null if none. */
123 /* Break variables. */
/* sort, break_vars and break_var_cnt are all filled in by one call to
   sort_parse_criteria on the BREAK subcommand. */
124 struct sort_criteria *sort; /* Sort criteria. */
125 struct variable **break_vars; /* Break variables. */
126 size_t break_var_cnt; /* Number of break variables. */
/* Allocated when the first case arrives, sized by the sum of nv over all
   break variables. */
127 union value *prev_break; /* Last values of break variables. */
129 enum missing_treatment missing; /* How to treat missing values. */
130 struct agr_var *agr_vars; /* First aggregate variable. */
131 struct dictionary *dict; /* Aggregate dictionary. */
/* Incremented once per input case; zero means no case has been seen yet. */
132 int case_cnt; /* Counts aggregated cases. */
/* Created with dict_get_next_value_idx (agr.dict) values and reused for every
   output record. */
133 struct ccase agr_case; /* Aggregate case for output. */
/* Forward declarations for the helpers defined later in this file. */
/* Resets the accumulators of every aggregate variable. */
136 static void initialize_aggregate_info (struct agr_proc *);
/* Parses the aggregate function specifications; the caller treats a zero
   return as failure. */
139 static int parse_aggregate_functions (struct agr_proc *);
/* Releases what the agr_proc owns. */
140 static void agr_destroy (struct agr_proc *);
/* Feeds one input case; a nonzero return means OUTPUT now holds a finished
   break group that the caller must write out. */
141 static int aggregate_single_case (struct agr_proc *agr,
142 const struct ccase *input,
143 struct ccase *output);
/* Fills OUTPUT from the break values and accumulated statistics. */
144 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
146 /* Aggregating to the active file. */
/* Callback passed to procedure (); AUX is the struct agr_proc. */
147 static int agr_to_active_file (struct ccase *, void *aux);
149 /* Aggregating to a system file. */
/* Callback passed to procedure () when no sort is needed; AUX is the
   struct agr_proc. */
150 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
154 /* Parses and executes the AGGREGATE procedure. */
/* NOTE(review): the function header and several declarations (agr, seen, i,
   c) are not visible in this excerpt, nor are the braces, gotos and returns;
   the notes below describe only the statements shown.
   Visible flow: parse the subcommands into agr, parse the aggregate
   functions, then either replace the active file with the aggregated cases
   or write them to a system file through agr.writer. */
159 struct file_handle *out_file = NULL;
161 /* Have we seen these subcommands? */
/* Zero-fill first so every pointer member of agr starts out null. */
164 memset(&agr, 0 , sizeof (agr));
165 agr.missing = ITEMWISE;
/* The aggregate dictionary inherits the label and documents of
   default_dict. */
167 agr.dict = dict_create ();
168 dict_set_label (agr.dict, dict_get_label (default_dict));
169 dict_set_documents (agr.dict, dict_get_documents (default_dict));
171 /* Read most of the subcommands. */
176 if (lex_match_id ("OUTFILE"))
180 msg (SE, _("%s subcommand given multiple times."),"OUTFILE");
/* An asterisk leaves out_file null, which later selects the active-file
   path; anything else is parsed as a file handle. */
186 if (!lex_match ('*'))
188 out_file = fh_parse ();
189 if (out_file == NULL)
193 else if (lex_match_id ("MISSING"))
/* COLUMNWISE is the only keyword accepted after MISSING. */
196 if (!lex_match_id ("COLUMNWISE"))
198 lex_error (_("while expecting COLUMNWISE"));
201 agr.missing = COLUMNWISE;
203 else if (lex_match_id ("DOCUMENT"))
205 else if (lex_match_id ("PRESORTED"))
207 else if (lex_match_id ("BREAK"))
213 msg (SE, _("%s subcommand given multiple times."),"BREAK");
/* One call yields the sort criteria plus the break variable array and its
   length. */
219 agr.sort = sort_parse_criteria (default_dict,
220 &agr.break_vars, &agr.break_var_cnt);
221 if (agr.sort == NULL)
/* Every break variable is cloned into the aggregate dictionary under its
   original name. */
224 for (i = 0; i < agr.break_var_cnt; i++)
226 struct variable *v = dict_clone_var (agr.dict, agr.break_vars[i],
227 agr.break_vars[i]->name);
234 /* Check for proper syntax. */
236 msg (SW, _("BREAK subcommand not specified."));
238 /* Read in the aggregate functions. */
239 if (!parse_aggregate_functions (&agr))
242 /* Delete documents. */
/* NOTE(review): the condition guarding this call is not visible here. */
244 dict_set_documents (agr.dict, NULL);
246 /* Cancel SPLIT FILE. */
247 dict_set_split_vars (agr.dict, NULL, 0);
/* The output case is sized from the finished aggregate dictionary, then the
   accumulators are reset before any case is read. */
251 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
252 initialize_aggregate_info (&agr);
254 /* Output to active file or external file? */
255 if (out_file == NULL)
257 /* The active file will be replaced by the aggregated data,
258 so TEMPORARY is moot. */
/* NOTE(review): the meaning of bit 4 in seen is set in lines not shown; from
   this test and the "already sorted" branch further down it presumably
   records PRESORTED -- confirm. */
261 if (agr.sort != NULL && (seen & 4) == 0)
262 sort_active_file_in_place (agr.sort);
/* Aggregated cases are collected in agr.sink, while vfm_sink is pointed at a
   null_sink_class sink for the duration of procedure (). */
264 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
265 if (agr.sink->class->open != NULL)
266 agr.sink->class->open (agr.sink);
267 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
268 procedure (agr_to_active_file, &agr);
/* The last break group is still held in the accumulators; flush it. */
269 if (agr.case_cnt > 0)
271 dump_aggregate_info (&agr, &agr.agr_case);
272 agr.sink->class->write (agr.sink, &agr.agr_case);
/* The aggregate dictionary becomes the default dictionary, and the sink is
   converted into the new case source. */
274 dict_destroy (default_dict);
275 default_dict = agr.dict;
277 vfm_source = agr.sink->class->make_source (agr.sink);
278 free_case_sink (agr.sink);
282 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression ());
283 if (agr.writer == NULL)
286 if (agr.sort != NULL && (seen & 4) == 0)
288 /* Sorting is needed. */
289 struct casefile *dst;
290 struct casereader *reader;
/* Sort into a separate casefile, then stream it through the aggregator and
   write each completed group to the system file. */
293 dst = sort_active_file_to_casefile (agr.sort);
296 reader = casefile_get_destructive_reader (dst);
297 while (casereader_read_xfer (reader, &c))
299 if (aggregate_single_case (&agr, &c, &agr.agr_case))
300 sfm_write_case (agr.writer, &agr.agr_case);
303 casereader_destroy (reader);
304 casefile_destroy (dst);
308 /* Active file is already sorted. */
309 procedure (presorted_agr_to_sysfile, &agr);
/* Flush the final break group to the system file. */
312 if (agr.case_cnt > 0)
314 dump_aggregate_info (&agr, &agr.agr_case);
315 sfm_write_case (agr.writer, &agr.agr_case);
327 /* Parse all the aggregate functions. */
/* NOTE(review): the return type, braces, several declarations and all error
   exits are missing from this excerpt; comments describe only the visible
   statements.
   Visible flow per specification: read target variable names (with optional
   string labels) up to the equals sign, look up the function name in
   agr_func_tab, read the source variables and any function arguments, then
   append one struct agr_var per target to agr->agr_vars and create the
   matching variable in agr->dict.  For ALPHA sources the new agr_var gets
   the FSTRING bit and a string buffer of the source width.  Specifications
   are separated by slashes. */
329 parse_aggregate_functions (struct agr_proc *agr)
331 struct agr_var *tail; /* Tail of linked list starting at agr->agr_vars. */
333 /* Parse everything. */
342 const struct agr_func *function;
347 struct variable **src;
361 /* Parse the list of target variables. */
362 while (!lex_match ('='))
364 int n_dest_prev = n_dest;
366 if (!parse_DATA_LIST_vars (&dest, &n_dest,
367 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
370 /* Assign empty labels. */
/* Grow the label array in step with dest and null the new slots. */
374 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
375 for (j = n_dest_prev; j < n_dest; j++)
376 dest_label[j] = NULL;
/* A string token here is taken as the label of the most recently parsed
   target; it is cut to 255 characters before being duplicated. */
379 if (token == T_STRING)
381 ds_truncate (&tokstr, 255);
382 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
387 /* Get the name of the aggregation function. */
390 lex_error (_("expecting aggregation function"));
/* A trailing period on the function name is stripped before the lookup.
   NOTE(review): whatever else this branch does (lines not shown) presumably
   sets include_missing -- confirm. */
395 if (tokid[strlen (tokid) - 1] == '.')
398 tokid[strlen (tokid) - 1] = 0;
/* Linear search by name; the null-name row ends the search. */
401 for (function = agr_func_tab; function->name; function++)
402 if (!strcmp (function->name, tokid))
404 if (NULL == function->name)
406 msg (SE, _("Unknown aggregation function %s."), tokid);
/* The table position is the function code. */
409 func_index = function - agr_func_tab;
412 /* Check for leading lparen. */
/* Without a parenthesis only the no-variable forms of N and NU are
   accepted. */
413 if (!lex_match ('('))
416 func_index = N_NO_VARS;
417 else if (func_index == NU)
418 func_index = NU_NO_VARS;
421 lex_error (_("expecting `('"));
425 /* Parse list of source variables. */
427 int pv_opts = PV_NO_SCRATCH;
/* SUM, MEAN and SD accept numeric sources only; functions that take
   arguments require all sources to share one type. */
429 if (func_index == SUM || func_index == MEAN || func_index == SD)
430 pv_opts |= PV_NUMERIC;
431 else if (function->n_args)
432 pv_opts |= PV_SAME_TYPE;
434 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
438 /* Parse function arguments, for those functions that
439 require arguments. */
440 if (function->n_args != 0)
441 for (i = 0; i < function->n_args; i++)
446 if (token == T_STRING)
/* NOTE(review): the string argument is duplicated at its own length, while
   accumulate_aggregate_info compares arg[].c with memcmp over
   iter->src->width bytes.  If the argument can be shorter than the source
   width and is not padded in lines not shown, that comparison reads past the
   allocation -- confirm. */
448 arg[i].c = xstrdup (ds_c_str (&tokstr));
451 else if (token == T_NUM)
456 msg (SE, _("Missing argument %d to %s."), i + 1, function->name);
/* The argument type must agree with the type of the first source variable. */
462 if (type != src[0]->type)
464 msg (SE, _("Arguments to %s must be of same type as "
465 "source variables."),
471 /* Trailing rparen. */
474 lex_error (_("expecting `)'"));
478 /* Now check that the number of source variables match the
479 number of target variables. Do this here because if we
480 do it earlier then the user can get very misleading error
481 messages; i.e., `AGGREGATE x=SUM(y t).' will get this
482 error message when a proper message would be more like
483 `unknown variable t'. */
486 msg (SE, _("Number of source variables (%d) does not match "
487 "number of target variables (%d)."),
493 /* Finally add these to the linked list of aggregation
495 for (i = 0; i < n_dest; i++)
497 struct agr_var *v = xmalloc (sizeof *v);
499 /* Add variable to chain. */
500 if (agr->agr_vars != NULL)
508 /* Create the target variable in the aggregate
511 struct variable *destvar;
513 v->function = func_index;
521 if (src[i]->type == ALPHA)
523 v->function |= FSTRING;
524 v->string = xmalloc (src[i]->width);
527 if (v->src->type == NUMERIC || function->alpha_type == NUMERIC)
530 output_width = v->src->width;
532 if (function->alpha_type == ALPHA)
533 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
536 destvar = dict_create_var (agr->dict, dest[i], output_width);
537 if (output_width == 0)
538 destvar->print = destvar->write = function->format;
539 if (output_width == 0 && dict_get_weight (default_dict) != NULL
540 && (func_index == N || func_index == N_NO_VARS
541 || func_index == NU || func_index == NU_NO_VARS))
543 struct fmt_spec f = {FMT_F, 8, 2};
545 destvar->print = destvar->write = f;
550 destvar = dict_create_var (agr->dict, dest[i], 0);
555 msg (SE, _("Variable name %s is not unique within the "
556 "aggregate file dictionary, which contains "
557 "the aggregate variables and the break "
568 destvar->label = dest_label[i];
569 dest_label[i] = NULL;
571 else if (function->alpha_type == ALPHA)
572 destvar->print = destvar->write = function->format;
577 v->include_missing = include_missing;
583 if (v->src->type == NUMERIC)
584 for (j = 0; j < function->n_args; j++)
585 v->arg[j].f = arg[j].f;
587 for (j = 0; j < function->n_args; j++)
588 v->arg[j].c = xstrdup (arg[j].c);
592 if (src != NULL && src[0]->type == ALPHA)
593 for (i = 0; i < function->n_args; i++)
603 if (!lex_match ('/'))
608 lex_error ("expecting end of command");
614 for (i = 0; i < n_dest; i++)
617 free (dest_label[i]);
623 if (src && n_src && src[0]->type == ALPHA)
624 for (i = 0; i < function->n_args; i++)
/* Releases the resources held by AGR: the system file writer, the sort
   criteria, the break variable array, the saved break values, every
   aggregate variable, the aggregate dictionary (when still set) and the
   output case.
   NOTE(review): the return type, braces and some statements inside the loop
   are missing from this excerpt. */
637 agr_destroy (struct agr_proc *agr)
639 struct agr_var *iter, *next;
/* NOTE(review): agr->writer stays null on the active-file path, so this
   relies on sfm_close_writer accepting a null writer -- confirm. */
641 sfm_close_writer (agr->writer);
642 if (agr->sort != NULL)
643 sort_destroy_criteria (agr->sort);
644 free (agr->break_vars);
645 free (agr->prev_break);
/* next is kept separately so the walk can continue after a node has been
   released. */
646 for (iter = agr->agr_vars; iter; iter = next)
/* String functions own heap copies of their arguments; the argument count
   comes from the table row selected by the masked function code. */
650 if (iter->function & FSTRING)
655 n_args = agr_func_tab[iter->function & FUNC].n_args;
656 for (i = 0; i < n_args; i++)
657 free (iter->arg[i].c);
/* Only the numeric SD function owns a moments accumulator. */
660 else if (iter->function == SD)
661 moments1_destroy (iter->moments);
664 if (agr->dict != NULL)
665 dict_destroy (agr->dict);
667 case_destroy (&agr->agr_case);
/* Forward declarations used by aggregate_single_case. */
/* Adds one input case to the running statistics of every aggregate
   variable. */
672 static void accumulate_aggregate_info (struct agr_proc *,
673 const struct ccase *);
/* Writes the break values and finished statistics into the output case. */
674 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
676 /* Processes a single case INPUT for aggregation. If output is
677 warranted, writes it to OUTPUT and returns nonzero.
678 Otherwise, returns zero and OUTPUT is unmodified. */
/* NOTE(review): the return type, braces, return statements and the advance
   of iter after string values are missing from this excerpt; comments
   describe only the visible statements. */
680 aggregate_single_case (struct agr_proc *agr,
681 const struct ccase *input, struct ccase *output)
683 /* The first case always begins a new break group. We also need to
684 preserve the values of the case for later comparison. */
/* case_cnt is incremented on every call; it was zero only for the very first
   case. */
685 if (agr->case_cnt++ == 0)
/* prev_break needs one union value for each value slot (nv) of each break
   variable. */
692 for (i = 0; i < agr->break_var_cnt; i++)
693 n_elem += agr->break_vars[i]->nv;
696 agr->prev_break = xmalloc (sizeof *agr->prev_break * n_elem);
698 /* Copy INPUT into prev_break. */
700 union value *iter = agr->prev_break;
703 for (i = 0; i < agr->break_var_cnt; i++)
705 struct variable *v = agr->break_vars[i];
/* A numeric break value takes one slot; a string break value is copied as
   v->width raw bytes. */
707 if (v->type == NUMERIC)
708 (iter++)->f = case_num (input, v->fv);
711 memcpy (iter->s, case_str (input, v->fv), v->width);
717 accumulate_aggregate_info (agr, input);
722 /* Compare the value of each break variable to the values on the
725 union value *iter = agr->prev_break;
728 for (i = 0; i < agr->break_var_cnt; i++)
730 struct variable *v = agr->break_vars[i];
735 if (case_num (input, v->fv) != iter->f)
740 if (memcmp (case_str (input, v->fv), iter->s, v->width))
750 accumulate_aggregate_info (agr, input);
755 /* The values of the break variable are different from the values on
756 the previous case. That means that it's time to dump aggregate
758 dump_aggregate_info (agr, output);
759 initialize_aggregate_info (agr);
760 accumulate_aggregate_info (agr, input);
762 /* Copy INPUT into prev_break. */
764 union value *iter = agr->prev_break;
767 for (i = 0; i < agr->break_var_cnt; i++)
769 struct variable *v = agr->break_vars[i];
/* Same layout as the first-case copy above. */
771 if (v->type == NUMERIC)
772 (iter++)->f = case_num (input, v->fv);
775 memcpy (iter->s, case_str (input, v->fv), v->width);
784 /* Accumulates aggregation data from the case INPUT. */
/* NOTE(review): the return type, braces, every case label and break, and
   the declarations of weight and bad_warn are missing from this excerpt.
   The visible statements alternate between a numeric form (comparing v->f
   with arg[].f) and a string form (memcmp over iter->src->width bytes) of
   each comparison; which function each one belongs to is inferred from that
   pattern, not from labels. */
786 accumulate_aggregate_info (struct agr_proc *agr,
787 const struct ccase *input)
789 struct agr_var *iter;
/* Every counter below is advanced by the case weight rather than by one. */
793 weight = dict_get_case_weight (default_dict, input, &bad_warn);
795 for (iter = agr->agr_vars; iter; iter = iter->next)
798 const union value *v = case_data (input, iter->src->fv);
/* A value is treated as missing when is_missing says so and user-missing
   values are not included, or when they are included and the source is
   numeric (the rest of that condition is not visible here). */
800 if ((!iter->include_missing && is_missing (v, iter->src))
801 || (iter->include_missing && iter->src->type == NUMERIC
/* Handling of a missing value: only some functions count it. */
804 switch (iter->function)
807 iter->dbl[0] += weight;
817 /* This is horrible. There are too many possibilities. */
818 switch (iter->function)
/* Plain running sum. */
821 iter->dbl[0] += v->f;
/* Weighted sum in dbl[0], total weight in dbl[1]. */
824 iter->dbl[0] += v->f * weight;
825 iter->dbl[1] += weight;
828 moments1_add (iter->moments, v->f, weight);
/* Numeric maximum, then the string maximum by byte comparison. */
831 iter->dbl[0] = max (iter->dbl[0], v->f);
835 if (memcmp (iter->string, v->s, iter->src->width) < 0)
836 memcpy (iter->string, v->s, iter->src->width);
/* Numeric minimum, then the string minimum by byte comparison. */
840 iter->dbl[0] = min (iter->dbl[0], v->f);
844 if (memcmp (iter->string, v->s, iter->src->width) > 0)
845 memcpy (iter->string, v->s, iter->src->width);
/* Greater-than test: dbl[0] counts the matching weight, dbl[1] the total
   weight.  The same two counters are used by every test below. */
850 if (v->f > iter->arg[0].f)
851 iter->dbl[0] += weight;
852 iter->dbl[1] += weight;
856 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
857 iter->dbl[0] += weight;
858 iter->dbl[1] += weight;
/* Less-than test. */
862 if (v->f < iter->arg[0].f)
863 iter->dbl[0] += weight;
864 iter->dbl[1] += weight;
868 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
869 iter->dbl[0] += weight;
870 iter->dbl[1] += weight;
/* Inside the closed range arg[0]..arg[1]. */
874 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
875 iter->dbl[0] += weight;
876 iter->dbl[1] += weight;
880 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
881 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
882 iter->dbl[0] += weight;
883 iter->dbl[1] += weight;
/* Outside the range arg[0]..arg[1]. */
887 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
888 iter->dbl[0] += weight;
889 iter->dbl[1] += weight;
/* NOTE(review): this looks like the string counterpart of the out-of-range
   test just above, but it joins the two comparisons with && where the
   numeric form uses ||.  With arg[0] <= arg[1] both comparisons cannot hold
   at once, so the matching weight would never be counted -- likely should be
   ||; confirm against the case labels. */
893 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
894 && memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
895 iter->dbl[0] += weight;
896 iter->dbl[1] += weight;
899 iter->dbl[0] += weight;
911 case FIRST | FSTRING:
914 memcpy (iter->string, v->s, iter->src->width);
923 memcpy (iter->string, v->s, iter->src->width);
/* A further switch on the function code; its labels are not visible. */
930 switch (iter->function)
933 iter->dbl[0] += weight;
944 /* We've come to a record that differs from the previous in one or
945 more of the break variables. Make an output record from the
946 accumulated statistics in the OUTPUT case. */
/* NOTE(review): the return type, braces, most case labels, the declarations
   of value_idx, variance and the loop variables, and the member named
   missing on the aggregate variable are missing from this excerpt; comments
   describe only the visible statements. */
948 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
/* The saved break values are copied to the front of the output case; the
   same value_idx addresses both the output case and prev_break. */
954 for (i = 0; i < agr->break_var_cnt; i++)
956 int nv = agr->break_vars[i]->nv;
957 memcpy (case_data_rw (output, value_idx),
958 &agr->prev_break[value_idx],
959 sizeof (union value) * nv);
/* Here i is reused as a pointer that walks the aggregate variable list. */
967 for (i = agr->agr_vars; i; i = i->next)
969 union value *v = case_data_rw (output, i->dest->fv);
/* Under COLUMNWISE, a variable that saw any missing value gets a
   missing result, except for the four counting functions. */
971 if (agr->missing == COLUMNWISE && i->missing != 0
972 && (i->function & FUNC) != N && (i->function & FUNC) != NU
973 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
/* A missing string result is all spaces. */
975 if (i->function & FSTRING)
976 memset (v->s, ' ', i->dest->width);
/* Weighted sum divided by total weight; system-missing when no weight was
   accumulated. */
988 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
994 /* FIXME: we should use two passes. */
995 moments1_calculate (i->moments, NULL, NULL, &variance,
997 if (variance != SYSMIS)
998 v->f = sqrt (variance);
/* int1 acts as a flag for whether any value was seen. */
1005 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1010 memcpy (v->s, i->string, i->dest->width);
1012 memset (v->s, ' ', i->dest->width);
/* NOTE(review): this string branch computes its ratio from int1 and int2,
   but the visible string comparisons in accumulate_aggregate_info add weight
   to dbl[0] and dbl[1], and initialize_aggregate_info zeroes int1 and int2.
   Unless int2 is updated in lines not shown, this always yields SYSMIS; the
   numeric branch just below uses dbl[0] / dbl[1] -- confirm. */
1017 case FOUT | FSTRING:
1018 v->f = i->int2 ? (double) i->int1 / (double) i->int2 : SYSMIS;
/* Fraction of the total weight that matched. */
1024 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
1033 case POUT | FSTRING:
/* The same ratio expressed as a percentage. */
1034 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
1044 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1046 case FIRST | FSTRING:
1047 case LAST | FSTRING:
1049 memcpy (v->s, i->string, i->dest->width);
1051 memset (v->s, ' ', i->dest->width);
1072 /* Resets the state for all the aggregate functions. */
/* Called once before the first case and again after each break group has
   been dumped.
   NOTE(review): the return type, braces and case labels are missing from
   this excerpt, so which function each seed belongs to is inferred from the
   values, not from labels. */
1074 initialize_aggregate_info (struct agr_proc *agr)
1076 struct agr_var *iter;
1078 for (iter = agr->agr_vars; iter; iter = iter->next)
1081 switch (iter->function)
/* Seeds that any real value will replace: the largest double and all-255
   bytes (presumably for a minimum), the most negative double and all-zero
   bytes (presumably for a maximum). */
1084 iter->dbl[0] = DBL_MAX;
1087 memset (iter->string, 255, iter->src->width);
1090 iter->dbl[0] = -DBL_MAX;
1093 memset (iter->string, 0, iter->src->width);
/* The moments accumulator is created on first use; the clear call follows
   on the next visible line. */
1096 if (iter->moments == NULL)
1097 iter->moments = moments1_create (MOMENT_VARIANCE);
1099 moments1_clear (iter->moments);
/* Zero every numeric accumulator and both integer counters. */
1102 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1103 iter->int1 = iter->int2 = 0;
/* Callback handed to procedure () on the active-file path.  AGR_ is the
   struct agr_proc; each finished break group is written to agr->sink.
   NOTE(review): the end of the original comment, the return type, braces and
   return statement are missing from this excerpt. */
1109 /* Aggregate each case as it comes through. Cases which aren't needed
1112 agr_to_active_file (struct ccase *c, void *agr_)
1114 struct agr_proc *agr = agr_;
1116 if (aggregate_single_case (agr, c, &agr->agr_case))
1117 agr->sink->class->write (agr->sink, &agr->agr_case);
/* Callback handed to procedure () when the output goes to a system file and
   no sort is needed.  AGR_ is the struct agr_proc; each finished break group
   is written through agr->writer.
   NOTE(review): the end of the original comment, the return type, braces and
   return statement are missing from this excerpt. */
1122 /* Aggregate the current case and output it if we passed a
1125 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1127 struct agr_proc *agr = agr_;
1129 if (aggregate_single_case (agr, c, &agr->agr_case))
1130 sfm_write_case (agr->writer, &agr->agr_case);