1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
44 #define _(msgid) gettext (msgid)
46 /* Specifies how to make an aggregate variable. */
/* One of these is allocated for each target variable by
   parse_aggregate_functions and chained through NEXT; agr_destroy
   walks the chain to release per-variable resources. */
49 struct agr_var *next; /* Next in list. */
51 /* Collected during parsing. */
52 struct variable *src; /* Source variable. */
53 struct variable *dest; /* Target variable. */
54 int function; /* Function index, OR-ed with FSTRING for string sources. */
55 int include_missing; /* 1=Include user-missing values. */
56 union value arg[2]; /* Arguments: .f for numeric sources, heap string .c otherwise. */
58 /* Accumulated during AGGREGATE execution. */
/* Moments accumulator: created on first use by
   initialize_aggregate_info, fed by accumulate_aggregate_info, and
   destroyed by agr_destroy when FUNCTION is SD. */
63 struct moments1 *moments;
66 /* Aggregation functions. */
/* The enumerators up to NU_NO_VARS double as row indexes into
   agr_func_tab: parse_aggregate_functions derives the function index
   by pointer subtraction from the start of that table, so the order
   here must stay in step with the order of the table rows.
   N_AGR_FUNCS lines up with the NULL-name sentinel row, and
   N_NO_VARS / NU_NO_VARS with the two rows that follow it. */
69 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
70 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
71 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
72 FUNC = 0x1f, /* Function mask: (function & FUNC) recovers the table index. */
73 FSTRING = 1<<5, /* String function bit, set when the source variable is ALPHA. */
76 /* Attributes of an aggregation function. */
/* A null NAME marks the end of the part of agr_func_tab that is
   searched by name.  When ALPHA_TYPE is ALPHA the target variable is
   cloned from the source variable instead of being created with
   FORMAT (see parse_aggregate_functions). */
79 const char *name; /* Aggregation function name. */
80 size_t n_args; /* Number of arguments. */
81 int alpha_type; /* When given ALPHA arguments, output type. */
82 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
85 /* Attributes of aggregation functions. */
/* Row order matches the aggregation function enumerators, because
   the row index is used directly as the function code. */
86 static const struct agr_func agr_func_tab[] =
88 {"<NONE>", 0, -1, {0, 0, 0}},
89 {"SUM", 0, -1, {FMT_F, 8, 2}},
90 {"MEAN", 0, -1, {FMT_F, 8, 2}},
91 {"SD", 0, -1, {FMT_F, 8, 2}},
92 {"MAX", 0, ALPHA, {-1, -1, -1}},
93 {"MIN", 0, ALPHA, {-1, -1, -1}},
94 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
95 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
96 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
97 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
98 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
99 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
100 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
101 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
102 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
103 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
104 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
105 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"FIRST", 0, ALPHA, {-1, -1, -1}},
107 {"LAST", 0, ALPHA, {-1, -1, -1}},
/* Sentinel row (index N_AGR_FUNCS): the name lookup loop in
   parse_aggregate_functions stops at the null name. */
108 {NULL, 0, -1, {-1, -1, -1}},
/* Rows for N and NU written without a source variable list (indexes
   N_NO_VARS and NU_NO_VARS).  They sit after the sentinel, so the
   name lookup never returns them; the parser switches to them
   explicitly. */
109 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
110 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
113 /* Missing value types. */
/* ITEMWISE is the default chosen when the command starts;
   COLUMNWISE is selected by the MISSING=COLUMNWISE subcommand and
   is consulted in dump_aggregate_info. */
114 enum missing_treatment
116 ITEMWISE, /* Missing values item by item. */
117 COLUMNWISE /* Missing values column by column. */
120 /* An entire AGGREGATE procedure. */
123 /* We have either an output file or a sink. */
124 struct sfm_writer *writer; /* Output file, or null if none. */
125 struct case_sink *sink; /* Sink, or null if none. */
127 /* Break variables. */
128 struct sort_criteria *sort; /* Sort criteria. */
129 struct variable **break_vars; /* Break variables. */
130 size_t break_var_cnt; /* Number of break variables. */
/* BREAK_CASE is a clone of the case that opened the current break
   group; it is replaced by initialize_aggregate_info each time a
   new group starts. */
131 struct ccase break_case; /* Last values of break variables. */
133 enum missing_treatment missing; /* How to treat missing values. */
134 struct agr_var *agr_vars; /* First aggregate variable. */
/* When aggregating to the active file, DICT replaces default_dict
   once the procedure has run. */
135 struct dictionary *dict; /* Aggregate dictionary. */
/* CASE_CNT is incremented for every input case by
   aggregate_single_case; a nonzero value after the data pass means
   one last group still has to be written out. */
136 int case_cnt; /* Counts aggregated cases. */
137 struct ccase agr_case; /* Aggregate case for output. */
/* Forward declarations of helpers defined later in this file. */
140 static void initialize_aggregate_info (struct agr_proc *,
141 const struct ccase *);
144 static int parse_aggregate_functions (struct agr_proc *);
145 static void agr_destroy (struct agr_proc *);
146 static int aggregate_single_case (struct agr_proc *agr,
147 const struct ccase *input,
148 struct ccase *output);
149 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
/* The two callbacks below are passed to procedure() together with a
   pointer to the struct agr_proc as their AUX argument. */
151 /* Aggregating to the active file. */
152 static int agr_to_active_file (struct ccase *, void *aux);
154 /* Aggregating to a system file. */
155 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
159 /* Parses and executes the AGGREGATE procedure. */
/* Overall flow visible here: parse OUTFILE, then the option
   subcommands up to BREAK, then the aggregate functions; finally run
   the data pass, writing either to a storage sink that becomes the
   new active file or to a system file writer.
   NOTE(review): the function signature, the error exits and several
   braces are not visible in this view. */
/* OUT_FILE stays null when OUTFILE=* is given; a null OUT_FILE later
   selects the active file as the destination. */
164 struct file_handle *out_file = NULL;
166 bool copy_documents = false;
167 bool presorted = false;
170 memset(&agr, 0 , sizeof (agr));
171 agr.missing = ITEMWISE;
172 case_nullify (&agr.break_case);
/* Start the aggregate dictionary with the label and documents of the
   active dictionary. */
174 agr.dict = dict_create ();
175 dict_set_label (agr.dict, dict_get_label (default_dict));
176 dict_set_documents (agr.dict, dict_get_documents (default_dict));
178 /* OUTFILE subcommand must be first. */
179 if (!lex_force_match_id ("OUTFILE"))
182 if (!lex_match ('*'))
184 out_file = fh_parse ();
185 if (out_file == NULL)
189 /* Read most of the subcommands. */
194 if (lex_match_id ("MISSING"))
197 if (!lex_match_id ("COLUMNWISE"))
199 lex_error (_("while expecting COLUMNWISE"));
202 agr.missing = COLUMNWISE;
204 else if (lex_match_id ("DOCUMENT"))
205 copy_documents = true;
206 else if (lex_match_id ("PRESORTED"))
208 else if (lex_match_id ("BREAK"))
/* The break variables double as the sort criteria, and each one is
   cloned into the aggregate dictionary under its own name. */
213 agr.sort = sort_parse_criteria (default_dict,
214 &agr.break_vars, &agr.break_var_cnt,
215 &saw_direction, NULL);
216 if (agr.sort == NULL)
219 for (i = 0; i < agr.break_var_cnt; i++)
220 dict_clone_var_assert (agr.dict, agr.break_vars[i],
221 agr.break_vars[i]->name);
223 /* BREAK must follow the options. */
228 lex_error (_("expecting BREAK"));
/* With PRESORTED no sort is performed further down, so explicit
   sort directions cannot influence the output order; warn about it. */
232 if (presorted && saw_direction)
233 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
234 "with (A) or (D) has no effect. Output data will be sorted "
235 "the same way as the input data."));
237 /* Read in the aggregate functions. */
239 if (!parse_aggregate_functions (&agr))
242 /* Delete documents. */
244 dict_set_documents (agr.dict, NULL);
246 /* Cancel SPLIT FILE. */
247 dict_set_split_vars (agr.dict, NULL, 0);
/* The output case is sized from the aggregate dictionary. */
251 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
253 /* Output to active file or external file? */
254 if (out_file == NULL)
256 /* The active file will be replaced by the aggregated data,
257 so TEMPORARY is moot. */
260 if (agr.sort != NULL && !presorted)
261 sort_active_file_in_place (agr.sort);
/* Aggregated cases are collected in a storage sink; the regular
   procedure output is sent to a null sink. */
263 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
264 if (agr.sink->class->open != NULL)
265 agr.sink->class->open (agr.sink);
266 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
267 procedure (agr_to_active_file, &agr);
/* aggregate_single_case only emits a group when a later case starts
   the next one, so the final group has to be flushed here. */
268 if (agr.case_cnt > 0)
270 dump_aggregate_info (&agr, &agr.agr_case);
271 agr.sink->class->write (agr.sink, &agr.agr_case);
/* Swap in the aggregate dictionary and turn the filled sink into the
   new active file source. */
273 dict_destroy (default_dict);
274 default_dict = agr.dict;
276 vfm_source = agr.sink->class->make_source (agr.sink);
277 free_case_sink (agr.sink);
/* Destination is a system file. */
281 agr.writer = sfm_open_writer (out_file, agr.dict,
282 sfm_writer_default_options ());
283 if (agr.writer == NULL)
286 if (agr.sort != NULL && !presorted)
288 /* Sorting is needed. */
289 struct casefile *dst;
290 struct casereader *reader;
/* Sort into a separate casefile and aggregate while draining it with
   a destructive reader. */
293 dst = sort_active_file_to_casefile (agr.sort);
296 reader = casefile_get_destructive_reader (dst);
297 while (casereader_read_xfer (reader, &c))
299 if (aggregate_single_case (&agr, &c, &agr.agr_case))
300 sfm_write_case (agr.writer, &agr.agr_case);
303 casereader_destroy (reader);
304 casefile_destroy (dst);
308 /* Active file is already sorted. */
309 procedure (presorted_agr_to_sysfile, &agr);
/* Flush the final break group, as in the active file branch. */
312 if (agr.case_cnt > 0)
314 dump_aggregate_info (&agr, &agr.agr_case);
315 sfm_write_case (agr.writer, &agr.agr_case);
327 /* Parse all the aggregate functions. */
/* Appends one struct agr_var per target variable to the list in AGR
   and creates the matching variables in AGR->dict.  The caller tests
   the result with a logical not, so a zero return presumably signals
   a parse failure.
   NOTE(review): many lines of this function (braces, declarations,
   error exits) are not visible in this view. */
329 parse_aggregate_functions (struct agr_proc *agr)
331 struct agr_var *tail; /* Tail of linked list starting at agr->agr_vars. */
333 /* Parse everything. */
342 const struct agr_func *function;
347 struct variable **src;
361 /* Parse the list of target variables. */
362 while (!lex_match ('='))
364 size_t n_dest_prev = n_dest;
366 if (!parse_DATA_LIST_vars (&dest, &n_dest,
367 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
370 /* Assign empty labels. */
/* DEST_LABEL is grown to the same length as DEST and the new slots
   are nulled, so every slot can later be freed unconditionally. */
374 dest_label = xnrealloc (dest_label, n_dest, sizeof *dest_label);
375 for (j = n_dest_prev; j < n_dest; j++)
376 dest_label[j] = NULL;
/* A string token at this point is taken as the label of the most
   recently parsed target variable; the token is truncated to 255
   before it is copied. */
379 if (token == T_STRING)
381 ds_truncate (&tokstr, 255);
382 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
387 /* Get the name of the aggregation function. */
390 lex_error (_("expecting aggregation function"));
/* A trailing period is stripped from the function name before the
   table lookup.  NOTE(review): the line between the test and the
   strip is not visible here; presumably it records the
   include-missing option -- confirm. */
395 if (tokid[strlen (tokid) - 1] == '.')
398 tokid[strlen (tokid) - 1] = 0;
/* Case-insensitive linear search that stops at the null-name
   sentinel row of agr_func_tab. */
401 for (function = agr_func_tab; function->name; function++)
402 if (!strcasecmp (function->name, tokid))
404 if (NULL == function->name)
406 msg (SE, _("Unknown aggregation function %s."), tokid);
/* The row index in agr_func_tab is the function code. */
409 func_index = function - agr_func_tab;
412 /* Check for leading lparen. */
/* Without a left parenthesis only the no-source-variable forms are
   accepted: the index is redirected to the N_NO_VARS or NU_NO_VARS
   row, anything else is an error. */
413 if (!lex_match ('('))
416 func_index = N_NO_VARS;
417 else if (func_index == NU)
418 func_index = NU_NO_VARS;
421 lex_error (_("expecting `('"));
427 /* Parse list of source variables. */
/* SUM, MEAN and SD are restricted to numeric sources; functions that
   take arguments require all sources to share one type, since the
   arguments are checked against the type of src[0] below. */
429 int pv_opts = PV_NO_SCRATCH;
431 if (func_index == SUM || func_index == MEAN || func_index == SD)
432 pv_opts |= PV_NUMERIC;
433 else if (function->n_args)
434 pv_opts |= PV_SAME_TYPE;
436 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
440 /* Parse function arguments, for those functions that
441 require arguments. */
442 if (function->n_args != 0)
443 for (i = 0; i < function->n_args; i++)
/* String arguments are stored as heap copies of the token text. */
448 if (token == T_STRING)
450 arg[i].c = xstrdup (ds_c_str (&tokstr));
453 else if (lex_is_number ())
458 msg (SE, _("Missing argument %d to %s."), i + 1,
465 if (type != src[0]->type)
467 msg (SE, _("Arguments to %s must be of same type as "
468 "source variables."),
474 /* Trailing rparen. */
477 lex_error (_("expecting `)'"));
481 /* Now check that the number of source variables match
482 the number of target variables. If we check earlier
483 than this, the user can get very misleading error
484 message, i.e. `AGGREGATE x=SUM(y t).' will get this
485 error message when a proper message would be more
486 like `unknown variable t'. */
489 msg (SE, _("Number of source variables (%u) does not match "
490 "number of target variables (%u)."),
491 (unsigned) n_src, (unsigned) n_dest);
/* Range functions: detect an argument pair given in descending order
   (numeric comparison, or right-padded string comparison for string
   sources) and warn; T holds the first argument, presumably for a
   swap on lines not visible here. */
495 if ((func_index == PIN || func_index == POUT
496 || func_index == FIN || func_index == FOUT)
497 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
498 || (src[0]->type == ALPHA
499 && str_compare_rpad (arg[0].c, arg[1].c) > 0)))
501 union value t = arg[0];
505 msg (SW, _("The value arguments passed to the %s function "
506 "are out-of-order. They will be treated as if "
507 "they had been specified in the correct order."),
512 /* Finally add these to the linked list of aggregation
514 for (i = 0; i < n_dest; i++)
516 struct agr_var *v = xmalloc (sizeof *v);
518 /* Add variable to chain. */
519 if (agr->agr_vars != NULL)
527 /* Create the target variable in the aggregate
530 struct variable *destvar;
532 v->function = func_index;
538 if (src[i]->type == ALPHA)
540 v->function |= FSTRING;
541 v->string = xmalloc (src[i]->width);
544 if (function->alpha_type == ALPHA)
545 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
548 assert (v->src->type == NUMERIC
549 || function->alpha_type == NUMERIC);
550 destvar = dict_create_var (agr->dict, dest[i], 0);
553 if ((func_index == N || func_index == NMISS)
554 && dict_get_weight (default_dict) != NULL)
555 destvar->print = destvar->write = f8_2;
557 destvar->print = destvar->write = function->format;
562 destvar = dict_create_var (agr->dict, dest[i], 0);
563 if (func_index == N_NO_VARS
564 && dict_get_weight (default_dict) != NULL)
565 destvar->print = destvar->write = f8_2;
567 destvar->print = destvar->write = function->format;
572 msg (SE, _("Variable name %s is not unique within the "
573 "aggregate file dictionary, which contains "
574 "the aggregate variables and the break "
584 destvar->label = dest_label[i];
585 dest_label[i] = NULL;
591 v->include_missing = include_missing;
597 if (v->src->type == NUMERIC)
598 for (j = 0; j < function->n_args; j++)
599 v->arg[j].f = arg[j].f;
601 for (j = 0; j < function->n_args; j++)
602 v->arg[j].c = xstrdup (arg[j].c);
606 if (src != NULL && src[0]->type == ALPHA)
607 for (i = 0; i < function->n_args; i++)
617 if (!lex_match ('/'))
622 lex_error ("expecting end of command");
628 for (i = 0; i < n_dest; i++)
631 free (dest_label[i]);
637 if (src && n_src && src[0]->type == ALPHA)
638 for (i = 0; i < function->n_args; i++)
/* Releases the resources held by AGR: the system file writer, the
   sort criteria, the break variable array, the break case, the
   per-variable state, the aggregate dictionary (if still set) and
   the output case.
   NOTE(review): sfm_close_writer is called unconditionally although
   AGR->writer is documented as null when there is no output file;
   this assumes it accepts a null writer -- confirm. */
651 agr_destroy (struct agr_proc *agr)
653 struct agr_var *iter, *next;
655 sfm_close_writer (agr->writer);
656 if (agr->sort != NULL)
657 sort_destroy_criteria (agr->sort);
658 free (agr->break_vars);
659 case_destroy (&agr->break_case);
660 for (iter = agr->agr_vars; iter; iter = next)
/* String functions own heap copies of their arguments; the argument
   count is looked up from the table row selected by the masked
   function code. */
664 if (iter->function & FSTRING)
669 n_args = agr_func_tab[iter->function & FUNC].n_args;
670 for (i = 0; i < n_args; i++)
671 free (iter->arg[i].c);
/* Only the numeric SD function owns a moments accumulator. */
674 else if (iter->function == SD)
675 moments1_destroy (iter->moments);
678 if (agr->dict != NULL)
679 dict_destroy (agr->dict);
681 case_destroy (&agr->agr_case);
/* Forward declarations for the execution helpers.
   dump_aggregate_info is also declared with the other prototypes
   near the top of this file, so its declaration here is a harmless
   repeat. */
686 static void accumulate_aggregate_info (struct agr_proc *,
687 const struct ccase *);
688 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
690 /* Processes a single case INPUT for aggregation. If output is
691 warranted, writes it to OUTPUT and returns nonzero.
692 Otherwise, returns zero and OUTPUT is unmodified. */
694 aggregate_single_case (struct agr_proc *agr,
695 const struct ccase *input, struct ccase *output)
697 bool finished_group = false;
/* The very first case only initializes the accumulators.  Any later
   case whose break values compare unequal to the saved break case
   closes the current group: the group is dumped to OUTPUT and the
   accumulators are reset for the new group. */
699 if (agr->case_cnt++ == 0)
700 initialize_aggregate_info (agr, input);
701 else if (case_compare (&agr->break_case, input,
702 agr->break_vars, agr->break_var_cnt))
704 dump_aggregate_info (agr, output);
705 finished_group = true;
707 initialize_aggregate_info (agr, input);
/* INPUT is always accumulated, into the new group when one has just
   been started. */
710 accumulate_aggregate_info (agr, input);
711 return finished_group;
714 /* Accumulates aggregation data from the case INPUT. */
/* Each aggregate variable is updated from its source value in INPUT,
   using the case weight obtained from default_dict.
   NOTE(review): most case labels and several braces of the switch
   statements below are not visible in this view, so the comments
   added here describe only what the visible statements do. */
716 accumulate_aggregate_info (struct agr_proc *agr,
717 const struct ccase *input)
719 struct agr_var *iter;
723 weight = dict_get_case_weight (default_dict, input, &bad_warn);
725 for (iter = agr->agr_vars; iter; iter = iter->next)
728 const union value *v = case_data (input, iter->src->fv);
/* Missing-value branch.  The visible part of the condition covers a
   value that is missing for the source variable while user-missing
   values are not included; the rest of the condition is not visible.
   The switch that follows adds the weight for the missing-count
   functions. */
730 if ((!iter->include_missing
731 && mv_is_value_missing (&iter->src->miss, v))
732 || (iter->include_missing && iter->src->type == NUMERIC
735 switch (iter->function)
738 case NMISS | FSTRING:
739 iter->dbl[0] += weight;
742 case NUMISS | FSTRING:
750 /* This is horrible. There are too many possibilities. */
751 switch (iter->function)
/* Weighted sums: dbl[0] collects value times weight and, where a
   second statement follows, dbl[1] collects the total weight. */
754 iter->dbl[0] += v->f * weight;
758 iter->dbl[0] += v->f * weight;
759 iter->dbl[1] += weight;
762 moments1_add (iter->moments, v->f, weight);
/* Running extremes: numeric values go through max and min, string
   values are compared bytewise over the source width and copied
   into the string buffer when they win. */
765 iter->dbl[0] = max (iter->dbl[0], v->f);
769 if (memcmp (iter->string, v->s, iter->src->width) < 0)
770 memcpy (iter->string, v->s, iter->src->width);
774 iter->dbl[0] = min (iter->dbl[0], v->f);
778 if (memcmp (iter->string, v->s, iter->src->width) > 0)
779 memcpy (iter->string, v->s, iter->src->width);
/* Threshold and range tests: dbl[0] accumulates the weight of the
   cases that satisfy the test and dbl[1] the weight of all cases
   tested; dump_aggregate_info later divides the two.
   NOTE(review): the string variants compare src->width bytes of
   arg[k].c, which parse_aggregate_functions fills with an xstrdup
   copy of the literal.  If the literal is shorter than the variable
   width, memcmp reads past the end of that allocation -- verify
   that the arguments are padded to the variable width somewhere. */
784 if (v->f > iter->arg[0].f)
785 iter->dbl[0] += weight;
786 iter->dbl[1] += weight;
790 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
791 iter->dbl[0] += weight;
792 iter->dbl[1] += weight;
796 if (v->f < iter->arg[0].f)
797 iter->dbl[0] += weight;
798 iter->dbl[1] += weight;
802 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
803 iter->dbl[0] += weight;
804 iter->dbl[1] += weight;
808 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
809 iter->dbl[0] += weight;
810 iter->dbl[1] += weight;
814 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
815 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
816 iter->dbl[0] += weight;
817 iter->dbl[1] += weight;
821 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
822 iter->dbl[0] += weight;
823 iter->dbl[1] += weight;
827 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
828 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
829 iter->dbl[0] += weight;
830 iter->dbl[1] += weight;
/* Weighted case count. */
834 iter->dbl[0] += weight;
847 case FIRST | FSTRING:
850 memcpy (iter->string, v->s, iter->src->width);
859 memcpy (iter->string, v->s, iter->src->width);
863 case NMISS | FSTRING:
865 case NUMISS | FSTRING:
866 /* Our value is not missing or it would have been
867 caught earlier. Nothing to do. */
/* NOTE(review): this last switch presumably handles the functions
   that have no source variable (N_NO_VARS and NU_NO_VARS); the
   enclosing condition is not visible here -- confirm. */
873 switch (iter->function)
876 iter->dbl[0] += weight;
887 /* We've come to a record that differs from the previous in one or
888 more of the break variables. Make an output record from the
889 accumulated statistics in the OUTPUT case. */
/* NOTE(review): the case labels of the big switch and several other
   lines are not visible in this view; the comments added below
   describe only the visible statements. */
891 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
/* First copy the values of the break variables from the saved break
   case into OUTPUT, v->nv values per variable. */
897 for (i = 0; i < agr->break_var_cnt; i++)
899 struct variable *v = agr->break_vars[i];
900 memcpy (case_data_rw (output, value_idx),
901 case_data (&agr->break_case, v->fv),
902 sizeof (union value) * v->nv);
/* Then compute one result per aggregate variable, written in place
   at the position of its target variable in OUTPUT. */
910 for (i = agr->agr_vars; i; i = i->next)
912 union value *v = case_data_rw (output, i->dest->fv);
/* COLUMNWISE treatment: when the missing marker of the variable is
   set, every function except the four counting functions yields a
   missing result; string targets are blanked with spaces. */
914 if (agr->missing == COLUMNWISE && i->missing != 0
915 && (i->function & FUNC) != N && (i->function & FUNC) != NU
916 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
918 if (i->dest->type == ALPHA)
919 memset (v->s, ' ', i->dest->width);
/* Results guarded by int1 fall back to SYSMIS when int1 is zero.
   NOTE(review): int1 looks like a flag that at least one value was
   accumulated; the code that sets it is not visible -- confirm. */
928 v->f = i->int1 ? i->dbl[0] : SYSMIS;
/* Weighted sum divided by total weight, SYSMIS if the weight is 0. */
931 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
/* Square root of the variance reported by the moments accumulator,
   taken only when the variance is not SYSMIS. */
937 /* FIXME: we should use two passes. */
938 moments1_calculate (i->moments, NULL, NULL, &variance,
940 if (variance != SYSMIS)
941 v->f = sqrt (variance);
948 v->f = i->int1 ? i->dbl[0] : SYSMIS;
953 memcpy (v->s, i->string, i->dest->width);
955 memset (v->s, ' ', i->dest->width);
/* Ratio of matching weight to total weight, as a fraction ... */
965 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
/* ... and the same ratio scaled to a percentage. */
975 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
987 v->f = i->int1 ? i->dbl[0] : SYSMIS;
989 case FIRST | FSTRING:
992 memcpy (v->s, i->string, i->dest->width);
994 memset (v->s, ' ', i->dest->width);
1003 case NMISS | FSTRING:
1007 case NUMISS | FSTRING:
1017 /* Resets the state for all the aggregate functions. */
/* Called with the first case of each break group.  NOTE(review):
   the case labels of the switch are not visible in this view. */
1019 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1021 struct agr_var *iter;
/* Remember INPUT as the reference case for detecting the end of the
   group that starts here. */
1023 case_destroy (&agr->break_case);
1024 case_clone (&agr->break_case, input);
1026 for (iter = agr->agr_vars; iter; iter = iter->next)
/* Zero every accumulator, then override the ones that need a
   different starting point. */
1029 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1030 iter->int1 = iter->int2 = 0;
1031 switch (iter->function)
/* Extremes are seeded so that any real value replaces the seed:
   DBL_MAX or all-255 bytes where smaller values win, -DBL_MAX or
   all-zero bytes where larger values win. */
1034 iter->dbl[0] = DBL_MAX;
1037 memset (iter->string, 255, iter->src->width);
1040 iter->dbl[0] = -DBL_MAX;
1043 memset (iter->string, 0, iter->src->width);
/* The moments accumulator is created on first use and cleared for
   every later group. */
1046 if (iter->moments == NULL)
1047 iter->moments = moments1_create (MOMENT_VARIANCE);
1049 moments1_clear (iter->moments);
/* Callback passed to procedure() when the aggregated data replaces
   the active file.  AGR_ is the struct agr_proc of the running
   command.  Case C is fed through aggregate_single_case, and when
   that reports a completed break group the finished aggregate case
   is written to the sink.  NOTE(review): the end of the original
   comment below and the return statement are not visible here. */
1057 /* Aggregate each case as it comes through. Cases which aren't needed
1060 agr_to_active_file (struct ccase *c, void *agr_)
1062 struct agr_proc *agr = agr_;
1064 if (aggregate_single_case (agr, c, &agr->agr_case))
1065 agr->sink->class->write (agr->sink, &agr->agr_case);
/* Callback passed to procedure() when the output goes to a system
   file and no separate sort pass is made.  AGR_ is the struct
   agr_proc of the running command.  Case C is fed through
   aggregate_single_case, and when that reports a completed break
   group the finished aggregate case is written with sfm_write_case.
   NOTE(review): the end of the original comment below and the
   return statement are not visible here. */
1070 /* Aggregate the current case and output it if we passed a
1073 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1075 struct agr_proc *agr = agr_;
1077 if (aggregate_single_case (agr, c, &agr->agr_case))
1078 sfm_write_case (agr->writer, &agr->agr_case);