1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 #include "dictionary.h"
29 #include "file-handle.h"
35 #include "sfm-write.h"
42 /* Specifies how to make an aggregate variable. */
45 struct agr_var *next; /* Next in list. */
47 /* Collected during parsing. */
48 struct variable *src; /* Source variable. */
49 struct variable *dest; /* Target variable. */
50 int function; /* Function. */
51 int include_missing; /* 1=Include user-missing values. */
52 union value arg[2]; /* Arguments. */
54 /* Accumulated during AGGREGATE execution. */
59 struct moments1 *moments;
62 /* Aggregation functions. */
65 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
66 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
67 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
68 FUNC = 0x1f, /* Function mask. */
69 FSTRING = 1<<5, /* String function bit. */
72 /* Attributes of an aggregation function. */
75 const char *name; /* Aggregation function name. */
76 int n_args; /* Number of arguments. */
77 int alpha_type; /* When given ALPHA arguments, output type. */
78 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
81 /* Attributes of aggregation functions. */
82 static const struct agr_func agr_func_tab[] =
84 {"<NONE>", 0, -1, {0, 0, 0}},
85 {"SUM", 0, -1, {FMT_F, 8, 2}},
86 {"MEAN", 0, -1, {FMT_F, 8, 2}},
87 {"SD", 0, -1, {FMT_F, 8, 2}},
88 {"MAX", 0, ALPHA, {-1, -1, -1}},
89 {"MIN", 0, ALPHA, {-1, -1, -1}},
90 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
91 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
92 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
93 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
94 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
95 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
96 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
97 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
98 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
99 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
100 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
101 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"FIRST", 0, ALPHA, {-1, -1, -1}},
103 {"LAST", 0, ALPHA, {-1, -1, -1}},
104 {NULL, 0, -1, {-1, -1, -1}},
105 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
106 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
109 /* Missing value types. */
110 enum missing_treatment
112 ITEMWISE, /* Missing values item by item. */
113 COLUMNWISE /* Missing values column by column. */
116 /* An entire AGGREGATE procedure. */
119 /* We have either an output file or a sink. */
120 struct sfm_writer *writer; /* Output file, or null if none. */
121 struct case_sink *sink; /* Sink, or null if none. */
123 /* Break variables. */
124 struct sort_criteria *sort; /* Sort criteria. */
125 struct variable **break_vars; /* Break variables. */
126 size_t break_var_cnt; /* Number of break variables. */
127 struct ccase break_case; /* Last values of break variables. */
129 enum missing_treatment missing; /* How to treat missing values. */
130 struct agr_var *agr_vars; /* First aggregate variable. */
131 struct dictionary *dict; /* Aggregate dictionary. */
132 int case_cnt; /* Counts aggregated cases. */
133 struct ccase agr_case; /* Aggregate case for output. */
136 static void initialize_aggregate_info (struct agr_proc *,
137 const struct ccase *);
140 static int parse_aggregate_functions (struct agr_proc *);
141 static void agr_destroy (struct agr_proc *);
142 static int aggregate_single_case (struct agr_proc *agr,
143 const struct ccase *input,
144 struct ccase *output);
145 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
147 /* Aggregating to the active file. */
148 static int agr_to_active_file (struct ccase *, void *aux);
150 /* Aggregating to a system file. */
151 static int presorted_agr_to_sysfile (struct ccase *, void *aux);
155 /* Parses and executes the AGGREGATE procedure. */
160 struct file_handle *out_file = NULL;
162 bool copy_documents = false;
163 bool presorted = false;
166 memset(&agr, 0 , sizeof (agr));
167 agr.missing = ITEMWISE;
168 case_nullify (&agr.break_case);
170 agr.dict = dict_create ();
171 dict_set_label (agr.dict, dict_get_label (default_dict));
172 dict_set_documents (agr.dict, dict_get_documents (default_dict));
174 /* OUTFILE subcommand must be first. */
175 if (!lex_force_match_id ("OUTFILE"))
178 if (!lex_match ('*'))
180 out_file = fh_parse ();
181 if (out_file == NULL)
185 /* Read most of the subcommands. */
190 if (lex_match_id ("MISSING"))
193 if (!lex_match_id ("COLUMNWISE"))
195 lex_error (_("while expecting COLUMNWISE"));
198 agr.missing = COLUMNWISE;
200 else if (lex_match_id ("DOCUMENT"))
201 copy_documents = true;
202 else if (lex_match_id ("PRESORTED"))
204 else if (lex_match_id ("BREAK"))
209 agr.sort = sort_parse_criteria (default_dict,
210 &agr.break_vars, &agr.break_var_cnt,
212 if (agr.sort == NULL)
215 for (i = 0; i < agr.break_var_cnt; i++)
216 dict_clone_var_assert (agr.dict, agr.break_vars[i],
217 agr.break_vars[i]->name);
219 /* BREAK must follow the options. */
224 lex_error (_("expecting BREAK"));
228 if (presorted && saw_direction)
229 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
230 "with (A) or (D) has no effect. Output data will be sorted "
231 "the same way as the input data."));
233 /* Read in the aggregate functions. */
235 if (!parse_aggregate_functions (&agr))
238 /* Delete documents. */
240 dict_set_documents (agr.dict, NULL);
242 /* Cancel SPLIT FILE. */
243 dict_set_split_vars (agr.dict, NULL, 0);
247 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
249 /* Output to active file or external file? */
250 if (out_file == NULL)
252 /* The active file will be replaced by the aggregated data,
253 so TEMPORARY is moot. */
256 if (agr.sort != NULL && !presorted)
257 sort_active_file_in_place (agr.sort);
259 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
260 if (agr.sink->class->open != NULL)
261 agr.sink->class->open (agr.sink);
262 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
263 procedure (agr_to_active_file, &agr);
264 if (agr.case_cnt > 0)
266 dump_aggregate_info (&agr, &agr.agr_case);
267 agr.sink->class->write (agr.sink, &agr.agr_case);
269 dict_destroy (default_dict);
270 default_dict = agr.dict;
272 vfm_source = agr.sink->class->make_source (agr.sink);
273 free_case_sink (agr.sink);
277 agr.writer = sfm_open_writer (out_file, agr.dict, get_scompression (), 0);
278 if (agr.writer == NULL)
281 if (agr.sort != NULL && !presorted)
283 /* Sorting is needed. */
284 struct casefile *dst;
285 struct casereader *reader;
288 dst = sort_active_file_to_casefile (agr.sort);
291 reader = casefile_get_destructive_reader (dst);
292 while (casereader_read_xfer (reader, &c))
294 if (aggregate_single_case (&agr, &c, &agr.agr_case))
295 sfm_write_case (agr.writer, &agr.agr_case);
298 casereader_destroy (reader);
299 casefile_destroy (dst);
303 /* Active file is already sorted. */
304 procedure (presorted_agr_to_sysfile, &agr);
307 if (agr.case_cnt > 0)
309 dump_aggregate_info (&agr, &agr.agr_case);
310 sfm_write_case (agr.writer, &agr.agr_case);
322 /* Parse all the aggregate functions. */
324 parse_aggregate_functions (struct agr_proc *agr)
326 struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
328 /* Parse everything. */
337 const struct agr_func *function;
342 struct variable **src;
356 /* Parse the list of target variables. */
357 while (!lex_match ('='))
359 int n_dest_prev = n_dest;
361 if (!parse_DATA_LIST_vars (&dest, &n_dest,
362 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
365 /* Assign empty labels. */
369 dest_label = xrealloc (dest_label, sizeof *dest_label * n_dest);
370 for (j = n_dest_prev; j < n_dest; j++)
371 dest_label[j] = NULL;
374 if (token == T_STRING)
376 ds_truncate (&tokstr, 255);
377 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
382 /* Get the name of the aggregation function. */
385 lex_error (_("expecting aggregation function"));
390 if (tokid[strlen (tokid) - 1] == '.')
393 tokid[strlen (tokid) - 1] = 0;
396 for (function = agr_func_tab; function->name; function++)
397 if (!strcasecmp (function->name, tokid))
399 if (NULL == function->name)
401 msg (SE, _("Unknown aggregation function %s."), tokid);
404 func_index = function - agr_func_tab;
407 /* Check for leading lparen. */
408 if (!lex_match ('('))
411 func_index = N_NO_VARS;
412 else if (func_index == NU)
413 func_index = NU_NO_VARS;
416 lex_error (_("expecting `('"));
422 /* Parse list of source variables. */
424 int pv_opts = PV_NO_SCRATCH;
426 if (func_index == SUM || func_index == MEAN || func_index == SD)
427 pv_opts |= PV_NUMERIC;
428 else if (function->n_args)
429 pv_opts |= PV_SAME_TYPE;
431 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
435 /* Parse function arguments, for those functions that
436 require arguments. */
437 if (function->n_args != 0)
438 for (i = 0; i < function->n_args; i++)
443 if (token == T_STRING)
445 arg[i].c = xstrdup (ds_c_str (&tokstr));
448 else if (lex_is_number ())
453 msg (SE, _("Missing argument %d to %s."), i + 1,
460 if (type != src[0]->type)
462 msg (SE, _("Arguments to %s must be of same type as "
463 "source variables."),
469 /* Trailing rparen. */
472 lex_error (_("expecting `)'"));
476 /* Now check that the number of source variables match
477 the number of target variables. If we check earlier
478 than this, the user can get very misleading error
479 message, i.e. `AGGREGATE x=SUM(y t).' will get this
480 error message when a proper message would be more
481 like `unknown variable t'. */
484 msg (SE, _("Number of source variables (%d) does not match "
485 "number of target variables (%d)."),
490 if ((func_index == PIN || func_index == POUT
491 || func_index == FIN || func_index == FOUT)
492 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
493 || (src[0]->type == ALPHA
494 && str_compare_rpad (arg[0].c, arg[1].c) > 0)))
496 union value t = arg[0];
500 msg (SW, _("The value arguments passed to the %s function "
501 "are out-of-order. They will be treated as if "
502 "they had been specified in the correct order."),
507 /* Finally add these to the linked list of aggregation
509 for (i = 0; i < n_dest; i++)
511 struct agr_var *v = xmalloc (sizeof *v);
513 /* Add variable to chain. */
514 if (agr->agr_vars != NULL)
522 /* Create the target variable in the aggregate
525 struct variable *destvar;
527 v->function = func_index;
533 if (src[i]->type == ALPHA)
535 v->function |= FSTRING;
536 v->string = xmalloc (src[i]->width);
539 if (function->alpha_type == ALPHA)
540 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
543 assert (v->src->type == NUMERIC
544 || function->alpha_type == NUMERIC);
545 destvar = dict_create_var (agr->dict, dest[i], 0);
548 if ((func_index == N || func_index == NMISS)
549 && dict_get_weight (default_dict) != NULL)
550 destvar->print = destvar->write = f8_2;
552 destvar->print = destvar->write = function->format;
557 destvar = dict_create_var (agr->dict, dest[i], 0);
558 if (func_index == N_NO_VARS
559 && dict_get_weight (default_dict) != NULL)
560 destvar->print = destvar->write = f8_2;
562 destvar->print = destvar->write = function->format;
567 msg (SE, _("Variable name %s is not unique within the "
568 "aggregate file dictionary, which contains "
569 "the aggregate variables and the break "
579 destvar->label = dest_label[i];
580 dest_label[i] = NULL;
586 v->include_missing = include_missing;
592 if (v->src->type == NUMERIC)
593 for (j = 0; j < function->n_args; j++)
594 v->arg[j].f = arg[j].f;
596 for (j = 0; j < function->n_args; j++)
597 v->arg[j].c = xstrdup (arg[j].c);
601 if (src != NULL && src[0]->type == ALPHA)
602 for (i = 0; i < function->n_args; i++)
612 if (!lex_match ('/'))
617 lex_error ("expecting end of command");
623 for (i = 0; i < n_dest; i++)
626 free (dest_label[i]);
632 if (src && n_src && src[0]->type == ALPHA)
633 for (i = 0; i < function->n_args; i++)
646 agr_destroy (struct agr_proc *agr)
648 struct agr_var *iter, *next;
650 sfm_close_writer (agr->writer);
651 if (agr->sort != NULL)
652 sort_destroy_criteria (agr->sort);
653 free (agr->break_vars);
654 case_destroy (&agr->break_case);
655 for (iter = agr->agr_vars; iter; iter = next)
659 if (iter->function & FSTRING)
664 n_args = agr_func_tab[iter->function & FUNC].n_args;
665 for (i = 0; i < n_args; i++)
666 free (iter->arg[i].c);
669 else if (iter->function == SD)
670 moments1_destroy (iter->moments);
673 if (agr->dict != NULL)
674 dict_destroy (agr->dict);
676 case_destroy (&agr->agr_case);
681 static void accumulate_aggregate_info (struct agr_proc *,
682 const struct ccase *);
683 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
685 /* Processes a single case INPUT for aggregation. If output is
686 warranted, writes it to OUTPUT and returns nonzero.
687 Otherwise, returns zero and OUTPUT is unmodified. */
689 aggregate_single_case (struct agr_proc *agr,
690 const struct ccase *input, struct ccase *output)
692 bool finished_group = false;
694 if (agr->case_cnt++ == 0)
695 initialize_aggregate_info (agr, input);
696 else if (case_compare (&agr->break_case, input,
697 agr->break_vars, agr->break_var_cnt))
699 dump_aggregate_info (agr, output);
700 finished_group = true;
702 initialize_aggregate_info (agr, input);
705 accumulate_aggregate_info (agr, input);
706 return finished_group;
709 /* Accumulates aggregation data from the case INPUT. */
711 accumulate_aggregate_info (struct agr_proc *agr,
712 const struct ccase *input)
714 struct agr_var *iter;
718 weight = dict_get_case_weight (default_dict, input, &bad_warn);
720 for (iter = agr->agr_vars; iter; iter = iter->next)
723 const union value *v = case_data (input, iter->src->fv);
725 if ((!iter->include_missing && is_missing (v, iter->src))
726 || (iter->include_missing && iter->src->type == NUMERIC
729 switch (iter->function)
732 case NMISS | FSTRING:
733 iter->dbl[0] += weight;
736 case NUMISS | FSTRING:
744 /* This is horrible. There are too many possibilities. */
745 switch (iter->function)
748 iter->dbl[0] += v->f * weight;
752 iter->dbl[0] += v->f * weight;
753 iter->dbl[1] += weight;
756 moments1_add (iter->moments, v->f, weight);
759 iter->dbl[0] = max (iter->dbl[0], v->f);
763 if (memcmp (iter->string, v->s, iter->src->width) < 0)
764 memcpy (iter->string, v->s, iter->src->width);
768 iter->dbl[0] = min (iter->dbl[0], v->f);
772 if (memcmp (iter->string, v->s, iter->src->width) > 0)
773 memcpy (iter->string, v->s, iter->src->width);
778 if (v->f > iter->arg[0].f)
779 iter->dbl[0] += weight;
780 iter->dbl[1] += weight;
784 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
785 iter->dbl[0] += weight;
786 iter->dbl[1] += weight;
790 if (v->f < iter->arg[0].f)
791 iter->dbl[0] += weight;
792 iter->dbl[1] += weight;
796 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
797 iter->dbl[0] += weight;
798 iter->dbl[1] += weight;
802 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
803 iter->dbl[0] += weight;
804 iter->dbl[1] += weight;
808 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
809 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
810 iter->dbl[0] += weight;
811 iter->dbl[1] += weight;
815 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
816 iter->dbl[0] += weight;
817 iter->dbl[1] += weight;
821 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
822 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
823 iter->dbl[0] += weight;
824 iter->dbl[1] += weight;
828 iter->dbl[0] += weight;
841 case FIRST | FSTRING:
844 memcpy (iter->string, v->s, iter->src->width);
853 memcpy (iter->string, v->s, iter->src->width);
857 case NMISS | FSTRING:
859 case NUMISS | FSTRING:
860 /* Our value is not missing or it would have been
861 caught earlier. Nothing to do. */
867 switch (iter->function)
870 iter->dbl[0] += weight;
881 /* We've come to a record that differs from the previous in one or
882 more of the break variables. Make an output record from the
883 accumulated statistics in the OUTPUT case. */
885 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
891 for (i = 0; i < agr->break_var_cnt; i++)
893 struct variable *v = agr->break_vars[i];
894 memcpy (case_data_rw (output, value_idx),
895 case_data (&agr->break_case, v->fv),
896 sizeof (union value) * v->nv);
904 for (i = agr->agr_vars; i; i = i->next)
906 union value *v = case_data_rw (output, i->dest->fv);
908 if (agr->missing == COLUMNWISE && i->missing != 0
909 && (i->function & FUNC) != N && (i->function & FUNC) != NU
910 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
912 if (i->dest->type == ALPHA)
913 memset (v->s, ' ', i->dest->width);
922 v->f = i->int1 ? i->dbl[0] : SYSMIS;
925 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
931 /* FIXME: we should use two passes. */
932 moments1_calculate (i->moments, NULL, NULL, &variance,
934 if (variance != SYSMIS)
935 v->f = sqrt (variance);
942 v->f = i->int1 ? i->dbl[0] : SYSMIS;
947 memcpy (v->s, i->string, i->dest->width);
949 memset (v->s, ' ', i->dest->width);
959 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
969 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
981 v->f = i->int1 ? i->dbl[0] : SYSMIS;
983 case FIRST | FSTRING:
986 memcpy (v->s, i->string, i->dest->width);
988 memset (v->s, ' ', i->dest->width);
997 case NMISS | FSTRING:
1001 case NUMISS | FSTRING:
1011 /* Resets the state for all the aggregate functions. */
1013 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1015 struct agr_var *iter;
1017 case_destroy (&agr->break_case);
1018 case_clone (&agr->break_case, input);
1020 for (iter = agr->agr_vars; iter; iter = iter->next)
1023 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1024 iter->int1 = iter->int2 = 0;
1025 switch (iter->function)
1028 iter->dbl[0] = DBL_MAX;
1031 memset (iter->string, 255, iter->src->width);
1034 iter->dbl[0] = -DBL_MAX;
1037 memset (iter->string, 0, iter->src->width);
1040 if (iter->moments == NULL)
1041 iter->moments = moments1_create (MOMENT_VARIANCE);
1043 moments1_clear (iter->moments);
1051 /* Aggregate each case as it comes through. Cases which aren't needed
1054 agr_to_active_file (struct ccase *c, void *agr_)
1056 struct agr_proc *agr = agr_;
1058 if (aggregate_single_case (agr, c, &agr->agr_case))
1059 agr->sink->class->write (agr->sink, &agr->agr_case);
1064 /* Aggregate the current case and output it if we passed a
1067 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1069 struct agr_proc *agr = agr_;
1071 if (aggregate_single_case (agr, c, &agr->agr_case))
1072 sfm_write_case (agr->writer, &agr->agr_case);