1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include <libpspp/message.h>
23 #include <libpspp/alloc.h>
24 #include <data/any-writer.h>
25 #include <data/case.h>
26 #include <data/casefile.h>
27 #include <language/command.h>
28 #include <data/dictionary.h>
29 #include <libpspp/message.h>
30 #include <data/file-handle-def.h>
31 #include <language/lexer/lexer.h>
32 #include <libpspp/misc.h>
33 #include <math/moments.h>
34 #include <libpspp/pool.h>
35 #include <data/settings.h>
36 #include <data/sys-file-writer.h>
37 #include <math/sort.h>
38 #include <libpspp/str.h>
39 #include <data/variable.h>
40 #include <procedure.h>
43 #define _(msgid) gettext (msgid)
45 /* Specifies how to make an aggregate variable. */
48 struct agr_var *next; /* Next in list. */
50 /* Collected during parsing. */
51 struct variable *src; /* Source variable. */
52 struct variable *dest; /* Target variable. */
53 int function; /* Function. */
54 int include_missing; /* 1=Include user-missing values. */
55 union value arg[2]; /* Arguments. */
57 /* Accumulated during AGGREGATE execution. */
62 struct moments1 *moments;
65 /* Aggregation functions. */
68 NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN,
69 FOUT, N, NU, NMISS, NUMISS, FIRST, LAST,
70 N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS,
71 FUNC = 0x1f, /* Function mask. */
72 FSTRING = 1<<5, /* String function bit. */
75 /* Attributes of an aggregation function. */
78 const char *name; /* Aggregation function name. */
79 size_t n_args; /* Number of arguments. */
80 int alpha_type; /* When given ALPHA arguments, output type. */
81 struct fmt_spec format; /* Format spec if alpha_type != ALPHA. */
84 /* Attributes of aggregation functions. */
85 static const struct agr_func agr_func_tab[] =
87 {"<NONE>", 0, -1, {0, 0, 0}},
88 {"SUM", 0, -1, {FMT_F, 8, 2}},
89 {"MEAN", 0, -1, {FMT_F, 8, 2}},
90 {"SD", 0, -1, {FMT_F, 8, 2}},
91 {"MAX", 0, ALPHA, {-1, -1, -1}},
92 {"MIN", 0, ALPHA, {-1, -1, -1}},
93 {"PGT", 1, NUMERIC, {FMT_F, 5, 1}},
94 {"PLT", 1, NUMERIC, {FMT_F, 5, 1}},
95 {"PIN", 2, NUMERIC, {FMT_F, 5, 1}},
96 {"POUT", 2, NUMERIC, {FMT_F, 5, 1}},
97 {"FGT", 1, NUMERIC, {FMT_F, 5, 3}},
98 {"FLT", 1, NUMERIC, {FMT_F, 5, 3}},
99 {"FIN", 2, NUMERIC, {FMT_F, 5, 3}},
100 {"FOUT", 2, NUMERIC, {FMT_F, 5, 3}},
101 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
102 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
103 {"NMISS", 0, NUMERIC, {FMT_F, 7, 0}},
104 {"NUMISS", 0, NUMERIC, {FMT_F, 7, 0}},
105 {"FIRST", 0, ALPHA, {-1, -1, -1}},
106 {"LAST", 0, ALPHA, {-1, -1, -1}},
107 {NULL, 0, -1, {-1, -1, -1}},
108 {"N", 0, NUMERIC, {FMT_F, 7, 0}},
109 {"NU", 0, NUMERIC, {FMT_F, 7, 0}},
112 /* Missing value types. */
113 enum missing_treatment
115 ITEMWISE, /* Missing values item by item. */
116 COLUMNWISE /* Missing values column by column. */
119 /* An entire AGGREGATE procedure. */
122 /* We have either an output file or a sink. */
123 struct any_writer *writer; /* Output file, or null if none. */
124 struct case_sink *sink; /* Sink, or null if none. */
126 /* Break variables. */
127 struct sort_criteria *sort; /* Sort criteria. */
128 struct variable **break_vars; /* Break variables. */
129 size_t break_var_cnt; /* Number of break variables. */
130 struct ccase break_case; /* Last values of break variables. */
132 enum missing_treatment missing; /* How to treat missing values. */
133 struct agr_var *agr_vars; /* First aggregate variable. */
134 struct dictionary *dict; /* Aggregate dictionary. */
135 int case_cnt; /* Counts aggregated cases. */
136 struct ccase agr_case; /* Aggregate case for output. */
139 static void initialize_aggregate_info (struct agr_proc *,
140 const struct ccase *);
143 static int parse_aggregate_functions (struct agr_proc *);
144 static void agr_destroy (struct agr_proc *);
145 static int aggregate_single_case (struct agr_proc *agr,
146 const struct ccase *input,
147 struct ccase *output);
148 static void dump_aggregate_info (struct agr_proc *agr, struct ccase *output);
150 /* Aggregating to the active file. */
151 static bool agr_to_active_file (struct ccase *, void *aux);
153 /* Aggregating to a system file. */
154 static bool presorted_agr_to_sysfile (struct ccase *, void *aux);
158 /* Parses and executes the AGGREGATE procedure. */
163 struct file_handle *out_file = NULL;
165 bool copy_documents = false;
166 bool presorted = false;
169 memset(&agr, 0 , sizeof (agr));
170 agr.missing = ITEMWISE;
171 case_nullify (&agr.break_case);
173 agr.dict = dict_create ();
174 dict_set_label (agr.dict, dict_get_label (default_dict));
175 dict_set_documents (agr.dict, dict_get_documents (default_dict));
177 /* OUTFILE subcommand must be first. */
178 if (!lex_force_match_id ("OUTFILE"))
181 if (!lex_match ('*'))
183 out_file = fh_parse (FH_REF_FILE | FH_REF_SCRATCH);
184 if (out_file == NULL)
188 /* Read most of the subcommands. */
193 if (lex_match_id ("MISSING"))
196 if (!lex_match_id ("COLUMNWISE"))
198 lex_error (_("while expecting COLUMNWISE"));
201 agr.missing = COLUMNWISE;
203 else if (lex_match_id ("DOCUMENT"))
204 copy_documents = true;
205 else if (lex_match_id ("PRESORTED"))
207 else if (lex_match_id ("BREAK"))
212 agr.sort = sort_parse_criteria (default_dict,
213 &agr.break_vars, &agr.break_var_cnt,
214 &saw_direction, NULL);
215 if (agr.sort == NULL)
218 for (i = 0; i < agr.break_var_cnt; i++)
219 dict_clone_var_assert (agr.dict, agr.break_vars[i],
220 agr.break_vars[i]->name);
222 /* BREAK must follow the options. */
227 lex_error (_("expecting BREAK"));
231 if (presorted && saw_direction)
232 msg (SW, _("When PRESORTED is specified, specifying sorting directions "
233 "with (A) or (D) has no effect. Output data will be sorted "
234 "the same way as the input data."));
236 /* Read in the aggregate functions. */
238 if (!parse_aggregate_functions (&agr))
241 /* Delete documents. */
243 dict_set_documents (agr.dict, NULL);
245 /* Cancel SPLIT FILE. */
246 dict_set_split_vars (agr.dict, NULL, 0);
250 case_create (&agr.agr_case, dict_get_next_value_idx (agr.dict));
252 /* Output to active file or external file? */
253 if (out_file == NULL)
255 /* The active file will be replaced by the aggregated data,
256 so TEMPORARY is moot. */
259 if (agr.sort != NULL && !presorted)
261 if (!sort_active_file_in_place (agr.sort))
265 agr.sink = create_case_sink (&storage_sink_class, agr.dict, NULL);
266 if (agr.sink->class->open != NULL)
267 agr.sink->class->open (agr.sink);
268 vfm_sink = create_case_sink (&null_sink_class, default_dict, NULL);
269 if (!procedure (agr_to_active_file, &agr))
271 if (agr.case_cnt > 0)
273 dump_aggregate_info (&agr, &agr.agr_case);
274 if (!agr.sink->class->write (agr.sink, &agr.agr_case))
277 dict_destroy (default_dict);
278 default_dict = agr.dict;
280 vfm_source = agr.sink->class->make_source (agr.sink);
281 free_case_sink (agr.sink);
285 agr.writer = any_writer_open (out_file, agr.dict);
286 if (agr.writer == NULL)
289 if (agr.sort != NULL && !presorted)
291 /* Sorting is needed. */
292 struct casefile *dst;
293 struct casereader *reader;
297 dst = sort_active_file_to_casefile (agr.sort);
300 reader = casefile_get_destructive_reader (dst);
301 while (ok && casereader_read_xfer (reader, &c))
303 if (aggregate_single_case (&agr, &c, &agr.agr_case))
304 ok = any_writer_write (agr.writer, &agr.agr_case);
307 casereader_destroy (reader);
309 ok = !casefile_error (dst);
310 casefile_destroy (dst);
316 /* Active file is already sorted. */
317 if (!procedure (presorted_agr_to_sysfile, &agr))
321 if (agr.case_cnt > 0)
323 dump_aggregate_info (&agr, &agr.agr_case);
324 any_writer_write (agr.writer, &agr.agr_case);
326 if (any_writer_error (agr.writer))
335 return CMD_CASCADING_FAILURE;
338 /* Parse all the aggregate functions. */
340 parse_aggregate_functions (struct agr_proc *agr)
342 struct agr_var *tail; /* Tail of linked list starting at agr->vars. */
344 /* Parse everything. */
353 const struct agr_func *function;
358 struct variable **src;
372 /* Parse the list of target variables. */
373 while (!lex_match ('='))
375 size_t n_dest_prev = n_dest;
377 if (!parse_DATA_LIST_vars (&dest, &n_dest,
378 PV_APPEND | PV_SINGLE | PV_NO_SCRATCH))
381 /* Assign empty labels. */
385 dest_label = xnrealloc (dest_label, n_dest, sizeof *dest_label);
386 for (j = n_dest_prev; j < n_dest; j++)
387 dest_label[j] = NULL;
390 if (token == T_STRING)
392 ds_truncate (&tokstr, 255);
393 dest_label[n_dest - 1] = xstrdup (ds_c_str (&tokstr));
398 /* Get the name of the aggregation function. */
401 lex_error (_("expecting aggregation function"));
406 if (tokid[strlen (tokid) - 1] == '.')
409 tokid[strlen (tokid) - 1] = 0;
412 for (function = agr_func_tab; function->name; function++)
413 if (!strcasecmp (function->name, tokid))
415 if (NULL == function->name)
417 msg (SE, _("Unknown aggregation function %s."), tokid);
420 func_index = function - agr_func_tab;
423 /* Check for leading lparen. */
424 if (!lex_match ('('))
427 func_index = N_NO_VARS;
428 else if (func_index == NU)
429 func_index = NU_NO_VARS;
432 lex_error (_("expecting `('"));
438 /* Parse list of source variables. */
440 int pv_opts = PV_NO_SCRATCH;
442 if (func_index == SUM || func_index == MEAN || func_index == SD)
443 pv_opts |= PV_NUMERIC;
444 else if (function->n_args)
445 pv_opts |= PV_SAME_TYPE;
447 if (!parse_variables (default_dict, &src, &n_src, pv_opts))
451 /* Parse function arguments, for those functions that
452 require arguments. */
453 if (function->n_args != 0)
454 for (i = 0; i < function->n_args; i++)
459 if (token == T_STRING)
461 arg[i].c = xstrdup (ds_c_str (&tokstr));
464 else if (lex_is_number ())
469 msg (SE, _("Missing argument %d to %s."), i + 1,
476 if (type != src[0]->type)
478 msg (SE, _("Arguments to %s must be of same type as "
479 "source variables."),
485 /* Trailing rparen. */
488 lex_error (_("expecting `)'"));
492 /* Now check that the number of source variables match
493 the number of target variables. If we check earlier
494 than this, the user can get very misleading error
495 message, i.e. `AGGREGATE x=SUM(y t).' will get this
496 error message when a proper message would be more
497 like `unknown variable t'. */
500 msg (SE, _("Number of source variables (%u) does not match "
501 "number of target variables (%u)."),
502 (unsigned) n_src, (unsigned) n_dest);
506 if ((func_index == PIN || func_index == POUT
507 || func_index == FIN || func_index == FOUT)
508 && ((src[0]->type == NUMERIC && arg[0].f > arg[1].f)
509 || (src[0]->type == ALPHA
510 && str_compare_rpad (arg[0].c, arg[1].c) > 0)))
512 union value t = arg[0];
516 msg (SW, _("The value arguments passed to the %s function "
517 "are out-of-order. They will be treated as if "
518 "they had been specified in the correct order."),
523 /* Finally add these to the linked list of aggregation
525 for (i = 0; i < n_dest; i++)
527 struct agr_var *v = xmalloc (sizeof *v);
529 /* Add variable to chain. */
530 if (agr->agr_vars != NULL)
538 /* Create the target variable in the aggregate
541 struct variable *destvar;
543 v->function = func_index;
549 if (src[i]->type == ALPHA)
551 v->function |= FSTRING;
552 v->string = xmalloc (src[i]->width);
555 if (function->alpha_type == ALPHA)
556 destvar = dict_clone_var (agr->dict, v->src, dest[i]);
559 assert (v->src->type == NUMERIC
560 || function->alpha_type == NUMERIC);
561 destvar = dict_create_var (agr->dict, dest[i], 0);
564 if ((func_index == N || func_index == NMISS)
565 && dict_get_weight (default_dict) != NULL)
566 destvar->print = destvar->write = f8_2;
568 destvar->print = destvar->write = function->format;
573 destvar = dict_create_var (agr->dict, dest[i], 0);
574 if (func_index == N_NO_VARS
575 && dict_get_weight (default_dict) != NULL)
576 destvar->print = destvar->write = f8_2;
578 destvar->print = destvar->write = function->format;
583 msg (SE, _("Variable name %s is not unique within the "
584 "aggregate file dictionary, which contains "
585 "the aggregate variables and the break "
595 destvar->label = dest_label[i];
596 dest_label[i] = NULL;
602 v->include_missing = include_missing;
608 if (v->src->type == NUMERIC)
609 for (j = 0; j < function->n_args; j++)
610 v->arg[j].f = arg[j].f;
612 for (j = 0; j < function->n_args; j++)
613 v->arg[j].c = xstrdup (arg[j].c);
617 if (src != NULL && src[0]->type == ALPHA)
618 for (i = 0; i < function->n_args; i++)
628 if (!lex_match ('/'))
633 lex_error ("expecting end of command");
639 for (i = 0; i < n_dest; i++)
642 free (dest_label[i]);
648 if (src && n_src && src[0]->type == ALPHA)
649 for (i = 0; i < function->n_args; i++)
662 agr_destroy (struct agr_proc *agr)
664 struct agr_var *iter, *next;
666 any_writer_close (agr->writer);
667 if (agr->sort != NULL)
668 sort_destroy_criteria (agr->sort);
669 free (agr->break_vars);
670 case_destroy (&agr->break_case);
671 for (iter = agr->agr_vars; iter; iter = next)
675 if (iter->function & FSTRING)
680 n_args = agr_func_tab[iter->function & FUNC].n_args;
681 for (i = 0; i < n_args; i++)
682 free (iter->arg[i].c);
685 else if (iter->function == SD)
686 moments1_destroy (iter->moments);
689 if (agr->dict != NULL)
690 dict_destroy (agr->dict);
692 case_destroy (&agr->agr_case);
697 static void accumulate_aggregate_info (struct agr_proc *,
698 const struct ccase *);
699 static void dump_aggregate_info (struct agr_proc *, struct ccase *);
701 /* Processes a single case INPUT for aggregation. If output is
702 warranted, writes it to OUTPUT and returns nonzero.
703 Otherwise, returns zero and OUTPUT is unmodified. */
705 aggregate_single_case (struct agr_proc *agr,
706 const struct ccase *input, struct ccase *output)
708 bool finished_group = false;
710 if (agr->case_cnt++ == 0)
711 initialize_aggregate_info (agr, input);
712 else if (case_compare (&agr->break_case, input,
713 agr->break_vars, agr->break_var_cnt))
715 dump_aggregate_info (agr, output);
716 finished_group = true;
718 initialize_aggregate_info (agr, input);
721 accumulate_aggregate_info (agr, input);
722 return finished_group;
725 /* Accumulates aggregation data from the case INPUT. */
727 accumulate_aggregate_info (struct agr_proc *agr,
728 const struct ccase *input)
730 struct agr_var *iter;
734 weight = dict_get_case_weight (default_dict, input, &bad_warn);
736 for (iter = agr->agr_vars; iter; iter = iter->next)
739 const union value *v = case_data (input, iter->src->fv);
741 if ((!iter->include_missing
742 && mv_is_value_missing (&iter->src->miss, v))
743 || (iter->include_missing && iter->src->type == NUMERIC
746 switch (iter->function)
749 case NMISS | FSTRING:
750 iter->dbl[0] += weight;
753 case NUMISS | FSTRING:
761 /* This is horrible. There are too many possibilities. */
762 switch (iter->function)
765 iter->dbl[0] += v->f * weight;
769 iter->dbl[0] += v->f * weight;
770 iter->dbl[1] += weight;
773 moments1_add (iter->moments, v->f, weight);
776 iter->dbl[0] = max (iter->dbl[0], v->f);
780 if (memcmp (iter->string, v->s, iter->src->width) < 0)
781 memcpy (iter->string, v->s, iter->src->width);
785 iter->dbl[0] = min (iter->dbl[0], v->f);
789 if (memcmp (iter->string, v->s, iter->src->width) > 0)
790 memcpy (iter->string, v->s, iter->src->width);
795 if (v->f > iter->arg[0].f)
796 iter->dbl[0] += weight;
797 iter->dbl[1] += weight;
801 if (memcmp (iter->arg[0].c, v->s, iter->src->width) < 0)
802 iter->dbl[0] += weight;
803 iter->dbl[1] += weight;
807 if (v->f < iter->arg[0].f)
808 iter->dbl[0] += weight;
809 iter->dbl[1] += weight;
813 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0)
814 iter->dbl[0] += weight;
815 iter->dbl[1] += weight;
819 if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f)
820 iter->dbl[0] += weight;
821 iter->dbl[1] += weight;
825 if (memcmp (iter->arg[0].c, v->s, iter->src->width) <= 0
826 && memcmp (iter->arg[1].c, v->s, iter->src->width) >= 0)
827 iter->dbl[0] += weight;
828 iter->dbl[1] += weight;
832 if (iter->arg[0].f > v->f || v->f > iter->arg[1].f)
833 iter->dbl[0] += weight;
834 iter->dbl[1] += weight;
838 if (memcmp (iter->arg[0].c, v->s, iter->src->width) > 0
839 || memcmp (iter->arg[1].c, v->s, iter->src->width) < 0)
840 iter->dbl[0] += weight;
841 iter->dbl[1] += weight;
845 iter->dbl[0] += weight;
858 case FIRST | FSTRING:
861 memcpy (iter->string, v->s, iter->src->width);
870 memcpy (iter->string, v->s, iter->src->width);
874 case NMISS | FSTRING:
876 case NUMISS | FSTRING:
877 /* Our value is not missing or it would have been
878 caught earlier. Nothing to do. */
884 switch (iter->function)
887 iter->dbl[0] += weight;
898 /* We've come to a record that differs from the previous in one or
899 more of the break variables. Make an output record from the
900 accumulated statistics in the OUTPUT case. */
902 dump_aggregate_info (struct agr_proc *agr, struct ccase *output)
908 for (i = 0; i < agr->break_var_cnt; i++)
910 struct variable *v = agr->break_vars[i];
911 memcpy (case_data_rw (output, value_idx),
912 case_data (&agr->break_case, v->fv),
913 sizeof (union value) * v->nv);
921 for (i = agr->agr_vars; i; i = i->next)
923 union value *v = case_data_rw (output, i->dest->fv);
925 if (agr->missing == COLUMNWISE && i->missing != 0
926 && (i->function & FUNC) != N && (i->function & FUNC) != NU
927 && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS)
929 if (i->dest->type == ALPHA)
930 memset (v->s, ' ', i->dest->width);
939 v->f = i->int1 ? i->dbl[0] : SYSMIS;
942 v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS;
948 /* FIXME: we should use two passes. */
949 moments1_calculate (i->moments, NULL, NULL, &variance,
951 if (variance != SYSMIS)
952 v->f = sqrt (variance);
959 v->f = i->int1 ? i->dbl[0] : SYSMIS;
964 memcpy (v->s, i->string, i->dest->width);
966 memset (v->s, ' ', i->dest->width);
976 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS;
986 v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS;
998 v->f = i->int1 ? i->dbl[0] : SYSMIS;
1000 case FIRST | FSTRING:
1001 case LAST | FSTRING:
1003 memcpy (v->s, i->string, i->dest->width);
1005 memset (v->s, ' ', i->dest->width);
1014 case NMISS | FSTRING:
1018 case NUMISS | FSTRING:
1028 /* Resets the state for all the aggregate functions. */
1030 initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input)
1032 struct agr_var *iter;
1034 case_destroy (&agr->break_case);
1035 case_clone (&agr->break_case, input);
1037 for (iter = agr->agr_vars; iter; iter = iter->next)
1040 iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0;
1041 iter->int1 = iter->int2 = 0;
1042 switch (iter->function)
1045 iter->dbl[0] = DBL_MAX;
1048 memset (iter->string, 255, iter->src->width);
1051 iter->dbl[0] = -DBL_MAX;
1054 memset (iter->string, 0, iter->src->width);
1057 if (iter->moments == NULL)
1058 iter->moments = moments1_create (MOMENT_VARIANCE);
1060 moments1_clear (iter->moments);
1068 /* Aggregate each case as it comes through. Cases which aren't needed
1070 Returns true if successful, false if an I/O error occurred. */
1072 agr_to_active_file (struct ccase *c, void *agr_)
1074 struct agr_proc *agr = agr_;
1076 if (aggregate_single_case (agr, c, &agr->agr_case))
1077 return agr->sink->class->write (agr->sink, &agr->agr_case);
1082 /* Aggregate the current case and output it if we passed a
1085 presorted_agr_to_sysfile (struct ccase *c, void *agr_)
1087 struct agr_proc *agr = agr_;
1089 if (aggregate_single_case (agr, c, &agr->agr_case))
1090 return any_writer_write (agr->writer, &agr->agr_case);