1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
35 /*#define DEBUGGING 1 */
36 #include "debug-print.h"
42 RCD_END, /* sentinel value */
43 RCD_USER, /* user-missing => one */
44 RCD_SINGLE, /* one => one */
45 RCD_HIGH, /* x > a => one */
46 RCD_LOW, /* x < b => one */
47 RCD_RANGE, /* b < x < a => one */
48 RCD_ELSE, /* any but SYSMIS => one */
49 RCD_CONVERT /* "123" => 123 */
52 /* Describes how to recode a single value or range of values into a
57 union value f1, f2; /* Describe value or range as src. Long
58 strings are stored in `c'. */
59 union value t; /* Describes value as dest. Long strings in `c'. */
62 /* Describes how to recode a single variable. */
67 unsigned flags; /* RCD_SRC_* | RCD_DEST_* | RCD_MISC_* */
69 struct variable *src; /* Source variable. */
70 struct variable *dest; /* Destination variable. */
71 char dest_name[9]; /* Name of dest variable if we're creating it. */
73 int has_sysmis; /* Do we recode for SYSMIS? */
74 union value sysmis; /* Coding for SYSMIS (if src is numeric). */
76 struct coding *map; /* Coding for other values. */
77 int nmap, mmap; /* Length of map, max capacity of map. */
80 /* RECODE transformation. */
84 struct rcd_var *codings;
87 /* What we're recoding from (`src'==`source'). */
88 #define RCD_SRC_ERROR 0000u /* Bad value for src. */
89 #define RCD_SRC_NUMERIC 0001u /* Src is numeric. */
90 #define RCD_SRC_STRING 0002u /* Src is short string. */
91 #define RCD_SRC_MASK 0003u /* AND mask to isolate src bits. */
93 /* What we're recoding to (`dest'==`destination'). */
94 #define RCD_DEST_ERROR 0000u /* Bad value for dest. */
95 #define RCD_DEST_NUMERIC 0004u /* Dest is numeric. */
96 #define RCD_DEST_STRING 0010u /* Dest is short string. */
97 #define RCD_DEST_MASK 0014u /* AND mask to isolate dest bits. */
99 /* Miscellaneous bits. */
100 #define RCD_MISC_CREATE 0020u /* We create dest var (numeric only) */
101 #define RCD_MISC_DUPLICATE 0040u /* This var_info has the same MAP
102 value as the previous var_info.
103 Prevents redundant free()ing. */
104 #define RCD_MISC_MISSING 0100u /* Encountered MISSING or SYSMIS in
107 static int parse_dest_spec (struct rcd_var * rcd, union value *v,
108 size_t *max_dst_width);
109 static int parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width);
110 static int recode_trns_proc (struct trns_header *, struct ccase *);
111 static void recode_trns_free (struct trns_header *);
112 static double convert_to_double (char *, int);
115 static void debug_print (rcd_var * head);
120 /* First transformation in the list. rcd is in this list. */
121 static struct rcd_var *head;
123 /* Variables in the current part of the recoding. */
127 /* Parses the RECODE transformation. */
133 /* Transformation that we're constructing. */
136 /* Type of the src variables. */
139 /* Length of longest src string. */
140 size_t max_src_width;
142 /* Length of longest dest string. */
143 size_t max_dst_width;
145 /* For stepping through, constructing the linked list of
147 struct rcd_var *iter;
149 /* The real transformation, just a wrapper for a list of
151 struct recode_trns *trns;
153 lex_match_id ("RECODE");
155 /* Parses each specification between slashes. */
156 head = rcd = xmalloc (sizeof *rcd);
159 /* Whether we've already encountered a specification for SYSMIS. */
162 /* Initialize this rcd_var to ensure proper cleanup. */
165 rcd->nmap = rcd->mmap = 0;
169 /* Parse variable names. */
170 if (!parse_variables (NULL, &v, &nv, PV_SAME_TYPE))
173 /* Ensure all variables are same type; find length of longest
176 max_src_width = v[0]->width;
179 for (i = 0; i < nv; i++)
180 if (v[i]->width > (int) max_src_width)
181 max_src_width = v[i]->width;
186 rcd->flags |= RCD_SRC_NUMERIC;
188 rcd->flags |= RCD_SRC_STRING;
190 /* Parse each coding in parentheses. */
192 if (!lex_force_match ('('))
196 /* Get the input value (before the `='). */
197 int mark = rcd->nmap;
198 int code = parse_src_spec (rcd, type, max_src_width);
202 /* ELSE is the same as any other input spec except that it
203 precludes later sysmis specifications. */
210 /* If keyword CONVERT was specified, there is no output
216 /* Get the output value (after the `='). */
217 lex_get (); /* Skip `='. */
218 if (!parse_dest_spec (rcd, &output, &max_dst_width))
221 /* Set the value for SYSMIS if requested and if we don't
223 if ((rcd->flags & RCD_MISC_MISSING) && !had_sysmis)
226 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
227 rcd->sysmis.f = output.f;
229 rcd->sysmis.c = xstrdup (output.c);
232 rcd->flags &= ~RCD_MISC_MISSING;
235 /* Since there may be multiple input values for a single
236 output, the output value need to propagated among all
238 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
239 for (i = mark; i < rcd->nmap; i++)
240 rcd->map[i].t.f = output.f;
243 for (i = mark; i < rcd->nmap; i++)
244 rcd->map[i].t.c = xstrdup (output.c);
248 lex_get (); /* Skip `)'. */
249 if (!lex_match ('('))
253 /* Append sentinel value. */
254 rcd->map[rcd->nmap++].type = RCD_END;
256 /* Since multiple variables may use the same recodings, it is
257 necessary to propogate the codings to all of them. */
260 rcd->dest_name[0] = 0;
262 for (i = 1; i < nv; i++)
264 iter = iter->next = xmalloc (sizeof *iter);
266 iter->flags = rcd->flags | RCD_MISC_DUPLICATE;
269 iter->dest_name[0] = 0;
270 iter->has_sysmis = rcd->has_sysmis;
271 iter->sysmis = rcd->sysmis;
272 iter->map = rcd->map;
275 if (lex_match_id ("INTO"))
282 if (!parse_mixed_vars (&names, &nnames, PV_NONE))
287 for (i = 0; i < nnames; i++)
290 msg (SE, _("%d variable(s) cannot be recoded into "
291 "%d variable(s). Specify the same number "
292 "of variables as input and output variables."),
297 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
298 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
300 struct variable *v = find_variable (names[i]);
304 msg (SE, _("There is no string variable named "
305 "%s. (All string variables specified "
306 "on INTO must already exist. Use the "
307 "STRING command to create a string "
308 "variable.)"), names[i]);
311 if (v->type != ALPHA)
313 msg (SE, _("Type mismatch between input and output "
314 "variables. Output variable %s is not "
315 "a string variable, but all the input "
316 "variables are string variables."), v->name);
319 if (v->width > (int) max_dst_width)
320 max_dst_width = v->width;
324 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
326 struct variable *v = find_variable (names[i]);
330 if (v->type != NUMERIC)
332 msg (SE, _("Type mismatch after INTO: %s "
333 "is not a numeric variable."), v->name);
340 strcpy (iter->dest_name, names[i]);
344 /* Note that regardless of whether we succeed or fail,
345 flow-of-control comes here. `success' is the important
346 factor. Ah, if C had garbage collection... */
348 for (i = 0; i < nnames; i++)
356 if (max_src_width > max_dst_width)
357 max_dst_width = max_src_width;
359 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC
360 && (rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
362 msg (SE, _("INTO must be used when the input values are "
363 "numeric and output values are string."));
367 if ((rcd->flags & RCD_SRC_MASK) != RCD_SRC_NUMERIC
368 && (rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
370 msg (SE, _("INTO must be used when the input values are "
371 "string and output values are numeric."));
376 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
380 for (cp = rcd->map; cp->type != RCD_END; cp++)
383 if (strlen (cp->t.c) < max_dst_width)
385 /* The NULL is only really necessary for the
387 char *repl = xmalloc (max_dst_width + 1);
388 st_pad_copy (repl, cp->t.c, max_dst_width + 1);
393 /* The strings are guaranteed to be in order of
394 nondecreasing length. */
400 if (!lex_match ('/'))
404 rcd = rcd->next = xmalloc (sizeof *rcd);
411 lex_error (_("expecting end of command"));
415 for (rcd = head; rcd; rcd = rcd->next)
416 if (rcd->dest_name[0])
418 rcd->dest = create_variable (&default_dict, rcd->dest_name,
422 /* This can occur if a destname is duplicated. We could
423 give an error at parse time but I don't care enough. */
424 rcd->dest = find_variable (rcd->dest_name);
425 assert (rcd->dest != NULL);
428 envector (rcd->dest);
431 trns = xmalloc (sizeof *trns);
432 trns->h.proc = recode_trns_proc;
433 trns->h.free = recode_trns_free;
434 trns->codings = head;
435 add_transformation ((struct trns_header *) trns);
445 struct recode_trns t;
448 recode_trns_free ((struct trns_header *) &t);
454 parse_dest_spec (struct rcd_var * rcd, union value * v, size_t *max_dst_width)
464 flags = RCD_DEST_NUMERIC;
466 else if (lex_match_id ("SYSMIS"))
469 flags = RCD_DEST_NUMERIC;
471 else if (token == T_STRING)
473 size_t max = *max_dst_width;
474 size_t toklen = ds_length (&tokstr);
477 v->c = xmalloc (max + 1);
478 st_pad_copy (v->c, ds_value (&tokstr), max + 1);
479 flags = RCD_DEST_STRING;
480 *max_dst_width = max;
483 else if (lex_match_id ("COPY"))
485 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
487 flags = RCD_DEST_NUMERIC;
492 flags = RCD_DEST_STRING;
497 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
500 else if (((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC
501 && flags != RCD_DEST_NUMERIC)
502 || ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING
503 && flags != RCD_DEST_STRING))
505 else if ((rcd->flags & RCD_DEST_MASK) ^ flags)
507 msg (SE, _("Inconsistent output types. The output values "
508 "must be all numeric or all string."));
515 /* Reads a set of source specifications and returns one of the
516 following values: 0 on failure; 1 for normal success; 2 for success
517 but with CONVERT as the keyword; 3 for success but with ELSE as the
520 parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width)
526 if (rcd->nmap >= rcd->mmap - 1)
529 rcd->map = xrealloc (rcd->map, rcd->mmap * sizeof *rcd->map);
532 c = &rcd->map[rcd->nmap];
533 c->f1.c = c->f2.c = NULL;
534 if (lex_match_id ("ELSE"))
540 else if (type == NUMERIC)
544 if (lex_match_id ("LO") || lex_match_id ("LOWEST"))
546 if (!lex_force_match_id ("THRU"))
548 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
550 else if (token == T_NUM)
558 lex_error (_("following LO THRU"));
562 else if (lex_match_id ("MISSING"))
565 rcd->flags |= RCD_MISC_MISSING;
567 else if (lex_match_id ("SYSMIS"))
570 rcd->flags |= RCD_MISC_MISSING;
574 lex_error (_("in source value"));
578 else if (token == T_NUM)
582 if (lex_match_id ("THRU"))
584 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
586 else if (token == T_NUM)
599 c->type = RCD_SINGLE;
603 lex_error (_("in source value"));
609 assert (type == ALPHA);
610 if (lex_match_id ("CONVERT"))
612 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
613 rcd->flags |= RCD_DEST_NUMERIC;
614 else if ((rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
616 msg (SE, _("Keyword CONVERT may only be used with "
617 "string input values and numeric output "
622 c->type = RCD_CONVERT;
628 /* Only the debugging code needs the NULLs at the ends
629 of the strings. However, changing code behavior more
630 than necessary based on the DEBUGGING `#define' is just
632 c->type = RCD_SINGLE;
633 if (!lex_force_string ())
635 c->f1.c = xmalloc (max_src_width + 1);
636 st_pad_copy (c->f1.c, ds_value (&tokstr), max_src_width + 1);
641 if (c->type != RCD_END)
651 /* Data transformation. */
654 recode_trns_free (struct trns_header * t)
657 struct rcd_var *head, *next;
659 head = ((struct recode_trns *) t)->codings;
662 if (head->map && !(head->flags & RCD_MISC_DUPLICATE))
664 if (head->flags & RCD_SRC_STRING)
665 for (i = 0; i < head->nmap; i++)
666 switch (head->map[i].type)
669 free (head->map[i].f2.c);
675 free (head->map[i].f1.c);
684 if (head->flags & RCD_DEST_STRING)
685 for (i = 0; i < head->nmap; i++)
686 if (head->map[i].type != RCD_CONVERT && head->map[i].type != RCD_END)
687 free (head->map[i].t.c);
696 static inline struct coding *
697 find_src_numeric (struct rcd_var * v, struct ccase * c)
699 double cmp = c->data[v->src->fv].f;
704 if (v->sysmis.f != -SYSMIS)
706 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
707 c->data[v->dest->fv].f = v->sysmis.f;
709 memcpy (c->data[v->dest->fv].s, v->sysmis.c,
715 for (cp = v->map;; cp++)
721 if (is_num_user_missing (cmp, v->src))
725 if (approx_eq (cmp, cp->f1.f))
729 if (approx_ge (cmp, cp->f1.f))
733 if (approx_le (cmp, cp->f1.f))
737 if (approx_in_range (cmp, cp->f1.f, cp->f2.f))
747 static inline struct coding *
748 find_src_string (struct rcd_var * v, struct ccase * c)
750 char *cmp = c->data[v->src->fv].s;
751 int w = v->src->width;
754 for (cp = v->map;; cp++)
760 if (!memcmp (cp->f1.c, cmp, w))
767 double f = convert_to_double (cmp, w);
770 c->data[v->dest->fv].f = f;
781 recode_trns_proc (struct trns_header * t, struct ccase * c)
786 for (v = ((struct recode_trns *) t)->codings; v; v = v->next)
788 switch (v->flags & RCD_SRC_MASK)
790 case RCD_SRC_NUMERIC:
791 cp = find_src_numeric (v, c);
794 cp = find_src_string (v, c);
800 /* A matching input value was found. */
801 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
803 double val = cp->t.f;
805 c->data[v->dest->fv].f = c->data[v->src->fv].f;
807 c->data[v->dest->fv].f = val;
813 st_bare_pad_len_copy (c->data[v->dest->fv].s,
814 c->data[v->src->fv].c,
815 v->dest->width, v->src->width);
817 memcpy (c->data[v->dest->fv].s, cp->t.c, v->dest->width);
828 dump_dest (struct rcd_var * v, union value * c)
830 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
833 else if (c->f == -SYSMIS)
836 printf ("=%g", c->f);
838 printf ("=\"%s\"", c->c);
844 debug_print (struct rcd_var * head)
846 struct rcd_var *iter, *start;
850 for (iter = head; iter; iter = iter->next)
853 printf (" %s%s", iter == head ? "" : "/", iter->src->name);
854 while (iter->next && (iter->next->flags & RCD_MISC_DUPLICATE))
857 printf (" %s", iter->src->name);
859 if (iter->has_sysmis)
862 dump_dest (iter, &iter->sysmis);
865 for (c = iter->map; c->type != RCD_END; c++)
868 if ((iter->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
872 printf (_("!!END!!"));
878 printf ("%g", c->f1.f);
881 printf ("%g THRU HIGH", c->f1.f);
884 printf ("LOW THRU %g", c->f1.f);
887 printf ("%g THRU %g", c->f1.f, c->f2.f);
893 printf (_("!!ERROR!!"));
900 printf ("\"%s\"", c->f1.c);
909 printf (_("!!ERROR!!"));
912 if (c->type != RCD_CONVERT)
913 dump_dest (iter, &c->t);
920 start->dest_name[0] ? start->dest_name : start->dest->name);
930 /* Convert NPTR to a `long int' in base 10. Returns the long int on
931 success, NOT_LONG on failure. On success stores a pointer to the
932 first character after the number into *ENDPTR. From the GNU C
935 string_to_long (char *nptr, int width, char **endptr)
938 register unsigned long int cutoff;
939 register unsigned int cutlim;
940 register unsigned long int i;
942 register unsigned char c;
947 /* Check for a sign. */
960 if (s >= nptr + width)
963 /* Save the pointer so we can check later if anything happened. */
966 cutoff = ULONG_MAX / 10ul;
967 cutlim = ULONG_MAX % 10ul;
972 if (isdigit ((unsigned char) c))
976 /* Check for overflow. */
977 if (i > cutoff || (i == cutoff && c > cutlim))
983 if (s >= nptr + width)
988 /* Check if anything actually happened. */
992 /* Check for a value that is within the range of `unsigned long
993 int', but outside the range of `long int'. We limit LONG_MIN and
994 LONG_MAX by one point because we know that NOT_LONG is out there
997 ? -((unsigned long int) LONG_MIN) - 1
998 : ((unsigned long int) LONG_MAX) - 1))
1003 /* Return the result of the appropriate sign. */
1004 return (negative ? -i : i);
1007 /* Converts S to a double according to format Fx.0. Returns the value
1008 found, or -SYSMIS if there was no valid number in s. WIDTH is the
1009 length of string S. From the GNU C library. */
1011 convert_to_double (char *s, int width)
1013 register const char *end = &s[width];
1017 /* The number so far. */
1020 int got_dot; /* Found a decimal point. */
1021 int got_digit; /* Count of digits. */
1023 /* The exponent of the number. */
1026 /* Eat whitespace. */
1027 while (s < end && isspace ((unsigned char) *s))
1033 sign = *s == '-' ? -1 : 1;
1034 if (*s == '-' || *s == '+')
1045 for (; s < end; ++s)
1047 if (isdigit ((unsigned char) *s))
1051 /* Make sure that multiplication by 10 will not overflow. */
1052 if (num > DBL_MAX * 0.1)
1053 /* The value of the digit doesn't matter, since we have already
1054 gotten as many digits as can be represented in a `double'.
1055 This doesn't necessarily mean the result will overflow.
1056 The exponent may reduce it to within range.
1058 We just need to record that there was another
1059 digit so that we can multiply by 10 later. */
1062 num = (num * 10.0) + (*s - '0');
1064 /* Keep track of the number of digits after the decimal point.
1065 If we just divided by 10 here, we would lose precision. */
1069 else if (!got_dot && *s == '.')
1070 /* Record that we have found the decimal point. */
1079 if (s < end && (tolower ((unsigned char) (*s)) == 'e'
1080 || tolower ((unsigned char) (*s)) == 'd'))
1082 /* Get the exponent specified after the `e' or `E'. */
1089 exp = string_to_long (s, end - s, &s);
1090 if (exp == NOT_LONG || end == s)
1095 while (s < end && isspace ((unsigned char) *s))
1103 /* Multiply NUM by 10 to the EXPONENT power,
1104 checking for overflow and underflow. */
1108 if (-exponent + got_digit > -(DBL_MIN_10_EXP) + 5
1109 || num < DBL_MIN * pow (10.0, (double) -exponent))
1111 num *= pow (10.0, (double) exponent);
1113 else if (exponent > 0)
1115 if (num > DBL_MAX * pow (10.0, (double) -exponent))
1117 num *= pow (10.0, (double) exponent);
1120 return sign > 0 ? num : -num;