1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
34 #include "debug-print.h"
40 RCD_END, /* sentinel value */
41 RCD_USER, /* user-missing => one */
42 RCD_SINGLE, /* one => one */
43 RCD_HIGH, /* x > a => one */
44 RCD_LOW, /* x < b => one */
45 RCD_RANGE, /* b < x < a => one */
46 RCD_ELSE, /* any but SYSMIS => one */
47 RCD_CONVERT /* "123" => 123 */
50 /* Describes how to recode a single value or range of values into a
55 union value f1, f2; /* Describe value or range as src. Long
56 strings are stored in `c'. */
57 union value t; /* Describes value as dest. Long strings in `c'. */
60 /* Describes how to recode a single variable. */
65 unsigned flags; /* RCD_SRC_* | RCD_DEST_* | RCD_MISC_* */
67 struct variable *src; /* Source variable. */
68 struct variable *dest; /* Destination variable. */
69 char dest_name[9]; /* Name of dest variable if we're creating it. */
71 int has_sysmis; /* Do we recode for SYSMIS? */
72 union value sysmis; /* Coding for SYSMIS (if src is numeric). */
74 struct coding *map; /* Coding for other values. */
75 int nmap, mmap; /* Length of map, max capacity of map. */
78 /* RECODE transformation. */
82 struct rcd_var *codings;
85 /* What we're recoding from (`src'==`source'). */
86 #define RCD_SRC_ERROR 0000u /* Bad value for src. */
87 #define RCD_SRC_NUMERIC 0001u /* Src is numeric. */
88 #define RCD_SRC_STRING 0002u /* Src is short string. */
89 #define RCD_SRC_MASK 0003u /* AND mask to isolate src bits. */
91 /* What we're recoding to (`dest'==`destination'). */
92 #define RCD_DEST_ERROR 0000u /* Bad value for dest. */
93 #define RCD_DEST_NUMERIC 0004u /* Dest is numeric. */
94 #define RCD_DEST_STRING 0010u /* Dest is short string. */
95 #define RCD_DEST_MASK 0014u /* AND mask to isolate dest bits. */
97 /* Miscellaneous bits. */
98 #define RCD_MISC_CREATE 0020u /* We create dest var (numeric only) */
99 #define RCD_MISC_DUPLICATE 0040u /* This var_info has the same MAP
100 value as the previous var_info.
101 Prevents redundant free()ing. */
102 #define RCD_MISC_MISSING 0100u /* Encountered MISSING or SYSMIS in
105 static int parse_dest_spec (struct rcd_var * rcd, union value *v,
106 size_t *max_dst_width);
107 static int parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width);
108 static int recode_trns_proc (struct trns_header *, struct ccase *);
109 static void recode_trns_free (struct trns_header *);
110 static double convert_to_double (char *, int);
113 static void debug_print (struct rcd_var * head);
118 /* First transformation in the list. rcd is in this list. */
119 static struct rcd_var *head;
121 /* Variables in the current part of the recoding. */
125 /* Parses the RECODE transformation. */
131 /* Transformation that we're constructing. */
134 /* Type of the src variables. */
137 /* Length of longest src string. */
138 size_t max_src_width;
140 /* Length of longest dest string. */
141 size_t max_dst_width;
143 /* For stepping through, constructing the linked list of
145 struct rcd_var *iter;
147 /* The real transformation, just a wrapper for a list of
149 struct recode_trns *trns;
151 lex_match_id ("RECODE");
153 /* Parses each specification between slashes. */
154 head = rcd = xmalloc (sizeof *rcd);
157 /* Whether we've already encountered a specification for SYSMIS. */
160 /* Initialize this rcd_var to ensure proper cleanup. */
163 rcd->nmap = rcd->mmap = 0;
167 /* Parse variable names. */
168 if (!parse_variables (NULL, &v, &nv, PV_SAME_TYPE))
171 /* Ensure all variables are same type; find length of longest
174 max_src_width = v[0]->width;
177 for (i = 0; i < nv; i++)
178 if (v[i]->width > (int) max_src_width)
179 max_src_width = v[i]->width;
184 rcd->flags |= RCD_SRC_NUMERIC;
186 rcd->flags |= RCD_SRC_STRING;
188 /* Parse each coding in parentheses. */
190 if (!lex_force_match ('('))
194 /* Get the input value (before the `='). */
195 int mark = rcd->nmap;
196 int code = parse_src_spec (rcd, type, max_src_width);
200 /* ELSE is the same as any other input spec except that it
201 precludes later sysmis specifications. */
208 /* If keyword CONVERT was specified, there is no output
214 /* Get the output value (after the `='). */
215 lex_get (); /* Skip `='. */
216 if (!parse_dest_spec (rcd, &output, &max_dst_width))
219 /* Set the value for SYSMIS if requested and if we don't
221 if ((rcd->flags & RCD_MISC_MISSING) && !had_sysmis)
224 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
225 rcd->sysmis.f = output.f;
227 rcd->sysmis.c = xstrdup (output.c);
230 rcd->flags &= ~RCD_MISC_MISSING;
233 /* Since there may be multiple input values for a single
234 output, the output value need to propagated among all
236 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
237 for (i = mark; i < rcd->nmap; i++)
238 rcd->map[i].t.f = output.f;
241 for (i = mark; i < rcd->nmap; i++)
242 rcd->map[i].t.c = xstrdup (output.c);
246 lex_get (); /* Skip `)'. */
247 if (!lex_match ('('))
251 /* Append sentinel value. */
252 rcd->map[rcd->nmap++].type = RCD_END;
254 /* Since multiple variables may use the same recodings, it is
255 necessary to propogate the codings to all of them. */
258 rcd->dest_name[0] = 0;
260 for (i = 1; i < nv; i++)
262 iter = iter->next = xmalloc (sizeof *iter);
264 iter->flags = rcd->flags | RCD_MISC_DUPLICATE;
267 iter->dest_name[0] = 0;
268 iter->has_sysmis = rcd->has_sysmis;
269 iter->sysmis = rcd->sysmis;
270 iter->map = rcd->map;
273 if (lex_match_id ("INTO"))
280 if (!parse_mixed_vars (&names, &nnames, PV_NONE))
285 for (i = 0; i < nnames; i++)
288 msg (SE, _("%d variable(s) cannot be recoded into "
289 "%d variable(s). Specify the same number "
290 "of variables as input and output variables."),
295 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
296 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
298 struct variable *v = find_variable (names[i]);
302 msg (SE, _("There is no string variable named "
303 "%s. (All string variables specified "
304 "on INTO must already exist. Use the "
305 "STRING command to create a string "
306 "variable.)"), names[i]);
309 if (v->type != ALPHA)
311 msg (SE, _("Type mismatch between input and output "
312 "variables. Output variable %s is not "
313 "a string variable, but all the input "
314 "variables are string variables."), v->name);
317 if (v->width > (int) max_dst_width)
318 max_dst_width = v->width;
322 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
324 struct variable *v = find_variable (names[i]);
328 if (v->type != NUMERIC)
330 msg (SE, _("Type mismatch after INTO: %s "
331 "is not a numeric variable."), v->name);
338 strcpy (iter->dest_name, names[i]);
342 /* Note that regardless of whether we succeed or fail,
343 flow-of-control comes here. `success' is the important
344 factor. Ah, if C had garbage collection... */
346 for (i = 0; i < nnames; i++)
354 if (max_src_width > max_dst_width)
355 max_dst_width = max_src_width;
357 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC
358 && (rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
360 msg (SE, _("INTO must be used when the input values are "
361 "numeric and output values are string."));
365 if ((rcd->flags & RCD_SRC_MASK) != RCD_SRC_NUMERIC
366 && (rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
368 msg (SE, _("INTO must be used when the input values are "
369 "string and output values are numeric."));
374 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
378 for (cp = rcd->map; cp->type != RCD_END; cp++)
381 if (strlen (cp->t.c) < max_dst_width)
383 /* The NULL is only really necessary for the
385 char *repl = xmalloc (max_dst_width + 1);
386 st_pad_copy (repl, cp->t.c, max_dst_width + 1);
391 /* The strings are guaranteed to be in order of
392 nondecreasing length. */
398 if (!lex_match ('/'))
402 rcd = rcd->next = xmalloc (sizeof *rcd);
409 lex_error (_("expecting end of command"));
413 for (rcd = head; rcd; rcd = rcd->next)
414 if (rcd->dest_name[0])
416 rcd->dest = create_variable (&default_dict, rcd->dest_name,
420 /* This can occur if a destname is duplicated. We could
421 give an error at parse time but I don't care enough. */
422 rcd->dest = find_variable (rcd->dest_name);
423 assert (rcd->dest != NULL);
426 envector (rcd->dest);
429 trns = xmalloc (sizeof *trns);
430 trns->h.proc = recode_trns_proc;
431 trns->h.free = recode_trns_free;
432 trns->codings = head;
433 add_transformation ((struct trns_header *) trns);
443 struct recode_trns t;
446 recode_trns_free ((struct trns_header *) &t);
452 parse_dest_spec (struct rcd_var * rcd, union value * v, size_t *max_dst_width)
462 flags = RCD_DEST_NUMERIC;
464 else if (lex_match_id ("SYSMIS"))
467 flags = RCD_DEST_NUMERIC;
469 else if (token == T_STRING)
471 size_t max = *max_dst_width;
472 size_t toklen = ds_length (&tokstr);
475 v->c = xmalloc (max + 1);
476 st_pad_copy (v->c, ds_value (&tokstr), max + 1);
477 flags = RCD_DEST_STRING;
478 *max_dst_width = max;
481 else if (lex_match_id ("COPY"))
483 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
485 flags = RCD_DEST_NUMERIC;
490 flags = RCD_DEST_STRING;
495 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
498 else if (((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC
499 && flags != RCD_DEST_NUMERIC)
500 || ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING
501 && flags != RCD_DEST_STRING))
503 else if ((rcd->flags & RCD_DEST_MASK) ^ flags)
505 msg (SE, _("Inconsistent output types. The output values "
506 "must be all numeric or all string."));
513 /* Reads a set of source specifications and returns one of the
514 following values: 0 on failure; 1 for normal success; 2 for success
515 but with CONVERT as the keyword; 3 for success but with ELSE as the
518 parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width)
524 if (rcd->nmap >= rcd->mmap - 1)
527 rcd->map = xrealloc (rcd->map, rcd->mmap * sizeof *rcd->map);
530 c = &rcd->map[rcd->nmap];
531 c->f1.c = c->f2.c = NULL;
532 if (lex_match_id ("ELSE"))
538 else if (type == NUMERIC)
542 if (lex_match_id ("LO") || lex_match_id ("LOWEST"))
544 if (!lex_force_match_id ("THRU"))
546 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
548 else if (token == T_NUM)
556 lex_error (_("following LO THRU"));
560 else if (lex_match_id ("MISSING"))
563 rcd->flags |= RCD_MISC_MISSING;
565 else if (lex_match_id ("SYSMIS"))
568 rcd->flags |= RCD_MISC_MISSING;
572 lex_error (_("in source value"));
576 else if (token == T_NUM)
580 if (lex_match_id ("THRU"))
582 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
584 else if (token == T_NUM)
597 c->type = RCD_SINGLE;
601 lex_error (_("in source value"));
607 assert (type == ALPHA);
608 if (lex_match_id ("CONVERT"))
610 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
611 rcd->flags |= RCD_DEST_NUMERIC;
612 else if ((rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
614 msg (SE, _("Keyword CONVERT may only be used with "
615 "string input values and numeric output "
620 c->type = RCD_CONVERT;
626 /* Only the debugging code needs the NULLs at the ends
627 of the strings. However, changing code behavior more
628 than necessary based on the DEBUGGING `#define' is just
630 c->type = RCD_SINGLE;
631 if (!lex_force_string ())
633 c->f1.c = xmalloc (max_src_width + 1);
634 st_pad_copy (c->f1.c, ds_value (&tokstr), max_src_width + 1);
639 if (c->type != RCD_END)
649 /* Data transformation. */
652 recode_trns_free (struct trns_header * t)
655 struct rcd_var *head, *next;
657 head = ((struct recode_trns *) t)->codings;
660 if (head->map && !(head->flags & RCD_MISC_DUPLICATE))
662 if (head->flags & RCD_SRC_STRING)
663 for (i = 0; i < head->nmap; i++)
664 switch (head->map[i].type)
667 free (head->map[i].f2.c);
673 free (head->map[i].f1.c);
682 if (head->flags & RCD_DEST_STRING)
683 for (i = 0; i < head->nmap; i++)
684 if (head->map[i].type != RCD_CONVERT && head->map[i].type != RCD_END)
685 free (head->map[i].t.c);
694 static inline struct coding *
695 find_src_numeric (struct rcd_var * v, struct ccase * c)
697 double cmp = c->data[v->src->fv].f;
702 if (v->sysmis.f != -SYSMIS)
704 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
705 c->data[v->dest->fv].f = v->sysmis.f;
707 memcpy (c->data[v->dest->fv].s, v->sysmis.c,
713 for (cp = v->map;; cp++)
719 if (is_num_user_missing (cmp, v->src))
723 if (approx_eq (cmp, cp->f1.f))
727 if (approx_ge (cmp, cp->f1.f))
731 if (approx_le (cmp, cp->f1.f))
735 if (approx_in_range (cmp, cp->f1.f, cp->f2.f))
745 static inline struct coding *
746 find_src_string (struct rcd_var * v, struct ccase * c)
748 char *cmp = c->data[v->src->fv].s;
749 int w = v->src->width;
752 for (cp = v->map;; cp++)
758 if (!memcmp (cp->f1.c, cmp, w))
765 double f = convert_to_double (cmp, w);
768 c->data[v->dest->fv].f = f;
779 recode_trns_proc (struct trns_header * t, struct ccase * c)
784 for (v = ((struct recode_trns *) t)->codings; v; v = v->next)
786 switch (v->flags & RCD_SRC_MASK)
788 case RCD_SRC_NUMERIC:
789 cp = find_src_numeric (v, c);
792 cp = find_src_string (v, c);
798 /* A matching input value was found. */
799 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
801 double val = cp->t.f;
803 c->data[v->dest->fv].f = c->data[v->src->fv].f;
805 c->data[v->dest->fv].f = val;
811 st_bare_pad_len_copy (c->data[v->dest->fv].s,
812 c->data[v->src->fv].c,
813 v->dest->width, v->src->width);
815 memcpy (c->data[v->dest->fv].s, cp->t.c, v->dest->width);
826 dump_dest (struct rcd_var * v, union value * c)
828 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
831 else if (c->f == -SYSMIS)
834 printf ("=%g", c->f);
836 printf ("=\"%s\"", c->c);
842 debug_print (struct rcd_var * head)
844 struct rcd_var *iter, *start;
848 for (iter = head; iter; iter = iter->next)
851 printf (" %s%s", iter == head ? "" : "/", iter->src->name);
852 while (iter->next && (iter->next->flags & RCD_MISC_DUPLICATE))
855 printf (" %s", iter->src->name);
857 if (iter->has_sysmis)
860 dump_dest (iter, &iter->sysmis);
863 for (c = iter->map; c->type != RCD_END; c++)
866 if ((iter->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
870 printf (_("!!END!!"));
876 printf ("%g", c->f1.f);
879 printf ("%g THRU HIGH", c->f1.f);
882 printf ("LOW THRU %g", c->f1.f);
885 printf ("%g THRU %g", c->f1.f, c->f2.f);
891 printf (_("!!ERROR!!"));
898 printf ("\"%s\"", c->f1.c);
907 printf (_("!!ERROR!!"));
910 if (c->type != RCD_CONVERT)
911 dump_dest (iter, &c->t);
918 start->dest_name[0] ? start->dest_name : start->dest->name);
928 /* Convert NPTR to a `long int' in base 10. Returns the long int on
929 success, NOT_LONG on failure. On success stores a pointer to the
930 first character after the number into *ENDPTR. From the GNU C
933 string_to_long (char *nptr, int width, char **endptr)
936 register unsigned long int cutoff;
937 register unsigned int cutlim;
938 register unsigned long int i;
940 register unsigned char c;
945 /* Check for a sign. */
958 if (s >= nptr + width)
961 /* Save the pointer so we can check later if anything happened. */
964 cutoff = ULONG_MAX / 10ul;
965 cutlim = ULONG_MAX % 10ul;
970 if (isdigit ((unsigned char) c))
974 /* Check for overflow. */
975 if (i > cutoff || (i == cutoff && c > cutlim))
981 if (s >= nptr + width)
986 /* Check if anything actually happened. */
990 /* Check for a value that is within the range of `unsigned long
991 int', but outside the range of `long int'. We limit LONG_MIN and
992 LONG_MAX by one point because we know that NOT_LONG is out there
995 ? -((unsigned long int) LONG_MIN) - 1
996 : ((unsigned long int) LONG_MAX) - 1))
1001 /* Return the result of the appropriate sign. */
1002 return (negative ? -i : i);
1005 /* Converts S to a double according to format Fx.0. Returns the value
1006 found, or -SYSMIS if there was no valid number in s. WIDTH is the
1007 length of string S. From the GNU C library. */
1009 convert_to_double (char *s, int width)
1011 register const char *end = &s[width];
1015 /* The number so far. */
1018 int got_dot; /* Found a decimal point. */
1019 int got_digit; /* Count of digits. */
1021 /* The exponent of the number. */
1024 /* Eat whitespace. */
1025 while (s < end && isspace ((unsigned char) *s))
1031 sign = *s == '-' ? -1 : 1;
1032 if (*s == '-' || *s == '+')
1043 for (; s < end; ++s)
1045 if (isdigit ((unsigned char) *s))
1049 /* Make sure that multiplication by 10 will not overflow. */
1050 if (num > DBL_MAX * 0.1)
1051 /* The value of the digit doesn't matter, since we have already
1052 gotten as many digits as can be represented in a `double'.
1053 This doesn't necessarily mean the result will overflow.
1054 The exponent may reduce it to within range.
1056 We just need to record that there was another
1057 digit so that we can multiply by 10 later. */
1060 num = (num * 10.0) + (*s - '0');
1062 /* Keep track of the number of digits after the decimal point.
1063 If we just divided by 10 here, we would lose precision. */
1067 else if (!got_dot && *s == '.')
1068 /* Record that we have found the decimal point. */
1077 if (s < end && (tolower ((unsigned char) (*s)) == 'e'
1078 || tolower ((unsigned char) (*s)) == 'd'))
1080 /* Get the exponent specified after the `e' or `E'. */
1087 exp = string_to_long (s, end - s, &s);
1088 if (exp == NOT_LONG || end == s)
1093 while (s < end && isspace ((unsigned char) *s))
1101 /* Multiply NUM by 10 to the EXPONENT power,
1102 checking for overflow and underflow. */
1106 if (-exponent + got_digit > -(DBL_MIN_10_EXP) + 5
1107 || num < DBL_MIN * pow (10.0, (double) -exponent))
1109 num *= pow (10.0, (double) exponent);
1111 else if (exponent > 0)
1113 if (num > DBL_MAX * pow (10.0, (double) -exponent))
1115 num *= pow (10.0, (double) exponent);
1118 return sign > 0 ? num : -num;