1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
35 /* Type of source value for RECODE. */
38 RCD_END, /* sentinel value */
39 RCD_USER, /* user-missing => one */
40 RCD_SINGLE, /* one => one */
41 RCD_HIGH, /* x > a => one */
42 RCD_LOW, /* x < b => one */
43 RCD_RANGE, /* b < x < a => one */
44 RCD_ELSE, /* any but SYSMIS => one */
45 RCD_CONVERT /* "123" => 123 */
48 /* Describes how to recode a single value or range of values into a
53 union value f1, f2; /* Describe value or range as src. Long
54 strings are stored in `c'. */
55 union value t; /* Describes value as dest. Long strings in `c'. */
58 /* Describes how to recode a single variable. */
63 unsigned flags; /* RCD_SRC_* | RCD_DEST_* | RCD_MISC_* */
65 struct variable *src; /* Source variable. */
66 struct variable *dest; /* Destination variable. */
67 char dest_name[9]; /* Name of dest variable if we're creating it. */
69 int has_sysmis; /* Do we recode for SYSMIS? */
70 union value sysmis; /* Coding for SYSMIS (if src is numeric). */
72 struct coding *map; /* Coding for other values. */
73 int nmap, mmap; /* Length of map, max capacity of map. */
76 /* RECODE transformation. */
80 struct rcd_var *codings;
83 /* What we're recoding from (`src'==`source'). */
84 #define RCD_SRC_ERROR 0000u /* Bad value for src. */
85 #define RCD_SRC_NUMERIC 0001u /* Src is numeric. */
86 #define RCD_SRC_STRING 0002u /* Src is short string. */
87 #define RCD_SRC_MASK 0003u /* AND mask to isolate src bits. */
89 /* What we're recoding to (`dest'==`destination'). */
90 #define RCD_DEST_ERROR 0000u /* Bad value for dest. */
91 #define RCD_DEST_NUMERIC 0004u /* Dest is numeric. */
92 #define RCD_DEST_STRING 0010u /* Dest is short string. */
93 #define RCD_DEST_MASK 0014u /* AND mask to isolate dest bits. */
95 /* Miscellaneous bits. */
96 #define RCD_MISC_CREATE 0020u /* We create dest var (numeric only) */
97 #define RCD_MISC_DUPLICATE 0040u /* This var_info has the same MAP
98 value as the previous var_info.
99 Prevents redundant free()ing. */
100 #define RCD_MISC_MISSING 0100u /* Encountered MISSING or SYSMIS in
103 static int parse_dest_spec (struct rcd_var * rcd, union value *v,
104 size_t *max_dst_width);
105 static int parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width);
106 static trns_proc_func recode_trns_proc;
107 static trns_free_func recode_trns_free;
108 static double convert_to_double (char *, int);
112 /* Parses the RECODE transformation. */
118 /* Transformation that we're constructing. */
121 /* Type of the src variables. */
124 /* Length of longest src string. */
125 size_t max_src_width;
127 /* Length of longest dest string. */
128 size_t max_dst_width;
130 /* For stepping through, constructing the linked list of
132 struct rcd_var *iter;
134 /* The real transformation, just a wrapper for a list of
136 struct recode_trns *trns;
138 /* First transformation in the list. rcd is in this list. */
139 struct rcd_var *head;
141 /* Variables in the current part of the recoding. */
145 /* Parses each specification between slashes. */
146 head = rcd = xmalloc (sizeof *rcd);
150 /* Whether we've already encountered a specification for SYSMIS. */
153 /* Initialize this rcd_var to ensure proper cleanup. */
156 rcd->nmap = rcd->mmap = 0;
160 /* Parse variable names. */
161 if (!parse_variables (default_dict, &v, &nv, PV_SAME_TYPE))
164 /* Ensure all variables are same type; find length of longest
167 max_src_width = v[0]->width;
170 for (i = 0; i < nv; i++)
171 if (v[i]->width > (int) max_src_width)
172 max_src_width = v[i]->width;
177 rcd->flags |= RCD_SRC_NUMERIC;
179 rcd->flags |= RCD_SRC_STRING;
181 /* Parse each coding in parentheses. */
183 if (!lex_force_match ('('))
187 /* Get the input value (before the `='). */
188 int mark = rcd->nmap;
189 int code = parse_src_spec (rcd, type, max_src_width);
193 /* ELSE is the same as any other input spec except that it
194 precludes later sysmis specifications. */
201 /* If keyword CONVERT was specified, there is no output
207 /* Get the output value (after the `='). */
208 lex_get (); /* Skip `='. */
209 if (!parse_dest_spec (rcd, &output, &max_dst_width))
212 /* Set the value for SYSMIS if requested and if we don't
214 if ((rcd->flags & RCD_MISC_MISSING) && !had_sysmis)
217 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
218 rcd->sysmis.f = output.f;
220 rcd->sysmis.c = xstrdup (output.c);
223 rcd->flags &= ~RCD_MISC_MISSING;
226 /* Since there may be multiple input values for a single
227 output, the output value need to propagated among all
229 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
230 for (i = mark; i < rcd->nmap; i++)
231 rcd->map[i].t.f = output.f;
234 for (i = mark; i < rcd->nmap; i++)
235 rcd->map[i].t.c = xstrdup (output.c);
239 lex_get (); /* Skip `)'. */
240 if (!lex_match ('('))
244 /* Append sentinel value. */
245 rcd->map[rcd->nmap++].type = RCD_END;
247 /* Since multiple variables may use the same recodings, it is
248 necessary to propogate the codings to all of them. */
251 rcd->dest_name[0] = 0;
253 for (i = 1; i < nv; i++)
255 iter = iter->next = xmalloc (sizeof *iter);
257 iter->flags = rcd->flags | RCD_MISC_DUPLICATE;
260 iter->dest_name[0] = 0;
261 iter->has_sysmis = rcd->has_sysmis;
262 iter->sysmis = rcd->sysmis;
263 iter->map = rcd->map;
266 if (lex_match_id ("INTO"))
273 if (!parse_mixed_vars (&names, &nnames, PV_NONE))
278 for (i = 0; i < nnames; i++)
281 msg (SE, _("%d variable(s) cannot be recoded into "
282 "%d variable(s). Specify the same number "
283 "of variables as input and output variables."),
288 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
289 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
291 struct variable *v = dict_lookup_var (default_dict, names[i]);
295 msg (SE, _("There is no string variable named "
296 "%s. (All string variables specified "
297 "on INTO must already exist. Use the "
298 "STRING command to create a string "
299 "variable.)"), names[i]);
302 if (v->type != ALPHA)
304 msg (SE, _("Type mismatch between input and output "
305 "variables. Output variable %s is not "
306 "a string variable, but all the input "
307 "variables are string variables."), v->name);
310 if (v->width > (int) max_dst_width)
311 max_dst_width = v->width;
315 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
317 struct variable *v = dict_lookup_var (default_dict, names[i]);
321 if (v->type != NUMERIC)
323 msg (SE, _("Type mismatch after INTO: %s "
324 "is not a numeric variable."), v->name);
331 strcpy (iter->dest_name, names[i]);
335 /* Note that regardless of whether we succeed or fail,
336 flow-of-control comes here. `success' is the important
337 factor. Ah, if C had garbage collection... */
339 for (i = 0; i < nnames; i++)
347 if (max_src_width > max_dst_width)
348 max_dst_width = max_src_width;
350 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC
351 && (rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
353 msg (SE, _("INTO must be used when the input values are "
354 "numeric and output values are string."));
358 if ((rcd->flags & RCD_SRC_MASK) != RCD_SRC_NUMERIC
359 && (rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
361 msg (SE, _("INTO must be used when the input values are "
362 "string and output values are numeric."));
367 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
371 for (cp = rcd->map; cp->type != RCD_END; cp++)
374 if (strlen (cp->t.c) < max_dst_width)
376 /* The NULL is only really necessary for the
378 char *repl = xmalloc (max_dst_width + 1);
379 st_pad_copy (repl, cp->t.c, max_dst_width + 1);
384 /* The strings are guaranteed to be in order of
385 nondecreasing length. */
391 if (!lex_match ('/'))
395 rcd = rcd->next = xmalloc (sizeof *rcd);
403 lex_error (_("expecting end of command"));
407 for (rcd = head; rcd; rcd = rcd->next)
408 if (rcd->dest_name[0])
410 rcd->dest = dict_create_var (default_dict, rcd->dest_name, 0);
413 /* FIXME: This can occur if a destname is duplicated.
414 We could give an error at parse time but I don't
416 rcd->dest = dict_lookup_var_assert (default_dict, rcd->dest_name);
420 trns = xmalloc (sizeof *trns);
421 trns->h.proc = recode_trns_proc;
422 trns->h.free = recode_trns_free;
423 trns->codings = head;
424 add_transformation ((struct trns_header *) trns);
431 struct recode_trns t;
434 recode_trns_free ((struct trns_header *) &t);
440 parse_dest_spec (struct rcd_var * rcd, union value * v, size_t *max_dst_width)
450 flags = RCD_DEST_NUMERIC;
452 else if (lex_match_id ("SYSMIS"))
455 flags = RCD_DEST_NUMERIC;
457 else if (token == T_STRING)
459 size_t max = *max_dst_width;
460 size_t toklen = ds_length (&tokstr);
463 v->c = xmalloc (max + 1);
464 st_pad_copy (v->c, ds_value (&tokstr), max + 1);
465 flags = RCD_DEST_STRING;
466 *max_dst_width = max;
469 else if (lex_match_id ("COPY"))
471 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
473 flags = RCD_DEST_NUMERIC;
478 flags = RCD_DEST_STRING;
484 lex_error (_("expecting output value"));
488 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
491 else if (((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC
492 && flags != RCD_DEST_NUMERIC)
493 || ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING
494 && flags != RCD_DEST_STRING))
496 else if ((rcd->flags & RCD_DEST_MASK) ^ flags)
498 msg (SE, _("Inconsistent output types. The output values "
499 "must be all numeric or all string."));
506 /* Reads a set of source specifications and returns one of the
507 following values: 0 on failure; 1 for normal success; 2 for success
508 but with CONVERT as the keyword; 3 for success but with ELSE as the
511 parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width)
517 if (rcd->nmap >= rcd->mmap - 1)
520 rcd->map = xrealloc (rcd->map, rcd->mmap * sizeof *rcd->map);
523 c = &rcd->map[rcd->nmap];
524 c->f1.c = c->f2.c = NULL;
525 if (lex_match_id ("ELSE"))
531 else if (type == NUMERIC)
535 if (lex_match_id ("LO") || lex_match_id ("LOWEST"))
537 if (!lex_force_match_id ("THRU"))
539 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
541 else if (token == T_NUM)
549 lex_error (_("following LO THRU"));
553 else if (lex_match_id ("MISSING"))
556 rcd->flags |= RCD_MISC_MISSING;
558 else if (lex_match_id ("SYSMIS"))
561 rcd->flags |= RCD_MISC_MISSING;
565 lex_error (_("in source value"));
569 else if (token == T_NUM)
573 if (lex_match_id ("THRU"))
575 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
577 else if (token == T_NUM)
590 c->type = RCD_SINGLE;
594 lex_error (_("in source value"));
600 assert (type == ALPHA);
601 if (lex_match_id ("CONVERT"))
603 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
604 rcd->flags |= RCD_DEST_NUMERIC;
605 else if ((rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
607 msg (SE, _("Keyword CONVERT may only be used with "
608 "string input values and numeric output "
613 c->type = RCD_CONVERT;
619 /* Only the debugging code needs the NULLs at the ends
620 of the strings. However, changing code behavior more
621 than necessary based on the DEBUGGING `#define' is just
623 c->type = RCD_SINGLE;
624 if (!lex_force_string ())
626 c->f1.c = xmalloc (max_src_width + 1);
627 st_pad_copy (c->f1.c, ds_value (&tokstr), max_src_width + 1);
632 if (c->type != RCD_END)
642 /* Data transformation. */
645 recode_trns_free (struct trns_header * t)
648 struct rcd_var *head, *next;
650 head = ((struct recode_trns *) t)->codings;
653 if (head->map && !(head->flags & RCD_MISC_DUPLICATE))
655 if (head->flags & RCD_SRC_STRING)
656 for (i = 0; i < head->nmap; i++)
657 switch (head->map[i].type)
660 free (head->map[i].f2.c);
666 free (head->map[i].f1.c);
675 if (head->flags & RCD_DEST_STRING)
676 for (i = 0; i < head->nmap; i++)
677 if (head->map[i].type != RCD_CONVERT && head->map[i].type != RCD_END)
678 free (head->map[i].t.c);
687 static inline struct coding *
688 find_src_numeric (struct rcd_var * v, struct ccase * c)
690 double cmp = c->data[v->src->fv].f;
695 if (v->sysmis.f != -SYSMIS)
697 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
698 c->data[v->dest->fv].f = v->sysmis.f;
700 memcpy (c->data[v->dest->fv].s, v->sysmis.c,
706 for (cp = v->map;; cp++)
712 if (is_num_user_missing (cmp, v->src))
728 if (cmp >= cp->f1.f && cmp <= cp->f2.f)
738 static inline struct coding *
739 find_src_string (struct rcd_var * v, struct ccase * c)
741 char *cmp = c->data[v->src->fv].s;
742 int w = v->src->width;
745 for (cp = v->map;; cp++)
751 if (!memcmp (cp->f1.c, cmp, w))
758 double f = convert_to_double (cmp, w);
761 c->data[v->dest->fv].f = f;
772 recode_trns_proc (struct trns_header * t, struct ccase * c,
777 for (v = ((struct recode_trns *) t)->codings; v; v = v->next)
781 switch (v->flags & RCD_SRC_MASK)
783 case RCD_SRC_NUMERIC:
784 cp = find_src_numeric (v, c);
787 cp = find_src_string (v, c);
796 /* A matching input value was found. */
797 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
799 double val = cp->t.f;
801 c->data[v->dest->fv].f = c->data[v->src->fv].f;
803 c->data[v->dest->fv].f = val;
809 st_bare_pad_len_copy (c->data[v->dest->fv].s,
810 c->data[v->src->fv].c,
811 v->dest->width, v->src->width);
813 memcpy (c->data[v->dest->fv].s, cp->t.c, v->dest->width);
820 /* Convert NPTR to a `long int' in base 10. Returns the long int on
821 success, NOT_LONG on failure. On success stores a pointer to the
822 first character after the number into *ENDPTR. From the GNU C
825 string_to_long (char *nptr, int width, char **endptr)
828 register unsigned long int cutoff;
829 register unsigned int cutlim;
830 register unsigned long int i;
832 register unsigned char c;
837 /* Check for a sign. */
850 if (s >= nptr + width)
853 /* Save the pointer so we can check later if anything happened. */
856 cutoff = ULONG_MAX / 10ul;
857 cutlim = ULONG_MAX % 10ul;
862 if (isdigit ((unsigned char) c))
866 /* Check for overflow. */
867 if (i > cutoff || (i == cutoff && c > cutlim))
873 if (s >= nptr + width)
878 /* Check if anything actually happened. */
882 /* Check for a value that is within the range of `unsigned long
883 int', but outside the range of `long int'. We limit LONG_MIN and
884 LONG_MAX by one point because we know that NOT_LONG is out there
887 ? -((unsigned long int) LONG_MIN) - 1
888 : ((unsigned long int) LONG_MAX) - 1))
893 /* Return the result of the appropriate sign. */
894 return (negative ? -i : i);
897 /* Converts S to a double according to format Fx.0. Returns the value
898 found, or -SYSMIS if there was no valid number in s. WIDTH is the
899 length of string S. From the GNU C library. */
901 convert_to_double (char *s, int width)
903 register const char *end = &s[width];
907 /* The number so far. */
910 int got_dot; /* Found a decimal point. */
911 int got_digit; /* Count of digits. */
913 /* The exponent of the number. */
916 /* Eat whitespace. */
917 while (s < end && isspace ((unsigned char) *s))
923 sign = *s == '-' ? -1 : 1;
924 if (*s == '-' || *s == '+')
937 if (isdigit ((unsigned char) *s))
941 /* Make sure that multiplication by 10 will not overflow. */
942 if (num > DBL_MAX * 0.1)
943 /* The value of the digit doesn't matter, since we have already
944 gotten as many digits as can be represented in a `double'.
945 This doesn't necessarily mean the result will overflow.
946 The exponent may reduce it to within range.
948 We just need to record that there was another
949 digit so that we can multiply by 10 later. */
952 num = (num * 10.0) + (*s - '0');
954 /* Keep track of the number of digits after the decimal point.
955 If we just divided by 10 here, we would lose precision. */
959 else if (!got_dot && *s == '.')
960 /* Record that we have found the decimal point. */
969 if (s < end && (tolower ((unsigned char) (*s)) == 'e'
970 || tolower ((unsigned char) (*s)) == 'd'))
972 /* Get the exponent specified after the `e' or `E'. */
979 exp = string_to_long (s, end - s, &s);
980 if (exp == NOT_LONG || end == s)
985 while (s < end && isspace ((unsigned char) *s))
993 /* Multiply NUM by 10 to the EXPONENT power,
994 checking for overflow and underflow. */
998 if (-exponent + got_digit > -(DBL_MIN_10_EXP) + 5
999 || num < DBL_MIN * pow (10.0, (double) -exponent))
1001 num *= pow (10.0, (double) exponent);
1003 else if (exponent > 0)
1005 if (num > DBL_MAX * pow (10.0, (double) -exponent))
1007 num *= pow (10.0, (double) exponent);
1010 return sign > 0 ? num : -num;