1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
28 #include "dictionary.h"
36 #define _(msgid) gettext (msgid)
40 /* Type of source value for RECODE. */
43 RCD_END, /* sentinel value */
44 RCD_USER, /* user-missing => one */
45 RCD_SINGLE, /* one => one */
46 RCD_HIGH, /* x > a => one */
47 RCD_LOW, /* x < b => one */
48 RCD_RANGE, /* b < x < a => one */
49 RCD_ELSE, /* any but SYSMIS => one */
50 RCD_CONVERT /* "123" => 123 */
53 /* Describes how to recode a single value or range of values into a
58 union value f1, f2; /* Describe value or range as src. Long
59 strings are stored in `c'. */
60 union value t; /* Describes value as dest. Long strings in `c'. */
63 /* Describes how to recode a single variable. */
68 unsigned flags; /* RCD_SRC_* | RCD_DEST_* | RCD_MISC_* */
70 struct variable *src; /* Source variable. */
71 struct variable *dest; /* Destination variable. */
72 char dest_name[LONG_NAME_LEN + 1]; /* Name of dest variable if we're creating it. */
74 int has_sysmis; /* Do we recode for SYSMIS? */
75 union value sysmis; /* Coding for SYSMIS (if src is numeric). */
77 struct coding *map; /* Coding for other values. */
78 int nmap, mmap; /* Length of map, max capacity of map. */
81 /* RECODE transformation. */
85 struct rcd_var *codings;
88 /* What we're recoding from (`src'==`source'). */
89 #define RCD_SRC_ERROR 0000u /* Bad value for src. */
90 #define RCD_SRC_NUMERIC 0001u /* Src is numeric. */
91 #define RCD_SRC_STRING 0002u /* Src is short string. */
92 #define RCD_SRC_MASK 0003u /* AND mask to isolate src bits. */
94 /* What we're recoding to (`dest'==`destination'). */
95 #define RCD_DEST_ERROR 0000u /* Bad value for dest. */
96 #define RCD_DEST_NUMERIC 0004u /* Dest is numeric. */
97 #define RCD_DEST_STRING 0010u /* Dest is short string. */
98 #define RCD_DEST_MASK 0014u /* AND mask to isolate dest bits. */
100 /* Miscellaneous bits. */
101 #define RCD_MISC_CREATE 0020u /* We create dest var (numeric only) */
102 #define RCD_MISC_DUPLICATE 0040u /* This var_info has the same MAP
103 value as the previous var_info.
104 Prevents redundant free()ing. */
105 #define RCD_MISC_MISSING 0100u /* Encountered MISSING or SYSMIS in
108 static int parse_dest_spec (struct rcd_var * rcd, union value *v,
109 size_t *max_dst_width);
110 static int parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width);
111 static trns_proc_func recode_trns_proc;
112 static trns_free_func recode_trns_free;
113 static double convert_to_double (const char *, int);
117 /* Parses the RECODE transformation. */
123 /* Transformation that we're constructing. */
126 /* Type of the src variables. */
129 /* Length of longest src string. */
130 size_t max_src_width;
132 /* Length of longest dest string. */
133 size_t max_dst_width;
135 /* For stepping through, constructing the linked list of
137 struct rcd_var *iter;
139 /* The real transformation, just a wrapper for a list of
141 struct recode_trns *trns;
143 /* First transformation in the list. rcd is in this list. */
144 struct rcd_var *head;
146 /* Variables in the current part of the recoding. */
150 /* Parses each specification between slashes. */
151 head = rcd = xmalloc (sizeof *rcd);
155 /* Whether we've already encountered a specification for SYSMIS. */
158 /* Initialize this rcd_var to ensure proper cleanup. */
161 rcd->nmap = rcd->mmap = 0;
165 /* Parse variable names. */
166 if (!parse_variables (default_dict, &v, &nv, PV_SAME_TYPE))
169 /* Ensure all variables are same type; find length of longest
172 max_src_width = v[0]->width;
175 for (i = 0; i < nv; i++)
176 if (v[i]->width > (int) max_src_width)
177 max_src_width = v[i]->width;
182 rcd->flags |= RCD_SRC_NUMERIC;
184 rcd->flags |= RCD_SRC_STRING;
186 /* Parse each coding in parentheses. */
188 if (!lex_force_match ('('))
192 /* Get the input value (before the `='). */
193 int mark = rcd->nmap;
194 int code = parse_src_spec (rcd, type, max_src_width);
198 /* ELSE is the same as any other input spec except that it
199 precludes later sysmis specifications. */
206 /* If keyword CONVERT was specified, there is no output
212 /* Get the output value (after the `='). */
213 lex_get (); /* Skip `='. */
214 if (!parse_dest_spec (rcd, &output, &max_dst_width))
217 /* Set the value for SYSMIS if requested and if we don't
219 if ((rcd->flags & RCD_MISC_MISSING) && !had_sysmis)
222 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
223 rcd->sysmis.f = output.f;
225 rcd->sysmis.c = xstrdup (output.c);
228 rcd->flags &= ~RCD_MISC_MISSING;
231 /* Since there may be multiple input values for a single
232 output, the output value need to propagated among all
234 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
235 for (i = mark; i < rcd->nmap; i++)
236 rcd->map[i].t.f = output.f;
239 for (i = mark; i < rcd->nmap; i++)
240 rcd->map[i].t.c = (output.c?xstrdup (output.c):NULL);
244 lex_get (); /* Skip `)'. */
245 if (!lex_match ('('))
249 /* Append sentinel value. */
250 rcd->map[rcd->nmap++].type = RCD_END;
252 /* Since multiple variables may use the same recodings, it is
253 necessary to propogate the codings to all of them. */
256 rcd->dest_name[0] = 0;
258 for (i = 1; i < nv; i++)
260 iter = iter->next = xmalloc (sizeof *iter);
262 iter->flags = rcd->flags | RCD_MISC_DUPLICATE;
265 iter->dest_name[0] = 0;
266 iter->has_sysmis = rcd->has_sysmis;
267 iter->sysmis = rcd->sysmis;
268 iter->map = rcd->map;
271 if (lex_match_id ("INTO"))
278 if (!parse_mixed_vars (&names, &nnames, PV_NONE))
283 for (i = 0; i < nnames; i++)
286 msg (SE, _("%d variable(s) cannot be recoded into "
287 "%d variable(s). Specify the same number "
288 "of variables as input and output variables."),
293 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
294 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
296 struct variable *v = dict_lookup_var (default_dict, names[i]);
300 msg (SE, _("There is no string variable named "
301 "%s. (All string variables specified "
302 "on INTO must already exist. Use the "
303 "STRING command to create a string "
304 "variable.)"), names[i]);
307 if (v->type != ALPHA)
309 msg (SE, _("Type mismatch between input and output "
310 "variables. Output variable %s is not "
311 "a string variable, but all the input "
312 "variables are string variables."), v->name);
315 if (v->width > (int) max_dst_width)
316 max_dst_width = v->width;
320 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
322 struct variable *v = dict_lookup_var (default_dict, names[i]);
326 if (v->type != NUMERIC)
328 msg (SE, _("Type mismatch after INTO: %s "
329 "is not a numeric variable."), v->name);
336 strcpy (iter->dest_name, names[i]);
340 /* Note that regardless of whether we succeed or fail,
341 flow-of-control comes here. `success' is the important
342 factor. Ah, if C had garbage collection... */
344 for (i = 0; i < nnames; i++)
352 if (max_src_width > max_dst_width)
353 max_dst_width = max_src_width;
355 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC
356 && (rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
358 msg (SE, _("INTO must be used when the input values are "
359 "numeric and output values are string."));
363 if ((rcd->flags & RCD_SRC_MASK) != RCD_SRC_NUMERIC
364 && (rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
366 msg (SE, _("INTO must be used when the input values are "
367 "string and output values are numeric."));
372 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
376 for (cp = rcd->map; cp->type != RCD_END; cp++)
379 if (strlen (cp->t.c) < max_dst_width)
381 /* The NULL is only really necessary for the
383 char *repl = xmalloc (max_dst_width + 1);
384 str_copy_rpad (repl, max_dst_width + 1, cp->t.c);
389 /* The strings are guaranteed to be in order of
390 nondecreasing length. */
399 if (!lex_match ('/'))
403 rcd = rcd->next = xmalloc (sizeof *rcd);
408 lex_error (_("expecting end of command"));
412 for (rcd = head; rcd; rcd = rcd->next)
413 if (rcd->dest_name[0])
415 rcd->dest = dict_create_var (default_dict, rcd->dest_name, 0);
418 /* FIXME: This can fail if a destname is duplicated.
419 We could give an error at parse time but I don't
421 rcd->dest = dict_lookup_var_assert (default_dict, rcd->dest_name);
425 trns = xmalloc (sizeof *trns);
426 trns->h.proc = recode_trns_proc;
427 trns->h.free = recode_trns_free;
428 trns->codings = head;
429 add_transformation ((struct trns_header *) trns);
436 struct recode_trns t;
439 recode_trns_free ((struct trns_header *) &t);
445 parse_dest_spec (struct rcd_var * rcd, union value * v, size_t *max_dst_width)
451 if (lex_is_number ())
455 flags = RCD_DEST_NUMERIC;
457 else if (lex_match_id ("SYSMIS"))
460 flags = RCD_DEST_NUMERIC;
462 else if (token == T_STRING)
464 size_t max = *max_dst_width;
465 size_t toklen = ds_length (&tokstr);
468 v->c = xmalloc (max + 1);
469 str_copy_rpad (v->c, max + 1, ds_c_str (&tokstr));
470 flags = RCD_DEST_STRING;
471 *max_dst_width = max;
474 else if (lex_match_id ("COPY"))
476 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
478 flags = RCD_DEST_NUMERIC;
483 flags = RCD_DEST_STRING;
489 lex_error (_("expecting output value"));
493 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
496 else if (((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC
497 && flags != RCD_DEST_NUMERIC)
498 || ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING
499 && flags != RCD_DEST_STRING))
501 else if ((rcd->flags & RCD_DEST_MASK) ^ flags)
503 msg (SE, _("Inconsistent output types. The output values "
504 "must be all numeric or all string."));
511 /* Reads a set of source specifications and returns one of the
512 following values: 0 on failure; 1 for normal success; 2 for success
513 but with CONVERT as the keyword; 3 for success but with ELSE as the
516 parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width)
522 if (rcd->nmap >= rcd->mmap - 1)
525 rcd->map = xrealloc (rcd->map, rcd->mmap * sizeof *rcd->map);
528 c = &rcd->map[rcd->nmap];
529 c->f1.c = c->f2.c = NULL;
530 if (lex_match_id ("ELSE"))
536 else if (type == NUMERIC)
540 if (lex_match_id ("LO") || lex_match_id ("LOWEST"))
542 if (!lex_force_match_id ("THRU"))
544 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
546 else if (lex_is_number ())
554 lex_error (_("following LO THRU"));
558 else if (lex_match_id ("MISSING"))
561 rcd->flags |= RCD_MISC_MISSING;
563 else if (lex_match_id ("SYSMIS"))
566 rcd->flags |= RCD_MISC_MISSING;
570 lex_error (_("in source value"));
574 else if (lex_is_number ())
578 if (lex_match_id ("THRU"))
580 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
582 else if (lex_is_number ())
595 c->type = RCD_SINGLE;
599 lex_error (_("in source value"));
605 assert (type == ALPHA);
606 if (lex_match_id ("CONVERT"))
608 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
609 rcd->flags |= RCD_DEST_NUMERIC;
610 else if ((rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
612 msg (SE, _("Keyword CONVERT may only be used with "
613 "string input values and numeric output "
618 c->type = RCD_CONVERT;
624 /* Only the debugging code needs the NULLs at the ends
625 of the strings. However, changing code behavior more
626 than necessary based on the DEBUGGING `#define' is just
628 c->type = RCD_SINGLE;
629 if (!lex_force_string ())
631 c->f1.c = xmalloc (max_src_width + 1);
632 str_copy_rpad (c->f1.c, max_src_width + 1, ds_c_str (&tokstr));
637 if (c->type != RCD_END)
647 /* Data transformation. */
650 recode_trns_free (struct trns_header * t)
653 struct rcd_var *head, *next;
655 head = ((struct recode_trns *) t)->codings;
658 if (head->map && !(head->flags & RCD_MISC_DUPLICATE))
660 if (head->flags & RCD_SRC_STRING)
661 for (i = 0; i < head->nmap; i++)
662 switch (head->map[i].type)
665 free (head->map[i].f2.c);
671 free (head->map[i].f1.c);
680 if (head->flags & RCD_DEST_STRING)
681 for (i = 0; i < head->nmap; i++)
682 if (head->map[i].type != RCD_CONVERT && head->map[i].type != RCD_END)
683 free (head->map[i].t.c);
692 static inline struct coding *
693 find_src_numeric (struct rcd_var * v, struct ccase * c)
695 double cmp = case_num (c, v->src->fv);
700 if (v->sysmis.f != -SYSMIS)
702 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
703 case_data_rw (c, v->dest->fv)->f = v->sysmis.f;
705 memcpy (case_data_rw (c, v->dest->fv)->s, v->sysmis.s,
711 for (cp = v->map;; cp++)
717 if (is_num_user_missing (cmp, v->src))
733 if (cmp >= cp->f1.f && cmp <= cp->f2.f)
743 static inline struct coding *
744 find_src_string (struct rcd_var * v, struct ccase * c)
746 const char *cmp = case_str (c, v->src->fv);
747 int w = v->src->width;
750 for (cp = v->map;; cp++)
756 if (!memcmp (cp->f1.c, cmp, w))
763 double f = convert_to_double (cmp, w);
766 case_data_rw (c, v->dest->fv)->f = f;
777 recode_trns_proc (struct trns_header * t, struct ccase * c,
782 for (v = ((struct recode_trns *) t)->codings; v; v = v->next)
786 switch (v->flags & RCD_SRC_MASK)
788 case RCD_SRC_NUMERIC:
789 cp = find_src_numeric (v, c);
792 cp = find_src_string (v, c);
801 /* A matching input value was found. */
802 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
804 double val = cp->t.f;
805 double *out = &case_data_rw (c, v->dest->fv)->f;
807 *out = case_num (c, v->src->fv);
816 if (v->dest->fv != v->src->fv)
817 buf_copy_rpad (case_data_rw (c, v->dest->fv)->s,
819 case_str (c, v->src->fv), v->src->width);
822 memcpy (case_data_rw (c, v->dest->fv)->s, cp->t.c, v->dest->width);
829 /* Convert NPTR to a `long int' in base 10. Returns the long int on
830 success, NOT_LONG on failure. On success stores a pointer to the
831 first character after the number into *ENDPTR. From the GNU C
834 string_to_long (const char *nptr, int width, const char **endptr)
837 register unsigned long int cutoff;
838 register unsigned int cutlim;
839 register unsigned long int i;
840 register const char *s;
841 register unsigned char c;
846 /* Check for a sign. */
859 if (s >= nptr + width)
862 /* Save the pointer so we can check later if anything happened. */
865 cutoff = ULONG_MAX / 10ul;
866 cutlim = ULONG_MAX % 10ul;
871 if (isdigit ((unsigned char) c))
875 /* Check for overflow. */
876 if (i > cutoff || (i == cutoff && c > cutlim))
882 if (s >= nptr + width)
887 /* Check if anything actually happened. */
891 /* Check for a value that is within the range of `unsigned long
892 int', but outside the range of `long int'. We limit LONG_MIN and
893 LONG_MAX by one point because we know that NOT_LONG is out there
896 ? -((unsigned long int) LONG_MIN) - 1
897 : ((unsigned long int) LONG_MAX) - 1))
902 /* Return the result of the appropriate sign. */
903 return (negative ? -i : i);
906 /* Converts S to a double according to format Fx.0. Returns the value
907 found, or -SYSMIS if there was no valid number in s. WIDTH is the
908 length of string S. From the GNU C library. */
910 convert_to_double (const char *s, int width)
912 register const char *end = &s[width];
916 /* The number so far. */
919 int got_dot; /* Found a decimal point. */
920 int got_digit; /* Count of digits. */
922 /* The exponent of the number. */
925 /* Eat whitespace. */
926 while (s < end && isspace ((unsigned char) *s))
932 sign = *s == '-' ? -1 : 1;
933 if (*s == '-' || *s == '+')
946 if (isdigit ((unsigned char) *s))
950 /* Make sure that multiplication by 10 will not overflow. */
951 if (num > DBL_MAX * 0.1)
952 /* The value of the digit doesn't matter, since we have already
953 gotten as many digits as can be represented in a `double'.
954 This doesn't necessarily mean the result will overflow.
955 The exponent may reduce it to within range.
957 We just need to record that there was another
958 digit so that we can multiply by 10 later. */
961 num = (num * 10.0) + (*s - '0');
963 /* Keep track of the number of digits after the decimal point.
964 If we just divided by 10 here, we would lose precision. */
968 else if (!got_dot && *s == '.')
969 /* Record that we have found the decimal point. */
978 if (s < end && (tolower ((unsigned char) (*s)) == 'e'
979 || tolower ((unsigned char) (*s)) == 'd'))
981 /* Get the exponent specified after the `e' or `E'. */
988 exp = string_to_long (s, end - s, &s);
989 if (exp == NOT_LONG || end == s)
994 while (s < end && isspace ((unsigned char) *s))
1002 /* Multiply NUM by 10 to the EXPONENT power,
1003 checking for overflow and underflow. */
1007 if (-exponent + got_digit > -(DBL_MIN_10_EXP) + 5
1008 || num < DBL_MIN * pow (10.0, (double) -exponent))
1010 num *= pow (10.0, (double) exponent);
1012 else if (exponent > 0)
1014 if (num > DBL_MAX * pow (10.0, (double) -exponent))
1016 num *= pow (10.0, (double) exponent);
1019 return sign > 0 ? num : -num;