1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
28 #include "dictionary.h"
36 #define _(msgid) gettext (msgid)
40 /* Type of source value for RECODE. */
43 RCD_END, /* sentinel value */
44 RCD_USER, /* user-missing => one */
45 RCD_SINGLE, /* one => one */
46 RCD_HIGH, /* x > a => one */
47 RCD_LOW, /* x < b => one */
48 RCD_RANGE, /* b < x < a => one */
49 RCD_ELSE, /* any but SYSMIS => one */
50 RCD_CONVERT /* "123" => 123 */
53 /* Describes how to recode a single value or range of values into a
58 union value f1, f2; /* Describe value or range as src. Long
59 strings are stored in `c'. */
60 union value t; /* Describes value as dest. Long strings in `c'. */
63 /* Describes how to recode a single variable. */
68 unsigned flags; /* RCD_SRC_* | RCD_DEST_* | RCD_MISC_* */
70 struct variable *src; /* Source variable. */
71 struct variable *dest; /* Destination variable. */
72 char dest_name[LONG_NAME_LEN + 1]; /* Name of dest variable if we're creating it. */
74 int has_sysmis; /* Do we recode for SYSMIS? */
75 union value sysmis; /* Coding for SYSMIS (if src is numeric). */
77 struct coding *map; /* Coding for other values. */
78 size_t nmap, mmap; /* Length of map, max capacity of map. */
81 /* RECODE transformation. */
84 struct rcd_var *codings;
87 /* What we're recoding from (`src'==`source'). */
88 #define RCD_SRC_ERROR 0000u /* Bad value for src. */
89 #define RCD_SRC_NUMERIC 0001u /* Src is numeric. */
90 #define RCD_SRC_STRING 0002u /* Src is short string. */
91 #define RCD_SRC_MASK 0003u /* AND mask to isolate src bits. */
93 /* What we're recoding to (`dest'==`destination'). */
94 #define RCD_DEST_ERROR 0000u /* Bad value for dest. */
95 #define RCD_DEST_NUMERIC 0004u /* Dest is numeric. */
96 #define RCD_DEST_STRING 0010u /* Dest is short string. */
97 #define RCD_DEST_MASK 0014u /* AND mask to isolate dest bits. */
99 /* Miscellaneous bits. */
100 #define RCD_MISC_CREATE 0020u /* We create dest var (numeric only) */
101 #define RCD_MISC_DUPLICATE 0040u /* This var_info has the same MAP
102 value as the previous var_info.
103 Prevents redundant free()ing. */
104 #define RCD_MISC_MISSING 0100u /* Encountered MISSING or SYSMIS in
107 static int parse_dest_spec (struct rcd_var *rcd, union value *v,
108 size_t *max_dst_width);
109 static int parse_src_spec (struct rcd_var *rcd, int type, size_t max_src_width);
110 static trns_proc_func recode_trns_proc;
111 static trns_free_func recode_trns_free;
112 static double convert_to_double (const char *, int);
116 /* Parses the RECODE transformation. */
122 /* Transformation that we're constructing. */
125 /* Type of the src variables. */
128 /* Length of longest src string. */
129 size_t max_src_width;
131 /* Length of longest dest string. */
132 size_t max_dst_width;
134 /* For stepping through, constructing the linked list of
136 struct rcd_var *iter;
138 /* The real transformation, just a wrapper for a list of
140 struct recode_trns *trns;
142 /* First transformation in the list. rcd is in this list. */
143 struct rcd_var *head;
145 /* Variables in the current part of the recoding. */
149 /* Parses each specification between slashes. */
150 head = rcd = xmalloc (sizeof *rcd);
154 /* Whether we've already encountered a specification for SYSMIS. */
157 /* Initialize this rcd_var to ensure proper cleanup. */
160 rcd->nmap = rcd->mmap = 0;
164 /* Parse variable names. */
165 if (!parse_variables (default_dict, &v, &nv, PV_SAME_TYPE))
168 /* Ensure all variables are same type; find length of longest
171 max_src_width = v[0]->width;
174 for (i = 0; i < nv; i++)
175 if (v[i]->width > (int) max_src_width)
176 max_src_width = v[i]->width;
181 rcd->flags |= RCD_SRC_NUMERIC;
183 rcd->flags |= RCD_SRC_STRING;
185 /* Parse each coding in parentheses. */
187 if (!lex_force_match ('('))
191 /* Get the input value (before the `='). */
192 size_t mark = rcd->nmap;
193 int code = parse_src_spec (rcd, type, max_src_width);
197 /* ELSE is the same as any other input spec except that it
198 precludes later sysmis specifications. */
205 /* If keyword CONVERT was specified, there is no output
211 /* Get the output value (after the `='). */
212 lex_get (); /* Skip `='. */
213 if (!parse_dest_spec (rcd, &output, &max_dst_width))
216 /* Set the value for SYSMIS if requested and if we don't
218 if ((rcd->flags & RCD_MISC_MISSING) && !had_sysmis)
221 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
222 rcd->sysmis.f = output.f;
224 rcd->sysmis.c = xstrdup (output.c);
227 rcd->flags &= ~RCD_MISC_MISSING;
230 /* Since there may be multiple input values for a single
231 output, the output value need to propagated among all
233 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
234 for (i = mark; i < rcd->nmap; i++)
235 rcd->map[i].t.f = output.f;
238 for (i = mark; i < rcd->nmap; i++)
239 rcd->map[i].t.c = output.c ? xstrdup (output.c) : NULL;
243 lex_get (); /* Skip `)'. */
244 if (!lex_match ('('))
248 /* Append sentinel value. */
249 rcd->map[rcd->nmap++].type = RCD_END;
251 /* Since multiple variables may use the same recodings, it is
252 necessary to propogate the codings to all of them. */
255 rcd->dest_name[0] = 0;
257 for (i = 1; i < nv; i++)
259 iter = iter->next = xmalloc (sizeof *iter);
261 iter->flags = rcd->flags | RCD_MISC_DUPLICATE;
264 iter->dest_name[0] = 0;
265 iter->has_sysmis = rcd->has_sysmis;
266 iter->sysmis = rcd->sysmis;
267 iter->map = rcd->map;
270 if (lex_match_id ("INTO"))
277 if (!parse_mixed_vars (&names, &nnames, PV_NONE))
282 for (i = 0; i < nnames; i++)
285 msg (SE, _("%u variable(s) cannot be recoded into "
286 "%u variable(s). Specify the same number "
287 "of variables as input and output variables."),
288 (unsigned) nv, (unsigned) nnames);
292 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
293 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
295 struct variable *v = dict_lookup_var (default_dict, names[i]);
299 msg (SE, _("There is no string variable named "
300 "%s. (All string variables specified "
301 "on INTO must already exist. Use the "
302 "STRING command to create a string "
307 if (v->type != ALPHA)
309 msg (SE, _("Type mismatch between input and output "
310 "variables. Output variable %s is not "
311 "a string variable, but all the input "
312 "variables are string variables."),
316 if (v->width > (int) max_dst_width)
317 max_dst_width = v->width;
321 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
323 struct variable *v = dict_lookup_var (default_dict, names[i]);
327 if (v->type != NUMERIC)
329 msg (SE, _("Type mismatch after INTO: %s "
330 "is not a numeric variable."), v->name);
337 strcpy (iter->dest_name, names[i]);
341 /* Note that regardless of whether we succeed or fail,
342 flow-of-control comes here. `success' is the important
343 factor. Ah, if C had garbage collection... */
345 for (i = 0; i < nnames; i++)
353 if (max_src_width > max_dst_width)
354 max_dst_width = max_src_width;
356 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC
357 && (rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
359 msg (SE, _("INTO must be used when the input values are "
360 "numeric and output values are string."));
364 if ((rcd->flags & RCD_SRC_MASK) != RCD_SRC_NUMERIC
365 && (rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
367 msg (SE, _("INTO must be used when the input values are "
368 "string and output values are numeric."));
373 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
377 for (cp = rcd->map; cp->type != RCD_END; cp++)
380 if (strlen (cp->t.c) < max_dst_width)
382 /* The NULL is only really necessary for the
384 char *repl = xmalloc (max_dst_width + 1);
385 str_copy_rpad (repl, max_dst_width + 1, cp->t.c);
390 /* The strings are guaranteed to be in order of
391 nondecreasing length. */
400 if (!lex_match ('/'))
404 rcd = rcd->next = xmalloc (sizeof *rcd);
409 lex_error (_("expecting end of command"));
413 for (rcd = head; rcd; rcd = rcd->next)
414 if (rcd->dest_name[0])
416 rcd->dest = dict_create_var (default_dict, rcd->dest_name, 0);
419 /* FIXME: This can fail if a destname is duplicated.
420 We could give an error at parse time but I don't
422 rcd->dest = dict_lookup_var_assert (default_dict, rcd->dest_name);
426 trns = xmalloc (sizeof *trns);
427 trns->codings = head;
428 add_transformation (recode_trns_proc, recode_trns_free, trns);
435 struct recode_trns t;
438 recode_trns_free (&t);
444 parse_dest_spec (struct rcd_var *rcd, union value *v, size_t *max_dst_width)
450 if (lex_is_number ())
454 flags = RCD_DEST_NUMERIC;
456 else if (lex_match_id ("SYSMIS"))
459 flags = RCD_DEST_NUMERIC;
461 else if (token == T_STRING)
463 size_t max = *max_dst_width;
464 size_t toklen = ds_length (&tokstr);
467 v->c = xmalloc (max + 1);
468 str_copy_rpad (v->c, max + 1, ds_c_str (&tokstr));
469 flags = RCD_DEST_STRING;
470 *max_dst_width = max;
473 else if (lex_match_id ("COPY"))
475 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
477 flags = RCD_DEST_NUMERIC;
482 flags = RCD_DEST_STRING;
488 lex_error (_("expecting output value"));
492 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
495 else if (((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC
496 && flags != RCD_DEST_NUMERIC)
497 || ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING
498 && flags != RCD_DEST_STRING))
500 else if ((rcd->flags & RCD_DEST_MASK) ^ flags)
502 msg (SE, _("Inconsistent output types. The output values "
503 "must be all numeric or all string."));
510 /* Reads a set of source specifications and returns one of the
511 following values: 0 on failure; 1 for normal success; 2 for success
512 but with CONVERT as the keyword; 3 for success but with ELSE as the
515 parse_src_spec (struct rcd_var *rcd, int type, size_t max_src_width)
521 if (rcd->nmap + 1 >= rcd->mmap)
524 rcd->map = xnrealloc (rcd->map, rcd->mmap, sizeof *rcd->map);
527 c = &rcd->map[rcd->nmap];
528 c->f1.c = c->f2.c = NULL;
529 if (lex_match_id ("ELSE"))
535 else if (type == NUMERIC)
539 if (lex_match_id ("LO") || lex_match_id ("LOWEST"))
541 if (!lex_force_match_id ("THRU"))
543 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
545 else if (lex_is_number ())
553 lex_error (_("following LO THRU"));
557 else if (lex_match_id ("MISSING"))
560 rcd->flags |= RCD_MISC_MISSING;
562 else if (lex_match_id ("SYSMIS"))
565 rcd->flags |= RCD_MISC_MISSING;
569 lex_error (_("in source value"));
573 else if (lex_is_number ())
577 if (lex_match_id ("THRU"))
579 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
581 else if (lex_is_number ())
594 c->type = RCD_SINGLE;
598 lex_error (_("in source value"));
604 assert (type == ALPHA);
605 if (lex_match_id ("CONVERT"))
607 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
608 rcd->flags |= RCD_DEST_NUMERIC;
609 else if ((rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
611 msg (SE, _("Keyword CONVERT may only be used with "
612 "string input values and numeric output "
617 c->type = RCD_CONVERT;
623 /* Only the debugging code needs the NULLs at the ends
624 of the strings. However, changing code behavior more
625 than necessary based on the DEBUGGING `#define' is just
627 c->type = RCD_SINGLE;
628 if (!lex_force_string ())
630 c->f1.c = xmalloc (max_src_width + 1);
631 str_copy_rpad (c->f1.c, max_src_width + 1, ds_c_str (&tokstr));
636 if (c->type != RCD_END)
646 /* Data transformation. */
649 recode_trns_free (void *t_)
651 struct recode_trns *t = t_;
653 struct rcd_var *head, *next;
658 if (head->map && !(head->flags & RCD_MISC_DUPLICATE))
660 if (head->flags & RCD_SRC_STRING)
661 for (i = 0; i < head->nmap; i++)
662 switch (head->map[i].type)
665 free (head->map[i].f2.c);
671 free (head->map[i].f1.c);
680 if (head->flags & RCD_DEST_STRING)
681 for (i = 0; i < head->nmap; i++)
682 if (head->map[i].type != RCD_CONVERT && head->map[i].type != RCD_END)
683 free (head->map[i].t.c);
693 static inline struct coding *
694 find_src_numeric (struct rcd_var *v, struct ccase *c)
696 double cmp = case_num (c, v->src->fv);
701 if (v->sysmis.f != -SYSMIS)
703 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
704 case_data_rw (c, v->dest->fv)->f = v->sysmis.f;
706 memcpy (case_data_rw (c, v->dest->fv)->s, v->sysmis.s,
712 for (cp = v->map;; cp++)
718 if (mv_is_num_user_missing (&v->src->miss, cmp))
734 if (cmp >= cp->f1.f && cmp <= cp->f2.f)
744 static inline struct coding *
745 find_src_string (struct rcd_var *v, struct ccase *c)
747 const char *cmp = case_str (c, v->src->fv);
748 int w = v->src->width;
751 for (cp = v->map;; cp++)
757 if (!memcmp (cp->f1.c, cmp, w))
764 double f = convert_to_double (cmp, w);
767 case_data_rw (c, v->dest->fv)->f = f;
778 recode_trns_proc (void *t_, struct ccase *c,
781 struct recode_trns *t = t_;
784 for (v = t->codings; v; v = v->next)
788 switch (v->flags & RCD_SRC_MASK)
790 case RCD_SRC_NUMERIC:
791 cp = find_src_numeric (v, c);
794 cp = find_src_string (v, c);
803 /* A matching input value was found. */
804 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
806 double val = cp->t.f;
807 double *out = &case_data_rw (c, v->dest->fv)->f;
809 *out = case_num (c, v->src->fv);
818 if (v->dest->fv != v->src->fv)
819 buf_copy_rpad (case_data_rw (c, v->dest->fv)->s,
821 case_str (c, v->src->fv), v->src->width);
824 memcpy (case_data_rw (c, v->dest->fv)->s, cp->t.c, v->dest->width);
831 /* Convert NPTR to a `long int' in base 10. Returns the long int on
832 success, NOT_LONG on failure. On success stores a pointer to the
833 first character after the number into *ENDPTR. From the GNU C
836 string_to_long (const char *nptr, int width, const char **endptr)
839 unsigned long int cutoff;
848 /* Check for a sign. */
861 if (s >= nptr + width)
864 /* Save the pointer so we can check later if anything happened. */
867 cutoff = ULONG_MAX / 10ul;
868 cutlim = ULONG_MAX % 10ul;
873 if (isdigit ((unsigned char) c))
877 /* Check for overflow. */
878 if (i > cutoff || (i == cutoff && c > cutlim))
884 if (s >= nptr + width)
889 /* Check if anything actually happened. */
893 /* Check for a value that is within the range of `unsigned long
894 int', but outside the range of `long int'. We limit LONG_MIN and
895 LONG_MAX by one point because we know that NOT_LONG is out there
898 ? -((unsigned long int) LONG_MIN) - 1
899 : ((unsigned long int) LONG_MAX) - 1))
904 /* Return the result of the appropriate sign. */
905 return (negative ? -i : i);
908 /* Converts S to a double according to format Fx.0. Returns the value
909 found, or -SYSMIS if there was no valid number in s. WIDTH is the
910 length of string S. From the GNU C library. */
912 convert_to_double (const char *s, int width)
914 const char *end = &s[width];
918 /* The number so far. */
921 int got_dot; /* Found a decimal point. */
922 int got_digit; /* Count of digits. */
924 /* The exponent of the number. */
927 /* Eat whitespace. */
928 while (s < end && isspace ((unsigned char) *s))
934 sign = *s == '-' ? -1 : 1;
935 if (*s == '-' || *s == '+')
948 if (isdigit ((unsigned char) *s))
952 /* Make sure that multiplication by 10 will not overflow. */
953 if (num > DBL_MAX * 0.1)
954 /* The value of the digit doesn't matter, since we have already
955 gotten as many digits as can be represented in a `double'.
956 This doesn't necessarily mean the result will overflow.
957 The exponent may reduce it to within range.
959 We just need to record that there was another
960 digit so that we can multiply by 10 later. */
963 num = (num * 10.0) + (*s - '0');
965 /* Keep track of the number of digits after the decimal point.
966 If we just divided by 10 here, we would lose precision. */
970 else if (!got_dot && *s == '.')
971 /* Record that we have found the decimal point. */
980 if (s < end && (tolower ((unsigned char) (*s)) == 'e'
981 || tolower ((unsigned char) (*s)) == 'd'))
983 /* Get the exponent specified after the `e' or `E'. */
990 exp = string_to_long (s, end - s, &s);
991 if (exp == NOT_LONG || end == s)
996 while (s < end && isspace ((unsigned char) *s))
1004 /* Multiply NUM by 10 to the EXPONENT power,
1005 checking for overflow and underflow. */
1009 if (-exponent + got_digit > -(DBL_MIN_10_EXP) + 5
1010 || num < DBL_MIN * pow (10.0, (double) -exponent))
1012 num *= pow (10.0, (double) exponent);
1014 else if (exponent > 0)
1016 if (num > DBL_MAX * pow (10.0, (double) -exponent))
1018 num *= pow (10.0, (double) exponent);
1021 return sign > 0 ? num : -num;