1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
28 #include "dictionary.h"
37 /* Type of source value for RECODE. */
40 RCD_END, /* sentinel value */
41 RCD_USER, /* user-missing => one */
42 RCD_SINGLE, /* one => one */
43 RCD_HIGH, /* x > a => one */
44 RCD_LOW, /* x < b => one */
45 RCD_RANGE, /* b < x < a => one */
46 RCD_ELSE, /* any but SYSMIS => one */
47 RCD_CONVERT /* "123" => 123 */
50 /* Describes how to recode a single value or range of values into a
55 union value f1, f2; /* Describe value or range as src. Long
56 strings are stored in `c'. */
57 union value t; /* Describes value as dest. Long strings in `c'. */
60 /* Describes how to recode a single variable. */
65 unsigned flags; /* RCD_SRC_* | RCD_DEST_* | RCD_MISC_* */
67 struct variable *src; /* Source variable. */
68 struct variable *dest; /* Destination variable. */
69 char dest_name[9]; /* Name of dest variable if we're creating it. */
71 int has_sysmis; /* Do we recode for SYSMIS? */
72 union value sysmis; /* Coding for SYSMIS (if src is numeric). */
74 struct coding *map; /* Coding for other values. */
75 int nmap, mmap; /* Length of map, max capacity of map. */
78 /* RECODE transformation. */
82 struct rcd_var *codings;
85 /* What we're recoding from (`src'==`source'). */
86 #define RCD_SRC_ERROR 0000u /* Bad value for src. */
87 #define RCD_SRC_NUMERIC 0001u /* Src is numeric. */
88 #define RCD_SRC_STRING 0002u /* Src is short string. */
89 #define RCD_SRC_MASK 0003u /* AND mask to isolate src bits. */
91 /* What we're recoding to (`dest'==`destination'). */
92 #define RCD_DEST_ERROR 0000u /* Bad value for dest. */
93 #define RCD_DEST_NUMERIC 0004u /* Dest is numeric. */
94 #define RCD_DEST_STRING 0010u /* Dest is short string. */
95 #define RCD_DEST_MASK 0014u /* AND mask to isolate dest bits. */
97 /* Miscellaneous bits. */
98 #define RCD_MISC_CREATE 0020u /* We create dest var (numeric only) */
99 #define RCD_MISC_DUPLICATE 0040u /* This var_info has the same MAP
100 value as the previous var_info.
101 Prevents redundant free()ing. */
102 #define RCD_MISC_MISSING 0100u /* Encountered MISSING or SYSMIS in
105 static int parse_dest_spec (struct rcd_var * rcd, union value *v,
106 size_t *max_dst_width);
107 static int parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width);
108 static trns_proc_func recode_trns_proc;
109 static trns_free_func recode_trns_free;
110 static double convert_to_double (const char *, int);
114 /* Parses the RECODE transformation. */
120 /* Transformation that we're constructing. */
123 /* Type of the src variables. */
126 /* Length of longest src string. */
127 size_t max_src_width;
129 /* Length of longest dest string. */
130 size_t max_dst_width;
132 /* For stepping through, constructing the linked list of
134 struct rcd_var *iter;
136 /* The real transformation, just a wrapper for a list of
138 struct recode_trns *trns;
140 /* First transformation in the list. rcd is in this list. */
141 struct rcd_var *head;
143 /* Variables in the current part of the recoding. */
147 /* Parses each specification between slashes. */
148 head = rcd = xmalloc (sizeof *rcd);
152 /* Whether we've already encountered a specification for SYSMIS. */
155 /* Initialize this rcd_var to ensure proper cleanup. */
158 rcd->nmap = rcd->mmap = 0;
162 /* Parse variable names. */
163 if (!parse_variables (default_dict, &v, &nv, PV_SAME_TYPE))
166 /* Ensure all variables are same type; find length of longest
169 max_src_width = v[0]->width;
172 for (i = 0; i < nv; i++)
173 if (v[i]->width > (int) max_src_width)
174 max_src_width = v[i]->width;
179 rcd->flags |= RCD_SRC_NUMERIC;
181 rcd->flags |= RCD_SRC_STRING;
183 /* Parse each coding in parentheses. */
185 if (!lex_force_match ('('))
189 /* Get the input value (before the `='). */
190 int mark = rcd->nmap;
191 int code = parse_src_spec (rcd, type, max_src_width);
195 /* ELSE is the same as any other input spec except that it
196 precludes later sysmis specifications. */
203 /* If keyword CONVERT was specified, there is no output
209 /* Get the output value (after the `='). */
210 lex_get (); /* Skip `='. */
211 if (!parse_dest_spec (rcd, &output, &max_dst_width))
214 /* Set the value for SYSMIS if requested and if we don't
216 if ((rcd->flags & RCD_MISC_MISSING) && !had_sysmis)
219 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
220 rcd->sysmis.f = output.f;
222 rcd->sysmis.c = xstrdup (output.c);
225 rcd->flags &= ~RCD_MISC_MISSING;
228 /* Since there may be multiple input values for a single
229 output, the output value needs to be propagated among all
231 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
232 for (i = mark; i < rcd->nmap; i++)
233 rcd->map[i].t.f = output.f;
236 for (i = mark; i < rcd->nmap; i++)
237 rcd->map[i].t.c = (output.c?xstrdup (output.c):NULL);
241 lex_get (); /* Skip `)'. */
242 if (!lex_match ('('))
246 /* Append sentinel value. */
247 rcd->map[rcd->nmap++].type = RCD_END;
249 /* Since multiple variables may use the same recodings, it is
250 necessary to propagate the codings to all of them. */
253 rcd->dest_name[0] = 0;
255 for (i = 1; i < nv; i++)
257 iter = iter->next = xmalloc (sizeof *iter);
259 iter->flags = rcd->flags | RCD_MISC_DUPLICATE;
262 iter->dest_name[0] = 0;
263 iter->has_sysmis = rcd->has_sysmis;
264 iter->sysmis = rcd->sysmis;
265 iter->map = rcd->map;
268 if (lex_match_id ("INTO"))
275 if (!parse_mixed_vars (&names, &nnames, PV_NONE))
280 for (i = 0; i < nnames; i++)
283 msg (SE, _("%d variable(s) cannot be recoded into "
284 "%d variable(s). Specify the same number "
285 "of variables as input and output variables."),
290 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
291 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
293 struct variable *v = dict_lookup_var (default_dict, names[i]);
297 msg (SE, _("There is no string variable named "
298 "%s. (All string variables specified "
299 "on INTO must already exist. Use the "
300 "STRING command to create a string "
301 "variable.)"), names[i]);
304 if (v->type != ALPHA)
306 msg (SE, _("Type mismatch between input and output "
307 "variables. Output variable %s is not "
308 "a string variable, but all the input "
309 "variables are string variables."), v->name);
312 if (v->width > (int) max_dst_width)
313 max_dst_width = v->width;
317 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
319 struct variable *v = dict_lookup_var (default_dict, names[i]);
323 if (v->type != NUMERIC)
325 msg (SE, _("Type mismatch after INTO: %s "
326 "is not a numeric variable."), v->name);
333 strcpy (iter->dest_name, names[i]);
337 /* Note that regardless of whether we succeed or fail,
338 flow-of-control comes here. `success' is the important
339 factor. Ah, if C had garbage collection... */
341 for (i = 0; i < nnames; i++)
349 if (max_src_width > max_dst_width)
350 max_dst_width = max_src_width;
352 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC
353 && (rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
355 msg (SE, _("INTO must be used when the input values are "
356 "numeric and output values are string."));
360 if ((rcd->flags & RCD_SRC_MASK) != RCD_SRC_NUMERIC
361 && (rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
363 msg (SE, _("INTO must be used when the input values are "
364 "string and output values are numeric."));
369 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
373 for (cp = rcd->map; cp->type != RCD_END; cp++)
376 if (strlen (cp->t.c) < max_dst_width)
378 /* The NUL terminator is only really necessary for the
380 char *repl = xmalloc (max_dst_width + 1);
381 st_pad_copy (repl, cp->t.c, max_dst_width + 1);
386 /* The strings are guaranteed to be in order of
387 nondecreasing length. */
396 if (!lex_match ('/'))
400 rcd = rcd->next = xmalloc (sizeof *rcd);
405 lex_error (_("expecting end of command"));
409 for (rcd = head; rcd; rcd = rcd->next)
410 if (rcd->dest_name[0])
412 rcd->dest = dict_create_var (default_dict, rcd->dest_name, 0);
415 /* FIXME: This can occur if a destname is duplicated.
416 We could give an error at parse time but I don't
418 rcd->dest = dict_lookup_var_assert (default_dict, rcd->dest_name);
422 trns = xmalloc (sizeof *trns);
423 trns->h.proc = recode_trns_proc;
424 trns->h.free = recode_trns_free;
425 trns->codings = head;
426 add_transformation ((struct trns_header *) trns);
433 struct recode_trns t;
436 recode_trns_free ((struct trns_header *) &t);
442 parse_dest_spec (struct rcd_var * rcd, union value * v, size_t *max_dst_width)
448 if (lex_is_number ())
452 flags = RCD_DEST_NUMERIC;
454 else if (lex_match_id ("SYSMIS"))
457 flags = RCD_DEST_NUMERIC;
459 else if (token == T_STRING)
461 size_t max = *max_dst_width;
462 size_t toklen = ds_length (&tokstr);
465 v->c = xmalloc (max + 1);
466 st_pad_copy (v->c, ds_c_str (&tokstr), max + 1);
467 flags = RCD_DEST_STRING;
468 *max_dst_width = max;
471 else if (lex_match_id ("COPY"))
473 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
475 flags = RCD_DEST_NUMERIC;
480 flags = RCD_DEST_STRING;
486 lex_error (_("expecting output value"));
490 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
493 else if (((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC
494 && flags != RCD_DEST_NUMERIC)
495 || ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING
496 && flags != RCD_DEST_STRING))
498 else if ((rcd->flags & RCD_DEST_MASK) ^ flags)
500 msg (SE, _("Inconsistent output types. The output values "
501 "must be all numeric or all string."));
508 /* Reads a set of source specifications and returns one of the
509 following values: 0 on failure; 1 for normal success; 2 for success
510 but with CONVERT as the keyword; 3 for success but with ELSE as the
513 parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width)
519 if (rcd->nmap >= rcd->mmap - 1)
522 rcd->map = xrealloc (rcd->map, rcd->mmap * sizeof *rcd->map);
525 c = &rcd->map[rcd->nmap];
526 c->f1.c = c->f2.c = NULL;
527 if (lex_match_id ("ELSE"))
533 else if (type == NUMERIC)
537 if (lex_match_id ("LO") || lex_match_id ("LOWEST"))
539 if (!lex_force_match_id ("THRU"))
541 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
543 else if (lex_is_number ())
551 lex_error (_("following LO THRU"));
555 else if (lex_match_id ("MISSING"))
558 rcd->flags |= RCD_MISC_MISSING;
560 else if (lex_match_id ("SYSMIS"))
563 rcd->flags |= RCD_MISC_MISSING;
567 lex_error (_("in source value"));
571 else if (lex_is_number ())
575 if (lex_match_id ("THRU"))
577 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
579 else if (lex_is_number ())
592 c->type = RCD_SINGLE;
596 lex_error (_("in source value"));
602 assert (type == ALPHA);
603 if (lex_match_id ("CONVERT"))
605 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
606 rcd->flags |= RCD_DEST_NUMERIC;
607 else if ((rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
609 msg (SE, _("Keyword CONVERT may only be used with "
610 "string input values and numeric output "
615 c->type = RCD_CONVERT;
621 /* Only the debugging code needs the NULs at the ends
622 of the strings. However, changing code behavior more
623 than necessary based on the DEBUGGING `#define' is just
625 c->type = RCD_SINGLE;
626 if (!lex_force_string ())
628 c->f1.c = xmalloc (max_src_width + 1);
629 st_pad_copy (c->f1.c, ds_c_str (&tokstr), max_src_width + 1);
634 if (c->type != RCD_END)
644 /* Data transformation. */
647 recode_trns_free (struct trns_header * t)
650 struct rcd_var *head, *next;
652 head = ((struct recode_trns *) t)->codings;
655 if (head->map && !(head->flags & RCD_MISC_DUPLICATE))
657 if (head->flags & RCD_SRC_STRING)
658 for (i = 0; i < head->nmap; i++)
659 switch (head->map[i].type)
662 free (head->map[i].f2.c);
668 free (head->map[i].f1.c);
677 if (head->flags & RCD_DEST_STRING)
678 for (i = 0; i < head->nmap; i++)
679 if (head->map[i].type != RCD_CONVERT && head->map[i].type != RCD_END)
680 free (head->map[i].t.c);
689 static inline struct coding *
690 find_src_numeric (struct rcd_var * v, struct ccase * c)
692 double cmp = case_num (c, v->src->fv);
697 if (v->sysmis.f != -SYSMIS)
699 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
700 case_data_rw (c, v->dest->fv)->f = v->sysmis.f;
702 memcpy (case_data_rw (c, v->dest->fv)->s, v->sysmis.s,
708 for (cp = v->map;; cp++)
714 if (is_num_user_missing (cmp, v->src))
730 if (cmp >= cp->f1.f && cmp <= cp->f2.f)
740 static inline struct coding *
741 find_src_string (struct rcd_var * v, struct ccase * c)
743 const char *cmp = case_str (c, v->src->fv);
744 int w = v->src->width;
747 for (cp = v->map;; cp++)
753 if (!memcmp (cp->f1.c, cmp, w))
760 double f = convert_to_double (cmp, w);
763 case_data_rw (c, v->dest->fv)->f = f;
774 recode_trns_proc (struct trns_header * t, struct ccase * c,
779 for (v = ((struct recode_trns *) t)->codings; v; v = v->next)
783 switch (v->flags & RCD_SRC_MASK)
785 case RCD_SRC_NUMERIC:
786 cp = find_src_numeric (v, c);
789 cp = find_src_string (v, c);
798 /* A matching input value was found. */
799 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
801 double val = cp->t.f;
802 double *out = &case_data_rw (c, v->dest->fv)->f;
804 *out = case_num (c, v->src->fv);
813 if (v->dest->fv != v->src->fv)
814 st_bare_pad_len_copy (case_data_rw (c, v->dest->fv)->s,
815 case_str (c, v->src->fv),
816 v->dest->width, v->src->width);
819 memcpy (case_data_rw (c, v->dest->fv)->s, cp->t.c, v->dest->width);
826 /* Convert NPTR to a `long int' in base 10. Returns the long int on
827 success, NOT_LONG on failure. On success stores a pointer to the
828 first character after the number into *ENDPTR. From the GNU C
831 string_to_long (const char *nptr, int width, const char **endptr)
834 register unsigned long int cutoff;
835 register unsigned int cutlim;
836 register unsigned long int i;
837 register const char *s;
838 register unsigned char c;
843 /* Check for a sign. */
856 if (s >= nptr + width)
859 /* Save the pointer so we can check later if anything happened. */
862 cutoff = ULONG_MAX / 10ul;
863 cutlim = ULONG_MAX % 10ul;
868 if (isdigit ((unsigned char) c))
872 /* Check for overflow. */
873 if (i > cutoff || (i == cutoff && c > cutlim))
879 if (s >= nptr + width)
884 /* Check if anything actually happened. */
888 /* Check for a value that is within the range of `unsigned long
889 int', but outside the range of `long int'. We limit LONG_MIN and
890 LONG_MAX by one point because we know that NOT_LONG is out there
893 ? -((unsigned long int) LONG_MIN) - 1
894 : ((unsigned long int) LONG_MAX) - 1))
899 /* Return the result of the appropriate sign. */
900 return (negative ? -i : i);
903 /* Converts S to a double according to format Fx.0. Returns the value
904 found, or -SYSMIS if there was no valid number in s. WIDTH is the
905 length of string S. From the GNU C library. */
907 convert_to_double (const char *s, int width)
909 register const char *end = &s[width];
913 /* The number so far. */
916 int got_dot; /* Found a decimal point. */
917 int got_digit; /* Count of digits. */
919 /* The exponent of the number. */
922 /* Eat whitespace. */
923 while (s < end && isspace ((unsigned char) *s))
929 sign = *s == '-' ? -1 : 1;
930 if (*s == '-' || *s == '+')
943 if (isdigit ((unsigned char) *s))
947 /* Make sure that multiplication by 10 will not overflow. */
948 if (num > DBL_MAX * 0.1)
949 /* The value of the digit doesn't matter, since we have already
950 gotten as many digits as can be represented in a `double'.
951 This doesn't necessarily mean the result will overflow.
952 The exponent may reduce it to within range.
954 We just need to record that there was another
955 digit so that we can multiply by 10 later. */
958 num = (num * 10.0) + (*s - '0');
960 /* Keep track of the number of digits after the decimal point.
961 If we just divided by 10 here, we would lose precision. */
965 else if (!got_dot && *s == '.')
966 /* Record that we have found the decimal point. */
975 if (s < end && (tolower ((unsigned char) (*s)) == 'e'
976 || tolower ((unsigned char) (*s)) == 'd'))
978 /* Get the exponent specified after the `e', `E', `d', or `D'. */
985 exp = string_to_long (s, end - s, &s);
986 if (exp == NOT_LONG || end == s)
991 while (s < end && isspace ((unsigned char) *s))
999 /* Multiply NUM by 10 to the EXPONENT power,
1000 checking for overflow and underflow. */
1004 if (-exponent + got_digit > -(DBL_MIN_10_EXP) + 5
1005 || num < DBL_MIN * pow (10.0, (double) -exponent))
1007 num *= pow (10.0, (double) exponent);
1009 else if (exponent > 0)
1011 if (num > DBL_MAX * pow (10.0, (double) -exponent))
1013 num *= pow (10.0, (double) exponent);
1016 return sign > 0 ? num : -num;