1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
36 /* Type of source value for RECODE. */
39 RCD_END, /* sentinel value */
40 RCD_USER, /* user-missing => one */
41 RCD_SINGLE, /* one => one */
42 RCD_HIGH, /* x > a => one */
43 RCD_LOW, /* x < b => one */
44 RCD_RANGE, /* b < x < a => one */
45 RCD_ELSE, /* any but SYSMIS => one */
46 RCD_CONVERT /* "123" => 123 */
49 /* Describes how to recode a single value or range of values into a
54 union value f1, f2; /* Describe value or range as src. Long
55 strings are stored in `c'. */
56 union value t; /* Describes value as dest. Long strings in `c'. */
59 /* Describes how to recode a single variable. */
64 unsigned flags; /* RCD_SRC_* | RCD_DEST_* | RCD_MISC_* */
66 struct variable *src; /* Source variable. */
67 struct variable *dest; /* Destination variable. */
68 char dest_name[9]; /* Name of dest variable if we're creating it. */
70 int has_sysmis; /* Do we recode for SYSMIS? */
71 union value sysmis; /* Coding for SYSMIS (if src is numeric). */
73 struct coding *map; /* Coding for other values. */
74 int nmap, mmap; /* Length of map, max capacity of map. */
77 /* RECODE transformation. */
81 struct rcd_var *codings;
84 /* What we're recoding from (`src'==`source'). */
85 #define RCD_SRC_ERROR 0000u /* Bad value for src. */
86 #define RCD_SRC_NUMERIC 0001u /* Src is numeric. */
87 #define RCD_SRC_STRING 0002u /* Src is short string. */
88 #define RCD_SRC_MASK 0003u /* AND mask to isolate src bits. */
90 /* What we're recoding to (`dest'==`destination'). */
91 #define RCD_DEST_ERROR 0000u /* Bad value for dest. */
92 #define RCD_DEST_NUMERIC 0004u /* Dest is numeric. */
93 #define RCD_DEST_STRING 0010u /* Dest is short string. */
94 #define RCD_DEST_MASK 0014u /* AND mask to isolate dest bits. */
96 /* Miscellaneous bits. */
97 #define RCD_MISC_CREATE 0020u /* We create dest var (numeric only) */
98 #define RCD_MISC_DUPLICATE 0040u /* This var_info has the same MAP
99 value as the previous var_info.
100 Prevents redundant free()ing. */
101 #define RCD_MISC_MISSING 0100u /* Encountered MISSING or SYSMIS in
104 static int parse_dest_spec (struct rcd_var * rcd, union value *v,
105 size_t *max_dst_width);
106 static int parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width);
107 static trns_proc_func recode_trns_proc;
108 static trns_free_func recode_trns_free;
109 static double convert_to_double (const char *, int);
113 /* Parses the RECODE transformation. */
119 /* Transformation that we're constructing. */
122 /* Type of the src variables. */
125 /* Length of longest src string. */
126 size_t max_src_width;
128 /* Length of longest dest string. */
129 size_t max_dst_width;
131 /* For stepping through, constructing the linked list of
133 struct rcd_var *iter;
135 /* The real transformation, just a wrapper for a list of
137 struct recode_trns *trns;
139 /* First transformation in the list. rcd is in this list. */
140 struct rcd_var *head;
142 /* Variables in the current part of the recoding. */
146 /* Parses each specification between slashes. */
147 head = rcd = xmalloc (sizeof *rcd);
151 /* Whether we've already encountered a specification for SYSMIS. */
154 /* Initialize this rcd_var to ensure proper cleanup. */
157 rcd->nmap = rcd->mmap = 0;
161 /* Parse variable names. */
162 if (!parse_variables (default_dict, &v, &nv, PV_SAME_TYPE))
165 /* Ensure all variables are same type; find length of longest
168 max_src_width = v[0]->width;
171 for (i = 0; i < nv; i++)
172 if (v[i]->width > (int) max_src_width)
173 max_src_width = v[i]->width;
178 rcd->flags |= RCD_SRC_NUMERIC;
180 rcd->flags |= RCD_SRC_STRING;
182 /* Parse each coding in parentheses. */
184 if (!lex_force_match ('('))
188 /* Get the input value (before the `='). */
189 int mark = rcd->nmap;
190 int code = parse_src_spec (rcd, type, max_src_width);
194 /* ELSE is the same as any other input spec except that it
195 precludes later sysmis specifications. */
202 /* If keyword CONVERT was specified, there is no output
208 /* Get the output value (after the `='). */
209 lex_get (); /* Skip `='. */
210 if (!parse_dest_spec (rcd, &output, &max_dst_width))
213 /* Set the value for SYSMIS if requested and if we don't
215 if ((rcd->flags & RCD_MISC_MISSING) && !had_sysmis)
218 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
219 rcd->sysmis.f = output.f;
221 rcd->sysmis.c = xstrdup (output.c);
224 rcd->flags &= ~RCD_MISC_MISSING;
227 /* Since there may be multiple input values for a single
228 output, the output value need to propagated among all
230 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
231 for (i = mark; i < rcd->nmap; i++)
232 rcd->map[i].t.f = output.f;
235 for (i = mark; i < rcd->nmap; i++)
236 rcd->map[i].t.c = (output.c?xstrdup (output.c):NULL);
240 lex_get (); /* Skip `)'. */
241 if (!lex_match ('('))
245 /* Append sentinel value. */
246 rcd->map[rcd->nmap++].type = RCD_END;
248 /* Since multiple variables may use the same recodings, it is
249 necessary to propogate the codings to all of them. */
252 rcd->dest_name[0] = 0;
254 for (i = 1; i < nv; i++)
256 iter = iter->next = xmalloc (sizeof *iter);
258 iter->flags = rcd->flags | RCD_MISC_DUPLICATE;
261 iter->dest_name[0] = 0;
262 iter->has_sysmis = rcd->has_sysmis;
263 iter->sysmis = rcd->sysmis;
264 iter->map = rcd->map;
267 if (lex_match_id ("INTO"))
274 if (!parse_mixed_vars (&names, &nnames, PV_NONE))
279 for (i = 0; i < nnames; i++)
282 msg (SE, _("%d variable(s) cannot be recoded into "
283 "%d variable(s). Specify the same number "
284 "of variables as input and output variables."),
289 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
290 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
292 struct variable *v = dict_lookup_var (default_dict, names[i]);
296 msg (SE, _("There is no string variable named "
297 "%s. (All string variables specified "
298 "on INTO must already exist. Use the "
299 "STRING command to create a string "
300 "variable.)"), names[i]);
303 if (v->type != ALPHA)
305 msg (SE, _("Type mismatch between input and output "
306 "variables. Output variable %s is not "
307 "a string variable, but all the input "
308 "variables are string variables."), v->name);
311 if (v->width > (int) max_dst_width)
312 max_dst_width = v->width;
316 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
318 struct variable *v = dict_lookup_var (default_dict, names[i]);
322 if (v->type != NUMERIC)
324 msg (SE, _("Type mismatch after INTO: %s "
325 "is not a numeric variable."), v->name);
332 strcpy (iter->dest_name, names[i]);
336 /* Note that regardless of whether we succeed or fail,
337 flow-of-control comes here. `success' is the important
338 factor. Ah, if C had garbage collection... */
340 for (i = 0; i < nnames; i++)
348 if (max_src_width > max_dst_width)
349 max_dst_width = max_src_width;
351 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC
352 && (rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
354 msg (SE, _("INTO must be used when the input values are "
355 "numeric and output values are string."));
359 if ((rcd->flags & RCD_SRC_MASK) != RCD_SRC_NUMERIC
360 && (rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
362 msg (SE, _("INTO must be used when the input values are "
363 "string and output values are numeric."));
368 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
372 for (cp = rcd->map; cp->type != RCD_END; cp++)
375 if (strlen (cp->t.c) < max_dst_width)
377 /* The NULL is only really necessary for the
379 char *repl = xmalloc (max_dst_width + 1);
380 st_pad_copy (repl, cp->t.c, max_dst_width + 1);
385 /* The strings are guaranteed to be in order of
386 nondecreasing length. */
395 if (!lex_match ('/'))
399 rcd = rcd->next = xmalloc (sizeof *rcd);
404 lex_error (_("expecting end of command"));
408 for (rcd = head; rcd; rcd = rcd->next)
409 if (rcd->dest_name[0])
411 rcd->dest = dict_create_var (default_dict, rcd->dest_name, 0);
414 /* FIXME: This can occur if a destname is duplicated.
415 We could give an error at parse time but I don't
417 rcd->dest = dict_lookup_var_assert (default_dict, rcd->dest_name);
421 trns = xmalloc (sizeof *trns);
422 trns->h.proc = recode_trns_proc;
423 trns->h.free = recode_trns_free;
424 trns->codings = head;
425 add_transformation ((struct trns_header *) trns);
432 struct recode_trns t;
435 recode_trns_free ((struct trns_header *) &t);
441 parse_dest_spec (struct rcd_var * rcd, union value * v, size_t *max_dst_width)
451 flags = RCD_DEST_NUMERIC;
453 else if (lex_match_id ("SYSMIS"))
456 flags = RCD_DEST_NUMERIC;
458 else if (token == T_STRING)
460 size_t max = *max_dst_width;
461 size_t toklen = ds_length (&tokstr);
464 v->c = xmalloc (max + 1);
465 st_pad_copy (v->c, ds_c_str (&tokstr), max + 1);
466 flags = RCD_DEST_STRING;
467 *max_dst_width = max;
470 else if (lex_match_id ("COPY"))
472 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
474 flags = RCD_DEST_NUMERIC;
479 flags = RCD_DEST_STRING;
485 lex_error (_("expecting output value"));
489 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
492 else if (((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC
493 && flags != RCD_DEST_NUMERIC)
494 || ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING
495 && flags != RCD_DEST_STRING))
497 else if ((rcd->flags & RCD_DEST_MASK) ^ flags)
499 msg (SE, _("Inconsistent output types. The output values "
500 "must be all numeric or all string."));
507 /* Reads a set of source specifications and returns one of the
508 following values: 0 on failure; 1 for normal success; 2 for success
509 but with CONVERT as the keyword; 3 for success but with ELSE as the
512 parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width)
518 if (rcd->nmap >= rcd->mmap - 1)
521 rcd->map = xrealloc (rcd->map, rcd->mmap * sizeof *rcd->map);
524 c = &rcd->map[rcd->nmap];
525 c->f1.c = c->f2.c = NULL;
526 if (lex_match_id ("ELSE"))
532 else if (type == NUMERIC)
536 if (lex_match_id ("LO") || lex_match_id ("LOWEST"))
538 if (!lex_force_match_id ("THRU"))
540 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
542 else if (token == T_NUM)
550 lex_error (_("following LO THRU"));
554 else if (lex_match_id ("MISSING"))
557 rcd->flags |= RCD_MISC_MISSING;
559 else if (lex_match_id ("SYSMIS"))
562 rcd->flags |= RCD_MISC_MISSING;
566 lex_error (_("in source value"));
570 else if (token == T_NUM)
574 if (lex_match_id ("THRU"))
576 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
578 else if (token == T_NUM)
591 c->type = RCD_SINGLE;
595 lex_error (_("in source value"));
601 assert (type == ALPHA);
602 if (lex_match_id ("CONVERT"))
604 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
605 rcd->flags |= RCD_DEST_NUMERIC;
606 else if ((rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
608 msg (SE, _("Keyword CONVERT may only be used with "
609 "string input values and numeric output "
614 c->type = RCD_CONVERT;
620 /* Only the debugging code needs the NULLs at the ends
621 of the strings. However, changing code behavior more
622 than necessary based on the DEBUGGING `#define' is just
624 c->type = RCD_SINGLE;
625 if (!lex_force_string ())
627 c->f1.c = xmalloc (max_src_width + 1);
628 st_pad_copy (c->f1.c, ds_c_str (&tokstr), max_src_width + 1);
633 if (c->type != RCD_END)
643 /* Data transformation. */
646 recode_trns_free (struct trns_header * t)
649 struct rcd_var *head, *next;
651 head = ((struct recode_trns *) t)->codings;
654 if (head->map && !(head->flags & RCD_MISC_DUPLICATE))
656 if (head->flags & RCD_SRC_STRING)
657 for (i = 0; i < head->nmap; i++)
658 switch (head->map[i].type)
661 free (head->map[i].f2.c);
667 free (head->map[i].f1.c);
676 if (head->flags & RCD_DEST_STRING)
677 for (i = 0; i < head->nmap; i++)
678 if (head->map[i].type != RCD_CONVERT && head->map[i].type != RCD_END)
679 free (head->map[i].t.c);
688 static inline struct coding *
689 find_src_numeric (struct rcd_var * v, struct ccase * c)
691 double cmp = case_num (c, v->src->fv);
696 if (v->sysmis.f != -SYSMIS)
698 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
699 case_data_rw (c, v->dest->fv)->f = v->sysmis.f;
701 memcpy (case_data_rw (c, v->dest->fv)->s, v->sysmis.s,
707 for (cp = v->map;; cp++)
713 if (is_num_user_missing (cmp, v->src))
729 if (cmp >= cp->f1.f && cmp <= cp->f2.f)
739 static inline struct coding *
740 find_src_string (struct rcd_var * v, struct ccase * c)
742 const char *cmp = case_str (c, v->src->fv);
743 int w = v->src->width;
746 for (cp = v->map;; cp++)
752 if (!memcmp (cp->f1.c, cmp, w))
759 double f = convert_to_double (cmp, w);
762 case_data_rw (c, v->dest->fv)->f = f;
773 recode_trns_proc (struct trns_header * t, struct ccase * c,
778 for (v = ((struct recode_trns *) t)->codings; v; v = v->next)
782 switch (v->flags & RCD_SRC_MASK)
784 case RCD_SRC_NUMERIC:
785 cp = find_src_numeric (v, c);
788 cp = find_src_string (v, c);
797 /* A matching input value was found. */
798 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
800 double val = cp->t.f;
801 double *out = &case_data_rw (c, v->dest->fv)->f;
803 *out = case_num (c, v->src->fv);
812 if (v->dest->fv != v->src->fv)
813 st_bare_pad_len_copy (case_data_rw (c, v->dest->fv)->s,
814 case_str (c, v->src->fv),
815 v->dest->width, v->src->width);
818 memcpy (case_data_rw (c, v->dest->fv)->s, cp->t.c, v->dest->width);
825 /* Convert NPTR to a `long int' in base 10. Returns the long int on
826 success, NOT_LONG on failure. On success stores a pointer to the
827 first character after the number into *ENDPTR. From the GNU C
830 string_to_long (const char *nptr, int width, const char **endptr)
833 register unsigned long int cutoff;
834 register unsigned int cutlim;
835 register unsigned long int i;
836 register const char *s;
837 register unsigned char c;
842 /* Check for a sign. */
855 if (s >= nptr + width)
858 /* Save the pointer so we can check later if anything happened. */
861 cutoff = ULONG_MAX / 10ul;
862 cutlim = ULONG_MAX % 10ul;
867 if (isdigit ((unsigned char) c))
871 /* Check for overflow. */
872 if (i > cutoff || (i == cutoff && c > cutlim))
878 if (s >= nptr + width)
883 /* Check if anything actually happened. */
887 /* Check for a value that is within the range of `unsigned long
888 int', but outside the range of `long int'. We limit LONG_MIN and
889 LONG_MAX by one point because we know that NOT_LONG is out there
892 ? -((unsigned long int) LONG_MIN) - 1
893 : ((unsigned long int) LONG_MAX) - 1))
898 /* Return the result of the appropriate sign. */
899 return (negative ? -i : i);
902 /* Converts S to a double according to format Fx.0. Returns the value
903 found, or -SYSMIS if there was no valid number in s. WIDTH is the
904 length of string S. From the GNU C library. */
906 convert_to_double (const char *s, int width)
908 register const char *end = &s[width];
912 /* The number so far. */
915 int got_dot; /* Found a decimal point. */
916 int got_digit; /* Count of digits. */
918 /* The exponent of the number. */
921 /* Eat whitespace. */
922 while (s < end && isspace ((unsigned char) *s))
928 sign = *s == '-' ? -1 : 1;
929 if (*s == '-' || *s == '+')
942 if (isdigit ((unsigned char) *s))
946 /* Make sure that multiplication by 10 will not overflow. */
947 if (num > DBL_MAX * 0.1)
948 /* The value of the digit doesn't matter, since we have already
949 gotten as many digits as can be represented in a `double'.
950 This doesn't necessarily mean the result will overflow.
951 The exponent may reduce it to within range.
953 We just need to record that there was another
954 digit so that we can multiply by 10 later. */
957 num = (num * 10.0) + (*s - '0');
959 /* Keep track of the number of digits after the decimal point.
960 If we just divided by 10 here, we would lose precision. */
964 else if (!got_dot && *s == '.')
965 /* Record that we have found the decimal point. */
974 if (s < end && (tolower ((unsigned char) (*s)) == 'e'
975 || tolower ((unsigned char) (*s)) == 'd'))
977 /* Get the exponent specified after the `e' or `E'. */
984 exp = string_to_long (s, end - s, &s);
985 if (exp == NOT_LONG || end == s)
990 while (s < end && isspace ((unsigned char) *s))
998 /* Multiply NUM by 10 to the EXPONENT power,
999 checking for overflow and underflow. */
1003 if (-exponent + got_digit > -(DBL_MIN_10_EXP) + 5
1004 || num < DBL_MIN * pow (10.0, (double) -exponent))
1006 num *= pow (10.0, (double) exponent);
1008 else if (exponent > 0)
1010 if (num > DBL_MAX * pow (10.0, (double) -exponent))
1012 num *= pow (10.0, (double) exponent);
1015 return sign > 0 ? num : -num;