1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
33 #include "debug-print.h"
39 RCD_END, /* sentinel value */
40 RCD_USER, /* user-missing => one */
41 RCD_SINGLE, /* one => one */
42 RCD_HIGH, /* x > a => one */
43 RCD_LOW, /* x < b => one */
44 RCD_RANGE, /* b < x < a => one */
45 RCD_ELSE, /* any but SYSMIS => one */
46 RCD_CONVERT /* "123" => 123 */
49 /* Describes how to recode a single value or range of values into a
54 union value f1, f2; /* Describe value or range as src. Long
55 strings are stored in `c'. */
56 union value t; /* Describes value as dest. Long strings in `c'. */
59 /* Describes how to recode a single variable. */
64 unsigned flags; /* RCD_SRC_* | RCD_DEST_* | RCD_MISC_* */
66 struct variable *src; /* Source variable. */
67 struct variable *dest; /* Destination variable. */
68 char dest_name[9]; /* Name of dest variable if we're creating it. */
70 int has_sysmis; /* Do we recode for SYSMIS? */
71 union value sysmis; /* Coding for SYSMIS (if src is numeric). */
73 struct coding *map; /* Coding for other values. */
74 int nmap, mmap; /* Length of map, max capacity of map. */
77 /* RECODE transformation. */
81 struct rcd_var *codings;
84 /* What we're recoding from (`src'==`source'). */
85 #define RCD_SRC_ERROR 0000u /* Bad value for src. */
86 #define RCD_SRC_NUMERIC 0001u /* Src is numeric. */
87 #define RCD_SRC_STRING 0002u /* Src is short string. */
88 #define RCD_SRC_MASK 0003u /* AND mask to isolate src bits. */
90 /* What we're recoding to (`dest'==`destination'). */
91 #define RCD_DEST_ERROR 0000u /* Bad value for dest. */
92 #define RCD_DEST_NUMERIC 0004u /* Dest is numeric. */
93 #define RCD_DEST_STRING 0010u /* Dest is short string. */
94 #define RCD_DEST_MASK 0014u /* AND mask to isolate dest bits. */
96 /* Miscellaneous bits. */
97 #define RCD_MISC_CREATE 0020u /* We create dest var (numeric only) */
98 #define RCD_MISC_DUPLICATE 0040u /* This var_info has the same MAP
99 value as the previous var_info.
100 Prevents redundant free()ing. */
101 #define RCD_MISC_MISSING 0100u /* Encountered MISSING or SYSMIS in
104 static int parse_dest_spec (struct rcd_var * rcd, union value *v,
105 size_t *max_dst_width);
106 static int parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width);
107 static trns_proc_func recode_trns_proc;
108 static trns_free_func recode_trns_free;
109 static double convert_to_double (char *, int);
112 static void debug_print (struct rcd_var * head);
117 /* First transformation in the list. rcd is in this list. */
118 static struct rcd_var *head;
120 /* Variables in the current part of the recoding. */
124 /* Parses the RECODE transformation. */
130 /* Transformation that we're constructing. */
133 /* Type of the src variables. */
136 /* Length of longest src string. */
137 size_t max_src_width;
139 /* Length of longest dest string. */
140 size_t max_dst_width;
142 /* For stepping through, constructing the linked list of
144 struct rcd_var *iter;
146 /* The real transformation, just a wrapper for a list of
148 struct recode_trns *trns;
150 lex_match_id ("RECODE");
152 /* Parses each specification between slashes. */
153 head = rcd = xmalloc (sizeof *rcd);
156 /* Whether we've already encountered a specification for SYSMIS. */
159 /* Initialize this rcd_var to ensure proper cleanup. */
162 rcd->nmap = rcd->mmap = 0;
166 /* Parse variable names. */
167 if (!parse_variables (default_dict, &v, &nv, PV_SAME_TYPE))
170 /* Ensure all variables are same type; find length of longest
173 max_src_width = v[0]->width;
176 for (i = 0; i < nv; i++)
177 if (v[i]->width > (int) max_src_width)
178 max_src_width = v[i]->width;
183 rcd->flags |= RCD_SRC_NUMERIC;
185 rcd->flags |= RCD_SRC_STRING;
187 /* Parse each coding in parentheses. */
189 if (!lex_force_match ('('))
193 /* Get the input value (before the `='). */
194 int mark = rcd->nmap;
195 int code = parse_src_spec (rcd, type, max_src_width);
199 /* ELSE is the same as any other input spec except that it
200 precludes later sysmis specifications. */
207 /* If keyword CONVERT was specified, there is no output
213 /* Get the output value (after the `='). */
214 lex_get (); /* Skip `='. */
215 if (!parse_dest_spec (rcd, &output, &max_dst_width))
218 /* Set the value for SYSMIS if requested and if we don't
220 if ((rcd->flags & RCD_MISC_MISSING) && !had_sysmis)
223 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
224 rcd->sysmis.f = output.f;
226 rcd->sysmis.c = xstrdup (output.c);
229 rcd->flags &= ~RCD_MISC_MISSING;
232 /* Since there may be multiple input values for a single
233 output, the output value need to propagated among all
235 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
236 for (i = mark; i < rcd->nmap; i++)
237 rcd->map[i].t.f = output.f;
240 for (i = mark; i < rcd->nmap; i++)
241 rcd->map[i].t.c = xstrdup (output.c);
245 lex_get (); /* Skip `)'. */
246 if (!lex_match ('('))
250 /* Append sentinel value. */
251 rcd->map[rcd->nmap++].type = RCD_END;
253 /* Since multiple variables may use the same recodings, it is
254 necessary to propogate the codings to all of them. */
257 rcd->dest_name[0] = 0;
259 for (i = 1; i < nv; i++)
261 iter = iter->next = xmalloc (sizeof *iter);
263 iter->flags = rcd->flags | RCD_MISC_DUPLICATE;
266 iter->dest_name[0] = 0;
267 iter->has_sysmis = rcd->has_sysmis;
268 iter->sysmis = rcd->sysmis;
269 iter->map = rcd->map;
272 if (lex_match_id ("INTO"))
279 if (!parse_mixed_vars (&names, &nnames, PV_NONE))
284 for (i = 0; i < nnames; i++)
287 msg (SE, _("%d variable(s) cannot be recoded into "
288 "%d variable(s). Specify the same number "
289 "of variables as input and output variables."),
294 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
295 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
297 struct variable *v = dict_lookup_var (default_dict, names[i]);
301 msg (SE, _("There is no string variable named "
302 "%s. (All string variables specified "
303 "on INTO must already exist. Use the "
304 "STRING command to create a string "
305 "variable.)"), names[i]);
308 if (v->type != ALPHA)
310 msg (SE, _("Type mismatch between input and output "
311 "variables. Output variable %s is not "
312 "a string variable, but all the input "
313 "variables are string variables."), v->name);
316 if (v->width > (int) max_dst_width)
317 max_dst_width = v->width;
321 for (i = 0, iter = rcd; i < nv; i++, iter = iter->next)
323 struct variable *v = dict_lookup_var (default_dict, names[i]);
327 if (v->type != NUMERIC)
329 msg (SE, _("Type mismatch after INTO: %s "
330 "is not a numeric variable."), v->name);
337 strcpy (iter->dest_name, names[i]);
341 /* Note that regardless of whether we succeed or fail,
342 flow-of-control comes here. `success' is the important
343 factor. Ah, if C had garbage collection... */
345 for (i = 0; i < nnames; i++)
353 if (max_src_width > max_dst_width)
354 max_dst_width = max_src_width;
356 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC
357 && (rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
359 msg (SE, _("INTO must be used when the input values are "
360 "numeric and output values are string."));
364 if ((rcd->flags & RCD_SRC_MASK) != RCD_SRC_NUMERIC
365 && (rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
367 msg (SE, _("INTO must be used when the input values are "
368 "string and output values are numeric."));
373 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING)
377 for (cp = rcd->map; cp->type != RCD_END; cp++)
380 if (strlen (cp->t.c) < max_dst_width)
382 /* The NULL is only really necessary for the
384 char *repl = xmalloc (max_dst_width + 1);
385 st_pad_copy (repl, cp->t.c, max_dst_width + 1);
390 /* The strings are guaranteed to be in order of
391 nondecreasing length. */
397 if (!lex_match ('/'))
401 rcd = rcd->next = xmalloc (sizeof *rcd);
408 lex_error (_("expecting end of command"));
412 for (rcd = head; rcd; rcd = rcd->next)
413 if (rcd->dest_name[0])
415 rcd->dest = dict_create_var (default_dict, rcd->dest_name, 0);
418 /* FIXME: This can occur if a destname is duplicated.
419 We could give an error at parse time but I don't
421 rcd->dest = dict_lookup_var_assert (default_dict, rcd->dest_name);
425 trns = xmalloc (sizeof *trns);
426 trns->h.proc = recode_trns_proc;
427 trns->h.free = recode_trns_free;
428 trns->codings = head;
429 add_transformation ((struct trns_header *) trns);
439 struct recode_trns t;
442 recode_trns_free ((struct trns_header *) &t);
448 parse_dest_spec (struct rcd_var * rcd, union value * v, size_t *max_dst_width)
458 flags = RCD_DEST_NUMERIC;
460 else if (lex_match_id ("SYSMIS"))
463 flags = RCD_DEST_NUMERIC;
465 else if (token == T_STRING)
467 size_t max = *max_dst_width;
468 size_t toklen = ds_length (&tokstr);
471 v->c = xmalloc (max + 1);
472 st_pad_copy (v->c, ds_value (&tokstr), max + 1);
473 flags = RCD_DEST_STRING;
474 *max_dst_width = max;
477 else if (lex_match_id ("COPY"))
479 if ((rcd->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
481 flags = RCD_DEST_NUMERIC;
486 flags = RCD_DEST_STRING;
492 lex_error (_("expecting output value"));
496 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
499 else if (((rcd->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC
500 && flags != RCD_DEST_NUMERIC)
501 || ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_STRING
502 && flags != RCD_DEST_STRING))
504 else if ((rcd->flags & RCD_DEST_MASK) ^ flags)
506 msg (SE, _("Inconsistent output types. The output values "
507 "must be all numeric or all string."));
514 /* Reads a set of source specifications and returns one of the
515 following values: 0 on failure; 1 for normal success; 2 for success
516 but with CONVERT as the keyword; 3 for success but with ELSE as the
519 parse_src_spec (struct rcd_var * rcd, int type, size_t max_src_width)
525 if (rcd->nmap >= rcd->mmap - 1)
528 rcd->map = xrealloc (rcd->map, rcd->mmap * sizeof *rcd->map);
531 c = &rcd->map[rcd->nmap];
532 c->f1.c = c->f2.c = NULL;
533 if (lex_match_id ("ELSE"))
539 else if (type == NUMERIC)
543 if (lex_match_id ("LO") || lex_match_id ("LOWEST"))
545 if (!lex_force_match_id ("THRU"))
547 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
549 else if (token == T_NUM)
557 lex_error (_("following LO THRU"));
561 else if (lex_match_id ("MISSING"))
564 rcd->flags |= RCD_MISC_MISSING;
566 else if (lex_match_id ("SYSMIS"))
569 rcd->flags |= RCD_MISC_MISSING;
573 lex_error (_("in source value"));
577 else if (token == T_NUM)
581 if (lex_match_id ("THRU"))
583 if (lex_match_id ("HI") || lex_match_id ("HIGHEST"))
585 else if (token == T_NUM)
598 c->type = RCD_SINGLE;
602 lex_error (_("in source value"));
608 assert (type == ALPHA);
609 if (lex_match_id ("CONVERT"))
611 if ((rcd->flags & RCD_DEST_MASK) == RCD_DEST_ERROR)
612 rcd->flags |= RCD_DEST_NUMERIC;
613 else if ((rcd->flags & RCD_DEST_MASK) != RCD_DEST_NUMERIC)
615 msg (SE, _("Keyword CONVERT may only be used with "
616 "string input values and numeric output "
621 c->type = RCD_CONVERT;
627 /* Only the debugging code needs the NULLs at the ends
628 of the strings. However, changing code behavior more
629 than necessary based on the DEBUGGING `#define' is just
631 c->type = RCD_SINGLE;
632 if (!lex_force_string ())
634 c->f1.c = xmalloc (max_src_width + 1);
635 st_pad_copy (c->f1.c, ds_value (&tokstr), max_src_width + 1);
640 if (c->type != RCD_END)
650 /* Data transformation. */
653 recode_trns_free (struct trns_header * t)
656 struct rcd_var *head, *next;
658 head = ((struct recode_trns *) t)->codings;
661 if (head->map && !(head->flags & RCD_MISC_DUPLICATE))
663 if (head->flags & RCD_SRC_STRING)
664 for (i = 0; i < head->nmap; i++)
665 switch (head->map[i].type)
668 free (head->map[i].f2.c);
674 free (head->map[i].f1.c);
683 if (head->flags & RCD_DEST_STRING)
684 for (i = 0; i < head->nmap; i++)
685 if (head->map[i].type != RCD_CONVERT && head->map[i].type != RCD_END)
686 free (head->map[i].t.c);
695 static inline struct coding *
696 find_src_numeric (struct rcd_var * v, struct ccase * c)
698 double cmp = c->data[v->src->fv].f;
703 if (v->sysmis.f != -SYSMIS)
705 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
706 c->data[v->dest->fv].f = v->sysmis.f;
708 memcpy (c->data[v->dest->fv].s, v->sysmis.c,
714 for (cp = v->map;; cp++)
720 if (is_num_user_missing (cmp, v->src))
736 if (cmp >= cp->f1.f && cmp <= cp->f2.f)
746 static inline struct coding *
747 find_src_string (struct rcd_var * v, struct ccase * c)
749 char *cmp = c->data[v->src->fv].s;
750 int w = v->src->width;
753 for (cp = v->map;; cp++)
759 if (!memcmp (cp->f1.c, cmp, w))
766 double f = convert_to_double (cmp, w);
769 c->data[v->dest->fv].f = f;
780 recode_trns_proc (struct trns_header * t, struct ccase * c,
785 for (v = ((struct recode_trns *) t)->codings; v; v = v->next)
789 switch (v->flags & RCD_SRC_MASK)
791 case RCD_SRC_NUMERIC:
792 cp = find_src_numeric (v, c);
795 cp = find_src_string (v, c);
803 /* A matching input value was found. */
804 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
806 double val = cp->t.f;
808 c->data[v->dest->fv].f = c->data[v->src->fv].f;
810 c->data[v->dest->fv].f = val;
816 st_bare_pad_len_copy (c->data[v->dest->fv].s,
817 c->data[v->src->fv].c,
818 v->dest->width, v->src->width);
820 memcpy (c->data[v->dest->fv].s, cp->t.c, v->dest->width);
831 dump_dest (struct rcd_var * v, union value * c)
833 if ((v->flags & RCD_DEST_MASK) == RCD_DEST_NUMERIC)
836 else if (c->f == -SYSMIS)
839 printf ("=%g", c->f);
841 printf ("=\"%s\"", c->c);
847 debug_print (struct rcd_var * head)
849 struct rcd_var *iter, *start;
853 for (iter = head; iter; iter = iter->next)
856 printf (" %s%s", iter == head ? "" : "/", iter->src->name);
857 while (iter->next && (iter->next->flags & RCD_MISC_DUPLICATE))
860 printf (" %s", iter->src->name);
862 if (iter->has_sysmis)
865 dump_dest (iter, &iter->sysmis);
868 for (c = iter->map; c->type != RCD_END; c++)
871 if ((iter->flags & RCD_SRC_MASK) == RCD_SRC_NUMERIC)
875 printf (_("!!END!!"));
881 printf ("%g", c->f1.f);
884 printf ("%g THRU HIGH", c->f1.f);
887 printf ("LOW THRU %g", c->f1.f);
890 printf ("%g THRU %g", c->f1.f, c->f2.f);
896 printf (_("!!ERROR!!"));
903 printf ("\"%s\"", c->f1.c);
912 printf (_("!!ERROR!!"));
915 if (c->type != RCD_CONVERT)
916 dump_dest (iter, &c->t);
923 start->dest_name[0] ? start->dest_name : start->dest->name);
933 /* Convert NPTR to a `long int' in base 10. Returns the long int on
934 success, NOT_LONG on failure. On success stores a pointer to the
935 first character after the number into *ENDPTR. From the GNU C
938 string_to_long (char *nptr, int width, char **endptr)
941 register unsigned long int cutoff;
942 register unsigned int cutlim;
943 register unsigned long int i;
945 register unsigned char c;
950 /* Check for a sign. */
963 if (s >= nptr + width)
966 /* Save the pointer so we can check later if anything happened. */
969 cutoff = ULONG_MAX / 10ul;
970 cutlim = ULONG_MAX % 10ul;
975 if (isdigit ((unsigned char) c))
979 /* Check for overflow. */
980 if (i > cutoff || (i == cutoff && c > cutlim))
986 if (s >= nptr + width)
991 /* Check if anything actually happened. */
995 /* Check for a value that is within the range of `unsigned long
996 int', but outside the range of `long int'. We limit LONG_MIN and
997 LONG_MAX by one point because we know that NOT_LONG is out there
1000 ? -((unsigned long int) LONG_MIN) - 1
1001 : ((unsigned long int) LONG_MAX) - 1))
1006 /* Return the result of the appropriate sign. */
1007 return (negative ? -i : i);
1010 /* Converts S to a double according to format Fx.0. Returns the value
1011 found, or -SYSMIS if there was no valid number in s. WIDTH is the
1012 length of string S. From the GNU C library. */
1014 convert_to_double (char *s, int width)
1016 register const char *end = &s[width];
1020 /* The number so far. */
1023 int got_dot; /* Found a decimal point. */
1024 int got_digit; /* Count of digits. */
1026 /* The exponent of the number. */
1029 /* Eat whitespace. */
1030 while (s < end && isspace ((unsigned char) *s))
1036 sign = *s == '-' ? -1 : 1;
1037 if (*s == '-' || *s == '+')
1048 for (; s < end; ++s)
1050 if (isdigit ((unsigned char) *s))
1054 /* Make sure that multiplication by 10 will not overflow. */
1055 if (num > DBL_MAX * 0.1)
1056 /* The value of the digit doesn't matter, since we have already
1057 gotten as many digits as can be represented in a `double'.
1058 This doesn't necessarily mean the result will overflow.
1059 The exponent may reduce it to within range.
1061 We just need to record that there was another
1062 digit so that we can multiply by 10 later. */
1065 num = (num * 10.0) + (*s - '0');
1067 /* Keep track of the number of digits after the decimal point.
1068 If we just divided by 10 here, we would lose precision. */
1072 else if (!got_dot && *s == '.')
1073 /* Record that we have found the decimal point. */
1082 if (s < end && (tolower ((unsigned char) (*s)) == 'e'
1083 || tolower ((unsigned char) (*s)) == 'd'))
1085 /* Get the exponent specified after the `e' or `E'. */
1092 exp = string_to_long (s, end - s, &s);
1093 if (exp == NOT_LONG || end == s)
1098 while (s < end && isspace ((unsigned char) *s))
1106 /* Multiply NUM by 10 to the EXPONENT power,
1107 checking for overflow and underflow. */
1111 if (-exponent + got_digit > -(DBL_MIN_10_EXP) + 5
1112 || num < DBL_MIN * pow (10.0, (double) -exponent))
1114 num *= pow (10.0, (double) exponent);
1116 else if (exponent > 0)
1118 if (num > DBL_MAX * pow (10.0, (double) -exponent))
1120 num *= pow (10.0, (double) exponent);
1123 return sign > 0 ? num : -num;