1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
32 #include "julcal/julcal.h"
39 Expression "optimizer"
41 Operates on the tree representation of expressions.
42 optimize_expression() performs the optimizations listed below:
45 Any operation with constant operands is replaced by its value.
46 (Exception: random-number-generator functions.)
48 2. Strength reduction (x is any expression; a is a numeric constant)
52 x**1, x+0, x-0, x*1 => x
54 x/a => x*(1/a) (where 1/a is evaluated at optimization time)
56 I thought about adding additional optimizations but decided that what
57 is here could already be considered overkill.
60 static struct nonterm_node *evaluate_tree (struct nonterm_node * n);
61 static struct nonterm_node *optimize_tree (struct nonterm_node * n);
64 optimize_expression (struct nonterm_node * n)
68 /* Set to 1 if a child is nonconstant. */
71 /* Number of system-missing children. */
74 /* We can't optimize a terminal node. */
75 if (n->type > OP_TERMINAL)
78 /* Start by optimizing all the children. */
79 for (i = 0; i < n->n; i++)
81 n->arg[i] = ((union any_node *)
82 optimize_expression ((struct nonterm_node *) n->arg[i]));
83 if (n->arg[i]->type == OP_NUM_CON)
85 if (n->arg[i]->num_con.value == SYSMIS)
88 else if (n->arg[i]->type != OP_STR_CON)
92 if (sysmis && !(ops[n->type].flags & OP_ABSORB_MISS))
93 /* Just about any operation produces SYSMIS when given any SYSMIS
96 struct num_con_node *num = xmalloc (sizeof *num);
97 free_node ((union any_node *) n);
98 num->type = OP_NUM_CON;
100 n = (struct nonterm_node *) num;
103 /* If all the children of this node are constants, then there are
104 obvious optimizations. */
105 n = evaluate_tree (n);
107 /* Otherwise, we may be able to make certain optimizations
109 n = optimize_tree (n);
113 static struct nonterm_node *repl_num_con (struct nonterm_node *, double);
114 static struct nonterm_node *force_repl_num_con (struct nonterm_node *, double);
115 static struct nonterm_node *repl_str_con (struct nonterm_node *, char *, int);
117 #define n0 n->arg[0]->num_con.value
118 #define n1 n->arg[1]->num_con.value
119 #define n2 n->arg[2]->num_con.value
121 #define s0 n->arg[0]->str_con.s
122 #define s0l n->arg[0]->str_con.len
123 #define s1 n->arg[1]->str_con.s
124 #define s1l n->arg[1]->str_con.len
125 #define s2 n->arg[2]->str_con.s
126 #define s2l n->arg[2]->str_con.len
127 #define s(X) n->arg[X]->str_con.s
128 #define sl(X) n->arg[X]->str_con.len
130 static struct nonterm_node *
131 optimize_tree (struct nonterm_node * n)
136 if (n->type == OP_PLUS || n->type == OP_MUL)
138 /* Default constant value. */
139 double def = n->type == OP_MUL ? 1.0 : 0.0;
141 /* Total value of all the constants. */
144 /* Number of nonconst arguments. */
148 struct nonterm_node *m;
150 /* Argument copying counter. */
153 /* 1=SYSMIS encountered */
156 for (i = 0; i < n->n; i++)
157 if (n->arg[i]->type == OP_NUM_CON)
159 if (n->arg[i]->num_con.value != SYSMIS)
161 if (n->type == OP_MUL)
162 cval *= n->arg[i]->num_con.value;
164 cval += n->arg[i]->num_con.value;
172 /* 0*SYSMIS=0, 0/SYSMIS=0; otherwise, SYSMIS and infinities
174 if (approx_eq (cval, 0.0) && n->type == OP_MUL)
176 else if (sysmis || !finite (cval))
182 /* If no nonconstant terms, replace with a constant node. */
184 return force_repl_num_con (n, cval);
186 if (nvar == 1 && cval == def)
188 /* If there is exactly one nonconstant term and no constant
189 terms, replace with the nonconstant term. */
190 for (i = 0; i < n->n; i++)
191 if (n->arg[i]->type != OP_NUM_CON)
192 m = (struct nonterm_node *) n->arg[i];
194 free_node (n->arg[i]);
198 /* Otherwise consolidate all the nonconstant terms. */
199 m = xmalloc (sizeof (struct nonterm_node)
200 + ((nvar + approx_ne (cval, def) - 1)
201 * sizeof (union any_node *)));
202 for (i = c = 0; i < n->n; i++)
203 if (n->arg[i]->type != OP_NUM_CON)
204 m->arg[c++] = n->arg[i];
206 free_node (n->arg[i]);
208 if (approx_ne (cval, def))
210 m->arg[c] = xmalloc (sizeof (struct num_con_node));
211 m->arg[c]->num_con.type = OP_NUM_CON;
212 m->arg[c]->num_con.value = cval;
222 else if (n->type == OP_POW)
224 if (n->arg[1]->type == OP_NUM_CON)
226 if (approx_eq (n1, 1.0))
228 struct nonterm_node *m = (struct nonterm_node *) n->arg[0];
230 free_node (n->arg[1]);
234 else if (approx_eq (n1, 2.0))
236 n = xrealloc (n, sizeof (struct nonterm_node));
246 (n = repl_num_con (n, D))
249 (n = force_repl_num_con (n, D))
251 /* Finds the first NEEDLE of length NEEDLE_LEN in a HAYSTACK of length
252 HAYSTACK_LEN. Returns a 1-based index, 0 on failure. */
254 str_search (char *haystack, int haystack_len, char *needle, int needle_len)
256 char *p = memmem (haystack, haystack_len, needle, needle_len);
257 return p ? p - haystack + 1 : 0;
260 /* Finds the last NEEDLE of length NEEDLE_LEN in a HAYSTACK of length
261 HAYSTACK_LEN. Returns a 1-based index, 0 on failure. */
263 str_rsearch (char *haystack, int haystack_len, char *needle, int needle_len)
265 char *p = mm_find_reverse (haystack, haystack_len, needle, needle_len);
266 return p ? p - haystack + 1 : 0;
269 static struct nonterm_node *
270 evaluate_tree (struct nonterm_node * n)
278 strbuf = xmalloc (256);
285 return optimize_tree (n);
288 if (approx_eq (n0, 0.0) && approx_eq (n1, 0.0))
290 else if (n0 == SYSMIS && n1 == 0.0)
292 else if (n0 == 0.0 && n1 == SYSMIS)
299 if (n0 == 0.0 || n1 == 0.0)
301 else if (n0 == SYSMIS || n1 == SYSMIS)
307 if (n0 == 1.0 || n1 == 1.0)
309 else if (n0 == SYSMIS || n1 == SYSMIS)
315 rnc (n0 == 0.0 ? 1.0 : 0.0);
319 rnc (approx_eq (n0, n1));
322 rnc (approx_ge (n0, n1));
325 rnc (approx_gt (n0, n1));
328 rnc (approx_le (n0, n1));
331 rnc (approx_lt (n0, n1));
334 rnc (approx_ne (n0, n1));
337 /* String operators. */
339 rnc (st_compare_pad (s0, s0l, s1, s1l) == 0);
342 rnc (st_compare_pad (s0, s0l, s1, s1l) >= 0);
345 rnc (st_compare_pad (s0, s0l, s1, s1l) > 0);
348 rnc (st_compare_pad (s0, s0l, s1, s1l) <= 0);
351 rnc (st_compare_pad (s0, s0l, s1, s1l) < 0);
354 rnc (st_compare_pad (s0, s0l, s1, s1l) != 0);
357 /* Unary functions. */
389 rnc (n0 >= 0.0 ? floor (n0 + 0.5) : -floor (-n0 + 0.5));
401 rnc (n0 >= 0.0 ? floor (n0) : -floor (-n0));
404 /* N-ary numeric functions. */
413 for (i = 1; i < n->n; i++)
415 ni = n->arg[i]->num_con.value;
416 if (approx_eq (n0, ni))
424 frnc (sysmis ? SYSMIS : 0.0);
429 for (i = 1; i < n->n; i++)
430 if (!st_compare_pad (n->arg[0]->str_con.s, n->arg[0]->str_con.len,
431 n->arg[i]->str_con.s, n->arg[i]->str_con.len))
434 goto any_string_done;
451 {0.0, 0.0}; /* sum, sum of squares */
452 double min = DBL_MAX; /* minimum value */
453 double max = -DBL_MAX; /* maximum value */
454 double ni; /* value of i'th argument */
455 int nv = 0; /* number of valid arguments */
457 for (i = 0; i < n->n; i++)
459 ni = n->arg[i]->num_con.value;
471 if (n->type == OP_NMISS)
473 else if (n->type == OP_NVALID)
475 else if (nv >= (int) n->arg[i])
480 frnc (calc_cfvar (d, nv));
486 frnc (calc_mean (d, nv));
492 frnc (calc_stddev (calc_variance (d, nv)));
498 frnc (calc_variance (d, nv));
514 for (i = 1; i < n->n; i += 2)
516 min = n->arg[i]->num_con.value;
517 max = n->arg[i + 1]->num_con.value;
518 if (min == SYSMIS || max == SYSMIS)
521 if (approx_ge (n0, min) && approx_le (n0, max))
527 frnc (sysmis ? SYSMIS : 0.0);
531 case OP_RANGE_STRING:
532 for (i = 1; i < n->n; i += 2)
533 if (st_compare_pad (n->arg[0]->str_con.s, n->arg[0]->str_con.len,
534 n->arg[i]->str_con.s, n->arg[i]->str_con.len) >= 0
535 && st_compare_pad (n->arg[0]->str_con.s, n->arg[0]->str_con.len,
536 n->arg[i + 1]->str_con.s,
537 n->arg[i + 1]->str_con.len) <= 0)
548 rnc (60. * (60. * n0 + n1) + n2);
551 /* Date construction functions. */
553 rnc (60. * 60. * 24. * yrmoda (n2, n1, n0));
556 rnc (60. * 60. * 24. * yrmoda (n2, n0, n1));
559 rnc (60. * 60. * 24. * yrmoda (n1, n0, 1));
562 rnc (60. * 60. * 24. * yrmoda (n1, 3 * (int) n0 - 2, 1));
566 double t = yrmoda (n1, 1, 1);
568 t = 60. * 60. * 24. * (t + 7. * (n0 - 1));
574 double t = yrmoda (n0, 1, 1);
576 t = 60. * 60. * 24. * (t + n0 - 1);
581 rnc (yrmoda (n0, n1, n2));
583 /* Date extraction functions. */
585 rnc (floor (n0 / 60. / 60. / 24.) * 60. * 60. * 24.);
588 rnc (fmod (floor (n0 / 60. / 60.), 24.));
591 rnc (julian_to_jday (n0 / 86400.));
596 julian_to_calendar (n0 / 86400., NULL, NULL, &day);
600 case OP_XDATE_MINUTE:
601 rnc (fmod (floor (n0 / 60.), 60.));
606 julian_to_calendar (n0 / 86400., NULL, &month, NULL);
610 case OP_XDATE_QUARTER:
613 julian_to_calendar (n0 / 86400., NULL, &month, NULL);
614 rnc ((month - 1) / 3 + 1);
617 case OP_XDATE_SECOND:
618 rnc (fmod (n0, 60.));
621 rnc (floor (n0 / 60. / 60. / 24.));
624 rnc (n0 - floor (n0 / 60. / 60. / 24.) * 60. * 60. * 24.);
627 rnc ((julian_to_jday (n0) - 1) / 7 + 1);
630 rnc (julian_to_wday (n0));
635 julian_to_calendar (n0 / 86400., &year, NULL, NULL);
640 /* String functions. */
644 memcpy (strbuf, s0, len);
645 for (i = 1; i < n->n; i++)
650 memcpy (&strbuf[len], s (i), add);
653 n = repl_str_con (n, strbuf, len);
657 rnc (s1l ? str_search (s0, s0l, s1, s1l) : SYSMIS);
660 if (n2 == SYSMIS || (int) n2 <= 0 || s1l % (int) n2)
662 msg (SW, _("While optimizing a constant expression, there was "
663 "a bad value for the third argument to INDEX."));
669 int c = s1l / (int) n2;
672 for (i = 0; i < c; i++)
674 r = str_search (s0, s0l, s (i), sl (i));
675 if (r < pos || pos == 0)
682 rnc (str_rsearch (s0, s0l, s1, s1l));
685 if (n2 == SYSMIS || (int) n2 <= 0 || s1l % (int) n2)
687 msg (SE, _("While optimizing a constant expression, there was "
688 "a bad value for the third argument to RINDEX."));
697 for (i = 0; i < c; i++)
699 r = str_rsearch (s0, s0l, s (i), sl (i));
712 for (cp = &s0[s0l]; cp >= s0; cp--)
713 *cp = tolower ((unsigned char) (*cp));
714 n = repl_str_con (n, s0, s0l);
720 for (cp = &s0[s0[0] + 1]; cp > s0; cp--)
721 *cp = toupper ((unsigned char) (*cp));
722 n = repl_str_con (n, s0, s0l);
734 n = repl_str_con (n, NULL, 0);
738 len = range (len, 1, 255);
739 add = max (n1 - s0l, 0);
741 if (n->type == OP_LPAD_OPT || n->type == OP_RPAD_OPT)
745 c = n->type == OP_LPAD_OPT ? 'L' : 'R';
746 msg (SE, _("Third argument to %cPAD() must be at least one "
747 "character in length."), c);
756 if (n->type == OP_LPAD || n->type == OP_LPAD_OPT)
757 memmove (&s0[add], s0, len);
758 if (n->type == OP_LPAD || n->type == OP_LPAD_OPT)
761 memset (&s0[s0l], c, add);
763 n = repl_str_con (n, s0, len);
774 if (n->type == OP_LTRIM_OPT || n->type == OP_RTRIM_OPT)
778 c = n->type == OP_LTRIM_OPT ? 'L' : 'R';
779 msg (SE, _("Second argument to %cTRIM() must be at least one "
780 "character in length."), c);
787 if (n->type == OP_LTRIM || n->type == OP_LTRIM_OPT)
789 while (*cp == c && cp < &s0[len])
794 while (len > 0 && s0[len - 1] == c)
796 n = repl_str_con (n, cp, len);
808 di.flags = DI_IGNORE_ERROR;
811 if (n->type == OP_NUMBER_OPT)
813 di.format.type = (int) n->arg[1];
814 di.format.w = (int) n->arg[2];
815 di.format.d = (int) n->arg[3];
819 di.format.type = FMT_F;
832 f.type = (int) n->arg[1];
833 f.w = (int) n->arg[2];
834 f.d = (int) n->arg[3];
837 data_out (strbuf, &f, &v);
838 n = repl_str_con (n, strbuf, f.w);
845 if (pos > s0l || pos <= 0 || n1 == SYSMIS
846 || (n->type == OP_SUBSTR_OPT && n2 == SYSMIS))
847 n = repl_str_con (n, NULL, 0);
850 if (n->type == OP_SUBSTR_OPT)
853 if (len + pos - 1 > s0l)
858 n = repl_str_con (n, &s0[pos - 1], len);
868 if (approx_eq (n0, 0.0) && n1 == SYSMIS)
874 if (approx_eq (n0, 0.0))
876 else if (approx_eq (n0, 1.0))
878 else if (n0 != SYSMIS)
880 msg (SE, _("When optimizing a constant expression, an integer "
881 "that was being used as an Boolean value was found "
882 "to have a constant value other than 0, 1, or SYSMIS."));
907 static struct nonterm_node *
908 repl_num_con (struct nonterm_node * n, double d)
911 if (!finite (d) || errno)
914 for (i = 0; i < n->n; i++)
915 if (n->arg[i]->type == OP_NUM_CON && n->arg[i]->num_con.value == SYSMIS)
920 return force_repl_num_con (n, d);
923 static struct nonterm_node *
924 force_repl_num_con (struct nonterm_node * n, double d)
926 struct num_con_node *num;
928 if (!finite (d) || errno)
930 free_node ((union any_node *) n);
931 num = xmalloc (sizeof *num);
932 num->type = OP_NUM_CON;
934 return (struct nonterm_node *) num;
937 static struct nonterm_node *
938 repl_str_con (struct nonterm_node * n, char *s, int len)
940 struct str_con_node *str;
942 /* The ordering here is important since the source string may be
943 part of a subnode of n. */
944 str = xmalloc (sizeof *str + len - 1);
945 str->type = OP_STR_CON;
947 memcpy (str->s, s, len);
948 free_node ((union any_node *) n);
949 return (struct nonterm_node *) str;
952 /* Returns the number of days since 10 Oct 1582 for the date
953 YEAR/MONTH/DAY, where YEAR is in range 0..199 or 1582..19999, MONTH
954 is in 1..12, and DAY is in 1..31. */
956 yrmoda (double year, double month, double day)
958 if (year == SYSMIS || month == SYSMIS || day == SYSMIS)
961 /* The addition of EPSILON avoids converting, for example,
962 1991.9999997=>1991. */
963 year = floor (year + EPSILON);
964 month = floor (month + EPSILON);
965 day = floor (day + EPSILON);
967 if (year >= 0. && year <= 199.)
969 if ((year < 1582. || year > 19999.)
970 || (year == 1582. && (month < 10. || (month == 10. && day < 15.)))
971 || (month < -1 || month > 13)
972 || (day < -1 || day > 32))
974 return calendar_to_julian (year, month, day);
977 /* Expression dumper. */
979 static struct expression *e;
981 static int ndbl, mdbl;
982 static int nstr, mstr;
983 static int nvars, mvars;
985 static void dump_node (union any_node * n);
986 static void emit (int);
987 static void emit_num_con (double);
988 static void emit_str_con (char *, int);
989 static void emit_var (struct variable *);
992 dump_expression (union any_node * n, struct expression * expr)
1012 /* Now compute the stack height needed to evaluate the expression. */
1013 for (o = e->op; *o != OP_SENTINEL; o++)
1015 if (ops[*o].flags & OP_VAR_ARGS)
1018 height += ops[*o].height;
1020 if (height > max_height)
1021 max_height = height;
1024 /* We waste space for one `value' since pointers are not
1025 guaranteed to be able to point to a spot before a block. */
1028 e->stack = xmalloc (max_height * sizeof *e->stack);
1031 e->str_stack = e->type == EX_STRING ? xmalloc (256) : NULL;
1033 e->str_stack = xmalloc (256);
1039 dump_node (union any_node * n)
1041 if (n->type == OP_AND || n->type == OP_OR)
1045 dump_node (n->nonterm.arg[0]);
1046 for (i = 1; i < n->nonterm.n; i++)
1048 dump_node (n->nonterm.arg[i]);
1053 else if (n->type < OP_TERMINAL)
1056 for (i = 0; i < n->nonterm.n; i++)
1057 dump_node (n->nonterm.arg[i]);
1059 if (ops[n->type].flags & OP_VAR_ARGS)
1060 emit (n->nonterm.n);
1061 if (ops[n->type].flags & OP_MIN_ARGS)
1062 emit ((int) n->nonterm.arg[n->nonterm.n]);
1063 if (ops[n->type].flags & OP_FMT_SPEC)
1065 emit ((int) n->nonterm.arg[n->nonterm.n]);
1066 emit ((int) n->nonterm.arg[n->nonterm.n + 1]);
1067 emit ((int) n->nonterm.arg[n->nonterm.n + 2]);
1073 if (n->type == OP_NUM_CON)
1074 emit_num_con (n->num_con.value);
1075 else if (n->type == OP_STR_CON)
1076 emit_str_con (n->str_con.s, n->str_con.len);
1077 else if (n->type == OP_NUM_VAR || n->type == OP_STR_VAR
1078 || n->type == OP_STR_MIS)
1079 emit_var (n->var.v);
1080 else if (n->type == OP_NUM_LAG || n->type == OP_STR_LAG)
1082 emit_var (n->lag.v);
1085 else if (n->type == OP_NUM_SYS || n->type == OP_NUM_VAL)
1086 emit (n->var.v->fv);
1088 assert (n->type == OP_CASENUM);
1097 e->op = xrealloc (e->op, mop * sizeof *e->op);
1103 emit_num_con (double dbl)
1108 e->num = xrealloc (e->num, mdbl * sizeof *e->num);
1110 e->num[ndbl++] = dbl;
1114 emit_str_con (char *str, int len)
1116 if (nstr + len + 1 > mstr)
1119 e->str = xrealloc (e->str, mstr);
1121 e->str[nstr++] = len;
1122 memcpy (&e->str[nstr], str, len);
1127 emit_var (struct variable * v)
1132 e->var = xrealloc (e->var, mvars * sizeof *e->var);
1134 e->var[nvars++] = v;