1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
32 #include "julcal/julcal.h"
40 Expression "optimizer"
42 Operates on the tree representation of expressions.
43 optimize_expression() performs the optimizations listed below:
46 Any operation with constant operands is replaced by its value.
47 (Exception: random-number-generator functions.)
49 2. Strength reduction (x is any expression; a is a numeric constant)
53 x**1, x+0, x-0, x*1 => x
55 x/a => x*(1/a) (where 1/a is evaluated at optimization time)
57 I thought about adding additional optimizations but decided that what
58 is here could already be considered overkill.
61 static struct nonterm_node *evaluate_tree (struct nonterm_node * n);
62 static struct nonterm_node *optimize_tree (struct nonterm_node * n);
65 optimize_expression (struct nonterm_node * n)
69 /* Set to 1 if a child is nonconstant. */
72 /* Number of system-missing children. */
75 /* We can't optimize a terminal node. */
76 if (n->type > OP_TERMINAL)
79 /* Start by optimizing all the children. */
80 for (i = 0; i < n->n; i++)
82 n->arg[i] = ((union any_node *)
83 optimize_expression ((struct nonterm_node *) n->arg[i]));
84 if (n->arg[i]->type == OP_NUM_CON)
86 if (n->arg[i]->num_con.value == SYSMIS)
89 else if (n->arg[i]->type != OP_STR_CON)
93 if (sysmis && !(ops[n->type].flags & OP_ABSORB_MISS))
94 /* Just about any operation produces SYSMIS when given any SYSMIS
97 struct num_con_node *num = xmalloc (sizeof *num);
98 free_node ((union any_node *) n);
99 num->type = OP_NUM_CON;
101 n = (struct nonterm_node *) num;
104 /* If all the children of this node are constants, then there are
105 obvious optimizations. */
106 n = evaluate_tree (n);
108 /* Otherwise, we may be able to make certain optimizations
110 n = optimize_tree (n);
114 static struct nonterm_node *repl_num_con (struct nonterm_node *, double);
115 static struct nonterm_node *force_repl_num_con (struct nonterm_node *, double);
116 static struct nonterm_node *repl_str_con (struct nonterm_node *, char *, int);
118 #define n0 n->arg[0]->num_con.value
119 #define n1 n->arg[1]->num_con.value
120 #define n2 n->arg[2]->num_con.value
122 #define s0 n->arg[0]->str_con.s
123 #define s0l n->arg[0]->str_con.len
124 #define s1 n->arg[1]->str_con.s
125 #define s1l n->arg[1]->str_con.len
126 #define s2 n->arg[2]->str_con.s
127 #define s2l n->arg[2]->str_con.len
128 #define s(X) n->arg[X]->str_con.s
129 #define sl(X) n->arg[X]->str_con.len
131 static struct nonterm_node *
132 optimize_tree (struct nonterm_node * n)
137 if (n->type == OP_PLUS || n->type == OP_MUL)
139 /* Default constant value. */
140 double def = n->type == OP_MUL ? 1.0 : 0.0;
142 /* Total value of all the constants. */
145 /* Number of nonconst arguments. */
149 struct nonterm_node *m;
151 /* Argument copying counter. */
154 /* 1=SYSMIS encountered */
157 for (i = 0; i < n->n; i++)
158 if (n->arg[i]->type == OP_NUM_CON)
160 if (n->arg[i]->num_con.value != SYSMIS)
162 if (n->type == OP_MUL)
163 cval *= n->arg[i]->num_con.value;
165 cval += n->arg[i]->num_con.value;
173 /* 0*SYSMIS=0, 0/SYSMIS=0; otherwise, SYSMIS and infinities
175 if (approx_eq (cval, 0.0) && n->type == OP_MUL)
177 else if (sysmis || !finite (cval))
183 /* If no nonconstant terms, replace with a constant node. */
185 return force_repl_num_con (n, cval);
187 if (nvar == 1 && cval == def)
189 /* If there is exactly one nonconstant term and no constant
190 terms, replace with the nonconstant term. */
191 for (i = 0; i < n->n; i++)
192 if (n->arg[i]->type != OP_NUM_CON)
193 m = (struct nonterm_node *) n->arg[i];
195 free_node (n->arg[i]);
199 /* Otherwise consolidate all the nonconstant terms. */
200 m = xmalloc (sizeof (struct nonterm_node)
201 + ((nvar + approx_ne (cval, def) - 1)
202 * sizeof (union any_node *)));
203 for (i = c = 0; i < n->n; i++)
204 if (n->arg[i]->type != OP_NUM_CON)
205 m->arg[c++] = n->arg[i];
207 free_node (n->arg[i]);
209 if (approx_ne (cval, def))
211 m->arg[c] = xmalloc (sizeof (struct num_con_node));
212 m->arg[c]->num_con.type = OP_NUM_CON;
213 m->arg[c]->num_con.value = cval;
223 else if (n->type == OP_POW)
225 if (n->arg[1]->type == OP_NUM_CON)
227 if (approx_eq (n1, 1.0))
229 struct nonterm_node *m = (struct nonterm_node *) n->arg[0];
231 free_node (n->arg[1]);
235 else if (approx_eq (n1, 2.0))
237 n = xrealloc (n, sizeof (struct nonterm_node));
247 (n = repl_num_con (n, D))
250 (n = force_repl_num_con (n, D))
252 /* Finds the first NEEDLE of length NEEDLE_LEN in a HAYSTACK of length
253 HAYSTACK_LEN. Returns a 1-based index, 0 on failure. */
255 str_search (char *haystack, int haystack_len, char *needle, int needle_len)
257 char *p = memmem (haystack, haystack_len, needle, needle_len);
258 return p ? p - haystack + 1 : 0;
261 /* Finds the last NEEDLE of length NEEDLE_LEN in a HAYSTACK of length
262 HAYSTACK_LEN. Returns a 1-based index, 0 on failure. */
264 str_rsearch (char *haystack, int haystack_len, char *needle, int needle_len)
266 char *p = mm_find_reverse (haystack, haystack_len, needle, needle_len);
267 return p ? p - haystack + 1 : 0;
270 static struct nonterm_node *
271 evaluate_tree (struct nonterm_node * n)
279 strbuf = xmalloc (256);
286 return optimize_tree (n);
289 if (approx_eq (n0, 0.0) && approx_eq (n1, 0.0))
291 else if (n0 == SYSMIS && n1 == 0.0)
293 else if (n0 == 0.0 && n1 == SYSMIS)
300 if (n0 == 0.0 || n1 == 0.0)
302 else if (n0 == SYSMIS || n1 == SYSMIS)
308 if (n0 == 1.0 || n1 == 1.0)
310 else if (n0 == SYSMIS || n1 == SYSMIS)
316 rnc (n0 == 0.0 ? 1.0 : 0.0);
320 rnc (approx_eq (n0, n1));
323 rnc (approx_ge (n0, n1));
326 rnc (approx_gt (n0, n1));
329 rnc (approx_le (n0, n1));
332 rnc (approx_lt (n0, n1));
335 rnc (approx_ne (n0, n1));
338 /* String operators. */
340 rnc (st_compare_pad (s0, s0l, s1, s1l) == 0);
343 rnc (st_compare_pad (s0, s0l, s1, s1l) >= 0);
346 rnc (st_compare_pad (s0, s0l, s1, s1l) > 0);
349 rnc (st_compare_pad (s0, s0l, s1, s1l) <= 0);
352 rnc (st_compare_pad (s0, s0l, s1, s1l) < 0);
355 rnc (st_compare_pad (s0, s0l, s1, s1l) != 0);
358 /* Unary functions. */
390 rnc (n0 >= 0.0 ? floor (n0 + 0.5) : -floor (-n0 + 0.5));
402 rnc (n0 >= 0.0 ? floor (n0) : -floor (-n0));
405 /* N-ary numeric functions. */
414 for (i = 1; i < n->n; i++)
416 ni = n->arg[i]->num_con.value;
417 if (approx_eq (n0, ni))
425 frnc (sysmis ? SYSMIS : 0.0);
430 for (i = 1; i < n->n; i++)
431 if (!st_compare_pad (n->arg[0]->str_con.s, n->arg[0]->str_con.len,
432 n->arg[i]->str_con.s, n->arg[i]->str_con.len))
435 goto any_string_done;
452 {0.0, 0.0}; /* sum, sum of squares */
453 double min = DBL_MAX; /* minimum value */
454 double max = -DBL_MAX; /* maximum value */
455 double ni; /* value of i'th argument */
456 int nv = 0; /* number of valid arguments */
458 for (i = 0; i < n->n; i++)
460 ni = n->arg[i]->num_con.value;
472 if (n->type == OP_NMISS)
474 else if (n->type == OP_NVALID)
476 else if (nv >= (int) n->arg[i])
481 frnc (calc_cfvar (d, nv));
487 frnc (calc_mean (d, nv));
493 frnc (calc_stddev (calc_variance (d, nv)));
499 frnc (calc_variance (d, nv));
515 for (i = 1; i < n->n; i += 2)
517 min = n->arg[i]->num_con.value;
518 max = n->arg[i + 1]->num_con.value;
519 if (min == SYSMIS || max == SYSMIS)
522 if (approx_ge (n0, min) && approx_le (n0, max))
528 frnc (sysmis ? SYSMIS : 0.0);
532 case OP_RANGE_STRING:
533 for (i = 1; i < n->n; i += 2)
534 if (st_compare_pad (n->arg[0]->str_con.s, n->arg[0]->str_con.len,
535 n->arg[i]->str_con.s, n->arg[i]->str_con.len) >= 0
536 && st_compare_pad (n->arg[0]->str_con.s, n->arg[0]->str_con.len,
537 n->arg[i + 1]->str_con.s,
538 n->arg[i + 1]->str_con.len) <= 0)
549 rnc (60. * (60. * n0 + n1) + n2);
552 /* Date construction functions. */
554 rnc (60. * 60. * 24. * yrmoda (n2, n1, n0));
557 rnc (60. * 60. * 24. * yrmoda (n2, n0, n1));
560 rnc (60. * 60. * 24. * yrmoda (n1, n0, 1));
563 rnc (60. * 60. * 24. * yrmoda (n1, 3 * (int) n0 - 2, 1));
567 double t = yrmoda (n1, 1, 1);
569 t = 60. * 60. * 24. * (t + 7. * (n0 - 1));
575 double t = yrmoda (n0, 1, 1);
577 t = 60. * 60. * 24. * (t + n0 - 1);
582 rnc (yrmoda (n0, n1, n2));
584 /* Date extraction functions. */
586 rnc (floor (n0 / 60. / 60. / 24.) * 60. * 60. * 24.);
589 rnc (fmod (floor (n0 / 60. / 60.), 24.));
592 rnc (julian_to_jday (n0 / 86400.));
597 julian_to_calendar (n0 / 86400., NULL, NULL, &day);
601 case OP_XDATE_MINUTE:
602 rnc (fmod (floor (n0 / 60.), 60.));
607 julian_to_calendar (n0 / 86400., NULL, &month, NULL);
611 case OP_XDATE_QUARTER:
614 julian_to_calendar (n0 / 86400., NULL, &month, NULL);
615 rnc ((month - 1) / 3 + 1);
618 case OP_XDATE_SECOND:
619 rnc (fmod (n0, 60.));
622 rnc (floor (n0 / 60. / 60. / 24.));
625 rnc (n0 - floor (n0 / 60. / 60. / 24.) * 60. * 60. * 24.);
628 rnc ((julian_to_jday (n0) - 1) / 7 + 1);
631 rnc (julian_to_wday (n0));
636 julian_to_calendar (n0 / 86400., &year, NULL, NULL);
641 /* String functions. */
645 memcpy (strbuf, s0, len);
646 for (i = 1; i < n->n; i++)
651 memcpy (&strbuf[len], s (i), add);
654 n = repl_str_con (n, strbuf, len);
658 rnc (s1l ? str_search (s0, s0l, s1, s1l) : SYSMIS);
661 if (n2 == SYSMIS || (int) n2 <= 0 || s1l % (int) n2)
663 msg (SW, _("While optimizing a constant expression, there was "
664 "a bad value for the third argument to INDEX."));
670 int c = s1l / (int) n2;
673 for (i = 0; i < c; i++)
675 r = str_search (s0, s0l, s (i), sl (i));
676 if (r < pos || pos == 0)
683 rnc (str_rsearch (s0, s0l, s1, s1l));
686 if (n2 == SYSMIS || (int) n2 <= 0 || s1l % (int) n2)
688 msg (SE, _("While optimizing a constant expression, there was "
689 "a bad value for the third argument to RINDEX."));
698 for (i = 0; i < c; i++)
700 r = str_rsearch (s0, s0l, s (i), sl (i));
713 for (cp = &s0[s0l]; cp >= s0; cp--)
714 *cp = tolower ((unsigned char) (*cp));
715 n = repl_str_con (n, s0, s0l);
721 for (cp = &s0[s0[0] + 1]; cp > s0; cp--)
722 *cp = toupper ((unsigned char) (*cp));
723 n = repl_str_con (n, s0, s0l);
735 n = repl_str_con (n, NULL, 0);
739 len = range (len, 1, 255);
740 add = max (n1 - s0l, 0);
742 if (n->type == OP_LPAD_OPT || n->type == OP_RPAD_OPT)
746 c = n->type == OP_LPAD_OPT ? 'L' : 'R';
747 msg (SE, _("Third argument to %cPAD() must be at least one "
748 "character in length."), c);
757 if (n->type == OP_LPAD || n->type == OP_LPAD_OPT)
758 memmove (&s0[add], s0, len);
759 if (n->type == OP_LPAD || n->type == OP_LPAD_OPT)
762 memset (&s0[s0l], c, add);
764 n = repl_str_con (n, s0, len);
775 if (n->type == OP_LTRIM_OPT || n->type == OP_RTRIM_OPT)
779 c = n->type == OP_LTRIM_OPT ? 'L' : 'R';
780 msg (SE, _("Second argument to %cTRIM() must be at least one "
781 "character in length."), c);
788 if (n->type == OP_LTRIM || n->type == OP_LTRIM_OPT)
790 while (*cp == c && cp < &s0[len])
795 while (len > 0 && s0[len - 1] == c)
797 n = repl_str_con (n, cp, len);
809 di.flags = DI_IGNORE_ERROR;
812 if (n->type == OP_NUMBER_OPT)
814 di.format.type = (int) n->arg[1];
815 di.format.w = (int) n->arg[2];
816 di.format.d = (int) n->arg[3];
820 di.format.type = FMT_F;
833 f.type = (int) n->arg[1];
834 f.w = (int) n->arg[2];
835 f.d = (int) n->arg[3];
838 data_out (strbuf, &f, &v);
839 n = repl_str_con (n, strbuf, f.w);
846 if (pos > s0l || pos <= 0 || n1 == SYSMIS
847 || (n->type == OP_SUBSTR_OPT && n2 == SYSMIS))
848 n = repl_str_con (n, NULL, 0);
851 if (n->type == OP_SUBSTR_OPT)
854 if (len + pos - 1 > s0l)
859 n = repl_str_con (n, &s0[pos - 1], len);
869 if (approx_eq (n0, 0.0) && n1 == SYSMIS)
875 if (approx_eq (n0, 0.0))
877 else if (approx_eq (n0, 1.0))
879 else if (n0 != SYSMIS)
881 msg (SE, _("When optimizing a constant expression, an integer "
882 "that was being used as an Boolean value was found "
883 "to have a constant value other than 0, 1, or SYSMIS."));
908 static struct nonterm_node *
909 repl_num_con (struct nonterm_node * n, double d)
912 if (!finite (d) || errno)
915 for (i = 0; i < n->n; i++)
916 if (n->arg[i]->type == OP_NUM_CON && n->arg[i]->num_con.value == SYSMIS)
921 return force_repl_num_con (n, d);
924 static struct nonterm_node *
925 force_repl_num_con (struct nonterm_node * n, double d)
927 struct num_con_node *num;
929 if (!finite (d) || errno)
931 free_node ((union any_node *) n);
932 num = xmalloc (sizeof *num);
933 num->type = OP_NUM_CON;
935 return (struct nonterm_node *) num;
938 static struct nonterm_node *
939 repl_str_con (struct nonterm_node * n, char *s, int len)
941 struct str_con_node *str;
943 /* The ordering here is important since the source string may be
944 part of a subnode of n. */
945 str = xmalloc (sizeof *str + len - 1);
946 str->type = OP_STR_CON;
948 memcpy (str->s, s, len);
949 free_node ((union any_node *) n);
950 return (struct nonterm_node *) str;
953 /* Returns the number of days since 10 Oct 1582 for the date
954 YEAR/MONTH/DAY, where YEAR is in range 0..199 or 1582..19999, MONTH
955 is in 1..12, and DAY is in 1..31. */
957 yrmoda (double year, double month, double day)
959 if (year == SYSMIS || month == SYSMIS || day == SYSMIS)
962 /* The addition of EPSILON avoids converting, for example,
963 1991.9999997=>1991. */
964 year = floor (year + EPSILON);
965 month = floor (month + EPSILON);
966 day = floor (day + EPSILON);
968 if (year >= 0. && year <= 199.)
970 if ((year < 1582. || year > 19999.)
971 || (year == 1582. && (month < 10. || (month == 10. && day < 15.)))
972 || (month < -1 || month > 13)
973 || (day < -1 || day > 32))
975 return calendar_to_julian (year, month, day);
978 /* Expression dumper. */
980 static struct expression *e;
982 static int ndbl, mdbl;
983 static int nstr, mstr;
984 static int nvars, mvars;
986 static void dump_node (union any_node * n);
987 static void emit (int);
988 static void emit_num_con (double);
989 static void emit_str_con (char *, int);
990 static void emit_var (struct variable *);
993 dump_expression (union any_node * n, struct expression * expr)
1013 /* Now compute the stack height needed to evaluate the expression. */
1014 for (o = e->op; *o != OP_SENTINEL; o++)
1016 if (ops[*o].flags & OP_VAR_ARGS)
1019 height += ops[*o].height;
1021 if (height > max_height)
1022 max_height = height;
1025 /* We waste space for one `value' since pointers are not
1026 guaranteed to be able to point to a spot before a block. */
1029 e->stack = xmalloc (max_height * sizeof *e->stack);
1031 e->pool = pool_create ();
1035 dump_node (union any_node * n)
1037 if (n->type == OP_AND || n->type == OP_OR)
1041 dump_node (n->nonterm.arg[0]);
1042 for (i = 1; i < n->nonterm.n; i++)
1044 dump_node (n->nonterm.arg[i]);
1049 else if (n->type < OP_TERMINAL)
1052 for (i = 0; i < n->nonterm.n; i++)
1053 dump_node (n->nonterm.arg[i]);
1055 if (ops[n->type].flags & OP_VAR_ARGS)
1056 emit (n->nonterm.n);
1057 if (ops[n->type].flags & OP_MIN_ARGS)
1058 emit ((int) n->nonterm.arg[n->nonterm.n]);
1059 if (ops[n->type].flags & OP_FMT_SPEC)
1061 emit ((int) n->nonterm.arg[n->nonterm.n]);
1062 emit ((int) n->nonterm.arg[n->nonterm.n + 1]);
1063 emit ((int) n->nonterm.arg[n->nonterm.n + 2]);
1069 if (n->type == OP_NUM_CON)
1070 emit_num_con (n->num_con.value);
1071 else if (n->type == OP_STR_CON)
1072 emit_str_con (n->str_con.s, n->str_con.len);
1073 else if (n->type == OP_NUM_VAR || n->type == OP_STR_VAR
1074 || n->type == OP_STR_MIS)
1075 emit_var (n->var.v);
1076 else if (n->type == OP_NUM_LAG || n->type == OP_STR_LAG)
1078 emit_var (n->lag.v);
1081 else if (n->type == OP_NUM_SYS || n->type == OP_NUM_VAL)
1082 emit (n->var.v->fv);
1084 assert (n->type == OP_CASENUM);
1093 e->op = xrealloc (e->op, mop * sizeof *e->op);
1099 emit_num_con (double dbl)
1104 e->num = xrealloc (e->num, mdbl * sizeof *e->num);
1106 e->num[ndbl++] = dbl;
1110 emit_str_con (char *str, int len)
1112 if (nstr + len + 1 > mstr)
1115 e->str = xrealloc (e->str, mstr);
1117 e->str[nstr++] = len;
1118 memcpy (&e->str[nstr], str, len);
1123 emit_var (struct variable * v)
1128 e->var = xrealloc (e->var, mvars * sizeof *e->var);
1130 e->var[nvars++] = v;