1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 #include "data/case.h"
27 #include "data/dictionary.h"
28 #include "data/settings.h"
29 #include "data/variable.h"
30 #include "language/expressions/helpers.h"
31 #include "language/lexer/format-parser.h"
32 #include "language/lexer/lexer.h"
33 #include "language/lexer/variable-parser.h"
34 #include "libpspp/array.h"
35 #include "libpspp/assertion.h"
36 #include "libpspp/i18n.h"
37 #include "libpspp/message.h"
38 #include "libpspp/misc.h"
39 #include "libpspp/pool.h"
40 #include "libpspp/str.h"
42 #include "gl/c-strcase.h"
43 #include "gl/minmax.h"
44 #include "gl/xalloc.h"
48 /* Recursive descent parser in order of increasing precedence. */
49 typedef struct expr_node *parse_recursively_func (struct lexer *, struct expression *);
50 static parse_recursively_func parse_or, parse_and, parse_not;
51 static parse_recursively_func parse_rel, parse_add, parse_mul;
52 static parse_recursively_func parse_neg, parse_exp;
53 static parse_recursively_func parse_primary;
54 static parse_recursively_func parse_vector_element, parse_function;
56 /* Utility functions. */
57 static struct expression *expr_create (struct dataset *ds);
58 atom_type expr_node_returns (const struct expr_node *);
60 static const char *atom_type_name (atom_type);
61 static struct expression *finish_expression (struct expr_node *,
63 static bool type_check (const struct expression *, const struct expr_node *,
64 enum val_type expected_type);
65 static struct expr_node *allocate_unary_variable (struct expression *,
66 const struct variable *);
68 /* Public functions. */
70 static struct expr_node *
71 parse_expr (struct lexer *lexer, struct expression *e)
73 struct expr_node *n = parse_or (lexer, e);
74 if (n && n->type == OP_VEC_ELEM_NUM_RAW)
75 n->type = OP_VEC_ELEM_NUM;
79 /* Parses an expression of the given TYPE. If DS is nonnull then variables and
80 vectors within it may be referenced within the expression; otherwise, the
81 expression must not reference any variables or vectors. Returns the new
82 expression if successful or a null pointer otherwise. */
84 expr_parse (struct lexer *lexer, struct dataset *ds, enum val_type type)
86 assert (val_type_is_valid (type));
88 struct expression *e = expr_create (ds);
89 struct expr_node *n = parse_expr (lexer, e);
90 if (!n || !type_check (e, n, type))
96 return finish_expression (expr_optimize (n, e), e);
99 /* Parses a boolean expression, otherwise similar to expr_parse(). */
101 expr_parse_bool (struct lexer *lexer, struct dataset *ds)
103 struct expression *e = expr_create (ds);
104 struct expr_node *n = parse_expr (lexer, e);
111 atom_type actual_type = expr_node_returns (n);
112 if (actual_type == OP_number)
113 n = expr_allocate_unary (e, OP_EXPR_TO_BOOLEAN, n);
114 else if (actual_type != OP_boolean)
116 msg_at (SE, expr_location (e, n),
117 _("Type mismatch: expression has %s type, "
118 "but a boolean value is required here."),
119 atom_type_name (actual_type));
124 return finish_expression (expr_optimize (n, e), e);
127 /* Parses a numeric expression that is intended to be assigned to newly created
128 variable NEW_VAR_NAME at NEW_VAR_LOCATION. (This allows for a better error
129 message if the expression is not numeric.) Otherwise similar to
132 expr_parse_new_variable (struct lexer *lexer, struct dataset *ds,
133 const char *new_var_name,
134 const struct msg_location *new_var_location)
136 struct expression *e = expr_create (ds);
137 struct expr_node *n = parse_expr (lexer, e);
144 atom_type actual_type = expr_node_returns (n);
145 if (actual_type != OP_number && actual_type != OP_boolean)
147 msg_at (SE, new_var_location,
148 _("This command tries to create a new variable %s by assigning a "
149 "string value to it, but this is not supported. Use "
150 "the STRING command to create the new variable with the "
151 "correct width before assigning to it, e.g. STRING %s(A20)."),
152 new_var_name, new_var_name);
157 return finish_expression (expr_optimize (n, e), e);
160 /* Free expression E. */
162 expr_free (struct expression *e)
165 pool_destroy (e->expr_pool);
169 expr_parse_any (struct lexer *lexer, struct dataset *ds, bool optimize)
172 struct expression *e;
174 e = expr_create (ds);
175 n = parse_expr (lexer, e);
183 n = expr_optimize (n, e);
184 return finish_expression (n, e);
187 /* Finishing up expression building. */
189 /* Height of an expression's stacks. */
192 int number_height; /* Height of number stack. */
193 int string_height; /* Height of string stack. */
196 /* Stack heights used by different kinds of arguments. */
197 static const struct stack_heights on_number_stack = {1, 0};
198 static const struct stack_heights on_string_stack = {0, 1};
199 static const struct stack_heights not_on_stack = {0, 0};
201 /* Returns the stack heights used by an atom of the given
203 static const struct stack_heights *
204 atom_type_stack (atom_type type)
206 assert (is_atom (type));
212 case OP_num_vec_elem:
213 return &on_number_stack;
216 return &on_string_stack;
227 return ¬_on_stack;
234 /* Measures the stack height needed for node N, supposing that
235 the stack height is initially *HEIGHT and updating *HEIGHT to
236 the final stack height. Updates *MAX, if necessary, to
237 reflect the maximum intermediate or final height. */
239 measure_stack (const struct expr_node *n,
240 struct stack_heights *height, struct stack_heights *max)
242 const struct stack_heights *return_height;
244 if (is_composite (n->type))
246 struct stack_heights args;
250 for (i = 0; i < n->n_args; i++)
251 measure_stack (n->args[i], &args, max);
253 return_height = atom_type_stack (operations[n->type].returns);
256 return_height = atom_type_stack (n->type);
258 height->number_height += return_height->number_height;
259 height->string_height += return_height->string_height;
261 if (height->number_height > max->number_height)
262 max->number_height = height->number_height;
263 if (height->string_height > max->string_height)
264 max->string_height = height->string_height;
267 /* Allocates stacks within E sufficient for evaluating node N. */
269 allocate_stacks (struct expr_node *n, struct expression *e)
271 struct stack_heights initial = {0, 0};
272 struct stack_heights max = {0, 0};
274 measure_stack (n, &initial, &max);
275 e->number_stack = pool_alloc (e->expr_pool,
276 sizeof *e->number_stack * max.number_height);
277 e->string_stack = pool_alloc (e->expr_pool,
278 sizeof *e->string_stack * max.string_height);
281 /* Finalizes expression E for evaluating node N. */
282 static struct expression *
283 finish_expression (struct expr_node *n, struct expression *e)
285 /* Allocate stacks. */
286 allocate_stacks (n, e);
288 /* Output postfix representation. */
291 /* The eval_pool might have been used for allocating strings
292 during optimization. We need to keep those strings around
293 for all subsequent evaluations, so start a new eval_pool. */
294 e->eval_pool = pool_create_subpool (e->expr_pool);
299 /* Verifies that expression E, whose root node is *N, can be
300 converted to type EXPECTED_TYPE, inserting a conversion at *N
301 if necessary. Returns true if successful, false on failure. */
303 type_check (const struct expression *e, const struct expr_node *n,
304 enum val_type expected_type)
306 atom_type actual_type = expr_node_returns (n);
308 switch (expected_type)
311 if (actual_type != OP_number && actual_type != OP_boolean)
313 msg_at (SE, expr_location (e, n),
314 _("Type mismatch: expression has type '%s', "
315 "but a numeric value is required."),
316 atom_type_name (actual_type));
322 if (actual_type != OP_string)
324 msg_at (SE, expr_location (e, n),
325 _("Type mismatch: expression has type '%s', "
326 "but a string value is required."),
327 atom_type_name (actual_type));
339 /* Recursive-descent expression parser. */
342 free_msg_location (void *loc_)
344 struct msg_location *loc = loc_;
345 msg_location_destroy (loc);
349 expr_location__ (struct expression *e,
350 const struct expr_node *node,
351 const struct msg_location **minp,
352 const struct msg_location **maxp)
354 struct msg_location *loc = node->location;
357 const struct msg_location *min = *minp;
360 || loc->start.line < min->start.line
361 || (loc->start.line == min->start.line
362 && loc->start.column < min->start.column)))
365 const struct msg_location *max = *maxp;
368 || loc->end.line > max->end.line
369 || (loc->end.line == max->end.line
370 && loc->end.column > max->end.column)))
376 if (is_composite (node->type))
377 for (size_t i = 0; i < node->n_args; i++)
378 expr_location__ (e, node->args[i], minp, maxp);
381 /* Returns the source code location corresponding to expression NODE, computing
382 it lazily if needed. */
383 const struct msg_location *
384 expr_location (const struct expression *e_, const struct expr_node *node_)
386 struct expr_node *node = CONST_CAST (struct expr_node *, node_);
392 struct expression *e = CONST_CAST (struct expression *, e_);
393 const struct msg_location *min = NULL;
394 const struct msg_location *max = NULL;
395 expr_location__ (e, node, &min, &max);
398 node->location = msg_location_dup (min);
399 node->location->end = max->end;
400 pool_register (e->expr_pool, free_msg_location, node->location);
403 return node->location;
406 /* Sets e->location to the tokens in S's lexer from offset START_OFS to the
407 token before the current one. Has no effect if E already has a location or
410 expr_add_location (struct lexer *lexer, struct expression *e,
411 int start_ofs, struct expr_node *node)
413 if (node && !node->location)
415 node->location = lex_ofs_location (lexer, start_ofs, lex_ofs (lexer) - 1);
416 pool_register (e->expr_pool, free_msg_location, node->location);
421 type_coercion__ (struct expression *e, struct expr_node *node, size_t arg_idx,
424 assert (!!do_coercion == (e != NULL));
429 struct expr_node **argp = &node->args[arg_idx];
430 struct expr_node *arg = *argp;
434 const struct operation *op = &operations[node->type];
435 atom_type required_type = op->args[MIN (arg_idx, op->n_args - 1)];
436 atom_type actual_type = expr_node_returns (arg);
437 if (actual_type == required_type)
443 switch (required_type)
446 if (actual_type == OP_boolean)
448 /* To enforce strict typing rules, insert Boolean to
449 numeric "conversion". This conversion is a no-op,
450 so it will be removed later. */
452 *argp = expr_allocate_unary (e, OP_BOOLEAN_TO_NUM, arg);
455 else if (actual_type == OP_num_vec_elem)
458 arg->type = OP_VEC_ELEM_NUM;
464 /* No coercion to string. */
468 if (actual_type == OP_number)
470 /* Convert numeric to boolean. */
472 *argp = expr_allocate_binary (e, OP_OPERAND_TO_BOOLEAN, arg,
473 expr_allocate_expr_node (e, node));
479 if (actual_type == OP_number)
481 /* Convert number to integer. */
483 *argp = expr_allocate_unary (e, OP_NUM_TO_INTEGER, arg);
489 /* We never coerce to OP_format, only to OP_ni_format or OP_no_format. */
494 if (arg->type == OP_format
495 && fmt_check_input (&arg->format)
496 && fmt_check_type_compat (&arg->format, VAL_NUMERIC))
500 arg->type = OP_ni_format;
508 if (arg->type == OP_format
509 && fmt_check_output (&arg->format)
510 && fmt_check_type_compat (&arg->format, VAL_NUMERIC))
514 arg->type = OP_no_format;
521 if (arg->type == OP_NUM_VAR)
524 *argp = arg->args[0];
530 if (arg->type == OP_STR_VAR)
533 *argp = arg->args[0];
539 if (arg->type == OP_NUM_VAR || arg->type == OP_STR_VAR)
542 *argp = arg->args[0];
548 if (arg->type == OP_number
549 && floor (arg->number) == arg->number
550 && arg->number > 0 && arg->number < INT_MAX)
553 *argp = expr_allocate_pos_int (e, arg->number);
565 type_coercion (struct expression *e, struct expr_node *node, size_t arg_idx)
567 return type_coercion__ (e, node, arg_idx, true);
571 is_coercible (const struct expr_node *node_, size_t arg_idx)
573 struct expr_node *node = CONST_CAST (struct expr_node *, node_);
574 return type_coercion__ (NULL, node, arg_idx, false);
577 /* How to parse an operator.
579 Some operators support both numeric and string operators. For those,
580 'num_op' and 'str_op' are both nonzero. Otherwise, only one 'num_op' is
581 nonzero. (PSPP doesn't have any string-only operators.) */
584 enum token_type token; /* Operator token. */
585 operation_type num_op; /* Operation for numeric operands (or 0). */
586 operation_type str_op; /* Operation for string operands (or 0). */
589 static operation_type
590 match_operator (struct lexer *lexer, const struct operator ops[], size_t n_ops,
591 const struct expr_node *lhs)
593 bool lhs_is_numeric = operations[lhs->type].returns != OP_string;
594 for (const struct operator *op = ops; op < ops + n_ops; op++)
595 if (lex_token (lexer) == op->token)
597 if (op->token != T_NEG_NUM)
600 return op->str_op && !lhs_is_numeric ? op->str_op : op->num_op;
606 operator_name (enum token_type token)
608 return token == T_NEG_NUM ? "-" : token_type_to_string (token);
611 static struct expr_node *
612 parse_binary_operators__ (struct lexer *lexer, struct expression *e,
613 const struct operator ops[], size_t n_ops,
614 parse_recursively_func *parse_next_level,
615 const char *chain_warning, struct expr_node *lhs)
617 for (int op_count = 0; ; op_count++)
619 enum token_type token = lex_token (lexer);
620 operation_type optype = match_operator (lexer, ops, n_ops, lhs);
623 if (op_count > 1 && chain_warning)
624 msg_at (SW, expr_location (e, lhs), "%s", chain_warning);
629 struct expr_node *rhs = parse_next_level (lexer, e);
633 struct expr_node *node = expr_allocate_binary (e, optype, lhs, rhs);
634 if (!is_coercible (node, 0) || !is_coercible (node, 1))
637 for (size_t i = 0; i < n_ops; i++)
638 if (ops[i].token == token)
639 both = ops[i].num_op && ops[i].str_op;
641 const char *name = operator_name (token);
643 msg_at (SE, expr_location (e, node),
644 _("Both operands of %s must have the same type."), name);
645 else if (operations[node->type].args[0] != OP_string)
646 msg_at (SE, expr_location (e, node),
647 _("Both operands of %s must be numeric."), name);
651 msg_at (SN, expr_location (e, node->args[0]),
652 _("This operand has type '%s'."),
653 atom_type_name (expr_node_returns (node->args[0])));
654 msg_at (SN, expr_location (e, node->args[1]),
655 _("This operand has type '%s'."),
656 atom_type_name (expr_node_returns (node->args[1])));
661 if (!type_coercion (e, node, 0) || !type_coercion (e, node, 1))
668 static struct expr_node *
669 parse_binary_operators (struct lexer *lexer, struct expression *e,
670 const struct operator ops[], size_t n_ops,
671 parse_recursively_func *parse_next_level,
672 const char *chain_warning)
674 struct expr_node *lhs = parse_next_level (lexer, e);
678 return parse_binary_operators__ (lexer, e, ops, n_ops, parse_next_level,
682 static struct expr_node *
683 parse_inverting_unary_operator (struct lexer *lexer, struct expression *e,
684 const struct operator *op,
685 parse_recursively_func *parse_next_level)
687 int start_ofs = lex_ofs (lexer);
688 unsigned int op_count = 0;
689 while (lex_match (lexer, op->token))
692 struct expr_node *inner = parse_next_level (lexer, e);
693 if (!inner || !op_count)
696 struct expr_node *outer = expr_allocate_unary (e, op->num_op, inner);
697 expr_add_location (lexer, e, start_ofs, outer);
699 if (!type_coercion (e, outer, 0))
701 assert (operations[outer->type].args[0] != OP_string);
703 const char *name = operator_name (op->token);
704 msg_at (SE, expr_location (e, outer),
705 _("The unary %s operator requires a numeric operand."), name);
707 msg_at (SN, expr_location (e, outer->args[0]),
708 _("The operand of %s has type '%s'."),
709 name, atom_type_name (expr_node_returns (outer->args[0])));
714 return op_count % 2 ? outer : outer->args[0];
717 /* Parses the OR level. */
718 static struct expr_node *
719 parse_or (struct lexer *lexer, struct expression *e)
721 static const struct operator op = { .token = T_OR, .num_op = OP_OR };
722 return parse_binary_operators (lexer, e, &op, 1, parse_and, NULL);
725 /* Parses the AND level. */
726 static struct expr_node *
727 parse_and (struct lexer *lexer, struct expression *e)
729 static const struct operator op = { .token = T_AND, .num_op = OP_AND };
731 return parse_binary_operators (lexer, e, &op, 1, parse_not, NULL);
734 /* Parses the NOT level. */
735 static struct expr_node *
736 parse_not (struct lexer *lexer, struct expression *e)
738 static const struct operator op = { .token = T_NOT, .num_op = OP_NOT };
739 return parse_inverting_unary_operator (lexer, e, &op, parse_rel);
742 /* Parse relational operators. */
743 static struct expr_node *
744 parse_rel (struct lexer *lexer, struct expression *e)
746 const char *chain_warning =
747 _("Chaining relational operators (e.g. `a < b < c') will "
748 "not produce the mathematically expected result. "
749 "Use the AND logical operator to fix the problem "
750 "(e.g. `a < b AND b < c'). "
751 "To disable this warning, insert parentheses.");
753 static const struct operator ops[] =
755 { .token = T_EQUALS, .num_op = OP_EQ, .str_op = OP_EQ_STRING },
756 { .token = T_EQ, .num_op = OP_EQ, .str_op = OP_EQ_STRING },
757 { .token = T_GE, .num_op = OP_GE, .str_op = OP_GE_STRING },
758 { .token = T_GT, .num_op = OP_GT, .str_op = OP_GT_STRING },
759 { .token = T_LE, .num_op = OP_LE, .str_op = OP_LE_STRING },
760 { .token = T_LT, .num_op = OP_LT, .str_op = OP_LT_STRING },
761 { .token = T_NE, .num_op = OP_NE, .str_op = OP_NE_STRING },
764 return parse_binary_operators (lexer, e, ops, sizeof ops / sizeof *ops,
765 parse_add, chain_warning);
768 /* Parses the addition and subtraction level. */
769 static struct expr_node *
770 parse_add (struct lexer *lexer, struct expression *e)
772 static const struct operator ops[] =
774 { .token = T_PLUS, .num_op = OP_ADD },
775 { .token = T_DASH, .num_op = OP_SUB },
776 { .token = T_NEG_NUM, .num_op = OP_ADD },
779 return parse_binary_operators (lexer, e, ops, sizeof ops / sizeof *ops,
783 /* Parses the multiplication and division level. */
784 static struct expr_node *
785 parse_mul (struct lexer *lexer, struct expression *e)
787 static const struct operator ops[] =
789 { .token = T_ASTERISK, .num_op = OP_MUL },
790 { .token = T_SLASH, .num_op = OP_DIV },
793 return parse_binary_operators (lexer, e, ops, sizeof ops / sizeof *ops,
797 /* Parses the unary minus level. */
798 static struct expr_node *
799 parse_neg (struct lexer *lexer, struct expression *e)
801 static const struct operator op = { .token = T_DASH, .num_op = OP_NEG };
802 return parse_inverting_unary_operator (lexer, e, &op, parse_exp);
805 static struct expr_node *
806 parse_exp (struct lexer *lexer, struct expression *e)
808 static const struct operator op = { .token = T_EXP, .num_op = OP_POW };
810 const char *chain_warning =
811 _("The exponentiation operator (`**') is left-associative: "
812 "`a**b**c' equals `(a**b)**c', not `a**(b**c)'. "
813 "To disable this warning, insert parentheses.");
815 if (lex_token (lexer) != T_NEG_NUM || lex_next_token (lexer, 1) != T_EXP)
816 return parse_binary_operators (lexer, e, &op, 1,
817 parse_primary, chain_warning);
819 /* Special case for situations like "-5**6", which must be parsed as
822 int start_ofs = lex_ofs (lexer);
823 struct expr_node *lhs = expr_allocate_number (e, -lex_tokval (lexer));
825 expr_add_location (lexer, e, start_ofs, lhs);
827 struct expr_node *node = parse_binary_operators__ (
828 lexer, e, &op, 1, parse_primary, chain_warning, lhs);
832 node = expr_allocate_unary (e, OP_NEG, node);
833 expr_add_location (lexer, e, start_ofs, node);
838 ymd_to_offset (int y, int m, int d)
841 double retval = calendar_gregorian_to_offset (
842 y, m, d, settings_get_fmt_settings (), &error);
845 msg (SE, "%s", error);
851 static struct expr_node *
852 expr_date (struct expression *e, int year_digits)
854 static const char *months[12] =
856 "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
857 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
860 time_t last_proc_time = time_of_last_procedure (e->ds);
861 struct tm *time = localtime (&last_proc_time);
863 char *tmp = (year_digits == 2
864 ? xasprintf ("%02d-%s-%02d", time->tm_mday, months[time->tm_mon],
866 : xasprintf ("%02d-%s-%04d", time->tm_mday, months[time->tm_mon],
867 time->tm_year + 1900));
869 struct substring s = ss_clone_pool (ss_cstr (tmp), e->expr_pool);
872 return expr_allocate_string (e, s);
875 /* Parses system variables. */
876 static struct expr_node *
877 parse_sysvar (struct lexer *lexer, struct expression *e)
879 if (lex_match_id (lexer, "$CASENUM"))
880 return expr_allocate_nullary (e, OP_CASENUM);
881 else if (lex_match_id (lexer, "$DATE"))
882 return expr_date (e, 2);
883 else if (lex_match_id (lexer, "$DATE11"))
884 return expr_date (e, 4);
885 else if (lex_match_id (lexer, "$TRUE"))
886 return expr_allocate_boolean (e, 1.0);
887 else if (lex_match_id (lexer, "$FALSE"))
888 return expr_allocate_boolean (e, 0.0);
889 else if (lex_match_id (lexer, "$SYSMIS"))
890 return expr_allocate_number (e, SYSMIS);
891 else if (lex_match_id (lexer, "$JDATE"))
893 time_t time = time_of_last_procedure (e->ds);
894 struct tm *tm = localtime (&time);
895 return expr_allocate_number (e, ymd_to_offset (tm->tm_year + 1900,
899 else if (lex_match_id (lexer, "$TIME"))
901 time_t time = time_of_last_procedure (e->ds);
902 struct tm *tm = localtime (&time);
903 return expr_allocate_number (e, ymd_to_offset (tm->tm_year + 1900,
906 + tm->tm_hour * 60 * 60.
910 else if (lex_match_id (lexer, "$LENGTH"))
911 return expr_allocate_number (e, settings_get_viewlength ());
912 else if (lex_match_id (lexer, "$WIDTH"))
913 return expr_allocate_number (e, settings_get_viewwidth ());
916 lex_error (lexer, _("Unknown system variable %s."), lex_tokcstr (lexer));
921 /* Parses numbers, varnames, etc. */
922 static struct expr_node *
923 parse_primary__ (struct lexer *lexer, struct expression *e)
925 switch (lex_token (lexer))
928 if (lex_next_token (lexer, 1) == T_LPAREN)
930 /* An identifier followed by a left parenthesis may be
931 a vector element reference. If not, it's a function
933 if (e->ds != NULL && dict_lookup_vector (dataset_dict (e->ds), lex_tokcstr (lexer)) != NULL)
934 return parse_vector_element (lexer, e);
936 return parse_function (lexer, e);
938 else if (lex_tokcstr (lexer)[0] == '$')
940 /* $ at the beginning indicates a system variable. */
941 return parse_sysvar (lexer, e);
943 else if (e->ds != NULL && dict_lookup_var (dataset_dict (e->ds), lex_tokcstr (lexer)))
945 /* It looks like a user variable.
946 (It could be a format specifier, but we'll assume
947 it's a variable unless proven otherwise. */
948 return allocate_unary_variable (e, parse_variable (lexer, dataset_dict (e->ds)));
952 /* Try to parse it as a format specifier. */
957 ok = parse_format_specifier (lexer, &fmt);
961 return expr_allocate_format (e, &fmt);
963 /* All attempts failed. */
964 lex_error (lexer, _("Unknown identifier %s."), lex_tokcstr (lexer));
972 struct expr_node *node = expr_allocate_number (e, lex_tokval (lexer));
979 const char *dict_encoding;
980 struct expr_node *node;
983 dict_encoding = (e->ds != NULL
984 ? dict_get_encoding (dataset_dict (e->ds))
986 s = recode_string_pool (dict_encoding, "UTF-8", lex_tokcstr (lexer),
987 ss_length (lex_tokss (lexer)), e->expr_pool);
988 node = expr_allocate_string (e, ss_cstr (s));
997 struct expr_node *node = parse_or (lexer, e);
998 return !node || !lex_force_match (lexer, T_RPAREN) ? NULL : node;
1002 lex_error (lexer, NULL);
1007 static struct expr_node *
1008 parse_primary (struct lexer *lexer, struct expression *e)
1010 int start_ofs = lex_ofs (lexer);
1011 struct expr_node *node = parse_primary__ (lexer, e);
1012 expr_add_location (lexer, e, start_ofs, node);
1016 static struct expr_node *
1017 parse_vector_element (struct lexer *lexer, struct expression *e)
1019 int vector_start_ofs = lex_ofs (lexer);
1021 /* Find vector, skip token.
1022 The caller must already have verified that the current token
1023 is the name of a vector. */
1024 const struct vector *vector = dict_lookup_vector (dataset_dict (e->ds),
1025 lex_tokcstr (lexer));
1026 assert (vector != NULL);
1029 /* Skip left parenthesis token.
1030 The caller must have verified that the lookahead is a left
1032 assert (lex_token (lexer) == T_LPAREN);
1035 int element_start_ofs = lex_ofs (lexer);
1036 struct expr_node *element = parse_or (lexer, e);
1039 expr_add_location (lexer, e, element_start_ofs, element);
1041 if (!lex_match (lexer, T_RPAREN))
1044 operation_type type = (vector_get_type (vector) == VAL_NUMERIC
1045 ? OP_VEC_ELEM_NUM_RAW : OP_VEC_ELEM_STR);
1046 struct expr_node *node = expr_allocate_binary (
1047 e, type, element, expr_allocate_vector (e, vector));
1048 expr_add_location (lexer, e, vector_start_ofs, node);
1050 if (!type_coercion (e, node, 0))
1052 msg_at (SE, expr_location (e, node),
1053 _("A vector index must be numeric."));
1055 msg_at (SN, expr_location (e, node->args[0]),
1056 _("This vector index has type '%s'."),
1057 atom_type_name (expr_node_returns (node->args[0])));
1065 /* Individual function parsing. */
1067 const struct operation operations[OP_first + n_OP] = {
1068 #include "parse.inc"
1072 word_matches (const char **test, const char **name)
1074 size_t test_len = strcspn (*test, ".");
1075 size_t name_len = strcspn (*name, ".");
1076 if (test_len == name_len)
1078 if (buf_compare_case (*test, *name, test_len))
1081 else if (test_len < 3 || test_len > name_len)
1085 if (buf_compare_case (*test, *name, test_len))
1091 if (**test != **name)
1102 /* Returns 0 if TOKEN and FUNC do not match,
1103 1 if TOKEN is an acceptable abbreviation for FUNC,
1104 2 if TOKEN equals FUNC. */
1106 compare_function_names (const char *token_, const char *func_)
1108 const char *token = token_;
1109 const char *func = func_;
1110 while (*token || *func)
1111 if (!word_matches (&token, &func))
1113 return !c_strcasecmp (token_, func_) ? 2 : 1;
1117 lookup_function (const char *token,
1118 const struct operation **first,
1119 const struct operation **last)
1121 *first = *last = NULL;
1122 const struct operation *best = NULL;
1124 for (const struct operation *f = operations + OP_function_first;
1125 f <= operations + OP_function_last; f++)
1127 int score = compare_function_names (token, f->name);
1133 else if (score == 1 && !(f->flags & OPF_NO_ABBREV) && !best)
1142 const struct operation *f = best;
1143 while (f <= operations + OP_function_last
1144 && !c_strcasecmp (f->name, best->name))
1152 extract_min_valid (const char *s)
1154 char *p = strrchr (s, '.');
1156 || p[1] < '0' || p[1] > '9'
1157 || strspn (p + 1, "0123456789") != strlen (p + 1))
1160 return atoi (p + 1);
1164 match_function__ (struct expr_node *node, const struct operation *f)
1166 if (node->n_args < f->n_args
1167 || (node->n_args > f->n_args && (f->flags & OPF_ARRAY_OPERAND) == 0)
1168 || node->n_args - (f->n_args - 1) < f->array_min_elems)
1171 node->type = f - operations;
1172 for (size_t i = 0; i < node->n_args; i++)
1173 if (!is_coercible (node, i))
1179 static const struct operation *
1180 match_function (struct expr_node *node,
1181 const struct operation *first, const struct operation *last)
1183 for (const struct operation *f = first; f < last; f++)
1184 if (match_function__ (node, f))
1190 validate_function_args (const struct expression *e, const struct expr_node *n,
1191 const struct operation *f, int n_args, int min_valid)
1193 /* Count the function arguments that go into the trailing array (if any). We
1194 know that there must be at least the minimum number because
1195 match_function() already checked. */
1196 int array_n_args = n_args - (f->n_args - 1);
1197 assert (array_n_args >= f->array_min_elems);
1199 if ((f->flags & OPF_ARRAY_OPERAND)
1200 && array_n_args % f->array_granularity != 0)
1202 /* RANGE is the only case we have so far. It has paired arguments with
1203 one initial argument, and that's the only special case we deal with
1205 assert (f->array_granularity == 2);
1206 assert (n_args % 2 == 0);
1207 msg_at (SE, expr_location (e, n),
1208 _("%s must have an odd number of arguments."), f->prototype);
1212 if (min_valid != -1)
1214 if (f->array_min_elems == 0)
1216 assert ((f->flags & OPF_MIN_VALID) == 0);
1217 msg_at (SE, expr_location (e, n),
1218 _("%s function cannot accept suffix .%d to specify the "
1219 "minimum number of valid arguments."),
1220 f->prototype, min_valid);
1225 assert (f->flags & OPF_MIN_VALID);
1226 if (min_valid > array_n_args)
1228 msg_at (SE, expr_location (e, n),
1229 _("For %s with %d arguments, at most %d (not %d) may be "
1230 "required to be valid."),
1231 f->prototype, n_args, array_n_args, min_valid);
1241 add_arg (struct expr_node ***args, size_t *n_args, size_t *allocated_args,
1242 struct expr_node *arg,
1243 struct expression *e, struct lexer *lexer, int arg_start_ofs)
1245 if (*n_args >= *allocated_args)
1246 *args = x2nrealloc (*args, allocated_args, sizeof **args);
1248 expr_add_location (lexer, e, arg_start_ofs, arg);
1249 (*args)[(*n_args)++] = arg;
1253 put_invocation (struct string *s,
1254 const char *func_name, struct expr_node *node)
1258 ds_put_format (s, "%s(", func_name);
1259 for (i = 0; i < node->n_args; i++)
1262 ds_put_cstr (s, ", ");
1263 ds_put_cstr (s, operations[expr_node_returns (node->args[i])].prototype);
1265 ds_put_byte (s, ')');
1269 no_match (struct expression *e, const char *func_name, struct expr_node *node,
1270 const struct operation *ops, size_t n)
1278 ds_put_format (&s, _("Type mismatch invoking %s as "), ops->prototype);
1279 put_invocation (&s, func_name, node);
1283 ds_put_cstr (&s, _("Function invocation "));
1284 put_invocation (&s, func_name, node);
1285 ds_put_cstr (&s, _(" does not match any known function. Candidates are:"));
1287 for (size_t i = 0; i < n; i++)
1288 ds_put_format (&s, "\n%s", ops[i].prototype);
1290 ds_put_byte (&s, '.');
1292 msg_at (SE, expr_location (e, node), "%s", ds_cstr (&s));
1294 if (n == 1 && ops->n_args == node->n_args)
1296 for (size_t i = 0; i < node->n_args; i++)
1297 if (!is_coercible (node, i))
1299 atom_type expected = ops->args[i];
1300 atom_type actual = expr_node_returns (node->args[i]);
1301 if ((expected == OP_ni_format || expected == OP_no_format)
1302 && actual == OP_format)
1304 const struct fmt_spec *f = &node->args[i]->format;
1305 char *error = fmt_check__ (f, (ops->args[i] == OP_ni_format
1306 ? FMT_FOR_INPUT : FMT_FOR_OUTPUT));
1308 error = fmt_check_type_compat__ (f, VAL_NUMERIC);
1311 msg_at (SN, expr_location (e, node->args[i]), "%s", error);
1316 msg_at (SN, expr_location (e, node->args[i]),
1317 _("This argument has type '%s' but '%s' is required."),
1318 atom_type_name (actual), atom_type_name (expected));
1325 static struct expr_node *
1326 parse_function (struct lexer *lexer, struct expression *e)
1328 struct string func_name;
1329 ds_init_substring (&func_name, lex_tokss (lexer));
1331 int min_valid = extract_min_valid (lex_tokcstr (lexer));
1333 const struct operation *first, *last;
1334 if (!lookup_function (lex_tokcstr (lexer), &first, &last))
1336 lex_error (lexer, _("No function or vector named %s."),
1337 lex_tokcstr (lexer));
1338 ds_destroy (&func_name);
1342 int func_start_ofs = lex_ofs (lexer);
1344 if (!lex_force_match (lexer, T_LPAREN))
1346 ds_destroy (&func_name);
1350 struct expr_node **args = NULL;
1352 size_t allocated_args = 0;
1353 if (lex_token (lexer) != T_RPAREN)
1356 int arg_start_ofs = lex_ofs (lexer);
1357 if (lex_token (lexer) == T_ID
1358 && lex_next_token (lexer, 1) == T_TO)
1360 const struct variable **vars;
1363 if (!parse_variables_const (lexer, dataset_dict (e->ds),
1364 &vars, &n_vars, PV_SINGLE))
1366 for (size_t i = 0; i < n_vars; i++)
1367 add_arg (&args, &n_args, &allocated_args,
1368 allocate_unary_variable (e, vars[i]),
1369 e, lexer, arg_start_ofs);
1374 struct expr_node *arg = parse_or (lexer, e);
1378 add_arg (&args, &n_args, &allocated_args, arg,
1379 e, lexer, arg_start_ofs);
1381 if (lex_match (lexer, T_RPAREN))
1383 else if (!lex_match (lexer, T_COMMA))
1385 lex_error_expecting (lexer, "`,'", "`)'");
1390 struct expr_node *n = expr_allocate_composite (e, first - operations,
1392 expr_add_location (lexer, e, func_start_ofs, n);
1393 const struct operation *f = match_function (n, first, last);
1396 no_match (e, ds_cstr (&func_name), n, first, last - first);
1399 n->type = f - operations;
1400 n->min_valid = min_valid != -1 ? min_valid : f->array_min_elems;
1402 for (size_t i = 0; i < n_args; i++)
1403 if (!type_coercion (e, n, i))
1405 /* Unreachable because match_function already checked that the
1406 arguments were coercible. */
1409 if (!validate_function_args (e, n, f, n_args, min_valid))
1412 if ((f->flags & OPF_EXTENSION) && settings_get_syntax () == COMPATIBLE)
1413 msg_at (SW, expr_location (e, n),
1414 _("%s is a PSPP extension."), f->prototype);
1415 if (f->flags & OPF_UNIMPLEMENTED)
1417 msg_at (SE, expr_location (e, n),
1418 _("%s is not available in this version of PSPP."), f->prototype);
1421 if ((f->flags & OPF_PERM_ONLY) &&
1422 proc_in_temporary_transformations (e->ds))
1424 msg_at (SE, expr_location (e, n),
1425 _("%s may not appear after %s."), f->prototype, "TEMPORARY");
1429 if (n->type == OP_LAG_Vn || n->type == OP_LAG_Vs)
1430 dataset_need_lag (e->ds, 1);
1431 else if (n->type == OP_LAG_Vnn || n->type == OP_LAG_Vsn)
1433 assert (n->n_args == 2);
1434 assert (n->args[1]->type == OP_pos_int);
1435 dataset_need_lag (e->ds, n->args[1]->integer);
1439 ds_destroy (&func_name);
1444 ds_destroy (&func_name);
1448 /* Utility functions. */
1450 static struct expression *
1451 expr_create (struct dataset *ds)
1453 struct pool *pool = pool_create ();
1454 struct expression *e = pool_alloc (pool, sizeof *e);
1455 *e = (struct expression) {
1458 .eval_pool = pool_create_subpool (pool),
1464 expr_node_returns (const struct expr_node *n)
1467 assert (is_operation (n->type));
1468 if (is_atom (n->type))
1470 else if (is_composite (n->type))
1471 return operations[n->type].returns;
1477 atom_type_name (atom_type type)
1479 assert (is_atom (type));
1481 /* The Boolean type is purely an internal concept that the documentation
1482 doesn't mention, so it might confuse users if we talked about them in
1484 return type == OP_boolean ? "number" : operations[type].name;
1488 expr_allocate_nullary (struct expression *e, operation_type op)
1490 return expr_allocate_composite (e, op, NULL, 0);
1494 expr_allocate_unary (struct expression *e, operation_type op,
1495 struct expr_node *arg0)
1497 return expr_allocate_composite (e, op, &arg0, 1);
1501 expr_allocate_binary (struct expression *e, operation_type op,
1502 struct expr_node *arg0, struct expr_node *arg1)
1504 struct expr_node *args[2];
1507 return expr_allocate_composite (e, op, args, 2);
1511 expr_allocate_composite (struct expression *e, operation_type op,
1512 struct expr_node **args, size_t n_args)
1514 for (size_t i = 0; i < n_args; i++)
1518 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1519 *n = (struct expr_node) {
1522 .args = pool_clone (e->expr_pool, args, sizeof *n->args * n_args),
1528 expr_allocate_number (struct expression *e, double d)
1530 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1531 *n = (struct expr_node) { .type = OP_number, .number = d };
1536 expr_allocate_boolean (struct expression *e, double b)
1538 assert (b == 0.0 || b == 1.0 || b == SYSMIS);
1540 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1541 *n = (struct expr_node) { .type = OP_boolean, .number = b };
1546 expr_allocate_integer (struct expression *e, int i)
1548 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1549 *n = (struct expr_node) { .type = OP_integer, .integer = i };
1554 expr_allocate_pos_int (struct expression *e, int i)
1558 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1559 *n = (struct expr_node) { .type = OP_pos_int, .integer = i };
1564 expr_allocate_vector (struct expression *e, const struct vector *vector)
1566 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1567 *n = (struct expr_node) { .type = OP_vector, .vector = vector };
1572 expr_allocate_string (struct expression *e, struct substring s)
1574 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1575 *n = (struct expr_node) { .type = OP_string, .string = s };
1580 expr_allocate_variable (struct expression *e, const struct variable *v)
1582 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1583 *n = (struct expr_node) {
1584 .type = var_is_numeric (v) ? OP_num_var : OP_str_var,
1591 expr_allocate_format (struct expression *e, const struct fmt_spec *format)
1593 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1594 *n = (struct expr_node) { .type = OP_format, .format = *format };
1599 expr_allocate_expr_node (struct expression *e,
1600 const struct expr_node *expr_node)
1602 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1603 *n = (struct expr_node) { .type = OP_expr_node, .expr_node = expr_node };
1607 /* Allocates a unary composite node that represents the value of
1608 variable V in expression E. */
1609 static struct expr_node *
1610 allocate_unary_variable (struct expression *e, const struct variable *v)
1613 return expr_allocate_unary (e, var_is_numeric (v) ? OP_NUM_VAR : OP_STR_VAR,
1614 expr_allocate_variable (e, v));
1617 /* Export function details to other modules. */
1619 /* Returns the operation structure for the function with the
1621 const struct operation *
1622 expr_get_function (size_t idx)
1624 assert (idx < n_OP_function);
1625 return &operations[OP_function_first + idx];
1628 /* Returns the number of expression functions. */
1630 expr_get_n_functions (void)
1632 return n_OP_function;
1635 /* Returns the name of operation OP. */
1637 expr_operation_get_name (const struct operation *op)
1642 /* Returns the human-readable prototype for operation OP. */
1644 expr_operation_get_prototype (const struct operation *op)
1646 return op->prototype;
1649 /* Returns the number of arguments for operation OP. */
1651 expr_operation_get_n_args (const struct operation *op)