1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/scan.h"
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28 #include "libpspp/i18n.h"
30 #include "gl/c-ctype.h"
31 #include "gl/c-strtod.h"
32 #include "gl/xmemdup0.h"
35 #define _(msgid) gettext (msgid)
37 /* Returns the integer value of (hex) digit C. */
/* Hex letters are accepted in either case. */
53 case 'a': case 'A': return 10;
54 case 'b': case 'B': return 11;
55 case 'c': case 'C': return 12;
56 case 'd': case 'D': return 13;
57 case 'e': case 'E': return 14;
58 case 'f': case 'F': return 15;
/* Any character without its own case yields INT_MAX, which is far above
   15, so callers that test the result with `>= 16' reject it. */
59 default: return INT_MAX;
/* Converts quoted string segment IN into a T_STRING token stored in
   *TOKEN.  The token's string lives in a buffer obtained from xmalloc and
   is followed by a null terminator that is not counted in its length. */
64 scan_quoted_string (struct substring in, struct token *token)
66 /* Trim ' or " from front and back. */
67 int quote = in.string[0];
/* The extra byte leaves room for the null terminator written below. */
71 struct substring out = { .string = xmalloc (in.length + 1) };
75 size_t pos = ss_find_byte (in, quote);
/* Copy everything up to and including the quote found at POS, then step
   the input one byte further than was copied.  The skipped byte is
   presumably the second quote of a doubled pair -- confirm against the
   segmenter's quoting rules. */
79 memcpy (ss_end (out), in.string, pos + 1);
80 out.length += pos + 1;
81 ss_advance (&in, pos + 2);
/* Append whatever remains of IN and terminate the result. */
84 memcpy (ss_end (out), in.string, in.length);
85 out.length += in.length;
86 out.string[out.length] = '\0';
88 *token = (struct token) { .type = T_STRING, .string = out };
/* Decodes the pairs of hex digits in IN into bytes stored in *OUT.  When
   IN is malformed, returns an error message built with xasprintf. */
92 scan_hex_string__ (struct substring in, struct substring *out)
/* Each output byte needs exactly two digits. */
94 if (in.length % 2 != 0)
95 return xasprintf (_("String of hex digits has %zu characters, which "
96 "is not a multiple of 2."), in.length);
/* Reserve one byte per digit pair plus one spare byte;
   tokenize_string_segment__ stores a null terminator at
   out.string[out.length]. */
98 ss_realloc (out, in.length / 2 + 1);
99 uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
100 out->length = in.length / 2;
101 for (size_t i = 0; i < in.length; i += 2)
103 int hi = digit_value (in.string[i]);
104 int lo = digit_value (in.string[i + 1]);
/* Report whichever digit of the pair is invalid; the first one wins if
   both are. */
106 if (hi >= 16 || lo >= 16)
107 return xasprintf (_("`%c' is not a valid hex digit."),
108 in.string[hi >= 16 ? i : i + 1]);
110 *dst++ = hi * 16 + lo;
/* Interprets IN as the hex digits of a single Unicode code point and
   stores its encoding, as produced by u8_uctomb, in *OUT.  When IN is
   malformed, returns an error message built with xasprintf. */
117 scan_unicode_string__ (struct substring in, struct substring *out)
/* Between 1 and 8 hex digits are accepted. */
119 if (in.length < 1 || in.length > 8)
120 return xasprintf (_("Unicode string contains %zu bytes, which is "
121 "not in the valid range of 1 to 8 bytes."),
/* Accumulate the digits, most significant first, into UC. */
125 for (size_t i = 0; i < in.length; i++)
127 int digit = digit_value (in.string[i]);
129 return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
130 uc = uc * 16 + digit;
/* Reject the range 0xd800...0xdfff and anything above 0x10ffff. */
133 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
134 return xasprintf (_("U+%04llX is not a valid Unicode code point."),
/* Reserve the 4 bytes offered to u8_uctomb plus one spare byte;
   tokenize_string_segment__ stores a null terminator at
   out.string[out.length]. */
137 ss_realloc (out, 4 + 1);
/* The value returned by u8_uctomb becomes the length of *OUT. */
138 out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
/* Returns the token type for WORD, dispatching on its first byte
   converted to upper case.  NOTE(review): the visible code reads
   word.string[1] without a length check, so it appears to assume that WORD
   is a known reserved word at least two bytes long -- confirm against the
   caller. */
143 static enum token_type
144 scan_reserved_word__ (struct substring word)
146 switch (c_toupper (word.string[0]))
/* The second letter separates GE from GT. */
155 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
/* The second letter separates LE from LT. */
158 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
/* A two-byte word is NE; any other length is NOT. */
161 return word.length == 2 ? T_NE : T_NOT;
/* The second letter separates ALL from AND. */
170 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
/* Returns the token type for the single punctuation byte C0.  A byte
   without a case of its own yields T_MACRO_PUNCT. */
179 static enum token_type
180 scan_punct1__ (char c0)
184 case '(': return T_LPAREN;
185 case ')': return T_RPAREN;
186 case ',': return T_COMMA;
187 case '=': return T_EQUALS;
188 case '-': return T_DASH;
189 case '[': return T_LBRACK;
190 case ']': return T_RBRACK;
/* `&', `|' and `~' share the token types T_AND, T_OR and T_NOT. */
191 case '&': return T_AND;
192 case '|': return T_OR;
193 case '+': return T_PLUS;
194 case '/': return T_SLASH;
195 case '*': return T_ASTERISK;
196 case '<': return T_LT;
197 case '>': return T_GT;
198 case '~': return T_NOT;
199 default: return T_MACRO_PUNCT;
/* Returns the token type for the two-byte punctuator made of C0 followed
   by C1. */
205 static enum token_type
206 scan_punct2__ (char c0, char c1)
/* NOTE(review): presumably the branch for C0 == '<', where `<=' is T_LE
   and any other second byte is treated as T_NE -- confirm against the
   enclosing case label, which is not visible here. */
214 return c1 == '=' ? T_LE : T_NE;
/* Returns the token type for punctuation segment S.  A one-byte segment
   is classified by scan_punct1__; any other length is handed to
   scan_punct2__ along with the first two bytes of S.  NOTE(review): this
   assumes S is never empty and never longer than two bytes -- confirm
   against the segmenter. */
232 static enum token_type
233 scan_punct__ (struct substring s)
235 return (s.length == 1
236 ? scan_punct1__ (s.string[0])
237 : scan_punct2__ (s.string[0], s.string[1]));
/* Parses the number in S and stores it in *TOKEN as a T_NEG_NUM or
   T_POS_NUM token. */
241 scan_number__ (struct substring s, struct token *token)
/* c_strtod is given a null-terminated string.  When S plus a terminator
   fits in BUF, the copy is made there. */
246 if (s.length < sizeof buf)
249 memcpy (buf, s.string, s.length);
250 buf[s.length] = '\0';
/* Alternative to the copy into BUF: a null-terminated copy from
   xmemdup0. */
253 p = xmemdup0 (s.string, s.length);
/* NEGATIVE is 0 or 1, so `p + negative' skips a leading `-'; the sign is
   re-applied to the parsed magnitude below. */
255 bool negative = *p == '-';
256 double x = c_strtod (p + negative, NULL);
257 *token = (struct token) {
258 .type = negative ? T_NEG_NUM : T_POS_NUM,
259 .number = negative ? -x : x,
/* Stores ERROR in *TOKEN as the string of a T_STRING token.  The callers
   visible in this file pass messages freshly allocated by xasprintf.
   NOTE(review): presumably TOKEN takes over ownership of ERROR -- confirm
   against ss_cstr and the token destructor. */
267 tokenize_error__ (struct token *token, char *error)
269 *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
/* Converts hex or Unicode string segment S into *TOKEN.  On the success
   path *TOKEN becomes a T_STRING token and the result is TOKENIZE_TOKEN;
   on the error path *TOKEN carries the decoder's error message and the
   result is TOKENIZE_ERROR. */
272 static enum tokenize_result
273 tokenize_string_segment__ (enum segment_type type,
274 struct substring s, struct token *token)
276 /* Trim X' or U' from front and ' from back. */
280 struct substring out = SS_EMPTY_INITIALIZER;
/* SEG_HEX_STRING selects the hex decoder; any other TYPE is decoded as a
   Unicode string. */
281 char *error = (type == SEG_HEX_STRING
282 ? scan_hex_string__ (s, &out)
283 : scan_unicode_string__ (s, &out));
/* Both decoders reserve one byte beyond the decoded data, which is where
   this terminator goes. */
286 out.string[out.length] = '\0';
287 *token = (struct token) { .type = T_STRING, .string = out };
288 return TOKENIZE_TOKEN;
/* Error path: the decoder's message is handed to the token. */
292 tokenize_error__ (token, error);
294 return TOKENIZE_ERROR;
/* Sets *TOKEN to an error token complaining about the character at the
   start of S. */
299 tokenize_unexpected_char (const struct substring *s, struct token *token)
/* Decode the first character of S into UC so that it can be named in the
   message. */
302 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
305 tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
306 uc_name (uc, c_name)));
/* Converts segment S, whose type is TYPE, into *TOKEN.  The visible cases
   return TOKENIZE_TOKEN when *TOKEN has been filled in, TOKENIZE_EMPTY
   when the segment produces no token, and TOKENIZE_ERROR when *TOKEN
   instead carries an error message set through tokenize_error__. */
310 token_from_segment (enum segment_type type, struct substring s,
316 scan_number__ (s, token);
317 return TOKENIZE_TOKEN;
319 case SEG_QUOTED_STRING:
320 scan_quoted_string (s, token);
321 return TOKENIZE_TOKEN;
324 case SEG_UNICODE_STRING:
325 return tokenize_string_segment__ (type, s, token);
/* The segment text itself, copied, becomes the string of a T_STRING
   token. */
327 case SEG_UNQUOTED_STRING:
328 case SEG_DO_REPEAT_COMMAND:
329 case SEG_INLINE_DATA:
333 *token = (struct token) { .type = T_STRING };
334 ss_alloc_substring (&token->string, s);
335 return TOKENIZE_TOKEN;
/* Reserved words are fully described by their token type. */
337 case SEG_RESERVED_WORD:
338 *token = (struct token) { .type = scan_reserved_word__ (s) };
339 return TOKENIZE_TOKEN;
342 *token = (struct token) { .type = T_ID };
343 ss_alloc_substring (&token->string, s);
344 return TOKENIZE_TOKEN;
347 *token = (struct token) { .type = T_MACRO_ID };
348 ss_alloc_substring (&token->string, s);
349 return TOKENIZE_TOKEN;
/* Only T_MACRO_PUNCT tokens get a copy of the punctuation text. */
352 *token = (struct token) { .type = scan_punct__ (s) };
353 if (token->type == T_MACRO_PUNCT)
354 ss_alloc_substring (&token->string, s);
355 return TOKENIZE_TOKEN;
/* No token is produced for this segment. */
361 case SEG_COMMENT_COMMAND:
362 return TOKENIZE_EMPTY;
/* The start of a document is reported as the identifier DOCUMENT. */
364 case SEG_START_DOCUMENT:
365 *token = (struct token) { .type = T_ID };
366 ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
367 return TOKENIZE_TOKEN;
/* Each of these command-boundary segments becomes T_ENDCMD. */
369 case SEG_START_COMMAND:
370 case SEG_SEPARATE_COMMANDS:
371 case SEG_END_COMMAND:
372 *token = (struct token) { .type = T_ENDCMD };
373 return TOKENIZE_TOKEN;
376 *token = (struct token) { .type = T_STOP };
377 return TOKENIZE_TOKEN;
/* The remaining visible cases are error segments: each stores a message
   in *TOKEN and returns TOKENIZE_ERROR. */
379 case SEG_EXPECTED_QUOTE:
380 tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
381 return TOKENIZE_ERROR;
383 case SEG_EXPECTED_EXPONENT:
384 tokenize_error__ (token,
385 xasprintf (_("Missing exponent following `%.*s'."),
386 (int) s.length, s.string));
387 return TOKENIZE_ERROR;
389 case SEG_UNEXPECTED_CHAR:
390 tokenize_unexpected_char (&s, token);
391 return TOKENIZE_ERROR;
398 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
401 SLEX has no internal state to free, but it retains a reference to INPUT, so
402 INPUT must not be modified or freed while SLEX is still in use. */
404 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
405 enum segmenter_mode mode, bool is_snippet)
/* *SLEX is overwritten as a whole by a compound literal, so any member
   that the initializer does not name starts out as zero. */
407 *slex = (struct string_lexer) {
/* The embedded segmenter is set up from MODE and IS_SNIPPET. */
411 .segmenter = segmenter_init (mode, is_snippet),
/* Obtains a token from SLEX into *TOKEN.  On the visible success path the
   result is SLR_END when the token is T_STOP and SLR_TOKEN otherwise. */
416 enum string_lexer_result
417 string_lexer_next (struct string_lexer *slex, struct token *token)
/* S points at the input not yet consumed and LEFT is how many bytes of it
   remain. */
421 const char *s = slex->input + slex->offset;
422 size_t left = slex->length - slex->offset;
423 enum segment_type type;
/* NOTE(review): the `true' argument is presumably the segmenter's
   end-of-input flag, given that the whole input is already in memory --
   confirm against segmenter_push's declaration. */
426 n = segmenter_push (&slex->segmenter, s, left, true, &type);
/* The segment is the first N bytes at S. */
430 switch (token_from_segment (type, ss_buffer (s, n), token))
433 return token->type == T_STOP ? SLR_END : SLR_TOKEN;
/* Builds a substring holding the bytes of A followed by the bytes of B in
   a buffer newly obtained from xmalloc.  The buffer also holds a null
   terminator just past the end, which the substring's length does not
   count.  Neither A nor B is modified. */
444 static struct substring
445 concat (struct substring a, struct substring b)
447 size_t length = a.length + b.length;
/* One extra byte for the terminator. */
448 struct substring out = { .string = xmalloc (length + 1), .length = length };
449 memcpy (out.string, a.string, a.length);
450 memcpy (out.string + a.length, b.string, b.length);
451 out.string[length] = '\0';
455 /* Attempts to merge a sequence of tokens together into a single token. The
456 caller feeds tokens in one by one and the merger FSM reports progress. The
457 caller must supply a merger structure M that is set to MERGER_INIT before
458 the first call. The caller must also supply a token OUT for storage, which
459 need not be initialized.
463 * -1 if more tokens are needed. Token OUT might be in use for temporary
464 storage; to ensure that it is freed, continue calling merger_add() until
465 it returns something other than -1. (T_STOP or T_ENDCMD will make it do
468 * 0 if the first token submitted to the merger is the output. This is the
469 common case for the first call, and it can be returned for subsequent
472 * A positive number if OUT is initialized to the output token. The return
473 value is the number of tokens being merged to produce this one. */
475 merger_add (struct merger *m, const struct token *in, struct token *out)
477 /* We perform two different kinds of token merging:
479 - String concatenation, where syntax like "a" + "b" is converted into a
480 single string token. This is definitely needed because the parser
483 - Negative number merging, where syntax like -5 is converted from a pair
484 of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM). This
485 might not be needed anymore because the segmenter directly treats a dash
486 followed by a number, with optional intervening white space, as a
487 negative number. It's only needed if we want intervening comments to be
488 allowed or for part of the negative number token to be produced by macro
493 if (in->type == T_DASH || in->type == T_STRING)
502 if (out->type == T_DASH)
504 if (in->type == T_POS_NUM)
506 *out = (struct token) {
508 .number = -in->number
516 return in->type == T_PLUS ? -1 : 0;
520 if (in->type == T_STRING)
522 out->string = concat (out->string, in->string);
531 return in->type == T_PLUS ? -1 : m->state - 1;
534 if (in->type == T_STRING)
536 struct substring s = concat (out->string, in->string);
537 ss_swap (&s, &out->string);