1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/scan.h"
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28 #include "libpspp/i18n.h"
30 #include "gl/c-ctype.h"
31 #include "gl/c-strtod.h"
32 #include "gl/xmemdup0.h"
35 #define _(msgid) gettext (msgid)
37 /* Returns the integer value of (hex) digit C.

   The letters A through F are accepted in either case and map to 10
   through 15.  A character that is not a recognized digit yields INT_MAX,
   so callers can reject bad input with a single ">= 16" comparison.

   NOTE(review): the function header and the cases for the decimal digits
   are on lines absent from this listing. */
53 case 'a': case 'A': return 10;
54 case 'b': case 'B': return 11;
55 case 'c': case 'C': return 12;
56 case 'd': case 'D': return 13;
57 case 'e': case 'E': return 14;
58 case 'f': case 'F': return 15;
59 default: return INT_MAX;
/* Converts quoted string segment IN into a T_STRING token stored in *TOKEN.

   The first byte of IN is taken as the quote character.  Each time that
   character is found inside the string, the bytes up to and including it
   are copied and one additional input byte is skipped, so a doubled quote
   presumably collapses into a single one (this relies on ss_find_byte()
   returning the offset of the match -- TODO confirm).

   The token string lives in a buffer obtained from xmalloc() and is
   null-terminated. */
64 scan_quoted_string (struct substring in, struct token *token)
66 /* Trim ' or " from front and back. */
67 int quote = in.string[0];
/* Bytes are only ever copied from IN, so the output cannot be longer than
   the input: IN.length bytes plus a terminator always suffice. */
71 struct substring out = { .string = xmalloc (in.length + 1) };
75 size_t pos = ss_find_byte (in, quote);
/* Keep the text through the quote at POS, then step over that quote and the
   byte that follows it. */
79 memcpy (ss_end (out), in.string, pos + 1);
80 out.length += pos + 1;
81 ss_advance (&in, pos + 2);
/* Append whatever input is left and terminate the result. */
84 memcpy (ss_end (out), in.string, in.length);
85 out.length += in.length;
86 out.string[out.length] = '\0';
88 *token = (struct token) { .type = T_STRING, .string = out };
/* Decodes IN, a string of hex digits, into the bytes that the digits
   represent, storing the result in *OUT.

   On the failure paths visible here (an odd number of digits, or a
   character that is not a hex digit) the return value is an error message
   allocated by xasprintf().  *OUT has already been resized by the time a bad
   digit is detected.

   NOTE(review): the return for the success path is on a line absent from
   this listing. */
92 scan_hex_string__ (struct substring in, struct substring *out)
94 if (in.length % 2 != 0)
95 return xasprintf (_("String of hex digits has %zu characters, which "
96 "is not a multiple of 2."), in.length);
/* Two digits make one byte.  The extra byte leaves room for a terminator. */
98 ss_realloc (out, in.length / 2 + 1);
99 uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
100 out->length = in.length / 2;
101 for (size_t i = 0; i < in.length; i += 2)
103 int hi = digit_value (in.string[i]);
104 int lo = digit_value (in.string[i + 1]);
/* digit_value() yields INT_MAX for anything that is not a hex digit, so a
   value of 16 or more marks a bad character.  The message names the first
   offending character of the pair. */
106 if (hi >= 16 || lo >= 16)
107 return xasprintf (_("`%c' is not a valid hex digit."),
108 in.string[hi >= 16 ? i : i + 1]);
110 *dst++ = hi * 16 + lo;
/* Parses IN as the hexadecimal number of a Unicode code point and stores the
   UTF-8 encoding of that code point in *OUT.

   On the failure paths visible here (a length outside 1 to 8 bytes, a
   character that is not a hex digit, or a value that is not a valid code
   point) the return value is an error message allocated by xasprintf().

   NOTE(review): the declaration of UC, the test that guards the bad-digit
   message, and the return for the success path are on lines absent from
   this listing. */
117 scan_unicode_string__ (struct substring in, struct substring *out)
119 if (in.length < 1 || in.length > 8)
120 return xasprintf (_("Unicode string contains %zu bytes, which is "
121 "not in the valid range of 1 to 8 bytes."),
125 for (size_t i = 0; i < in.length; i++)
127 int digit = digit_value (in.string[i]);
129 return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
130 uc = uc * 16 + digit;
/* Values from U+D800 through U+DFFF and values above U+10FFFF are rejected
   as invalid code points. */
133 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
134 return xasprintf (_("U+%04llX is not a valid Unicode code point."),
/* Room for up to 4 bytes of UTF-8 output plus a terminator. */
137 ss_realloc (out, 4 + 1);
138 out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
/* Returns the token type for reserved word WORD.

   The comparison ignores case: the switch dispatches on the upper-cased
   first byte, and the returns visible here tell apart words that share a
   first letter by looking at the second byte or at the length.

   NOTE(review): the case labels are on lines absent from this listing, so
   the word that leads to each return is not shown. */
143 static enum token_type
144 scan_reserved_word__ (struct substring word)
146 switch (c_toupper (word.string[0]))
155 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
158 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
/* Here the length alone decides: two bytes means T_NE, anything else
   T_NOT. */
161 return word.length == 2 ? T_NE : T_NOT;
170 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
/* Returns the token type for the one-character punctuator C0.

   A character without a dedicated token type yields T_MACRO_PUNCT;
   token_from_segment() keeps the text of such tokens. */
179 static enum token_type
180 scan_punct1__ (char c0)
184 case '(': return T_LPAREN;
185 case ')': return T_RPAREN;
186 case ',': return T_COMMA;
187 case '=': return T_EQUALS;
188 case '-': return T_DASH;
189 case '[': return T_LBRACK;
190 case ']': return T_RBRACK;
191 case '&': return T_AND;
192 case '|': return T_OR;
193 case '+': return T_PLUS;
194 case '/': return T_SLASH;
195 case '*': return T_ASTERISK;
196 case '<': return T_LT;
197 case '>': return T_GT;
198 case '~': return T_NOT;
199 default: return T_MACRO_PUNCT;
/* Returns the token type for the two-character punctuator made up of C0
   followed by C1.

   NOTE(review): only one return is visible in this listing.  It yields T_LE
   when C1 is "=" and T_NE otherwise, which presumably belongs to the case
   where C0 is "<" -- confirm against the full switch. */
205 static enum token_type
206 scan_punct2__ (char c0, char c1)
214 return c1 == '=' ? T_LE : T_NE;
/* Returns the token type for punctuator S.

   A one-byte S goes to scan_punct1__().  Any other length is treated as a
   two-character punctuator: only the first two bytes of S are examined. */
232 static enum token_type
233 scan_punct__ (struct substring s)
235 return (s.length == 1
236 ? scan_punct1__ (s.string[0])
237 : scan_punct2__ (s.string[0], s.string[1]));
/* Converts number segment S into a token stored in *TOKEN: T_NEG_NUM if the
   text starts with "-", T_POS_NUM otherwise.

   The text is first copied into a null-terminated buffer before it is
   handed to c_strtod(): a local array when S is shorter than that array, or
   else a heap copy made by xmemdup0().

   NOTE(review): the declarations of BUF and P, and whatever releases the
   heap copy, are on lines absent from this listing. */
241 scan_number__ (struct substring s, struct token *token)
246 if (s.length < sizeof buf)
249 memcpy (buf, s.string, s.length);
250 buf[s.length] = '\0';
253 p = xmemdup0 (s.string, s.length);
/* NEGATIVE is 0 or 1, so adding it to P steps over a leading "-".  The sign
   is applied to the parsed value afterward. */
255 bool negative = *p == '-';
256 double x = c_strtod (p + negative, NULL);
257 *token = (struct token) {
258 .type = negative ? T_NEG_NUM : T_POS_NUM,
259 .number = negative ? -x : x,
/* Stores ERROR in *TOKEN as the text of a T_STRING token.

   Callers in this file pass a message that was just allocated by
   xasprintf().  The token wraps ERROR through ss_cstr(), so it presumably
   refers to the same storage instead of a copy -- TODO confirm. */
267 tokenize_error__ (struct token *token, char *error)
269 *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
/* Converts hex string or Unicode string segment S into a token in *TOKEN.

   TYPE selects the decoder: SEG_HEX_STRING uses scan_hex_string__() and any
   other value uses scan_unicode_string__().

   Returns TOKENIZE_TOKEN after storing a T_STRING token that holds the
   decoded bytes, or TOKENIZE_ERROR after storing the error message with
   tokenize_error__().

   NOTE(review): the test that chooses between the two outcomes is on a line
   absent from this listing. */
272 static enum tokenize_result
273 tokenize_string_segment__ (enum segment_type type,
274 struct substring s, struct token *token)
276 /* Trim X' or U' from front and ' from back. */
280 struct substring out = SS_EMPTY_INITIALIZER;
281 char *error = (type == SEG_HEX_STRING
282 ? scan_hex_string__ (s, &out)
283 : scan_unicode_string__ (s, &out));
/* Both decoders reserve one byte past the data for this terminator. */
286 out.string[out.length] = '\0';
287 *token = (struct token) { .type = T_STRING, .string = out };
288 return TOKENIZE_TOKEN;
292 tokenize_error__ (token, error);
293 return TOKENIZE_ERROR;
/* Stores in *TOKEN an error message about the unexpected character at the
   start of S.

   u8_mbtouc() decodes the first UTF-8 character of S into UC, and the
   string produced by uc_name() is inserted into the message. */
298 tokenize_unexpected_char (const struct substring *s, struct token *token)
301 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
304 tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
305 uc_name (uc, c_name)));
/* Converts the segment with text S and segment type TYPE into a token
   stored in *TOKEN.

   Returns:

   * TOKENIZE_TOKEN if *TOKEN was set to a token.

   * TOKENIZE_EMPTY if the segment produces no token (for example
     SEG_COMMENT_COMMAND).

   * TOKENIZE_ERROR if the segment is invalid.  *TOKEN is then a T_STRING
     token whose text is the error message.

   NOTE(review): several case labels are on lines absent from this listing,
   so some of the returns below cannot be matched to a segment type. */
309 token_from_segment (enum segment_type type, struct substring s,
315 scan_number__ (s, token);
316 return TOKENIZE_TOKEN;
318 case SEG_QUOTED_STRING:
319 scan_quoted_string (s, token);
320 return TOKENIZE_TOKEN;
323 case SEG_UNICODE_STRING:
324 return tokenize_string_segment__ (type, s, token);
/* These segments become string tokens that carry the segment text
   unchanged. */
326 case SEG_UNQUOTED_STRING:
327 case SEG_DO_REPEAT_COMMAND:
328 case SEG_INLINE_DATA:
331 *token = (struct token) { .type = T_STRING };
332 ss_alloc_substring (&token->string, s);
333 return TOKENIZE_TOKEN;
/* A reserved word is fully described by its token type, so no text is
   stored. */
335 case SEG_RESERVED_WORD:
336 *token = (struct token) { .type = scan_reserved_word__ (s) };
337 return TOKENIZE_TOKEN;
340 *token = (struct token) { .type = T_ID };
341 ss_alloc_substring (&token->string, s);
342 return TOKENIZE_TOKEN;
345 *token = (struct token) { .type = T_MACRO_ID };
346 ss_alloc_substring (&token->string, s);
347 return TOKENIZE_TOKEN;
/* Only punctuation without a dedicated token type carries its text. */
350 *token = (struct token) { .type = scan_punct__ (s) };
351 if (token->type == T_MACRO_PUNCT)
352 ss_alloc_substring (&token->string, s);
353 return TOKENIZE_TOKEN;
359 case SEG_COMMENT_COMMAND:
360 return TOKENIZE_EMPTY;
/* Reported as an identifier token spelled DOCUMENT, whatever the text of
   S. */
362 case SEG_START_DOCUMENT:
363 *token = (struct token) { .type = T_ID };
364 ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
365 return TOKENIZE_TOKEN;
/* Every kind of command boundary yields T_ENDCMD. */
367 case SEG_START_COMMAND:
368 case SEG_SEPARATE_COMMANDS:
369 case SEG_END_COMMAND:
370 *token = (struct token) { .type = T_ENDCMD };
371 return TOKENIZE_TOKEN;
374 *token = (struct token) { .type = T_STOP };
375 return TOKENIZE_TOKEN;
/* The remaining segment types describe malformed input.  Each one stores a
   message through tokenize_error__() and reports TOKENIZE_ERROR. */
377 case SEG_EXPECTED_QUOTE:
378 tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
379 return TOKENIZE_ERROR;
381 case SEG_EXPECTED_EXPONENT:
382 tokenize_error__ (token,
383 xasprintf (_("Missing exponent following `%.*s'."),
384 (int) s.length, s.string));
385 return TOKENIZE_ERROR;
387 case SEG_UNEXPECTED_CHAR:
388 tokenize_unexpected_char (&s, token);
389 return TOKENIZE_ERROR;
396 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long.  MODE and
   IS_SNIPPET are passed to segmenter_init() to set up the segmenter that
   SLEX embeds.

   SLEX has no internal state to free, but it retains a reference to INPUT, so
   INPUT must not be modified or freed while SLEX is still in use. */
402 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
403 enum segmenter_mode mode, bool is_snippet)
405 *slex = (struct string_lexer) {
409 .segmenter = segmenter_init (mode, is_snippet),
/* Obtains the next token from SLEX and stores it in *TOKEN.

   NOTE(review): only one return is visible in this listing: SLR_END when the
   token is T_STOP and SLR_TOKEN for any other token.  The handling of the
   other token_from_segment() results is on lines absent from this
   listing. */
414 enum string_lexer_result
415 string_lexer_next (struct string_lexer *slex, struct token *token)
/* S and LEFT describe the input from the current offset onward. */
419 const char *s = slex->input + slex->offset;
420 size_t left = slex->length - slex->offset;
421 enum segment_type type;
/* Presumably N is the length in bytes of the next segment, since it is used
   as the length of the text handed to token_from_segment() -- TODO confirm
   against segmenter_push(). */
424 n = segmenter_push (&slex->segmenter, s, left, true, &type);
428 switch (token_from_segment (type, ss_buffer (s, n), token))
431 return token->type == T_STOP ? SLR_END : SLR_TOKEN;
/* Returns a substring that holds the bytes of A followed by the bytes of B.

   The result is stored in a new buffer from xmalloc().  A null terminator
   follows the data but is not counted in the length. */
442 static struct substring
443 concat (struct substring a, struct substring b)
445 size_t length = a.length + b.length;
446 struct substring out = { .string = xmalloc (length + 1), .length = length };
447 memcpy (out.string, a.string, a.length);
448 memcpy (out.string + a.length, b.string, b.length);
449 out.string[length] = '\0';
453 /* Attempts to merge a sequence of tokens together into a single token.  The
   caller feeds tokens in one by one and the merger FSM reports progress.  The
   caller must supply a merger structure M that is set to MERGER_INIT before
   the first call.  The caller must also supply a token OUT for storage, which
   need not be initialized.

   Returns:

   * -1 if more tokens are needed.  Token OUT might be in use for temporary
     storage; to ensure that it is freed, continue calling merger_add() until
     it returns something other than -1.  (T_STOP or T_ENDCMD will make it do
     that.)

   * 0 if the first token submitted to the merger is the output.  This is the
     common case for the first call, and it can be returned for subsequent
     calls as well.

   * A positive number if OUT is initialized to the output token.  The return
     value is the number of tokens being merged to produce this one. */
473 merger_add (struct merger *m, const struct token *in, struct token *out)
475 /* We perform two different kinds of token merging:
477 - String concatenation, where syntax like "a" + "b" is converted into a
478 single string token. This is definitely needed because the parser
481 - Negative number merging, where syntax like -5 is converted from a pair
482 of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM). This
483 might not be needed anymore because the segmenter directly treats a dash
484 followed by a number, with optional intervening white space, as a
485 negative number. It's only needed if we want intervening comments to be
486 allowed or for part of the negative number token to be produced by macro
491 if (in->type == T_DASH || in->type == T_STRING)
500 if (out->type == T_DASH)
502 if (in->type == T_POS_NUM)
504 *out = (struct token) {
506 .number = -in->number
514 return in->type == T_PLUS ? -1 : 0;
518 if (in->type == T_STRING)
520 out->string = concat (out->string, in->string);
529 return in->type == T_PLUS ? -1 : m->state - 1;
532 if (in->type == T_STRING)
534 struct substring s = concat (out->string, in->string);
535 ss_swap (&s, &out->string);