1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/scan.h"
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28 #include "libpspp/i18n.h"
30 #include "gl/c-ctype.h"
31 #include "gl/c-strtod.h"
32 #include "gl/xmemdup0.h"
35 #define _(msgid) gettext (msgid)
37 /* Returns the integer value of (hex) digit C.  The letters a-f map to 10
   through 15 regardless of case.  Any character that reaches the default
   label yields INT_MAX, so a caller can detect an invalid digit by testing
   for a result of 16 or more. */
53 case 'a': case 'A': return 10;
54 case 'b': case 'B': return 11;
55 case 'c': case 'C': return 12;
56 case 'd': case 'D': return 13;
57 case 'e': case 'E': return 14;
58 case 'f': case 'F': return 15;
59 default: return INT_MAX;
/* Converts quoted-string segment IN into a T_STRING token stored in TOKEN.
   The token's string is a freshly allocated, null-terminated buffer. */
64 scan_quoted_string (struct substring in, struct token *token)
66 /* Trim ' or " from front and back. */
67 int quote = in.string[0];
/* Every copy below takes bytes from IN, so IN's length plus one byte for
   the null terminator is always enough room. */
71 struct substring out = { .string = xmalloc (in.length + 1) };
75 size_t pos = ss_find_byte (in, quote);
/* Copy the text up to and including the quote character found at POS, then
   advance past one further byte so that it is dropped from the output
   (presumably the second half of a doubled quote). */
79 memcpy (ss_end (out), in.string, pos + 1);
80 out.length += pos + 1;
81 ss_advance (&in, pos + 2);
/* Copy whatever remains of IN and terminate the result. */
84 memcpy (ss_end (out), in.string, in.length);
85 out.length += in.length;
86 out.string[out.length] = '\0';
88 *token = (struct token) { .type = T_STRING, .string = out };
/* Decodes the hex digits in IN into bytes stored in OUT, two digits per
   byte.  Returns an error message built with xasprintf() if IN has an odd
   number of digits or contains a character that is not a hex digit. */
92 scan_hex_string__ (struct substring in, struct substring *out)
94 if (in.length % 2 != 0)
95 return xasprintf (_("String of hex digits has %zu characters, which "
96 "is not a multiple of 2."), in.length);
/* One output byte per pair of digits, plus one spare byte of capacity
   beyond OUT's length. */
98 ss_realloc (out, in.length / 2 + 1);
99 uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
100 out->length = in.length / 2;
101 for (size_t i = 0; i < in.length; i += 2)
103 int hi = digit_value (in.string[i]);
104 int lo = digit_value (in.string[i + 1]);
/* A value of 16 or more marks an invalid digit.  Report the high digit if
   it is the bad one, otherwise the low digit. */
106 if (hi >= 16 || lo >= 16)
107 return xasprintf (_("`%c' is not a valid hex digit."),
108 in.string[hi >= 16 ? i : i + 1]);
110 *dst++ = hi * 16 + lo;
/* Parses IN as 1 to 8 hex digits that name a Unicode code point and appends
   that code point's encoding, as produced by u8_uctomb(), to OUT.  Returns
   an error message built with xasprintf() if IN has the wrong length,
   contains a character that is not a hex digit, or names an invalid code
   point. */
117 scan_unicode_string__ (struct substring in, struct substring *out)
119 if (in.length < 1 || in.length > 8)
120 return xasprintf (_("Unicode string contains %zu bytes, which is "
121 "not in the valid range of 1 to 8 bytes."),
125 for (size_t i = 0; i < in.length; i++)
127 int digit = digit_value (in.string[i]);
129 return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
/* Accumulate the code point one hex digit at a time, most significant
   digit first. */
130 uc = uc * 16 + digit;
/* Reject the range 0xd800...0xdfff and anything above 0x10ffff. */
133 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
134 return xasprintf (_("U+%04llX is not a valid Unicode code point."),
/* Up to 4 bytes for the encoded character, plus one spare byte. */
137 ss_realloc (out, 4 + 1);
138 out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
/* Returns the token type for reserved word WORD.  Dispatches on WORD's
   first character converted to upper case, then uses the second character
   or WORD's length to choose between token types that share that
   dispatch. */
143 static enum token_type
144 scan_reserved_word__ (struct substring word)
146 switch (c_toupper (word.string[0]))
155 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
158 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
/* Here the length decides: two bytes is T_NE, anything else is T_NOT. */
161 return word.length == 2 ? T_NE : T_NOT;
170 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
/* Returns the token type for the single punctuation character C0.  A
   character that has no dedicated token type is classified as
   T_MACRO_PUNCT. */
179 static enum token_type
180 scan_punct1__ (char c0)
184 case '(': return T_LPAREN;
185 case ')': return T_RPAREN;
186 case ',': return T_COMMA;
187 case '=': return T_EQUALS;
188 case '-': return T_DASH;
189 case '[': return T_LBRACK;
190 case ']': return T_RBRACK;
191 case '&': return T_AND;
192 case '|': return T_OR;
193 case '+': return T_PLUS;
194 case '/': return T_SLASH;
195 case '*': return T_ASTERISK;
196 case '<': return T_LT;
197 case '>': return T_GT;
198 case '~': return T_NOT;
199 default: return T_MACRO_PUNCT;
/* Returns the token type for the two-character punctuator made up of C0
   followed by C1. */
205 static enum token_type
206 scan_punct2__ (char c0, char c1)
/* A second character of '=' gives T_LE; any other gives T_NE. */
214 return c1 == '=' ? T_LE : T_NE;
/* Returns the token type for punctuator S.  A one-byte S is classified by
   scan_punct1__(); any other length is passed to scan_punct2__(), which
   reads the first two bytes of S. */
232 static enum token_type
233 scan_punct__ (struct substring s)
235 return (s.length == 1
236 ? scan_punct1__ (s.string[0])
237 : scan_punct2__ (s.string[0], s.string[1]));
/* Converts the text of number segment S into a T_POS_NUM or T_NEG_NUM token
   stored in TOKEN. */
241 scan_number__ (struct substring s, struct token *token)
/* c_strtod() is given a null-terminated copy of S: the local buffer when S
   fits in it with its terminator, a copy made by xmemdup0() otherwise. */
246 if (s.length < sizeof buf)
249 memcpy (buf, s.string, s.length);
250 buf[s.length] = '\0';
253 p = xmemdup0 (s.string, s.length);
/* A leading '-' is skipped for the conversion and applied to the result
   afterward; it also selects the token type. */
255 bool negative = *p == '-';
256 double x = c_strtod (p + negative, NULL);
257 *token = (struct token) {
258 .type = negative ? T_NEG_NUM : T_POS_NUM,
259 .number = negative ? -x : x,
/* Stores the message ERROR in TOKEN as a T_STRING token.  The token's
   string is built from ERROR with ss_cstr().
   NOTE(review): presumably TOKEN takes over ownership of ERROR, since no
   copy is made here -- confirm against the callers. */
267 tokenize_error__ (struct token *token, char *error)
269 *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
/* Converts segment S, a hex string when TYPE is SEG_HEX_STRING and a
   Unicode string otherwise, into a token stored in TOKEN.  Returns
   TOKENIZE_TOKEN with the decoded text as a T_STRING token, or
   TOKENIZE_ERROR with the error message stored in TOKEN by
   tokenize_error__(). */
272 static enum tokenize_result
273 tokenize_string_segment__ (enum segment_type type,
274 struct substring s, struct token *token)
276 /* Trim X' or U' from front and ' from back. */
280 struct substring out = SS_EMPTY_INITIALIZER;
281 char *error = (type == SEG_HEX_STRING
282 ? scan_hex_string__ (s, &out)
283 : scan_unicode_string__ (s, &out));
/* Both scanners allocate one byte more than the length they set, which
   leaves room for this terminator. */
286 out.string[out.length] = '\0';
287 *token = (struct token) { .type = T_STRING, .string = out };
288 return TOKENIZE_TOKEN;
292 tokenize_error__ (token, error);
293 return TOKENIZE_ERROR;
/* Stores in TOKEN an error message that reports the character at the start
   of S as a bad character in the input. */
298 tokenize_unexpected_char (const struct substring *s, struct token *token)
/* Decode the first character of S so that uc_name() can describe it. */
301 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
304 tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
305 uc_name (uc, c_name)));
/* Converts segment S of the given TYPE into a token stored in TOKEN.
   Returns TOKENIZE_TOKEN when TOKEN holds a token, TOKENIZE_EMPTY when the
   segment produces no token, and TOKENIZE_ERROR when TOKEN instead holds an
   error message stored as a T_STRING token. */
309 token_from_segment (enum segment_type type, struct substring s,
315 scan_number__ (s, token);
316 return TOKENIZE_TOKEN;
318 case SEG_QUOTED_STRING:
319 scan_quoted_string (s, token);
320 return TOKENIZE_TOKEN;
323 case SEG_UNICODE_STRING:
324 return tokenize_string_segment__ (type, s, token);
326 case SEG_UNQUOTED_STRING:
327 case SEG_DO_REPEAT_COMMAND:
328 case SEG_INLINE_DATA:
/* These segments become string tokens whose text is taken from S as-is. */
332 *token = (struct token) { .type = T_STRING };
333 ss_alloc_substring (&token->string, s);
334 return TOKENIZE_TOKEN;
336 case SEG_RESERVED_WORD:
337 *token = (struct token) { .type = scan_reserved_word__ (s) };
338 return TOKENIZE_TOKEN;
341 *token = (struct token) { .type = T_ID };
342 ss_alloc_substring (&token->string, s);
343 return TOKENIZE_TOKEN;
346 *token = (struct token) { .type = T_MACRO_ID };
347 ss_alloc_substring (&token->string, s);
348 return TOKENIZE_TOKEN;
351 *token = (struct token) { .type = scan_punct__ (s) };
/* Only T_MACRO_PUNCT tokens are given the punctuator's text; the other
   punctuation token types get no string. */
352 if (token->type == T_MACRO_PUNCT)
353 ss_alloc_substring (&token->string, s);
354 return TOKENIZE_TOKEN;
360 case SEG_COMMENT_COMMAND:
361 return TOKENIZE_EMPTY;
363 case SEG_START_DOCUMENT:
/* The start of a document is reported as the identifier DOCUMENT. */
364 *token = (struct token) { .type = T_ID };
365 ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
366 return TOKENIZE_TOKEN;
368 case SEG_START_COMMAND:
369 case SEG_SEPARATE_COMMANDS:
370 case SEG_END_COMMAND:
/* Every kind of command boundary produces the same T_ENDCMD token. */
371 *token = (struct token) { .type = T_ENDCMD };
372 return TOKENIZE_TOKEN;
375 *token = (struct token) { .type = T_STOP };
376 return TOKENIZE_TOKEN;
378 case SEG_EXPECTED_QUOTE:
379 tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
380 return TOKENIZE_ERROR;
382 case SEG_EXPECTED_EXPONENT:
383 tokenize_error__ (token,
384 xasprintf (_("Missing exponent following `%.*s'."),
385 (int) s.length, s.string));
386 return TOKENIZE_ERROR;
388 case SEG_UNEXPECTED_CHAR:
389 tokenize_unexpected_char (&s, token);
390 return TOKENIZE_ERROR;
397 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
400 SLEX has no internal state to free, but it retains a reference to INPUT, so
401 INPUT must not be modified or freed while SLEX is still in use. */
403 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
404 enum segmenter_mode mode, bool is_snippet)
/* SLEX is overwritten wholesale; its embedded segmenter is configured from
   MODE and IS_SNIPPET. */
406 *slex = (struct string_lexer) {
410 .segmenter = segmenter_init (mode, is_snippet),
/* Obtains a token from SLEX and stores it in TOKEN.  When a segment yields
   a token, returns SLR_END if that token is T_STOP and SLR_TOKEN
   otherwise. */
415 enum string_lexer_result
416 string_lexer_next (struct string_lexer *slex, struct token *token)
/* S and LEFT describe the part of the input that lies at and after SLEX's
   current offset. */
420 const char *s = slex->input + slex->offset;
421 size_t left = slex->length - slex->offset;
422 enum segment_type type;
/* Ask the segmenter for the next segment: it reports the segment's type in
   TYPE and its return value N is used as the segment's length in bytes. */
425 n = segmenter_push (&slex->segmenter, s, left, true, &type);
429 switch (token_from_segment (type, ss_buffer (s, n), token))
432 return token->type == T_STOP ? SLR_END : SLR_TOKEN;
/* Builds a freshly allocated, null-terminated substring that holds the
   bytes of A followed by the bytes of B.  Neither A nor B is modified or
   freed. */
443 static struct substring
444 concat (struct substring a, struct substring b)
446 size_t length = a.length + b.length;
/* One extra byte is allocated for the null terminator, which is not counted
   in the substring's length. */
447 struct substring out = { .string = xmalloc (length + 1), .length = length };
448 memcpy (out.string, a.string, a.length);
449 memcpy (out.string + a.length, b.string, b.length);
450 out.string[length] = '\0';
454 /* Attempts to merge a sequence of tokens together into a single token. The
455 caller feeds tokens in one by one and the merger FSM reports progress. The
456 caller must supply a merger structure M that is set to MERGER_INIT before
457 the first call. The caller must also supply a token OUT for storage, which
458 need not be initialized.
462 * -1 if more tokens are needed. Token OUT might be in use for temporary
463 storage; to ensure that it is freed, continue calling merger_add() until
464 it returns something other than -1. (T_STOP or T_ENDCMD will make it do
467 * 0 if the first token submitted to the merger is the output. This is the
468 common case for the first call, and it can be returned for subsequent
471 * A positive number if OUT is initialized to the output token. The return
472 value is the number of tokens being merged to produce this one. */
474 merger_add (struct merger *m, const struct token *in, struct token *out)
476 /* We perform two different kinds of token merging:
478 - String concatenation, where syntax like "a" + "b" is converted into a
479 single string token. This is definitely needed because the parser
482 - Negative number merging, where syntax like -5 is converted from a pair
483 of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM). This
484 might not be needed anymore because the segmenter directly treats a dash
485 followed by a number, with optional intervening white space, as a
486 negative number. It's only needed if we want intervening comments to be
487 allowed or for part of the negative number token to be produced by macro
492 if (in->type == T_DASH || in->type == T_STRING)
501 if (out->type == T_DASH)
503 if (in->type == T_POS_NUM)
505 *out = (struct token) {
507 .number = -in->number
515 return in->type == T_PLUS ? -1 : 0;
519 if (in->type == T_STRING)
521 out->string = concat (out->string, in->string);
530 return in->type == T_PLUS ? -1 : m->state - 1;
533 if (in->type == T_STRING)
535 struct substring s = concat (out->string, in->string);
536 ss_swap (&s, &out->string);