1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/scan.h"
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28 #include "libpspp/i18n.h"
30 #include "gl/c-ctype.h"
31 #include "gl/c-strtod.h"
32 #include "gl/xmemdup0.h"
35 #define _(msgid) gettext (msgid)
37 /* Returns the integer value of (hex) digit C. */
/* NOTE(review): the function header and the labels for the decimal digits are
   not visible in this view; only the letter digits and the fallback are.
   Letter digits are accepted in either case and map to 10 through 15. */
53 case 'a': case 'A': return 10;
54 case 'b': case 'B': return 11;
55 case 'c': case 'C': return 12;
56 case 'd': case 'D': return 13;
57 case 'e': case 'E': return 14;
58 case 'f': case 'F': return 15;
/* Anything without a case label yields INT_MAX, so the ">= 16" comparison
   made by the hex-string scanner in this file treats it as an invalid
   digit. */
59 default: return INT_MAX;
/* Converts the quoted-string segment IN into a T_STRING token stored in
   *TOKEN.  The token's string lives in a buffer obtained from xmalloc() and
   is null-terminated just past its last byte.

   NOTE(review): several lines of this function (the quote trimming itself
   and whatever loop surrounds the ss_find_byte() call) are not visible in
   this view; the comments below describe only the visible statements. */
64 scan_quoted_string (struct substring in, struct token *token)
66 /* Trim ' or " from front and back. */
/* The first byte of IN is taken as the quote character. */
67 int quote = in.string[0];
/* OUT starts with zero length and a buffer of IN.LENGTH + 1 bytes, one more
   than the input length at this point, leaving room for the terminator. */
71 struct substring out = { .string = xmalloc (in.length + 1) };
/* Look for the next occurrence of the quote character inside IN.
   Presumably this is a doubled quote that stands for one literal quote;
   confirm against the segmenter. */
75 size_t pos = ss_find_byte (in, quote);
/* Append the input up to and including the quote at POS, then advance the
   input by POS + 2 bytes, i.e. one byte further than what was copied. */
79 memcpy (ss_end (out), in.string, pos + 1);
80 out.length += pos + 1;
81 ss_advance (&in, pos + 2);
/* Append whatever input is left and null-terminate the result. */
84 memcpy (ss_end (out), in.string, in.length);
85 out.length += in.length;
86 out.string[out.length] = '\0';
/* Hand the buffer to the token; OUT is stored in it by value. */
88 *token = (struct token) { .type = T_STRING, .string = out };
/* Decodes the hex digits in IN into bytes stored in *OUT, two digits per
   output byte with the first digit of each pair as the high nibble.

   On a malformed input, returns a message formatted by xasprintf(): either
   IN has an odd number of characters or one of its characters is not a hex
   digit.  NOTE(review): the return statement for the success case is not
   visible in this view; presumably it returns NULL, to be confirmed. */
92 scan_hex_string__ (struct substring in, struct substring *out)
/* Each output byte needs exactly two digits, so an odd length is an error. */
94 if (in.length % 2 != 0)
95 return xasprintf (_("String of hex digits has %zu characters, which "
96 "is not a multiple of 2."), in.length);
/* Request room for IN.LENGTH / 2 output bytes plus one extra byte. */
98 ss_realloc (out, in.length / 2 + 1);
99 uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
100 out->length = in.length / 2;
101 for (size_t i = 0; i < in.length; i += 2)
103 int hi = digit_value (in.string[i]);
104 int lo = digit_value (in.string[i + 1]);
/* A value of 16 or more marks a non-hex character.  The message names the
   first offender of the pair: the high digit if it is bad, else the low. */
106 if (hi >= 16 || lo >= 16)
107 return xasprintf (_("`%c' is not a valid hex digit."),
108 in.string[hi >= 16 ? i : i + 1]);
110 *dst++ = hi * 16 + lo;
/* Interprets IN as the hex digits of one Unicode code point and stores its
   encoding, as produced by u8_uctomb(), in *OUT.

   Returns a message formatted by xasprintf() if IN is not 1 to 8 bytes long,
   contains a character that is not a hex digit, or names a value that is a
   surrogate (0xd800 through 0xdfff) or exceeds 0x10ffff.
   NOTE(review): the declaration of UC, the test guarding the bad-digit
   message, the argument for the U+ message, and the success return are not
   visible in this view. */
117 scan_unicode_string__ (struct substring in, struct substring *out)
119 if (in.length < 1 || in.length > 8)
120 return xasprintf (_("Unicode string contains %zu bytes, which is "
121 "not in the valid range of 1 to 8 bytes."),
/* Accumulate the digits most-significant first, 4 bits per digit. */
125 for (size_t i = 0; i < in.length; i++)
127 int digit = digit_value (in.string[i]);
129 return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
130 uc = uc * 16 + digit;
/* Reject the surrogate range and anything above the last code point. */
133 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
134 return xasprintf (_("U+%04llX is not a valid Unicode code point."),
/* Request 4 bytes for the encoded character plus one extra byte; the length
   of *OUT becomes whatever u8_uctomb() returns. */
137 ss_realloc (out, 4 + 1);
138 out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
/* Returns the token type for reserved word WORD, deciding case-insensitively
   on its first character and, where that is ambiguous, on its second
   character or its length.
   NOTE(review): the case labels and the unambiguous returns are not visible
   in this view, so the initial letter that leads to each statement below is
   not stated here. */
143 static enum token_type
144 scan_reserved_word__ (struct substring word)
146 switch (c_toupper (word.string[0]))
/* Second character 'E' selects T_GE; anything else selects T_GT. */
155 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
/* Second character 'E' selects T_LE; anything else selects T_LT. */
158 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
/* A two-byte word selects T_NE; any other length selects T_NOT. */
161 return word.length == 2 ? T_NE : T_NOT;
/* Second character 'L' selects T_ALL; anything else selects T_AND. */
170 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
/* Returns the token type for the single punctuation character C0.
   Characters with no entry in the table below yield T_MACRO_PUNCT.
   NOTE(review): the switch header is not visible in this view. */
179 static enum token_type
180 scan_punct1__ (char c0)
184 case '(': return T_LPAREN;
185 case ')': return T_RPAREN;
186 case ',': return T_COMMA;
187 case '=': return T_EQUALS;
188 case '-': return T_DASH;
189 case '[': return T_LBRACK;
190 case ']': return T_RBRACK;
191 case '{': return T_LCURLY;
192 case '}': return T_RCURLY;
/* '&', '|' and '~' (below) produce the same token types that the reserved
   word scanner in this file returns as T_AND and T_NOT, plus T_OR. */
193 case '&': return T_AND;
194 case '|': return T_OR;
195 case '+': return T_PLUS;
196 case '/': return T_SLASH;
197 case '*': return T_ASTERISK;
198 case '<': return T_LT;
199 case '>': return T_GT;
200 case '~': return T_NOT;
201 case ';': return T_SEMICOLON;
202 case ':': return T_COLON;
203 default: return T_MACRO_PUNCT;
/* Returns the token type for the two-character punctuator made of C0
   followed by C1.
   NOTE(review): only one statement of the body is visible in this view; the
   dispatch on C0 and the other returns are missing. */
209 static enum token_type
210 scan_punct2__ (char c0, char c1)
/* '=' as the second character selects T_LE, anything else T_NE.  Presumably
   this is the branch for C0 == '<', covering "<=" and "<>"; confirm. */
218 return c1 == '=' ? T_LE : T_NE;
/* Returns the token type for punctuator S.  A one-byte S goes to
   scan_punct1__(); any other length goes to scan_punct2__() with the first
   two bytes of S, so S is assumed to hold at least two bytes in that case. */
236 static enum token_type
237 scan_punct__ (struct substring s)
239 return (s.length == 1
240 ? scan_punct1__ (s.string[0])
241 : scan_punct2__ (s.string[0], s.string[1]));
/* Parses the text of S as a number and stores the result in *TOKEN as a
   T_NEG_NUM token if S begins with '-', otherwise as a T_POS_NUM token.

   c_strtod() needs a null-terminated string, so S is first copied.
   NOTE(review): the declarations of BUF and P, the assignment of P on the
   short path, and any cleanup of the xmemdup0() copy are not visible in this
   view. */
245 scan_number__ (struct substring s, struct token *token)
/* Short inputs, including their terminator, fit in the local buffer. */
250 if (s.length < sizeof buf)
253 memcpy (buf, s.string, s.length);
254 buf[s.length] = '\0';
/* Presumably the fallback for inputs too long for BUF: a null-terminated
   copy from xmemdup0(). */
257 p = xmemdup0 (s.string, s.length);
/* NEGATIVE is 1 exactly when the text starts with '-', so adding it to P
   skips the sign; the sign is then reapplied to the parsed magnitude. */
259 bool negative = *p == '-';
260 double x = c_strtod (p + negative, NULL);
261 *token = (struct token) {
262 .type = negative ? T_NEG_NUM : T_POS_NUM,
263 .number = negative ? -x : x,
/* Stores error message ERROR in *TOKEN as a T_STRING token whose string is
   ss_cstr (ERROR).
   NOTE(review): presumably ss_cstr() wraps ERROR without copying, so the
   token would take over the caller's string; confirm against libpspp. */
271 tokenize_error__ (struct token *token, char *error)
273 *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
/* Converts the hex or Unicode string segment S, whose kind is given by
   TYPE, into a token stored in *TOKEN.

   Returns TOKENIZE_TOKEN with a T_STRING token holding the decoded bytes, or
   TOKENIZE_ERROR with the scanner's error message stored through
   tokenize_error__().
   NOTE(review): the trimming statements and the test that chooses between
   the two outcomes are not visible in this view; presumably the success path
   runs when ERROR is null. */
276 static enum tokenize_result
277 tokenize_string_segment__ (enum segment_type type,
278 struct substring s, struct token *token)
280 /* Trim X' or U' from front and ' from back. */
/* SEG_HEX_STRING goes to the hex scanner; every other TYPE that reaches this
   function goes to the Unicode scanner. */
284 struct substring out = SS_EMPTY_INITIALIZER;
285 char *error = (type == SEG_HEX_STRING
286 ? scan_hex_string__ (s, &out)
287 : scan_unicode_string__ (s, &out));
/* Both scanners request one byte more than the decoded length, which is
   where this terminator goes. */
290 out.string[out.length] = '\0';
291 *token = (struct token) { .type = T_STRING, .string = out };
292 return TOKENIZE_TOKEN;
296 tokenize_error__ (token, error);
298 return TOKENIZE_ERROR;
/* Stores in *TOKEN an error message that names the character at the start
   of *S.  The character is decoded with u8_mbtouc() and rendered by
   uc_name() for the "Bad character %s in input." message.
   NOTE(review): the declarations of UC and C_NAME are not visible in this
   view. */
303 tokenize_unexpected_char (const struct substring *s, struct token *token)
306 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
309 tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
310 uc_name (uc, c_name)));
/* Converts segment S, whose kind is TYPE, into a token.

   In the visible cases the function returns:

   - TOKENIZE_TOKEN after storing a token in *TOKEN.

   - TOKENIZE_EMPTY when the segment produces no token.

   - TOKENIZE_ERROR after storing an error message in *TOKEN as a T_STRING
     token, via tokenize_error__().

   NOTE(review): the end of the parameter list, the switch header, and a
   number of case labels are not visible in this view, so some statements
   below cannot be tied to a segment type from here. */
314 token_from_segment (enum segment_type type, struct substring s,
320 scan_number__ (s, token);
321 return TOKENIZE_TOKEN;
323 case SEG_QUOTED_STRING:
324 scan_quoted_string (s, token);
325 return TOKENIZE_TOKEN;
/* Unlike the cases above, this one can fail, so the helper's result is
   returned directly. */
328 case SEG_UNICODE_STRING:
329 return tokenize_string_segment__ (type, s, token);
/* These segments become T_STRING tokens whose string is filled in from S by
   ss_alloc_substring(), with no quote or escape processing here. */
331 case SEG_UNQUOTED_STRING:
332 case SEG_DO_REPEAT_COMMAND:
333 case SEG_INLINE_DATA:
337 *token = (struct token) { .type = T_STRING };
338 ss_alloc_substring (&token->string, s);
339 return TOKENIZE_TOKEN;
/* Reserved words carry no string; the token type alone identifies them. */
341 case SEG_RESERVED_WORD:
342 *token = (struct token) { .type = scan_reserved_word__ (s) };
343 return TOKENIZE_TOKEN;
346 *token = (struct token) { .type = T_ID };
347 ss_alloc_substring (&token->string, s);
348 return TOKENIZE_TOKEN;
351 *token = (struct token) { .type = T_MACRO_ID };
352 ss_alloc_substring (&token->string, s);
353 return TOKENIZE_TOKEN;
/* Only punctuation that scan_punct__() does not recognize, reported as
   T_MACRO_PUNCT, keeps its text in the token. */
356 *token = (struct token) { .type = scan_punct__ (s) };
357 if (token->type == T_MACRO_PUNCT)
358 ss_alloc_substring (&token->string, s);
359 return TOKENIZE_TOKEN;
365 case SEG_COMMENT_COMMAND:
366 return TOKENIZE_EMPTY;
/* The start of a document is reported as the identifier "DOCUMENT",
   whatever the text of S is. */
368 case SEG_START_DOCUMENT:
369 *token = (struct token) { .type = T_ID };
370 ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
371 return TOKENIZE_TOKEN;
/* All three command-boundary segments map to the same T_ENDCMD token. */
373 case SEG_START_COMMAND:
374 case SEG_SEPARATE_COMMANDS:
375 case SEG_END_COMMAND:
376 *token = (struct token) { .type = T_ENDCMD };
377 return TOKENIZE_TOKEN;
380 *token = (struct token) { .type = T_STOP };
381 return TOKENIZE_TOKEN;
/* The remaining cases report segmentation problems as error messages. */
383 case SEG_EXPECTED_QUOTE:
384 tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
385 return TOKENIZE_ERROR;
387 case SEG_EXPECTED_EXPONENT:
388 tokenize_error__ (token,
389 xasprintf (_("Missing exponent following `%.*s'."),
390 (int) s.length, s.string));
391 return TOKENIZE_ERROR;
393 case SEG_UNEXPECTED_CHAR:
394 tokenize_unexpected_char (&s, token);
395 return TOKENIZE_ERROR;
402 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
405 SLEX has no internal state to free, but it retains a reference to INPUT, so
406 INPUT must not be modified or freed while SLEX is still in use. */
/* MODE and IS_SNIPPET are passed straight to segmenter_init(), whose result
   becomes SLEX's segmenter.
   NOTE(review): part of the comment above and the other members of the
   initializer are not visible in this view. */
408 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
409 enum segmenter_mode mode, bool is_snippet)
411 *slex = (struct string_lexer) {
415 .segmenter = segmenter_init (mode, is_snippet),
/* Obtains a token from SLEX and stores it in *TOKEN.  When a token is
   produced, returns SLR_END if it is T_STOP and SLR_TOKEN otherwise.
   NOTE(review): the surrounding loop, the update of SLEX's offset, and the
   handling of the other token_from_segment() results are not visible in this
   view. */
420 enum string_lexer_result
421 string_lexer_next (struct string_lexer *slex, struct token *token)
/* S and LEFT describe the input that lies at and after the current offset. */
425 const char *s = slex->input + slex->offset;
426 size_t left = slex->length - slex->offset;
427 enum segment_type type;
/* Ask the segmenter for the next segment, passing "true" as its fourth
   argument; N receives the call's result and TYPE the segment's type. */
430 n = segmenter_push (&slex->segmenter, s, left, true, &type);
/* Convert the first N bytes of the remaining input into a token. */
434 switch (token_from_segment (type, ss_buffer (s, n), token))
437 return token->type == T_STOP ? SLR_END : SLR_TOKEN;
/* Builds a substring that holds the bytes of A followed by the bytes of B,
   in a new buffer from xmalloc() with a null terminator after the last
   byte.  A and B themselves are only read.
   NOTE(review): the return statement is not visible in this view;
   presumably OUT is returned. */
448 static struct substring
449 concat (struct substring a, struct substring b)
451 size_t length = a.length + b.length;
/* One extra byte is allocated for the terminator, which is not counted in
   the substring's length. */
452 struct substring out = { .string = xmalloc (length + 1), .length = length };
453 memcpy (out.string, a.string, a.length);
454 memcpy (out.string + a.length, b.string, b.length);
455 out.string[length] = '\0';
459 /* Attempts to merge a sequence of tokens together into a single token. The
460 caller feeds tokens in one by one and the merger FSM reports progress. The
461 caller must supply a merger structure M that is set to MERGER_INIT before
462 the first call. The caller must also supply a token OUT for storage, which
463 need not be initialized.
467 * -1 if more tokens are needed. Token OUT might be in use for temporary
468 storage; to ensure that it is freed, continue calling merger_add() until
469 it returns something other than -1. (T_STOP or T_ENDCMD will make it do
472 * 0 if the first token submitted to the merger is the output. This is the
473 common case for the first call, and it can be returned for subsequent
476 * A positive number if OUT is initialized to the output token. The return
477 value is the number of tokens being merged to produce this one. */
479 merger_add (struct merger *m, const struct token *in, struct token *out)
481 /* We perform two different kinds of token merging:
483 - String concatenation, where syntax like "a" + "b" is converted into a
484 single string token. This is definitely needed because the parser
487 - Negative number merging, where syntax like -5 is converted from a pair
488 of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM). This
489 might not be needed anymore because the segmenter directly treats a dash
490 followed by a number, with optional intervening white space, as a
491 negative number. It's only needed if we want intervening comments to be
492 allowed or for part of the negative number token to be produced by macro
497 if (in->type == T_DASH || in->type == T_STRING)
506 if (out->type == T_DASH)
508 if (in->type == T_POS_NUM)
510 *out = (struct token) {
512 .number = -in->number
520 return in->type == T_PLUS ? -1 : 0;
524 if (in->type == T_STRING)
526 out->string = concat (out->string, in->string);
535 return in->type == T_PLUS ? -1 : m->state - 1;
538 if (in->type == T_STRING)
540 struct substring s = concat (out->string, in->string);
541 ss_swap (&s, &out->string);