1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/scan.h"
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28 #include "libpspp/i18n.h"
30 #include "gl/c-ctype.h"
31 #include "gl/c-strtod.h"
32 #include "gl/xmemdup0.h"
/* Shorthand applied to the message strings in this file: expands to a
   gettext() call on MSGID. */
35 #define _(msgid) gettext (msgid)
/* Returns the integer value of (hex) digit C: 0 through 9 for the decimal
   digits and 10 through 15 for the letters a-f in either case.

   Returns INT_MAX for any other character, so that callers can reject an
   invalid digit with a single test for a value of 16 or more. */
static int
digit_value (int c)
{
  switch (c)
    {
    case '0': return 0;
    case '1': return 1;
    case '2': return 2;
    case '3': return 3;
    case '4': return 4;
    case '5': return 5;
    case '6': return 6;
    case '7': return 7;
    case '8': return 8;
    case '9': return 9;
    case 'a': case 'A': return 10;
    case 'b': case 'B': return 11;
    case 'c': case 'C': return 12;
    case 'd': case 'D': return 13;
    case 'e': case 'E': return 14;
    case 'f': case 'F': return 15;
    default: return INT_MAX;
    }
}
/* Converts the quoted-string segment IN into a T_STRING token stored in
   *TOKEN.  The first byte of IN is taken to be the quote character.

   NOTE(review): this view lacks several lines of the function (braces, the
   loop header and its exit test, and the statements that strip the outer
   quotes), so the comments below describe only the statements shown. */
64 scan_quoted_string (struct substring in, struct token *token)
66 /* Trim ' or " from front and back. */
67 int quote = in.string[0];
/* The output is never longer than the input, because each step below copies
   no more bytes than it consumes; one extra byte holds the null terminator. */
71 struct substring out = { .string = xmalloc (in.length + 1) };
/* Locate the next occurrence of the quote character in the rest of IN. */
75 size_t pos = ss_find_byte (in, quote);
/* Copy everything up to and including that quote, then skip one byte more
   than was copied.  Presumably the skipped byte is the second quote of a
   doubled pair, so that the pair collapses to a single quote (TODO confirm
   against the missing loop lines). */
79 memcpy (ss_end (out), in.string, pos + 1);
80 out.length += pos + 1;
81 ss_advance (&in, pos + 2);
/* Append whatever remains of IN and null-terminate the result. */
84 memcpy (ss_end (out), in.string, in.length);
85 out.length += in.length;
86 out.string[out.length] = '\0';
/* Hand the buffer to the token without copying it. */
88 *token = (struct token) { .type = T_STRING, .string = out };
/* Decodes the hex digits in IN into bytes stored in *OUT, one output byte per
   pair of digits.  Each visible failure path returns a message built by
   xasprintf().

   NOTE(review): the return type and the success return are not visible here.
   tokenize_string_segment__() stores the result in a char pointer. */
92 scan_hex_string__ (struct substring in, struct substring *out)
/* Digits are consumed in pairs, so an odd count cannot be decoded. */
94 if (in.length % 2 != 0)
95 return xasprintf (_("String of hex digits has %zu characters, which "
96 "is not a multiple of 2."), in.length);
/* Reserve one byte per digit pair plus one spare byte;
   tokenize_string_segment__() later writes a null terminator at
   out->string[out->length]. */
98 ss_realloc (out, in.length / 2 + 1);
99 uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
100 out->length = in.length / 2;
101 for (size_t i = 0; i < in.length; i += 2)
103 int hi = digit_value (in.string[i]);
104 int lo = digit_value (in.string[i + 1]);
/* digit_value() yields INT_MAX for a non-hex character, hence the test for
   16 or more.  The message names the first offending character of the pair:
   the high digit if it is bad, otherwise the low digit. */
106 if (hi >= 16 || lo >= 16)
107 return xasprintf (_("`%c' is not a valid hex digit."),
108 in.string[hi >= 16 ? i : i + 1]);
/* Combine the two 4-bit values into one output byte. */
110 *dst++ = hi * 16 + lo;
/* Interprets IN as the hex digits of a single Unicode code point and stores
   that code point in *OUT as encoded by u8_uctomb().  Each visible failure
   path returns a message built by xasprintf().

   NOTE(review): the declaration of UC, the test guarding the bad-digit
   return, the trailing arguments of two xasprintf() calls, and the success
   return are not visible in this view. */
117 scan_unicode_string__ (struct substring in, struct substring *out)
/* Accept between 1 and 8 hex digits. */
119 if (in.length < 1 || in.length > 8)
120 return xasprintf (_("Unicode string contains %zu bytes, which is "
121 "not in the valid range of 1 to 8 bytes."),
/* Accumulate the digits, most significant first, into UC. */
125 for (size_t i = 0; i < in.length; i++)
127 int digit = digit_value (in.string[i]);
/* NOTE(review): presumably guarded by a test that DIGIT is 16 or more, as in
   scan_hex_string__(); the guard line is missing from this view. */
129 return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
130 uc = uc * 16 + digit;
/* Reject values from 0xd800 up to but not including 0xe000 (the surrogate
   range) and values above 0x10ffff. */
133 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
134 return xasprintf (_("U+%04llX is not a valid Unicode code point."),
/* Reserve 4 bytes for the encoded character plus one spare byte;
   tokenize_string_segment__() later writes a null terminator there.  The
   value returned by u8_uctomb() becomes the new length of *OUT. */
137 ss_realloc (out, 4 + 1);
138 out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
/* Returns the token type for reserved word WORD.  Dispatch is on the first
   character of WORD, folded to upper case; where that is not enough, the
   second character or the length of WORD decides.

   NOTE(review): the case labels and several returns are missing from this
   view, so which first letter leads to each visible return is inferred from
   the token names only. */
143 static enum token_type
144 scan_reserved_word__ (struct substring word)
146 switch (c_toupper (word.string[0]))
/* Second letter E selects T_GE, anything else T_GT. */
155 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
/* Second letter E selects T_LE, anything else T_LT. */
158 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
/* A two-character word is T_NE; any other length is T_NOT. */
161 return word.length == 2 ? T_NE : T_NOT;
/* Second letter L selects T_ALL, anything else T_AND. */
170 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
/* Returns the token type for the single punctuation character C0.  Any
   character without a case of its own yields T_MACRO_PUNCT.

   NOTE(review): the switch header and braces are missing from this view. */
179 static enum token_type
180 scan_punct1__ (char c0)
184 case '(': return T_LPAREN;
185 case ')': return T_RPAREN;
186 case ',': return T_COMMA;
187 case '=': return T_EQUALS;
188 case '-': return T_DASH;
189 case '[': return T_LBRACK;
190 case ']': return T_RBRACK;
191 case '{': return T_LCURLY;
192 case '}': return T_RCURLY;
193 case '&': return T_AND;
194 case '|': return T_OR;
195 case '+': return T_PLUS;
196 case '/': return T_SLASH;
197 case '*': return T_ASTERISK;
198 case '<': return T_LT;
199 case '>': return T_GT;
200 case '~': return T_NOT;
201 case ';': return T_SEMICOLON;
202 case ':': return T_COLON;
/* Fallback for every other character; token_from_segment() keeps a copy of
   the text for tokens of this type. */
203 default: return T_MACRO_PUNCT;
/* Returns the token type for the two-character punctuator made of C0
   followed by C1.

   NOTE(review): almost all of the body is missing from this view.  Only one
   return is visible, and the value of C0 that reaches it cannot be seen. */
209 static enum token_type
210 scan_punct2__ (char c0, char c1)
/* On this path a second character of = yields T_LE, anything else T_NE. */
218 return c1 == '=' ? T_LE : T_NE;
/* Returns the token type for punctuator S.  A one-byte S is classified by
   scan_punct1__(); any other length is passed to scan_punct2__(), which reads
   the first two bytes of S. */
236 static enum token_type
237 scan_punct__ (struct substring s)
239 return (s.length == 1
240 ? scan_punct1__ (s.string[0])
241 : scan_punct2__ (s.string[0], s.string[1]));
/* Parses the number in S and stores it in *TOKEN as T_NEG_NUM or T_POS_NUM.

   NOTE(review): the declarations of BUF and P, the else between the two
   copying strategies, and any cleanup of P are missing from this view. */
245 scan_number__ (struct substring s, struct token *token)
/* c_strtod() takes a null-terminated string, so make a terminated copy of S:
   in BUF when S fits with room for the terminator, otherwise in memory from
   xmemdup0(). */
250 if (s.length < sizeof buf)
253 memcpy (buf, s.string, s.length);
254 buf[s.length] = '\0';
257 p = xmemdup0 (s.string, s.length);
/* A leading dash marks a negative number.  NEGATIVE is 0 or 1, so adding it
   to P steps over the dash before conversion; the sign is reapplied when the
   token is built. */
259 bool negative = *p == '-';
260 double x = c_strtod (p + negative, NULL);
261 *token = (struct token) {
262 .type = negative ? T_NEG_NUM : T_POS_NUM,
263 .number = negative ? -x : x,
/* Stores ERROR in *TOKEN as a T_STRING token.  The token string refers to
   ERROR itself (via ss_cstr()) rather than to a copy; every caller visible in
   this file passes a string freshly built by xasprintf().

   NOTE(review): the return type and braces are missing from this view. */
271 tokenize_error__ (struct token *token, char *error)
273 *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
/* Converts segment S into a token in *TOKEN.  A TYPE of SEG_HEX_STRING is
   decoded by scan_hex_string__(); any other TYPE goes to
   scan_unicode_string__().  Two outcomes are visible: TOKENIZE_TOKEN with a
   T_STRING token holding the decoded bytes, or TOKENIZE_ERROR with the error
   message stored in the token.

   NOTE(review): the trimming statements, the test on ERROR, and any cleanup
   of OUT on the error path are missing from this view. */
276 static enum tokenize_result
277 tokenize_string_segment__ (enum segment_type type,
278 struct substring s, struct token *token)
280 /* Trim X' or U' from front and ' from back. */
/* Start from an empty substring; the scan helper sizes it with ss_realloc(). */
284 struct substring out = SS_EMPTY_INITIALIZER;
285 char *error = (type == SEG_HEX_STRING
286 ? scan_hex_string__ (s, &out)
287 : scan_unicode_string__ (s, &out));
/* Presumably the branch for a null ERROR: terminate the decoded bytes (both
   helpers reserve one spare byte for this) and hand the buffer to the
   token. */
290 out.string[out.length] = '\0';
291 *token = (struct token) { .type = T_STRING, .string = out };
292 return TOKENIZE_TOKEN;
/* Otherwise report the failure through the token. */
296 tokenize_error__ (token, error);
298 return TOKENIZE_ERROR;
/* Stores in *TOKEN an error message that names the first character of *S.

   NOTE(review): the return type and the declarations of UC and C_NAME are
   missing from this view. */
303 tokenize_unexpected_char (const struct substring *s, struct token *token)
/* Decode the first character of S into UC.  The value returned by
   u8_mbtouc() is not used. */
306 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
/* uc_name() supplies the text for the character; presumably C_NAME is the
   buffer it writes into (TODO confirm). */
309 tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
310 uc_name (uc, c_name)));
/* Converts the segment with text S and segment type TYPE into a token stored
   in *TOKEN.  The visible paths return:

   - TOKENIZE_TOKEN when *TOKEN has been filled in with a token.

   - TOKENIZE_EMPTY when the segment produces no token; *TOKEN is not written
     on that path.

   - TOKENIZE_ERROR when the segment is invalid; *TOKEN is then a T_STRING
     token whose string is the error message.

   NOTE(review): the return type, the switch header, and a number of case
   labels are missing from this view, so some groups below are described only
   by what their statements do. */
314 token_from_segment (enum segment_type type, struct substring s,
/* Numbers. */
320 scan_number__ (s, token);
321 return TOKENIZE_TOKEN;
323 case SEG_QUOTED_STRING:
324 scan_quoted_string (s, token);
325 return TOKENIZE_TOKEN;
/* Unicode strings are decoded by the shared string-segment helper, which can
   fail. */
328 case SEG_UNICODE_STRING:
329 return tokenize_string_segment__ (type, s, token);
/* These segments become a T_STRING token holding a copy of the segment
   text. */
331 case SEG_UNQUOTED_STRING:
332 case SEG_DO_REPEAT_COMMAND:
333 case SEG_INLINE_DATA:
337 *token = (struct token) { .type = T_STRING, .string = ss_clone (s) };
338 return TOKENIZE_TOKEN;
/* Reserved words carry only a token type, with no string. */
340 case SEG_RESERVED_WORD:
341 *token = (struct token) { .type = scan_reserved_word__ (s) };
342 return TOKENIZE_TOKEN;
/* T_ID token with a copy of the segment text. */
345 *token = (struct token) { .type = T_ID, .string = ss_clone (s) };
346 return TOKENIZE_TOKEN;
/* T_MACRO_ID token with a copy of the segment text. */
349 *token = (struct token) { .type = T_MACRO_ID, .string = ss_clone (s)};
350 return TOKENIZE_TOKEN;
/* Punctuation.  Only T_MACRO_PUNCT, the fallback type of scan_punct1__(),
   keeps a copy of the text. */
353 *token = (struct token) { .type = scan_punct__ (s) };
354 if (token->type == T_MACRO_PUNCT)
355 token->string = ss_clone (s);
356 return TOKENIZE_TOKEN;
/* Segments that yield no token at all. */
362 case SEG_COMMENT_COMMAND:
363 return TOKENIZE_EMPTY;
/* The token built here carries the fixed text DOCUMENT rather than the
   segment text.  NOTE(review): the line giving its type is not visible. */
365 case SEG_START_DOCUMENT:
366 *token = (struct token) {
368 .string = ss_clone (ss_cstr ("DOCUMENT"))
370 return TOKENIZE_TOKEN;
/* All three command-boundary segments map to the same T_ENDCMD token. */
372 case SEG_START_COMMAND:
373 case SEG_SEPARATE_COMMANDS:
374 case SEG_END_COMMAND:
375 *token = (struct token) { .type = T_ENDCMD };
376 return TOKENIZE_TOKEN;
/* T_STOP token; string_lexer_next() reports it as SLR_END. */
379 *token = (struct token) { .type = T_STOP };
380 return TOKENIZE_TOKEN;
/* Error segments: the message is stored in the token by tokenize_error__().
   NOTE(review): the first call below passes a translated string as the
   format argument of xasprintf() with no further arguments; worth checking
   whether a plain string copy was intended. */
382 case SEG_EXPECTED_QUOTE:
383 tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
384 return TOKENIZE_ERROR;
/* S is not null-terminated, so the message prints exactly s.length bytes of
   it through the precision argument. */
386 case SEG_EXPECTED_EXPONENT:
387 tokenize_error__ (token,
388 xasprintf (_("Missing exponent following `%.*s'."),
389 (int) s.length, s.string));
390 return TOKENIZE_ERROR;
392 case SEG_UNEXPECTED_CHAR:
393 tokenize_unexpected_char (&s, token);
394 return TOKENIZE_ERROR;
401 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
   segmenter mode MODE.  MODE and IS_SNIPPET are both passed on to
   segmenter_init() to set up the segmenter embedded in SLEX.

   SLEX has no internal state to free, but it retains a reference to INPUT, so
   INPUT must not be modified or freed while SLEX is still in use.

   NOTE(review): the return type and the other members of the initializer are
   missing from this view. */
407 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
408 enum segmenter_mode mode, bool is_snippet)
410 *slex = (struct string_lexer) {
414 .segmenter = segmenter_init (mode, is_snippet),
/* Obtains a token from SLEX and stores it in *TOKEN.  On the one return that
   is visible, the result is SLR_END if the token is T_STOP and SLR_TOKEN
   otherwise.

   NOTE(review): the surrounding loop, the declaration of N, the update of
   slex->offset, and the other switch cases are missing from this view. */
419 enum string_lexer_result
420 string_lexer_next (struct string_lexer *slex, struct token *token)
/* S is the input from the current offset onward and LEFT is the number of
   bytes from there to the end. */
424 const char *s = slex->input + slex->offset;
425 size_t left = slex->length - slex->offset;
426 enum segment_type type;
/* Ask the segmenter for the next segment; it reports the segment type through
   TYPE.  N is used below as the length of the segment text.  Presumably the
   literal true tells the segmenter that no more input follows S (TODO
   confirm against the segmenter_push() declaration). */
429 n = segmenter_push (&slex->segmenter, s, left, true, &type);
/* Turn the first N bytes at S into a token. */
433 switch (token_from_segment (type, ss_buffer (s, n), token))
436 return token->type == T_STOP ? SLR_END : SLR_TOKEN;
/* Builds a substring in memory newly allocated by xmalloc() that holds the
   bytes of A followed by the bytes of B, plus a null terminator that is not
   counted in its length.  A and B themselves are not modified.

   NOTE(review): the return statement is missing from this view; presumably
   OUT is returned. */
447 static struct substring
448 concat (struct substring a, struct substring b)
450 size_t length = a.length + b.length;
/* One byte beyond LENGTH is allocated for the terminator. */
451 struct substring out = { .string = xmalloc (length + 1), .length = length };
452 memcpy (out.string, a.string, a.length);
453 memcpy (out.string + a.length, b.string, b.length);
454 out.string[length] = '\0';
458 /* Attempts to merge a sequence of tokens together into a single token.  The
   caller feeds tokens in one by one and the merger FSM reports progress.  The
   caller must supply a merger structure M that is set to MERGER_INIT before
   the first call.  The caller must also supply a token OUT for storage, which
   need not be initialized.

   Returns:

   * -1 if more tokens are needed.  Token OUT might be in use for temporary
     storage; to ensure that it is freed, continue calling merger_add() until
     it returns something other than -1.  (T_STOP or T_ENDCMD will make it do
     so.)

   * 0 if the first token submitted to the merger is the output.  This is the
     common case for the first call, and it can be returned for subsequent
     calls as well.

   * A positive number if OUT is initialized to the output token.  The return
     value is the number of tokens being merged to produce this one. */
478 merger_add (struct merger *m, const struct token *in, struct token *out)
480 /* We perform two different kinds of token merging:
482 - String concatenation, where syntax like "a" + "b" is converted into a
483 single string token. This is definitely needed because the parser
486 - Negative number merging, where syntax like -5 is converted from a pair
487 of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM). This
488 might not be needed anymore because the segmenter directly treats a dash
489 followed by a number, with optional intervening white space, as a
490 negative number. It's only needed if we want intervening comments to be
491 allowed or for part of the negative number token to be produced by macro
496 if (in->type == T_DASH || in->type == T_STRING)
505 if (out->type == T_DASH)
507 if (in->type == T_POS_NUM)
509 *out = (struct token) {
511 .number = -in->number
519 return in->type == T_PLUS ? -1 : 0;
523 if (in->type == T_STRING)
525 out->string = concat (out->string, in->string);
534 return in->type == T_PLUS ? -1 : m->state - 1;
537 if (in->type == T_STRING)
539 struct substring s = concat (out->string, in->string);
540 ss_swap (&s, &out->string);