1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/scan.h"
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28 #include "libpspp/i18n.h"
30 #include "gl/c-ctype.h"
31 #include "gl/c-strtod.h"
32 #include "gl/xmemdup0.h"
35 #define _(msgid) gettext (msgid)
/* Bit flags stored in a scanner's `substate' member.  scan_string__() sets
   them through add_bit(): SS_PLUS when a lone `+' arrives, and one of the
   SS_NL_* bits depending on whether SS_PLUS is already set.  A further string
   segment is only appended to the token once SS_PLUS is set.
   NOTE(review): the NL in the names suggests newline segments, but the case
   label guarding those bits is not visible in this view -- confirm. */
44 #define SS_NL_BEFORE_PLUS (1u << 0)
45 #define SS_PLUS (1u << 1)
46 #define SS_NL_AFTER_PLUS (1u << 2)
48 /* Returns the integer value of (hex) digit C.  The letters A through F are
   accepted in either case and map to 10 through 15.  Any character that is
   not a hex digit yields INT_MAX; scan_hex_string__() treats any result of
   16 or more as an invalid digit. */
64 case 'a': case 'A': return 10;
65 case 'b': case 'B': return 11;
66 case 'c': case 'C': return 12;
67 case 'd': case 'D': return 13;
68 case 'e': case 'E': return 14;
69 case 'f': case 'F': return 15;
70 default: return INT_MAX;
/* Appends the text of quoted string segment S to TOKEN->string.  The quote
   character is taken to be whichever byte ends S.  When that character is
   found inside S, it is copied once and the byte following it is skipped, so
   a pair of quote characters contributes a single one to the output.
   NOTE(review): the loop control and the return statements are not visible
   in this view. */
75 scan_quoted_string__ (struct substring s, struct token *token)
79 /* Trim ' or " from front and back. */
80 quote = s.string[s.length - 1];
/* Reserve room for all of S plus one extra byte past the current length;
   scan_string_segment__() later stores a null terminator at the end. */
84 ss_realloc (&token->string, token->string.length + s.length + 1);
/* Locate the next occurrence of the quote character within S. */
88 size_t pos = ss_find_byte (s, quote);
/* Copy everything up to and including that quote, then step over both it
   and the byte that follows it. */
92 memcpy (ss_end (token->string), s.string, pos + 1);
93 token->string.length += pos + 1;
94 ss_advance (&s, pos + 2);
/* Copy whatever remains of S. */
97 memcpy (ss_end (token->string), s.string, ss_length (s));
98 token->string.length += ss_length (s);
/* Decodes the hex digits in S two at a time and appends the resulting bytes
   to TOKEN->string.

   On failure TOKEN->type and TOKEN->number describe the problem:
   SCAN_BAD_HEX_LENGTH with the digit count when that count is odd, or
   SCAN_BAD_HEX_DIGIT with the offending character when a byte of S is not a
   hex digit.
   NOTE(review): the return statements are not visible in this view. */
104 scan_hex_string__ (struct substring s, struct token *token)
109 /* Trim X' from front and ' from back. */
/* Each output byte needs exactly two digits, so an odd count is an error. */
113 if (s.length % 2 != 0)
115 token->type = SCAN_BAD_HEX_LENGTH;
116 token->number = s.length;
/* Reserve one byte per digit pair plus one extra byte past the current
   length, and account for the new bytes before filling them in. */
120 ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
121 dst = CHAR_CAST (uint8_t *, ss_end (token->string));
122 token->string.length += s.length / 2;
123 for (i = 0; i < s.length; i += 2)
125 int hi = digit_value (s.string[i]);
126 int lo = digit_value (s.string[i + 1]);
/* digit_value() yields INT_MAX for a non-hex character, hence the >= 16
   test; report whichever digit of the pair was the bad one. */
128 if (hi >= 16 || lo >= 16)
130 token->type = SCAN_BAD_HEX_DIGIT;
131 token->number = s.string[hi >= 16 ? i : i + 1];
135 *dst++ = hi * 16 + lo;
/* Interprets the 1 to 8 hex digits in S as a single Unicode code point and
   appends its UTF-8 encoding to TOKEN->string.

   On failure TOKEN->type is set to SCAN_BAD_UNICODE_LENGTH (with the digit
   count in TOKEN->number), SCAN_BAD_UNICODE_DIGIT (with the offending
   character in TOKEN->number), or SCAN_BAD_UNICODE_CODE_POINT.
   NOTE(review): the return statements, and whatever is stored in
   TOKEN->number for a bad code point, are not visible in this view. */
142 scan_unicode_string__ (struct substring s, struct token *token)
148 /* Trim U' from front and ' from back. */
152 if (s.length < 1 || s.length > 8)
154 token->type = SCAN_BAD_UNICODE_LENGTH;
155 token->number = s.length;
/* Reserve 4 bytes (the buffer size passed to u8_uctomb() below) plus one
   extra byte past the current length. */
159 ss_realloc (&token->string, token->string.length + 4 + 1);
/* Accumulate the code point one hex digit at a time, most significant
   digit first. */
162 for (i = 0; i < s.length; i++)
164 int digit = digit_value (s.string[i]);
167 token->type = SCAN_BAD_UNICODE_DIGIT;
168 token->number = s.string[i];
171 uc = uc * 16 + digit;
/* Reject the surrogate range U+D800..U+DFFF and anything above U+10FFFF. */
174 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
176 token->type = SCAN_BAD_UNICODE_CODE_POINT;
/* Encode directly into the space reserved above and grow the string by the
   number of bytes u8_uctomb() reports. */
181 dst = CHAR_CAST (uint8_t *, ss_end (token->string));
182 token->string.length += u8_uctomb (dst, uc, 4);
/* Converts string segment S of the given TYPE into token text by calling the
   helper that matches TYPE.  Each helper appends the decoded bytes to
   TOKEN->string and reports its outcome in OK.
   NOTE(review): the test on OK and the values returned are not visible in
   this view. */
187 static enum scan_result
188 scan_string_segment__ (struct scanner *scanner, enum segment_type type,
189 struct substring s, struct token *token)
195 case SEG_QUOTED_STRING:
196 ok = scan_quoted_string__ (s, token);
200 ok = scan_hex_string__ (s, token);
203 case SEG_UNICODE_STRING:
204 ok = scan_unicode_string__ (s, token);
/* Presumably the success path: the token becomes a null-terminated T_STRING
   and the scanner enters S_STRING with no substate bits set, so later
   segments are routed to scan_string__(). */
213 token->type = T_STRING;
214 token->string.string[token->string.length] = '\0';
215 scanner->state = S_STRING;
216 scanner->substate = 0;
221 /* The function we called above should have filled in token->type and
222 token->number properly to describe the error. */
/* Discard any text accumulated so far and leave TOKEN with an empty
   string. */
223 ss_dealloc (&token->string);
224 token->string = ss_empty ();
/* Sets BIT in SCANNER->substate if it is not set already.
   NOTE(review): the values returned for the "newly set" and "already set"
   cases are not visible in this view. */
230 static enum scan_result
231 add_bit (struct scanner *scanner, unsigned int bit)
233 if (!(scanner->substate & bit))
235 scanner->substate |= bit;
/* Handles a segment pushed after a string segment has already been scanned;
   scanner_push() dispatches here.  SCANNER->substate records, as SS_* bits,
   what has arrived since the last string segment.
   NOTE(review): several case labels and the fallback return values are not
   visible in this view. */
242 static enum scan_result
243 scan_string__ (struct scanner *scanner, enum segment_type type,
244 struct substring s, struct token *token)
/* Record this segment as coming after or before the `+', depending on
   whether a `+' has been seen yet. */
253 if (scanner->substate & SS_PLUS)
254 return add_bit (scanner, SS_NL_AFTER_PLUS);
256 return add_bit (scanner, SS_NL_BEFORE_PLUS);
/* A lone `+' is recorded in the substate. */
259 return (s.length == 1 && s.string[0] == '+'
260 ? add_bit (scanner, SS_PLUS)
/* A further string segment is appended to the token only when a `+' has
   been seen since the previous string segment. */
263 case SEG_QUOTED_STRING:
265 case SEG_UNICODE_STRING:
266 return (scanner->substate & SS_PLUS
267 ? scan_string_segment__ (scanner, type, s, token)
/* Returns the token type for reserved word WORD.  The word is classified
   case-insensitively from its first character; where two reserved words
   share a first character, the second character or the length of WORD
   decides between them.
   NOTE(review): the case labels are not visible in this view, and nothing
   visible validates WORD beyond the characters inspected. */
275 static enum token_type
276 scan_reserved_word__ (struct substring word)
278 switch (c_toupper (word.string[0]))
/* Second character E selects T_GE, anything else T_GT. */
287 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
/* Second character E selects T_LE, anything else T_LT. */
290 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
/* A two-character word is T_NE, a longer one T_NOT. */
293 return word.length == 2 ? T_NE : T_NOT;
/* Second character L selects T_ALL, anything else T_AND. */
302 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
/* Returns the token type for the single punctuation character C0.  Any
   character without an entry below is classified as T_MACRO_PUNCT. */
311 static enum token_type
312 scan_punct1__ (char c0)
316 case '(': return T_LPAREN;
317 case ')': return T_RPAREN;
318 case ',': return T_COMMA;
319 case '=': return T_EQUALS;
320 case '-': return T_DASH;
321 case '[': return T_LBRACK;
322 case ']': return T_RBRACK;
323 case '&': return T_AND;
324 case '|': return T_OR;
325 case '+': return T_PLUS;
326 case '/': return T_SLASH;
327 case '*': return T_ASTERISK;
328 case '<': return T_LT;
329 case '>': return T_GT;
330 case '~': return T_NOT;
331 default: return T_MACRO_PUNCT;
/* Returns the token type for the two-character punctuator made of C0
   followed by C1.
   NOTE(review): only one branch is visible in this view; the switch on C0
   and its other cases are not. */
337 static enum token_type
338 scan_punct2__ (char c0, char c1)
/* Second character `=' yields T_LE; any other second character yields T_NE
   (presumably the branch for a leading `<' -- confirm). */
346 return c1 == '=' ? T_LE : T_NE;
/* Returns the token type for punctuation segment S: a one-byte segment is
   classified by scan_punct1__(), and any other length by scan_punct2__()
   using the first two bytes of S. */
364 static enum token_type
365 scan_punct__ (struct substring s)
367 return (s.length == 1
368 ? scan_punct1__ (s.string[0])
369 : scan_punct2__ (s.string[0], s.string[1]));
/* Parses the number in S and overwrites *TOKEN with a T_NEG_NUM token when
   S starts with `-', or a T_POS_NUM token otherwise, whose `number' member
   holds the parsed value. */
373 scan_number__ (struct substring s, struct token *token)
/* c_strtod() needs a null-terminated string, which S is not guaranteed to
   be: copy it into the local buffer when it fits... */
378 if (s.length < sizeof buf)
381 memcpy (buf, s.string, s.length);
382 buf[s.length] = '\0';
/* ...and otherwise into a null-terminated heap copy.
   NOTE(review): the matching release of this copy is not visible in this
   view. */
385 p = xmemdup0 (s.string, s.length);
/* Parse the magnitude after skipping any leading `-', then reapply the
   sign. */
387 bool negative = *p == '-';
388 double x = c_strtod (p + negative, NULL);
389 *token = (struct token) {
390 .type = negative ? T_NEG_NUM : T_POS_NUM,
391 .number = negative ? -x : x,
/* Marks TOKEN as SCAN_UNEXPECTED_CHAR and decodes the first UTF-8 character
   of *S into UC.
   NOTE(review): what is done with UC afterwards, and the value returned,
   are not visible in this view; scan_token_to_error() reads TOKEN->number
   as the character for this token type. */
398 static enum scan_result
399 scan_unexpected_char (const struct substring *s, struct token *token)
403 token->type = SCAN_UNEXPECTED_CHAR;
404 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
/* Returns a name for TYPE.  The SCAN_TYPE macro generates one case per scan
   type that returns the type's NAME as a string literal; a TYPE that matches
   none of them is passed on to token_type_to_name().
   NOTE(review): where the SCAN_TYPE list is expanded is not visible in this
   view. */
411 scan_type_to_string (enum scan_type type)
415 #define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
420 return token_type_to_name ((enum token_type) type);
/* Returns true if TYPE lies strictly between SCAN_FIRST and SCAN_LAST, so
   neither of those two bounding values counts as a scan type itself. */
425 is_scan_type (enum scan_type type)
427 return type > SCAN_FIRST && type < SCAN_LAST;
430 /* If TOKEN has the type of a scan error (a subset of those identified by
431 is_scan_type()), returns an appropriate error message. Otherwise, returns
/* Builds the error message for a scan-error TOKEN.  Each message is produced
   by xasprintf() and is filled in from the detail that the scanning code
   stored in TOKEN->number or TOKEN->string.
   NOTE(review): xasprintf() presumably returns heap memory that the caller
   must free -- confirm. */
434 scan_token_to_error (const struct token *token)
/* TOKEN->number is the number of hex digits. */
438 case SCAN_BAD_HEX_LENGTH:
439 return xasprintf (_("String of hex digits has %d characters, which "
440 "is not a multiple of 2."), (int) token->number);
/* TOKEN->number is the offending character.
   NOTE(review): it was copied from a plain `char', so a non-ASCII byte may
   not print meaningfully with %c -- confirm. */
442 case SCAN_BAD_HEX_DIGIT:
443 case SCAN_BAD_UNICODE_DIGIT:
444 return xasprintf (_("`%c' is not a valid hex digit."),
445 (int) token->number);
/* TOKEN->number is the number of hex digits supplied. */
447 case SCAN_BAD_UNICODE_LENGTH:
448 return xasprintf (_("Unicode string contains %d bytes, which is "
449 "not in the valid range of 1 to 8 bytes."),
450 (int) token->number);
/* TOKEN->number is printed as the rejected code point. */
452 case SCAN_BAD_UNICODE_CODE_POINT:
453 return xasprintf (_("U+%04X is not a valid Unicode code point."),
454 (int) token->number);
456 case SCAN_EXPECTED_QUOTE:
457 return xasprintf (_("Unterminated string constant."));
/* TOKEN->string holds the segment text that scan_start__() copied for this
   error. */
459 case SCAN_EXPECTED_EXPONENT:
460 return xasprintf (_("Missing exponent following `%s'."),
461 token->string.string);
/* TOKEN->number is the character, rendered by uc_name(). */
463 case SCAN_UNEXPECTED_CHAR:
466 return xasprintf (_("Bad character %s in input."),
467 uc_name (token->number, c_name));
/* Handles the first segment pushed to SCANNER, filling in TOKEN according
   to the segment's TYPE and text S.  Most segment types map directly to one
   token; string segments and a lone `-' switch SCANNER to another state so
   that following segments can extend the token.
   NOTE(review): several case labels and most return statements are not
   visible in this view. */
474 static enum scan_result
475 scan_start__ (struct scanner *scanner, enum segment_type type,
476 struct substring s, struct token *token)
481 scan_number__ (s, token);
/* String segments are decoded by scan_string_segment__(), which also moves
   the scanner to S_STRING on success. */
484 case SEG_QUOTED_STRING:
486 case SEG_UNICODE_STRING:
487 return scan_string_segment__ (scanner, type, s, token);
/* These segments become a T_STRING holding a copy of the segment text. */
489 case SEG_UNQUOTED_STRING:
490 case SEG_DO_REPEAT_COMMAND:
491 case SEG_INLINE_DATA:
494 token->type = T_STRING;
495 ss_alloc_substring (&token->string, s);
498 case SEG_RESERVED_WORD:
499 token->type = scan_reserved_word__ (s);
504 ss_alloc_substring (&token->string, s);
508 token->type = T_MACRO_ID;
509 ss_alloc_substring (&token->string, s);
/* A lone `-' switches the scanner to S_DASH; scanner_push() hands the
   following segment to scan_dash__(). */
513 if (s.length == 1 && s.string[0] == '-')
515 scanner->state = S_DASH;
/* Only T_MACRO_PUNCT keeps a copy of the segment text here. */
520 token->type = scan_punct__ (s);
521 if (token->type == T_MACRO_PUNCT)
522 ss_alloc_substring (&token->string, s);
530 case SEG_COMMENT_COMMAND:
531 token->type = SCAN_SKIP;
534 case SEG_START_DOCUMENT:
536 ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
539 case SEG_START_COMMAND:
540 case SEG_SEPARATE_COMMANDS:
541 case SEG_END_COMMAND:
542 token->type = T_ENDCMD;
546 token->type = T_STOP;
/* Segmenter-reported errors become the corresponding SCAN_* error types. */
549 case SEG_EXPECTED_QUOTE:
550 token->type = SCAN_EXPECTED_QUOTE;
/* Keep the segment text: scan_token_to_error() includes it in the
   message. */
553 case SEG_EXPECTED_EXPONENT:
554 token->type = SCAN_EXPECTED_EXPONENT;
555 ss_alloc_substring (&token->string, s);
558 case SEG_UNEXPECTED_CHAR:
559 return scan_unexpected_char (&s, token);
/* Handles the segment that follows a lone `-'; scan_start__() sets the
   scanner state to S_DASH and scanner_push() dispatches here.
   NOTE(review): the case labels and return values are not visible in this
   view. */
565 static enum scan_result
566 scan_dash__ (enum segment_type type, struct substring s, struct token *token)
/* Fold the preceding `-' into the number: parse it, then force the type to
   T_NEG_NUM and flip the sign of the parsed value. */
575 scan_number__ (s, token);
576 token->type = T_NEG_NUM;
577 token->number = -token->number;
/* Otherwise the `-' stands alone as T_DASH. */
581 token->type = T_DASH;
586 /* Initializes SCANNER for scanning a token from a sequence of segments.
587 Initializes TOKEN as the output token. (The client retains ownership of
588 TOKEN, but it must be preserved across subsequent calls to scanner_push()
591 A scanner only produces a single token. To obtain the next token,
592 re-initialize it by calling this function again.
594 A scanner does not contain any external references, so nothing needs to be
595 done to destroy one. For the same reason, scanners may be copied with plain
596 struct assignment (or memcpy). */
598 scanner_init (struct scanner *scanner, struct token *token)
/* Only the state is set here; the substate is assigned later, by
   scan_string_segment__(), when the scanner enters S_STRING. */
600 scanner->state = S_START;
/* Overwrite TOKEN wholesale: the type becomes T_STOP and every other member
   is implicitly zeroed.  Nothing here frees a string that TOKEN may have
   referred to before the call. */
601 *token = (struct token) { .type = T_STOP };
604 /* Adds the segment with type TYPE and UTF-8 text S to SCANNER. TOKEN must be
605 the same token passed to scanner_init() for SCANNER, or a copy of it.
606 scanner_push() may modify TOKEN. The client retains ownership of TOKEN,
608 The possible return values are:
610 - SCAN_DONE: All of the segments that have been passed to scanner_push()
611 form the token now stored in TOKEN. SCANNER is now "used up" and must
612 be reinitialized with scanner_init() if it is to be used again.
614 Most tokens only consist of a single segment, so this is the most common
617 - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
618 token. The caller should call scanner_push() again with the next segment.
619 (This won't happen if TYPE is SEG_END indicating the end of input.)
621 - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
622 needs to "save its place" in the stream of segments for a possible
623 future SCAN_BACK return. This value can be returned more than once in a
624 sequence of scanner_push() calls for SCANNER, but the caller only needs
625 to keep track of the most recent position.
627 - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
628 the segments up to and including the segment for which SCAN_SAVE was
629 most recently returned. Segments following that one should be passed to
630 the next scanner to be initialized.
/* Dispatches the segment to the handler for SCANNER's current state:
   scan_start__(), scan_dash__(), or scan_string__().
   NOTE(review): the case labels are not visible in this view; elsewhere in
   this file the state is set to S_START, S_DASH, or S_STRING. */
633 scanner_push (struct scanner *scanner, enum segment_type type,
634 struct substring s, struct token *token)
636 switch (scanner->state)
639 return scan_start__ (scanner, type, s, token);
/* scan_dash__() does not take SCANNER. */
642 return scan_dash__ (type, s, token);
645 return scan_string__ (scanner, type, s, token);
651 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
654 SLEX has no internal state to free, but it retains a reference to INPUT, so
655 INPUT must not be modified or freed while SLEX is still in use. */
657 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
658 enum segmenter_mode mode, bool is_snippet)
/* Assign the whole structure at once; members that are not named in the
   initializer are implicitly zeroed. */
660 *slex = (struct string_lexer) {
/* MODE and IS_SNIPPET are used only to set up the embedded segmenter. */
664 .segmenter = segmenter_init (mode, is_snippet),
/* Scans the next token from SLEX into TOKEN by feeding segments from the
   remaining input to a fresh scanner.  The visible return statement yields
   true unless the token's type is T_STOP.
   NOTE(review): the loop control, the case labels of the switch, and the
   line that advances SLEX->offset are not visible in this view. */
670 string_lexer_next (struct string_lexer *slex, struct token *token)
/* Snapshot of the lexer position, restored in the switch below.
   NOTE(review): SAVED_SEGMENTER has no initializer, so restoring it relies
   on a save having happened first -- confirm against the SCAN_SAVE and
   SCAN_BACK contract documented for scanner_push(). */
672 struct segmenter saved_segmenter;
673 size_t saved_offset = 0;
675 struct scanner scanner;
677 scanner_init (&scanner, token);
/* The unconsumed tail of the input. */
680 const char *s = slex->input + slex->offset;
681 size_t left = slex->length - slex->offset;
682 enum segment_type type;
/* Ask the segmenter for the next segment, passing `true' as its fourth
   argument; N is its length in bytes, as used by ss_buffer() below. */
685 n = segmenter_push (&slex->segmenter, s, left, true, &type);
689 switch (scanner_push (&scanner, type, ss_buffer (s, n), token))
/* Rewind the lexer to the saved position. */
692 slex->segmenter = saved_segmenter;
693 slex->offset = saved_offset;
696 return token->type != T_STOP;
/* Remember the current position for a possible later rewind. */
702 saved_segmenter = slex->segmenter;
703 saved_offset = slex->offset;