1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/scan.h"
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strtod.h"
31 #include "gl/xmemdup0.h"
/* Bit flags kept in scanner->substate.  scan_string__() hands them to
   add_bit() and tests SS_PLUS when deciding what to do with a segment. */
40 #define SS_NL_BEFORE_PLUS (1u << 0)
41 #define SS_PLUS (1u << 1)
42 #define SS_NL_AFTER_PLUS (1u << 2)
44 /* Returns the integer value of (hex) digit C.  The letters a-f are accepted
   in either case. */
60 case 'a': case 'A': return 10;
61 case 'b': case 'B': return 11;
62 case 'c': case 'C': return 12;
63 case 'd': case 'D': return 13;
64 case 'e': case 'E': return 14;
65 case 'f': case 'F': return 15;
/* Not a hex digit: INT_MAX trips the ">= 16" check that scan_hex_string__()
   uses to detect bad digits. */
66 default: return INT_MAX;
/* Appends the body of quoted string segment S to TOKEN->string.  Wherever the
   quote character is found inside the body, one quote byte is copied but two
   bytes are consumed (presumably a doubled quote standing for a literal
   one). */
71 scan_quoted_string__ (struct substring s, struct token *token)
75 /* Trim ' or " from front and back. */
/* The last byte of S tells us which quote character delimits it. */
76 quote = s.string[s.length - 1];
/* Ask for room for the existing text, all of S, and one more byte
   (scan_string_segment__() stores a null terminator after the text). */
80 ss_realloc (&token->string, token->string.length + s.length + 1);
84 size_t pos = ss_find_byte (s, quote);
/* Copy everything up to and including the quote at POS, then step over that
   quote and the byte that follows it. */
88 memcpy (ss_end (token->string), s.string, pos + 1);
89 token->string.length += pos + 1;
90 ss_advance (&s, pos + 2);
/* Copy whatever is left of S. */
93 memcpy (ss_end (token->string), s.string, ss_length (s));
94 token->string.length += ss_length (s);
/* Decodes the pairs of hex digits in segment S and appends one byte per pair
   to TOKEN->string.

   On an odd number of digits, sets TOKEN->type to SCAN_BAD_HEX_LENGTH and
   TOKEN->number to the digit count.  On a character that is not a hex digit,
   sets TOKEN->type to SCAN_BAD_HEX_DIGIT and TOKEN->number to that
   character. */
100 scan_hex_string__ (struct substring s, struct token *token)
105 /* Trim X' from front and ' from back. */
109 if (s.length % 2 != 0)
111 token->type = SCAN_BAD_HEX_LENGTH;
112 token->number = s.length;
/* Two input digits make one output byte; one more byte is requested beyond
   that (scan_string_segment__() stores a null terminator after the text). */
116 ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
117 dst = CHAR_CAST (uint8_t *, ss_end (token->string));
118 token->string.length += s.length / 2;
119 for (i = 0; i < s.length; i += 2)
121 int hi = digit_value (s.string[i]);
122 int lo = digit_value (s.string[i + 1]);
/* digit_value() returns INT_MAX for anything that is not a hex digit. */
124 if (hi >= 16 || lo >= 16)
126 token->type = SCAN_BAD_HEX_DIGIT;
/* Report the first bad character of the pair. */
127 token->number = s.string[hi >= 16 ? i : i + 1];
131 *dst++ = hi * 16 + lo;
/* Interprets segment S as 1 to 8 hex digits that give a Unicode code point and
   appends the bytes that u8_uctomb() produces for it to TOKEN->string.

   On failure TOKEN->type is set to SCAN_BAD_UNICODE_LENGTH (with the digit
   count in TOKEN->number), SCAN_BAD_UNICODE_DIGIT (with the offending
   character in TOKEN->number), or SCAN_BAD_UNICODE_CODE_POINT. */
138 scan_unicode_string__ (struct substring s, struct token *token)
144 /* Trim U' from front and ' from back. */
148 if (s.length < 1 || s.length > 8)
150 token->type = SCAN_BAD_UNICODE_LENGTH;
151 token->number = s.length;
/* u8_uctomb() below is allowed to write at most 4 bytes; one more byte is
   requested beyond that (scan_string_segment__() stores a null terminator
   after the text). */
155 ss_realloc (&token->string, token->string.length + 4 + 1);
/* Accumulate the code point one hex digit at a time. */
158 for (i = 0; i < s.length; i++)
160 int digit = digit_value (s.string[i]);
163 token->type = SCAN_BAD_UNICODE_DIGIT;
164 token->number = s.string[i];
167 uc = uc * 16 + digit;
/* Reject the range 0xd800...0xdfff and anything above 0x10ffff. */
170 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
172 token->type = SCAN_BAD_UNICODE_CODE_POINT;
177 dst = CHAR_CAST (uint8_t *, ss_end (token->string));
/* u8_uctomb() returns the number of bytes it stored at DST. */
178 token->string.length += u8_uctomb (dst, uc, 4);
/* Adds string segment S to TOKEN by calling the scan_*_string__() helper that
   matches TYPE.

   When the helper succeeds, TOKEN becomes a null-terminated T_STRING and
   SCANNER moves to state S_STRING with no substate bits set.  When it fails,
   TOKEN->string is freed and left empty; the helper has already described
   the error in TOKEN. */
183 static enum scan_result
184 scan_string_segment__ (struct scanner *scanner, enum segment_type type,
185 struct substring s, struct token *token)
191 case SEG_QUOTED_STRING:
192 ok = scan_quoted_string__ (s, token);
196 ok = scan_hex_string__ (s, token);
199 case SEG_UNICODE_STRING:
200 ok = scan_unicode_string__ (s, token);
/* Publish the text as a string token.  Each helper above requested one byte
   more than the text needs, which is where the terminator goes. */
209 token->type = T_STRING;
210 token->string.string[token->string.length] = '\0';
/* Enter state S_STRING with every substate bit clear. */
211 scanner->state = S_STRING;
212 scanner->substate = 0;
217 /* The function we called above should have filled in token->type and
218 token->number properly to describe the error. */
/* Discard any text built so far and leave TOKEN->string empty. */
219 ss_dealloc (&token->string);
220 token->string = ss_empty ();
/* Sets BIT in SCANNER->substate when it is not already set there. */
226 static enum scan_result
227 add_bit (struct scanner *scanner, unsigned int bit)
229 if (!(scanner->substate & bit))
231 scanner->substate |= bit;
/* Handles segment S of the given TYPE for scanner_push().  Depending on the
   segment, it records one of the SS_* bits in SCANNER->substate through
   add_bit(), or, for a further string segment once SS_PLUS has been recorded,
   passes the segment to scan_string_segment__(). */
238 static enum scan_result
239 scan_string__ (struct scanner *scanner, enum segment_type type,
240 struct substring s, struct token *token)
/* Which SS_NL_* bit to record depends on whether SS_PLUS is already set. */
249 if (scanner->substate & SS_PLUS)
250 return add_bit (scanner, SS_NL_AFTER_PLUS);
252 return add_bit (scanner, SS_NL_BEFORE_PLUS);
/* Only a segment that is exactly "+" records SS_PLUS. */
255 return (s.length == 1 && s.string[0] == '+'
256 ? add_bit (scanner, SS_PLUS)
259 case SEG_QUOTED_STRING:
261 case SEG_UNICODE_STRING:
/* A string segment is scanned here only when SS_PLUS has been recorded. */
262 return (scanner->substate & SS_PLUS
263 ? scan_string_segment__ (scanner, type, s, token)
/* Returns the token type for reserved word WORD.  Dispatches on the first
   letter, folded to upper case with c_toupper(); where two results share a
   first letter they are told apart by the second letter or by the length of
   WORD. */
271 static enum token_type
272 scan_reserved_word__ (struct substring word)
274 switch (c_toupper (word.string[0]))
283 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
286 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
/* Told apart by length rather than by a letter: a two-byte word is T_NE. */
289 return word.length == 2 ? T_NE : T_NOT;
298 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
/* Returns the token type for the single punctuation character C0, or
   T_MACRO_PUNCT when C0 is not one of the characters listed below. */
307 static enum token_type
308 scan_punct1__ (char c0)
312 case '(': return T_LPAREN;
313 case ')': return T_RPAREN;
314 case ',': return T_COMMA;
315 case '=': return T_EQUALS;
316 case '-': return T_DASH;
317 case '[': return T_LBRACK;
318 case ']': return T_RBRACK;
319 case '&': return T_AND;
320 case '|': return T_OR;
321 case '+': return T_PLUS;
322 case '/': return T_SLASH;
323 case '*': return T_ASTERISK;
324 case '<': return T_LT;
325 case '>': return T_GT;
326 case '~': return T_NOT;
/* scan_start__() keeps the text of a T_MACRO_PUNCT token in TOKEN->string. */
327 default: return T_MACRO_PUNCT;
/* Returns the token type for the two-character punctuator whose characters
   are C0 and C1. */
333 static enum token_type
334 scan_punct2__ (char c0, char c1)
/* A second character of '=' gives T_LE; any other gives T_NE. */
342 return c1 == '=' ? T_LE : T_NE;
/* Returns the token type for punctuator S: a one-byte S goes to
   scan_punct1__(), and anything else goes to scan_punct2__() with its first
   two bytes. */
360 static enum token_type
361 scan_punct__ (struct substring s)
363 return (s.length == 1
364 ? scan_punct1__ (s.string[0])
365 : scan_punct2__ (s.string[0], s.string[1]));
/* Converts the text of S to a number with c_strtod(). */
369 scan_number__ (struct substring s)
/* c_strtod() is handed a null-terminated copy of S: the local buffer BUF when
   S and its terminator fit there... */
375 if (s.length < sizeof buf)
378 memcpy (buf, s.string, s.length);
379 buf[s.length] = '\0';
/* ...or a copy made by xmemdup0(). */
382 p = xmemdup0 (s.string, s.length);
384 number = c_strtod (p, NULL);
/* Marks TOKEN as SCAN_UNEXPECTED_CHAR and decodes the character at the start
   of *S into UC with u8_mbtouc(). */
392 static enum scan_result
393 scan_unexpected_char (const struct substring *s, struct token *token)
397 token->type = SCAN_UNEXPECTED_CHAR;
398 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
/* Returns a name for TYPE.  For each case generated through SCAN_TYPE the
   name is the enumerator's name without its SCAN_ prefix; otherwise it is
   whatever token_type_to_name() returns for TYPE cast to enum token_type. */
405 scan_type_to_string (enum scan_type type)
/* Expands to one switch case per scan type: "case SCAN_X: return "X";". */
409 #define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
414 return token_type_to_name ((enum token_type) type);
/* Tells whether TYPE lies strictly between SCAN_FIRST and SCAN_LAST. */
419 is_scan_type (enum scan_type type)
421 return type > SCAN_FIRST && type < SCAN_LAST;
/* Handles segment S of the given TYPE for scanner_push(): sets TOKEN->type
   and, for some segment types, TOKEN->number or TOKEN->string.  String
   segments and a lone "-" also change SCANNER->state. */
424 static enum scan_result
425 scan_start__ (struct scanner *scanner, enum segment_type type,
426 struct substring s, struct token *token)
/* A number seen here is T_POS_NUM; T_NEG_NUM comes only from scan_dash__(). */
431 token->type = T_POS_NUM;
432 token->number = scan_number__ (s);
435 case SEG_QUOTED_STRING:
437 case SEG_UNICODE_STRING:
/* scan_string_segment__() decodes the text and, on success, moves SCANNER to
   state S_STRING. */
438 return scan_string_segment__ (scanner, type, s, token);
440 case SEG_UNQUOTED_STRING:
441 case SEG_DO_REPEAT_COMMAND:
442 case SEG_INLINE_DATA:
/* These segments become string tokens holding a copy of S as-is. */
445 token->type = T_STRING;
446 ss_alloc_substring (&token->string, s);
449 case SEG_RESERVED_WORD:
450 token->type = scan_reserved_word__ (s);
455 ss_alloc_substring (&token->string, s);
459 token->type = T_MACRO_ID;
460 ss_alloc_substring (&token->string, s);
/* A lone "-" is not turned into a token here; SCANNER moves to state S_DASH
   instead, and scan_dash__() produces either T_NEG_NUM or T_DASH. */
464 if (s.length == 1 && s.string[0] == '-')
466 scanner->state = S_DASH;
471 token->type = scan_punct__ (s);
/* Only T_MACRO_PUNCT tokens keep a copy of their text. */
472 if (token->type == T_MACRO_PUNCT)
473 ss_alloc_substring (&token->string, s);
481 case SEG_COMMENT_COMMAND:
482 token->type = SCAN_SKIP;
485 case SEG_START_DOCUMENT:
/* The token text is the fixed word DOCUMENT, not the text of S. */
487 ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
490 case SEG_START_COMMAND:
491 case SEG_SEPARATE_COMMANDS:
492 case SEG_END_COMMAND:
/* All three command-boundary segments map to the same token type. */
493 token->type = T_ENDCMD;
497 token->type = T_STOP;
500 case SEG_EXPECTED_QUOTE:
501 token->type = SCAN_EXPECTED_QUOTE;
504 case SEG_EXPECTED_EXPONENT:
505 token->type = SCAN_EXPECTED_EXPONENT;
/* Keep the offending text with the error. */
506 ss_alloc_substring (&token->string, s);
509 case SEG_UNEXPECTED_DOT:
510 token->type = SCAN_UNEXPECTED_DOT;
513 case SEG_UNEXPECTED_CHAR:
514 return scan_unexpected_char (&s, token);
/* Handles segment S of the given TYPE for scanner_push().  TOKEN becomes
   either T_NEG_NUM, with TOKEN->number set to the negated value of S, or a
   plain T_DASH. */
520 static enum scan_result
521 scan_dash__ (enum segment_type type, struct substring s, struct token *token)
530 token->type = T_NEG_NUM;
/* scan_number__() parses the digits; the sign is applied here. */
531 token->number = -scan_number__ (s);
535 token->type = T_DASH;
540 /* Initializes SCANNER for scanning a token from a sequence of segments.
541 Initializes TOKEN as the output token. (The client retains ownership of
542 TOKEN, but it must be preserved across subsequent calls to scanner_push()
545 A scanner only produces a single token. To obtain the next token,
546 re-initialize it by calling this function again.
548 A scanner does not contain any external references, so nothing needs to be
549 done to destroy one. For the same reason, scanners may be copied with plain
550 struct assignment (or memcpy). */
552 scanner_init (struct scanner *scanner, struct token *token)
/* Every scanner starts in S_START; scanner_push() switches on this state. */
554 scanner->state = S_START;
558 /* Adds the segment with type TYPE and UTF-8 text S to SCANNER. TOKEN must be
559 the same token passed to scanner_init() for SCANNER, or a copy of it.
560 scanner_push() may modify TOKEN. The client retains ownership of TOKEN,
562 The possible return values are:
564 - SCAN_DONE: All of the segments that have been passed to scanner_push()
565 form the token now stored in TOKEN. SCANNER is now "used up" and must
566 be reinitialized with scanner_init() if it is to be used again.
568 Most tokens only consist of a single segment, so this is the most common
571 - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
572 token. The caller should call scanner_push() again with the next segment.
573 (This won't happen if TYPE is SEG_END indicating the end of input.)
575 - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
576 needs to "save its place" in the stream of segments for a possible
577 future SCAN_BACK return. This value can be returned more than once in a
578 sequence of scanner_push() calls for SCANNER, but the caller only needs
579 to keep track of the most recent position.
581 - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
582 the segments up to and including the segment for which SCAN_SAVE was
583 most recently returned. Segments following that one should be passed to
584 the next scanner to be initialized.
587 scanner_push (struct scanner *scanner, enum segment_type type,
588 struct substring s, struct token *token)
/* Hand the segment to the handler for the scanner's current state. */
590 switch (scanner->state)
593 return scan_start__ (scanner, type, s, token);
/* scan_dash__() is the one handler that does not take SCANNER. */
596 return scan_dash__ (type, s, token);
599 return scan_string__ (scanner, type, s, token);
605 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
608 SLEX has no internal state to free, but it retains a reference to INPUT, so
609 INPUT must not be modified or freed while SLEX is still in use. */
611 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
612 enum segmenter_mode mode)
615 slex->length = length;
/* MODE configures the segmenter embedded in SLEX. */
617 segmenter_init (&slex->segmenter, mode);
/* Scans one token from SLEX into TOKEN: repeatedly takes a segment from
   SLEX's segmenter and pushes it to a freshly initialized scanner.  The
   visible return statement yields false when the token is T_STOP and true
   otherwise. */
622 string_lexer_next (struct string_lexer *slex, struct token *token)
/* A remembered segmenter state and input offset, so that the lexer can be
   rewound to that position later. */
624 struct segmenter saved_segmenter;
625 size_t saved_offset = 0;
627 struct scanner scanner;
629 scanner_init (&scanner, token);
/* The unread part of the input: S points at it and LEFT is its length. */
632 const char *s = slex->input + slex->offset;
633 size_t left = slex->length - slex->offset;
634 enum segment_type type;
/* NOTE(review): the `true' argument presumably tells the segmenter that no
   input follows these LEFT bytes -- confirm against segmenter_push(). */
637 n = segmenter_push (&slex->segmenter, s, left, true, &type);
/* Only the N bytes that make up this segment are passed to the scanner. */
641 switch (scanner_push (&scanner, type, ss_buffer (s, n), token))
/* Rewind to the remembered position. */
644 slex->segmenter = saved_segmenter;
645 slex->offset = saved_offset;
648 return token->type != T_STOP;
/* Remember the current position for a possible later rewind. */
654 saved_segmenter = slex->segmenter;
655 saved_offset = slex->offset;