1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/scan.h"
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/xmemdup0.h"
/* Bit flags kept in scanner->substate.  add_bit() sets them and scan_string__()
   tests them; SS_PLUS is set when a lone '+' punctuator segment is pushed. */
39 #define SS_NL_BEFORE_PLUS (1u << 0)
40 #define SS_PLUS (1u << 1)
41 #define SS_NL_AFTER_PLUS (1u << 2)
43 /* Returns the integer value of (hex) digit C. */
/* Hexadecimal letters are accepted in either case. */
59 case 'a': case 'A': return 10;
60 case 'b': case 'B': return 11;
61 case 'c': case 'C': return 12;
62 case 'd': case 'D': return 13;
63 case 'e': case 'E': return 14;
64 case 'f': case 'F': return 15;
/* Anything else yields INT_MAX, which callers detect as a value of 16 or
   more. */
65 default: return INT_MAX;
/* Appends the text of quoted string segment S to TOKEN's string. */
70 scan_quoted_string__ (struct substring s, struct token *token)
74 /* Trim ' or " from front and back. */
/* The last byte of S is the closing quote, so it identifies which quote
   character delimits this string. */
75 quote = s.string[s.length - 1];
/* Request room for every byte of S plus one spare byte;
   scan_string_segment__() later stores a null terminator just past the
   string's length. */
79 ss_realloc (&token->string, token->string.length + s.length + 1);
/* Look for the next occurrence of the quote character inside S. */
83 size_t pos = ss_find_byte (s, quote);
/* Copy everything up to and including the quote found at POS, then advance
   one byte further than was copied (presumably skipping the second quote of a
   doubled pair -- confirm against the segmenter's quoting rules). */
87 memcpy (ss_end (token->string), s.string, pos + 1);
88 token->string.length += pos + 1;
89 ss_advance (&s, pos + 2);
/* Append whatever remains of S. */
92 memcpy (ss_end (token->string), s.string, ss_length (s));
93 token->string.length += ss_length (s);
/* Decodes the pairs of hex digits in segment S into bytes appended to TOKEN's
   string.  On a malformed segment, stores an error type in TOKEN->type and a
   detail value in TOKEN->number. */
99 scan_hex_string__ (struct substring s, struct token *token)
104 /* Trim X' from front and ' from back. */
/* Two digits make one byte, so an odd digit count is an error; the count is
   reported in TOKEN->number. */
108 if (s.length % 2 != 0)
110 token->type = SCAN_BAD_HEX_LENGTH;
111 token->number = s.length;
/* Request room for s.length / 2 decoded bytes plus one spare byte, and
   account for the decoded bytes in the length before writing them through
   DST. */
115 ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
116 dst = CHAR_CAST (uint8_t *, ss_end (token->string));
117 token->string.length += s.length / 2;
118 for (i = 0; i < s.length; i += 2)
120 int hi = digit_value (s.string[i]);
121 int lo = digit_value (s.string[i + 1]);
/* digit_value() returns INT_MAX for a non-hex character, so 16 or more
   means a bad digit.  TOKEN->number records the offending character,
   preferring the high digit when both are bad. */
123 if (hi >= 16 || lo >= 16)
125 token->type = SCAN_BAD_HEX_DIGIT;
126 token->number = s.string[hi >= 16 ? i : i + 1];
/* Combine the two 4-bit values into one output byte. */
130 *dst++ = hi * 16 + lo;
/* Parses the hex digits in segment S as a single Unicode code point and
   appends its encoding to TOKEN's string.  On a malformed segment, stores an
   error type in TOKEN->type. */
137 scan_unicode_string__ (struct substring s, struct token *token)
143 /* Trim U' from front and ' from back. */
/* Only 1 to 8 hex digits are accepted; any other count is reported in
   TOKEN->number. */
147 if (s.length < 1 || s.length > 8)
149 token->type = SCAN_BAD_UNICODE_LENGTH;
150 token->number = s.length;
/* 4 matches the size limit passed to u8_uctomb() below; the extra 1 is a
   spare byte. */
154 ss_realloc (&token->string, token->string.length + 4 + 1);
/* Accumulate the code point one hex digit at a time. */
157 for (i = 0; i < s.length; i++)
159 int digit = digit_value (s.string[i]);
162 token->type = SCAN_BAD_UNICODE_DIGIT;
163 token->number = s.string[i];
166 uc = uc * 16 + digit;
/* Reject the surrogate range 0xd800...0xdfff and anything above 0x10ffff. */
169 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
171 token->type = SCAN_BAD_UNICODE_CODE_POINT;
/* Encode UC just past the current end of the string and grow the length by
   the number of bytes u8_uctomb() reports. */
176 dst = CHAR_CAST (uint8_t *, ss_end (token->string));
177 token->string.length += u8_uctomb (dst, uc, 4);
/* Decodes string segment S with the helper that matches TYPE and appends the
   result to TOKEN.  The success path makes TOKEN a null-terminated T_STRING
   and puts SCANNER in state S_STRING with no substate bits set; the failure
   path releases TOKEN's string. */
182 static enum scan_result
183 scan_string_segment__ (struct scanner *scanner, enum segment_type type,
184 struct substring s, struct token *token)
190 case SEG_QUOTED_STRING:
191 ok = scan_quoted_string__ (s, token);
195 ok = scan_hex_string__ (s, token);
198 case SEG_UNICODE_STRING:
199 ok = scan_unicode_string__ (s, token);
208 token->type = T_STRING;
/* The helpers request one byte beyond the string's length, which is where
   this terminator is stored. */
209 token->string.string[token->string.length] = '\0';
210 scanner->state = S_STRING;
211 scanner->substate = 0;
216 /* The function we called above should have filled in token->type and
217 token->number properly to describe the error. */
/* Release the string and reset it to empty. */
218 ss_dealloc (&token->string);
219 token->string = ss_empty ();
/* Sets BIT in SCANNER's substate if it is not set already. */
225 static enum scan_result
226 add_bit (struct scanner *scanner, unsigned int bit)
228 if (!(scanner->substate & bit))
230 scanner->substate |= bit;
/* Handles a segment pushed while SCANNER is in state S_STRING (dispatched
   from scanner_push()).  The bits in SCANNER's substate track what has been
   seen since the previous string segment. */
237 static enum scan_result
238 scan_string__ (struct scanner *scanner, enum segment_type type,
239 struct substring s, struct token *token)
/* Record SS_NL_AFTER_PLUS if SS_PLUS is already set, otherwise
   SS_NL_BEFORE_PLUS. */
248 if (scanner->substate & SS_PLUS)
249 return add_bit (scanner, SS_NL_AFTER_PLUS);
251 return add_bit (scanner, SS_NL_BEFORE_PLUS);
/* Only a punctuator that is exactly a single '+' sets SS_PLUS. */
254 return (s.length == 1 && s.string[0] == '+'
255 ? add_bit (scanner, SS_PLUS)
/* Another string segment is appended to TOKEN only when SS_PLUS is set. */
258 case SEG_QUOTED_STRING:
260 case SEG_UNICODE_STRING:
261 return (scanner->substate & SS_PLUS
262 ? scan_string_segment__ (scanner, type, s, token)
/* Returns the token type for reserved word WORD.  The switch dispatches on the
   uppercased first character; where several reserved words share that
   character, the second character or the word's length picks between them. */
270 static enum token_type
271 scan_reserved_word__ (struct substring word)
273 switch (c_toupper (word.string[0]))
282 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
285 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
/* Here the length alone is enough: two characters means T_NE. */
288 return word.length == 2 ? T_NE : T_NOT;
297 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
/* Returns the token type for the single-character punctuator C0. */
306 static enum token_type
307 scan_punct1__ (char c0)
311 case '(': return T_LPAREN;
312 case ')': return T_RPAREN;
313 case ',': return T_COMMA;
314 case '=': return T_EQUALS;
315 case '-': return T_DASH;
316 case '[': return T_LBRACK;
317 case ']': return T_RBRACK;
318 case '&': return T_AND;
319 case '|': return T_OR;
320 case '+': return T_PLUS;
321 case '/': return T_SLASH;
322 case '*': return T_ASTERISK;
323 case '<': return T_LT;
324 case '>': return T_GT;
325 case '~': return T_NOT;
/* Returns the token type for the two-character punctuator made of C0 followed
   by C1 (scan_punct__() passes the first two bytes of the segment). */
331 static enum token_type
332 scan_punct2__ (char c0, char c1)
/* A second character of '=' gives T_LE; any other gives T_NE. */
340 return c1 == '=' ? T_LE : T_NE;
/* Returns the token type for punctuator segment S.  A one-byte segment goes
   to scan_punct1__(); any other length is handled by scan_punct2__() using
   the first two bytes. */
358 static enum token_type
359 scan_punct__ (struct substring s)
361 return (s.length == 1
362 ? scan_punct1__ (s.string[0])
363 : scan_punct2__ (s.string[0], s.string[1]));
/* Converts the text of S to a floating-point value with strtod(). */
367 scan_number__ (struct substring s)
/* S is addressed by pointer and length, so a null-terminated copy is made
   for strtod().  Text that fits in BUF with room for the terminator is copied
   there. */
373 if (s.length < sizeof buf)
376 memcpy (buf, s.string, s.length);
377 buf[s.length] = '\0';
/* Longer text gets a heap copy instead (presumably the else branch --
   confirm; the copy must be freed when P does not point at BUF). */
380 p = xmemdup0 (s.string, s.length);
/* NOTE(review): strtod() honors the decimal point of the current LC_NUMERIC
   locale.  Confirm the numeric locale is "C" whenever this runs, or consider a
   locale-independent parser such as gnulib's c_strtod(). */
382 number = strtod (p, NULL);
/* Marks TOKEN as SCAN_UNEXPECTED_CHAR and decodes the first character of S. */
390 static enum scan_result
391 scan_unexpected_char (const struct substring *s, struct token *token)
395 token->type = SCAN_UNEXPECTED_CHAR;
/* Decode the leading UTF-8 character of S into UC. */
396 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
/* Returns a name for TYPE as a string. */
403 scan_type_to_string (enum scan_type type)
/* Each SCAN_TYPE(NAME) use expands to a switch case that returns NAME as a
   string literal for the value SCAN_NAME. */
407 #define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
/* Other values are named by token_type_to_name(). */
412 return token_type_to_name (type);
/* Tests whether TYPE lies strictly between the SCAN_FIRST and SCAN_LAST
   markers; neither marker itself counts. */
417 is_scan_type (enum scan_type type)
419 return type > SCAN_FIRST && type < SCAN_LAST;
/* Fills in TOKEN from segment S according to its TYPE.  Called from
   scanner_push(). */
422 static enum scan_result
423 scan_start__ (struct scanner *scanner, enum segment_type type,
424 struct substring s, struct token *token)
/* A number seen here is positive; negative numbers are produced by
   scan_dash__(). */
429 token->type = T_POS_NUM;
430 token->number = scan_number__ (s);
/* String segments are decoded by scan_string_segment__(), which moves the
   scanner to S_STRING on success. */
433 case SEG_QUOTED_STRING:
435 case SEG_UNICODE_STRING:
436 return scan_string_segment__ (scanner, type, s, token);
/* These segments become T_STRING tokens whose text is taken from S as-is. */
438 case SEG_UNQUOTED_STRING:
439 case SEG_DO_REPEAT_COMMAND:
440 case SEG_INLINE_DATA:
442 token->type = T_STRING;
443 ss_alloc_substring (&token->string, s);
/* The reserved word's token type comes from scan_reserved_word__(). */
446 case SEG_RESERVED_WORD:
447 token->type = scan_reserved_word__ (s);
452 ss_alloc_substring (&token->string, s);
/* A lone '-' is not turned into a token yet.  The scanner moves to S_DASH,
   and scan_dash__() handles the segment pushed next. */
456 if (s.length == 1 && s.string[0] == '-')
458 scanner->state = S_DASH;
463 token->type = scan_punct__ (s);
471 case SEG_COMMENT_COMMAND:
472 token->type = SCAN_SKIP;
/* The token string here is the literal text DOCUMENT, not the segment's own
   text. */
475 case SEG_START_DOCUMENT:
477 ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
/* All three command-boundary segment types yield T_ENDCMD. */
480 case SEG_START_COMMAND:
481 case SEG_SEPARATE_COMMANDS:
482 case SEG_END_COMMAND:
483 token->type = T_ENDCMD;
487 token->type = T_STOP;
/* Each SEG_EXPECTED_* and SEG_UNEXPECTED_* segment type below maps to the
   SCAN_* error type of the same name. */
490 case SEG_EXPECTED_QUOTE:
491 token->type = SCAN_EXPECTED_QUOTE;
494 case SEG_EXPECTED_EXPONENT:
495 token->type = SCAN_EXPECTED_EXPONENT;
496 ss_alloc_substring (&token->string, s);
499 case SEG_UNEXPECTED_DOT:
500 token->type = SCAN_UNEXPECTED_DOT;
503 case SEG_UNEXPECTED_CHAR:
504 return scan_unexpected_char (&s, token);
/* Handles the segment pushed after a lone '-' put the scanner in state S_DASH
   (dispatched from scanner_push()). */
513 static enum scan_result
514 scan_dash__ (enum segment_type type, struct substring s, struct token *token)
/* The dash combines with the number: the parsed value is negated and the
   token is T_NEG_NUM. */
523 token->type = T_NEG_NUM;
524 token->number = -scan_number__ (s);
/* Otherwise the dash is a T_DASH token on its own. */
528 token->type = T_DASH;
533 /* Initializes SCANNER for scanning a token from a sequence of segments.
534 Initializes TOKEN as the output token. (The client retains ownership of
535 TOKEN, but it must be preserved across subsequent calls to scanner_push()
538 A scanner only produces a single token. To obtain the next token,
539 re-initialize it by calling this function again.
541 A scanner does not contain any external references, so nothing needs to be
542 done to destroy one. For the same reason, scanners may be copied with plain
543 struct assignment (or memcpy). */
545 scanner_init (struct scanner *scanner, struct token *token)
/* scanner_push() dispatches on this state. */
547 scanner->state = S_START;
551 /* Adds the segment with type TYPE and UTF-8 text S to SCANNER. TOKEN must be
552 the same token passed to scanner_init() for SCANNER, or a copy of it.
553 scanner_push() may modify TOKEN. The client retains ownership of TOKEN,
555 The possible return values are:
557 - SCAN_DONE: All of the segments that have been passed to scanner_push()
558 form the token now stored in TOKEN. SCANNER is now "used up" and must
559 be reinitialized with scanner_init() if it is to be used again.
561 Most tokens only consist of a single segment, so this is the most common
564 - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
565 token. The caller should call scanner_push() again with the next segment.
566 (This won't happen if TYPE is SEG_END indicating the end of input.)
568 - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
569 needs to "save its place" in the stream of segments for a possible
570 future SCAN_BACK return. This value can be returned more than once in a
571 sequence of scanner_push() calls for SCANNER, but the caller only needs
572 to keep track of the most recent position.
574 - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
575 the segments up to and including the segment for which SCAN_SAVE was
576 most recently returned. Segments following that one should be passed to
577 the next scanner to be initialized.
580 scanner_push (struct scanner *scanner, enum segment_type type,
581 struct substring s, struct token *token)
/* Hand the segment to the handler for the scanner's current state. */
583 switch (scanner->state)
586 return scan_start__ (scanner, type, s, token);
589 return scan_dash__ (type, s, token);
592 return scan_string__ (scanner, type, s, token);