X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Flexer%2Fscan.c;h=f2dcebca15ea562543d5fde64b7cac95ec96ab4b;hb=0fde6afee3c995bf264c24c438f43eeb58b859b5;hp=de75eeef3bd1f519a5508c55924be76cc6a2bd66;hpb=6f3865480503c571963d8a2d1af858a4d72d4e88;p=pspp diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c index de75eeef3b..f2dcebca15 100644 --- a/src/language/lexer/scan.c +++ b/src/language/lexer/scan.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,21 +25,14 @@ #include "language/lexer/token.h" #include "libpspp/assertion.h" #include "libpspp/cast.h" +#include "libpspp/i18n.h" #include "gl/c-ctype.h" #include "gl/c-strtod.h" #include "gl/xmemdup0.h" -enum - { - S_START, - S_DASH, - S_STRING - }; - -#define SS_NL_BEFORE_PLUS (1u << 0) -#define SS_PLUS (1u << 1) -#define SS_NL_AFTER_PLUS (1u << 2) +#include "gettext.h" +#define _(msgid) gettext (msgid) /* Returns the integer value of (hex) digit C. */ static int @@ -67,205 +60,84 @@ digit_value (int c) } } -static bool -scan_quoted_string__ (struct substring s, struct token *token) +static void +scan_quoted_string (struct substring in, struct token *token) { - int quote; - /* Trim ' or " from front and back. */ - quote = s.string[s.length - 1]; - s.string++; - s.length -= 2; + int quote = in.string[0]; + in.string++; + in.length -= 2; - ss_realloc (&token->string, token->string.length + s.length + 1); + struct substring out = { .string = xmalloc (in.length + 1) }; for (;;) { - size_t pos = ss_find_byte (s, quote); + size_t pos = ss_find_byte (in, quote); if (pos == SIZE_MAX) break; - memcpy (ss_end (token->string), s.string, pos + 1); - token->string.length += pos + 1; - ss_advance (&s, pos + 2); + memcpy (ss_end (out), in.string, pos + 1); + out.length += pos + 1; + ss_advance (&in, pos + 2); } - memcpy (ss_end (token->string), s.string, ss_length (s)); - token->string.length += ss_length (s); + memcpy (ss_end (out), in.string, in.length); + out.length += in.length; + out.string[out.length] = '\0'; - return true; + *token = (struct token) { .type = T_STRING, .string = out }; } -static bool -scan_hex_string__ (struct substring s, struct token *token) +static char * +scan_hex_string__ (struct substring in, struct substring *out) { - uint8_t *dst; - size_t i; - - /* Trim X' from front and ' from back. */ - s.string += 2; - s.length -= 3; - - if (s.length % 2 != 0) - { - token->type = SCAN_BAD_HEX_LENGTH; - token->number = s.length; - return false; - } - - ss_realloc (&token->string, token->string.length + s.length / 2 + 1); - dst = CHAR_CAST (uint8_t *, ss_end (token->string)); - token->string.length += s.length / 2; - for (i = 0; i < s.length; i += 2) + if (in.length % 2 != 0) + return xasprintf (_("String of hex digits has %zu characters, which " + "is not a multiple of 2."), in.length); + + ss_realloc (out, in.length / 2 + 1); + uint8_t *dst = CHAR_CAST (uint8_t *, out->string); + out->length = in.length / 2; + for (size_t i = 0; i < in.length; i += 2) { - int hi = digit_value (s.string[i]); - int lo = digit_value (s.string[i + 1]); + int hi = digit_value (in.string[i]); + int lo = digit_value (in.string[i + 1]); if (hi >= 16 || lo >= 16) - { - token->type = SCAN_BAD_HEX_DIGIT; - token->number = s.string[hi >= 16 ? i : i + 1]; - return false; - } + return xasprintf (_("`%c' is not a valid hex digit."), + in.string[hi >= 16 ? i : i + 1]); *dst++ = hi * 16 + lo; } - return true; + return NULL; } -static bool -scan_unicode_string__ (struct substring s, struct token *token) +static char * +scan_unicode_string__ (struct substring in, struct substring *out) { - uint8_t *dst; - ucs4_t uc; - size_t i; - - /* Trim U' from front and ' from back. */ - s.string += 2; - s.length -= 3; - - if (s.length < 1 || s.length > 8) - { - token->type = SCAN_BAD_UNICODE_LENGTH; - token->number = s.length; - return 0; - } - - ss_realloc (&token->string, token->string.length + 4 + 1); + if (in.length < 1 || in.length > 8) + return xasprintf (_("Unicode string contains %zu bytes, which is " + "not in the valid range of 1 to 8 bytes."), + in.length); - uc = 0; - for (i = 0; i < s.length; i++) + ucs4_t uc = 0; + for (size_t i = 0; i < in.length; i++) { - int digit = digit_value (s.string[i]); + int digit = digit_value (in.string[i]); if (digit >= 16) - { - token->type = SCAN_BAD_UNICODE_DIGIT; - token->number = s.string[i]; - return 0; - } + return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]); uc = uc * 16 + digit; } if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff) - { - token->type = SCAN_BAD_UNICODE_CODE_POINT; - token->number = uc; - return 0; - } - - dst = CHAR_CAST (uint8_t *, ss_end (token->string)); - token->string.length += u8_uctomb (dst, uc, 4); - - return true; -} - -static enum scan_result -scan_string_segment__ (struct scanner *scanner, enum segment_type type, - struct substring s, struct token *token) -{ - bool ok; - - switch (type) - { - case SEG_QUOTED_STRING: - ok = scan_quoted_string__ (s, token); - break; - - case SEG_HEX_STRING: - ok = scan_hex_string__ (s, token); - break; - - case SEG_UNICODE_STRING: - ok = scan_unicode_string__ (s, token); - break; - - default: - NOT_REACHED (); - } - - if (ok) - { - token->type = T_STRING; - token->string.string[token->string.length] = '\0'; - scanner->state = S_STRING; - scanner->substate = 0; - return SCAN_SAVE; - } - else - { - /* The function we called above should have filled in token->type and - token->number properly to describe the error. */ - ss_dealloc (&token->string); - token->string = ss_empty (); - return SCAN_DONE; - } - -} - -static enum scan_result -add_bit (struct scanner *scanner, unsigned int bit) -{ - if (!(scanner->substate & bit)) - { - scanner->substate |= bit; - return SCAN_MORE; - } - else - return SCAN_BACK; -} - -static enum scan_result -scan_string__ (struct scanner *scanner, enum segment_type type, - struct substring s, struct token *token) -{ - switch (type) - { - case SEG_SPACES: - case SEG_COMMENT: - return SCAN_MORE; - - case SEG_NEWLINE: - if (scanner->substate & SS_PLUS) - return add_bit (scanner, SS_NL_AFTER_PLUS); - else - return add_bit (scanner, SS_NL_BEFORE_PLUS); + return xasprintf (_("U+%04llX is not a valid Unicode code point."), + (long long) uc); - case SEG_PUNCT: - return (s.length == 1 && s.string[0] == '+' - ? add_bit (scanner, SS_PLUS) - : SCAN_BACK); + ss_realloc (out, 4 + 1); + out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4); - case SEG_QUOTED_STRING: - case SEG_HEX_STRING: - case SEG_UNICODE_STRING: - return (scanner->substate & SS_PLUS - ? scan_string_segment__ (scanner, type, s, token) - : SCAN_BACK); - - default: - return SCAN_BACK; - } + return NULL; } static enum token_type @@ -316,6 +188,8 @@ scan_punct1__ (char c0) case '-': return T_DASH; case '[': return T_LBRACK; case ']': return T_RBRACK; + case '{': return T_LCURLY; + case '}': return T_RCURLY; case '&': return T_AND; case '|': return T_OR; case '+': return T_PLUS; @@ -324,6 +198,9 @@ scan_punct1__ (char c0) case '<': return T_LT; case '>': return T_GT; case '~': return T_NOT; + case ';': return T_SEMICOLON; + case ':': return T_COLON; + default: return T_MACRO_PUNCT; } NOT_REACHED (); @@ -364,11 +241,10 @@ scan_punct__ (struct substring s) : scan_punct2__ (s.string[0], s.string[1])); } -static double -scan_number__ (struct substring s) +static void +scan_number__ (struct substring s, struct token *token) { char buf[128]; - double number; char *p; if (s.length < sizeof buf) @@ -380,218 +256,295 @@ scan_number__ (struct substring s) else p = xmemdup0 (s.string, s.length); - number = c_strtod (p, NULL); + bool negative = *p == '-'; + double x = c_strtod (p + negative, NULL); + *token = (struct token) { + .type = negative ? T_NEG_NUM : T_POS_NUM, + .number = negative ? -x : x, + }; if (p != buf) free (p); - - return number; } - -static enum scan_result -scan_unexpected_char (const struct substring *s, struct token *token) + +static void +tokenize_error__ (struct token *token, char *error) { - ucs4_t uc; - - token->type = SCAN_UNEXPECTED_CHAR; - u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length); - token->number = uc; - - return SCAN_DONE; + *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) }; } -const char * -scan_type_to_string (enum scan_type type) +static enum tokenize_result +tokenize_string_segment__ (enum segment_type type, + struct substring s, struct token *token) { - switch (type) - { -#define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME; - SCAN_TYPES -#undef SCAN_TYPE + /* Trim X' or U' from front and ' from back. */ + s.string += 2; + s.length -= 3; - default: - return token_type_to_name (type); + struct substring out = SS_EMPTY_INITIALIZER; + char *error = (type == SEG_HEX_STRING + ? scan_hex_string__ (s, &out) + : scan_unicode_string__ (s, &out)); + if (!error) + { + out.string[out.length] = '\0'; + *token = (struct token) { .type = T_STRING, .string = out }; + return TOKENIZE_TOKEN; + } + else + { + tokenize_error__ (token, error); + ss_dealloc (&out); + return TOKENIZE_ERROR; } } -bool -is_scan_type (enum scan_type type) +static void +tokenize_unexpected_char (const struct substring *s, struct token *token) { - return type > SCAN_FIRST && type < SCAN_LAST; + ucs4_t uc; + u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length); + + char c_name[16]; + tokenize_error__ (token, xasprintf (_("Bad character %s in input."), + uc_name (uc, c_name))); } -static enum scan_result -scan_start__ (struct scanner *scanner, enum segment_type type, - struct substring s, struct token *token) +enum tokenize_result +token_from_segment (enum segment_type type, struct substring s, + struct token *token) { switch (type) { case SEG_NUMBER: - token->type = T_POS_NUM; - token->number = scan_number__ (s); - return SCAN_DONE; + scan_number__ (s, token); + return TOKENIZE_TOKEN; case SEG_QUOTED_STRING: + scan_quoted_string (s, token); + return TOKENIZE_TOKEN; + case SEG_HEX_STRING: case SEG_UNICODE_STRING: - return scan_string_segment__ (scanner, type, s, token); + return tokenize_string_segment__ (type, s, token); case SEG_UNQUOTED_STRING: case SEG_DO_REPEAT_COMMAND: case SEG_INLINE_DATA: case SEG_DOCUMENT: - token->type = T_STRING; + case SEG_MACRO_BODY: + case SEG_MACRO_NAME: + *token = (struct token) { .type = T_STRING }; ss_alloc_substring (&token->string, s); - return SCAN_DONE; + return TOKENIZE_TOKEN; case SEG_RESERVED_WORD: - token->type = scan_reserved_word__ (s); - return SCAN_DONE; + *token = (struct token) { .type = scan_reserved_word__ (s) }; + return TOKENIZE_TOKEN; case SEG_IDENTIFIER: - token->type = T_ID; + *token = (struct token) { .type = T_ID }; + ss_alloc_substring (&token->string, s); + return TOKENIZE_TOKEN; + + case SEG_MACRO_ID: + *token = (struct token) { .type = T_MACRO_ID }; ss_alloc_substring (&token->string, s); - return SCAN_DONE; + return TOKENIZE_TOKEN; case SEG_PUNCT: - if (s.length == 1 && s.string[0] == '-') - { - scanner->state = S_DASH; - return SCAN_SAVE; - } - else - { - token->type = scan_punct__ (s); - return SCAN_DONE; - } + *token = (struct token) { .type = scan_punct__ (s) }; + if (token->type == T_MACRO_PUNCT) + ss_alloc_substring (&token->string, s); + return TOKENIZE_TOKEN; case SEG_SHBANG: case SEG_SPACES: case SEG_COMMENT: case SEG_NEWLINE: case SEG_COMMENT_COMMAND: - token->type = SCAN_SKIP; - return SCAN_DONE; + return TOKENIZE_EMPTY; case SEG_START_DOCUMENT: - token->type = T_ID; + *token = (struct token) { .type = T_ID }; ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT")); - return SCAN_DONE; + return TOKENIZE_TOKEN; case SEG_START_COMMAND: case SEG_SEPARATE_COMMANDS: case SEG_END_COMMAND: - token->type = T_ENDCMD; - return SCAN_DONE; + *token = (struct token) { .type = T_ENDCMD }; + return TOKENIZE_TOKEN; case SEG_END: - token->type = T_STOP; - return SCAN_DONE; + *token = (struct token) { .type = T_STOP }; + return TOKENIZE_TOKEN; case SEG_EXPECTED_QUOTE: - token->type = SCAN_EXPECTED_QUOTE; - return SCAN_DONE; + tokenize_error__ (token, xasprintf (_("Unterminated string constant."))); + return TOKENIZE_ERROR; case SEG_EXPECTED_EXPONENT: - token->type = SCAN_EXPECTED_EXPONENT; - ss_alloc_substring (&token->string, s); - return SCAN_DONE; - - case SEG_UNEXPECTED_DOT: - token->type = SCAN_UNEXPECTED_DOT; - return SCAN_DONE; + tokenize_error__ (token, + xasprintf (_("Missing exponent following `%.*s'."), + (int) s.length, s.string)); + return TOKENIZE_ERROR; case SEG_UNEXPECTED_CHAR: - return scan_unexpected_char (&s, token); - - case SEG_N_TYPES: - NOT_REACHED (); + tokenize_unexpected_char (&s, token); + return TOKENIZE_ERROR; } NOT_REACHED (); } -static enum scan_result -scan_dash__ (enum segment_type type, struct substring s, struct token *token) + +/* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the + specified MODE. + + SLEX has no internal state to free, but it retains a reference to INPUT, so + INPUT must not be modified or freed while SLEX is still in use. */ +void +string_lexer_init (struct string_lexer *slex, const char *input, size_t length, + enum segmenter_mode mode, bool is_snippet) { - switch (type) + *slex = (struct string_lexer) { + .input = input, + .length = length, + .offset = 0, + .segmenter = segmenter_init (mode, is_snippet), + }; +} + +/* */ +enum string_lexer_result +string_lexer_next (struct string_lexer *slex, struct token *token) +{ + for (;;) { - case SEG_SPACES: - case SEG_COMMENT: - return SCAN_MORE; + const char *s = slex->input + slex->offset; + size_t left = slex->length - slex->offset; + enum segment_type type; + int n; - case SEG_NUMBER: - token->type = T_NEG_NUM; - token->number = -scan_number__ (s); - return SCAN_DONE; + n = segmenter_push (&slex->segmenter, s, left, true, &type); + assert (n >= 0); - default: - token->type = T_DASH; - return SCAN_BACK; - } -} + slex->offset += n; + switch (token_from_segment (type, ss_buffer (s, n), token)) + { + case TOKENIZE_TOKEN: + return token->type == T_STOP ? SLR_END : SLR_TOKEN; -/* Initializes SCANNER for scanning a token from a sequence of segments. - Initializes TOKEN as the output token. (The client retains ownership of - TOKEN, but it must be preserved across subsequent calls to scanner_push() - for SCANNER.) + case TOKENIZE_ERROR: + return SLR_ERROR; - A scanner only produces a single token. To obtain the next token, - re-initialize it by calling this function again. + case TOKENIZE_EMPTY: + break; + } + } +} - A scanner does not contain any external references, so nothing needs to be - done to destroy one. For the same reason, scanners may be copied with plain - struct assignment (or memcpy). */ -void -scanner_init (struct scanner *scanner, struct token *token) +static struct substring +concat (struct substring a, struct substring b) { - scanner->state = S_START; - token_init (token); + size_t length = a.length + b.length; + struct substring out = { .string = xmalloc (length + 1), .length = length }; + memcpy (out.string, a.string, a.length); + memcpy (out.string + a.length, b.string, b.length); + out.string[length] = '\0'; + return out; } -/* Adds the segment with type TYPE and UTF-8 text S to SCANNER. TOKEN must be - the same token passed to scanner_init() for SCANNER, or a copy of it. - scanner_push() may modify TOKEN. The client retains ownership of TOKEN, - - The possible return values are: - - - SCAN_DONE: All of the segments that have been passed to scanner_push() - form the token now stored in TOKEN. SCANNER is now "used up" and must - be reinitialized with scanner_init() if it is to be used again. - - Most tokens only consist of a single segment, so this is the most common - return value. - - - SCAN_MORE: The segments passed to scanner_push() don't yet determine a - token. The caller should call scanner_push() again with the next token. - (This won't happen if TYPE is SEG_END indicating the end of input.) - - - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller - needs to "save its place" in the stream of segments for a possible - future SCAN_BACK return. This value can be returned more than once in a - sequence of scanner_push() calls for SCANNER, but the caller only needs - to keep track of the most recent position. - - - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only - the segments up to and including the segment for which SCAN_SAVE was - most recently returned. Segments following that one should be passed to - the next scanner to be initialized. -*/ -enum scan_result -scanner_push (struct scanner *scanner, enum segment_type type, - struct substring s, struct token *token) +/* Attempts to merge a sequence of tokens together into a single token. The + caller feeds tokens in one by one and the merger FSM reports progress. The + caller must supply a merger structure M that is set to MERGER_INIT before + the first call. The caller must also supply a token OUT for storage, which + need not be initialized. + + Returns: + + * -1 if more tokens are needed. Token OUT might be in use for temporary + storage; to ensure that it is freed, continue calling merger_add() until + it returns something other than -1. (T_STOP or T_ENDCMD will make it do + that.) + + * 0 if the first token submitted to the merger is the output. This is the + common case for the first call, and it can be returned for subsequent + calls as well. + + * A positive number if OUT is initialized to the output token. The return + value is the number of tokens being merged to produce this one. */ +int +merger_add (struct merger *m, const struct token *in, struct token *out) { - switch (scanner->state) + /* We perform two different kinds of token merging: + + - String concatenation, where syntax like "a" + "b" is converted into a + single string token. This is definitely needed because the parser + relies on it. + + - Negative number merging, where syntax like -5 is converted from a pair + of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM). This + might not be needed anymore because the segmenter directly treats a dash + followed by a number, with optional intervening white space, as a + negative number. It's only needed if we want intervening comments to be + allowed or for part of the negative number token to be produced by macro + expansion. */ + switch (++m->state) { - case S_START: - return scan_start__ (scanner, type, s, token); + case 1: + if (in->type == T_DASH || in->type == T_STRING) + { + *out = *in; + return -1; + } + else + return 0; - case S_DASH: - return scan_dash__ (type, s, token); + case 2: + if (out->type == T_DASH) + { + if (in->type == T_POS_NUM) + { + *out = (struct token) { + .type = T_NEG_NUM, + .number = -in->number + }; + return 2; + } + else + return 0; + } + else + return in->type == T_PLUS ? -1 : 0; + NOT_REACHED (); - case S_STRING: - return scan_string__ (scanner, type, s, token); - } + case 3: + if (in->type == T_STRING) + { + out->string = concat (out->string, in->string); + return -1; + } + else + return 0; + NOT_REACHED (); - NOT_REACHED (); + default: + if (!(m->state % 2)) + return in->type == T_PLUS ? -1 : m->state - 1; + else + { + if (in->type == T_STRING) + { + struct substring s = concat (out->string, in->string); + ss_swap (&s, &out->string); + ss_dealloc (&s); + return -1; + } + else + return m->state - 2; + } + NOT_REACHED (); + } }