}
}
-static bool
+static char *
scan_quoted_string__ (struct substring s, struct token *token)
{
int quote;
memcpy (ss_end (token->string), s.string, ss_length (s));
token->string.length += ss_length (s);
- return true;
+ return NULL;
}
-static bool
+static char *
scan_hex_string__ (struct substring s, struct token *token)
{
- uint8_t *dst;
- size_t i;
-
/* Trim X' from front and ' from back. */
s.string += 2;
s.length -= 3;
if (s.length % 2 != 0)
- {
- token->type = SCAN_BAD_HEX_LENGTH;
- token->number = s.length;
- return false;
- }
+ return xasprintf (_("String of hex digits has %zu characters, which "
+ "is not a multiple of 2."), s.length);
ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
- dst = CHAR_CAST (uint8_t *, ss_end (token->string));
+ uint8_t *dst = CHAR_CAST (uint8_t *, ss_end (token->string));
token->string.length += s.length / 2;
- for (i = 0; i < s.length; i += 2)
+ for (size_t i = 0; i < s.length; i += 2)
{
int hi = digit_value (s.string[i]);
int lo = digit_value (s.string[i + 1]);
if (hi >= 16 || lo >= 16)
- {
- token->type = SCAN_BAD_HEX_DIGIT;
- token->number = s.string[hi >= 16 ? i : i + 1];
- return false;
- }
+ return xasprintf (_("`%c' is not a valid hex digit."),
+ s.string[hi >= 16 ? i : i + 1]);
*dst++ = hi * 16 + lo;
}
- return true;
+ return NULL;
}
+/* Decodes S, a Unicode string segment of the form U'hhhh' whose body is 1 to
+   8 hex digits naming a single code point, and appends that code point's
+   UTF-8 encoding to TOKEN->string.
+
+   Returns NULL on success.  On failure, returns an error message built by
+   xasprintf(); the caller takes over that string. */
-static bool
+static char *
scan_unicode_string__ (struct substring s, struct token *token)
{
- uint8_t *dst;
- ucs4_t uc;
- size_t i;
-
/* Trim U' from front and ' from back. */
s.string += 2;
s.length -= 3;
if (s.length < 1 || s.length > 8)
- {
- token->type = SCAN_BAD_UNICODE_LENGTH;
- token->number = s.length;
- return 0;
- }
+ return xasprintf (_("Unicode string contains %zu bytes, which is "
+ "not in the valid range of 1 to 8 bytes."),
+ s.length);
ss_realloc (&token->string, token->string.length + 4 + 1);
- uc = 0;
- for (i = 0; i < s.length; i++)
+ ucs4_t uc = 0;
+ for (size_t i = 0; i < s.length; i++)
{
int digit = digit_value (s.string[i]);
if (digit >= 16)
- {
- token->type = SCAN_BAD_UNICODE_DIGIT;
- token->number = s.string[i];
- return 0;
- }
+ return xasprintf (_("`%c' is not a valid hex digit."),
+ s.string[i]);
uc = uc * 16 + digit;
}
if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
- {
- token->type = SCAN_BAD_UNICODE_CODE_POINT;
- token->number = uc;
- return 0;
- }
+ /* %llX takes an unsigned long long, so cast UC to exactly that type. */
+ return xasprintf (_("U+%04llX is not a valid Unicode code point."),
+ (unsigned long long) uc);
- dst = CHAR_CAST (uint8_t *, ss_end (token->string));
+ uint8_t *dst = CHAR_CAST (uint8_t *, ss_end (token->string));
token->string.length += u8_uctomb (dst, uc, 4);
- return true;
+ return NULL;
+}
+
+static enum scan_result
+scan_error__ (struct token *token, char *error)
+{
+ ss_dealloc (&token->string);
+ token->type = T_STRING;
+ token->string = ss_cstr (error);
+ return SCAN_ERROR;
}
static enum scan_result
scan_string_segment__ (struct scanner *scanner, enum segment_type type,
struct substring s, struct token *token)
{
- bool ok;
-
- switch (type)
- {
- case SEG_QUOTED_STRING:
- ok = scan_quoted_string__ (s, token);
- break;
-
- case SEG_HEX_STRING:
- ok = scan_hex_string__ (s, token);
- break;
-
- case SEG_UNICODE_STRING:
- ok = scan_unicode_string__ (s, token);
- break;
-
- default:
- NOT_REACHED ();
- }
-
- if (ok)
+ char *error = (type == SEG_QUOTED_STRING ? scan_quoted_string__ (s, token)
+ : type == SEG_HEX_STRING ? scan_hex_string__ (s, token)
+ : scan_unicode_string__ (s, token));
+ if (!error)
{
token->type = T_STRING;
token->string.string[token->string.length] = '\0';
return SCAN_SAVE;
}
else
- {
- /* The function we called above should have filled in token->type and
- token->number properly to describe the error. */
- ss_dealloc (&token->string);
- token->string = ss_empty ();
- return SCAN_DONE;
- }
-
+ return scan_error__ (token, error);
}
static enum scan_result
scan_unexpected_char (const struct substring *s, struct token *token)
{
ucs4_t uc;
-
- token->type = SCAN_UNEXPECTED_CHAR;
u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
- token->number = uc;
-
- return SCAN_DONE;
-}
-
-const char *
-scan_type_to_string (enum scan_type type)
-{
- switch (type)
- {
-#define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
- SCAN_TYPES
-#undef SCAN_TYPE
-
- default:
- return token_type_to_name ((enum token_type) type);
- }
-}
-
-bool
-is_scan_type (enum scan_type type)
-{
- return type > SCAN_FIRST && type < SCAN_LAST;
-}
-
-/* If TOKEN has the type of a scan error (a subset of those identified by
- is_scan_type()), returns an appropriate error message. Otherwise, returns
- NULL. */
-char *
-scan_token_to_error (const struct token *token)
-{
- switch (token->type)
- {
- case SCAN_BAD_HEX_LENGTH:
- return xasprintf (_("String of hex digits has %d characters, which "
- "is not a multiple of 2."), (int) token->number);
-
- case SCAN_BAD_HEX_DIGIT:
- case SCAN_BAD_UNICODE_DIGIT:
- return xasprintf (_("`%c' is not a valid hex digit."),
- (int) token->number);
-
- case SCAN_BAD_UNICODE_LENGTH:
- return xasprintf (_("Unicode string contains %d bytes, which is "
- "not in the valid range of 1 to 8 bytes."),
- (int) token->number);
-
- case SCAN_BAD_UNICODE_CODE_POINT:
- return xasprintf (_("U+%04X is not a valid Unicode code point."),
- (int) token->number);
-
- case SCAN_EXPECTED_QUOTE:
- return xasprintf (_("Unterminated string constant."));
-
- case SCAN_EXPECTED_EXPONENT:
- return xasprintf (_("Missing exponent following `%s'."),
- token->string.string);
-
- case SCAN_UNEXPECTED_CHAR:
- {
- char c_name[16];
- return xasprintf (_("Bad character %s in input."),
- uc_name (token->number, c_name));
- }
- }
- return NULL;
+ char c_name[16];
+ return scan_error__ (token, xasprintf (_("Bad character %s in input."),
+ uc_name (uc, c_name)));
}
static enum scan_result
case SEG_COMMENT:
case SEG_NEWLINE:
case SEG_COMMENT_COMMAND:
- token->type = SCAN_SKIP;
- return SCAN_DONE;
+ return SCAN_EMPTY;
case SEG_START_DOCUMENT:
token->type = T_ID;
return SCAN_DONE;
case SEG_EXPECTED_QUOTE:
- token->type = SCAN_EXPECTED_QUOTE;
- return SCAN_DONE;
+ return scan_error__ (token,
+ xasprintf (_("Unterminated string constant.")));
case SEG_EXPECTED_EXPONENT:
- token->type = SCAN_EXPECTED_EXPONENT;
- ss_alloc_substring (&token->string, s);
- return SCAN_DONE;
+ return scan_error__ (token,
+ xasprintf (_("Missing exponent following `%.*s'."),
+ (int) s.length, s.string));
case SEG_UNEXPECTED_CHAR:
return scan_unexpected_char (&s, token);
the segments up to and including the segment for which SCAN_SAVE was
most recently returned. Segments following that one should be passed to
the next scanner to be initialized.
+
+ - SCAN_EMPTY: This is similar to SCAN_DONE, but there's no token because
+ the scanner consumed white space or comments or other syntax that
+ doesn't produce a token.
+
+ - SCAN_ERROR: This is similar to SCAN_DONE, but the token is a T_STRING
+ that describes some lexical error. The caller should report the error
+ and discard the token.
*/
enum scan_result
scanner_push (struct scanner *scanner, enum segment_type type,
}
/* */
-bool
+enum string_lexer_result
string_lexer_next (struct string_lexer *slex, struct token *token)
{
struct segmenter saved_segmenter;
size_t saved_offset = 0;
struct scanner scanner;
-
+next:
scanner_init (&scanner, token);
for (;;)
{
slex->offset = saved_offset;
/* Fall through. */
case SCAN_DONE:
- return token->type != T_STOP;
+ return token->type == T_STOP ? SLR_END : SLR_TOKEN;
case SCAN_MORE:
break;
saved_segmenter = slex->segmenter;
saved_offset = slex->offset;
break;
+
+ case SCAN_ERROR:
+ return SLR_ERROR;
+
+ case SCAN_EMPTY:
+ goto next;
}
}
}