#include "language/lexer/token.h"
#include "libpspp/assertion.h"
#include "libpspp/cast.h"
+#include "libpspp/i18n.h"
#include "gl/c-ctype.h"
#include "gl/c-strtod.h"
#include "gl/xmemdup0.h"
-enum
- {
- S_START,
- S_DASH,
- S_STRING
- };
-
-#define SS_NL_BEFORE_PLUS (1u << 0)
-#define SS_PLUS (1u << 1)
-#define SS_NL_AFTER_PLUS (1u << 2)
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
/* Returns the integer value of (hex) digit C. */
static int
}
}
-static bool
-scan_quoted_string__ (struct substring s, struct token *token)
+static void /* Converts quoted-string segment IN into a T_STRING token. */
+scan_quoted_string (struct substring in, struct token *token)
{
- int quote;
-
/* Trim ' or " from front and back. */
- quote = s.string[s.length - 1];
- s.string++;
- s.length -= 2;
+ int quote = in.string[0]; /* First byte of the segment: the opening ' or ". */
+ in.string++;
+ in.length -= 2;
- ss_realloc (&token->string, token->string.length + s.length + 1);
+ struct substring out = { .string = xmalloc (in.length + 1) }; /* +1 for the null terminator. */
for (;;)
{
- size_t pos = ss_find_byte (s, quote);
+ size_t pos = ss_find_byte (in, quote);
if (pos == SIZE_MAX)
break;
- memcpy (ss_end (token->string), s.string, pos + 1);
- token->string.length += pos + 1;
- ss_advance (&s, pos + 2);
+ memcpy (ss_end (out), in.string, pos + 1); /* Copy everything through the quote at POS... */
+ out.length += pos + 1;
+ ss_advance (&in, pos + 2); /* ...then skip it and the byte after it (presumably its doubled twin). */
}
- memcpy (ss_end (token->string), s.string, ss_length (s));
- token->string.length += ss_length (s);
+ memcpy (ss_end (out), in.string, in.length); /* Copy whatever follows the last quote found. */
+ out.length += in.length;
+ out.string[out.length] = '\0';
- return true;
+ *token = (struct token) { .type = T_STRING, .string = out };
}
-static bool
-scan_hex_string__ (struct substring s, struct token *token)
+static char * /* NULL on success, else an error message built by xasprintf(). */
+scan_hex_string__ (struct substring in, struct substring *out)
{
- uint8_t *dst;
- size_t i;
-
- /* Trim X' from front and ' from back. */
- s.string += 2;
- s.length -= 3;
-
- if (s.length % 2 != 0)
+ if (in.length % 2 != 0)
+ return xasprintf (_("String of hex digits has %zu characters, which "
+ "is not a multiple of 2."), in.length);
+ /* Each pair of hex digits in IN becomes one byte of OUT. */
+ ss_realloc (out, in.length / 2 + 1); /* One spare byte beyond the decoded data. */
+ uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
+ out->length = in.length / 2;
+ for (size_t i = 0; i < in.length; i += 2)
{
- token->type = SCAN_BAD_HEX_LENGTH;
- token->number = s.length;
- return false;
- }
-
- ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
- dst = CHAR_CAST (uint8_t *, ss_end (token->string));
- token->string.length += s.length / 2;
- for (i = 0; i < s.length; i += 2)
- {
- int hi = digit_value (s.string[i]);
- int lo = digit_value (s.string[i + 1]);
+ int hi = digit_value (in.string[i]);
+ int lo = digit_value (in.string[i + 1]);
if (hi >= 16 || lo >= 16)
- {
- token->type = SCAN_BAD_HEX_DIGIT;
- token->number = s.string[hi >= 16 ? i : i + 1];
- return false;
- }
+ return xasprintf (_("`%c' is not a valid hex digit."),
+ in.string[hi >= 16 ? i : i + 1]); /* Report the first bad digit of the pair. */
*dst++ = hi * 16 + lo;
}
- return true;
+ return NULL;
}
-static bool
-scan_unicode_string__ (struct substring s, struct token *token)
+static char * /* NULL on success, else an error message built by xasprintf(). */
+scan_unicode_string__ (struct substring in, struct substring *out)
{
- uint8_t *dst;
- ucs4_t uc;
- size_t i;
-
- /* Trim U' from front and ' from back. */
- s.string += 2;
- s.length -= 3;
+ if (in.length < 1 || in.length > 8)
+ return xasprintf (_("Unicode string contains %zu bytes, which is "
+ "not in the valid range of 1 to 8 bytes."),
+ in.length);
- if (s.length < 1 || s.length > 8)
+ ucs4_t uc = 0; /* Code point accumulated from IN's hex digits. */
+ for (size_t i = 0; i < in.length; i++)
{
- token->type = SCAN_BAD_UNICODE_LENGTH;
- token->number = s.length;
- return 0;
- }
-
- ss_realloc (&token->string, token->string.length + 4 + 1);
-
- uc = 0;
- for (i = 0; i < s.length; i++)
- {
- int digit = digit_value (s.string[i]);
+ int digit = digit_value (in.string[i]);
if (digit >= 16)
- {
- token->type = SCAN_BAD_UNICODE_DIGIT;
- token->number = s.string[i];
- return 0;
- }
+ return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
uc = uc * 16 + digit;
}
if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
- {
- token->type = SCAN_BAD_UNICODE_CODE_POINT;
- token->number = uc;
- return 0;
- }
+ return xasprintf (_("U+%04llX is not a valid Unicode code point."),
+ (long long) uc);
- dst = CHAR_CAST (uint8_t *, ss_end (token->string));
- token->string.length += u8_uctomb (dst, uc, 4);
+ ss_realloc (out, 4 + 1); /* Room for the 4 bytes offered to u8_uctomb() plus one spare. */
+ out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
- return true;
-}
-
-static enum scan_result
-scan_string_segment__ (struct scanner *scanner, enum segment_type type,
- struct substring s, struct token *token)
-{
- bool ok;
-
- switch (type)
- {
- case SEG_QUOTED_STRING:
- ok = scan_quoted_string__ (s, token);
- break;
-
- case SEG_HEX_STRING:
- ok = scan_hex_string__ (s, token);
- break;
-
- case SEG_UNICODE_STRING:
- ok = scan_unicode_string__ (s, token);
- break;
-
- default:
- NOT_REACHED ();
- }
-
- if (ok)
- {
- token->type = T_STRING;
- token->string.string[token->string.length] = '\0';
- scanner->state = S_STRING;
- scanner->substate = 0;
- return SCAN_SAVE;
- }
- else
- {
- /* The function we called above should have filled in token->type and
- token->number properly to describe the error. */
- ss_dealloc (&token->string);
- token->string = ss_empty ();
- return SCAN_DONE;
- }
-
-}
-
-static enum scan_result
-add_bit (struct scanner *scanner, unsigned int bit)
-{
- if (!(scanner->substate & bit))
- {
- scanner->substate |= bit;
- return SCAN_MORE;
- }
- else
- return SCAN_BACK;
-}
-
-static enum scan_result
-scan_string__ (struct scanner *scanner, enum segment_type type,
- struct substring s, struct token *token)
-{
- switch (type)
- {
- case SEG_SPACES:
- case SEG_COMMENT:
- return SCAN_MORE;
-
- case SEG_NEWLINE:
- if (scanner->substate & SS_PLUS)
- return add_bit (scanner, SS_NL_AFTER_PLUS);
- else
- return add_bit (scanner, SS_NL_BEFORE_PLUS);
-
- case SEG_PUNCT:
- return (s.length == 1 && s.string[0] == '+'
- ? add_bit (scanner, SS_PLUS)
- : SCAN_BACK);
-
- case SEG_QUOTED_STRING:
- case SEG_HEX_STRING:
- case SEG_UNICODE_STRING:
- return (scanner->substate & SS_PLUS
- ? scan_string_segment__ (scanner, type, s, token)
- : SCAN_BACK);
-
- default:
- return SCAN_BACK;
- }
+ return NULL;
}
static enum token_type
case '-': return T_DASH;
case '[': return T_LBRACK;
case ']': return T_RBRACK;
+ case '{': return T_LCURLY;
+ case '}': return T_RCURLY;
case '&': return T_AND;
case '|': return T_OR;
case '+': return T_PLUS;
case '<': return T_LT;
case '>': return T_GT;
case '~': return T_NOT;
+ case ';': return T_SEMICOLON;
+ case ':': return T_COLON;
+ default: return T_MACRO_PUNCT;
}
NOT_REACHED ();
: scan_punct2__ (s.string[0], s.string[1]));
}
-static double
-scan_number__ (struct substring s)
+static void
+scan_number__ (struct substring s, struct token *token)
{
char buf[128];
- double number;
char *p;
if (s.length < sizeof buf)
else
p = xmemdup0 (s.string, s.length);
- number = c_strtod (p, NULL);
+ bool negative = *p == '-';
+ double x = c_strtod (p + negative, NULL);
+ *token = (struct token) {
+ .type = negative ? T_NEG_NUM : T_POS_NUM,
+ .number = negative ? -x : x,
+ };
if (p != buf)
free (p);
-
- return number;
}
-
-static enum scan_result
-scan_unexpected_char (const struct substring *s, struct token *token)
+\f
+static void /* Stores the error message ERROR in TOKEN as the text of a T_STRING token. */
+tokenize_error__ (struct token *token, char *error)
{
- ucs4_t uc;
-
- token->type = SCAN_UNEXPECTED_CHAR;
- u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
- token->number = uc;
-
- return SCAN_DONE;
+ *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) }; /* NOTE(review):
+ presumably the token now refers to ERROR's own storage rather than a
+ copy -- confirm against ss_cstr(). */
}
-const char *
-scan_type_to_string (enum scan_type type)
+static enum tokenize_result
+tokenize_string_segment__ (enum segment_type type,
+ struct substring s, struct token *token)
{
- switch (type)
- {
-#define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
- SCAN_TYPES
-#undef SCAN_TYPE
+ /* Turns hex or Unicode string segment S into TOKEN. On success, TOKEN is a
+ T_STRING holding the decoded bytes and the result is TOKENIZE_TOKEN; on
+ failure, TOKEN is a T_STRING holding the error message and the result is
+ TOKENIZE_ERROR.
+
+ First, trim X' or U' from front and ' from back. */
+ s.string += 2;
+ s.length -= 3;
- default:
- return token_type_to_name ((enum token_type) type);
+ struct substring out = SS_EMPTY_INITIALIZER;
+ char *error = (type == SEG_HEX_STRING
+ ? scan_hex_string__ (s, &out)
+ : scan_unicode_string__ (s, &out)); /* Any type but SEG_HEX_STRING is decoded as Unicode. */
+ if (!error)
+ {
+ out.string[out.length] = '\0'; /* Both decoders reserve one byte past the data for this. */
+ *token = (struct token) { .type = T_STRING, .string = out };
+ return TOKENIZE_TOKEN;
+ }
+ else
+ {
+ tokenize_error__ (token, error);
+ ss_dealloc (&out); /* Discard any partially decoded output. */
+ return TOKENIZE_ERROR;
}
}
-bool
-is_scan_type (enum scan_type type)
+static void /* Stores in TOKEN an error message about the character at the start of S. */
+tokenize_unexpected_char (const struct substring *s, struct token *token)
{
- return type > SCAN_FIRST && type < SCAN_LAST;
+ ucs4_t uc;
+ u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length); /* Decode the first character of S into UC. */
+ /* NOTE(review): assumes 16 bytes is enough for whatever uc_name() writes -- confirm. */
+ char c_name[16];
+ tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
+ uc_name (uc, c_name)));
}
-static enum scan_result
-scan_start__ (struct scanner *scanner, enum segment_type type,
- struct substring s, struct token *token)
+enum tokenize_result /* Converts the single segment S of type TYPE into TOKEN.
+ Returns TOKENIZE_TOKEN if TOKEN was filled in, TOKENIZE_EMPTY if the segment
+ yields no token (TOKEN is then left unwritten), or TOKENIZE_ERROR if TOKEN
+ was set to a T_STRING whose text is an error message. */
+token_from_segment (enum segment_type type, struct substring s,
+ struct token *token)
{
switch (type)
{
case SEG_NUMBER:
- token->type = T_POS_NUM;
- token->number = scan_number__ (s);
- return SCAN_DONE;
+ scan_number__ (s, token);
+ return TOKENIZE_TOKEN;
case SEG_QUOTED_STRING:
+ scan_quoted_string (s, token);
+ return TOKENIZE_TOKEN;
+
case SEG_HEX_STRING:
case SEG_UNICODE_STRING:
- return scan_string_segment__ (scanner, type, s, token);
+ return tokenize_string_segment__ (type, s, token); /* May yield TOKENIZE_ERROR for malformed contents. */
case SEG_UNQUOTED_STRING:
case SEG_DO_REPEAT_COMMAND:
case SEG_INLINE_DATA:
case SEG_DOCUMENT:
- token->type = T_STRING;
+ case SEG_MACRO_BODY:
+ case SEG_MACRO_NAME:
+ *token = (struct token) { .type = T_STRING };
ss_alloc_substring (&token->string, s);
- return SCAN_DONE;
+ return TOKENIZE_TOKEN;
case SEG_RESERVED_WORD:
- token->type = scan_reserved_word__ (s);
- return SCAN_DONE;
+ *token = (struct token) { .type = scan_reserved_word__ (s) };
+ return TOKENIZE_TOKEN;
case SEG_IDENTIFIER:
- token->type = T_ID;
+ *token = (struct token) { .type = T_ID };
ss_alloc_substring (&token->string, s);
- return SCAN_DONE;
+ return TOKENIZE_TOKEN;
case SEG_MACRO_ID:
- token->type = T_MACRO_ID;
+ *token = (struct token) { .type = T_MACRO_ID };
ss_alloc_substring (&token->string, s);
- return SCAN_DONE;
+ return TOKENIZE_TOKEN;
case SEG_PUNCT:
- if (s.length == 1 && s.string[0] == '-')
- {
- scanner->state = S_DASH;
- return SCAN_SAVE;
- }
- else
- {
- token->type = scan_punct__ (s);
- return SCAN_DONE;
- }
+ *token = (struct token) { .type = scan_punct__ (s) };
+ if (token->type == T_MACRO_PUNCT) /* Only T_MACRO_PUNCT tokens are given the punctuation's text. */
+ ss_alloc_substring (&token->string, s);
+ return TOKENIZE_TOKEN;
case SEG_SHBANG:
case SEG_SPACES:
case SEG_COMMENT:
case SEG_NEWLINE:
case SEG_COMMENT_COMMAND:
- token->type = SCAN_SKIP;
- return SCAN_DONE;
+ return TOKENIZE_EMPTY; /* These segments produce no token; TOKEN is not written. */
case SEG_START_DOCUMENT:
- token->type = T_ID;
+ *token = (struct token) { .type = T_ID };
ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
- return SCAN_DONE;
+ return TOKENIZE_TOKEN;
case SEG_START_COMMAND:
case SEG_SEPARATE_COMMANDS:
case SEG_END_COMMAND:
- token->type = T_ENDCMD;
- return SCAN_DONE;
+ *token = (struct token) { .type = T_ENDCMD };
+ return TOKENIZE_TOKEN;
case SEG_END:
- token->type = T_STOP;
- return SCAN_DONE;
+ *token = (struct token) { .type = T_STOP };
+ return TOKENIZE_TOKEN;
case SEG_EXPECTED_QUOTE:
- token->type = SCAN_EXPECTED_QUOTE;
- return SCAN_DONE;
+ tokenize_error__ (token, xasprintf (_("Unterminated string constant."))); /* The message becomes the text of a T_STRING token. */
+ return TOKENIZE_ERROR;
case SEG_EXPECTED_EXPONENT:
- token->type = SCAN_EXPECTED_EXPONENT;
- ss_alloc_substring (&token->string, s);
- return SCAN_DONE;
-
- case SEG_UNEXPECTED_DOT:
- token->type = SCAN_UNEXPECTED_DOT;
- return SCAN_DONE;
+ tokenize_error__ (token,
+ xasprintf (_("Missing exponent following `%.*s'."),
+ (int) s.length, s.string)); /* %.*s because S is given by length, not by a terminator. */
+ return TOKENIZE_ERROR;
case SEG_UNEXPECTED_CHAR:
- return scan_unexpected_char (&s, token);
+ tokenize_unexpected_char (&s, token);
+ return TOKENIZE_ERROR;
}
NOT_REACHED ();
}
-static enum scan_result
-scan_dash__ (enum segment_type type, struct substring s, struct token *token)
-{
- switch (type)
- {
- case SEG_SPACES:
- case SEG_COMMENT:
- return SCAN_MORE;
-
- case SEG_NUMBER:
- token->type = T_NEG_NUM;
- token->number = -scan_number__ (s);
- return SCAN_DONE;
-
- default:
- token->type = T_DASH;
- return SCAN_BACK;
- }
-}
-
-/* Initializes SCANNER for scanning a token from a sequence of segments.
- Initializes TOKEN as the output token. (The client retains ownership of
- TOKEN, but it must be preserved across subsequent calls to scanner_push()
- for SCANNER.)
-
- A scanner only produces a single token. To obtain the next token,
- re-initialize it by calling this function again.
-
- A scanner does not contain any external references, so nothing needs to be
- done to destroy one. For the same reason, scanners may be copied with plain
- struct assignment (or memcpy). */
-void
-scanner_init (struct scanner *scanner, struct token *token)
-{
- scanner->state = S_START;
- token_init (token);
-}
-
-/* Adds the segment with type TYPE and UTF-8 text S to SCANNER. TOKEN must be
- the same token passed to scanner_init() for SCANNER, or a copy of it.
- scanner_push() may modify TOKEN. The client retains ownership of TOKEN,
-
- The possible return values are:
-
- - SCAN_DONE: All of the segments that have been passed to scanner_push()
- form the token now stored in TOKEN. SCANNER is now "used up" and must
- be reinitialized with scanner_init() if it is to be used again.
-
- Most tokens only consist of a single segment, so this is the most common
- return value.
-
- - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
- token. The caller should call scanner_push() again with the next token.
- (This won't happen if TYPE is SEG_END indicating the end of input.)
-
- - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
- needs to "save its place" in the stream of segments for a possible
- future SCAN_BACK return. This value can be returned more than once in a
- sequence of scanner_push() calls for SCANNER, but the caller only needs
- to keep track of the most recent position.
-
- - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
- the segments up to and including the segment for which SCAN_SAVE was
- most recently returned. Segments following that one should be passed to
- the next scanner to be initialized.
-*/
-enum scan_result
-scanner_push (struct scanner *scanner, enum segment_type type,
- struct substring s, struct token *token)
-{
- switch (scanner->state)
- {
- case S_START:
- return scan_start__ (scanner, type, s, token);
-
- case S_DASH:
- return scan_dash__ (type, s, token);
-
- case S_STRING:
- return scan_string__ (scanner, type, s, token);
- }
-
- NOT_REACHED ();
-}
\f
/* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
specified MODE.
INPUT must not be modified or freed while SLEX is still in use. */
void
string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
- enum segmenter_mode mode)
+ enum segmenter_mode mode, bool is_snippet)
{
- slex->input = input;
- slex->length = length;
- slex->offset = 0;
- segmenter_init (&slex->segmenter, mode);
+ *slex = (struct string_lexer) { /* Any member not named below is zero-initialized. */
+ .input = input,
+ .length = length,
+ .offset = 0, /* Lexing starts at the first byte of INPUT. */
+ .segmenter = segmenter_init (mode, is_snippet), /* MODE and IS_SNIPPET are handed straight to the segmenter. */
+ };
}
/* */
-bool
+enum string_lexer_result
string_lexer_next (struct string_lexer *slex, struct token *token)
{
- struct segmenter saved_segmenter;
- size_t saved_offset = 0;
-
- struct scanner scanner;
-
- scanner_init (&scanner, token);
for (;;)
{
const char *s = slex->input + slex->offset;
assert (n >= 0);
slex->offset += n;
- switch (scanner_push (&scanner, type, ss_buffer (s, n), token))
+ switch (token_from_segment (type, ss_buffer (s, n), token))
{
- case SCAN_BACK:
- slex->segmenter = saved_segmenter;
- slex->offset = saved_offset;
- /* Fall through. */
- case SCAN_DONE:
- return token->type != T_STOP;
-
- case SCAN_MORE:
- break;
+ case TOKENIZE_TOKEN:
+ return token->type == T_STOP ? SLR_END : SLR_TOKEN;
- case SCAN_SAVE:
- saved_segmenter = slex->segmenter;
- saved_offset = slex->offset;
+ case TOKENIZE_ERROR:
+ return SLR_ERROR;
+
+ case TOKENIZE_EMPTY:
break;
}
}
}
+/* Returns a newly allocated substring that holds A's bytes followed by B's,
+ with a null terminator stored just past the end (not counted in the
+ length). A and B themselves are left unchanged. */
+static struct substring
+concat (struct substring a, struct substring b)
+{
+ size_t length = a.length + b.length;
+ struct substring out = { .string = xmalloc (length + 1), .length = length }; /* +1 for the null terminator. */
+ memcpy (out.string, a.string, a.length);
+ memcpy (out.string + a.length, b.string, b.length);
+ out.string[length] = '\0';
+ return out;
+}
+
+/* Attempts to merge a sequence of tokens together into a single token. The
+ caller feeds tokens in one by one and the merger FSM reports progress. The
+ caller must supply a merger structure M that is set to MERGER_INIT before
+ the first call. The caller must also supply a token OUT for storage, which
+ need not be initialized.
+
+ The only state kept in M is a count of the tokens fed in so far (M->state
+ is incremented on every call), so IN on the Nth call is the Nth token of
+ the sequence.
+
+ Returns:
+
+ * -1 if more tokens are needed. Token OUT might be in use for temporary
+ storage; to ensure that it is freed, continue calling merger_add() until
+ it returns something other than -1. (T_STOP or T_ENDCMD will make it do
+ that.)
+
+ * 0 if the first token submitted to the merger is the output. This is the
+ common case for the first call, and it can be returned for subsequent
+ calls as well.
+
+ * A positive number if OUT is initialized to the output token. The return
+ value is the number of tokens being merged to produce this one. */
+int
+merger_add (struct merger *m, const struct token *in, struct token *out)
+{
+ /* We perform two different kinds of token merging:
+
+ - String concatenation, where syntax like "a" + "b" is converted into a
+ single string token. This is definitely needed because the parser
+ relies on it.
+
+ - Negative number merging, where syntax like -5 is converted from a pair
+ of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM). This
+ might not be needed anymore because the segmenter directly treats a dash
+ followed by a number, with optional intervening white space, as a
+ negative number. It's only needed if we want intervening comments to be
+ allowed or for part of the negative number token to be produced by macro
+ expansion. */
+ switch (++m->state)
+ {
+ case 1: /* First token: only a dash or a string can start a merge. */
+ if (in->type == T_DASH || in->type == T_STRING)
+ {
+ *out = *in; /* Plain struct copy, so OUT shares IN's string for now. */
+ return -1;
+ }
+ else
+ return 0;
+
+ case 2: /* Second token: OUT still holds the copy of the first one. */
+ if (out->type == T_DASH)
+ {
+ if (in->type == T_POS_NUM)
+ {
+ *out = (struct token) {
+ .type = T_NEG_NUM,
+ .number = -in->number
+ };
+ return 2;
+ }
+ else
+ return 0;
+ }
+ else
+ return in->type == T_PLUS ? -1 : 0; /* A string followed by "+" may begin a concatenation. */
+ NOT_REACHED ();
+
+ case 3: /* Third token, reached only after a string and a "+". */
+ if (in->type == T_STRING)
+ {
+ out->string = concat (out->string, in->string); /* The old string is the first input token's, so it is not freed here. */
+ return -1;
+ }
+ else
+ return 0;
+ NOT_REACHED ();
+
+ default: /* Fourth token onward: alternately expect "+" and a string. */
+ if (!(m->state % 2))
+ return in->type == T_PLUS ? -1 : m->state - 1; /* Not a "+": every token before this one was merged. */
+ else
+ {
+ if (in->type == T_STRING)
+ {
+ struct substring s = concat (out->string, in->string);
+ ss_swap (&s, &out->string); /* OUT's previous string was made by concat(), so it is released below. */
+ ss_dealloc (&s);
+ return -1;
+ }
+ else
+ return m->state - 2; /* Excludes this token and the "+" before it. */
+ }
+ NOT_REACHED ();
+ }
+}