From: Ben Pfaff
Date: Mon, 24 Sep 2018 03:42:07 +0000 (-0700)
Subject: lexer: Add support for embedded \0 bytes and missing trailing new-line.
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e0f9210e814d03bc43b6a9b30a402e403d5666b9;p=pspp

lexer: Add support for embedded \0 bytes and missing trailing new-line.

The lexer, at a low level, has not supported \0 bytes in the input stream because it used such a byte as the end-of-input indicator. This caused some awkwardness for the higher-level lexer, which had to remove and flag \0 bytes as it read them. It also led to a bug in the higher-level lexer, which raised an error for each \0 byte it removed but did so while the lexer was in an intermediate state, in which it could read uninitialized data.

This commit fixes the problem by adding support for \0 bytes to the low-level lexer (segmenter). At the same time, it adds support for input that doesn't end in a new-line character.

Bug #54664.
Thanks to Tianxiao Gu for reporting this bug.
--- diff --git a/src/language/control/repeat.c b/src/language/control/repeat.c index 316ac8fc06..c73d028271 100644 --- a/src/language/control/repeat.c +++ b/src/language/control/repeat.c @@ -210,7 +210,7 @@ do_parse_commands (struct substring s, enum segmenter_mode mode, enum segment_type type; int n; - n = segmenter_push (&segmenter, s.string, s.length, &type); + n = segmenter_push (&segmenter, s.string, s.length, true, &type); assert (n >= 0); if (type == SEG_DO_REPEAT_COMMAND) @@ -220,7 +220,7 @@ do_parse_commands (struct substring s, enum segmenter_mode mode, int k; k = segmenter_push (&segmenter, s.string + n, s.length - n, - &type); + true, &type); if (type != SEG_NEWLINE && type != SEG_DO_REPEAT_COMMAND) break; @@ -275,9 +275,6 @@ parse_commands (struct lexer *lexer, struct hmap *dummies) ds_put_byte (&input, '\n'); lex_get (lexer); } - if (ds_is_empty (&input)) - ds_put_byte (&input, '\n'); - ds_put_byte (&input, '\0'); n_values = count_values (dummies); outputs = xmalloc (n_values * sizeof *outputs); diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index a3642f8a6c..fb45465f19 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -132,6 +132,7 @@ lex_reader_init (struct lex_reader *reader, reader->file_name = NULL; reader->encoding = NULL; reader->line_number = 0; + reader->eof = false; } /* Frees any file name already in READER and replaces it by a copy of @@ -876,7 +877,7 @@ lex_match_phrase (struct lexer *lexer, const char *s) int i; i = 0; - string_lexer_init (&slex, s, SEG_MODE_INTERACTIVE); + string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE); while (string_lexer_next (&slex, &token)) if (token.type != SCAN_SKIP) { @@ -1190,38 +1191,11 @@ lex_source_read__ (struct lex_source *src) space, prompt); assert (n <= space); - for (char *p = &src->buffer[head_ofs]; p < &src->buffer[head_ofs + n]; - p++) - if (*p == '\0') - { - struct msg m; - m.category = MSG_C_SYNTAX; - m.severity = MSG_S_ERROR; - 
m.file_name = src->reader->file_name; - m.first_line = 0; - m.last_line = 0; - m.first_column = 0; - m.last_column = 0; - m.text = xstrdup ("Bad character U+0000 in input."); - msg_emit (&m); - - *p = ' '; - } - if (n == 0) { - /* End of input. - - Ensure that the input always ends in a new-line followed by a null - byte, as required by the segmenter library. */ - - if (src->head == src->tail - || src->buffer[src->head - src->tail - 1] != '\n') - src->buffer[src->head++ - src->tail] = '\n'; - + /* End of input. */ + src->reader->eof = true; lex_source_expand__ (src); - src->buffer[src->head++ - src->tail] = '\0'; - return; } @@ -1261,6 +1235,7 @@ lex_ellipsize__ (struct substring in, char *out, size_t out_size) for (out_len = 0; out_len < in.length; out_len += mblen) { if (in.string[out_len] == '\n' + || in.string[out_len] == '\0' || (in.string[out_len] == '\r' && out_len + 1 < in.length && in.string[out_len + 1] == '\n')) @@ -1391,10 +1366,11 @@ lex_source_get__ (const struct lex_source *src_) size_t seg_maxlen = src->head - state.seg_pos; enum segment_type type; int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, - &type); + src->reader->eof, &type); if (seg_len < 0) { /* The segmenter needs more input to produce a segment. */ + assert (!src->reader->eof); lex_source_read__ (src); continue; } diff --git a/src/language/lexer/lexer.h b/src/language/lexer/lexer.h index 7383927eef..463747f974 100644 --- a/src/language/lexer/lexer.h +++ b/src/language/lexer/lexer.h @@ -56,6 +56,7 @@ struct lex_reader char *encoding; char *file_name; /* NULL if not associated with a file. */ int line_number; /* 1-based initial line number, 0 if none. */ + bool eof; }; /* An implementation of a lex_reader. 
*/ diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c index 2cd66f0796..573a00df9d 100644 --- a/src/language/lexer/scan.c +++ b/src/language/lexer/scan.c @@ -593,16 +593,17 @@ scanner_push (struct scanner *scanner, enum segment_type type, NOT_REACHED (); } -/* Initializes SLEX for parsing INPUT in the specified MODE. +/* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the + specified MODE. SLEX has no internal state to free, but it retains a reference to INPUT, so INPUT must not be modified or freed while SLEX is still in use. */ void -string_lexer_init (struct string_lexer *slex, const char *input, +string_lexer_init (struct string_lexer *slex, const char *input, size_t length, enum segmenter_mode mode) { slex->input = input; - slex->length = strlen (input) + 1; + slex->length = length; slex->offset = 0; segmenter_init (&slex->segmenter, mode); } @@ -624,7 +625,7 @@ string_lexer_next (struct string_lexer *slex, struct token *token) enum segment_type type; int n; - n = segmenter_push (&slex->segmenter, s, left, &type); + n = segmenter_push (&slex->segmenter, s, left, true, &type); assert (n >= 0); slex->offset += n; diff --git a/src/language/lexer/scan.h b/src/language/lexer/scan.h index 73f208033b..4327e9bb0b 100644 --- a/src/language/lexer/scan.h +++ b/src/language/lexer/scan.h @@ -101,7 +101,7 @@ struct string_lexer }; void string_lexer_init (struct string_lexer *, const char *input, - enum segmenter_mode); + size_t length, enum segmenter_mode); bool string_lexer_next (struct string_lexer *, struct token *); #endif /* scan.h */ diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index 52ff37a457..c0a09973ce 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -28,7 +28,6 @@ #include "gl/c-ctype.h" #include "gl/c-strcase.h" -#include "gl/memchr2.h" enum segmenter_state { @@ -55,108 +54,122 @@ enum segmenter_state #define SS_START_OF_COMMAND (1u << 1) static int 
segmenter_detect_command_name__ (const char *input, - size_t n, int ofs); + size_t n, bool eof, int ofs); static int -segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n) +segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof, + size_t ofs) { const uint8_t *input = CHAR_CAST (const uint8_t *, input_); int mblen; - assert (n > 0); + assert (n > ofs); + + input += ofs; + n -= ofs; mblen = u8_mbtoucr (puc, input, n); - return (mblen >= 0 ? mblen - : mblen == -2 ? -1 - : u8_mbtouc (puc, input, n)); + if (mblen >= 0) + return mblen; + else if (mblen != -2) + return u8_mbtouc (puc, input, n); + else if (eof) + { + *puc = 0xfffd; + return n; + } + else + return -1; } static int segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n, - enum segment_type *type) + bool eof, enum segment_type *type) { if (input[0] == '#') { - if (n < 2) - return -1; - else if (input[1] == '!') + if (n >= 2) { - int ofs; + if (input[1] == '!') + { + int ofs; - for (ofs = 2; ofs < n; ofs++) - if (input[ofs] == '\n' || input[ofs] == '\0') - { - if (input[ofs] == '\n' && input[ofs - 1] == '\r') - ofs--; + for (ofs = 2; ofs < n; ofs++) + if (input[ofs] == '\n') + { + if (input[ofs] == '\n' && input[ofs - 1] == '\r') + ofs--; - s->state = S_GENERAL; - s->substate = SS_START_OF_COMMAND; - *type = SEG_SHBANG; - return ofs; - } + s->state = S_GENERAL; + s->substate = SS_START_OF_COMMAND; + *type = SEG_SHBANG; + return ofs; + } - return -1; + return eof ? 
ofs : -1; + } } + else if (!eof) + return -1; } s->state = S_GENERAL; s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND; - return segmenter_push (s, input, n, type); + return segmenter_push (s, input, n, eof, type); } static int segmenter_parse_digraph__ (const char *seconds, struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { assert (s->state == S_GENERAL); - if (n < 2) - return -1; - *type = SEG_PUNCT; s->substate = 0; - return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1; + return (n < 2 + ? (eof ? 1 : -1) + : (strchr (seconds, input[1]) != NULL ? 2 : 1)); } static int -skip_comment (const char *input, size_t n, size_t ofs) +skip_comment (const char *input, size_t n, bool eof, size_t ofs) { for (; ofs < n; ofs++) { - if (input[ofs] == '\n' || input[ofs] == '\0') + if (input[ofs] == '\n') return ofs; else if (input[ofs] == '*') { if (ofs + 1 >= n) - return -1; + return eof ? ofs + 1 : -1; else if (input[ofs + 1] == '/') return ofs + 2; } } - return -1; + return eof ? ofs : -1; } static int -skip_spaces_and_comments (const char *input, size_t n, int ofs) +skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs) { while (ofs < n) { ucs4_t uc; int mblen; - mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); if (mblen < 0) return -1; if (uc == '/') { if (ofs + 1 >= n) - return -1; + return eof ? ofs : -1; else if (input[ofs + 1] != '*') return ofs; - ofs = skip_comment (input, n, ofs + 2); + ofs = skip_comment (input, n, eof, ofs + 2); if (ofs < 0) return -1; } @@ -166,18 +179,20 @@ skip_spaces_and_comments (const char *input, size_t n, int ofs) return ofs; } - return -1; + return eof ? 
ofs : -1; } static int -is_end_of_line (const char *input, size_t n, int ofs) +is_end_of_line (const char *input, size_t n, bool eof, int ofs) { - if (input[ofs] == '\n' || input[ofs] == '\0') + if (ofs >= n) + return eof ? 1 : -1; + else if (input[ofs] == '\n') return 1; else if (input[ofs] == '\r') { if (ofs + 1 >= n) - return -1; + return eof ? 1 : -1; return input[ofs + 1] == '\n'; } else @@ -185,17 +200,17 @@ is_end_of_line (const char *input, size_t n, int ofs) } static int -at_end_of_line (const char *input, size_t n, int ofs) +at_end_of_line (const char *input, size_t n, bool eof, int ofs) { - ofs = skip_spaces_and_comments (input, n, ofs); + ofs = skip_spaces_and_comments (input, n, eof, ofs); if (ofs < 0) return -1; - return is_end_of_line (input, n, ofs); + return is_end_of_line (input, n, eof, ofs); } static int -segmenter_parse_newline__ (const char *input, size_t n, +segmenter_parse_newline__ (const char *input, size_t n, bool eof, enum segment_type *type) { int ofs; @@ -205,7 +220,10 @@ segmenter_parse_newline__ (const char *input, size_t n, else { if (n < 2) - return -1; + { + assert (!eof); + return -1; + } assert (input[0] == '\r'); assert (input[1] == '\n'); @@ -217,93 +235,113 @@ segmenter_parse_newline__ (const char *input, size_t n, } static int -skip_spaces (const char *input, size_t n, size_t ofs) +skip_spaces (const char *input, size_t n, bool eof, size_t ofs) { while (ofs < n) { ucs4_t uc; int mblen; - mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); if (mblen < 0) return -1; - if (!lex_uc_is_space (uc) || uc == '\n' || uc == '\0') + if (!lex_uc_is_space (uc) || uc == '\n') return ofs; ofs += mblen; } - return -1; + return eof ? ofs : -1; } static int -skip_digits (const char *input, size_t n, int ofs) +skip_digits (const char *input, size_t n, bool eof, int ofs) { for (; ofs < n; ofs++) if (!c_isdigit (input[ofs])) return ofs; - return -1; + return eof ? 
ofs : -1; } static int segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n, - enum segment_type *type) + bool eof, enum segment_type *type) { int ofs; assert (s->state == S_GENERAL); - ofs = skip_digits (input, n, 0); + ofs = skip_digits (input, n, eof, 0); if (ofs < 0) return -1; + if (ofs >= n) + { + if (!eof) + return -1; + goto number; + }; if (input[ofs] == '.') { - ofs = skip_digits (input, n, ofs + 1); + ofs = skip_digits (input, n, eof, ofs + 1); if (ofs < 0) return -1; } if (ofs >= n) - return -1; + { + if (!eof) + return -1; + goto number; + } if (input[ofs] == 'e' || input[ofs] == 'E') { ofs++; if (ofs >= n) - return -1; + { + if (!eof) + return -1; + goto expected_exponent; + } if (input[ofs] == '+' || input[ofs] == '-') { ofs++; if (ofs >= n) - return -1; + { + if (!eof) + return -1; + goto expected_exponent; + } } if (!c_isdigit (input[ofs])) - { - *type = SEG_EXPECTED_EXPONENT; - s->substate = 0; - return ofs; - } + goto expected_exponent; - ofs = skip_digits (input, n, ofs); + ofs = skip_digits (input, n, eof, ofs); if (ofs < 0) return -1; } if (input[ofs - 1] == '.') { - int eol = at_end_of_line (input, n, ofs); + int eol = at_end_of_line (input, n, eof, ofs); if (eol < 0) return -1; else if (eol) ofs--; } +number: *type = SEG_NUMBER; s->substate = 0; return ofs; + +expected_exponent: + *type = SEG_EXPECTED_EXPONENT; + s->substate = 0; + return ofs; } static bool @@ -344,7 +382,7 @@ is_reserved_word (const char *s, int n) static int segmenter_parse_comment_1__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { int endcmd; @@ -357,7 +395,7 @@ segmenter_parse_comment_1__ (struct segmenter *s, ucs4_t uc; int mblen; - mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); if (mblen < 0) return -1; @@ -370,9 +408,7 @@ segmenter_parse_comment_1__ (struct segmenter *s, case '\n': if (ofs > 1 && input[ofs 
- 1] == '\r') ofs--; - /* Fall through. */ - case '\0': - if (endcmd == -2 || uc == '\0') + if (endcmd == -2) { /* Blank line ends comment command. */ s->state = S_GENERAL; @@ -405,50 +441,66 @@ segmenter_parse_comment_1__ (struct segmenter *s, ofs += mblen; } + + if (eof) + { + /* End of file. */ + s->state = S_GENERAL; + s->substate = SS_START_OF_COMMAND; + *type = SEG_SEPARATE_COMMANDS; + return ofs; + } + return -1; } static int -segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n, - enum segment_type *type) +segmenter_parse_comment_2__ (struct segmenter *s, const char *input, + size_t n, bool eof, enum segment_type *type) { - int new_cmd; - ucs4_t uc; - int mblen; - int ofs; - - ofs = segmenter_parse_newline__ (input, n, type); - if (ofs < 0 || ofs >= n) - return -1; - - mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); - if (mblen < 0) + int ofs = segmenter_parse_newline__ (input, n, eof, type); + if (ofs < 0) return -1; - if (uc == '+' || uc == '-' || uc == '.') - new_cmd = true; - else if (!lex_uc_is_space (uc)) - switch (s->mode) - { - case SEG_MODE_INTERACTIVE: - new_cmd = false; - break; + int new_cmd; + if (ofs >= n) + { + if (!eof) + return -1; + new_cmd = false; + } + else + { + ucs4_t uc; + int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); + if (mblen < 0) + return -1; - case SEG_MODE_BATCH: + if (uc == '+' || uc == '-' || uc == '.') new_cmd = true; - break; + else if (!lex_uc_is_space (uc)) + switch (s->mode) + { + case SEG_MODE_INTERACTIVE: + new_cmd = false; + break; - case SEG_MODE_AUTO: - new_cmd = segmenter_detect_command_name__ (input, n, ofs); - if (new_cmd < 0) - return -1; - break; + case SEG_MODE_BATCH: + new_cmd = true; + break; - default: - NOT_REACHED (); - } - else - new_cmd = false; + case SEG_MODE_AUTO: + new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs); + if (new_cmd < 0) + return -1; + break; + + default: + NOT_REACHED (); + } + else + new_cmd = false; + } if (new_cmd) { @@ 
-462,7 +514,7 @@ segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n, static int segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n, - enum segment_type *type) + bool eof, enum segment_type *type) { bool end_cmd; int ofs; @@ -474,7 +526,7 @@ segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n, ucs4_t uc; int mblen; - mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); if (mblen < 0) return -1; @@ -492,11 +544,6 @@ segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n, s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2; return ofs; - case '\0': - *type = SEG_DOCUMENT; - s->state = S_DOCUMENT_3; - return ofs; - default: if (!lex_uc_is_space (uc)) end_cmd = false; @@ -505,16 +552,22 @@ segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n, ofs += mblen; } + if (eof) + { + *type = SEG_DOCUMENT; + s->state = S_DOCUMENT_3; + return ofs; + } return -1; } static int segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n, - enum segment_type *type) + bool eof, enum segment_type *type) { int ofs; - ofs = segmenter_parse_newline__ (input, n, type); + ofs = segmenter_parse_newline__ (input, n, eof, type); if (ofs < 0) return -1; @@ -532,22 +585,27 @@ segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type) } static int -segmenter_unquoted (const char *input, size_t n, int ofs) +segmenter_unquoted (const char *input, size_t n, bool eof, int ofs) { - char c; - - ofs = skip_spaces_and_comments (input, n, ofs); + ofs = skip_spaces_and_comments (input, n, eof, ofs); if (ofs < 0) return -1; - - c = input[ofs]; - return c != '\'' && c != '"' && c != '\n' && c != '\0'; + else if (ofs < n) + { + char c = input[ofs]; + return c != '\'' && c != '"' && c != '\n'; + } + else + { + assert (eof); + return 0; + } } static int next_id_in_command (const struct 
segmenter *s, const char *input, size_t n, - int ofs, char id[], size_t id_size) + bool eof, int ofs, char id[], size_t id_size) { struct segmenter sub; @@ -561,7 +619,7 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n, enum segment_type type; int retval; - retval = segmenter_push (&sub, input + ofs, n - ofs, &type); + retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type); if (retval < 0) { id[0] = '\0'; @@ -612,13 +670,15 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n, } } +/* Called when INPUT begins with a character that can start off an ID token. */ static int segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, - enum segment_type *type) + bool eof, enum segment_type *type) { ucs4_t uc; int ofs; + assert (n > 0); assert (s->state == S_GENERAL); ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n); @@ -627,9 +687,13 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, int mblen; if (ofs >= n) - return -1; + { + if (eof) + break; + return -1; + } - mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); if (mblen < 0) return -1; else if (!lex_uc_is_idn (uc)) @@ -640,7 +704,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, if (input[ofs - 1] == '.') { - int eol = at_end_of_line (input, n, ofs); + int eol = at_end_of_line (input, n, eof, ofs); if (eol < 0) return -1; else if (eol) @@ -659,7 +723,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4)) { s->state = S_COMMENT_1; - return segmenter_parse_comment_1__ (s, input, n, type); + return segmenter_parse_comment_1__ (s, input, n, eof, type); } else if (lex_id_match (ss_cstr ("DOCUMENT"), word)) { @@ -670,7 +734,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, else if (lex_id_match (ss_cstr ("TITLE"), word) || 
lex_id_match (ss_cstr ("SUBTITLE"), word)) { - int result = segmenter_unquoted (input, n, ofs); + int result = segmenter_unquoted (input, n, eof, ofs); if (result < 0) return -1; else if (result) @@ -683,7 +747,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, { char id[16]; - if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0) + if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0) return -1; else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id))) { @@ -696,7 +760,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, { char id[16]; - if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0) + if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0) return -1; else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id))) { @@ -710,25 +774,27 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, char id[16]; int ofs2; - ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id); + ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id); if (ofs2 < 0) return -1; else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id))) { int eol; - ofs2 = skip_spaces_and_comments (input, n, ofs2); + ofs2 = skip_spaces_and_comments (input, n, eof, ofs2); if (ofs2 < 0) return -1; - if (input[ofs2] == '.') + if (ofs2 >= n) + assert (eof); + else if (input[ofs2] == '.') { - ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1); + ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1); if (ofs2 < 0) return -1; } - eol = is_end_of_line (input, n, ofs2); + eol = is_end_of_line (input, n, eof, ofs2); if (eol < 0) return -1; else if (eol) @@ -751,7 +817,8 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, static int segmenter_parse_string__ (enum segment_type string_type, int ofs, struct segmenter *s, - const char *input, size_t n, enum segment_type *type) + const char *input, size_t n, bool eof, + enum segment_type *type) { int quote = input[ofs]; @@ 
-760,46 +827,57 @@ segmenter_parse_string__ (enum segment_type string_type, if (input[ofs] == quote) { ofs++; - if (ofs >= n) - return -1; - else if (input[ofs] == quote) - ofs++; - else + if (ofs < n) { - *type = string_type; - s->substate = 0; - return ofs; + if (input[ofs] == quote) + { + ofs++; + continue; + } } - } - else if (input[ofs] == '\n' || input[ofs] == '\0') - { - *type = SEG_EXPECTED_QUOTE; + else if (!eof) + return -1; + + *type = string_type; s->substate = 0; return ofs; } + else if (input[ofs] == '\n') + goto expected_quote; else ofs++; + if (eof) + goto expected_quote; + return -1; + +expected_quote: + *type = SEG_EXPECTED_QUOTE; + s->substate = 0; + return ofs; } static int segmenter_maybe_parse_string__ (enum segment_type string_type, struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { if (n < 2) - return -1; + { + if (!eof) + return -1; + } else if (input[1] == '\'' || input[1] == '"') - return segmenter_parse_string__ (string_type, 1, s, input, n, type); - else - return segmenter_parse_id__ (s, input, n, type); + return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type); + + return segmenter_parse_id__ (s, input, n, eof, type); } static int segmenter_parse_mid_command__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { ucs4_t uc; @@ -809,7 +887,7 @@ segmenter_parse_mid_command__ (struct segmenter *s, assert (s->state == S_GENERAL); assert (!(s->substate & SS_START_OF_LINE)); - mblen = segmenter_u8_to_uc__ (&uc, input, n); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0); if (mblen < 0) return -1; @@ -821,23 +899,24 @@ segmenter_parse_mid_command__ (struct segmenter *s, return 1; case '/': - if (n == 1) - return -1; + if (n < 2) + { + if (!eof) + return -1; + } else if (input[1] == '*') { - ofs = skip_comment (input, n, 2); + ofs = skip_comment (input, n, eof, 2); if (ofs < 0) return 
-1; *type = SEG_COMMENT; return ofs; } - else - { - s->substate = 0; - *type = SEG_PUNCT; - return 1; - } + + s->substate = 0; + *type = SEG_PUNCT; + return 1; case '(': case ')': case ',': case '=': case '-': case '[': case ']': case '&': case '|': case '+': @@ -850,62 +929,62 @@ segmenter_parse_mid_command__ (struct segmenter *s, { /* '*' at the beginning of a command begins a comment. */ s->state = S_COMMENT_1; - return segmenter_parse_comment_1__ (s, input, n, type); + return segmenter_parse_comment_1__ (s, input, n, eof, type); } else - return segmenter_parse_digraph__ ("*", s, input, n, type); + return segmenter_parse_digraph__ ("*", s, input, n, eof, type); case '<': - return segmenter_parse_digraph__ ("=>", s, input, n, type); + return segmenter_parse_digraph__ ("=>", s, input, n, eof, type); case '>': - return segmenter_parse_digraph__ ("=", s, input, n, type); + return segmenter_parse_digraph__ ("=", s, input, n, eof, type); case '~': - return segmenter_parse_digraph__ ("=", s, input, n, type); + return segmenter_parse_digraph__ ("=", s, input, n, eof, type); case '.': if (n < 2) - return -1; - else if (c_isdigit (input[1])) - return segmenter_parse_number__ (s, input, n, type); - else { - int eol = at_end_of_line (input, n, 1); - if (eol < 0) + if (!eof) return -1; + } + else if (c_isdigit (input[1])) + return segmenter_parse_number__ (s, input, n, eof, type); - if (eol) - { - *type = SEG_END_COMMAND; - s->substate = SS_START_OF_COMMAND; - } - else - *type = SEG_UNEXPECTED_DOT; - return 1; + int eol = at_end_of_line (input, n, eof, 1); + if (eol < 0) + return -1; + + if (eol) + { + *type = SEG_END_COMMAND; + s->substate = SS_START_OF_COMMAND; } - NOT_REACHED (); + else + *type = SEG_UNEXPECTED_DOT; + return 1; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - return segmenter_parse_number__ (s, input, n, type); + return segmenter_parse_number__ (s, input, n, eof, type); case 'u': case 'U': return 
segmenter_maybe_parse_string__ (SEG_UNICODE_STRING, - s, input, n, type); + s, input, n, eof, type); case 'x': case 'X': return segmenter_maybe_parse_string__ (SEG_HEX_STRING, - s, input, n, type); + s, input, n, eof, type); case '\'': case '"': return segmenter_parse_string__ (SEG_QUOTED_STRING, 0, - s, input, n, type); + s, input, n, eof, type); default: if (lex_uc_is_space (uc)) { - ofs = skip_spaces (input, n, mblen); + ofs = skip_spaces (input, n, eof, mblen); if (ofs < 0) return -1; @@ -924,7 +1003,7 @@ segmenter_parse_mid_command__ (struct segmenter *s, return ofs; } else if (lex_uc_is_id1 (uc)) - return segmenter_parse_id__ (s, input, n, type); + return segmenter_parse_id__ (s, input, n, eof, type); else { *type = SEG_UNEXPECTED_CHAR; @@ -985,7 +1064,8 @@ segmenter_get_command_name_candidates (unsigned char first) } static int -segmenter_detect_command_name__ (const char *input, size_t n, int ofs) +segmenter_detect_command_name__ (const char *input, size_t n, bool eof, + int ofs) { const char **commands; @@ -998,13 +1078,17 @@ segmenter_detect_command_name__ (const char *input, size_t n, int ofs) int mblen; if (ofs >= n) - return -1; + { + if (eof) + break; + return -1; + } - mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); if (mblen < 0) return -1; - if (uc == '\n' || uc == '\0' + if (uc == '\n' || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-')) break; @@ -1033,15 +1117,16 @@ segmenter_detect_command_name__ (const char *input, size_t n, int ofs) } static int -is_start_of_string__ (const char *input, size_t n, int ofs) +is_start_of_string__ (const char *input, size_t n, bool eof, int ofs) { - int c; + if (ofs >= n) + return eof ? 0 : -1; - c = input[ofs]; + int c = input[ofs]; if (c == 'x' || c == 'X' || c == 'u' || c == 'U') { if (ofs + 1 >= n) - return -1; + return eof ? 
0 : -1; return input[ofs + 1] == '\'' || input[ofs + 1] == '"'; } @@ -1051,7 +1136,7 @@ is_start_of_string__ (const char *input, size_t n, int ofs) static int segmenter_parse_start_of_line__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { ucs4_t uc; @@ -1061,19 +1146,19 @@ segmenter_parse_start_of_line__ (struct segmenter *s, assert (s->state == S_GENERAL); assert (s->substate & SS_START_OF_LINE); - mblen = segmenter_u8_to_uc__ (&uc, input, n); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0); if (mblen < 0) return -1; switch (uc) { case '+': - ofs = skip_spaces_and_comments (input, n, 1); + ofs = skip_spaces_and_comments (input, n, eof, 1); if (ofs < 0) return -1; else { - int is_string = is_start_of_string__ (input, n, ofs); + int is_string = is_start_of_string__ (input, n, eof, ofs); if (is_string < 0) return -1; else if (is_string) @@ -1095,7 +1180,7 @@ segmenter_parse_start_of_line__ (struct segmenter *s, default: if (lex_uc_is_space (uc)) { - int eol = at_end_of_line (input, n, 0); + int eol = at_end_of_line (input, n, eof, 0); if (eol < 0) return -1; else if (eol) @@ -1111,7 +1196,7 @@ segmenter_parse_start_of_line__ (struct segmenter *s, break; else if (s->mode == SEG_MODE_AUTO) { - int cmd = segmenter_detect_command_name__ (input, n, 0); + int cmd = segmenter_detect_command_name__ (input, n, eof, 0); if (cmd < 0) return -1; else if (cmd == 0) @@ -1126,12 +1211,12 @@ segmenter_parse_start_of_line__ (struct segmenter *s, } s->substate = SS_START_OF_COMMAND; - return segmenter_parse_mid_command__ (s, input, n, type); + return segmenter_parse_mid_command__ (s, input, n, eof, type); } static int segmenter_parse_file_label__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { struct segmenter sub; @@ -1139,7 +1224,7 @@ segmenter_parse_file_label__ (struct segmenter *s, sub = *s; sub.state = S_GENERAL; - ofs = 
segmenter_push (&sub, input, n, type); + ofs = segmenter_push (&sub, input, n, eof, type); if (ofs < 0) return -1; @@ -1149,7 +1234,7 @@ segmenter_parse_file_label__ (struct segmenter *s, assert (lex_id_match (ss_cstr ("LABEL"), ss_buffer ((char *) input, ofs))); - result = segmenter_unquoted (input, n, ofs); + result = segmenter_unquoted (input, n, eof, ofs); if (result < 0) return -1; else @@ -1170,7 +1255,8 @@ segmenter_parse_file_label__ (struct segmenter *s, static int segmenter_subparse (struct segmenter *s, - const char *input, size_t n, enum segment_type *type) + const char *input, size_t n, bool eof, + enum segment_type *type) { struct segmenter sub; int ofs; @@ -1178,17 +1264,17 @@ segmenter_subparse (struct segmenter *s, sub.mode = s->mode; sub.state = S_GENERAL; sub.substate = s->substate; - ofs = segmenter_push (&sub, input, n, type); + ofs = segmenter_push (&sub, input, n, eof, type); s->substate = sub.substate; return ofs; } static int segmenter_parse_do_repeat_1__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { - int ofs = segmenter_subparse (s, input, n, type); + int ofs = segmenter_subparse (s, input, n, eof, type); if (ofs < 0) return -1; @@ -1205,10 +1291,10 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s, static int segmenter_parse_do_repeat_2__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { - int ofs = segmenter_subparse (s, input, n, type); + int ofs = segmenter_subparse (s, input, n, eof, type); if (ofs < 0) return -1; @@ -1223,7 +1309,7 @@ segmenter_parse_do_repeat_2__ (struct segmenter *s, static bool check_repeat_command (struct segmenter *s, - const char *input, size_t n) + const char *input, size_t n, bool eof) { int direction; char id[16]; @@ -1233,7 +1319,7 @@ check_repeat_command (struct segmenter *s, if (input[ofs] == '+' || input[ofs] == '-') ofs++; - ofs = next_id_in_command (s, 
input, n, ofs, id, sizeof id); + ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id); if (ofs < 0) return false; else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id))) @@ -1243,7 +1329,7 @@ check_repeat_command (struct segmenter *s, else return true; - ofs = next_id_in_command (s, input, n, ofs, id, sizeof id); + ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id); if (ofs < 0) return false; @@ -1253,48 +1339,40 @@ check_repeat_command (struct segmenter *s, } static int -segmenter_parse_full_line__ (const char *input, size_t n, +segmenter_parse_full_line__ (const char *input, size_t n, bool eof, enum segment_type *type) { - const char *newline = memchr2 (input, '\n', '\0', n); + const char *newline = memchr (input, '\n', n); + if (!newline) + return eof ? n : -1; - if (newline == NULL) - return -1; - else + ptrdiff_t ofs = newline - input; + if (ofs == 0 || (ofs == 1 && input[0] == '\r')) { - int ofs = newline - input; - if (*newline == '\0') - { - assert (ofs > 0); - return ofs; - } - else if (ofs == 0 || (ofs == 1 && input[0] == '\r')) - { - *type = SEG_NEWLINE; - return ofs + 1; - } - else - return ofs - (input[ofs - 1] == '\r'); + *type = SEG_NEWLINE; + return ofs + 1; } + else + return ofs - (input[ofs - 1] == '\r'); } static int segmenter_parse_do_repeat_3__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { int ofs; - ofs = segmenter_parse_full_line__ (input, n, type); - if (ofs < 0 || input[ofs - 1] == '\n') + ofs = segmenter_parse_full_line__ (input, n, eof, type); + if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n')) return ofs; - else if (!check_repeat_command (s, input, n)) + else if (!check_repeat_command (s, input, n, eof) && !eof) return -1; else if (s->substate == 0) { s->state = S_GENERAL; s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE; - return segmenter_push (s, input, n, type); + return segmenter_push (s, input, n, eof, type); } else { @@ -1305,10 
+1383,10 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s, static int segmenter_parse_begin_data_1__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { - int ofs = segmenter_subparse (s, input, n, type); + int ofs = segmenter_subparse (s, input, n, eof, type); if (ofs < 0) return -1; @@ -1320,10 +1398,10 @@ segmenter_parse_begin_data_1__ (struct segmenter *s, static int segmenter_parse_begin_data_2__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { - int ofs = segmenter_subparse (s, input, n, type); + int ofs = segmenter_subparse (s, input, n, eof, type); if (ofs < 0) return -1; @@ -1342,7 +1420,7 @@ is_end_data (const char *input, size_t n) int mblen; int ofs; - if (n < 3 || c_strncasecmp (input, "END", 3)) + if (n < 4 || c_strncasecmp (input, "END", 3)) return false; ofs = 3; @@ -1375,19 +1453,19 @@ is_end_data (const char *input, size_t n) static int segmenter_parse_begin_data_3__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { int ofs; - ofs = segmenter_parse_full_line__ (input, n, type); + ofs = segmenter_parse_full_line__ (input, n, eof, type); if (ofs < 0) return -1; else if (is_end_data (input, ofs)) { s->state = S_GENERAL; s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE; - return segmenter_push (s, input, n, type); + return segmenter_push (s, input, n, eof, type); } else { @@ -1399,12 +1477,12 @@ segmenter_parse_begin_data_3__ (struct segmenter *s, static int segmenter_parse_begin_data_4__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { int ofs; - ofs = segmenter_parse_newline__ (input, n, type); + ofs = segmenter_parse_newline__ (input, n, eof, type); if (ofs < 0) return -1; @@ -1414,12 +1492,12 @@ segmenter_parse_begin_data_4__ (struct segmenter *s, static int 
segmenter_parse_title_1__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { int ofs; - ofs = skip_spaces (input, n, 0); + ofs = skip_spaces (input, n, eof, 0); if (ofs < 0) return -1; s->state = S_TITLE_2; @@ -1429,7 +1507,7 @@ segmenter_parse_title_1__ (struct segmenter *s, static int segmenter_parse_title_2__ (struct segmenter *s, - const char *input, size_t n, + const char *input, size_t n, bool eof, enum segment_type *type) { int endcmd; @@ -1442,18 +1520,14 @@ segmenter_parse_title_2__ (struct segmenter *s, ucs4_t uc; int mblen; - mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); if (mblen < 0) return -1; switch (uc) { case '\n': - case '\0': - s->state = S_GENERAL; - s->substate = 0; - *type = SEG_UNQUOTED_STRING; - return endcmd >= 0 ? endcmd : ofs; + goto end_of_line; case '.': endcmd = ofs; @@ -1468,6 +1542,15 @@ segmenter_parse_title_2__ (struct segmenter *s, ofs += mblen; } + if (eof) + { + end_of_line: + s->state = S_GENERAL; + s->substate = 0; + *type = SEG_UNQUOTED_STRING; + return endcmd >= 0 ? endcmd : ofs; + } + return -1; } @@ -1510,9 +1593,9 @@ segmenter_get_mode (const struct segmenter *s) /* Attempts to label a prefix of S's remaining input with a segment type. The caller supplies the first N bytes of the remaining input as INPUT, which - must be a UTF-8 encoded string. The end of the input stream must be - indicated by a null byte at the beginning of a line, that is, immediately - following a new-line (or as the first byte of the input stream). + must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied + are the entire (remainder) of the input; if EOF is false, then further input + is potentially available. The input may contain '\n' or '\r\n' line ends in any combination. @@ -1523,11 +1606,11 @@ segmenter_get_mode (const struct segmenter *s) the segmenter. 
Failure occurs only if the segment type of the N bytes in INPUT cannot yet - be determined. In this case segmenter_push() returns -1. The caller should - obtain more input and then call segmenter_push() again with a larger N and - repeat until the input is exhausted (which must be indicated as described - above) or until a valid segment is returned. segmenter_push() will never - return -1 when the end of input is visible within INPUT. + be determined. In this case segmenter_push() returns -1. If more input is + available, the caller should obtain some more, then call again with a larger + N. If this is not enough, the process might need to repeat again and again. + If input is exhausted, then the caller may call again setting EOF to true. + segmenter_push() will never return -1 when EOF is true. The caller must not, in a sequence of calls, supply contradictory input. That is, bytes provided as part of INPUT in one call, but not consumed, must @@ -1535,63 +1618,65 @@ segmenter_get_mode (const struct segmenter *s) because segmenter_push() must often make decisions based on looking ahead beyond the bytes that it consumes. */ int -segmenter_push (struct segmenter *s, const char *input, size_t n, +segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof, enum segment_type *type) { - if (n == 0) - return -1; - - if (input[0] == '\0') + if (!n) { - *type = SEG_END; - return 1; + if (eof) + { + *type = SEG_END; + return 0; + } + else + return -1; } switch (s->state) { case S_SHBANG: - return segmenter_parse_shbang__ (s, input, n, type); + return segmenter_parse_shbang__ (s, input, n, eof, type); case S_GENERAL: return (s->substate & SS_START_OF_LINE - ? segmenter_parse_start_of_line__ (s, input, n, type) - : segmenter_parse_mid_command__ (s, input, n, type)); + ? 
segmenter_parse_start_of_line__ (s, input, n, eof, type) + : segmenter_parse_mid_command__ (s, input, n, eof, type)); case S_COMMENT_1: - return segmenter_parse_comment_1__ (s, input, n, type); + return segmenter_parse_comment_1__ (s, input, n, eof, type); case S_COMMENT_2: - return segmenter_parse_comment_2__ (s, input, n, type); + return segmenter_parse_comment_2__ (s, input, n, eof, type); case S_DOCUMENT_1: - return segmenter_parse_document_1__ (s, input, n, type); + return segmenter_parse_document_1__ (s, input, n, eof, type); case S_DOCUMENT_2: - return segmenter_parse_document_2__ (s, input, n, type); + return segmenter_parse_document_2__ (s, input, n, eof, type); case S_DOCUMENT_3: return segmenter_parse_document_3__ (s, type); case S_FILE_LABEL: - return segmenter_parse_file_label__ (s, input, n, type); + return segmenter_parse_file_label__ (s, input, n, eof, type); case S_DO_REPEAT_1: - return segmenter_parse_do_repeat_1__ (s, input, n, type); + return segmenter_parse_do_repeat_1__ (s, input, n, eof, type); case S_DO_REPEAT_2: - return segmenter_parse_do_repeat_2__ (s, input, n, type); + return segmenter_parse_do_repeat_2__ (s, input, n, eof, type); case S_DO_REPEAT_3: - return segmenter_parse_do_repeat_3__ (s, input, n, type); + return segmenter_parse_do_repeat_3__ (s, input, n, eof, type); case S_BEGIN_DATA_1: - return segmenter_parse_begin_data_1__ (s, input, n, type); + return segmenter_parse_begin_data_1__ (s, input, n, eof, type); case S_BEGIN_DATA_2: - return segmenter_parse_begin_data_2__ (s, input, n, type); + return segmenter_parse_begin_data_2__ (s, input, n, eof, type); case S_BEGIN_DATA_3: - return segmenter_parse_begin_data_3__ (s, input, n, type); + return segmenter_parse_begin_data_3__ (s, input, n, eof, type); case S_BEGIN_DATA_4: - return segmenter_parse_begin_data_4__ (s, input, n, type); + return segmenter_parse_begin_data_4__ (s, input, n, eof, type); case S_TITLE_1: - return segmenter_parse_title_1__ (s, input, n, type); + return 
segmenter_parse_title_1__ (s, input, n, eof, type); case S_TITLE_2: - return segmenter_parse_title_2__ (s, input, n, type); + return segmenter_parse_title_2__ (s, input, n, eof, type); } NOT_REACHED (); diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h index 1c209c5acb..c647c8691d 100644 --- a/src/language/lexer/segment.h +++ b/src/language/lexer/segment.h @@ -118,7 +118,7 @@ void segmenter_init (struct segmenter *, enum segmenter_mode); enum segmenter_mode segmenter_get_mode (const struct segmenter *); -int segmenter_push (struct segmenter *, const char *input, size_t n, +int segmenter_push (struct segmenter *, const char *input, size_t n, bool eof, enum segment_type *); enum prompt_style segmenter_get_prompt (const struct segmenter *); diff --git a/tests/language/lexer/lexer.at b/tests/language/lexer/lexer.at index f13940bd67..87ce344eac 100644 --- a/tests/language/lexer/lexer.at +++ b/tests/language/lexer/lexer.at @@ -84,16 +84,13 @@ AT_SETUP([lexer crash due to null byte]) printf "datA dist list notable file='input.txt'/a b c. lis|.\0" > lexer.sps -# We sort the output into a predictable order because the lexer finds -# and reports null bytes as soon as it reads them into its input -# buffer, as opposed to when it encounters them during tokenization. -# This also means that null bytes might be reported as part of one -# command or another or none, hence removing the LIST: prefix. -AT_CHECK([pspp -O format=csv lexer.sps > lexer.csv], [1]) -AT_CHECK([sed '/^$/d -s/LIST: //' lexer.csv | sort], [0], [dnl -lexer.sps: error: Bad character U+0000 in input. +AT_CHECK([pspp -O format=csv lexer.sps], [1], [dnl lexer.sps:1: error: Unknown command `datA dist'. -lexer.sps:2: error: LIST is allowed only after the active dataset has been defined. + +lexer.sps:2: error: LIST: LIST is allowed only after the active dataset has been defined. + +lexer.sps:2.5: error: LIST: Syntax error at `.': Unexpected `.' in middle of command. 
+ +lexer.sps:2.6: error: LIST: Syntax error at `...': Bad character U+0000 in input. ]) AT_CLEANUP diff --git a/tests/language/lexer/scan-test.c b/tests/language/lexer/scan-test.c index cfa8a79938..abbf0f9455 100644 --- a/tests/language/lexer/scan-test.c +++ b/tests/language/lexer/scan-test.c @@ -39,6 +39,10 @@ /* -a/--auto, -b/--batch, -i/--interactive: syntax mode. */ static enum segmenter_mode mode = SEG_MODE_AUTO; +/* -s, --strip-trailing-newline: Strip trailing newline from last line of + input. */ +static bool strip_trailing_newline; + static const char *parse_options (int argc, char **argv); static void usage (void) NO_RETURN; @@ -55,19 +59,21 @@ main (int argc, char *argv[]) set_program_name (argv[0]); file_name = parse_options (argc, argv); - /* Read from stdin into 'input'. Ensure that 'input' ends in a new-line - followed by a null byte. */ + /* Read from stdin into 'input'. */ input = (!strcmp (file_name, "-") ? fread_file (stdin, &length) : read_file (file_name, &length)); if (input == NULL) error (EXIT_FAILURE, errno, "reading %s failed", file_name); - input = xrealloc (input, length + 3); - if (length == 0 || input[length - 1] != '\n') - input[length++] = '\n'; - input[length++] = '\0'; - string_lexer_init (&slex, input, mode); + if (strip_trailing_newline && length && input[length - 1] == '\n') + { + length--; + if (length && input[length - 1] == '\r') + length--; + } + + string_lexer_init (&slex, input, length, mode); do { struct token token; @@ -107,11 +113,12 @@ parse_options (int argc, char **argv) {"auto", no_argument, NULL, 'a'}, {"batch", no_argument, NULL, 'b'}, {"interactive", no_argument, NULL, 'i'}, + {"strip-trailing-newline", no_argument, NULL, 's'}, {"help", no_argument, NULL, 'h'}, {NULL, 0, NULL, 0}, }; - int c = getopt_long (argc, argv, "abih", options, NULL); + int c = getopt_long (argc, argv, "sabih", options, NULL); if (c == -1) break; @@ -129,6 +136,10 @@ parse_options (int argc, char **argv) mode = SEG_MODE_INTERACTIVE; break; 
+ case 's': + strip_trailing_newline = true; + break; + case 'h': usage (); @@ -159,10 +170,10 @@ usage (void) usage: %s [OPTIONS] INPUT\n\ \n\ Options:\n\ - -1, --one-segment feed one segment at a time\n\ -a, --auto use \"auto\" syntax mode\n\ -b, --batch use \"batch\" syntax mode\n\ -i, --interactive use \"interactive\" syntax mode (default)\n\ + -s, --strip-trailing-newline remove newline from end of input\n\ -v, --verbose include rows and column numbers in output\n\ -h, --help print this help message\n", program_name, program_name); diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at index a6b0e62881..8eb48059e9 100644 --- a/tests/language/lexer/scan.at +++ b/tests/language/lexer/scan.at @@ -16,7 +16,11 @@ dnl along with this program. If not, see . dnl AT_BANNER([syntax scanning]) m4_define([PSPP_CHECK_SCAN], - [AT_CHECK([scan-test $1 input], [0], [expout])]) + [sed 's/^-//' < expout-base > expout + AT_CHECK([scan-test $1 input], [0], [expout]) + + sed '/^-/d' < expout-base > expout + AT_CHECK([scan-test -s $1 input], [0], [expout])]) AT_SETUP([identifiers]) AT_KEYWORDS([scan]) @@ -28,7 +32,7 @@ QrStUv./* end of line comment */ @&t@ WXYZ. /* unterminated end of line comment �. /* U+FFFD is not valid in an identifier ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl ID "a" SKIP ID "aB" @@ -73,7 +77,7 @@ UNEXPECTED_CHAR 65533 ENDCMD SKIP SKIP -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -87,7 +91,7 @@ AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH andx orx notx eqx gex gtx lex ltx nex allx byx tox withx and. with. ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl AND SKIP OR @@ -170,7 +174,7 @@ ID "and." 
SKIP WITH ENDCMD -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -182,7 +186,7 @@ AT_DATA([input], [dnl ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] ** ~&|=>=><=<~=<>(),-+*/[[]]** ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl NOT SKIP AND @@ -243,7 +247,7 @@ SLASH LBRACK RBRACK EXP -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -260,7 +264,7 @@ AT_DATA([input], [dnl 1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 . 1e e1 1e+ 1e- ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl POS_NUM SKIP POS_NUM 1 @@ -328,7 +332,7 @@ SKIP EXPECTED_EXPONENT "1e+" SKIP EXPECTED_EXPONENT "1e-" -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -367,7 +371,7 @@ x"4142" "�あいうえお" "abc"+U"FFFD"+u'3048'+"xyz" ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl STRING "x" SKIP STRING "y" @@ -423,7 +427,7 @@ SKIP STRING "�あいうえお" SKIP STRING "abc�えxyz" -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -435,7 +439,7 @@ AT_DATA([input], [dnl #! /usr/bin/pspp #! /usr/bin/pspp ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl SKIP SKIP ID "#" @@ -447,7 +451,7 @@ SLASH ID "bin" SLASH ID "pspp" -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -473,7 +477,7 @@ com is ambiguous with COMPUTE. next command. ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl SKIP SKIP SKIP @@ -523,8 +527,8 @@ SKIP ID "command" ENDCMD SKIP -ENDCMD -SKIP +-ENDCMD +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -544,7 +548,7 @@ isn't parsed as tokens second paragraph. ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl ID "DOCUMENT" STRING "DOCUMENT one line." ENDCMD @@ -571,9 +575,9 @@ SKIP STRING "" SKIP STRING "second paragraph." 
-ENDCMD -ENDCMD -SKIP +-ENDCMD +-ENDCMD +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -598,7 +602,7 @@ FILE /* /**/ lab not quoted here either ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl ID "title" SKIP STRING "Quoted string title" @@ -656,8 +660,8 @@ ID "lab" SKIP STRING "not quoted here either" SKIP -ENDCMD -SKIP +-ENDCMD +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -678,7 +682,7 @@ end data end data . ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl ID "begin" SKIP ID "data" @@ -714,7 +718,7 @@ SKIP ID "data" SKIP ENDCMD -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -733,7 +737,7 @@ end /* x */ /* y */ repeat print. end repeat. ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl ID "do" SKIP ID "repeat" @@ -771,7 +775,7 @@ SKIP SKIP ID "repeat" ENDCMD -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-i]) @@ -788,7 +792,7 @@ third command fourth command. fifth command. ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl ID "first" SKIP ID "command" @@ -827,7 +831,7 @@ ID "fifth" SKIP ID "command" ENDCMD -SKIP +-SKIP STOP ]) PSPP_CHECK_SCAN([-b]) diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c index ef5ff613fe..2cd141cfe0 100644 --- a/tests/language/lexer/segment-test.c +++ b/tests/language/lexer/segment-test.c @@ -50,6 +50,10 @@ static bool one_byte; /* -0, --truncations: Check that every truncation of input yields a result. */ static bool check_truncations; +/* -s, --strip-trailing-newline: Strip trailing newline from last line of + input. 
*/ +static bool strip_trailing_newline; + static const char *parse_options (int argc, char **argv); static void usage (void) NO_RETURN; @@ -74,23 +78,23 @@ main (int argc, char *argv[]) if (input == NULL) error (EXIT_FAILURE, errno, "reading %s failed", file_name); - if (!check_truncations) + if (strip_trailing_newline && length && input[length - 1] == '\n') { - input = xrealloc (input, length + 3); - if (length == 0 || input[length - 1] != '\n') - input[length++] = '\n'; - input[length++] = '\0'; - - check_segmentation (input, length, true); + length--; + if (length && input[length - 1] == '\r') + length--; } + + if (!check_truncations) + check_segmentation (input, length, true); else { size_t test_len; for (test_len = 0; test_len <= length; test_len++) { - char *copy = xmemdup0 (input, test_len); - check_segmentation (copy, test_len + 1, false); + char *copy = xmemdup (input, test_len); + check_segmentation (copy, test_len, false); free (copy); } } @@ -102,18 +106,16 @@ main (int argc, char *argv[]) static void check_segmentation (const char *input, size_t length, bool print_segments) { - size_t offset, line_number, line_offset; struct segmenter s; - int prev_type; - segmenter_init (&s, mode); - line_number = 1; - line_offset = 0; - prev_type = -1; - for (offset = 0; offset < length; ) + size_t line_number = 1; + size_t line_offset = 0; + int prev_type = -1; + size_t offset = 0; + enum segment_type type; + do { - enum segment_type type; const char *type_name, *p; int n; @@ -132,7 +134,7 @@ check_segmentation (const char *input, size_t length, bool print_segments) n_newlines++; copy = xmemdup (input + offset, i); - n = segmenter_push (&s, copy, i, &type); + n = segmenter_push (&s, copy, i, i + offset >= length, &type); free (copy); if (n >= 0) @@ -141,17 +143,24 @@ check_segmentation (const char *input, size_t length, bool print_segments) assert (n_newlines <= 2); } else - n = segmenter_push (&s, input + offset, length - offset, &type); + n = segmenter_push (&s, 
input + offset, length - offset, true, &type); if (n < 0) - error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu", - offset); + { + if (!print_segments) + check_segmentation (input, length, true); + else + error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu", + offset); + } assert (offset + n <= length); if (type == SEG_NEWLINE) - assert ((n == 1 && input[offset] == '\n') - || (n == 2 - && input[offset] == '\r' && input[offset + 1] == '\n')); + { + assert ((n == 1 && input[offset] == '\n') + || (n == 2 + && input[offset] == '\r' && input[offset + 1] == '\n')); + } else assert (memchr (&input[offset], '\n', n) == NULL); @@ -266,6 +275,7 @@ check_segmentation (const char *input, size_t length, bool print_segments) printf (" (%s)\n", prompt_style_to_string (prompt)); } } + while (type != SEG_END); if (print_segments) putchar ('\n'); @@ -280,6 +290,7 @@ parse_options (int argc, char **argv) { {"one-byte", no_argument, NULL, '1'}, {"truncations", no_argument, NULL, '0'}, + {"strip-trailing-newline", no_argument, NULL, 's'}, {"auto", no_argument, NULL, 'a'}, {"batch", no_argument, NULL, 'b'}, {"interactive", no_argument, NULL, 'i'}, @@ -288,7 +299,7 @@ parse_options (int argc, char **argv) {NULL, 0, NULL, 0}, }; - int c = getopt_long (argc, argv, "01abivh", options, NULL); + int c = getopt_long (argc, argv, "01abivhs", options, NULL); if (c == -1) break; @@ -302,6 +313,10 @@ parse_options (int argc, char **argv) check_truncations = true; break; + case 's': + strip_trailing_newline = true; + break; + case 'a': mode = SEG_MODE_AUTO; break; @@ -350,6 +365,7 @@ usage: %s [OPTIONS] INPUT\n\ Options:\n\ -1, --one-byte feed one byte at a time\n\ -0, --truncations check null truncation of each prefix of input\n\ + -s, --strip-trailing-newline remove newline from end of input\n\ -a, --auto use \"auto\" syntax mode\n\ -b, --batch use \"batch\" syntax mode\n\ -i, --interactive use \"interactive\" syntax mode (default)\n\ diff --git 
a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at index bd3bc38281..3660c924f7 100644 --- a/tests/language/lexer/segment.at +++ b/tests/language/lexer/segment.at @@ -16,10 +16,16 @@ dnl along with this program. If not, see . dnl AT_BANNER([syntax segmentation]) m4_define([PSPP_CHECK_SEGMENT], - [AT_CHECK([segment-test $1 input], [0], [expout]) - AT_CHECK([segment-test -1 $1 input], [0], [expout]) - AT_CHECK([segment-test -0 $1 input]) - AT_CHECK([segment-test -01 $1 input])]) + [for strip in "" "-s"; do + case $strip in # ( + '') sed 's/^-//' < expout-base > expout ;; # ( + -s) sed '/^-/d' < expout-base > expout ;; + esac + AT_CHECK([segment-test $1 $strip input], [0], [expout]) + AT_CHECK([segment-test -1 $strip $1 input], [0], [expout]) + AT_CHECK([segment-test -0 $strip $1 input]) + AT_CHECK([segment-test -01 $strip $1 input]) + done]) AT_SETUP([identifiers]) AT_KEYWORDS([segment]) @@ -36,7 +42,7 @@ f@#_.#6 GhIjK .x 1y _z ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl identifier a space identifier ab space identifier abc space @@ -97,9 +103,9 @@ number 1 identifier y space unexpected_char \_ identifier z -newline \n (later) - -end +-newline \n (later) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -125,7 +131,7 @@ wxyz./* unterminated end of line comment WXYZ. /* unterminated end of line comment WxYz./* unterminated end of line comment @&t@ ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl identifier abcd. space identifier abcd end_command . @@ -203,9 +209,9 @@ newline \n (first) identifier WxYz end_command . comment /*_unterminated_end_of_line_comment_ -newline \n (first) - -end +-newline \n (first) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -218,7 +224,7 @@ AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH andx orx notx eqx gex gtx lex ltx nex allx byx tox withx and. with. 
]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl reserved_word and space reserved_word or space reserved_word not space @@ -267,9 +273,9 @@ newline \n (later) identifier and. space reserved_word with end_command . -newline \n (first) - -end +-newline \n (first) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -280,7 +286,7 @@ AT_DATA([input], [dnl ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] ** ~&|=>=><=<~=<>(),-+*/[[]]** ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl punct ~ space punct & space punct | space @@ -323,9 +329,9 @@ punct / punct [[ punct ]] punct ** -newline \n (later) - -end +-newline \n (later) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -341,7 +347,7 @@ AT_DATA([input], [dnl 1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 . 1e e1 1e+ 1e- ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl number 0 space number 1 space number 01 space @@ -390,9 +396,9 @@ expected_exponent 1e space identifier e1 space expected_exponent 1e+ space expected_exponent 1e- -newline \n (later) - -end +-newline \n (later) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -413,7 +419,7 @@ u'fffd' U"041" + /* also a punctuator on blank line - 'new command' ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl quoted_string 'x' space quoted_string "y" space quoted_string 'abc' @@ -462,9 +468,9 @@ newline \n (later) start_command - space quoted_string 'new_command' -newline \n (later) - -end +-newline \n (later) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -476,7 +482,7 @@ AT_DATA([input], [dnl title my title. #! /usr/bin/pspp ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl shbang #!_/usr/bin/pspp newline \n (first) @@ -493,9 +499,9 @@ punct / identifier bin punct / identifier pspp -newline \n (later) - -end +-newline \n (later) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -520,7 +526,7 @@ com is ambiguous with COMPUTE. next command. 
]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl comment_command *_Comment_commands_"don't newline \n (COMMENT) @@ -580,10 +586,10 @@ identifier command end_command . newline \n (first) -separate_commands -newline \n (first) - -end +-separate_commands +-newline \n (first) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -602,7 +608,7 @@ isn't parsed as tokens second paragraph. ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl start_document document DOCUMENT_one_line. end_command @@ -638,11 +644,11 @@ document newline \n (DOCUMENT) document second_paragraph. -end_command -separate_commands -newline \n (first) - -end +-end_command +-separate_commands +-newline \n (first) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -666,7 +672,7 @@ FILE /* /**/ lab not quoted here either ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl identifier title comment /**/ quoted_string 'Quoted_string_title' @@ -728,10 +734,10 @@ identifier lab space unquoted_string not_quoted_here_either newline \n (later) -separate_commands -newline \n (first) - -end +-separate_commands +-newline \n (first) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -763,7 +769,7 @@ begin data "xxx". begin data 123. not data ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl identifier begin space identifier data end_command . @@ -854,9 +860,9 @@ newline \n (first) reserved_word not space identifier data -newline \n (later) - -end +-newline \n (later) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -878,7 +884,7 @@ do inner command. end repeat. ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl identifier do space identifier repeat space identifier x @@ -936,9 +942,9 @@ newline \n (DO REPEAT) identifier end space identifier repeat end_command . -newline \n (first) - -end +-newline \n (first) +- +end ]) PSPP_CHECK_SEGMENT([-i]) AT_CLEANUP @@ -954,7 +960,7 @@ third command fourth command. fifth command. 
]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl identifier first space identifier command newline \n (later) @@ -990,9 +996,9 @@ spaces ___ identifier fifth space identifier command end_command . -newline \n (first) - -end +-newline \n (first) +- +end ]) PSPP_CHECK_SEGMENT([-b]) AT_CLEANUP @@ -1014,7 +1020,7 @@ twostep cluster fourth command. fifth command. ]) -AT_DATA([expout], [dnl +AT_DATA([expout-base], [dnl identifier command newline \n (later) @@ -1080,9 +1086,9 @@ spaces ___ identifier fifth space identifier command end_command . -newline \n (first) - -end +-newline \n (first) +- +end ]) PSPP_CHECK_SEGMENT([-a]) AT_CLEANUP