From aaa693a2c7f2ab9e11504b50184018d188efdc0b Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 13 Jun 2021 13:01:30 -0700 Subject: [PATCH] segment: Add support for segmenting "snippets". Otherwise, segmenting a little bit of syntax like "-123" will produce T_ENDCMD, T_POS_NUM. --- src/language/control/repeat.c | 2 +- src/language/lexer/lexer.c | 5 +++-- src/language/lexer/scan.c | 10 ++++++---- src/language/lexer/segment.c | 24 +++++++++++++++++------- src/language/lexer/segment.h | 4 +--- tests/language/lexer/segment-test.c | 2 +- 6 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/language/control/repeat.c b/src/language/control/repeat.c index 0438fa1bd8..86dd36f7f0 100644 --- a/src/language/control/repeat.c +++ b/src/language/control/repeat.c @@ -201,7 +201,7 @@ do_parse_commands (struct substring s, enum segmenter_mode mode, struct hmap *dummies, struct string *outputs, size_t n_outputs) { - struct segmenter segmenter = SEGMENTER_INIT (mode); + struct segmenter segmenter = segmenter_init (mode, false); while (!ss_is_empty (s)) { enum segment_type type; diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index cedbeafd50..1148cc29e0 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -1229,7 +1229,8 @@ lex_interactive_reset (struct lexer *lexer) src->journal_pos = src->seg_pos = src->line_pos = 0; src->n_newlines = 0; src->suppress_next_newline = false; - segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter)); + src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter), + false); while (!deque_is_empty (&src->deque)) lex_source_pop__ (src); lex_source_push_endcmd__ (src); @@ -1758,7 +1759,7 @@ lex_source_create (struct lexer *lexer, struct lex_reader *reader) src = xzalloc (sizeof *src); src->reader = reader; - segmenter_init (&src->segmenter, reader->syntax); + src->segmenter = segmenter_init (reader->syntax, false); src->lexer = lexer; src->tokens = deque_init (&src->deque, 
4, sizeof *src->tokens); diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c index 0e29dc9e71..57e3c2d1a8 100644 --- a/src/language/lexer/scan.c +++ b/src/language/lexer/scan.c @@ -607,10 +607,12 @@ void string_lexer_init (struct string_lexer *slex, const char *input, size_t length, enum segmenter_mode mode) { - slex->input = input; - slex->length = length; - slex->offset = 0; - segmenter_init (&slex->segmenter, mode); + *slex = (struct string_lexer) { + .input = input, + .length = length, + .offset = 0, + .segmenter = segmenter_init (mode, true), + }; } /* */ diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index ac88117ff5..9728e176f4 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -55,9 +55,6 @@ enum segmenter_state S_TITLE_2 }; -/* S_SHBANG is the start state that SEGMENTER_INIT refers to as just 0. */ -verify (S_SHBANG == 0); - #define SS_START_OF_LINE (1u << 0) #define SS_START_OF_COMMAND (1u << 1) @@ -1802,15 +1799,28 @@ segment_type_to_string (enum segment_type type) } } -/* Initializes S as a segmenter with the given syntax MODE. +/* Returns a segmenter with the given syntax MODE. + + If IS_SNIPPET is false, then the segmenter will parse as if it's being given + a whole file. This means, for example, that it will interpret - or + at the + beginning of the syntax as a separator between commands (since - or + at the + beginning of a line has this meaning). + + If IS_SNIPPET is true, then the segmenter will parse as if it's being given + an isolated piece of syntax. This means, for example, that it will + interpret - or + at the beginning of the syntax as an operator token or (if + followed by a digit) as part of a number. A segmenter does not contain any external references, so nothing needs to be done to destroy one. For the same reason, segmenters may be copied with plain struct assignment (or memcpy). 
*/ -void -segmenter_init (struct segmenter *s, enum segmenter_mode mode) +struct segmenter +segmenter_init (enum segmenter_mode mode, bool is_snippet) { - *s = (struct segmenter) SEGMENTER_INIT (mode); + return (struct segmenter) { + .state = is_snippet ? S_GENERAL : S_SHBANG, + .mode = mode, + }; } /* Returns the mode passed to segmenter_init() for S. */ diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h index 10551066b0..5d550f531f 100644 --- a/src/language/lexer/segment.h +++ b/src/language/lexer/segment.h @@ -117,9 +117,7 @@ struct segmenter unsigned char mode; }; -#define SEGMENTER_INIT(MODE) { .mode = MODE } - -void segmenter_init (struct segmenter *, enum segmenter_mode); +struct segmenter segmenter_init (enum segmenter_mode, bool is_snippet); enum segmenter_mode segmenter_get_mode (const struct segmenter *); diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c index cb46401b34..acb444f200 100644 --- a/tests/language/lexer/segment-test.c +++ b/tests/language/lexer/segment-test.c @@ -108,7 +108,7 @@ main (int argc, char *argv[]) static void check_segmentation (const char *input, size_t length, bool print_segments) { - struct segmenter s = SEGMENTER_INIT (mode); + struct segmenter s = segmenter_init (mode, false); size_t line_number = 1; size_t line_offset = 0; -- 2.30.2