X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Flexer%2Fsegment.c;h=a4fea0b213118559d474b94a2bc4efa4008ff0d0;hb=dde9b7b4e92fd1221de01e429343ea72ae444e33;hp=c607c4bd1ffc52061ab7cf7d6d6632d3e4a8b249;hpb=fe94912b9c8682c4666873b84c83cda88f4c135d;p=pspp diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index c607c4bd1f..a4fea0b213 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -42,6 +42,10 @@ enum segmenter_state S_DO_REPEAT_1, S_DO_REPEAT_2, S_DO_REPEAT_3, + S_DEFINE_1, + S_DEFINE_2, + S_DEFINE_3, + S_DEFINE_4, S_BEGIN_DATA_1, S_BEGIN_DATA_2, S_BEGIN_DATA_3, @@ -214,6 +218,22 @@ at_end_of_line (const char *input, size_t n, bool eof, int ofs) return is_end_of_line (input, n, eof, ofs); } +static bool +is_all_spaces (const char *input_, size_t n) +{ + const uint8_t *input = CHAR_CAST (const uint8_t *, input_); + + int mblen; + for (int ofs = 0; ofs < n; ofs += mblen) + { + ucs4_t uc; + mblen = u8_mbtouc (&uc, input + ofs, n - ofs); + if (!lex_uc_is_space (uc)) + return false; + } + return true; +} + static int segmenter_parse_newline__ (const char *input, size_t n, bool eof, enum segment_type *type) @@ -286,20 +306,23 @@ segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n, if (!eof) return -1; goto number; - }; + } if (input[ofs] == '.') { + if (ofs + 1 >= n) + { + if (!eof) + return -1; + goto number; + } + ofs = skip_digits (input, n, eof, ofs + 1); if (ofs < 0) return -1; + else if (ofs >= n) + goto number; } - if (ofs >= n) - { - if (!eof) - return -1; - goto number; - } if (input[ofs] == 'e' || input[ofs] == 'E') { ofs++; @@ -658,6 +681,8 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n, case SEG_COMMENT_COMMAND: case SEG_DO_REPEAT_COMMAND: case SEG_INLINE_DATA: + case SEG_MACRO_ID: + case SEG_MACRO_BODY: case SEG_START_DOCUMENT: case SEG_DOCUMENT: case SEG_START_COMMAND: @@ -666,7 +691,6 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n, case SEG_END: case SEG_EXPECTED_QUOTE: case SEG_EXPECTED_EXPONENT: - case SEG_UNEXPECTED_DOT: case SEG_UNEXPECTED_CHAR: id[0] = '\0'; return ofs + retval; @@ -716,10 +740,9 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, ofs--; } - if (is_reserved_word (input, ofs)) - *type = SEG_RESERVED_WORD; - else - *type = SEG_IDENTIFIER; + *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD + : input[0] == '!' ? SEG_MACRO_ID + : SEG_IDENTIFIER); if (s->substate & SS_START_OF_COMMAND) { @@ -748,6 +771,11 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, return ofs; } } + else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6)) + { + s->state = S_DEFINE_1; + return ofs; + } else if (lex_id_match (ss_cstr ("FILE"), word)) { char id[16]; @@ -967,7 +995,7 @@ segmenter_parse_mid_command__ (struct segmenter *s, s->substate = SS_START_OF_COMMAND; } else - *type = SEG_UNEXPECTED_DOT; + *type = SEG_PUNCT; return 1; case '0': case '1': case '2': case '3': case '4': @@ -986,6 +1014,9 @@ segmenter_parse_mid_command__ (struct segmenter *s, return segmenter_parse_string__ (SEG_QUOTED_STRING, 0, s, input, n, eof, type); + case '!': + return segmenter_parse_id__ (s, input, n, eof, type); + default: if (lex_uc_is_space (uc)) { @@ -1009,6 +1040,12 @@ segmenter_parse_mid_command__ (struct segmenter *s, } else if (lex_uc_is_id1 (uc)) return segmenter_parse_id__ (s, input, n, eof, type); + else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^') + { + *type = SEG_PUNCT; + s->substate = 0; + return 1; + } else { *type = SEG_UNEXPECTED_CHAR; @@ -1274,6 +1311,9 @@ segmenter_subparse (struct segmenter *s, return ofs; } +/* We are segmenting a DO REPEAT command, currently reading the syntax that + defines the stand-in variables (the head) before the lines of syntax to be + repeated (the body). */ static int segmenter_parse_do_repeat_1__ (struct segmenter *s, const char *input, size_t n, bool eof, @@ -1283,10 +1323,14 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s, if (ofs < 0) return -1; - if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS) - s->state = S_DO_REPEAT_2; - else if (*type == SEG_END_COMMAND) + if (*type == SEG_SEPARATE_COMMANDS) + { + /* We reached a blank line that separates the head from the body. */ + s->state = S_DO_REPEAT_2; + } + else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND) { + /* We reached the body. */ s->state = S_DO_REPEAT_3; s->substate = 1; } @@ -1294,6 +1338,8 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s, return ofs; } +/* We are segmenting a DO REPEAT command, currently reading a blank line that + separates the head from the body. */ static int segmenter_parse_do_repeat_2__ (struct segmenter *s, const char *input, size_t n, bool eof, @@ -1305,6 +1351,7 @@ segmenter_parse_do_repeat_2__ (struct segmenter *s, if (*type == SEG_NEWLINE) { + /* We reached the body. */ s->state = S_DO_REPEAT_3; s->substate = 1; } @@ -1361,6 +1408,12 @@ segmenter_parse_full_line__ (const char *input, size_t n, bool eof, return ofs - (input[ofs - 1] == '\r'); } +/* We are in the body of DO REPEAT, segmenting the lines of syntax that are to + be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND. + + DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside + the lines we're segmenting. s->substate counts the nesting level, starting + at 1. */ static int segmenter_parse_do_repeat_3__ (struct segmenter *s, const char *input, size_t n, bool eof, @@ -1375,6 +1428,8 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s, return -1; else if (s->substate == 0) { + /* Nesting level dropped to 0, so we've finished reading the DO REPEAT + body. */ s->state = S_GENERAL; s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE; return segmenter_push (s, input, n, eof, type); @@ -1386,6 +1441,173 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s, } } +/* We are segmenting a DEFINE command, which consists of: + + - The DEFINE keyword. + + - Anything but "(". + + - "(" followed by a sequence of tokens possibly including balanced parentheses + up to a final ")". + + - A sequence of any number of lines, one string per line, ending with + "!ENDDEFINE". The first line is usually blank (that is, a newline follows + the "("). The last line usually just has "!ENDDEFINE." on it, but it can + start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single + line, even. + */ +static int +segmenter_parse_define_1__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, eof, type); + if (ofs < 0) + return -1; + + if (*type == SEG_SEPARATE_COMMANDS + || *type == SEG_END_COMMAND + || *type == SEG_START_COMMAND) + { + /* The DEFINE command is malformed because we reached its end without + ever hitting a "(" token. Transition back to general parsing. */ + s->state = S_GENERAL; + return ofs; + } + else if (*type == SEG_PUNCT && input[0] == '(') + { + s->state = S_DEFINE_2; + s->nest = 1; + return ofs; + } + + return ofs; +} + +static int +segmenter_parse_define_2__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, eof, type); + if (ofs < 0) + return -1; + + if (*type == SEG_SEPARATE_COMMANDS + || *type == SEG_END_COMMAND + || *type == SEG_START_COMMAND) + { + /* The DEFINE command is malformed because we reached its end before + closing the set of parentheses. Transition back to general + parsing. */ + s->state = S_GENERAL; + return ofs; + } + else if (*type == SEG_PUNCT && input[0] == '(') + { + s->nest++; + return ofs; + } + else if (*type == SEG_PUNCT && input[0] == ')') + { + s->nest--; + if (!s->nest) + { + s->state = S_DEFINE_3; + s->substate = 0; + } + return ofs; + } + + return ofs; +} + +static size_t +find_enddefine (struct substring input) +{ + size_t n = input.length; + const struct substring enddefine = ss_cstr ("!ENDDEFINE"); + for (size_t i = 0; i + enddefine.length <= n; i++) + if (input.string[i] == '!' + && ss_equals_case (ss_substr (input, i, enddefine.length), enddefine)) + return i; + return SIZE_MAX; +} + +/* We are in the body of a macro definition, looking for additional lines of + the body or !ENDDEFINE. */ +static int +segmenter_parse_define_3__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + /* Gather a whole line. */ + const char *newline = memchr (input, '\n', n); + int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r') + : eof ? n + : -1); + if (ofs < 0) + return -1; + + /* Does the line contain !ENDDEFINE? */ + size_t end = find_enddefine (ss_buffer (input, ofs)); + if (end == SIZE_MAX) + { + /* No !ENDDEFINE. We have a full line of macro body. + + The line might be blank, whether completely empty or just spaces and + comments. That's OK: we need to report blank lines because they can + have significance. + + However, if the first line of the macro body (the same line as the + closing parenthesis in the argument definition) is blank, we just + report it as spaces because it's not significant. */ + *type = (s->substate == 0 && is_all_spaces (input, ofs) + ? SEG_SPACES : SEG_MACRO_BODY); + s->state = S_DEFINE_4; + s->substate = 1; + return ofs; + } + else + { + /* Macro ends at the !ENDDEFINE on this line. */ + s->state = S_GENERAL; + s->substate = 0; + if (!end) + { + /* Line starts with !ENDDEFINE. */ + return segmenter_push (s, input, n, eof, type); + } + else + { + if (is_all_spaces (input, end)) + { + /* Line starts with spaces followed by !ENDDEFINE. */ + *type = SEG_SPACES; + } + else + { + /* Line starts with some content followed by !ENDDEFINE. */ + *type = SEG_MACRO_BODY; + } + return end; + } + } +} + +static int +segmenter_parse_define_4__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs = segmenter_parse_newline__ (input, n, eof, type); + if (ofs < 0) + return -1; + + s->state = S_DEFINE_3; + return ofs; +} + static int segmenter_parse_begin_data_1__ (struct segmenter *s, const char *input, size_t n, bool eof, @@ -1669,6 +1891,15 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof, case S_DO_REPEAT_3: return segmenter_parse_do_repeat_3__ (s, input, n, eof, type); + case S_DEFINE_1: + return segmenter_parse_define_1__ (s, input, n, eof, type); + case S_DEFINE_2: + return segmenter_parse_define_2__ (s, input, n, eof, type); + case S_DEFINE_3: + return segmenter_parse_define_3__ (s, input, n, eof, type); + case S_DEFINE_4: + return segmenter_parse_define_4__ (s, input, n, eof, type); + case S_BEGIN_DATA_1: return segmenter_parse_begin_data_1__ (s, input, n, eof, type); case S_BEGIN_DATA_2: @@ -1721,6 +1952,13 @@ segmenter_get_prompt (const struct segmenter *s) case S_DO_REPEAT_3: return PROMPT_DO_REPEAT; + case S_DEFINE_1: + case S_DEFINE_2: + return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER; + case S_DEFINE_3: + case S_DEFINE_4: + return PROMPT_DEFINE; + case S_BEGIN_DATA_1: return PROMPT_FIRST; case S_BEGIN_DATA_2: