X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Flexer%2Fsegment.c;h=346910898ce3898b4bec6c62563a1a6f8dfe10bd;hb=e86d3e8623564b379e6097a3df9e7232e8087160;hp=8d17ce38fe56b4de73d05b764c652a9d9a040594;hpb=012b9e4ecf47d844aea352b2c0b7dda80a7194bd;p=pspp diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index 8d17ce38fe..346910898c 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -38,7 +38,9 @@ enum segmenter_state S_DOCUMENT_1, S_DOCUMENT_2, S_DOCUMENT_3, - S_FILE_LABEL, + S_FILE_LABEL_1, + S_FILE_LABEL_2, + S_FILE_LABEL_3, S_DO_REPEAT_1, S_DO_REPEAT_2, S_DO_REPEAT_3, @@ -46,12 +48,11 @@ enum segmenter_state S_DEFINE_2, S_DEFINE_3, S_DEFINE_4, + S_DEFINE_5, S_BEGIN_DATA_1, S_BEGIN_DATA_2, S_BEGIN_DATA_3, S_BEGIN_DATA_4, - S_TITLE_1, - S_TITLE_2 }; #define SS_START_OF_LINE (1u << 0) @@ -227,7 +228,7 @@ is_all_spaces (const char *input_, size_t n) for (int ofs = 0; ofs < n; ofs += mblen) { ucs4_t uc; - mblen = u8_mbtouc (&uc, input, n); + mblen = u8_mbtouc (&uc, input + ofs, n - ofs); if (!lex_uc_is_space (uc)) return false; } @@ -291,13 +292,11 @@ skip_digits (const char *input, size_t n, bool eof, int ofs) static int segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n, - bool eof, enum segment_type *type) + bool eof, enum segment_type *type, int ofs) { - int ofs; - assert (s->state == S_GENERAL); - ofs = skip_digits (input, n, eof, 0); + ofs = skip_digits (input, n, eof, ofs); if (ofs < 0) return -1; @@ -682,6 +681,7 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n, case SEG_DO_REPEAT_COMMAND: case SEG_INLINE_DATA: case SEG_MACRO_ID: + case SEG_MACRO_NAME: case SEG_MACRO_BODY: case SEG_START_DOCUMENT: case SEG_DOCUMENT: @@ -759,18 +759,6 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, *type = SEG_START_DOCUMENT; return 0; } - else if (lex_id_match (ss_cstr ("TITLE"), word) - || lex_id_match (ss_cstr 
("SUBTITLE"), word)) - { - int result = segmenter_unquoted (input, n, eof, ofs); - if (result < 0) - return -1; - else if (result) - { - s->state = S_TITLE_1; - return ofs; - } - } else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6)) { s->state = S_DEFINE_1; @@ -784,7 +772,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, return -1; else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id))) { - s->state = S_FILE_LABEL; + s->state = S_FILE_LABEL_1; s->substate = 0; return ofs; } @@ -951,8 +939,25 @@ segmenter_parse_mid_command__ (struct segmenter *s, *type = SEG_PUNCT; return 1; - case '(': case ')': case ',': case '=': case '-': - case '[': case ']': case '&': case '|': case '+': + case '-': + ofs = skip_spaces (input, n, eof, 1); + if (ofs < 0) + return -1; + else if (ofs < n && c_isdigit (input[ofs])) + return segmenter_parse_number__ (s, input, n, eof, type, ofs); + else if (ofs < n && input[ofs] == '.') + { + if (ofs + 1 >= n) + { + if (!eof) + return -1; + } + else if (c_isdigit (input[ofs + 1])) + return segmenter_parse_number__ (s, input, n, eof, type, ofs); + } + /* Fall through. 
*/ + case '(': case ')': case '{': case ',': case '=': case ';': case ':': + case '[': case ']': case '}': case '&': case '|': case '+': *type = SEG_PUNCT; s->substate = 0; return 1; @@ -983,7 +988,7 @@ segmenter_parse_mid_command__ (struct segmenter *s, return -1; } else if (c_isdigit (input[1])) - return segmenter_parse_number__ (s, input, n, eof, type); + return segmenter_parse_number__ (s, input, n, eof, type, 0); int eol = at_end_of_line (input, n, eof, 1); if (eol < 0) @@ -1000,7 +1005,7 @@ segmenter_parse_mid_command__ (struct segmenter *s, case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - return segmenter_parse_number__ (s, input, n, eof, type); + return segmenter_parse_number__ (s, input, n, eof, type, 0); case 'u': case 'U': return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING, @@ -1015,7 +1020,20 @@ segmenter_parse_mid_command__ (struct segmenter *s, s, input, n, eof, type); case '!': - return segmenter_parse_id__ (s, input, n, eof, type); + if (n < 2) + { + if (!eof) + return -1; + *type = SEG_PUNCT; + return 1; + } + else if (input[1] == '*') + { + *type = SEG_MACRO_ID; + return 2; + } + else + return segmenter_parse_id__ (s, input, n, eof, type); default: if (lex_uc_is_space (uc)) @@ -1024,7 +1042,7 @@ segmenter_parse_mid_command__ (struct segmenter *s, if (ofs < 0) return -1; - if (input[ofs - 1] == '\r' && input[ofs] == '\n') + if (ofs < n && input[ofs - 1] == '\r' && input[ofs] == '\n') { if (ofs == 1) { @@ -1257,9 +1275,9 @@ segmenter_parse_start_of_line__ (struct segmenter *s, } static int -segmenter_parse_file_label__ (struct segmenter *s, - const char *input, size_t n, bool eof, - enum segment_type *type) +segmenter_parse_file_label_1__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) { struct segmenter sub; int ofs; @@ -1282,7 +1300,7 @@ segmenter_parse_file_label__ (struct segmenter *s, else { if (result) - s->state = S_TITLE_1; + s->state = 
S_FILE_LABEL_2; else *s = sub; return ofs; @@ -1295,6 +1313,70 @@ segmenter_parse_file_label__ (struct segmenter *s, } } +static int +segmenter_parse_file_label_2__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs; + + ofs = skip_spaces (input, n, eof, 0); + if (ofs < 0) + return -1; + s->state = S_FILE_LABEL_3; + *type = SEG_SPACES; + return ofs; +} + +static int +segmenter_parse_file_label_3__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int endcmd; + int ofs; + + endcmd = -1; + ofs = 0; + while (ofs < n) + { + ucs4_t uc; + int mblen; + + mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); + if (mblen < 0) + return -1; + + switch (uc) + { + case '\n': + goto end_of_line; + + case '.': + endcmd = ofs; + break; + + default: + if (!lex_uc_is_space (uc)) + endcmd = -1; + break; + } + + ofs += mblen; + } + + if (eof) + { + end_of_line: + s->state = S_GENERAL; + s->substate = 0; + *type = SEG_UNQUOTED_STRING; + return endcmd >= 0 ? endcmd : ofs; + } + + return -1; +} + static int segmenter_subparse (struct segmenter *s, const char *input, size_t n, bool eof, @@ -1445,6 +1527,10 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s, - The DEFINE keyword. + - An identifier. We transform this into SEG_MACRO_NAME instead of + SEG_IDENTIFIER or SEG_MACRO_ID because this identifier must never be + macro-expanded. + - Anything but "(". - "(" followed by a sequence of tokens possibly including balanced parentheses @@ -1457,15 +1543,21 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s, line, even. 
*/ static int -segmenter_parse_define_1__ (struct segmenter *s, - const char *input, size_t n, bool eof, - enum segment_type *type) +segmenter_parse_define_1_2__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) { int ofs = segmenter_subparse (s, input, n, eof, type); if (ofs < 0) return -1; - if (*type == SEG_SEPARATE_COMMANDS + if (s->state == S_DEFINE_1 + && (*type == SEG_IDENTIFIER || *type == SEG_MACRO_ID)) + { + *type = SEG_MACRO_NAME; + s->state = S_DEFINE_2; + } + else if (*type == SEG_SEPARATE_COMMANDS || *type == SEG_END_COMMAND || *type == SEG_START_COMMAND) { @@ -1476,7 +1568,7 @@ segmenter_parse_define_1__ (struct segmenter *s, } else if (*type == SEG_PUNCT && input[0] == '(') { - s->state = S_DEFINE_2; + s->state = S_DEFINE_3; s->nest = 1; return ofs; } @@ -1485,7 +1577,7 @@ segmenter_parse_define_1__ (struct segmenter *s, } static int -segmenter_parse_define_2__ (struct segmenter *s, +segmenter_parse_define_3__ (struct segmenter *s, const char *input, size_t n, bool eof, enum segment_type *type) { @@ -1512,7 +1604,10 @@ segmenter_parse_define_2__ (struct segmenter *s, { s->nest--; if (!s->nest) - s->state = S_DEFINE_3; + { + s->state = S_DEFINE_4; + s->substate = 0; + } return ofs; } @@ -1524,17 +1619,39 @@ find_enddefine (struct substring input) { size_t n = input.length; const struct substring enddefine = ss_cstr ("!ENDDEFINE"); - for (size_t i = 0; i + enddefine.length <= n; i++) - if (input.string[i] == '!' - && ss_equals_case (ss_substr (input, i, enddefine.length), enddefine)) - return i; - return SIZE_MAX; + for (int ofs = 0;;) + { + /* Skip !ENDDEFINE in comment. */ + ofs = skip_spaces_and_comments (input.string, n, true, ofs); + if (ofs + enddefine.length > n) + return SIZE_MAX; + + char c = input.string[ofs]; + if (c == '!' + && ss_equals_case (ss_substr (input, ofs, enddefine.length), + enddefine)) + return ofs; + else if (c == '\'' || c == '"') + { + /* Skip quoted !ENDDEFINE. 
*/ + ofs++; + for (;;) + { + if (ofs >= n) + return SIZE_MAX; + else if (input.string[ofs++] == c) + break; + } + } + else + ofs++; + } } /* We are in the body of a macro definition, looking for additional lines of the body or !ENDDEFINE. */ static int -segmenter_parse_define_3__ (struct segmenter *s, +segmenter_parse_define_4__ (struct segmenter *s, const char *input, size_t n, bool eof, enum segment_type *type) { @@ -1554,9 +1671,15 @@ segmenter_parse_define_3__ (struct segmenter *s, The line might be blank, whether completely empty or just spaces and comments. That's OK: we need to report blank lines because they can - have significance. */ - *type = SEG_MACRO_BODY; - s->state = S_DEFINE_4; + have significance. + + However, if the first line of the macro body (the same line as the + closing parenthesis in the argument definition) is blank, we just + report it as spaces because it's not significant. */ + *type = (s->substate == 0 && is_all_spaces (input, ofs) + ? SEG_SPACES : SEG_MACRO_BODY); + s->state = S_DEFINE_5; + s->substate = 1; return ofs; } else @@ -1587,7 +1710,7 @@ segmenter_parse_define_3__ (struct segmenter *s, } static int -segmenter_parse_define_4__ (struct segmenter *s, +segmenter_parse_define_5__ (struct segmenter *s, const char *input, size_t n, bool eof, enum segment_type *type) { @@ -1595,7 +1718,7 @@ segmenter_parse_define_4__ (struct segmenter *s, if (ofs < 0) return -1; - s->state = S_DEFINE_3; + s->state = S_DEFINE_4; return ofs; } @@ -1708,70 +1831,6 @@ segmenter_parse_begin_data_4__ (struct segmenter *s, return ofs; } -static int -segmenter_parse_title_1__ (struct segmenter *s, - const char *input, size_t n, bool eof, - enum segment_type *type) -{ - int ofs; - - ofs = skip_spaces (input, n, eof, 0); - if (ofs < 0) - return -1; - s->state = S_TITLE_2; - *type = SEG_SPACES; - return ofs; -} - -static int -segmenter_parse_title_2__ (struct segmenter *s, - const char *input, size_t n, bool eof, - enum segment_type *type) -{ - int endcmd; - 
int ofs; - - endcmd = -1; - ofs = 0; - while (ofs < n) - { - ucs4_t uc; - int mblen; - - mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs); - if (mblen < 0) - return -1; - - switch (uc) - { - case '\n': - goto end_of_line; - - case '.': - endcmd = ofs; - break; - - default: - if (!lex_uc_is_space (uc)) - endcmd = -1; - break; - } - - ofs += mblen; - } - - if (eof) - { - end_of_line: - s->state = S_GENERAL; - s->substate = 0; - *type = SEG_UNQUOTED_STRING; - return endcmd >= 0 ? endcmd : ofs; - } - - return -1; -} - /* Returns the name of segment TYPE as a string. The caller must not modify or free the returned string. @@ -1789,17 +1848,28 @@ segment_type_to_string (enum segment_type type) } } -/* Initializes S as a segmenter with the given syntax MODE. +/* Returns a segmenter with the given syntax MODE. + + If IS_SNIPPET is false, then the segmenter will parse as if it's being given + a whole file. This means, for example, that it will interpret - or + at the + beginning of the syntax as a separator between commands (since - or + at the + beginning of a line has this meaning). + + If IS_SNIPPET is true, then the segmenter will parse as if it's being given + an isolated piece of syntax. This means, for example, that it will + interpret - or + at the beginning of the syntax as an operator token or (if + followed by a digit) as part of a number. A segmenter does not contain any external references, so nothing needs to be done to destroy one. For the same reason, segmenters may be copied with plain struct assignment (or memcpy). */ -void -segmenter_init (struct segmenter *s, enum segmenter_mode mode) +struct segmenter +segmenter_init (enum segmenter_mode mode, bool is_snippet) { - s->state = S_SHBANG; - s->substate = 0; - s->mode = mode; + return (struct segmenter) { + .state = is_snippet ? S_GENERAL : S_SHBANG, + .mode = mode, + }; } /* Returns the mode passed to segmenter_init() for S. 
*/ @@ -1823,6 +1893,9 @@ segmenter_get_mode (const struct segmenter *s) bytes as part of INPUT, because they have (figuratively) been consumed by the segmenter. + Segments can have zero length, including segment types SEG_END, + SEG_SEPARATE_COMMANDS, SEG_START_DOCUMENT, SEG_INLINE_DATA, and SEG_SPACES. + Failure occurs only if the segment type of the N bytes in INPUT cannot yet be determined. In this case segmenter_push() returns -1. If more input is available, the caller should obtain some more, then call again with a larger @@ -1872,8 +1945,12 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof, case S_DOCUMENT_3: return segmenter_parse_document_3__ (s, type); - case S_FILE_LABEL: - return segmenter_parse_file_label__ (s, input, n, eof, type); + case S_FILE_LABEL_1: + return segmenter_parse_file_label_1__ (s, input, n, eof, type); + case S_FILE_LABEL_2: + return segmenter_parse_file_label_2__ (s, input, n, eof, type); + case S_FILE_LABEL_3: + return segmenter_parse_file_label_3__ (s, input, n, eof, type); case S_DO_REPEAT_1: return segmenter_parse_do_repeat_1__ (s, input, n, eof, type); @@ -1883,13 +1960,14 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof, return segmenter_parse_do_repeat_3__ (s, input, n, eof, type); case S_DEFINE_1: - return segmenter_parse_define_1__ (s, input, n, eof, type); case S_DEFINE_2: - return segmenter_parse_define_2__ (s, input, n, eof, type); + return segmenter_parse_define_1_2__ (s, input, n, eof, type); case S_DEFINE_3: return segmenter_parse_define_3__ (s, input, n, eof, type); case S_DEFINE_4: return segmenter_parse_define_4__ (s, input, n, eof, type); + case S_DEFINE_5: + return segmenter_parse_define_5__ (s, input, n, eof, type); case S_BEGIN_DATA_1: return segmenter_parse_begin_data_1__ (s, input, n, eof, type); @@ -1899,11 +1977,6 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof, return segmenter_parse_begin_data_3__ (s, input, n, eof, 
type); case S_BEGIN_DATA_4: return segmenter_parse_begin_data_4__ (s, input, n, eof, type); - - case S_TITLE_1: - return segmenter_parse_title_1__ (s, input, n, eof, type); - case S_TITLE_2: - return segmenter_parse_title_2__ (s, input, n, eof, type); } NOT_REACHED (); @@ -1934,8 +2007,11 @@ segmenter_get_prompt (const struct segmenter *s) case S_DOCUMENT_3: return PROMPT_FIRST; - case S_FILE_LABEL: + case S_FILE_LABEL_1: return PROMPT_LATER; + case S_FILE_LABEL_2: + case S_FILE_LABEL_3: + return PROMPT_FIRST; case S_DO_REPEAT_1: case S_DO_REPEAT_2: @@ -1945,9 +2021,10 @@ segmenter_get_prompt (const struct segmenter *s) case S_DEFINE_1: case S_DEFINE_2: - return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER; case S_DEFINE_3: + return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER; case S_DEFINE_4: + case S_DEFINE_5: return PROMPT_DEFINE; case S_BEGIN_DATA_1: @@ -1958,9 +2035,6 @@ segmenter_get_prompt (const struct segmenter *s) case S_BEGIN_DATA_4: return PROMPT_DATA; - case S_TITLE_1: - case S_TITLE_2: - return PROMPT_FIRST; } NOT_REACHED ();