From 65c76af5768d836d91b79d87702f9b4e004c18aa Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 9 Apr 2021 10:53:33 -0700 Subject: [PATCH] lexer: Add support for DEFINE...!ENDDEFINE. This doesn't support the command itself, just the lexical analysis needed for it. --- src/language/lexer/scan.c | 1 + src/language/lexer/segment.c | 188 +++++++++++++++++++- src/language/lexer/segment.h | 5 +- src/libpspp/prompt.c | 2 + src/libpspp/prompt.h | 3 +- src/ui/terminal/terminal-reader.c | 3 + tests/language/lexer/scan.at | 272 +++++++++++++++++++++++++++++ tests/language/lexer/segment.at | 281 ++++++++++++++++++++++++++++++ 8 files changed, 752 insertions(+), 3 deletions(-) diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c index cae523cb37..2ad467d978 100644 --- a/src/language/lexer/scan.c +++ b/src/language/lexer/scan.c @@ -441,6 +441,7 @@ scan_start__ (struct scanner *scanner, enum segment_type type, case SEG_DO_REPEAT_COMMAND: case SEG_INLINE_DATA: case SEG_DOCUMENT: + case SEG_MACRO_BODY: token->type = T_STRING; ss_alloc_substring (&token->string, s); return SCAN_DONE; diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index 5f7fc01310..43e02ac6fb 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -42,6 +42,11 @@ enum segmenter_state S_DO_REPEAT_1, S_DO_REPEAT_2, S_DO_REPEAT_3, + S_DEFINE_1, + S_DEFINE_2, + S_DEFINE_3, + S_DEFINE_4, + S_DEFINE_5, S_BEGIN_DATA_1, S_BEGIN_DATA_2, S_BEGIN_DATA_3, @@ -651,7 +656,6 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n, } /* fall through */ - case SEG_MACRO_ID: case SEG_NUMBER: case SEG_QUOTED_STRING: case SEG_HEX_STRING: @@ -662,6 +666,8 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n, case SEG_COMMENT_COMMAND: case SEG_DO_REPEAT_COMMAND: case SEG_INLINE_DATA: + case SEG_MACRO_ID: + case SEG_MACRO_BODY: case SEG_START_DOCUMENT: case SEG_DOCUMENT: case SEG_START_COMMAND: @@ -751,6 +757,11 @@ 
segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, return ofs; } } + else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6)) + { + s->state = S_DEFINE_1; + return ofs; + } else if (lex_id_match (ss_cstr ("FILE"), word)) { char id[16]; @@ -1416,6 +1427,162 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s, } } +/* We are segmenting a DEFINE command, which consists of: + + - The DEFINE keyword. + + - Anything but "(". + + - "(" followed by a sequence of tokens possibly including balanced parentheses + up to a final ")". + + - A newline. + + - A sequence of lines that don't start with "!ENDDEFINE", one string per line, + each ending in a newline. + + - "!ENDDEFINE". + + */ +static int +segmenter_parse_define_1__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, eof, type); + if (ofs < 0) + return -1; + + if (*type == SEG_SEPARATE_COMMANDS + || *type == SEG_END_COMMAND + || *type == SEG_START_COMMAND) + { + /* The DEFINE command is malformed because we reached its end without + ever hitting a "(" token. Transition back to general parsing. */ + s->state = S_GENERAL; + return ofs; + } + else if (*type == SEG_PUNCT && input[0] == '(') + { + s->state = S_DEFINE_2; + s->nest = 1; + return ofs; + } + + return ofs; +} + +static int +segmenter_parse_define_2__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, eof, type); + if (ofs < 0) + return -1; + + if (*type == SEG_SEPARATE_COMMANDS + || *type == SEG_END_COMMAND + || *type == SEG_START_COMMAND) + { + /* The DEFINE command is malformed because we reached its end before + closing the set of parentheses. Transition back to general + parsing. 
*/ + s->state = S_GENERAL; + return ofs; + } + else if (*type == SEG_PUNCT && input[0] == '(') + { + s->nest++; + return ofs; + } + else if (*type == SEG_PUNCT && input[0] == ')') + { + s->nest--; + if (!s->nest) + s->state = S_DEFINE_3; + return ofs; + } + + return ofs; +} + +static int +segmenter_parse_define_3__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, eof, type); + if (ofs < 0) + return -1; + + if (*type == SEG_NEWLINE) + s->state = S_DEFINE_4; + + return ofs; +} + +static bool +is_enddefine (const char *input, size_t n) +{ + int ofs = skip_spaces_and_comments (input, n, true, 0); + assert (ofs >= 0); + + const struct substring enddefine = ss_cstr ("!ENDDEFINE"); + if (n - ofs < enddefine.length) + return false; + + if (!ss_equals_case (ss_buffer (input + ofs, enddefine.length), enddefine)) + return false; + + if (ofs + enddefine.length >= n) + return true; + + const uint8_t *u_input = CHAR_CAST (const uint8_t *, input); + ucs4_t uc; /* Character just past the keyword, not its leading '!'. */ + u8_mbtouc (&uc, u_input + ofs + enddefine.length, n - ofs - enddefine.length); + return uc == '.' || !lex_uc_is_idn (uc); +} + +static int +segmenter_parse_define_4__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs; + + ofs = segmenter_parse_full_line__ (input, n, eof, type); + if (ofs < 0) + return -1; + else if (is_enddefine (input, ofs)) + { + s->state = S_GENERAL; + s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE; + return segmenter_push (s, input, n, eof, type); + } + else + { + *type = SEG_MACRO_BODY; + s->state = S_DEFINE_5; + return input[ofs - 1] == '\n' ? 
0 : ofs; + } +} + +static int +segmenter_parse_define_5__ (struct segmenter *s, + const char *input, size_t n, bool eof, + enum segment_type *type) +{ + int ofs; + + ofs = segmenter_parse_newline__ (input, n, eof, type); + if (ofs < 0) + return -1; + + s->state = S_DEFINE_4; + return ofs; +} + static int segmenter_parse_begin_data_1__ (struct segmenter *s, const char *input, size_t n, bool eof, @@ -1699,6 +1866,17 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof, case S_DO_REPEAT_3: return segmenter_parse_do_repeat_3__ (s, input, n, eof, type); + case S_DEFINE_1: + return segmenter_parse_define_1__ (s, input, n, eof, type); + case S_DEFINE_2: + return segmenter_parse_define_2__ (s, input, n, eof, type); + case S_DEFINE_3: + return segmenter_parse_define_3__ (s, input, n, eof, type); + case S_DEFINE_4: + return segmenter_parse_define_4__ (s, input, n, eof, type); + case S_DEFINE_5: + return segmenter_parse_define_5__ (s, input, n, eof, type); + case S_BEGIN_DATA_1: return segmenter_parse_begin_data_1__ (s, input, n, eof, type); case S_BEGIN_DATA_2: @@ -1751,6 +1929,14 @@ segmenter_get_prompt (const struct segmenter *s) case S_DO_REPEAT_3: return PROMPT_DO_REPEAT; + case S_DEFINE_1: + case S_DEFINE_2: + case S_DEFINE_3: + return s->substate & SS_START_OF_COMMAND ? 
PROMPT_FIRST : PROMPT_LATER; + case S_DEFINE_4: + case S_DEFINE_5: + return PROMPT_DEFINE; + case S_BEGIN_DATA_1: return PROMPT_FIRST; case S_BEGIN_DATA_2: diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h index dbb43d529d..bb788f4230 100644 --- a/src/language/lexer/segment.h +++ b/src/language/lexer/segment.h @@ -67,7 +67,6 @@ enum segmenter_mode SEG_TYPE(UNQUOTED_STRING) \ SEG_TYPE(RESERVED_WORD) \ SEG_TYPE(IDENTIFIER) \ - SEG_TYPE(MACRO_ID) \ SEG_TYPE(PUNCT) \ \ SEG_TYPE(SHBANG) \ @@ -79,6 +78,9 @@ enum segmenter_mode SEG_TYPE(DO_REPEAT_COMMAND) \ SEG_TYPE(INLINE_DATA) \ \ + SEG_TYPE(MACRO_ID) \ + SEG_TYPE(MACRO_BODY) \ + \ SEG_TYPE(START_DOCUMENT) \ SEG_TYPE(DOCUMENT) \ \ @@ -112,6 +114,7 @@ struct segmenter { unsigned char state; unsigned char substate; + unsigned char nest; unsigned char mode; }; diff --git a/src/libpspp/prompt.c b/src/libpspp/prompt.c index 0722c3b96b..f96ca8c100 100644 --- a/src/libpspp/prompt.c +++ b/src/libpspp/prompt.c @@ -35,6 +35,8 @@ prompt_style_to_string (enum prompt_style style) return "DOCUMENT"; case PROMPT_DO_REPEAT: return "DO REPEAT"; + case PROMPT_DEFINE: + return "DEFINE"; default: return "unknown prompt"; } diff --git a/src/libpspp/prompt.h b/src/libpspp/prompt.h index 14e820b8a4..8022e73732 100644 --- a/src/libpspp/prompt.h +++ b/src/libpspp/prompt.h @@ -24,7 +24,8 @@ enum prompt_style PROMPT_DATA, /* Between BEGIN DATA and END DATA. */ PROMPT_COMMENT, /* COMMENT or * command. */ PROMPT_DOCUMENT, /* DOCUMENT command. */ - PROMPT_DO_REPEAT /* DO REPEAT command. */ + PROMPT_DO_REPEAT, /* DO REPEAT command. */ + PROMPT_DEFINE, /* DEFINE command. 
*/ }; const char *prompt_style_to_string (enum prompt_style); diff --git a/src/ui/terminal/terminal-reader.c b/src/ui/terminal/terminal-reader.c index c7a9a311c2..e0f219826d 100644 --- a/src/ui/terminal/terminal-reader.c +++ b/src/ui/terminal/terminal-reader.c @@ -229,6 +229,9 @@ readline_prompt (enum prompt_style style) case PROMPT_DO_REPEAT: return "DO REPEAT> "; + + case PROMPT_DEFINE: + return "DEFINE> "; } NOT_REACHED (); diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at index b442958d5e..953be86deb 100644 --- a/tests/language/lexer/scan.at +++ b/tests/language/lexer/scan.at @@ -881,6 +881,278 @@ STOP PSPP_CHECK_SCAN([-b]) AT_CLEANUP +AT_SETUP([DEFINE command - simple]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1() +var1 var2 var3 +!enddefine. +]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +LPAREN +RPAREN +SKIP +STRING "var1 var2 var3" +SKIP +MACRO_ID "!enddefine" +ENDCMD +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - empty]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1() +!enddefine. +]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +LPAREN +RPAREN +SKIP +MACRO_ID "!enddefine" +ENDCMD +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - arguments]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1(a(), b(), c()) +!enddefine. +]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +LPAREN +ID "a" +LPAREN +RPAREN +COMMA +SKIP +ID "b" +LPAREN +RPAREN +COMMA +SKIP +ID "c" +LPAREN +RPAREN +RPAREN +SKIP +MACRO_ID "!enddefine" +ENDCMD +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - multiline arguments]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1( + a(), b( + ), + c() +) +!enddefine. 
+]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +LPAREN +SKIP +SKIP +ID "a" +LPAREN +RPAREN +COMMA +SKIP +ID "b" +LPAREN +SKIP +SKIP +RPAREN +COMMA +SKIP +SKIP +ID "c" +LPAREN +RPAREN +SKIP +RPAREN +SKIP +MACRO_ID "!enddefine" +ENDCMD +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - arguments start on second line]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1 +(x,y,z +) +content 1 +content 2 +!enddefine. +]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +SKIP +LPAREN +ID "x" +COMMA +ID "y" +COMMA +ID "z" +SKIP +RPAREN +SKIP +STRING "content 1" +SKIP +STRING "content 2" +SKIP +MACRO_ID "!enddefine" +ENDCMD +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - early end of command 1]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1. +data list /x 1. +]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +ENDCMD +SKIP +ID "data" +SKIP +ID "list" +SKIP +SLASH +ID "x" +SKIP +POS_NUM 1 +ENDCMD +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - early end of command 2]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1 +x. +data list /x 1. +]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +SKIP +ID "x" +ENDCMD +SKIP +ID "data" +SKIP +ID "list" +SKIP +SLASH +ID "x" +SKIP +POS_NUM 1 +ENDCMD +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - early end of command 3]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1(. +x. +data list /x 1. +]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +LPAREN +ENDCMD +SKIP +ID "x" +ENDCMD +SKIP +ID "data" +SKIP +ID "list" +SKIP +SLASH +ID "x" +SKIP +POS_NUM 1 +ENDCMD +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - missing !ENDDEFINE]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +define !macro1(). 
+content line 1 +content line 2 +]) +AT_DATA([expout-base], [dnl +ID "define" +SKIP +MACRO_ID "!macro1" +LPAREN +RPAREN +ENDCMD +SKIP +STRING "content line 1" +SKIP +STRING "content line 2" +-SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + AT_SETUP([batch mode]) AT_KEYWORDS([scan]) AT_DATA([input], [dnl diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at index b55216cfef..54520717c7 100644 --- a/tests/language/lexer/segment.at +++ b/tests/language/lexer/segment.at @@ -1053,6 +1053,287 @@ end PSPP_CHECK_SEGMENT([-b]) AT_CLEANUP +AT_SETUP([DEFINE command - simple]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1() +var1 var2 var3 +!enddefine. +]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +punct ( +punct ) +newline \n (DEFINE) + +macro_body var1_var2_var3 +newline \n (DEFINE) + +macro_id !enddefine +end_command . +-newline \n (first) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - empty]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1() +!enddefine. +]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +punct ( +punct ) +newline \n (DEFINE) + +macro_id !enddefine +end_command . +-newline \n (first) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - arguments]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1(a(), b(), c()) +!enddefine. +]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +punct ( +identifier a +punct ( +punct ) +punct , space +identifier b +punct ( +punct ) +punct , space +identifier c +punct ( +punct ) +punct ) +newline \n (DEFINE) + +macro_id !enddefine +end_command . +-newline \n (first) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - multiline arguments]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1( + a(), b( + ), + c() +) +!enddefine. 
+]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +punct ( +newline \n (later) + +spaces __ +identifier a +punct ( +punct ) +punct , space +identifier b +punct ( +newline \n (later) + +spaces __ +punct ) +punct , +newline \n (later) + +spaces __ +identifier c +punct ( +punct ) +newline \n (later) + +punct ) +newline \n (DEFINE) + +macro_id !enddefine +end_command . +-newline \n (first) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - arguments start on second line]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1 +(x,y,z +) +content 1 +content 2 +!enddefine. +]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +newline \n (later) + +punct ( +identifier x +punct , +identifier y +punct , +identifier z +newline \n (later) + +punct ) +newline \n (DEFINE) + +macro_body content_1 +newline \n (DEFINE) + +macro_body content_2 +newline \n (DEFINE) + +macro_id !enddefine +end_command . +-newline \n (first) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - early end of command 1]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1. +data list /x 1. +]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +end_command . +newline \n (first) + +identifier data space +identifier list space +punct / +identifier x space +number 1 +end_command . +-newline \n (first) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - early end of command 2]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1 +x. +data list /x 1. +]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +newline \n (later) + +identifier x +end_command . +newline \n (first) + +identifier data space +identifier list space +punct / +identifier x space +number 1 +end_command . 
+-newline \n (first) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - early end of command 3]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1(. +x. +data list /x 1. +]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +punct ( +end_command . +newline \n (first) + +identifier x +end_command . +newline \n (first) + +identifier data space +identifier list space +punct / +identifier x space +number 1 +end_command . +-newline \n (first) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DEFINE command - missing !ENDDEFINE]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +define !macro1(). +content line 1 +content line 2 +]) +AT_DATA([expout-base], [dnl +identifier define space +macro_id !macro1 +punct ( +punct ) +end_command . +newline \n (DEFINE) + +macro_body content_line_1 +newline \n (DEFINE) + +macro_body content_line_2 +-newline \n (DEFINE) +- +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + AT_SETUP([batch mode]) AT_KEYWORDS([segment]) AT_DATA([input], [dnl -- 2.30.2