From: Ben Pfaff Date: Mon, 22 Mar 2021 06:06:14 +0000 (-0700) Subject: lexer: Add support for macro punctuation. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=ed109bf498216cef15a3cbf180827dc8b20eff0b lexer: Add support for macro punctuation. These punctuation symbols can be used to delimit macro arguments, even though they aren't allowed anywhere else in the language. --- diff --git a/src/data/identifier.c b/src/data/identifier.c index c613734c94..e3b33a3382 100644 --- a/src/data/identifier.c +++ b/src/data/identifier.c @@ -62,10 +62,11 @@ token_type_to_string (enum token_type token) switch (token) { case T_ID: - case T_MACRO_ID: case T_POS_NUM: case T_NEG_NUM: case T_STRING: + case T_MACRO_ID: + case T_MACRO_PUNCT: case T_STOP: return NULL; diff --git a/src/data/identifier.h b/src/data/identifier.h index 85299979c0..1fc63b4808 100644 --- a/src/data/identifier.h +++ b/src/data/identifier.h @@ -24,7 +24,6 @@ #define TOKEN_TYPES \ TOKEN_TYPE(ID) /* Identifier. */ \ - TOKEN_TYPE(MACRO_ID) /* Identifier starting with '!'. */ \ TOKEN_TYPE(POS_NUM) /* Positive number. */ \ TOKEN_TYPE(NEG_NUM) /* Negative number. */ \ TOKEN_TYPE(STRING) /* Quoted string. */ \ @@ -37,7 +36,7 @@ TOKEN_TYPE(SLASH) /* / */ \ TOKEN_TYPE(EQUALS) /* = */ \ TOKEN_TYPE(LPAREN) /* (*/ \ - TOKEN_TYPE(RPAREN) /*) */ \ + TOKEN_TYPE(RPAREN) /* ) */ \ TOKEN_TYPE(LBRACK) /* [ */ \ TOKEN_TYPE(RBRACK) /* ] */ \ TOKEN_TYPE(COMMA) /* , */ \ @@ -58,8 +57,10 @@ TOKEN_TYPE(TO) /* TO */ \ TOKEN_TYPE(WITH) /* WITH */ \ \ - TOKEN_TYPE(EXP) /* ** */ - + TOKEN_TYPE(EXP) /* ** */ \ + \ + TOKEN_TYPE(MACRO_ID) /* Identifier starting with '!'. */ \ + TOKEN_TYPE(MACRO_PUNCT) /* Miscellaneous punctuator. */ /* Token types. */ enum token_type { diff --git a/src/language/command.def b/src/language/command.def index a97f9b83e7..ff7cd8e84c 100644 --- a/src/language/command.def +++ b/src/language/command.def @@ -18,6 +18,7 @@ DEF_CMD (S_ANY, F_ENHANCED, "CLOSE FILE HANDLE", cmd_close_file_handle) DEF_CMD (S_ANY, 0, "CACHE", cmd_cache) DEF_CMD (S_ANY, 0, "CD", cmd_cd) +//DEF_CMD (S_ANY, 0, "DEFINE", cmd_define) DEF_CMD (S_ANY, 0, "DO REPEAT", cmd_do_repeat) DEF_CMD (S_ANY, 0, "END REPEAT", cmd_end_repeat) DEF_CMD (S_ANY, 0, "ECHO", cmd_echo) @@ -188,7 +189,6 @@ UNIMPL_CMD ("CSTABULATE", "Tabulate complex samples") UNIMPL_CMD ("CTABLES", "Display complex samples") UNIMPL_CMD ("CURVEFIT", "Fit curve to line plot") UNIMPL_CMD ("DATE", "Create time series data") -UNIMPL_CMD ("DEFINE", "Syntax macros") UNIMPL_CMD ("DETECTANOMALY", "Find unusual cases") UNIMPL_CMD ("DISCRIMINANT", "Linear discriminant analysis") UNIMPL_CMD ("EDIT", "obsolete") diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c index 94437d9dd8..cae523cb37 100644 --- a/src/language/lexer/scan.c +++ b/src/language/lexer/scan.c @@ -324,6 +324,7 @@ scan_punct1__ (char c0) case '<': return T_LT; case '>': return T_GT; case '~': return T_NOT; + default: return T_MACRO_PUNCT; } NOT_REACHED (); @@ -467,6 +468,8 @@ scan_start__ (struct scanner *scanner, enum segment_type type, else { token->type = scan_punct__ (s); + if (token->type == T_MACRO_PUNCT) + ss_alloc_substring (&token->string, s); return SCAN_DONE; } diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index cfe3de522f..5f7fc01310 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -1015,6 +1015,12 @@ segmenter_parse_mid_command__ (struct segmenter *s, } else if (lex_uc_is_id1 (uc)) return segmenter_parse_id__ (s, input, n, eof, type); + else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^') + { + *type = SEG_PUNCT; + s->substate = 0; + return 1; + } else { *type = SEG_UNEXPECTED_CHAR; diff --git a/src/language/lexer/token.c b/src/language/lexer/token.c index f6bc6a1d22..98fb72f14e 100644 --- a/src/language/lexer/token.c +++ b/src/language/lexer/token.c @@ -143,6 +143,7 @@ token_to_string (const struct token *token) case T_ID: case T_MACRO_ID: + case T_MACRO_PUNCT: return ss_xstrdup (token->string); case T_STRING: diff --git a/tests/language/lexer/lexer.at b/tests/language/lexer/lexer.at index d499f0922f..8438bfb26d 100644 --- a/tests/language/lexer/lexer.at +++ b/tests/language/lexer/lexer.at @@ -46,7 +46,7 @@ u'110000' 'foo 'very long unterminated string that be ellipsized in its error message 1e .x -` +^ � ]) AT_CHECK([pspp -O format=csv lexer.sps], [1], [dnl @@ -72,7 +72,7 @@ lexer.sps:9.4: error: Syntax error at `.': Unexpected `.' in middle of command. lexer.sps:9: error: Unknown command `x'. -lexer.sps:10.1: error: Syntax error at ``': Bad character ``' in input. +lexer.sps:10.1: error: Syntax error at `^': Bad character `^' in input. lexer.sps:11.1: error: Syntax error at `�': Bad character U+FFFD in input. ]) diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at index 30ee16ad9b..b442958d5e 100644 --- a/tests/language/lexer/scan.at +++ b/tests/language/lexer/scan.at @@ -52,7 +52,7 @@ SKIP UNEXPECTED_DOT ID "x" SKIP -UNEXPECTED_CHAR 95 +MACRO_PUNCT "_" ID "z" ENDCMD SKIP @@ -187,6 +187,7 @@ AT_KEYWORDS([scan]) AT_DATA([input], [dnl ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] ** ~&|=>=><=<~=<>(),-+*/[[]]** +% : ; ? _ ` { } ~ ]) AT_DATA([expout-base], [dnl NOT @@ -249,6 +250,24 @@ SLASH LBRACK RBRACK EXP +SKIP +MACRO_PUNCT "%" +SKIP +MACRO_PUNCT ":" +SKIP +MACRO_PUNCT ";" +SKIP +MACRO_PUNCT "?" +SKIP +MACRO_PUNCT "_" +SKIP +MACRO_PUNCT "`" +SKIP +MACRO_PUNCT "{" +SKIP +MACRO_PUNCT "}" +SKIP +NOT -SKIP STOP ]) diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at index 414318cb5a..b55216cfef 100644 --- a/tests/language/lexer/segment.at +++ b/tests/language/lexer/segment.at @@ -107,7 +107,7 @@ start_command . identifier x space number 1 identifier y space -unexpected_char \_ +punct \_ identifier z -newline \n (later) - @@ -291,6 +291,7 @@ AT_KEYWORDS([segment]) AT_DATA([input], [dnl ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] ** ~&|=>=><=<~=<>(),-+*/[[]]** +% : ; ? _ ` { } ~ ]) AT_DATA([expout-base], [dnl punct ~ space @@ -335,6 +336,17 @@ punct / punct [[ punct ]] punct ** +newline \n (later) + +punct % space +punct : space +punct ; space +punct ? space +punct \_ space +punct ` space +punct { space +punct } space +punct ~ -newline \n (later) - end