From: Ben Pfaff Date: Sun, 21 Mar 2021 21:58:09 +0000 (-0700) Subject: lexer: Add support for macro identifiers (that begin with '!'). X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=d26105c398be227dc38668ce3e742c31adef15f7 lexer: Add support for macro identifiers (that begin with '!'). --- diff --git a/src/data/identifier.c b/src/data/identifier.c index db20010464..c613734c94 100644 --- a/src/data/identifier.c +++ b/src/data/identifier.c @@ -62,6 +62,7 @@ token_type_to_string (enum token_type token) switch (token) { case T_ID: + case T_MACRO_ID: case T_POS_NUM: case T_NEG_NUM: case T_STRING: diff --git a/src/data/identifier.h b/src/data/identifier.h index b7affdb192..85299979c0 100644 --- a/src/data/identifier.h +++ b/src/data/identifier.h @@ -23,41 +23,42 @@ #include "libpspp/str.h" #define TOKEN_TYPES \ - TOKEN_TYPE(ID) /* Identifier. */ \ - TOKEN_TYPE(POS_NUM) /* Positive number. */ \ - TOKEN_TYPE(NEG_NUM) /* Negative number. */ \ - TOKEN_TYPE(STRING) /* Quoted string. */ \ - TOKEN_TYPE(STOP) /* End of input. */ \ + TOKEN_TYPE(ID) /* Identifier. */ \ + TOKEN_TYPE(MACRO_ID) /* Identifier starting with '!'. */ \ + TOKEN_TYPE(POS_NUM) /* Positive number. */ \ + TOKEN_TYPE(NEG_NUM) /* Negative number. */ \ + TOKEN_TYPE(STRING) /* Quoted string. */ \ + TOKEN_TYPE(STOP) /* End of input. */ \ \ - TOKEN_TYPE(ENDCMD) /* . */ \ - TOKEN_TYPE(PLUS) /* + */ \ - TOKEN_TYPE(DASH) /* - */ \ - TOKEN_TYPE(ASTERISK) /* * */ \ - TOKEN_TYPE(SLASH) /* / */ \ - TOKEN_TYPE(EQUALS) /* = */ \ - TOKEN_TYPE(LPAREN) /* (*/ \ - TOKEN_TYPE(RPAREN) /*) */ \ - TOKEN_TYPE(LBRACK) /* [ */ \ - TOKEN_TYPE(RBRACK) /* ] */ \ - TOKEN_TYPE(COMMA) /* , */ \ + TOKEN_TYPE(ENDCMD) /* . 
*/ \ + TOKEN_TYPE(PLUS) /* + */ \ + TOKEN_TYPE(DASH) /* - */ \ + TOKEN_TYPE(ASTERISK) /* * */ \ + TOKEN_TYPE(SLASH) /* / */ \ + TOKEN_TYPE(EQUALS) /* = */ \ + TOKEN_TYPE(LPAREN) /* (*/ \ + TOKEN_TYPE(RPAREN) /*) */ \ + TOKEN_TYPE(LBRACK) /* [ */ \ + TOKEN_TYPE(RBRACK) /* ] */ \ + TOKEN_TYPE(COMMA) /* , */ \ \ - TOKEN_TYPE(AND) /* AND */ \ - TOKEN_TYPE(OR) /* OR */ \ - TOKEN_TYPE(NOT) /* NOT */ \ + TOKEN_TYPE(AND) /* AND */ \ + TOKEN_TYPE(OR) /* OR */ \ + TOKEN_TYPE(NOT) /* NOT */ \ \ - TOKEN_TYPE(EQ) /* EQ */ \ - TOKEN_TYPE(GE) /* GE or >= */ \ - TOKEN_TYPE(GT) /* GT or > */ \ - TOKEN_TYPE(LE) /* LE or <= */ \ - TOKEN_TYPE(LT) /* LT or < */ \ - TOKEN_TYPE(NE) /* NE or ~= */ \ + TOKEN_TYPE(EQ) /* EQ */ \ + TOKEN_TYPE(GE) /* GE or >= */ \ + TOKEN_TYPE(GT) /* GT or > */ \ + TOKEN_TYPE(LE) /* LE or <= */ \ + TOKEN_TYPE(LT) /* LT or < */ \ + TOKEN_TYPE(NE) /* NE or ~= */ \ \ - TOKEN_TYPE(ALL) /* ALL */ \ - TOKEN_TYPE(BY) /* BY */ \ - TOKEN_TYPE(TO) /* TO */ \ - TOKEN_TYPE(WITH) /* WITH */ \ + TOKEN_TYPE(ALL) /* ALL */ \ + TOKEN_TYPE(BY) /* BY */ \ + TOKEN_TYPE(TO) /* TO */ \ + TOKEN_TYPE(WITH) /* WITH */ \ \ - TOKEN_TYPE(EXP) /* ** */ + TOKEN_TYPE(EXP) /* ** */ /* Token types. */ enum token_type diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index 6d3549a827..5044731348 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -933,8 +933,8 @@ lex_next_tokcstr (const struct lexer *lexer, int n) The string is null-terminated (but the null terminator is not included in the returned substring's 'length'). - Only T_ID and T_STRING tokens have meaningful strings. For other tokens - this functions this function will always return NULL. + Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings. For + other tokens this function will always return NULL. The UTF-8 encoding of the returned string is correct for variable names and other identifiers. Use filename_to_utf8() to use it as a filename. 
Use diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c index 573a00df9d..94437d9dd8 100644 --- a/src/language/lexer/scan.c +++ b/src/language/lexer/scan.c @@ -453,6 +453,11 @@ scan_start__ (struct scanner *scanner, enum segment_type type, ss_alloc_substring (&token->string, s); return SCAN_DONE; + case SEG_MACRO_ID: + token->type = T_MACRO_ID; + ss_alloc_substring (&token->string, s); + return SCAN_DONE; + case SEG_PUNCT: if (s.length == 1 && s.string[0] == '-') { diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index 3e060a5b9f..cfe3de522f 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -651,6 +651,7 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n, } /* fall through */ + case SEG_MACRO_ID: case SEG_NUMBER: case SEG_QUOTED_STRING: case SEG_HEX_STRING: @@ -719,10 +720,9 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, ofs--; } - if (is_reserved_word (input, ofs)) - *type = SEG_RESERVED_WORD; - else - *type = SEG_IDENTIFIER; + *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD + : input[0] == '!' ? 
SEG_MACRO_ID + : SEG_IDENTIFIER); if (s->substate & SS_START_OF_COMMAND) { @@ -989,6 +989,9 @@ segmenter_parse_mid_command__ (struct segmenter *s, return segmenter_parse_string__ (SEG_QUOTED_STRING, 0, s, input, n, eof, type); + case '!': + return segmenter_parse_id__ (s, input, n, eof, type); + default: if (lex_uc_is_space (uc)) { diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h index c647c8691d..dbb43d529d 100644 --- a/src/language/lexer/segment.h +++ b/src/language/lexer/segment.h @@ -67,6 +67,7 @@ enum segmenter_mode SEG_TYPE(UNQUOTED_STRING) \ SEG_TYPE(RESERVED_WORD) \ SEG_TYPE(IDENTIFIER) \ + SEG_TYPE(MACRO_ID) \ SEG_TYPE(PUNCT) \ \ SEG_TYPE(SHBANG) \ diff --git a/src/language/lexer/token.c b/src/language/lexer/token.c index 9c5fef9991..f6bc6a1d22 100644 --- a/src/language/lexer/token.c +++ b/src/language/lexer/token.c @@ -142,6 +142,7 @@ token_to_string (const struct token *token) return number_token_to_string (token); case T_ID: + case T_MACRO_ID: return ss_xstrdup (token->string); case T_STRING: diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at index 3da89484c4..30ee16ad9b 100644 --- a/tests/language/lexer/scan.at +++ b/tests/language/lexer/scan.at @@ -25,7 +25,7 @@ m4_define([PSPP_CHECK_SCAN], AT_SETUP([identifiers]) AT_KEYWORDS([scan]) AT_DATA([input], [dnl -a aB i5 $x @efg @@. #.# .x _z. +a aB i5 $x @efg @@. !abcd #.# .x _z. abcd. abcd. QRSTUV./* end of line comment */ QrStUv./* end of line comment */ @&t@ @@ -45,6 +45,8 @@ ID "@efg" SKIP ID "@@." SKIP +MACRO_ID "!abcd" +SKIP ID "#.#" SKIP UNEXPECTED_DOT @@ -443,7 +445,7 @@ AT_DATA([expout-base], [dnl SKIP SKIP ID "#" -UNEXPECTED_CHAR 33 +MACRO_ID "!" 
SKIP SLASH ID "usr" diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at index b358b3e509..414318cb5a 100644 --- a/tests/language/lexer/segment.at +++ b/tests/language/lexer/segment.at @@ -30,13 +30,13 @@ m4_define([PSPP_CHECK_SEGMENT], AT_SETUP([identifiers]) AT_KEYWORDS([segment]) AT_DATA([input], [dnl -a ab abc abcd -A AB ABC ABCD -aB aBC aBcD -$x $y $z +a ab abc abcd !abcd +A AB ABC ABCD !ABCD +aB aBC aBcD !aBcD +$x $y $z !$z grève@<00A0>@Ângstrom@<00A0>@poté -#a #b #c ## #d -@efg @ @@. @#@ @&t@ +#a #b #c ## #d !#d +@efg @ @@. @#@ !@ @&t@ ## # #12345 #.# f@#_.#6 GhIjK @@ -46,23 +46,27 @@ AT_DATA([expout-base], [dnl identifier a space identifier ab space identifier abc space -identifier abcd +identifier abcd space +macro_id !abcd newline \n (later) identifier A space identifier AB space identifier ABC space -identifier ABCD +identifier ABCD space +macro_id !ABCD newline \n (later) identifier aB space identifier aBC space -identifier aBcD +identifier aBcD space +macro_id !aBcD newline \n (later) identifier $x space identifier $y space -identifier $z +identifier $z space +macro_id !$z newline \n (later) identifier grève @@ -76,13 +80,15 @@ identifier #a space identifier #b space identifier #c space identifier ## space -identifier #d +identifier #d space +macro_id !#d newline \n (later) identifier @efg space identifier @ space identifier @@. space identifier @#@ space +macro_id !@ space newline \n (later) identifier ## space @@ -494,7 +500,7 @@ end_command . newline \n (first) identifier # -unexpected_char ! space +macro_id ! space punct / identifier usr punct /