lexer: Add support for macro identifiers (that begin with '!').
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 21 Mar 2021 21:58:09 +0000 (14:58 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Fri, 9 Apr 2021 18:16:48 +0000 (11:16 -0700)
src/data/identifier.c
src/data/identifier.h
src/language/lexer/lexer.c
src/language/lexer/scan.c
src/language/lexer/segment.c
src/language/lexer/segment.h
src/language/lexer/token.c
tests/language/lexer/scan.at
tests/language/lexer/segment.at

index db20010464cab1e3f3a1f42cc1e7a4c012bf8c1b..c613734c94dc647ec4d2c1ca93e9e0dfb8232542 100644 (file)
@@ -62,6 +62,7 @@ token_type_to_string (enum token_type token)
   switch (token)
     {
     case T_ID:
+    case T_MACRO_ID:
     case T_POS_NUM:
     case T_NEG_NUM:
     case T_STRING:
index b7affdb192823f094994c6c662ffa0a3e992b2df..85299979c034c5c2be05633d79704a7bf497f188 100644 (file)
 #include "libpspp/str.h"
 
 #define TOKEN_TYPES                                                     \
-    TOKEN_TYPE(ID)                         /* Identifier. */            \
-    TOKEN_TYPE(POS_NUM)                    /* Positive number. */       \
-    TOKEN_TYPE(NEG_NUM)                    /* Negative number. */       \
-    TOKEN_TYPE(STRING)                     /* Quoted string. */         \
-    TOKEN_TYPE(STOP)                       /* End of input. */          \
+    TOKEN_TYPE(ID)                  /* Identifier. */                   \
+    TOKEN_TYPE(MACRO_ID)            /* Identifier starting with '!'. */ \
+    TOKEN_TYPE(POS_NUM)             /* Positive number. */              \
+    TOKEN_TYPE(NEG_NUM)             /* Negative number. */              \
+    TOKEN_TYPE(STRING)              /* Quoted string. */                \
+    TOKEN_TYPE(STOP)                /* End of input. */                 \
                                                                         \
-    TOKEN_TYPE(ENDCMD)                     /* . */                      \
-    TOKEN_TYPE(PLUS)                       /* + */                      \
-    TOKEN_TYPE(DASH)                       /* - */                      \
-    TOKEN_TYPE(ASTERISK)                   /* * */                      \
-    TOKEN_TYPE(SLASH)                      /* / */                      \
-    TOKEN_TYPE(EQUALS)                     /* = */                      \
-    TOKEN_TYPE(LPAREN)                     /* ( */                      \
-    TOKEN_TYPE(RPAREN)                     /* ) */                      \
-    TOKEN_TYPE(LBRACK)                     /* [ */                      \
-    TOKEN_TYPE(RBRACK)                     /* ] */                      \
-    TOKEN_TYPE(COMMA)                      /* , */                      \
+    TOKEN_TYPE(ENDCMD)              /* . */                             \
+    TOKEN_TYPE(PLUS)                /* + */                             \
+    TOKEN_TYPE(DASH)                /* - */                             \
+    TOKEN_TYPE(ASTERISK)            /* * */                             \
+    TOKEN_TYPE(SLASH)               /* / */                             \
+    TOKEN_TYPE(EQUALS)              /* = */                             \
+    TOKEN_TYPE(LPAREN)              /* ( */                             \
+    TOKEN_TYPE(RPAREN)              /* ) */                             \
+    TOKEN_TYPE(LBRACK)              /* [ */                             \
+    TOKEN_TYPE(RBRACK)              /* ] */                             \
+    TOKEN_TYPE(COMMA)               /* , */                             \
                                                                         \
-    TOKEN_TYPE(AND)                        /* AND */                    \
-    TOKEN_TYPE(OR)                         /* OR */                     \
-    TOKEN_TYPE(NOT)                        /* NOT */                    \
+    TOKEN_TYPE(AND)                 /* AND */                           \
+    TOKEN_TYPE(OR)                  /* OR */                            \
+    TOKEN_TYPE(NOT)                 /* NOT */                           \
                                                                         \
-    TOKEN_TYPE(EQ)                         /* EQ */                     \
-    TOKEN_TYPE(GE)                         /* GE or >= */               \
-    TOKEN_TYPE(GT)                         /* GT or > */                \
-    TOKEN_TYPE(LE)                         /* LE or <= */               \
-    TOKEN_TYPE(LT)                         /* LT or < */                \
-    TOKEN_TYPE(NE)                         /* NE or ~= */               \
+    TOKEN_TYPE(EQ)                  /* EQ */                            \
+    TOKEN_TYPE(GE)                  /* GE or >= */                      \
+    TOKEN_TYPE(GT)                  /* GT or > */                       \
+    TOKEN_TYPE(LE)                  /* LE or <= */                      \
+    TOKEN_TYPE(LT)                  /* LT or < */                       \
+    TOKEN_TYPE(NE)                  /* NE or ~= */                      \
                                                                         \
-    TOKEN_TYPE(ALL)                        /* ALL */                    \
-    TOKEN_TYPE(BY)                         /* BY */                     \
-    TOKEN_TYPE(TO)                         /* TO */                     \
-    TOKEN_TYPE(WITH)                       /* WITH */                   \
+    TOKEN_TYPE(ALL)                 /* ALL */                           \
+    TOKEN_TYPE(BY)                  /* BY */                            \
+    TOKEN_TYPE(TO)                  /* TO */                            \
+    TOKEN_TYPE(WITH)                /* WITH */                          \
                                                                         \
-    TOKEN_TYPE(EXP)                        /* ** */
+    TOKEN_TYPE(EXP)                 /* ** */
 
 /* Token types. */
 enum token_type
index 6d3549a82782134f62c8603e76ac207f54fe6fef..5044731348ecce0c6a7573dfaabd28b9b3709d67 100644 (file)
@@ -933,8 +933,8 @@ lex_next_tokcstr (const struct lexer *lexer, int n)
    The string is null-terminated (but the null terminator is not included in
    the returned substring's 'length').
 
-   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
-   this functions this function will always return NULL.
+   Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings.  For
+   other tokens this function will always return NULL.
 
    The UTF-8 encoding of the returned string is correct for variable names and
    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
index 573a00df9d327f42445cda0c7ccb6492bc093f95..94437d9dd8af5b6967fa5235d4765d5af3467c18 100644 (file)
@@ -453,6 +453,11 @@ scan_start__ (struct scanner *scanner, enum segment_type type,
       ss_alloc_substring (&token->string, s);
       return SCAN_DONE;
 
+    case SEG_MACRO_ID:
+      token->type = T_MACRO_ID;
+      ss_alloc_substring (&token->string, s);
+      return SCAN_DONE;
+
     case SEG_PUNCT:
       if (s.length == 1 && s.string[0] == '-')
         {
index 3e060a5b9f9e9e3d8342ad641d47dad594b953df..cfe3de522fca5c912d8d5da8ec37d75d1c3fcbf6 100644 (file)
@@ -651,6 +651,7 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
             }
           /* fall through */
 
+        case SEG_MACRO_ID:
         case SEG_NUMBER:
         case SEG_QUOTED_STRING:
         case SEG_HEX_STRING:
@@ -719,10 +720,9 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
         ofs--;
     }
 
-  if (is_reserved_word (input, ofs))
-    *type = SEG_RESERVED_WORD;
-  else
-    *type = SEG_IDENTIFIER;
+  *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
+           : input[0] == '!' ? SEG_MACRO_ID
+           : SEG_IDENTIFIER);
 
   if (s->substate & SS_START_OF_COMMAND)
     {
@@ -989,6 +989,9 @@ segmenter_parse_mid_command__ (struct segmenter *s,
       return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
                                        s, input, n, eof, type);
 
+    case '!':
+      return segmenter_parse_id__ (s, input, n, eof, type);
+
     default:
       if (lex_uc_is_space (uc))
         {
index c647c8691dadd81f1e634ed91a63f0b2b11a4b9d..dbb43d529d59541e9635d93ca536a2a31625b934 100644 (file)
@@ -67,6 +67,7 @@ enum segmenter_mode
     SEG_TYPE(UNQUOTED_STRING)                   \
     SEG_TYPE(RESERVED_WORD)                     \
     SEG_TYPE(IDENTIFIER)                        \
+    SEG_TYPE(MACRO_ID)                          \
     SEG_TYPE(PUNCT)                             \
                                                 \
     SEG_TYPE(SHBANG)                            \
index 9c5fef9991b20726811a684c047fb52d941c8af4..f6bc6a1d22166a8f7dfc118f400bcdbd5f8ac88d 100644 (file)
@@ -142,6 +142,7 @@ token_to_string (const struct token *token)
       return number_token_to_string (token);
 
     case T_ID:
+    case T_MACRO_ID:
       return ss_xstrdup (token->string);
 
     case T_STRING:
index 3da89484c44e1e0145d406269840d21843d47816..30ee16ad9b92c1c16a900971dd31ee0a8d5a400a 100644 (file)
@@ -25,7 +25,7 @@ m4_define([PSPP_CHECK_SCAN],
 AT_SETUP([identifiers])
 AT_KEYWORDS([scan])
 AT_DATA([input], [dnl
-a aB i5 $x @efg @@. #.# .x _z.
+a aB i5 $x @efg @@. !abcd #.# .x _z.
 abcd. abcd.
 QRSTUV./* end of line comment */
 QrStUv./* end of line comment */ @&t@
@@ -45,6 +45,8 @@ ID "@efg"
 SKIP
 ID "@@."
 SKIP
+MACRO_ID "!abcd"
+SKIP
 ID "#.#"
 SKIP
 UNEXPECTED_DOT
@@ -443,7 +445,7 @@ AT_DATA([expout-base], [dnl
 SKIP
 SKIP
 ID "#"
-UNEXPECTED_CHAR 33
+MACRO_ID "!"
 SKIP
 SLASH
 ID "usr"
index b358b3e509affef6590c1894239592309bef5454..414318cb5a42120fe9be21ae13705b30bbc217e9 100644 (file)
@@ -30,13 +30,13 @@ m4_define([PSPP_CHECK_SEGMENT],
 AT_SETUP([identifiers])
 AT_KEYWORDS([segment])
 AT_DATA([input], [dnl
-a ab abc abcd
-A AB ABC ABCD
-aB aBC aBcD
-$x $y $z
+a ab abc abcd !abcd
+A AB ABC ABCD !ABCD
+aB aBC aBcD !aBcD
+$x $y $z !$z
 grève@<00A0>@Ângstrom@<00A0>@poté
-#a #b #c ## #d
-@efg @ @@. @#@ @&t@
+#a #b #c ## #d !#d
+@efg @ @@. @#@ !@ @&t@
 ## # #12345 #.#
 f@#_.#6
 GhIjK
@@ -46,23 +46,27 @@ AT_DATA([expout-base], [dnl
 identifier      a    space
 identifier      ab    space
 identifier      abc    space
-identifier      abcd
+identifier      abcd    space
+macro_id        !abcd
 newline         \n (later)
 
 identifier      A    space
 identifier      AB    space
 identifier      ABC    space
-identifier      ABCD
+identifier      ABCD    space
+macro_id        !ABCD
 newline         \n (later)
 
 identifier      aB    space
 identifier      aBC    space
-identifier      aBcD
+identifier      aBcD    space
+macro_id        !aBcD
 newline         \n (later)
 
 identifier      $x    space
 identifier      $y    space
-identifier      $z
+identifier      $z    space
+macro_id        !$z
 newline         \n (later)
 
 identifier      grève
@@ -76,13 +80,15 @@ identifier      #a    space
 identifier      #b    space
 identifier      #c    space
 identifier      ##    space
-identifier      #d
+identifier      #d    space
+macro_id        !#d
 newline         \n (later)
 
 identifier      @efg    space
 identifier      @    space
 identifier      @@.    space
 identifier      @#@    space
+macro_id        !@    space
 newline         \n (later)
 
 identifier      ##    space
@@ -494,7 +500,7 @@ end_command     .
 newline         \n (first)
 
 identifier      #
-unexpected_char !    space
+macro_id        !    space
 punct           /
 identifier      usr
 punct           /