lexer: Add support for macro identifiers (that begin with '!').

author Ben Pfaff <blp@cs.stanford.edu>

Sun, 21 Mar 2021 21:58:09 +0000 (14:58 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Fri, 9 Apr 2021 18:16:48 +0000 (11:16 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 21 Mar 2021 21:58:09 +0000 (14:58 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Fri, 9 Apr 2021 18:16:48 +0000 (11:16 -0700)
diff --git a/src/data/identifier.c b/src/data/identifier.c

index db20010464cab1e3f3a1f42cc1e7a4c012bf8c1b..c613734c94dc647ec4d2c1ca93e9e0dfb8232542 100644 (file)
--- a/src/data/identifier.c
+++ b/src/data/identifier.c
@@ -62,6 +62,7 @@ token_type_to_string (enum token_type token)
    switch (token)
      {
      case T_ID:
+    case T_MACRO_ID:
      case T_POS_NUM:
      case T_NEG_NUM:
      case T_STRING:
diff --git a/src/data/identifier.h b/src/data/identifier.h

index b7affdb192823f094994c6c662ffa0a3e992b2df..85299979c034c5c2be05633d79704a7bf497f188 100644 (file)
--- a/src/data/identifier.h
+++ b/src/data/identifier.h
@@ -23,41 +23,42 @@
  #include "libpspp/str.h"
  
  #define TOKEN_TYPES                                                     \
-    TOKEN_TYPE(ID)                         /* Identifier. */            \
-    TOKEN_TYPE(POS_NUM)                    /* Positive number. */       \
-    TOKEN_TYPE(NEG_NUM)                    /* Negative number. */       \
-    TOKEN_TYPE(STRING)                     /* Quoted string. */         \
-    TOKEN_TYPE(STOP)                       /* End of input. */          \
+    TOKEN_TYPE(ID)                  /* Identifier. */                   \
+    TOKEN_TYPE(MACRO_ID)            /* Identifier starting with '!'. */ \
+    TOKEN_TYPE(POS_NUM)             /* Positive number. */              \
+    TOKEN_TYPE(NEG_NUM)             /* Negative number. */              \
+    TOKEN_TYPE(STRING)              /* Quoted string. */                \
+    TOKEN_TYPE(STOP)                /* End of input. */                 \
                                                                          \
-    TOKEN_TYPE(ENDCMD)                     /* . */                      \
-    TOKEN_TYPE(PLUS)                       /* + */                      \
-    TOKEN_TYPE(DASH)                       /* - */                      \
-    TOKEN_TYPE(ASTERISK)                   /* * */                      \
-    TOKEN_TYPE(SLASH)                      /* / */                      \
-    TOKEN_TYPE(EQUALS)                     /* = */                      \
-    TOKEN_TYPE(LPAREN)                     /* ( */                      \
-    TOKEN_TYPE(RPAREN)                     /* ) */                      \
-    TOKEN_TYPE(LBRACK)                     /* [ */                      \
-    TOKEN_TYPE(RBRACK)                     /* ] */                      \
-    TOKEN_TYPE(COMMA)                      /* , */                      \
+    TOKEN_TYPE(ENDCMD)              /* . */                             \
+    TOKEN_TYPE(PLUS)                /* + */                             \
+    TOKEN_TYPE(DASH)                /* - */                             \
+    TOKEN_TYPE(ASTERISK)            /* * */                             \
+    TOKEN_TYPE(SLASH)               /* / */                             \
+    TOKEN_TYPE(EQUALS)              /* = */                             \
+    TOKEN_TYPE(LPAREN)              /* ( */                             \
+    TOKEN_TYPE(RPAREN)              /* ) */                             \
+    TOKEN_TYPE(LBRACK)              /* [ */                             \
+    TOKEN_TYPE(RBRACK)              /* ] */                             \
+    TOKEN_TYPE(COMMA)               /* , */                             \
                                                                          \
-    TOKEN_TYPE(AND)                        /* AND */                    \
-    TOKEN_TYPE(OR)                         /* OR */                     \
-    TOKEN_TYPE(NOT)                        /* NOT */                    \
+    TOKEN_TYPE(AND)                 /* AND */                           \
+    TOKEN_TYPE(OR)                  /* OR */                            \
+    TOKEN_TYPE(NOT)                 /* NOT */                           \
                                                                          \
-    TOKEN_TYPE(EQ)                         /* EQ */                     \
-    TOKEN_TYPE(GE)                         /* GE or >= */               \
-    TOKEN_TYPE(GT)                         /* GT or > */                \
-    TOKEN_TYPE(LE)                         /* LE or <= */               \
-    TOKEN_TYPE(LT)                         /* LT or < */                \
-    TOKEN_TYPE(NE)                         /* NE or ~= */               \
+    TOKEN_TYPE(EQ)                  /* EQ */                            \
+    TOKEN_TYPE(GE)                  /* GE or >= */                      \
+    TOKEN_TYPE(GT)                  /* GT or > */                       \
+    TOKEN_TYPE(LE)                  /* LE or <= */                      \
+    TOKEN_TYPE(LT)                  /* LT or < */                       \
+    TOKEN_TYPE(NE)                  /* NE or ~= */                      \
                                                                          \
-    TOKEN_TYPE(ALL)                        /* ALL */                    \
-    TOKEN_TYPE(BY)                         /* BY */                     \
-    TOKEN_TYPE(TO)                         /* TO */                     \
-    TOKEN_TYPE(WITH)                       /* WITH */                   \
+    TOKEN_TYPE(ALL)                 /* ALL */                           \
+    TOKEN_TYPE(BY)                  /* BY */                            \
+    TOKEN_TYPE(TO)                  /* TO */                            \
+    TOKEN_TYPE(WITH)                /* WITH */                          \
                                                                          \
-    TOKEN_TYPE(EXP)                        /* ** */
+    TOKEN_TYPE(EXP)                 /* ** */
  
  /* Token types. */
  enum token_type
diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c

index 6d3549a82782134f62c8603e76ac207f54fe6fef..5044731348ecce0c6a7573dfaabd28b9b3709d67 100644 (file)
--- a/src/language/lexer/lexer.c
+++ b/src/language/lexer/lexer.c
@@ -933,8 +933,8 @@ lex_next_tokcstr (const struct lexer *lexer, int n)
     The string is null-terminated (but the null terminator is not included in
     the returned substring's 'length').
  
-   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
-   this functions this function will always return NULL.
+   Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings.  For
+   other tokens this function will always return NULL.
  
     The UTF-8 encoding of the returned string is correct for variable names and
     other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c

index 573a00df9d327f42445cda0c7ccb6492bc093f95..94437d9dd8af5b6967fa5235d4765d5af3467c18 100644 (file)
--- a/src/language/lexer/scan.c
+++ b/src/language/lexer/scan.c
@@ -453,6 +453,11 @@ scan_start__ (struct scanner *scanner, enum segment_type type,
        ss_alloc_substring (&token->string, s);
        return SCAN_DONE;
  
+    case SEG_MACRO_ID:
+      token->type = T_MACRO_ID;
+      ss_alloc_substring (&token->string, s);
+      return SCAN_DONE;
+
      case SEG_PUNCT:
        if (s.length == 1 && s.string[0] == '-')
          {
diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c

index 3e060a5b9f9e9e3d8342ad641d47dad594b953df..cfe3de522fca5c912d8d5da8ec37d75d1c3fcbf6 100644 (file)
--- a/src/language/lexer/segment.c
+++ b/src/language/lexer/segment.c
@@ -651,6 +651,7 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
              }
            /* fall through */
  
+        case SEG_MACRO_ID:
          case SEG_NUMBER:
          case SEG_QUOTED_STRING:
          case SEG_HEX_STRING:
@@ -719,10 +720,9 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
          ofs--;
      }
  
-  if (is_reserved_word (input, ofs))
-    *type = SEG_RESERVED_WORD;
-  else
-    *type = SEG_IDENTIFIER;
+  *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
+           : input[0] == '!' ? SEG_MACRO_ID
+           : SEG_IDENTIFIER);
  
    if (s->substate & SS_START_OF_COMMAND)
      {
@@ -989,6 +989,9 @@ segmenter_parse_mid_command__ (struct segmenter *s,
        return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
                                         s, input, n, eof, type);
  
+    case '!':
+      return segmenter_parse_id__ (s, input, n, eof, type);
+
      default:
        if (lex_uc_is_space (uc))
          {
diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h

index c647c8691dadd81f1e634ed91a63f0b2b11a4b9d..dbb43d529d59541e9635d93ca536a2a31625b934 100644 (file)
--- a/src/language/lexer/segment.h
+++ b/src/language/lexer/segment.h
@@ -67,6 +67,7 @@ enum segmenter_mode
      SEG_TYPE(UNQUOTED_STRING)                   \
      SEG_TYPE(RESERVED_WORD)                     \
      SEG_TYPE(IDENTIFIER)                        \
+    SEG_TYPE(MACRO_ID)                          \
      SEG_TYPE(PUNCT)                             \
                                                  \
      SEG_TYPE(SHBANG)                            \
diff --git a/src/language/lexer/token.c b/src/language/lexer/token.c

index 9c5fef9991b20726811a684c047fb52d941c8af4..f6bc6a1d22166a8f7dfc118f400bcdbd5f8ac88d 100644 (file)
--- a/src/language/lexer/token.c
+++ b/src/language/lexer/token.c
@@ -142,6 +142,7 @@ token_to_string (const struct token *token)
        return number_token_to_string (token);
  
      case T_ID:
+    case T_MACRO_ID:
        return ss_xstrdup (token->string);
  
      case T_STRING:
diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at

index 3da89484c44e1e0145d406269840d21843d47816..30ee16ad9b92c1c16a900971dd31ee0a8d5a400a 100644 (file)
--- a/tests/language/lexer/scan.at
+++ b/tests/language/lexer/scan.at
@@ -25,7 +25,7 @@ m4_define([PSPP_CHECK_SCAN],
  AT_SETUP([identifiers])
  AT_KEYWORDS([scan])
  AT_DATA([input], [dnl
-a aB i5 $x @efg @@. #.# .x _z.
+a aB i5 $x @efg @@. !abcd #.# .x _z.
  abcd. abcd.
  QRSTUV./* end of line comment */
  QrStUv./* end of line comment */ @&t@
@@ -45,6 +45,8 @@ ID "@efg"
  SKIP
  ID "@@."
  SKIP
+MACRO_ID "!abcd"
+SKIP
  ID "#.#"
  SKIP
  UNEXPECTED_DOT
@@ -443,7 +445,7 @@ AT_DATA([expout-base], [dnl
  SKIP
  SKIP
  ID "#"
-UNEXPECTED_CHAR 33
+MACRO_ID "!"
  SKIP
  SLASH
  ID "usr"
diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at

index b358b3e509affef6590c1894239592309bef5454..414318cb5a42120fe9be21ae13705b30bbc217e9 100644 (file)
--- a/tests/language/lexer/segment.at
+++ b/tests/language/lexer/segment.at
@@ -30,13 +30,13 @@ m4_define([PSPP_CHECK_SEGMENT],
  AT_SETUP([identifiers])
  AT_KEYWORDS([segment])
  AT_DATA([input], [dnl
-a ab abc abcd
-A AB ABC ABCD
-aB aBC aBcD
-$x $y $z
+a ab abc abcd !abcd
+A AB ABC ABCD !ABCD
+aB aBC aBcD !aBcD
+$x $y $z !$z
  grève@<00A0>@Ângstrom@<00A0>@poté
-#a #b #c ## #d
-@efg @ @@. @#@ @&t@
+#a #b #c ## #d !#d
+@efg @ @@. @#@ !@ @&t@
  ## # #12345 #.#
  f@#_.#6
  GhIjK
@@ -46,23 +46,27 @@ AT_DATA([expout-base], [dnl
  identifier      a    space
  identifier      ab    space
  identifier      abc    space
-identifier      abcd
+identifier      abcd    space
+macro_id        !abcd
  newline         \n (later)
  
  identifier      A    space
  identifier      AB    space
  identifier      ABC    space
-identifier      ABCD
+identifier      ABCD    space
+macro_id        !ABCD
  newline         \n (later)
  
  identifier      aB    space
  identifier      aBC    space
-identifier      aBcD
+identifier      aBcD    space
+macro_id        !aBcD
  newline         \n (later)
  
  identifier      $x    space
  identifier      $y    space
-identifier      $z
+identifier      $z    space
+macro_id        !$z
  newline         \n (later)
  
  identifier      grève
@@ -76,13 +80,15 @@ identifier      #a    space
  identifier      #b    space
  identifier      #c    space
  identifier      ##    space
-identifier      #d
+identifier      #d    space
+macro_id        !#d
  newline         \n (later)
  
  identifier      @efg    space
  identifier      @    space
  identifier      @@.    space
  identifier      @#@    space
+macro_id        !@    space
  newline         \n (later)
  
  identifier      ##    space
@@ -494,7 +500,7 @@ end_command     .
  newline         \n (first)
  
  identifier      #
-unexpected_char !    space
+macro_id        !    space
  punct           /
  identifier      usr
  punct           /
author	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 21 Mar 2021 21:58:09 +0000 (14:58 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Fri, 9 Apr 2021 18:16:48 +0000 (11:16 -0700)
src/data/identifier.c		patch \| blob \| history
src/data/identifier.h		patch \| blob \| history
src/language/lexer/lexer.c		patch \| blob \| history
src/language/lexer/scan.c		patch \| blob \| history
src/language/lexer/segment.c		patch \| blob \| history
src/language/lexer/segment.h		patch \| blob \| history
src/language/lexer/token.c		patch \| blob \| history
tests/language/lexer/scan.at		patch \| blob \| history
tests/language/lexer/segment.at		patch \| blob \| history