lexer: Add support for macro punctuation.
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 22 Mar 2021 06:06:14 +0000 (23:06 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Fri, 9 Apr 2021 18:16:48 +0000 (11:16 -0700)
These punctuation symbols can be used to delimit macro arguments, even
though they aren't allowed anywhere else in the language.

src/data/identifier.c
src/data/identifier.h
src/language/command.def
src/language/lexer/scan.c
src/language/lexer/segment.c
src/language/lexer/token.c
tests/language/lexer/lexer.at
tests/language/lexer/scan.at
tests/language/lexer/segment.at

index c613734c94dc647ec4d2c1ca93e9e0dfb8232542..e3b33a3382a75953b82d4a28be7793c0fa36fc9e 100644 (file)
@@ -62,10 +62,11 @@ token_type_to_string (enum token_type token)
   switch (token)
     {
     case T_ID:
-    case T_MACRO_ID:
     case T_POS_NUM:
     case T_NEG_NUM:
     case T_STRING:
+    case T_MACRO_ID:
+    case T_MACRO_PUNCT:
     case T_STOP:
       return NULL;
 
index 85299979c034c5c2be05633d79704a7bf497f188..1fc63b4808f44e7e07ba3a92696f340f338d6922 100644 (file)
@@ -24,7 +24,6 @@
 
 #define TOKEN_TYPES                                                     \
     TOKEN_TYPE(ID)                  /* Identifier. */                   \
-    TOKEN_TYPE(MACRO_ID)            /* Identifier starting with '!'. */ \
     TOKEN_TYPE(POS_NUM)             /* Positive number. */              \
     TOKEN_TYPE(NEG_NUM)             /* Negative number. */              \
     TOKEN_TYPE(STRING)              /* Quoted string. */                \
@@ -37,7 +36,7 @@
     TOKEN_TYPE(SLASH)               /* / */                             \
     TOKEN_TYPE(EQUALS)              /* = */                             \
     TOKEN_TYPE(LPAREN)              /* (*/                              \
-    TOKEN_TYPE(RPAREN)              /*) */                              \
+    TOKEN_TYPE(RPAREN)              /* ) */                             \
     TOKEN_TYPE(LBRACK)              /* [ */                             \
     TOKEN_TYPE(RBRACK)              /* ] */                             \
     TOKEN_TYPE(COMMA)               /* , */                             \
     TOKEN_TYPE(TO)                  /* TO */                            \
     TOKEN_TYPE(WITH)                /* WITH */                          \
                                                                         \
-    TOKEN_TYPE(EXP)                 /* ** */
-
+    TOKEN_TYPE(EXP)                 /* ** */                            \
+                                                                        \
+    TOKEN_TYPE(MACRO_ID)            /* Identifier starting with '!'. */ \
+    TOKEN_TYPE(MACRO_PUNCT)         /* Miscellaneous punctuator. */
 /* Token types. */
 enum token_type
   {
index a97f9b83e70fd1c7e021188eb6a84107a6c04627..ff7cd8e84cf0629d6b6668c053315e58dcc53d58 100644 (file)
@@ -18,6 +18,7 @@
 DEF_CMD (S_ANY, F_ENHANCED, "CLOSE FILE HANDLE", cmd_close_file_handle)
 DEF_CMD (S_ANY, 0, "CACHE", cmd_cache)
 DEF_CMD (S_ANY, 0, "CD", cmd_cd)
+//DEF_CMD (S_ANY, 0, "DEFINE", cmd_define)
 DEF_CMD (S_ANY, 0, "DO REPEAT", cmd_do_repeat)
 DEF_CMD (S_ANY, 0, "END REPEAT", cmd_end_repeat)
 DEF_CMD (S_ANY, 0, "ECHO", cmd_echo)
@@ -188,7 +189,6 @@ UNIMPL_CMD ("CSTABULATE", "Tabulate complex samples")
 UNIMPL_CMD ("CTABLES", "Display complex samples")
 UNIMPL_CMD ("CURVEFIT", "Fit curve to line plot")
 UNIMPL_CMD ("DATE", "Create time series data")
-UNIMPL_CMD ("DEFINE", "Syntax macros")
 UNIMPL_CMD ("DETECTANOMALY", "Find unusual cases")
 UNIMPL_CMD ("DISCRIMINANT", "Linear discriminant analysis")
 UNIMPL_CMD ("EDIT", "obsolete")
index 94437d9dd8af5b6967fa5235d4765d5af3467c18..cae523cb37efbe83fc71ba8bbba1aa28b735309f 100644 (file)
@@ -324,6 +324,7 @@ scan_punct1__ (char c0)
     case '<': return T_LT;
     case '>': return T_GT;
     case '~': return T_NOT;
+    default: return T_MACRO_PUNCT;
     }
 
   NOT_REACHED ();
@@ -467,6 +468,8 @@ scan_start__ (struct scanner *scanner, enum segment_type type,
       else
         {
           token->type = scan_punct__ (s);
+          if (token->type == T_MACRO_PUNCT)
+            ss_alloc_substring (&token->string, s);
           return SCAN_DONE;
         }
 
index cfe3de522fca5c912d8d5da8ec37d75d1c3fcbf6..5f7fc01310d4241288b39d038ceb1294c25c46f6 100644 (file)
@@ -1015,6 +1015,12 @@ segmenter_parse_mid_command__ (struct segmenter *s,
         }
       else if (lex_uc_is_id1 (uc))
         return segmenter_parse_id__ (s, input, n, eof, type);
+      else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
+        {
+          *type = SEG_PUNCT;
+          s->substate = 0;
+          return 1;
+        }
       else
         {
           *type = SEG_UNEXPECTED_CHAR;
index f6bc6a1d22166a8f7dfc118f400bcdbd5f8ac88d..98fb72f14e84633f480f87b761913f73d3d7d876 100644 (file)
@@ -143,6 +143,7 @@ token_to_string (const struct token *token)
 
     case T_ID:
     case T_MACRO_ID:
+    case T_MACRO_PUNCT:
       return ss_xstrdup (token->string);
 
     case T_STRING:
index d499f0922f7cd2e20cd35fc2fb9b579d7d2d98c5..8438bfb26db62c026842980d0f93bd4dac847ecb 100644 (file)
@@ -46,7 +46,7 @@ u'110000'
 'foo
 'very long unterminated string that be ellipsized in its error message
 1e .x
-`
+^
 �
 ])
 AT_CHECK([pspp -O format=csv lexer.sps], [1], [dnl
@@ -72,7 +72,7 @@ lexer.sps:9.4: error: Syntax error at `.': Unexpected `.' in middle of command.
 
 lexer.sps:9: error: Unknown command `x'.
 
-lexer.sps:10.1: error: Syntax error at ``': Bad character ``' in input.
+lexer.sps:10.1: error: Syntax error at `^': Bad character `^' in input.
 
 lexer.sps:11.1: error: Syntax error at `�': Bad character U+FFFD in input.
 ])
index 30ee16ad9b92c1c16a900971dd31ee0a8d5a400a..b442958d5e35fe5111e737029a5660c9a9472843 100644 (file)
@@ -52,7 +52,7 @@ SKIP
 UNEXPECTED_DOT
 ID "x"
 SKIP
-UNEXPECTED_CHAR 95
+MACRO_PUNCT "_"
 ID "z"
 ENDCMD
 SKIP
@@ -187,6 +187,7 @@ AT_KEYWORDS([scan])
 AT_DATA([input], [dnl
 ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
 ~&|=>=><=<~=<>(),-+*/[[]]**
+% : ; ? _ ` { } ~
 ])
 AT_DATA([expout-base], [dnl
 NOT
@@ -249,6 +250,24 @@ SLASH
 LBRACK
 RBRACK
 EXP
+SKIP
+MACRO_PUNCT "%"
+SKIP
+MACRO_PUNCT ":"
+SKIP
+MACRO_PUNCT ";"
+SKIP
+MACRO_PUNCT "?"
+SKIP
+MACRO_PUNCT "_"
+SKIP
+MACRO_PUNCT "`"
+SKIP
+MACRO_PUNCT "{"
+SKIP
+MACRO_PUNCT "}"
+SKIP
+NOT
 -SKIP
 STOP
 ])
index 414318cb5a42120fe9be21ae13705b30bbc217e9..b55216cfefaf54b05a1e913c7d788b0f2a7bb580 100644 (file)
@@ -107,7 +107,7 @@ start_command   .
 identifier      x    space
 number          1
 identifier      y    space
-unexpected_char \_
+punct           \_
 identifier      z
 -newline         \n (later)
 -
@@ -291,6 +291,7 @@ AT_KEYWORDS([segment])
 AT_DATA([input], [dnl
 ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
 ~&|=>=><=<~=<>(),-+*/[[]]**
+% : ; ? _ ` { } ~
 ])
 AT_DATA([expout-base], [dnl
 punct           ~    space
@@ -335,6 +336,17 @@ punct           /
 punct           [[
 punct           ]]
 punct           **
+newline         \n (later)
+
+punct           %    space
+punct           :    space
+punct           ;    space
+punct           ?    space
+punct           \_    space
+punct           `    space
+punct           {    space
+punct           }    space
+punct           ~
 -newline         \n (later)
 -
 end