lexer: Add support for macro punctuation.
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 22 Mar 2021 06:06:14 +0000 (23:06 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Fri, 9 Apr 2021 18:16:48 +0000 (11:16 -0700)
These punctuation symbols can be used to delimit macro arguments, even
though they aren't allowed anywhere else in the language.

src/data/identifier.c
src/data/identifier.h
src/language/command.def
src/language/lexer/scan.c
src/language/lexer/segment.c
src/language/lexer/token.c
tests/language/lexer/lexer.at
tests/language/lexer/scan.at
tests/language/lexer/segment.at

index c613734c94dc647ec4d2c1ca93e9e0dfb8232542..e3b33a3382a75953b82d4a28be7793c0fa36fc9e 100644 (file)
@@ -62,10 +62,11 @@ token_type_to_string (enum token_type token)
   switch (token)
     {
     case T_ID:
-    case T_MACRO_ID:
     case T_POS_NUM:
     case T_NEG_NUM:
     case T_STRING:
+    case T_MACRO_ID:
+    case T_MACRO_PUNCT:
     case T_STOP:
       return NULL;
 
index 85299979c034c5c2be05633d79704a7bf497f188..1fc63b4808f44e7e07ba3a92696f340f338d6922 100644 (file)
@@ -24,7 +24,6 @@
 
 #define TOKEN_TYPES                                                     \
     TOKEN_TYPE(ID)                  /* Identifier. */                   \
-    TOKEN_TYPE(MACRO_ID)            /* Identifier starting with '!'. */ \
     TOKEN_TYPE(POS_NUM)             /* Positive number. */              \
     TOKEN_TYPE(NEG_NUM)             /* Negative number. */              \
     TOKEN_TYPE(STRING)              /* Quoted string. */                \
@@ -37,7 +36,7 @@
     TOKEN_TYPE(SLASH)               /* / */                             \
     TOKEN_TYPE(EQUALS)              /* = */                             \
     TOKEN_TYPE(LPAREN)              /* (*/                              \
-    TOKEN_TYPE(RPAREN)              /*) */                              \
+    TOKEN_TYPE(RPAREN)              /* ) */                             \
     TOKEN_TYPE(LBRACK)              /* [ */                             \
     TOKEN_TYPE(RBRACK)              /* ] */                             \
     TOKEN_TYPE(COMMA)               /* , */                             \
     TOKEN_TYPE(TO)                  /* TO */                            \
     TOKEN_TYPE(WITH)                /* WITH */                          \
                                                                         \
-    TOKEN_TYPE(EXP)                 /* ** */
-
+    TOKEN_TYPE(EXP)                 /* ** */                            \
+                                                                        \
+    TOKEN_TYPE(MACRO_ID)            /* Identifier starting with '!'. */ \
+    TOKEN_TYPE(MACRO_PUNCT)         /* Miscellaneous punctuator. */
 /* Token types. */
 enum token_type
   {
index a97f9b83e70fd1c7e021188eb6a84107a6c04627..ff7cd8e84cf0629d6b6668c053315e58dcc53d58 100644 (file)
@@ -18,6 +18,7 @@
 DEF_CMD (S_ANY, F_ENHANCED, "CLOSE FILE HANDLE", cmd_close_file_handle)
 DEF_CMD (S_ANY, 0, "CACHE", cmd_cache)
 DEF_CMD (S_ANY, 0, "CD", cmd_cd)
+//DEF_CMD (S_ANY, 0, "DEFINE", cmd_define)
 DEF_CMD (S_ANY, 0, "DO REPEAT", cmd_do_repeat)
 DEF_CMD (S_ANY, 0, "END REPEAT", cmd_end_repeat)
 DEF_CMD (S_ANY, 0, "ECHO", cmd_echo)
@@ -188,7 +189,6 @@ UNIMPL_CMD ("CSTABULATE", "Tabulate complex samples")
 UNIMPL_CMD ("CTABLES", "Display complex samples")
 UNIMPL_CMD ("CURVEFIT", "Fit curve to line plot")
 UNIMPL_CMD ("DATE", "Create time series data")
-UNIMPL_CMD ("DEFINE", "Syntax macros")
 UNIMPL_CMD ("DETECTANOMALY", "Find unusual cases")
 UNIMPL_CMD ("DISCRIMINANT", "Linear discriminant analysis")
 UNIMPL_CMD ("EDIT", "obsolete")
index 94437d9dd8af5b6967fa5235d4765d5af3467c18..cae523cb37efbe83fc71ba8bbba1aa28b735309f 100644 (file)
@@ -324,6 +324,7 @@ scan_punct1__ (char c0)
     case '<': return T_LT;
     case '>': return T_GT;
     case '~': return T_NOT;
+    default: return T_MACRO_PUNCT;
     }
 
   NOT_REACHED ();
@@ -467,6 +468,8 @@ scan_start__ (struct scanner *scanner, enum segment_type type,
       else
         {
           token->type = scan_punct__ (s);
+          if (token->type == T_MACRO_PUNCT)
+            ss_alloc_substring (&token->string, s);
           return SCAN_DONE;
         }
 
index cfe3de522fca5c912d8d5da8ec37d75d1c3fcbf6..5f7fc01310d4241288b39d038ceb1294c25c46f6 100644 (file)
@@ -1015,6 +1015,12 @@ segmenter_parse_mid_command__ (struct segmenter *s,
         }
       else if (lex_uc_is_id1 (uc))
         return segmenter_parse_id__ (s, input, n, eof, type);
+      else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
+        {
+          *type = SEG_PUNCT;
+          s->substate = 0;
+          return 1;
+        }
       else
         {
           *type = SEG_UNEXPECTED_CHAR;
index f6bc6a1d22166a8f7dfc118f400bcdbd5f8ac88d..98fb72f14e84633f480f87b761913f73d3d7d876 100644 (file)
@@ -143,6 +143,7 @@ token_to_string (const struct token *token)
 
     case T_ID:
     case T_MACRO_ID:
+    case T_MACRO_PUNCT:
       return ss_xstrdup (token->string);
 
     case T_STRING:
index d499f0922f7cd2e20cd35fc2fb9b579d7d2d98c5..8438bfb26db62c026842980d0f93bd4dac847ecb 100644 (file)
@@ -46,7 +46,7 @@ u'110000'
 'foo
 'very long unterminated string that be ellipsized in its error message
 1e .x
-`
+^
 �
 ])
 AT_CHECK([pspp -O format=csv lexer.sps], [1], [dnl
@@ -72,7 +72,7 @@ lexer.sps:9.4: error: Syntax error at `.': Unexpected `.' in middle of command.
 
 lexer.sps:9: error: Unknown command `x'.
 
-lexer.sps:10.1: error: Syntax error at ``': Bad character ``' in input.
+lexer.sps:10.1: error: Syntax error at `^': Bad character `^' in input.
 
 lexer.sps:11.1: error: Syntax error at `�': Bad character U+FFFD in input.
 ])
index 30ee16ad9b92c1c16a900971dd31ee0a8d5a400a..b442958d5e35fe5111e737029a5660c9a9472843 100644 (file)
@@ -52,7 +52,7 @@ SKIP
 UNEXPECTED_DOT
 ID "x"
 SKIP
-UNEXPECTED_CHAR 95
+MACRO_PUNCT "_"
 ID "z"
 ENDCMD
 SKIP
@@ -187,6 +187,7 @@ AT_KEYWORDS([scan])
 AT_DATA([input], [dnl
 ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
 ~&|=>=><=<~=<>(),-+*/[[]]**
+% : ; ? _ ` { } ~
 ])
 AT_DATA([expout-base], [dnl
 NOT
@@ -249,6 +250,24 @@ SLASH
 LBRACK
 RBRACK
 EXP
+SKIP
+MACRO_PUNCT "%"
+SKIP
+MACRO_PUNCT ":"
+SKIP
+MACRO_PUNCT ";"
+SKIP
+MACRO_PUNCT "?"
+SKIP
+MACRO_PUNCT "_"
+SKIP
+MACRO_PUNCT "`"
+SKIP
+MACRO_PUNCT "{"
+SKIP
+MACRO_PUNCT "}"
+SKIP
+NOT
 -SKIP
 STOP
 ])
index 414318cb5a42120fe9be21ae13705b30bbc217e9..b55216cfefaf54b05a1e913c7d788b0f2a7bb580 100644 (file)
@@ -107,7 +107,7 @@ start_command   .
 identifier      x    space
 number          1
 identifier      y    space
-unexpected_char \_
+punct           \_
 identifier      z
 -newline         \n (later)
 -
@@ -291,6 +291,7 @@ AT_KEYWORDS([segment])
 AT_DATA([input], [dnl
 ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
 ~&|=>=><=<~=<>(),-+*/[[]]**
+% : ; ? _ ` { } ~
 ])
 AT_DATA([expout-base], [dnl
 punct           ~    space
@@ -335,6 +336,17 @@ punct           /
 punct           [[
 punct           ]]
 punct           **
+newline         \n (later)
+
+punct           %    space
+punct           :    space
+punct           ;    space
+punct           ?    space
+punct           \_    space
+punct           `    space
+punct           {    space
+punct           }    space
+punct           ~
 -newline         \n (later)
 -
 end