scan: Get rid of scan token types in favor of new scan result state.

author Ben Pfaff <blp@cs.stanford.edu>

Mon, 5 Jul 2021 22:15:45 +0000 (15:15 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Mon, 5 Jul 2021 23:24:28 +0000 (16:24 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 5 Jul 2021 22:15:45 +0000 (15:15 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 5 Jul 2021 23:24:28 +0000 (16:24 -0700)
diff --git a/src/language/control/define.c b/src/language/control/define.c

index 3a7f535c86d46917ded79962fc7ec44896b6200f..23a58fe8f931e5e01e44f930f712055ae3e89112 100644 (file)
--- a/src/language/control/define.c
+++ b/src/language/control/define.c
@@ -65,8 +65,8 @@ parse_quoted_token (struct lexer *lexer, struct token *token)
    struct string_lexer slex;
    string_lexer_init (&slex, s.string, s.length, SEG_MODE_INTERACTIVE, true);
    struct token another_token = { .type = T_STOP };
-  if (!string_lexer_next (&slex, token)
-      || string_lexer_next (&slex, &another_token))
+  if (string_lexer_next (&slex, token) != SLR_TOKEN
+      || string_lexer_next (&slex, &another_token) != SLR_END)
      {
        token_uninit (token);
        token_uninit (&another_token);
diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c

index 9753024cc3ef6330f8388538ea4ad1dbc4f1a532..f8b5a840821d6046a6881c1af27e4727a6c0797f 100644 (file)
--- a/src/language/lexer/lexer.c
+++ b/src/language/lexer/lexer.c
@@ -1081,13 +1081,12 @@ lex_match_phrase (struct lexer *lexer, const char *s)
    i = 0;
    string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
    while (string_lexer_next (&slex, &token))
-    if (token.type != SCAN_SKIP)
-      {
-        bool match = lex_tokens_match (lex_next (lexer, i++), &token);
-        token_uninit (&token);
-        if (!match)
-          return false;
-      }
+    {
+      bool match = lex_tokens_match (lex_next (lexer, i++), &token);
+      token_uninit (&token);
+      if (!match)
+        return false;
+    }
  
    while (i-- > 0)
      lex_get (lexer);
@@ -1667,6 +1666,7 @@ lex_source_try_get__ (struct lex_source *src)
  
    /* Extract segments and pass them through the scanner until we obtain a
       token. */
+  enum scan_result result;
    for (;;)
      {
        /* Extract a segment. */
@@ -1693,9 +1693,8 @@ lex_source_try_get__ (struct lex_source *src)
          }
  
        /* Pass the segment into the scanner and try to get a token out. */
-      enum scan_result result = scanner_push (&scanner, type,
-                                              ss_buffer (segment, seg_len),
-                                              &token->token);
+      result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
+                             &token->token);
        if (result == SCAN_SAVE)
          saved = state;
        else if (result == SCAN_BACK)
@@ -1703,7 +1702,9 @@ lex_source_try_get__ (struct lex_source *src)
            state = saved;
            break;
          }
-      else if (result == SCAN_DONE)
+      else if (result == SCAN_DONE
+               || result == SCAN_EMPTY
+               || result == SCAN_ERROR)
          break;
      }
  
@@ -1757,37 +1758,24 @@ lex_source_try_get__ (struct lex_source *src)
    src->line_pos = state.line_pos;
    src->n_newlines += state.newlines;
  
-  switch (token->token.type)
+  if (result == SCAN_EMPTY)
+    {
+      lex_source_pop_front (src);
+      return false;
+    }
+  else if (result == SCAN_ERROR)
+    {
+      lex_get_error (src, token->token.string.string);
+      return false;
+    }
+  else if (token->token.type == T_STOP)
      {
-    default:
-      return true;
-
-    case T_STOP:
        token->token.type = T_ENDCMD;
        src->eof = true;
        return true;
-
-    case SCAN_BAD_HEX_LENGTH:
-    case SCAN_BAD_HEX_DIGIT:
-    case SCAN_BAD_UNICODE_DIGIT:
-    case SCAN_BAD_UNICODE_LENGTH:
-    case SCAN_BAD_UNICODE_CODE_POINT:
-    case SCAN_EXPECTED_QUOTE:
-    case SCAN_EXPECTED_EXPONENT:
-    case SCAN_UNEXPECTED_CHAR:
-     {
-      char *msg = scan_token_to_error (&token->token);
-      lex_get_error (src, msg);
-      free (msg);
-      return false;
-     }
-
-    case SCAN_SKIP:
-      lex_source_pop_front (src);
-      return false;
      }
-
-  NOT_REACHED ();
+  else
+    return true;
  }
  
  /* Attempts to add a new token at the front of SRC.  Returns true if
diff --git a/src/language/lexer/macro.c b/src/language/lexer/macro.c

index 6b5d62430195b745a7e29e19eca99b3ef4e206ef..b5cbf2dfc6cc0bb4142fa163a2ecbd522a560204 100644 (file)
--- a/src/language/lexer/macro.c
+++ b/src/language/lexer/macro.c
@@ -248,6 +248,7 @@ macro_tokens_from_string__ (struct macro_tokens *mts, const struct substring src
        struct scanner scanner;
        scanner_init (&scanner, token);
  
+      enum scan_result result;
        for (;;)
          {
            enum segment_type type;
@@ -258,37 +259,40 @@ macro_tokens_from_string__ (struct macro_tokens *mts, const struct substring src
            struct substring segment = ss_head (state.body, seg_len);
            ss_advance (&state.body, seg_len);
  
-          enum scan_result result = scanner_push (&scanner, type, segment, token);
+          result = scanner_push (&scanner, type, segment, token);
            if (result == SCAN_SAVE)
              saved = state;
-          else if (result == SCAN_BACK)
-            {
-              state = saved;
-              break;
-            }
-          else if (result == SCAN_DONE)
+          else if (result != SCAN_MORE)
              break;
          }
  
-      /* We have a token in 'token'. */
-      mt.syntax.length = state.body.string - mt.syntax.string;
-      if (is_scan_type (token->type))
+
+      switch (result)
          {
-          if (token->type != SCAN_SKIP)
-            {
-              char *s = scan_token_to_error (token);
-              if (stack)
-                {
-                  mt.token.type = T_STRING;
-                  macro_error (stack, &mt, "%s", s);
-                }
-              else
-                msg (SE, "%s", s);
-              free (s);
-            }
+        case SCAN_BACK:
+          state = saved;
+          /* Fall through. */
+        case SCAN_DONE:
+          mt.syntax.length = state.body.string - mt.syntax.string;
+          macro_tokens_add (mts, &mt);
+          break;
+
+        case SCAN_EMPTY:
+          break;
+
+        case SCAN_ERROR:
+          mt.syntax.length = state.body.string - mt.syntax.string;
+          if (stack)
+            macro_error (stack, &mt, "%s", token->string.string);
+          else
+            msg (SE, "%s", token->string.string);
+          break;
+
+        case SCAN_MORE:
+        case SCAN_SAVE:
+          NOT_REACHED ();
          }
-      else
-        macro_tokens_add (mts, &mt);
+
        token_uninit (token);
      }
  }
@@ -1016,17 +1020,15 @@ unquote_string (const char *s, enum segmenter_mode segmenter_mode,
    string_lexer_init (&slex, s, strlen (s), segmenter_mode, true);
  
    struct token token1;
-  if (!string_lexer_next (&slex, &token1))
-    return false;
-
-  if (token1.type != T_STRING)
+  if (string_lexer_next (&slex, &token1) != SLR_TOKEN
+      || token1.type != T_STRING)
      {
        token_uninit (&token1);
        return false;
      }
  
    struct token token2;
-  if (string_lexer_next (&slex, &token2))
+  if (string_lexer_next (&slex, &token2) != SLR_END)
      {
        token_uninit (&token1);
        token_uninit (&token2);
diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c

index 3b9e3c5a2287e2b85eba873ac131ffa9a310cc72..a2f821c05daaeb7957352775eb23712b1fa9d018 100644 (file)
--- a/src/language/lexer/scan.c
+++ b/src/language/lexer/scan.c
@@ -71,7 +71,7 @@ digit_value (int c)
      }
  }
  
-static bool
+static char *
  scan_quoted_string__ (struct substring s, struct token *token)
  {
    int quote;
@@ -97,118 +97,89 @@ scan_quoted_string__ (struct substring s, struct token *token)
    memcpy (ss_end (token->string), s.string, ss_length (s));
    token->string.length += ss_length (s);
  
-  return true;
+  return NULL;
  }
  
-static bool
+static char *
  scan_hex_string__ (struct substring s, struct token *token)
  {
-  uint8_t *dst;
-  size_t i;
-
    /* Trim X' from front and ' from back. */
    s.string += 2;
    s.length -= 3;
  
    if (s.length % 2 != 0)
-    {
-      token->type = SCAN_BAD_HEX_LENGTH;
-      token->number = s.length;
-      return false;
-    }
+    return xasprintf (_("String of hex digits has %zu characters, which "
+                        "is not a multiple of 2."), s.length);
  
    ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
-  dst = CHAR_CAST (uint8_t *, ss_end (token->string));
+  uint8_t *dst = CHAR_CAST (uint8_t *, ss_end (token->string));
    token->string.length += s.length / 2;
-  for (i = 0; i < s.length; i += 2)
+  for (size_t i = 0; i < s.length; i += 2)
      {
        int hi = digit_value (s.string[i]);
        int lo = digit_value (s.string[i + 1]);
  
        if (hi >= 16 || lo >= 16)
-        {
-          token->type = SCAN_BAD_HEX_DIGIT;
-          token->number = s.string[hi >= 16 ? i : i + 1];
-          return false;
-        }
+        return xasprintf (_("`%c' is not a valid hex digit."),
+                          s.string[hi >= 16 ? i : i + 1]);
  
        *dst++ = hi * 16 + lo;
      }
  
-  return true;
+  return NULL;
  }
  
-static bool
+static char *
  scan_unicode_string__ (struct substring s, struct token *token)
  {
-  uint8_t *dst;
-  ucs4_t uc;
-  size_t i;
-
    /* Trim U' from front and ' from back. */
    s.string += 2;
    s.length -= 3;
  
    if (s.length < 1 || s.length > 8)
-    {
-      token->type = SCAN_BAD_UNICODE_LENGTH;
-      token->number = s.length;
-      return 0;
-    }
+    return xasprintf (_("Unicode string contains %zu bytes, which is "
+                        "not in the valid range of 1 to 8 bytes."),
+                      s.length);
  
    ss_realloc (&token->string, token->string.length + 4 + 1);
  
-  uc = 0;
-  for (i = 0; i < s.length; i++)
+  ucs4_t uc = 0;
+  for (size_t i = 0; i < s.length; i++)
      {
        int digit = digit_value (s.string[i]);
        if (digit >= 16)
-        {
-          token->type = SCAN_BAD_UNICODE_DIGIT;
-          token->number = s.string[i];
-          return 0;
-        }
+        return xasprintf (_("`%c' is not a valid hex digit."),
+                          s.string[i]);
        uc = uc * 16 + digit;
      }
  
    if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
-    {
-      token->type = SCAN_BAD_UNICODE_CODE_POINT;
-      token->number = uc;
-      return 0;
-    }
+    return xasprintf (_("U+%04llX is not a valid Unicode code point."),
+                      (long long) uc);
  
-  dst = CHAR_CAST (uint8_t *, ss_end (token->string));
+  uint8_t *dst = CHAR_CAST (uint8_t *, ss_end (token->string));
    token->string.length += u8_uctomb (dst, uc, 4);
  
-  return true;
+  return NULL;
+}
+
+static enum scan_result
+scan_error__ (struct token *token, char *error)
+{
+  ss_dealloc (&token->string);
+  token->type = T_STRING;
+  token->string = ss_cstr (error);
+  return SCAN_ERROR;
  }
  
  static enum scan_result
  scan_string_segment__ (struct scanner *scanner, enum segment_type type,
                         struct substring s, struct token *token)
  {
-  bool ok;
-
-  switch (type)
-    {
-    case SEG_QUOTED_STRING:
-      ok = scan_quoted_string__ (s, token);
-      break;
-
-    case SEG_HEX_STRING:
-      ok = scan_hex_string__ (s, token);
-      break;
-
-    case SEG_UNICODE_STRING:
-      ok = scan_unicode_string__ (s, token);
-      break;
-
-    default:
-      NOT_REACHED ();
-    }
-
-  if (ok)
+  char *error = (type == SEG_QUOTED_STRING ? scan_quoted_string__ (s, token)
+                 : type == SEG_HEX_STRING ? scan_hex_string__ (s, token)
+                 : scan_unicode_string__ (s, token));
+  if (!error)
      {
        token->type = T_STRING;
        token->string.string[token->string.length] = '\0';
@@ -217,14 +188,7 @@ scan_string_segment__ (struct scanner *scanner, enum segment_type type,
        return SCAN_SAVE;
      }
    else
-    {
-      /* The function we called above should have filled in token->type and
-         token->number properly to describe the error. */
-      ss_dealloc (&token->string);
-      token->string = ss_empty ();
-      return SCAN_DONE;
-    }
-
+    return scan_error__ (token, error);
  }
  
  static enum scan_result
@@ -397,76 +361,11 @@ static enum scan_result
  scan_unexpected_char (const struct substring *s, struct token *token)
  {
    ucs4_t uc;
-
-  token->type = SCAN_UNEXPECTED_CHAR;
    u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
-  token->number = uc;
-
-  return SCAN_DONE;
-}
-
-const char *
-scan_type_to_string (enum scan_type type)
-{
-  switch (type)
-    {
-#define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
-      SCAN_TYPES
-#undef SCAN_TYPE
-
-    default:
-      return token_type_to_name ((enum token_type) type);
-    }
-}
-
-bool
-is_scan_type (enum scan_type type)
-{
-  return type > SCAN_FIRST && type < SCAN_LAST;
-}
-
-/* If TOKEN has the type of a scan error (a subset of those identified by
-   is_scan_type()), returns an appropriate error message.  Otherwise, returns
-   NULL. */
-char *
-scan_token_to_error (const struct token *token)
-{
-  switch (token->type)
-    {
-    case SCAN_BAD_HEX_LENGTH:
-      return xasprintf (_("String of hex digits has %d characters, which "
-                          "is not a multiple of 2."), (int) token->number);
-
-    case SCAN_BAD_HEX_DIGIT:
-    case SCAN_BAD_UNICODE_DIGIT:
-      return xasprintf (_("`%c' is not a valid hex digit."),
-                        (int) token->number);
-
-    case SCAN_BAD_UNICODE_LENGTH:
-      return xasprintf (_("Unicode string contains %d bytes, which is "
-                          "not in the valid range of 1 to 8 bytes."),
-                        (int) token->number);
-
-    case SCAN_BAD_UNICODE_CODE_POINT:
-      return xasprintf (_("U+%04X is not a valid Unicode code point."),
-                        (int) token->number);
-
-    case SCAN_EXPECTED_QUOTE:
-      return xasprintf (_("Unterminated string constant."));
-
-    case SCAN_EXPECTED_EXPONENT:
-      return xasprintf (_("Missing exponent following `%s'."),
-                        token->string.string);
-
-    case SCAN_UNEXPECTED_CHAR:
-     {
-      char c_name[16];
-      return xasprintf (_("Bad character %s in input."),
-                        uc_name (token->number, c_name));
-     }
-    }
  
-  return NULL;
+  char c_name[16];
+  return scan_error__ (token, xasprintf (_("Bad character %s in input."),
+                                         uc_name (uc, c_name)));
  }
  
  static enum scan_result
@@ -527,8 +426,7 @@ scan_start__ (struct scanner *scanner, enum segment_type type,
      case SEG_COMMENT:
      case SEG_NEWLINE:
      case SEG_COMMENT_COMMAND:
-      token->type = SCAN_SKIP;
-      return SCAN_DONE;
+      return SCAN_EMPTY;
  
      case SEG_START_DOCUMENT:
        token->type = T_ID;
@@ -546,13 +444,13 @@ scan_start__ (struct scanner *scanner, enum segment_type type,
        return SCAN_DONE;
  
      case SEG_EXPECTED_QUOTE:
-      token->type = SCAN_EXPECTED_QUOTE;
-      return SCAN_DONE;
+      return scan_error__ (token,
+                           xasprintf (_("Unterminated string constant.")));
  
      case SEG_EXPECTED_EXPONENT:
-      token->type = SCAN_EXPECTED_EXPONENT;
-      ss_alloc_substring (&token->string, s);
-      return SCAN_DONE;
+      return scan_error__ (token,
+                           xasprintf (_("Missing exponent following `%.*s'."),
+                                      (int) s.length, s.string));
  
      case SEG_UNEXPECTED_CHAR:
        return scan_unexpected_char (&s, token);
@@ -626,6 +524,14 @@ scanner_init (struct scanner *scanner, struct token *token)
         the segments up to and including the segment for which SCAN_SAVE was
         most recently returned.  Segments following that one should be passed to
         the next scanner to be initialized.
+
+     - SCAN_EMPTY: This is similar to SCAN_DONE, but there's no token because
+       the scanner consumed white space or comments or other syntax that
+       doesn't produce a token.
+
+     - SCAN_ERROR: This is similar to SCAN_DONE, but the token is a T_STRING
+       that describes some lexical error.  The caller should report the error
+       and discard the token.
  */
  enum scan_result
  scanner_push (struct scanner *scanner, enum segment_type type,
@@ -664,14 +570,14 @@ string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
  }
  
  /*  */
-bool
+enum string_lexer_result
  string_lexer_next (struct string_lexer *slex, struct token *token)
  {
    struct segmenter saved_segmenter;
    size_t saved_offset = 0;
  
    struct scanner scanner;
-
+next:
    scanner_init (&scanner, token);
    for (;;)
      {
@@ -691,7 +597,7 @@ string_lexer_next (struct string_lexer *slex, struct token *token)
            slex->offset = saved_offset;
            /* Fall through. */
          case SCAN_DONE:
-          return token->type != T_STOP;
+          return token->type == T_STOP ? SLR_END : SLR_TOKEN;
  
          case SCAN_MORE:
            break;
@@ -700,6 +606,12 @@ string_lexer_next (struct string_lexer *slex, struct token *token)
            saved_segmenter = slex->segmenter;
            saved_offset = slex->offset;
            break;
+
+        case SCAN_ERROR:
+          return SLR_ERROR;
+
+        case SCAN_EMPTY:
+          goto next;
          }
      }
  }
diff --git a/src/language/lexer/scan.h b/src/language/lexer/scan.h

index 0dde2738049d6d8fdbe2f4bcdb6f5a2bf7b8a3e2..bd76e3ffda0d0ca5aa12bad80aa01bb4867bf62a 100644 (file)
--- a/src/language/lexer/scan.h
+++ b/src/language/lexer/scan.h
@@ -35,39 +35,6 @@ struct token;
     types.
  */
  
-#define SCAN_TYPES                              \
-    SCAN_TYPE(BAD_HEX_LENGTH)                   \
-    SCAN_TYPE(BAD_HEX_DIGIT)                    \
-                                                \
-    SCAN_TYPE(BAD_UNICODE_LENGTH)               \
-    SCAN_TYPE(BAD_UNICODE_DIGIT)                \
-    SCAN_TYPE(BAD_UNICODE_CODE_POINT)           \
-                                                \
-    SCAN_TYPE(EXPECTED_QUOTE)                   \
-    SCAN_TYPE(EXPECTED_EXPONENT)                \
-    SCAN_TYPE(UNEXPECTED_CHAR)                  \
-                                                \
-    SCAN_TYPE(SKIP)
-
-/* Types of scan tokens.
-
-   Scan token types are a superset of enum token_type.  Only the additional
-   scan token types are defined here, so see the definition of enum token_type
-   for the others. */
-enum scan_type
-  {
-#define SCAN_TYPE(TYPE) SCAN_##TYPE,
-    SCAN_FIRST = 255,
-    SCAN_TYPES
-    SCAN_LAST
-#undef SCAN_TYPE
-  };
-
-const char *scan_type_to_string (enum scan_type);
-bool is_scan_type (enum scan_type);
-
-char *scan_token_to_error (const struct token *);
-
  /* A scanner.  Opaque. */
  struct scanner
    {
@@ -80,6 +47,8 @@ enum scan_result
    {
      /* Complete token. */
      SCAN_DONE,                  /* Token successfully scanned. */
+    SCAN_EMPTY,                 /* This segment does not produce any token. */
+    SCAN_ERROR,                 /* This segment yields an error message. */
      SCAN_MORE,                  /* More segments needed to scan token. */
  
      /* Incomplete token. */
@@ -101,8 +70,16 @@ struct string_lexer
      struct segmenter segmenter;
    };
  
+enum string_lexer_result
+  {
+    SLR_END,
+    SLR_TOKEN,
+    SLR_ERROR
+  };
+
  void string_lexer_init (struct string_lexer *, const char *input,
                          size_t length, enum segmenter_mode, bool is_snippet);
-bool string_lexer_next (struct string_lexer *, struct token *);
+enum string_lexer_result string_lexer_next (struct string_lexer *,
+                                            struct token *);
  
  #endif /* scan.h */
diff --git a/src/language/lexer/token.h b/src/language/lexer/token.h

index dca1452c6a32aed957ce98e8e9c129fc254fff32..8ec28f3714ba54046f10c910e6276b2088df6eef 100644 (file)
--- a/src/language/lexer/token.h
+++ b/src/language/lexer/token.h
@@ -23,13 +23,10 @@
  #include "libpspp/str.h"
  #include "data/identifier.h"
  
-/* A PSPP syntax token.
-
-   The 'type' member is used by the scanner (see scan.h) for SCAN_* values as
-   well, which is why it is not declared as type "enum token_type". */
+/* A PSPP syntax token. */
  struct token
    {
-    int type;                   /* Usually a "enum token_type" value. */
+    enum token_type type;
      double number;
      struct substring string;
    };
diff --git a/tests/language/lexer/scan-test.c b/tests/language/lexer/scan-test.c

index 2a77e127ace3405ce6770b31aaf0cd7544d3db9d..53163bcad6383f8c51624f9c42e06881c7d2ac0f 100644 (file)
--- a/tests/language/lexer/scan-test.c
+++ b/tests/language/lexer/scan-test.c
@@ -54,7 +54,6 @@ main (int argc, char *argv[])
    char *input;
  
    struct string_lexer slex;
-  bool more;
  
    set_program_name (argv[0]);
    file_name = parse_options (argc, argv);
@@ -74,13 +73,13 @@ main (int argc, char *argv[])
      }
  
    string_lexer_init (&slex, input, length, mode, false);
+  enum string_lexer_result result;
    do
      {
        struct token token;
+      result = string_lexer_next (&slex, &token);
  
-      more = string_lexer_next (&slex, &token);
-
-      printf ("%s", scan_type_to_string (token.type));
+      printf ("%s", result == SLR_ERROR ? "error" : token_type_to_name (token.type));
        if (token.number != 0.0)
          {
            double x = token.number;
@@ -96,7 +95,7 @@ main (int argc, char *argv[])
  
        token_uninit (&token);
      }
-  while (more);
+  while (result != SLR_END);
  
    free (input);
  
diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at

index 146b891e1c728150e7b1d188e93a5d238fff710e..56711cbac0e418f83ad6eaa535c9a257b4d32517 100644 (file)
--- a/tests/language/lexer/scan.at
+++ b/tests/language/lexer/scan.at
@@ -34,52 +34,29 @@ WXYZ. /* unterminated end of line comment
  ])
  AT_DATA([expout-base], [dnl
  ID "a"
-SKIP
  ID "aB"
-SKIP
  ID "i5"
-SKIP
  ID "$x"
-SKIP
  ID "@efg"
-SKIP
  ID "@@."
-SKIP
  MACRO_ID "!abcd"
-SKIP
  ID "#.#"
-SKIP
  MACRO_PUNCT "."
  ID "x"
-SKIP
  MACRO_PUNCT "_"
  ID "z"
  ENDCMD
-SKIP
  ID "abcd."
-SKIP
  ID "abcd"
  ENDCMD
-SKIP
  ID "QRSTUV"
  ENDCMD
-SKIP
-SKIP
  ID "QrStUv"
  ENDCMD
-SKIP
-SKIP
-SKIP
  ID "WXYZ"
  ENDCMD
-SKIP
-SKIP
-SKIP
-UNEXPECTED_CHAR 65533
+error "Bad character U+FFFD in input."
  ENDCMD
-SKIP
-SKIP
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -95,88 +72,47 @@ and. with.
  ])
  AT_DATA([expout-base], [dnl
  AND
-SKIP
  OR
-SKIP
  NOT
-SKIP
  EQ
-SKIP
  GE
-SKIP
  GT
-SKIP
  LE
-SKIP
  LT
-SKIP
  NE
-SKIP
  ALL
-SKIP
  BY
-SKIP
  TO
-SKIP
  WITH
-SKIP
  AND
-SKIP
  OR
-SKIP
  NOT
-SKIP
  EQ
-SKIP
  GE
-SKIP
  GT
-SKIP
  LE
-SKIP
  LT
-SKIP
  NE
-SKIP
  ALL
-SKIP
  BY
-SKIP
  TO
-SKIP
  WITH
-SKIP
  ID "andx"
-SKIP
  ID "orx"
-SKIP
  ID "notx"
-SKIP
  ID "eqx"
-SKIP
  ID "gex"
-SKIP
  ID "gtx"
-SKIP
  ID "lex"
-SKIP
  ID "ltx"
-SKIP
  ID "nex"
-SKIP
  ID "allx"
-SKIP
  ID "byx"
-SKIP
  ID "tox"
-SKIP
  ID "withx"
-SKIP
  ID "and."
-SKIP
  WITH
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -191,45 +127,25 @@ AT_DATA([input], [dnl
  ])
  AT_DATA([expout-base], [dnl
  NOT
-SKIP
  AND
-SKIP
  OR
-SKIP
  EQUALS
-SKIP
  GE
-SKIP
  GT
-SKIP
  LE
-SKIP
  LT
-SKIP
  NE
-SKIP
  NE
-SKIP
  LPAREN
-SKIP
  RPAREN
-SKIP
  COMMA
-SKIP
  DASH
-SKIP
  PLUS
-SKIP
  ASTERISK
-SKIP
  SLASH
-SKIP
  LBRACK
-SKIP
  RBRACK
-SKIP
  EXP
-SKIP
  NOT
  AND
  OR
@@ -250,25 +166,15 @@ SLASH
  LBRACK
  RBRACK
  EXP
-SKIP
  MACRO_PUNCT "%"
-SKIP
  MACRO_PUNCT ":"
-SKIP
  MACRO_PUNCT ";"
-SKIP
  MACRO_PUNCT "?"
-SKIP
  MACRO_PUNCT "_"
-SKIP
  MACRO_PUNCT "`"
-SKIP
  MACRO_PUNCT "{"
-SKIP
  MACRO_PUNCT "}"
-SKIP
  NOT
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -287,73 +193,39 @@ AT_DATA([input], [dnl
  ])
  AT_DATA([expout-base], [dnl
  POS_NUM
-SKIP
  POS_NUM 1
-SKIP
  POS_NUM 1
-SKIP
  POS_NUM 1
-SKIP
  POS_NUM 1
  ENDCMD
-SKIP
  POS_NUM 123
  ENDCMD
-SKIP
-SKIP
-SKIP
-SKIP
-SKIP
  ENDCMD
  POS_NUM 1
-SKIP
  POS_NUM 0.1
-SKIP
  POS_NUM 0.1
-SKIP
  POS_NUM 0.1
-SKIP
  POS_NUM 50
-SKIP
  POS_NUM 0.6
-SKIP
  POS_NUM 70
-SKIP
  POS_NUM 60
-SKIP
  POS_NUM 0.006
-SKIP
  ENDCMD
  POS_NUM 30
-SKIP
  POS_NUM 0.04
-SKIP
  POS_NUM 5
-SKIP
  POS_NUM 6
-SKIP
  POS_NUM 0.0007
-SKIP
  POS_NUM 12.3
-SKIP
  POS_NUM 4.56
-SKIP
  POS_NUM 789
-SKIP
  POS_NUM 999
-SKIP
  POS_NUM 0.0112
-SKIP
  ENDCMD
-SKIP
-EXPECTED_EXPONENT "1e"
-SKIP
+error "Missing exponent following `1e'."
  ID "e1"
-SKIP
-EXPECTED_EXPONENT "1e+"
-SKIP
-EXPECTED_EXPONENT "1e-"
--SKIP
+error "Missing exponent following `1e+'."
+error "Missing exponent following `1e-'."
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -394,61 +266,33 @@ x"4142"
  ])
  AT_DATA([expout-base], [dnl
  STRING "x"
-SKIP
  STRING "y"
-SKIP
  STRING "abc"
-SKIP
  STRING "Don't"
-SKIP
  STRING "Can't"
-SKIP
  STRING "Won't"
-SKIP
  STRING ""quoted""
-SKIP
  STRING ""quoted""
-SKIP
  STRING ""
-SKIP
  STRING ""
-SKIP
  STRING "'"
-SKIP
  STRING """
-SKIP
-EXPECTED_QUOTE
-SKIP
-EXPECTED_QUOTE
-SKIP
+error "Unterminated string constant."
+error "Unterminated string constant."
  STRING "xyzabcde"
-SKIP
  STRING "foobar"
-SKIP
  STRING "foobar"
-SKIP
  STRING "foo"
-SKIP
  PLUS
-SKIP
  ENDCMD
-SKIP
  STRING "bar"
-SKIP
  ENDCMD
-SKIP
  PLUS
-SKIP
  STRING "AB5152"
-SKIP
  STRING "4142QR"
-SKIP
  STRING "ABお"
-SKIP
  STRING "�あいうえお"
-SKIP
  STRING "abc�えxyz"
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -461,18 +305,14 @@ AT_DATA([input], [dnl
  #! /usr/bin/pspp
  ])
  AT_DATA([expout-base], [dnl
-SKIP
-SKIP
  ID "#"
  MACRO_ID "!"
-SKIP
  SLASH
  ID "usr"
  SLASH
  ID "bin"
  SLASH
  ID "pspp"
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -499,57 +339,27 @@ next command.
  
  ])
  AT_DATA([expout-base], [dnl
-SKIP
-SKIP
-SKIP
  ENDCMD
-SKIP
  ENDCMD
-SKIP
-SKIP
  ENDCMD
-SKIP
-SKIP
  ENDCMD
-SKIP
  ENDCMD
-SKIP
-SKIP
  ENDCMD
-SKIP
-SKIP
  ENDCMD
-SKIP
  ID "com"
-SKIP
  ID "is"
-SKIP
  ID "ambiguous"
-SKIP
  WITH
-SKIP
  ID "COMPUTE"
  ENDCMD
-SKIP
  ENDCMD
-SKIP
-SKIP
-SKIP
  ENDCMD
-SKIP
  ENDCMD
-SKIP
-SKIP
-SKIP
  ENDCMD
-SKIP
  ID "next"
-SKIP
  ID "command"
  ENDCMD
-SKIP
  -ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -574,31 +384,21 @@ ID "DOCUMENT"
  STRING "DOCUMENT one line."
  ENDCMD
  ENDCMD
-SKIP
  ID "DOCUMENT"
  STRING "DOC more"
-SKIP
  STRING "    than"
-SKIP
  STRING "        one"
-SKIP
  STRING "            line."
  ENDCMD
  ENDCMD
-SKIP
  ID "DOCUMENT"
  STRING "docu"
-SKIP
  STRING "first.paragraph"
-SKIP
  STRING "isn't parsed as tokens"
-SKIP
  STRING ""
-SKIP
  STRING "second paragraph."
  -ENDCMD
  -ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -616,32 +416,17 @@ FILE /*
  ])
  AT_DATA([expout-base], [dnl
  ID "FIL"
-SKIP
  ID "label"
-SKIP
  STRING "isn't quoted"
  ENDCMD
-SKIP
  ID "FILE"
-SKIP
-SKIP
  ID "lab"
-SKIP
  STRING "is quoted"
  ENDCMD
-SKIP
  ID "FILE"
-SKIP
-SKIP
-SKIP
-SKIP
-SKIP
  ID "lab"
-SKIP
  STRING "not quoted here either"
-SKIP
  -ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -664,41 +449,22 @@ end data
  ])
  AT_DATA([expout-base], [dnl
  ID "begin"
-SKIP
  ID "data"
  ENDCMD
-SKIP
  STRING "123"
-SKIP
  STRING "xxx"
-SKIP
  ID "end"
-SKIP
  ID "data"
  ENDCMD
-SKIP
  ENDCMD
-SKIP
  ID "BEG"
-SKIP
-SKIP
-SKIP
  ID "DAT"
-SKIP
-SKIP
-SKIP
  STRING "5 6 7 /* x"
-SKIP
  STRING ""
-SKIP
  STRING "end  data"
-SKIP
  ID "end"
-SKIP
  ID "data"
-SKIP
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -719,43 +485,26 @@ end
  ])
  AT_DATA([expout-base], [dnl
  ID "do"
-SKIP
  ID "repeat"
-SKIP
  ID "x"
  EQUALS
  ID "a"
-SKIP
  ID "b"
-SKIP
  ID "c"
-SKIP
-SKIP
  ID "y"
  EQUALS
  ID "d"
-SKIP
  ID "e"
-SKIP
  ID "f"
  ENDCMD
-SKIP
  STRING "  do repeat a=1 thru 5."
-SKIP
  STRING "another command."
-SKIP
  STRING "second command"
-SKIP
  STRING "+ third command."
-SKIP
  STRING "end /* x */ /* y */ repeat print."
-SKIP
  ID "end"
-SKIP
-SKIP
  ID "repeat"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -781,60 +530,35 @@ end repeat
  ])
  AT_DATA([expout-base], [dnl
  ID "do"
-SKIP
  ID "repeat"
-SKIP
  ID "x"
  EQUALS
  ID "a"
-SKIP
  ID "b"
-SKIP
  ID "c"
-SKIP
-SKIP
  ID "y"
  EQUALS
  ID "d"
-SKIP
  ID "e"
-SKIP
  ID "f"
-SKIP
  ENDCMD
  STRING "do repeat a=1 thru 5"
-SKIP
  STRING "another command"
-SKIP
  STRING "second command"
-SKIP
  STRING "+ third command"
-SKIP
  STRING "end /* x */ /* y */ repeat print"
-SKIP
  ID "end"
-SKIP
-SKIP
  ID "repeat"
-SKIP
  ENDCMD
  ID "do"
-SKIP
-SKIP
  ID "repeat"
-SKIP
  ID "#a"
  EQUALS
  POS_NUM 1
-SKIP
  ENDCMD
-SKIP
  STRING "  inner command"
-SKIP
  ID "end"
-SKIP
  ID "repeat"
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-b])
@@ -849,17 +573,12 @@ var1 var2 var3
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  RPAREN
-SKIP
-SKIP
  STRING "var1 var2 var3"
-SKIP
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -873,15 +592,12 @@ define !macro1() var1 var2 var3
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  RPAREN
  STRING " var1 var2 var3"
-SKIP
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -895,16 +611,12 @@ var1 var2 var3!enddefine.
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  RPAREN
-SKIP
-SKIP
  STRING "var1 var2 var3"
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -917,14 +629,12 @@ define !macro1()var1 var2 var3!enddefine.
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  RPAREN
  STRING "var1 var2 var3"
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -938,15 +648,11 @@ define !macro1()
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  RPAREN
-SKIP
-SKIP
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -962,19 +668,13 @@ define !macro1()
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  RPAREN
-SKIP
-SKIP
  STRING ""
-SKIP
  STRING ""
-SKIP
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -988,28 +688,22 @@ define !macro1(a(), b(), c())
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  ID "a"
  LPAREN
  RPAREN
  COMMA
-SKIP
  ID "b"
  LPAREN
  RPAREN
  COMMA
-SKIP
  ID "c"
  LPAREN
  RPAREN
  RPAREN
-SKIP
-SKIP
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -1027,34 +721,22 @@ define !macro1(
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
-SKIP
-SKIP
  ID "a"
  LPAREN
  RPAREN
  COMMA
-SKIP
  ID "b"
  LPAREN
-SKIP
-SKIP
  RPAREN
  COMMA
-SKIP
-SKIP
  ID "c"
  LPAREN
  RPAREN
-SKIP
  RPAREN
-SKIP
-SKIP
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -1072,26 +754,18 @@ content 2
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
-SKIP
  LPAREN
  ID "x"
  COMMA
  ID "y"
  COMMA
  ID "z"
-SKIP
  RPAREN
-SKIP
-SKIP
  STRING "content 1"
-SKIP
  STRING "content 2"
-SKIP
  MACRO_ID "!enddefine"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -1105,20 +779,14 @@ data list /x 1.
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  ENDCMD
-SKIP
  ID "data"
-SKIP
  ID "list"
-SKIP
  SLASH
  ID "x"
-SKIP
  POS_NUM 1
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -1133,22 +801,15 @@ data list /x 1.
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
-SKIP
  ID "x"
  ENDCMD
-SKIP
  ID "data"
-SKIP
  ID "list"
-SKIP
  SLASH
  ID "x"
-SKIP
  POS_NUM 1
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -1163,24 +824,17 @@ data list /x 1.
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  ENDCMD
-SKIP
  ID "x"
  ENDCMD
-SKIP
  ID "data"
-SKIP
  ID "list"
-SKIP
  SLASH
  ID "x"
-SKIP
  POS_NUM 1
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -1196,20 +850,14 @@ data list /x 1.
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  ENDCMD
-SKIP
  ID "data"
-SKIP
  ID "list"
-SKIP
  SLASH
  ID "x"
-SKIP
  POS_NUM 1
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -1224,16 +872,11 @@ content line 2
  ])
  AT_DATA([expout-base], [dnl
  ID "define"
-SKIP
  MACRO_ID "!macro1"
  LPAREN
  RPAREN
-SKIP
-SKIP
  STRING "content line 1"
-SKIP
  STRING "content line 2"
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -1252,44 +895,25 @@ fourth command.
  ])
  AT_DATA([expout-base], [dnl
  ID "first"
-SKIP
  ID "command"
-SKIP
-SKIP
  ID "another"
-SKIP
  ID "line"
-SKIP
  ID "of"
-SKIP
  ID "first"
-SKIP
  ID "command"
-SKIP
  ENDCMD
-SKIP
  ID "second"
-SKIP
  ID "command"
-SKIP
  ENDCMD
  ID "third"
-SKIP
  ID "command"
-SKIP
  ENDCMD
-SKIP
  ID "fourth"
-SKIP
  ID "command"
  ENDCMD
-SKIP
-SKIP
  ID "fifth"
-SKIP
  ID "command"
  ENDCMD
--SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-b])
author	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 5 Jul 2021 22:15:45 +0000 (15:15 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 5 Jul 2021 23:24:28 +0000 (16:24 -0700)
src/language/control/define.c		patch \| blob \| history
src/language/lexer/lexer.c		patch \| blob \| history
src/language/lexer/macro.c		patch \| blob \| history
src/language/lexer/scan.c		patch \| blob \| history
src/language/lexer/scan.h		patch \| blob \| history
src/language/lexer/token.h		patch \| blob \| history
tests/language/lexer/scan-test.c		patch \| blob \| history
tests/language/lexer/scan.at		patch \| blob \| history