lexer: Add support for embedded \0 bytes and missing trailing new-line.

author Ben Pfaff <blp@cs.stanford.edu>

Mon, 24 Sep 2018 03:42:07 +0000 (20:42 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Mon, 24 Sep 2018 05:51:31 +0000 (22:51 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 24 Sep 2018 03:42:07 +0000 (20:42 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 24 Sep 2018 05:51:31 +0000 (22:51 -0700)
diff --git a/src/language/control/repeat.c b/src/language/control/repeat.c

index 316ac8fc06db6f5bd85080b2402c7b13da11e4e5..c73d02827149290d9af9dd05dbf5992000e2600b 100644 (file)
--- a/src/language/control/repeat.c
+++ b/src/language/control/repeat.c
@@ -210,7 +210,7 @@ do_parse_commands (struct substring s, enum segmenter_mode mode,
        enum segment_type type;
        int n;
  
-      n = segmenter_push (&segmenter, s.string, s.length, &type);
+      n = segmenter_push (&segmenter, s.string, s.length, true, &type);
        assert (n >= 0);
  
        if (type == SEG_DO_REPEAT_COMMAND)
@@ -220,7 +220,7 @@ do_parse_commands (struct substring s, enum segmenter_mode mode,
                int k;
  
                k = segmenter_push (&segmenter, s.string + n, s.length - n,
-                                  &type);
+                                  true, &type);
                if (type != SEG_NEWLINE && type != SEG_DO_REPEAT_COMMAND)
                  break;
  
@@ -275,9 +275,6 @@ parse_commands (struct lexer *lexer, struct hmap *dummies)
        ds_put_byte (&input, '\n');
        lex_get (lexer);
      }
-  if (ds_is_empty (&input))
-    ds_put_byte (&input, '\n');
-  ds_put_byte (&input, '\0');
  
    n_values = count_values (dummies);
    outputs = xmalloc (n_values * sizeof *outputs);
diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c

index a3642f8a6c6f7e7f5fc5b9376ee31db8308e9efa..fb45465f1943aa1d2e811aecd8ef41fce5e2282e 100644 (file)
--- a/src/language/lexer/lexer.c
+++ b/src/language/lexer/lexer.c
@@ -132,6 +132,7 @@ lex_reader_init (struct lex_reader *reader,
    reader->file_name = NULL;
    reader->encoding = NULL;
    reader->line_number = 0;
+  reader->eof = false;
  }
  
  /* Frees any file name already in READER and replaces it by a copy of
@@ -876,7 +877,7 @@ lex_match_phrase (struct lexer *lexer, const char *s)
    int i;
  
    i = 0;
-  string_lexer_init (&slex, s, SEG_MODE_INTERACTIVE);
+  string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
    while (string_lexer_next (&slex, &token))
      if (token.type != SCAN_SKIP)
        {
@@ -1190,38 +1191,11 @@ lex_source_read__ (struct lex_source *src)
                                             space, prompt);
        assert (n <= space);
  
-      for (char *p = &src->buffer[head_ofs]; p < &src->buffer[head_ofs + n];
-           p++)
-        if (*p == '\0')
-          {
-            struct msg m;
-            m.category = MSG_C_SYNTAX;
-            m.severity = MSG_S_ERROR;
-            m.file_name = src->reader->file_name;
-            m.first_line = 0;
-            m.last_line = 0;
-            m.first_column = 0;
-            m.last_column = 0;
-            m.text = xstrdup ("Bad character U+0000 in input.");
-            msg_emit (&m);
-
-            *p = ' ';
-          }
-
        if (n == 0)
          {
-          /* End of input.
-
-             Ensure that the input always ends in a new-line followed by a null
-             byte, as required by the segmenter library. */
-
-          if (src->head == src->tail
-              || src->buffer[src->head - src->tail - 1] != '\n')
-            src->buffer[src->head++ - src->tail] = '\n';
-
+          /* End of input. */
+          src->reader->eof = true;
            lex_source_expand__ (src);
-          src->buffer[src->head++ - src->tail] = '\0';
-
            return;
          }
  
@@ -1261,6 +1235,7 @@ lex_ellipsize__ (struct substring in, char *out, size_t out_size)
    for (out_len = 0; out_len < in.length; out_len += mblen)
      {
        if (in.string[out_len] == '\n'
+          || in.string[out_len] == '\0'
            || (in.string[out_len] == '\r'
                && out_len + 1 < in.length
                && in.string[out_len + 1] == '\n'))
@@ -1391,10 +1366,11 @@ lex_source_get__ (const struct lex_source *src_)
        size_t seg_maxlen = src->head - state.seg_pos;
        enum segment_type type;
        int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
-                                    &type);
+                                    src->reader->eof, &type);
        if (seg_len < 0)
          {
            /* The segmenter needs more input to produce a segment. */
+          assert (!src->reader->eof);
            lex_source_read__ (src);
            continue;
          }
diff --git a/src/language/lexer/lexer.h b/src/language/lexer/lexer.h

index 7383927eef9cedb8b25e1b640fb3a8a6dc82b4a4..463747f97454659825b860e9fef79477fe5184c5 100644 (file)
--- a/src/language/lexer/lexer.h
+++ b/src/language/lexer/lexer.h
@@ -56,6 +56,7 @@ struct lex_reader
      char *encoding;
      char *file_name;            /* NULL if not associated with a file. */
      int line_number;            /* 1-based initial line number, 0 if none. */
+    bool eof;
    };
  
  /* An implementation of a lex_reader. */
diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c

index 2cd66f07962324ffc5909fe5329f2fb64d3f36a5..573a00df9d327f42445cda0c7ccb6492bc093f95 100644 (file)
--- a/src/language/lexer/scan.c
+++ b/src/language/lexer/scan.c
@@ -593,16 +593,17 @@ scanner_push (struct scanner *scanner, enum segment_type type,
    NOT_REACHED ();
  }
  \f
-/* Initializes SLEX for parsing INPUT in the specified MODE.
+/* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
+   specified MODE.  LENGTH is stored exactly as passed; a caller that holds a
+   null-terminated string passes strlen (INPUT), as lex_match_phrase() does.
  
     SLEX has no internal state to free, but it retains a reference to INPUT, so
     INPUT must not be modified or freed while SLEX is still in use. */
  void
-string_lexer_init (struct string_lexer *slex, const char *input,
+string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
                     enum segmenter_mode mode)
  {
    slex->input = input;
-  slex->length = strlen (input) + 1;
+  slex->length = length;        /* Byte count supplied by the caller. */
    slex->offset = 0;
    segmenter_init (&slex->segmenter, mode);
  }
@@ -624,7 +625,7 @@ string_lexer_next (struct string_lexer *slex, struct token *token)
        enum segment_type type;
        int n;
  
-      n = segmenter_push (&slex->segmenter, s, left, &type);
+      n = segmenter_push (&slex->segmenter, s, left, true, &type);
        assert (n >= 0);
  
        slex->offset += n;
diff --git a/src/language/lexer/scan.h b/src/language/lexer/scan.h

index 73f208033b1383001cd5f814f564f2f211523663..4327e9bb0bf02f38d79b8e970d07e2ded25187a1 100644 (file)
--- a/src/language/lexer/scan.h
+++ b/src/language/lexer/scan.h
@@ -101,7 +101,7 @@ struct string_lexer
    };
  
  void string_lexer_init (struct string_lexer *, const char *input,
-                        enum segmenter_mode);
+                        size_t length, enum segmenter_mode);
  bool string_lexer_next (struct string_lexer *, struct token *);
  
  #endif /* scan.h */
diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c

index 52ff37a457bc13597a1a2f861911f3e49ef94bd4..c0a09973ce8059535ea7c1e487c5fa2016ff8134 100644 (file)
--- a/src/language/lexer/segment.c
+++ b/src/language/lexer/segment.c
@@ -28,7 +28,6 @@
  
  #include "gl/c-ctype.h"
  #include "gl/c-strcase.h"
-#include "gl/memchr2.h"
  
  enum segmenter_state
    {
@@ -55,108 +54,122 @@ enum segmenter_state
  #define SS_START_OF_COMMAND (1u << 1)
  
  static int segmenter_detect_command_name__ (const char *input,
-                                            size_t n, int ofs);
+                                            size_t n, bool eof, int ofs);
  
  static int
-segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n)
+segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
+                      size_t ofs)  /* Decodes the character at INPUT_ + OFS. */
  {
    const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
    int mblen;
  
-  assert (n > 0);
+  assert (n > ofs);     /* At least one byte must be left to decode. */
+
+  input += ofs;
+  n -= ofs;             /* N now counts only the bytes from OFS onward. */
  
    mblen = u8_mbtoucr (puc, input, n);
-  return (mblen >= 0 ? mblen
-          : mblen == -2 ? -1
-          : u8_mbtouc (puc, input, n));
+  if (mblen >= 0)
+    return mblen;
+  else if (mblen != -2) /* Presumably -1, invalid sequence (libunistring). */
+    return u8_mbtouc (puc, input, n);
+  else if (eof)         /* -2 with no more input to come. */
+    {
+      *puc = 0xfffd;    /* U+FFFD REPLACEMENT CHARACTER. */
+      return n;         /* Consume all of the remaining bytes. */
+    }
+  else
+    return -1;          /* -2 and more input may follow: report -1. */
  }
  
  static int
  segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
-                          enum segment_type *type)
+                          bool eof, enum segment_type *type)
  {
    if (input[0] == '#')
      {
-      if (n < 2)
-        return -1;
-      else if (input[1] == '!')
+      if (n >= 2)
          {
-          int ofs;
+          if (input[1] == '!')
+            {
+              int ofs;
  
-          for (ofs = 2; ofs < n; ofs++)
-            if (input[ofs] == '\n' || input[ofs] == '\0')
-              {
-                if (input[ofs] == '\n' && input[ofs - 1] == '\r')
-                  ofs--;
+              for (ofs = 2; ofs < n; ofs++)
+                if (input[ofs] == '\n')
+                  {
+                    if (input[ofs] == '\n' && input[ofs - 1] == '\r')
+                      ofs--;
  
-                s->state = S_GENERAL;
-                s->substate = SS_START_OF_COMMAND;
-                *type = SEG_SHBANG;
-                return ofs;
-              }
+                    s->state = S_GENERAL;
+                    s->substate = SS_START_OF_COMMAND;
+                    *type = SEG_SHBANG;
+                    return ofs;
+                  }
  
-          return -1;
+              return eof ? ofs : -1;
+            }
          }
+      else if (!eof)
+        return -1;
      }
  
    s->state = S_GENERAL;
    s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
-  return segmenter_push (s, input, n, type);
+  return segmenter_push (s, input, n, eof, type);
  }
  
  static int
  segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
-                           const char *input, size_t n,
+                           const char *input, size_t n, bool eof,
                             enum segment_type *type)
  {
    assert (s->state == S_GENERAL);
  
-  if (n < 2)
-    return -1;
-
    *type = SEG_PUNCT;
    s->substate = 0;
-  return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1;
+  return (n < 2
+          ? (eof ? 1 : -1)
+          : (strchr (seconds, input[1]) != NULL ? 2 : 1));
  }
  
  static int
-skip_comment (const char *input, size_t n, size_t ofs)
+skip_comment (const char *input, size_t n, bool eof, size_t ofs) /* Scans a comment body from OFS. */
  {
    for (; ofs < n; ofs++)
      {
-      if (input[ofs] == '\n' || input[ofs] == '\0')
+      if (input[ofs] == '\n')   /* A new-line stops the scan. */
          return ofs;
        else if (input[ofs] == '*')
          {
            if (ofs + 1 >= n)
-            return -1;
+            return eof ? ofs + 1 : -1;  /* '*' is the last byte available. */
            else if (input[ofs + 1] == '/')
              return ofs + 2;
          }
      }
-  return -1;
+  return eof ? ofs : -1;        /* Ran out of input: N at EOF, else -1. */
  }
  
  static int
-skip_spaces_and_comments (const char *input, size_t n, int ofs)
+skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
  {
    while (ofs < n)
      {
        ucs4_t uc;
        int mblen;
  
-      mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
        if (mblen < 0)
          return -1;
  
        if (uc == '/')
          {
            if (ofs + 1 >= n)
-            return -1;
+            return eof ? ofs : -1;
            else if (input[ofs + 1] != '*')
              return ofs;
  
-          ofs = skip_comment (input, n, ofs + 2);
+          ofs = skip_comment (input, n, eof, ofs + 2);
            if (ofs < 0)
              return -1;
          }
@@ -166,18 +179,20 @@ skip_spaces_and_comments (const char *input, size_t n, int ofs)
          return ofs;
      }
  
-  return -1;
+  return eof ? ofs : -1;
  }
  
  static int
-is_end_of_line (const char *input, size_t n, int ofs)
+is_end_of_line (const char *input, size_t n, bool eof, int ofs)
  {
-  if (input[ofs] == '\n' || input[ofs] == '\0')
+  if (ofs >= n)
+    return eof ? 1 : -1;
+  else if (input[ofs] == '\n')
      return 1;
    else if (input[ofs] == '\r')
      {
        if (ofs + 1 >= n)
-        return -1;
+        return eof ? 1 : -1;
        return input[ofs + 1] == '\n';
      }
    else
@@ -185,17 +200,17 @@ is_end_of_line (const char *input, size_t n, int ofs)
  }
  
  static int
-at_end_of_line (const char *input, size_t n, int ofs)
+at_end_of_line (const char *input, size_t n, bool eof, int ofs) /* Is only blank space left on the line? */
  {
-  ofs = skip_spaces_and_comments (input, n, ofs);
+  ofs = skip_spaces_and_comments (input, n, eof, ofs);  /* Skip blanks first. */
    if (ofs < 0)
      return -1;
  
-  return is_end_of_line (input, n, ofs);
+  return is_end_of_line (input, n, eof, ofs);   /* Then test what follows. */
  }
  
  static int
-segmenter_parse_newline__ (const char *input, size_t n,
+segmenter_parse_newline__ (const char *input, size_t n, bool eof,
                             enum segment_type *type)
  {
    int ofs;
@@ -205,7 +220,10 @@ segmenter_parse_newline__ (const char *input, size_t n,
    else
      {
        if (n < 2)
-        return -1;
+        {
+          assert (!eof);
+          return -1;
+        }
  
        assert (input[0] == '\r');
        assert (input[1] == '\n');
@@ -217,93 +235,113 @@ segmenter_parse_newline__ (const char *input, size_t n,
  }
  
  static int
-skip_spaces (const char *input, size_t n, size_t ofs)
+skip_spaces (const char *input, size_t n, bool eof, size_t ofs) /* Skips spaces from OFS, stopping at a new-line. */
  {
    while (ofs < n)
      {
        ucs4_t uc;
        int mblen;
  
-      mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
        if (mblen < 0)
          return -1;
  
-      if (!lex_uc_is_space (uc) || uc == '\n' || uc == '\0')
+      if (!lex_uc_is_space (uc) || uc == '\n')  /* First byte not skipped. */
          return ofs;
  
        ofs += mblen;
      }
  
-  return -1;
+  return eof ? ofs : -1;        /* Only spaces remained: N at EOF, else -1. */
  }
  
  static int
-skip_digits (const char *input, size_t n, int ofs)
+skip_digits (const char *input, size_t n, bool eof, int ofs) /* Finds the first non-digit at or after OFS. */
  {
    for (; ofs < n; ofs++)
      if (!c_isdigit (input[ofs]))
        return ofs;
-  return -1;
+  return eof ? ofs : -1;        /* Digits ran to the end: N at EOF, else -1. */
  }
  
  static int
  segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
-                          enum segment_type *type)
+                          bool eof, enum segment_type *type)
  {
    int ofs;
  
    assert (s->state == S_GENERAL);
  
-  ofs = skip_digits (input, n, 0);
+  ofs = skip_digits (input, n, eof, 0);
    if (ofs < 0)
      return -1;
  
+  if (ofs >= n)
+    {
+      if (!eof)
+        return -1;
+      goto number;
+    };
    if (input[ofs] == '.')
      {
-      ofs = skip_digits (input, n, ofs + 1);
+      ofs = skip_digits (input, n, eof, ofs + 1);
        if (ofs < 0)
          return -1;
      }
  
    if (ofs >= n)
-    return -1;
+    {
+      if (!eof)
+        return -1;
+      goto number;
+    }
    if (input[ofs] == 'e' || input[ofs] == 'E')
      {
        ofs++;
        if (ofs >= n)
-        return -1;
+        {
+          if (!eof)
+            return -1;
+          goto expected_exponent;
+        }
  
        if (input[ofs] == '+' || input[ofs] == '-')
          {
            ofs++;
            if (ofs >= n)
-            return -1;
+            {
+              if (!eof)
+                return -1;
+              goto expected_exponent;
+            }
          }
  
        if (!c_isdigit (input[ofs]))
-        {
-          *type = SEG_EXPECTED_EXPONENT;
-          s->substate = 0;
-          return ofs;
-        }
+        goto expected_exponent;
  
-      ofs = skip_digits (input, n, ofs);
+      ofs = skip_digits (input, n, eof, ofs);
        if (ofs < 0)
          return -1;
      }
  
    if (input[ofs - 1] == '.')
      {
-      int eol = at_end_of_line (input, n, ofs);
+      int eol = at_end_of_line (input, n, eof, ofs);
        if (eol < 0)
          return -1;
        else if (eol)
          ofs--;
      }
  
+number:
    *type = SEG_NUMBER;
    s->substate = 0;
    return ofs;
+
+expected_exponent:
+  *type = SEG_EXPECTED_EXPONENT;
+  s->substate = 0;
+  return ofs;
  }
  
  static bool
@@ -344,7 +382,7 @@ is_reserved_word (const char *s, int n)
  
  static int
  segmenter_parse_comment_1__ (struct segmenter *s,
-                             const char *input, size_t n,
+                             const char *input, size_t n, bool eof,
                               enum segment_type *type)
  {
    int endcmd;
@@ -357,7 +395,7 @@ segmenter_parse_comment_1__ (struct segmenter *s,
        ucs4_t uc;
        int mblen;
  
-      mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
        if (mblen < 0)
          return -1;
  
@@ -370,9 +408,7 @@ segmenter_parse_comment_1__ (struct segmenter *s,
          case '\n':
            if (ofs > 1 && input[ofs - 1] == '\r')
              ofs--;
-          /* Fall through. */
-        case '\0':
-          if (endcmd == -2 || uc == '\0')
+          if (endcmd == -2)
              {
                /* Blank line ends comment command. */
                s->state = S_GENERAL;
@@ -405,50 +441,66 @@ segmenter_parse_comment_1__ (struct segmenter *s,
  
        ofs += mblen;
      }
+
+  if (eof)
+    {
+      /* End of file. */
+      s->state = S_GENERAL;
+      s->substate = SS_START_OF_COMMAND;
+      *type = SEG_SEPARATE_COMMANDS;
+      return ofs;
+    }
+
    return -1;
  }
  
  static int
-segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n,
-                             enum segment_type *type)
+segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
+                             size_t n, bool eof, enum segment_type *type)
  {
-  int new_cmd;
-  ucs4_t uc;
-  int mblen;
-  int ofs;
-
-  ofs = segmenter_parse_newline__ (input, n, type);
-  if (ofs < 0 || ofs >= n)
-    return -1;
-
-  mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
-  if (mblen < 0)
+  int ofs = segmenter_parse_newline__ (input, n, eof, type);
+  if (ofs < 0)
      return -1;
  
-  if (uc == '+' || uc == '-' || uc == '.')
-    new_cmd = true;
-  else if (!lex_uc_is_space (uc))
-    switch (s->mode)
-      {
-      case SEG_MODE_INTERACTIVE:
-        new_cmd = false;
-        break;
+  int new_cmd;
+  if (ofs >= n)
+    {
+      if (!eof)
+        return -1;
+      new_cmd = false;
+    }
+  else
+    {
+      ucs4_t uc;
+      int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
+      if (mblen < 0)
+        return -1;
  
-      case SEG_MODE_BATCH:
+      if (uc == '+' || uc == '-' || uc == '.')
          new_cmd = true;
-        break;
+      else if (!lex_uc_is_space (uc))
+        switch (s->mode)
+          {
+          case SEG_MODE_INTERACTIVE:
+            new_cmd = false;
+            break;
  
-      case SEG_MODE_AUTO:
-        new_cmd = segmenter_detect_command_name__ (input, n, ofs);
-        if (new_cmd < 0)
-          return -1;
-        break;
+          case SEG_MODE_BATCH:
+            new_cmd = true;
+            break;
  
-      default:
-        NOT_REACHED ();
-      }
-  else
-    new_cmd = false;
+          case SEG_MODE_AUTO:
+            new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
+            if (new_cmd < 0)
+              return -1;
+            break;
+
+          default:
+            NOT_REACHED ();
+          }
+      else
+        new_cmd = false;
+    }
  
    if (new_cmd)
      {
@@ -462,7 +514,7 @@ segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n,
  
  static int
  segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
-                              enum segment_type *type)
+                              bool eof, enum segment_type *type)
  {
    bool end_cmd;
    int ofs;
@@ -474,7 +526,7 @@ segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
        ucs4_t uc;
        int mblen;
  
-      mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
        if (mblen < 0)
          return -1;
  
@@ -492,11 +544,6 @@ segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
            s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
            return ofs;
  
-        case '\0':
-          *type = SEG_DOCUMENT;
-          s->state = S_DOCUMENT_3;
-          return ofs;
-
          default:
            if (!lex_uc_is_space (uc))
              end_cmd = false;
@@ -505,16 +552,22 @@ segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
  
        ofs += mblen;
      }
+  if (eof)
+    {
+      *type = SEG_DOCUMENT;
+      s->state = S_DOCUMENT_3;
+      return ofs;
+    }
    return -1;
  }
  
  static int
  segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
-                              enum segment_type *type)
+                              bool eof, enum segment_type *type)
  {
    int ofs;
  
-  ofs = segmenter_parse_newline__ (input, n, type);
+  ofs = segmenter_parse_newline__ (input, n, eof, type);
    if (ofs < 0)
      return -1;
  
@@ -532,22 +585,27 @@ segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
  }
  
  static int
-segmenter_unquoted (const char *input, size_t n, int ofs)
+segmenter_unquoted (const char *input, size_t n, bool eof, int ofs) /* Does unquoted text follow OFS? */
  
  {
-  char c;
-
-  ofs = skip_spaces_and_comments (input, n, ofs);
+  ofs = skip_spaces_and_comments (input, n, eof, ofs);
    if (ofs < 0)
      return -1;
-
-  c = input[ofs];
-  return c != '\'' && c != '"' && c != '\n' && c != '\0';
+  else if (ofs < n)
+    {
+      char c = input[ofs];      /* First byte after blanks and comments. */
+      return c != '\'' && c != '"' && c != '\n';
+    }
+  else
+    {
+      assert (eof);     /* Running off the end is only reported at EOF. */
+      return 0;         /* Nothing follows, so nothing is unquoted. */
+    }
  }
  
  static int
  next_id_in_command (const struct segmenter *s, const char *input, size_t n,
-                    int ofs, char id[], size_t id_size)
+                    bool eof, int ofs, char id[], size_t id_size)
  {
    struct segmenter sub;
  
@@ -561,7 +619,7 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
        enum segment_type type;
        int retval;
  
-      retval = segmenter_push (&sub, input + ofs, n - ofs, &type);
+      retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
        if (retval < 0)
          {
            id[0] = '\0';
@@ -612,13 +670,15 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
      }
  }
  
+/* Called when INPUT begins with a character that can start off an ID token. */
  static int
  segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
-                      enum segment_type *type)
+                      bool eof, enum segment_type *type)
  {
    ucs4_t uc;
    int ofs;
  
+  assert (n > 0);
    assert (s->state == S_GENERAL);
  
    ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
@@ -627,9 +687,13 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
        int mblen;
  
        if (ofs >= n)
-        return -1;
+        {
+          if (eof)
+            break;
+          return -1;
+        }
  
-      mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
        if (mblen < 0)
          return -1;
        else if (!lex_uc_is_idn (uc))
@@ -640,7 +704,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
  
    if (input[ofs - 1] == '.')
      {
-      int eol = at_end_of_line (input, n, ofs);
+      int eol = at_end_of_line (input, n, eof, ofs);
        if (eol < 0)
          return -1;
        else if (eol)
@@ -659,7 +723,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
        if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
          {
            s->state = S_COMMENT_1;
-          return segmenter_parse_comment_1__ (s, input, n, type);
+          return segmenter_parse_comment_1__ (s, input, n, eof, type);
          }
        else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
          {
@@ -670,7 +734,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
        else if (lex_id_match (ss_cstr ("TITLE"), word)
                 || lex_id_match (ss_cstr ("SUBTITLE"), word))
          {
-          int result = segmenter_unquoted (input, n, ofs);
+          int result = segmenter_unquoted (input, n, eof, ofs);
            if (result < 0)
              return -1;
            else if (result)
@@ -683,7 +747,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
          {
            char id[16];
  
-          if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
+          if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
              return -1;
            else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
              {
@@ -696,7 +760,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
          {
            char id[16];
  
-          if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
+          if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
              return -1;
            else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
              {
@@ -710,25 +774,27 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
            char id[16];
            int ofs2;
  
-          ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id);
+          ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
            if (ofs2 < 0)
              return -1;
            else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
              {
                int eol;
  
-              ofs2 = skip_spaces_and_comments (input, n, ofs2);
+              ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
                if (ofs2 < 0)
                  return -1;
  
-              if (input[ofs2] == '.')
+              if (ofs2 >= n)
+                assert (eof);
+              else if (input[ofs2] == '.')
                  {
-                  ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1);
+                  ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
                    if (ofs2 < 0)
                      return -1;
                  }
  
-              eol = is_end_of_line (input, n, ofs2);
+              eol = is_end_of_line (input, n, eof, ofs2);
                if (eol < 0)
                  return -1;
                else if (eol)
@@ -751,7 +817,8 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
  static int
  segmenter_parse_string__ (enum segment_type string_type,
                            int ofs, struct segmenter *s,
-                          const char *input, size_t n, enum segment_type *type)
+                          const char *input, size_t n, bool eof,
+                          enum segment_type *type)
  {
    int quote = input[ofs];
  
@@ -760,46 +827,57 @@ segmenter_parse_string__ (enum segment_type string_type,
      if (input[ofs] == quote)
        {
          ofs++;
-        if (ofs >= n)
-          return -1;
-        else if (input[ofs] == quote)
-          ofs++;
-        else
+        if (ofs < n)
            {
-            *type = string_type;
-            s->substate = 0;
-            return ofs;
+            if (input[ofs] == quote)
+              {
+                ofs++;
+                continue;
+              }
            }
-      }
-    else if (input[ofs] == '\n' || input[ofs] == '\0')
-      {
-        *type = SEG_EXPECTED_QUOTE;
+        else if (!eof)
+          return -1;
+
+        *type = string_type;
          s->substate = 0;
          return ofs;
        }
+    else if (input[ofs] == '\n')
+      goto expected_quote;
      else
        ofs++;
  
+  if (eof)
+    goto expected_quote;
+
    return -1;
+
+expected_quote:
+  *type = SEG_EXPECTED_QUOTE;
+  s->substate = 0;
+  return ofs;
  }
  
  static int
  segmenter_maybe_parse_string__ (enum segment_type string_type,
                                  struct segmenter *s,
-                                const char *input, size_t n,
+                                const char *input, size_t n, bool eof, /* String if INPUT[1] is a quote, else ID. */
                                  enum segment_type *type)
  {
    if (n < 2)
-    return -1;
+    {
+      if (!eof)
+        return -1;      /* Cannot inspect INPUT[1] yet. */
+    }
    else if (input[1] == '\'' || input[1] == '"')
-    return segmenter_parse_string__ (string_type, 1, s, input, n, type);
-  else
-    return segmenter_parse_id__ (s, input, n, type);
+    return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
+
+  return segmenter_parse_id__ (s, input, n, eof, type); /* Also a lone byte at EOF. */
  }
  
  static int
  segmenter_parse_mid_command__ (struct segmenter *s,
-                               const char *input, size_t n,
+                               const char *input, size_t n, bool eof,
                                 enum segment_type *type)
  {
    ucs4_t uc;
@@ -809,7 +887,7 @@ segmenter_parse_mid_command__ (struct segmenter *s,
    assert (s->state == S_GENERAL);
    assert (!(s->substate & SS_START_OF_LINE));
  
-  mblen = segmenter_u8_to_uc__ (&uc, input, n);
+  mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
    if (mblen < 0)
      return -1;
  
@@ -821,23 +899,24 @@ segmenter_parse_mid_command__ (struct segmenter *s,
        return 1;
  
      case '/':
-      if (n == 1)
-        return -1;
+      if (n < 2)
+        {
+          if (!eof)
+            return -1;
+        }
        else if (input[1] == '*')
          {
-          ofs = skip_comment (input, n, 2);
+          ofs = skip_comment (input, n, eof, 2);
            if (ofs < 0)
              return -1;
  
            *type = SEG_COMMENT;
            return ofs;
          }
-      else
-        {
-          s->substate = 0;
-          *type = SEG_PUNCT;
-          return 1;
-        }
+
+      s->substate = 0;
+      *type = SEG_PUNCT;
+      return 1;
  
      case '(': case ')': case ',': case '=': case '-':
      case '[': case ']': case '&': case '|': case '+':
@@ -850,62 +929,62 @@ segmenter_parse_mid_command__ (struct segmenter *s,
          {
            /* '*' at the beginning of a command begins a comment. */
            s->state = S_COMMENT_1;
-          return segmenter_parse_comment_1__ (s, input, n, type);
+          return segmenter_parse_comment_1__ (s, input, n, eof, type);
          }
        else
-        return segmenter_parse_digraph__ ("*", s, input, n, type);
+        return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
  
      case '<':
-      return segmenter_parse_digraph__ ("=>", s, input, n, type);
+      return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
  
      case '>':
-      return segmenter_parse_digraph__ ("=", s, input, n, type);
+      return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
  
      case '~':
-      return segmenter_parse_digraph__ ("=", s, input, n, type);
+      return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
  
      case '.':
        if (n < 2)
-        return -1;
-      else if (c_isdigit (input[1]))
-        return segmenter_parse_number__ (s, input, n, type);
-      else
          {
-          int eol = at_end_of_line (input, n, 1);
-          if (eol < 0)
+          if (!eof)
              return -1;
+        }
+      else if (c_isdigit (input[1]))
+        return segmenter_parse_number__ (s, input, n, eof, type);
  
-          if (eol)
-            {
-              *type = SEG_END_COMMAND;
-              s->substate = SS_START_OF_COMMAND;
-            }
-          else
-            *type = SEG_UNEXPECTED_DOT;
-          return 1;
+      int eol = at_end_of_line (input, n, eof, 1);
+      if (eol < 0)
+        return -1;
+
+      if (eol)
+        {
+          *type = SEG_END_COMMAND;
+          s->substate = SS_START_OF_COMMAND;
          }
-      NOT_REACHED ();
+      else
+        *type = SEG_UNEXPECTED_DOT;
+      return 1;
  
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
-      return segmenter_parse_number__ (s, input, n, type);
+      return segmenter_parse_number__ (s, input, n, eof, type);
  
      case 'u': case 'U':
        return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
-                                           s, input, n, type);
+                                             s, input, n, eof, type);
  
      case 'x': case 'X':
        return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
-                                             s, input, n, type);
+                                             s, input, n, eof, type);
  
      case '\'': case '"':
        return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
-                                       s, input, n, type);
+                                       s, input, n, eof, type);
  
      default:
        if (lex_uc_is_space (uc))
          {
-          ofs = skip_spaces (input, n, mblen);
+          ofs = skip_spaces (input, n, eof, mblen);
            if (ofs < 0)
              return -1;
  
@@ -924,7 +1003,7 @@ segmenter_parse_mid_command__ (struct segmenter *s,
            return ofs;
          }
        else if (lex_uc_is_id1 (uc))
-        return segmenter_parse_id__ (s, input, n, type);
+        return segmenter_parse_id__ (s, input, n, eof, type);
        else
          {
            *type = SEG_UNEXPECTED_CHAR;
@@ -985,7 +1064,8 @@ segmenter_get_command_name_candidates (unsigned char first)
  }
  
  static int
-segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
+segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
+                                 int ofs)
  {
    const char **commands;
  
@@ -998,13 +1078,17 @@ segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
        int mblen;
  
        if (ofs >= n)
-        return -1;
+        {
+          if (eof)
+            break;
+          return -1;
+        }
  
-      mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
        if (mblen < 0)
          return -1;
  
-      if (uc == '\n' || uc == '\0'
+      if (uc == '\n'
            || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
          break;
  
@@ -1033,15 +1117,16 @@ segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
  }
  
  static int
-is_start_of_string__ (const char *input, size_t n, int ofs)
+is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
  {
-  int c;
+  if (ofs >= n)
+    return eof ? 0 : -1;
  
-  c = input[ofs];
+  int c = input[ofs];
    if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
      {
        if (ofs + 1 >= n)
-        return -1;
+        return eof ? 0 : -1;
  
        return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
      }
@@ -1051,7 +1136,7 @@ is_start_of_string__ (const char *input, size_t n, int ofs)
  
  static int
  segmenter_parse_start_of_line__ (struct segmenter *s,
-                                 const char *input, size_t n,
+                                 const char *input, size_t n, bool eof,
                                   enum segment_type *type)
  {
    ucs4_t uc;
@@ -1061,19 +1146,19 @@ segmenter_parse_start_of_line__ (struct segmenter *s,
    assert (s->state == S_GENERAL);
    assert (s->substate & SS_START_OF_LINE);
  
-  mblen = segmenter_u8_to_uc__ (&uc, input, n);
+  mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
    if (mblen < 0)
      return -1;
  
    switch (uc)
      {
      case '+':
-      ofs = skip_spaces_and_comments (input, n, 1);
+      ofs = skip_spaces_and_comments (input, n, eof, 1);
        if (ofs < 0)
          return -1;
        else
          {
-          int is_string = is_start_of_string__ (input, n, ofs);
+          int is_string = is_start_of_string__ (input, n, eof, ofs);
            if (is_string < 0)
              return -1;
            else if (is_string)
@@ -1095,7 +1180,7 @@ segmenter_parse_start_of_line__ (struct segmenter *s,
      default:
        if (lex_uc_is_space (uc))
          {
-          int eol = at_end_of_line (input, n, 0);
+          int eol = at_end_of_line (input, n, eof, 0);
            if (eol < 0)
              return -1;
            else if (eol)
@@ -1111,7 +1196,7 @@ segmenter_parse_start_of_line__ (struct segmenter *s,
          break;
        else if (s->mode == SEG_MODE_AUTO)
          {
-          int cmd = segmenter_detect_command_name__ (input, n, 0);
+          int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
            if (cmd < 0)
              return -1;
            else if (cmd == 0)
@@ -1126,12 +1211,12 @@ segmenter_parse_start_of_line__ (struct segmenter *s,
      }
  
    s->substate = SS_START_OF_COMMAND;
-  return segmenter_parse_mid_command__ (s, input, n, type);
+  return segmenter_parse_mid_command__ (s, input, n, eof, type);
  }
  
  static int
  segmenter_parse_file_label__ (struct segmenter *s,
-                              const char *input, size_t n,
+                              const char *input, size_t n, bool eof,
                                enum segment_type *type)
  {
    struct segmenter sub;
@@ -1139,7 +1224,7 @@ segmenter_parse_file_label__ (struct segmenter *s,
  
    sub = *s;
    sub.state = S_GENERAL;
-  ofs = segmenter_push (&sub, input, n, type);
+  ofs = segmenter_push (&sub, input, n, eof, type);
  
    if (ofs < 0)
      return -1;
@@ -1149,7 +1234,7 @@ segmenter_parse_file_label__ (struct segmenter *s,
  
        assert (lex_id_match (ss_cstr ("LABEL"),
                              ss_buffer ((char *) input, ofs)));
-      result = segmenter_unquoted (input, n, ofs);
+      result = segmenter_unquoted (input, n, eof, ofs);
        if (result < 0)
          return -1;
        else
@@ -1170,7 +1255,8 @@ segmenter_parse_file_label__ (struct segmenter *s,
  
  static int
  segmenter_subparse (struct segmenter *s,
-                    const char *input, size_t n, enum segment_type *type)
+                    const char *input, size_t n, bool eof,
+                    enum segment_type *type)
  {
    struct segmenter sub;
    int ofs;
@@ -1178,17 +1264,17 @@ segmenter_subparse (struct segmenter *s,
    sub.mode = s->mode;
    sub.state = S_GENERAL;
    sub.substate = s->substate;
-  ofs = segmenter_push (&sub, input, n, type);
+  ofs = segmenter_push (&sub, input, n, eof, type);
    s->substate = sub.substate;
    return ofs;
  }
  
  static int
  segmenter_parse_do_repeat_1__ (struct segmenter *s,
-                               const char *input, size_t n,
+                               const char *input, size_t n, bool eof,
                                 enum segment_type *type)
  {
-  int ofs = segmenter_subparse (s, input, n, type);
+  int ofs = segmenter_subparse (s, input, n, eof, type);
    if (ofs < 0)
      return -1;
  
@@ -1205,10 +1291,10 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s,
  
  static int
  segmenter_parse_do_repeat_2__ (struct segmenter *s,
-                               const char *input, size_t n,
+                               const char *input, size_t n, bool eof,
                                 enum segment_type *type)
  {
-  int ofs = segmenter_subparse (s, input, n, type);
+  int ofs = segmenter_subparse (s, input, n, eof, type);
    if (ofs < 0)
      return -1;
  
@@ -1223,7 +1309,7 @@ segmenter_parse_do_repeat_2__ (struct segmenter *s,
  
  static bool
  check_repeat_command (struct segmenter *s,
-                      const char *input, size_t n)
+                      const char *input, size_t n, bool eof)
  {
    int direction;
    char id[16];
@@ -1233,7 +1319,7 @@ check_repeat_command (struct segmenter *s,
    if (input[ofs] == '+' || input[ofs] == '-')
      ofs++;
  
-  ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
+  ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
    if (ofs < 0)
      return false;
    else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
@@ -1243,7 +1329,7 @@ check_repeat_command (struct segmenter *s,
    else
      return true;
  
-  ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
+  ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
    if (ofs < 0)
      return false;
  
@@ -1253,48 +1339,40 @@ check_repeat_command (struct segmenter *s,
  }
  
  static int
-segmenter_parse_full_line__ (const char *input, size_t n,
+segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
                               enum segment_type *type)
  {
-  const char *newline = memchr2 (input, '\n', '\0', n);
+  const char *newline = memchr (input, '\n', n);
+  if (!newline)
+    return eof ? n : -1;
  
-  if (newline == NULL)
-    return -1;
-  else
+  ptrdiff_t ofs = newline - input;
+  if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
      {
-      int ofs = newline - input;
-      if (*newline == '\0')
-        {
-          assert (ofs > 0);
-          return ofs;
-        }
-      else if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
-        {
-          *type = SEG_NEWLINE;
-          return ofs + 1;
-        }
-      else
-        return ofs - (input[ofs - 1] == '\r');
+      *type = SEG_NEWLINE;
+      return ofs + 1;
      }
+  else
+    return ofs - (input[ofs - 1] == '\r');
  }
  
  static int
  segmenter_parse_do_repeat_3__ (struct segmenter *s,
-                               const char *input, size_t n,
+                               const char *input, size_t n, bool eof,
                                 enum segment_type *type)
  {
    int ofs;
  
-  ofs = segmenter_parse_full_line__ (input, n, type);
-  if (ofs < 0 || input[ofs - 1] == '\n')
+  ofs = segmenter_parse_full_line__ (input, n, eof, type);
+  if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
      return ofs;
-  else if (!check_repeat_command (s, input, n))
+  else if (!check_repeat_command (s, input, n, eof) && !eof)
      return -1;
    else if (s->substate == 0)
      {
        s->state = S_GENERAL;
        s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
-      return segmenter_push (s, input, n, type);
+      return segmenter_push (s, input, n, eof, type);
      }
    else
      {
@@ -1305,10 +1383,10 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s,
  
  static int
  segmenter_parse_begin_data_1__ (struct segmenter *s,
-                                const char *input, size_t n,
+                                const char *input, size_t n, bool eof,
                                  enum segment_type *type)
  {
-  int ofs = segmenter_subparse (s, input, n, type);
+  int ofs = segmenter_subparse (s, input, n, eof, type);
    if (ofs < 0)
      return -1;
  
@@ -1320,10 +1398,10 @@ segmenter_parse_begin_data_1__ (struct segmenter *s,
  
  static int
  segmenter_parse_begin_data_2__ (struct segmenter *s,
-                                const char *input, size_t n,
+                                const char *input, size_t n, bool eof,
                                  enum segment_type *type)
  {
-  int ofs = segmenter_subparse (s, input, n, type);
+  int ofs = segmenter_subparse (s, input, n, eof, type);
    if (ofs < 0)
      return -1;
  
@@ -1342,7 +1420,7 @@ is_end_data (const char *input, size_t n)
    int mblen;
    int ofs;
  
-  if (n < 3 || c_strncasecmp (input, "END", 3))
+  if (n < 4 || c_strncasecmp (input, "END", 3))
      return false;
  
    ofs = 3;
@@ -1375,19 +1453,19 @@ is_end_data (const char *input, size_t n)
  
  static int
  segmenter_parse_begin_data_3__ (struct segmenter *s,
-                                const char *input, size_t n,
+                                const char *input, size_t n, bool eof,
                                  enum segment_type *type)
  {
    int ofs;
  
-  ofs = segmenter_parse_full_line__ (input, n, type);
+  ofs = segmenter_parse_full_line__ (input, n, eof, type);
    if (ofs < 0)
      return -1;
    else if (is_end_data (input, ofs))
      {
        s->state = S_GENERAL;
        s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
-      return segmenter_push (s, input, n, type);
+      return segmenter_push (s, input, n, eof, type);
      }
    else
      {
@@ -1399,12 +1477,12 @@ segmenter_parse_begin_data_3__ (struct segmenter *s,
  
  static int
  segmenter_parse_begin_data_4__ (struct segmenter *s,
-                                const char *input, size_t n,
+                                const char *input, size_t n, bool eof,
                                  enum segment_type *type)
  {
    int ofs;
  
-  ofs = segmenter_parse_newline__ (input, n, type);
+  ofs = segmenter_parse_newline__ (input, n, eof, type);
    if (ofs < 0)
      return -1;
  
@@ -1414,12 +1492,12 @@ segmenter_parse_begin_data_4__ (struct segmenter *s,
  
  static int
  segmenter_parse_title_1__ (struct segmenter *s,
-                           const char *input, size_t n,
+                           const char *input, size_t n, bool eof,
                             enum segment_type *type)
  {
    int ofs;
  
-  ofs = skip_spaces (input, n, 0);
+  ofs = skip_spaces (input, n, eof, 0);
    if (ofs < 0)
      return -1;
    s->state = S_TITLE_2;
@@ -1429,7 +1507,7 @@ segmenter_parse_title_1__ (struct segmenter *s,
  
  static int
  segmenter_parse_title_2__ (struct segmenter *s,
-                           const char *input, size_t n,
+                           const char *input, size_t n, bool eof,
                             enum segment_type *type)
  {
    int endcmd;
@@ -1442,18 +1520,14 @@ segmenter_parse_title_2__ (struct segmenter *s,
        ucs4_t uc;
        int mblen;
  
-      mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
        if (mblen < 0)
          return -1;
  
        switch (uc)
          {
          case '\n':
-        case '\0':
-          s->state = S_GENERAL;
-          s->substate = 0;
-          *type = SEG_UNQUOTED_STRING;
-          return endcmd >= 0 ? endcmd : ofs;
+          goto end_of_line;
  
          case '.':
            endcmd = ofs;
@@ -1468,6 +1542,15 @@ segmenter_parse_title_2__ (struct segmenter *s,
        ofs += mblen;
      }
  
+  if (eof)
+    {
+    end_of_line:
+      s->state = S_GENERAL;
+      s->substate = 0;
+      *type = SEG_UNQUOTED_STRING;
+      return endcmd >= 0 ? endcmd : ofs;
+    }
+
    return -1;
  }
  
@@ -1510,9 +1593,9 @@ segmenter_get_mode (const struct segmenter *s)
  
  /* Attempts to label a prefix of S's remaining input with a segment type.  The
     caller supplies the first N bytes of the remaining input as INPUT, which
-   must be a UTF-8 encoded string.  The end of the input stream must be
-   indicated by a null byte at the beginning of a line, that is, immediately
-   following a new-line (or as the first byte of the input stream).
+   must be a UTF-8 encoded string.  If EOF is true, then the N bytes supplied
+   are the entire remainder of the input; if EOF is false, then further input
+   is potentially available.
  
     The input may contain '\n' or '\r\n' line ends in any combination.
  
@@ -1523,11 +1606,11 @@ segmenter_get_mode (const struct segmenter *s)
     the segmenter.
  
     Failure occurs only if the segment type of the N bytes in INPUT cannot yet
-   be determined.  In this case segmenter_push() returns -1.  The caller should
-   obtain more input and then call segmenter_push() again with a larger N and
-   repeat until the input is exhausted (which must be indicated as described
-   above) or until a valid segment is returned.  segmenter_push() will never
-   return -1 when the end of input is visible within INPUT.
+   be determined.  In this case segmenter_push() returns -1.  If more input is
+   available, the caller should obtain some more, then call again with a larger
+   N.  If this is not enough, the process might need to repeat again and again.
+   If input is exhausted, then the caller may call again setting EOF to true.
+   segmenter_push() will never return -1 when EOF is true.
  
     The caller must not, in a sequence of calls, supply contradictory input.
     That is, bytes provided as part of INPUT in one call, but not consumed, must
@@ -1535,63 +1618,65 @@ segmenter_get_mode (const struct segmenter *s)
     because segmenter_push() must often make decisions based on looking ahead
     beyond the bytes that it consumes. */
  int
-segmenter_push (struct segmenter *s, const char *input, size_t n,
+segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
                  enum segment_type *type)
  {
-  if (n == 0)
-    return -1;
-
-  if (input[0] == '\0')
+  if (!n)
      {
-      *type = SEG_END;
-      return 1;
+      if (eof)
+        {
+          *type = SEG_END;
+          return 0;
+        }
+      else
+        return -1;
      }
  
    switch (s->state)
      {
      case S_SHBANG:
-      return segmenter_parse_shbang__ (s, input, n, type);
+      return segmenter_parse_shbang__ (s, input, n, eof, type);
  
      case S_GENERAL:
        return (s->substate & SS_START_OF_LINE
-              ? segmenter_parse_start_of_line__ (s, input, n, type)
-              : segmenter_parse_mid_command__ (s, input, n, type));
+              ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
+              : segmenter_parse_mid_command__ (s, input, n, eof, type));
  
      case S_COMMENT_1:
-      return segmenter_parse_comment_1__ (s, input, n, type);
+      return segmenter_parse_comment_1__ (s, input, n, eof, type);
      case S_COMMENT_2:
-      return segmenter_parse_comment_2__ (s, input, n, type);
+      return segmenter_parse_comment_2__ (s, input, n, eof, type);
  
      case S_DOCUMENT_1:
-      return segmenter_parse_document_1__ (s, input, n, type);
+      return segmenter_parse_document_1__ (s, input, n, eof, type);
      case S_DOCUMENT_2:
-      return segmenter_parse_document_2__ (s, input, n, type);
+      return segmenter_parse_document_2__ (s, input, n, eof, type);
      case S_DOCUMENT_3:
        return segmenter_parse_document_3__ (s, type);
  
      case S_FILE_LABEL:
-      return segmenter_parse_file_label__ (s, input, n, type);
+      return segmenter_parse_file_label__ (s, input, n, eof, type);
  
      case S_DO_REPEAT_1:
-      return segmenter_parse_do_repeat_1__ (s, input, n, type);
+      return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
      case S_DO_REPEAT_2:
-      return segmenter_parse_do_repeat_2__ (s, input, n, type);
+      return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
      case S_DO_REPEAT_3:
-      return segmenter_parse_do_repeat_3__ (s, input, n, type);
+      return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
  
      case S_BEGIN_DATA_1:
-      return segmenter_parse_begin_data_1__ (s, input, n, type);
+      return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
      case S_BEGIN_DATA_2:
-      return segmenter_parse_begin_data_2__ (s, input, n, type);
+      return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
      case S_BEGIN_DATA_3:
-      return segmenter_parse_begin_data_3__ (s, input, n, type);
+      return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
      case S_BEGIN_DATA_4:
-      return segmenter_parse_begin_data_4__ (s, input, n, type);
+      return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
  
      case S_TITLE_1:
-      return segmenter_parse_title_1__ (s, input, n, type);
+      return segmenter_parse_title_1__ (s, input, n, eof, type);
      case S_TITLE_2:
-      return segmenter_parse_title_2__ (s, input, n, type);
+      return segmenter_parse_title_2__ (s, input, n, eof, type);
      }
  
    NOT_REACHED ();
diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h

index 1c209c5acb395dfe630035a4e34f4650215491dc..c647c8691dadd81f1e634ed91a63f0b2b11a4b9d 100644 (file)
--- a/src/language/lexer/segment.h
+++ b/src/language/lexer/segment.h
@@ -118,7 +118,7 @@ void segmenter_init (struct segmenter *, enum segmenter_mode);
  
  enum segmenter_mode segmenter_get_mode (const struct segmenter *);
  
-int segmenter_push (struct segmenter *, const char *input, size_t n,
+int segmenter_push (struct segmenter *, const char *input, size_t n, bool eof,
                      enum segment_type *);
  
  enum prompt_style segmenter_get_prompt (const struct segmenter *);
diff --git a/tests/language/lexer/lexer.at b/tests/language/lexer/lexer.at

index f13940bd6767d43585a4ce2887ef945b83c01bd8..87ce344eacd4193785fbad5b8ad6bf96307427aa 100644 (file)
--- a/tests/language/lexer/lexer.at
+++ b/tests/language/lexer/lexer.at
@@ -84,16 +84,13 @@ AT_SETUP([lexer crash due to null byte])
  printf "datA dist list notable file='input.txt'/a b c.
  lis|.\0" > lexer.sps
  
-# We sort the output into a predictable order because the lexer finds
-# and reports null bytes as soon as it reads them into its input
-# buffer, as opposed to when it encounters them during tokenization.
-# This also means that null bytes might be reported as part of one
-# command or another or none, hence removing the LIST: prefix.
-AT_CHECK([pspp -O format=csv lexer.sps > lexer.csv], [1])
-AT_CHECK([sed '/^$/d
-s/LIST: //' lexer.csv | sort], [0], [dnl
-lexer.sps: error: Bad character U+0000 in input.
+AT_CHECK([pspp -O format=csv lexer.sps], [1], [dnl
  lexer.sps:1: error: Unknown command `datA dist'.
-lexer.sps:2: error: LIST is allowed only after the active dataset has been defined.
+
+lexer.sps:2: error: LIST: LIST is allowed only after the active dataset has been defined.
+
+lexer.sps:2.5: error: LIST: Syntax error at `.': Unexpected `.' in middle of command.
+
+lexer.sps:2.6: error: LIST: Syntax error at `...': Bad character U+0000 in input.
  ])
  AT_CLEANUP
diff --git a/tests/language/lexer/scan-test.c b/tests/language/lexer/scan-test.c

index cfa8a79938f859b832e53b2cd6c01af703671170..abbf0f9455196331dc617dc47abd0ddee6926bef 100644 (file)
--- a/tests/language/lexer/scan-test.c
+++ b/tests/language/lexer/scan-test.c
@@ -39,6 +39,10 @@
  /* -a/--auto, -b/--batch, -i/--interactive: syntax mode. */
  static enum segmenter_mode mode = SEG_MODE_AUTO;
  
+/* -s, --strip-trailing-newline: Strip trailing newline from last line of
+    input. */
+static bool strip_trailing_newline;
+
  static const char *parse_options (int argc, char **argv);
  static void usage (void) NO_RETURN;
  
@@ -55,19 +59,21 @@ main (int argc, char *argv[])
    set_program_name (argv[0]);
    file_name = parse_options (argc, argv);
  
-  /* Read from stdin into 'input'.  Ensure that 'input' ends in a new-line
-     followed by a null byte. */
+  /* Read from stdin into 'input'. */
    input = (!strcmp (file_name, "-")
             ? fread_file (stdin, &length)
             : read_file (file_name, &length));
    if (input == NULL)
      error (EXIT_FAILURE, errno, "reading %s failed", file_name);
-  input = xrealloc (input, length + 3);
-  if (length == 0 || input[length - 1] != '\n')
-    input[length++] = '\n';
-  input[length++] = '\0';
  
-  string_lexer_init (&slex, input, mode);
+  if (strip_trailing_newline && length && input[length - 1] == '\n')
+    {
+      length--;
+      if (length && input[length - 1] == '\r')
+        length--;
+    }
+
+  string_lexer_init (&slex, input, length, mode);
    do
      {
        struct token token;
@@ -107,11 +113,12 @@ parse_options (int argc, char **argv)
            {"auto", no_argument, NULL, 'a'},
            {"batch", no_argument, NULL, 'b'},
            {"interactive", no_argument, NULL, 'i'},
+          {"strip-trailing-newline", no_argument, NULL, 's'},
            {"help", no_argument, NULL, 'h'},
            {NULL, 0, NULL, 0},
          };
  
-      int c = getopt_long (argc, argv, "abih", options, NULL);
+      int c = getopt_long (argc, argv, "sabih", options, NULL);
        if (c == -1)
          break;
  
@@ -129,6 +136,10 @@ parse_options (int argc, char **argv)
            mode = SEG_MODE_INTERACTIVE;
            break;
  
+        case 's':
+          strip_trailing_newline = true;
+          break;
+
          case 'h':
            usage ();
  
@@ -159,10 +170,10 @@ usage (void)
  usage: %s [OPTIONS] INPUT\n\
  \n\
  Options:\n\
-  -1, --one-segment   feed one segment at a time\n\
    -a, --auto          use \"auto\" syntax mode\n\
    -b, --batch         use \"batch\" syntax mode\n\
    -i, --interactive   use \"interactive\" syntax mode (default)\n\
+  -s, --strip-trailing-newline  remove newline from end of input\n\
    -v, --verbose       include rows and column numbers in output\n\
    -h, --help          print this help message\n",
            program_name, program_name);
diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at

index a6b0e62881663c00ba87075203e08d067d3ef2c0..8eb48059e9aff32d215ff063408ff5f665df5989 100644 (file)
--- a/tests/language/lexer/scan.at
+++ b/tests/language/lexer/scan.at
@@ -16,7 +16,11 @@ dnl along with this program.  If not, see <http://www.gnu.org/licenses/>.
  dnl
  AT_BANNER([syntax scanning])
  m4_define([PSPP_CHECK_SCAN],
-  [AT_CHECK([scan-test $1 input], [0], [expout])])
+  [sed 's/^-//' < expout-base > expout
+   AT_CHECK([scan-test $1 input], [0], [expout])
+
+   sed '/^-/d' < expout-base > expout
+   AT_CHECK([scan-test -s $1 input], [0], [expout])])
  \f
  AT_SETUP([identifiers])
  AT_KEYWORDS([scan])
@@ -28,7 +32,7 @@ QrStUv./* end of line comment */ @&t@
  WXYZ. /* unterminated end of line comment
  �. /* U+FFFD is not valid in an identifier
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  ID "a"
  SKIP
  ID "aB"
@@ -73,7 +77,7 @@ UNEXPECTED_CHAR 65533
  ENDCMD
  SKIP
  SKIP
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -87,7 +91,7 @@ AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
  andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
  and. with.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  AND
  SKIP
  OR
@@ -170,7 +174,7 @@ ID "and."
  SKIP
  WITH
  ENDCMD
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -182,7 +186,7 @@ AT_DATA([input], [dnl
  ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
  ~&|=>=><=<~=<>(),-+*/[[]]**
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  NOT
  SKIP
  AND
@@ -243,7 +247,7 @@ SLASH
  LBRACK
  RBRACK
  EXP
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -260,7 +264,7 @@ AT_DATA([input], [dnl
  1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
  . 1e e1 1e+ 1e-
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  POS_NUM
  SKIP
  POS_NUM 1
@@ -328,7 +332,7 @@ SKIP
  EXPECTED_EXPONENT "1e+"
  SKIP
  EXPECTED_EXPONENT "1e-"
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -367,7 +371,7 @@ x"4142"
  "�あいうえお"
  "abc"+U"FFFD"+u'3048'+"xyz"
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  STRING "x"
  SKIP
  STRING "y"
@@ -423,7 +427,7 @@ SKIP
  STRING "�あいうえお"
  SKIP
  STRING "abc�えxyz"
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -435,7 +439,7 @@ AT_DATA([input], [dnl
  #! /usr/bin/pspp
  #! /usr/bin/pspp
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  SKIP
  SKIP
  ID "#"
@@ -447,7 +451,7 @@ SLASH
  ID "bin"
  SLASH
  ID "pspp"
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -473,7 +477,7 @@ com is ambiguous with COMPUTE.
  next command.
  
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  SKIP
  SKIP
  SKIP
@@ -523,8 +527,8 @@ SKIP
  ID "command"
  ENDCMD
  SKIP
-ENDCMD
-SKIP
+-ENDCMD
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -544,7 +548,7 @@ isn't parsed as tokens
  
  second paragraph.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  ID "DOCUMENT"
  STRING "DOCUMENT one line."
  ENDCMD
@@ -571,9 +575,9 @@ SKIP
  STRING ""
  SKIP
  STRING "second paragraph."
-ENDCMD
-ENDCMD
-SKIP
+-ENDCMD
+-ENDCMD
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -598,7 +602,7 @@ FILE /*
  /**/  lab not quoted here either
  
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  ID "title"
  SKIP
  STRING "Quoted string title"
@@ -656,8 +660,8 @@ ID "lab"
  SKIP
  STRING "not quoted here either"
  SKIP
-ENDCMD
-SKIP
+-ENDCMD
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -678,7 +682,7 @@ end  data
  end data
  .
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  ID "begin"
  SKIP
  ID "data"
@@ -714,7 +718,7 @@ SKIP
  ID "data"
  SKIP
  ENDCMD
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -733,7 +737,7 @@ end /* x */ /* y */ repeat print.
  end
   repeat.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  ID "do"
  SKIP
  ID "repeat"
@@ -771,7 +775,7 @@ SKIP
  SKIP
  ID "repeat"
  ENDCMD
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-i])
@@ -788,7 +792,7 @@ third command
  fourth command.
     fifth command.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  ID "first"
  SKIP
  ID "command"
@@ -827,7 +831,7 @@ ID "fifth"
  SKIP
  ID "command"
  ENDCMD
-SKIP
+-SKIP
  STOP
  ])
  PSPP_CHECK_SCAN([-b])
diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c

index ef5ff613feba9af8ba3588c4a492efa301a54344..2cd141cfe01598972fc83f97a26cf871d4912088 100644 (file)
--- a/tests/language/lexer/segment-test.c
+++ b/tests/language/lexer/segment-test.c
@@ -50,6 +50,10 @@ static bool one_byte;
  /* -0, --truncations: Check that every truncation of input yields a result. */
  static bool check_truncations;
  
+/* -s, --strip-trailing-newline: Strip trailing newline from last line of
+    input. */
+static bool strip_trailing_newline;
+
  static const char *parse_options (int argc, char **argv);
  static void usage (void) NO_RETURN;
  
@@ -74,23 +78,23 @@ main (int argc, char *argv[])
    if (input == NULL)
      error (EXIT_FAILURE, errno, "reading %s failed", file_name);
  
-  if (!check_truncations)
+  if (strip_trailing_newline && length && input[length - 1] == '\n')
      {
-      input = xrealloc (input, length + 3);
-      if (length == 0 || input[length - 1] != '\n')
-        input[length++] = '\n';
-      input[length++] = '\0';
-
-      check_segmentation (input, length, true);
+      length--;
+      if (length && input[length - 1] == '\r')
+        length--;
      }
+
+  if (!check_truncations)
+    check_segmentation (input, length, true);
    else
      {
        size_t test_len;
  
        for (test_len = 0; test_len <= length; test_len++)
          {
-          char *copy = xmemdup0 (input, test_len);
-          check_segmentation (copy, test_len + 1, false);
+          char *copy = xmemdup (input, test_len);
+          check_segmentation (copy, test_len, false);
            free (copy);
          }
      }
@@ -102,18 +106,16 @@ main (int argc, char *argv[])
  static void
  check_segmentation (const char *input, size_t length, bool print_segments)
  {
-  size_t offset, line_number, line_offset;
    struct segmenter s;
-  int prev_type;
-
    segmenter_init (&s, mode);
  
-  line_number = 1;
-  line_offset = 0;
-  prev_type = -1;
-  for (offset = 0; offset < length; )
+  size_t line_number = 1;
+  size_t line_offset = 0;
+  int prev_type = -1;
+  size_t offset = 0;
+  enum segment_type type;
+  do
      {
-      enum segment_type type;
        const char *type_name, *p;
        int n;
  
@@ -132,7 +134,7 @@ check_segmentation (const char *input, size_t length, bool print_segments)
                  n_newlines++;
  
                copy = xmemdup (input + offset, i);
-              n = segmenter_push (&s, copy, i, &type);
+              n = segmenter_push (&s, copy, i, i + offset >= length, &type);
                free (copy);
  
                if (n >= 0)
@@ -141,17 +143,24 @@ check_segmentation (const char *input, size_t length, bool print_segments)
            assert (n_newlines <= 2);
          }
        else
-        n = segmenter_push (&s, input + offset, length - offset, &type);
+        n = segmenter_push (&s, input + offset, length - offset, true, &type);
  
        if (n < 0)
-        error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu",
-               offset);
+        {
+          if (!print_segments)
+            check_segmentation (input, length, true);
+          else
+            error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu",
+                   offset);
+        }
        assert (offset + n <= length);
  
        if (type == SEG_NEWLINE)
-        assert ((n == 1 && input[offset] == '\n')
-                || (n == 2
-                    && input[offset] == '\r' && input[offset + 1] == '\n'));
+        {
+          assert ((n == 1 && input[offset] == '\n')
+                  || (n == 2
+                      && input[offset] == '\r' && input[offset + 1] == '\n'));
+        }
        else
          assert (memchr (&input[offset], '\n', n) == NULL);
  
@@ -266,6 +275,7 @@ check_segmentation (const char *input, size_t length, bool print_segments)
            printf (" (%s)\n", prompt_style_to_string (prompt));
          }
      }
+  while (type != SEG_END);
  
    if (print_segments)
      putchar ('\n');
@@ -280,6 +290,7 @@ parse_options (int argc, char **argv)
          {
            {"one-byte", no_argument, NULL, '1'},
            {"truncations", no_argument, NULL, '0'},
+          {"strip-trailing-newline", no_argument, NULL, 's'},
            {"auto", no_argument, NULL, 'a'},
            {"batch", no_argument, NULL, 'b'},
            {"interactive", no_argument, NULL, 'i'},
@@ -288,7 +299,7 @@ parse_options (int argc, char **argv)
            {NULL, 0, NULL, 0},
          };
  
-      int c = getopt_long (argc, argv, "01abivh", options, NULL);
+      int c = getopt_long (argc, argv, "01abivhs", options, NULL);
        if (c == -1)
          break;
  
@@ -302,6 +313,10 @@ parse_options (int argc, char **argv)
            check_truncations = true;
            break;
  
+        case 's':
+          strip_trailing_newline = true;
+          break;
+
          case 'a':
            mode = SEG_MODE_AUTO;
            break;
@@ -350,6 +365,7 @@ usage: %s [OPTIONS] INPUT\n\
  Options:\n\
    -1, --one-byte      feed one byte at a time\n\
    -0, --truncations   check null truncation of each prefix of input\n\
+  -s, --strip-trailing-newline  remove newline from end of input\n\
    -a, --auto          use \"auto\" syntax mode\n\
    -b, --batch         use \"batch\" syntax mode\n\
    -i, --interactive   use \"interactive\" syntax mode (default)\n\
diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at

index bd3bc38281ef9f15718092fb6bc013e64da0cf66..3660c924f7c4d8e595e6106f9ada492bd2d7eac4 100644 (file)
--- a/tests/language/lexer/segment.at
+++ b/tests/language/lexer/segment.at
@@ -16,10 +16,16 @@ dnl along with this program.  If not, see <http://www.gnu.org/licenses/>.
  dnl
  AT_BANNER([syntax segmentation])
  m4_define([PSPP_CHECK_SEGMENT],
-  [AT_CHECK([segment-test $1 input], [0], [expout])
-   AT_CHECK([segment-test -1 $1 input], [0], [expout])
-   AT_CHECK([segment-test -0 $1 input])
-   AT_CHECK([segment-test -01 $1 input])])
+  [for strip in "" "-s"; do
+     case $strip in # (
+        '') sed 's/^-//' < expout-base > expout ;; # (
+       -s) sed '/^-/d' < expout-base > expout ;;
+     esac
+     AT_CHECK([segment-test $1 $strip input], [0], [expout])
+     AT_CHECK([segment-test -1 $strip $1 input], [0], [expout])
+     AT_CHECK([segment-test -0 $strip $1 input])
+     AT_CHECK([segment-test -01 $strip $1 input])
+   done])
  \f
  AT_SETUP([identifiers])
  AT_KEYWORDS([segment])
@@ -36,7 +42,7 @@ f@#_.#6
  GhIjK
  .x 1y _z
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  identifier      a    space
  identifier      ab    space
  identifier      abc    space
@@ -97,9 +103,9 @@ number          1
  identifier      y    space
  unexpected_char \_
  identifier      z
-newline         \n (later)
-
-end             <U+0000>
+-newline         \n (later)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -125,7 +131,7 @@ wxyz./* unterminated end of line comment
  WXYZ. /* unterminated end of line comment
  WxYz./* unterminated end of line comment @&t@
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  identifier      abcd.    space
  identifier      abcd
  end_command     .
@@ -203,9 +209,9 @@ newline         \n (first)
  identifier      WxYz
  end_command     .
  comment         /*_unterminated_end_of_line_comment_
-newline         \n (first)
-
-end             <U+0000>
+-newline         \n (first)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -218,7 +224,7 @@ AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
  andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
  and. with.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  reserved_word   and    space
  reserved_word   or    space
  reserved_word   not    space
@@ -267,9 +273,9 @@ newline         \n (later)
  identifier      and.    space
  reserved_word   with
  end_command     .
-newline         \n (first)
-
-end             <U+0000>
+-newline         \n (first)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -280,7 +286,7 @@ AT_DATA([input], [dnl
  ~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
  ~&|=>=><=<~=<>(),-+*/[[]]**
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  punct           ~    space
  punct           &    space
  punct           |    space
@@ -323,9 +329,9 @@ punct           /
  punct           [[
  punct           ]]
  punct           **
-newline         \n (later)
-
-end             <U+0000>
+-newline         \n (later)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -341,7 +347,7 @@ AT_DATA([input], [dnl
  1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
  . 1e e1 1e+ 1e-
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  number          0    space
  number          1    space
  number          01    space
@@ -390,9 +396,9 @@ expected_exponent 1e    space
  identifier      e1    space
  expected_exponent 1e+    space
  expected_exponent 1e-
-newline         \n (later)
-
-end             <U+0000>
+-newline         \n (later)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -413,7 +419,7 @@ u'fffd' U"041"
  + /* also a punctuator on blank line
  - 'new command'
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  quoted_string   'x'    space
  quoted_string   "y"    space
  quoted_string   'abc'
@@ -462,9 +468,9 @@ newline         \n (later)
  
  start_command   -    space
  quoted_string   'new_command'
-newline         \n (later)
-
-end             <U+0000>
+-newline         \n (later)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -476,7 +482,7 @@ AT_DATA([input], [dnl
  title my title.
  #! /usr/bin/pspp
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  shbang          #!_/usr/bin/pspp
  newline         \n (first)
  
@@ -493,9 +499,9 @@ punct           /
  identifier      bin
  punct           /
  identifier      pspp
-newline         \n (later)
-
-end             <U+0000>
+-newline         \n (later)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -520,7 +526,7 @@ com is ambiguous with COMPUTE.
  next command.
  
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  comment_command *_Comment_commands_"don't
  newline         \n (COMMENT)
  
@@ -580,10 +586,10 @@ identifier      command
  end_command     .
  newline         \n (first)
  
-separate_commands
-newline         \n (first)
-
-end             <U+0000>
+-separate_commands
+-newline         \n (first)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -602,7 +608,7 @@ isn't parsed as tokens
  
  second paragraph.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  start_document
  document        DOCUMENT_one_line.
  end_command     
@@ -638,11 +644,11 @@ document
  newline         \n (DOCUMENT)
  
  document        second_paragraph.
-end_command     
-separate_commands
-newline         \n (first)
- 
-end             <U+0000>
+-end_command
+-separate_commands
+-newline         \n (first)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -666,7 +672,7 @@ FILE /*
  /**/  lab not quoted here either
  
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  identifier      title
  comment         /**/
  quoted_string   'Quoted_string_title'
@@ -728,10 +734,10 @@ identifier      lab    space
  unquoted_string not_quoted_here_either
  newline         \n (later)
  
-separate_commands
-newline         \n (first)
-
-end             <U+0000>
+-separate_commands
+-newline         \n (first)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -763,7 +769,7 @@ begin data "xxx".
  begin data 123.
  not data
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  identifier      begin    space
  identifier      data
  end_command     .
@@ -854,9 +860,9 @@ newline         \n (first)
  
  reserved_word   not    space
  identifier      data
-newline         \n (later)
-
-end             <U+0000>
+-newline         \n (later)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -878,7 +884,7 @@ do
    inner command.
  end repeat.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  identifier      do    space
  identifier      repeat    space
  identifier      x
@@ -936,9 +942,9 @@ newline         \n (DO REPEAT)
  identifier      end    space
  identifier      repeat
  end_command     .
-newline         \n (first)
-
-end             <U+0000>
+-newline         \n (first)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-i])
  AT_CLEANUP
@@ -954,7 +960,7 @@ third command
  fourth command.
     fifth command.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  identifier      first    space
  identifier      command
  newline         \n (later)
@@ -990,9 +996,9 @@ spaces          ___
  identifier      fifth    space
  identifier      command
  end_command     .
-newline         \n (first)
-
-end             <U+0000>
+-newline         \n (first)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-b])
  AT_CLEANUP
@@ -1014,7 +1020,7 @@ twostep cluster
  fourth command.
     fifth command.
  ])
-AT_DATA([expout], [dnl
+AT_DATA([expout-base], [dnl
  identifier      command
  newline         \n (later)
  
@@ -1080,9 +1086,9 @@ spaces          ___
  identifier      fifth    space
  identifier      command
  end_command     .
-newline         \n (first)
-
-end             <U+0000>
+-newline         \n (first)
+-
+end
  ])
  PSPP_CHECK_SEGMENT([-a])
  AT_CLEANUP
author	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 24 Sep 2018 03:42:07 +0000 (20:42 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 24 Sep 2018 05:51:31 +0000 (22:51 -0700)
src/language/control/repeat.c		patch \| blob \| history
src/language/lexer/lexer.c		patch \| blob \| history
src/language/lexer/lexer.h		patch \| blob \| history
src/language/lexer/scan.c		patch \| blob \| history
src/language/lexer/scan.h		patch \| blob \| history
src/language/lexer/segment.c		patch \| blob \| history
src/language/lexer/segment.h		patch \| blob \| history
tests/language/lexer/lexer.at		patch \| blob \| history
tests/language/lexer/scan-test.c		patch \| blob \| history
tests/language/lexer/scan.at		patch \| blob \| history
tests/language/lexer/segment-test.c		patch \| blob \| history
tests/language/lexer/segment.at		patch \| blob \| history