From d4b8d953acd00e9a51b79cb2e345342649c5ff0c Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Mon, 29 May 2023 13:09:09 -0700
Subject: [PATCH] command-segmenter: New library for dividing syntax into
 individual commands.

This is a building block for allowing the GUI to run whole commands instead
of just lines.
---
 src/language/lexer/automake.mk         |   2 +
 src/language/lexer/command-segmenter.c | 205 +++++++++++++++++++++++++
 src/language/lexer/command-segmenter.h |  44 ++++++
 src/language/lexer/scan.c              |   3 +
 src/language/lexer/segment.c           |  48 +++++-
 src/language/lexer/segment.h           |   3 +
 tests/automake.mk                      |   1 +
 tests/language/lexer/segment-test.c    |  67 +++++++-
 tests/language/lexer/segment.at        | 200 ++++++++++++++++++++++--
 9 files changed, 549 insertions(+), 24 deletions(-)
 create mode 100644 src/language/lexer/command-segmenter.c
 create mode 100644 src/language/lexer/command-segmenter.h

diff --git a/src/language/lexer/automake.mk b/src/language/lexer/automake.mk
index 01b3df49c6..29f0a637f7 100644
--- a/src/language/lexer/automake.mk
+++ b/src/language/lexer/automake.mk
@@ -20,6 +20,8 @@
 language_lexer_sources = \
 	src/language/lexer/command-name.c \
 	src/language/lexer/command-name.h \
+	src/language/lexer/command-segmenter.c \
+	src/language/lexer/command-segmenter.h \
 	src/language/lexer/include-path.c \
 	src/language/lexer/include-path.h \
 	src/language/lexer/lexer.c \
diff --git a/src/language/lexer/command-segmenter.c b/src/language/lexer/command-segmenter.c
new file mode 100644
index 0000000000..8d1036d23c
--- /dev/null
+++ b/src/language/lexer/command-segmenter.c
@@ -0,0 +1,205 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "language/lexer/command-segmenter.h"
+
+#include "language/lexer/segment.h"
+#include "libpspp/deque.h"
+#include "libpspp/str.h"
+
+struct lines
+  {
+    int first;                  /* Zero-based first line of a command. */
+    int last;                   /* One past the command's last line. */
+  };
+
+struct command_segmenter
+  {
+    struct segmenter segmenter; /* State passed to segmenter_push(). */
+    struct string input;        /* Input the segmenter has not consumed yet. */
+
+    int command_first_line;     /* First line of the command in progress. */
+    int line;                   /* Number of SEG_NEWLINE segments seen so far. */
+    enum segment_type prev_segment; /* Type of the most recent segment. */
+
+    struct deque deque;         /* Supplies indexes into 'lines'. */
+    struct lines *lines;        /* Queued bounds for command_segmenter_get(). */
+  };
+
+/* Creates and returns a new command segmenter for the given syntax MODE. */
+struct command_segmenter *
+command_segmenter_create (enum segmenter_mode mode)
+{
+  struct command_segmenter *cs = xmalloc (sizeof *cs);
+  *cs = (struct command_segmenter) {    /* Members not named start as zero. */
+    .segmenter = segmenter_init (mode, false),
+    .input = DS_EMPTY_INITIALIZER,
+    .prev_segment = SEG_NEWLINE,        /* So empty input has no partial line. */
+    .deque = DEQUE_EMPTY_INITIALIZER,
+  };
+  return cs;
+}
+
+/* Frees CS along with the storage it owns.  Does nothing if CS is null. */
+void
+command_segmenter_destroy (struct command_segmenter *cs)
+{
+  if (!cs)
+    return;
+
+  free (cs->lines);
+  ds_destroy (&cs->input);
+  free (cs);
+}
+
+/* Queues lines FIRST through LAST - 1 as a command, unless that is empty. */
+static void
+emit (struct command_segmenter *cs, int first, int last)
+{
+  if (first >= last)
+    return;
+
+  if (deque_is_full (&cs->deque))
+    cs->lines = deque_expand (&cs->deque, cs->lines, sizeof *cs->lines);
+
+  size_t idx = deque_push_back (&cs->deque);
+  cs->lines[idx] = (struct lines) { .first = first, .last = last };
+}
+
+/* Runs the N bytes at INPUT through CS's segmenter, after any input left
+   over from earlier calls, and queues the line bounds of each command whose
+   end it finds.  EOF is passed along to the segmenter; callers in this file
+   set it only in command_segmenter_eof(), when no more input is coming.  Any
+   input that the segmenter does not consume is saved for the next call. */
+static void
+command_segmenter_push__ (struct command_segmenter *cs,
+                          const char *input, size_t n, bool eof)
+{
+  /* Segment the saved tail of earlier input together with the new input. */
+  if (!ds_is_empty (&cs->input))
+    {
+      ds_put_substring (&cs->input, ss_buffer (input, n));
+      input = ds_cstr (&cs->input);
+      n = ds_length (&cs->input);
+    }
+
+  for (bool done = false; !done; )
+    {
+      enum segment_type type;
+      int consumed = segmenter_push (&cs->segmenter, input, n, eof, &type);
+      /* Negative means nothing was consumed (presumably more input needed). */
+      if (consumed < 0)
+        break;
+
+      switch (type)
+        {
+        case SEG_NEWLINE:
+          cs->line++;
+          break;
+
+        case SEG_START_COMMAND:
+          /* The previous command ended just before this line. */
+          if (cs->line > cs->command_first_line)
+            emit (cs, cs->command_first_line, cs->line);
+          cs->command_first_line = cs->line;
+          break;
+
+        case SEG_SEPARATE_COMMANDS:
+          /* This line ends the previous command but is not part of one. */
+          if (cs->line > cs->command_first_line)
+            emit (cs, cs->command_first_line, cs->line);
+          cs->command_first_line = cs->line + 1;
+          break;
+
+        case SEG_END_COMMAND:
+          /* This line is the last one in the current command. */
+          emit (cs, cs->command_first_line, cs->line + 1);
+          cs->command_first_line = cs->line + 1;
+          break;
+
+        case SEG_END:
+          /* Count the final line even if it lacks a new-line. */
+          emit (cs, cs->command_first_line,
+                cs->line + (cs->prev_segment != SEG_NEWLINE));
+          done = true;
+          break;
+
+        /* The remaining segment types do not move command boundaries. */
+        case SEG_NUMBER: case SEG_QUOTED_STRING: case SEG_HEX_STRING:
+        case SEG_UNICODE_STRING: case SEG_UNQUOTED_STRING:
+        case SEG_RESERVED_WORD: case SEG_IDENTIFIER: case SEG_PUNCT:
+        case SEG_SHBANG: case SEG_SPACES: case SEG_COMMENT:
+        case SEG_COMMENT_COMMAND: case SEG_DO_REPEAT_COMMAND:
+        case SEG_INLINE_DATA: case SEG_INNER_START_COMMAND:
+        case SEG_INNER_SEPARATE_COMMANDS: case SEG_INNER_END_COMMAND:
+        case SEG_MACRO_ID: case SEG_MACRO_NAME: case SEG_MACRO_BODY:
+        case SEG_START_DOCUMENT: case SEG_DOCUMENT:
+        case SEG_EXPECTED_QUOTE: case SEG_EXPECTED_EXPONENT:
+        case SEG_UNEXPECTED_CHAR:
+          break;
+        }
+
+      /* SEG_END needs the previous type; then step past the segment. */
+      cs->prev_segment = type;
+      input += consumed;
+      n -= consumed;
+    }
+
+  /* Keep whatever was not consumed for the next call. */
+  ds_assign_substring (&cs->input, ss_buffer (input, n));
+}
+
+/* Adds the N bytes of UTF-8 encoded syntax INPUT to CS.  More may follow. */
+void
+command_segmenter_push (struct command_segmenter *cs,
+                        const char *input, size_t n)
+{
+  command_segmenter_push__ (cs, input, n, false);   /* false: not EOF yet. */
+}
+
+/* Tells CS that no more input is coming, so that it can finish any command
+   still in progress.  Don't call command_segmenter_push() after this. */
+void
+command_segmenter_eof (struct command_segmenter *cs)
+{
+  command_segmenter_push__ (cs, "", 0, true);   /* No new bytes, EOF set. */
+}
+
+/* Gets the line bounds of the next command in CS's input.  If one is
+   available, stores its first line in LINES[0] and one past its last line in
+   LINES[1], both zero-based, and returns true.  Otherwise returns false.
+
+   Bounds can become available as early as the first call to
+   command_segmenter_push(), but they often lag a little behind the input
+   because lookahead is needed.  After command_segmenter_eof(), every
+   command's bounds are available.
+
+   Bounds come out in order and never overlap.  Lines outside any command,
+   such as blank lines, are skipped, so empty or all-blank input yields none. */
+bool
+command_segmenter_get (struct command_segmenter *cs, int lines[2])
+{
+  bool have_command = !deque_is_empty (&cs->deque);
+  if (have_command)
+    {
+      const struct lines bounds = cs->lines[deque_pop_front (&cs->deque)];
+      lines[0] = bounds.first;
+      lines[1] = bounds.last;
+    }
+  return have_command;
+}
diff --git a/src/language/lexer/command-segmenter.h b/src/language/lexer/command-segmenter.h
new file mode 100644
index 0000000000..52c2b025a7
--- /dev/null
+++ b/src/language/lexer/command-segmenter.h
@@ -0,0 +1,44 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef COMMAND_SEGMENTER_H
+#define COMMAND_SEGMENTER_H 1
+
+#include "language/lexer/segment.h"
+
+/* Divides syntax lines into individual commands.
+
+   This is for use by the GUI, which has a feature to run an individual command
+   in a syntax window.
+
+   This groups together some kinds of commands that the PSPP tokenizer would
+   put T_ENDCMD inside.  For example, it always considers BEGIN DATA...END DATA
+   as a single command, even though the tokenizer will emit T_ENDCMD after
+   BEGIN DATA if it has a command terminator.  That's because it's the behavior
+   most useful for the GUI feature.
+*/
+
+struct command_segmenter;
+
+struct command_segmenter *command_segmenter_create (enum segmenter_mode);
+void command_segmenter_destroy (struct command_segmenter *);
+
+void command_segmenter_push (struct command_segmenter *,
+                             const char *input, size_t n);
+void command_segmenter_eof (struct command_segmenter *);
+bool command_segmenter_get (struct command_segmenter *, int lines[2]);
+
+#endif /* command-segmenter.h */
diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c
index e4fe405d47..43f79df4ee 100644
--- a/src/language/lexer/scan.c
+++ b/src/language/lexer/scan.c
@@ -372,6 +372,9 @@ token_from_segment (enum segment_type type, struct substring s,
     case SEG_START_COMMAND:
     case SEG_SEPARATE_COMMANDS:
     case SEG_END_COMMAND:
+    case SEG_INNER_START_COMMAND:
+    case SEG_INNER_SEPARATE_COMMANDS:
+    case SEG_INNER_END_COMMAND:
       *token = (struct token) { .type = T_ENDCMD };
       return TOKENIZE_TOKEN;
 
diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c
index a7bce8b6e7..58d9af5031 100644
--- a/src/language/lexer/segment.c
+++ b/src/language/lexer/segment.c
@@ -688,6 +688,9 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
         case SEG_START_COMMAND:
         case SEG_SEPARATE_COMMANDS:
         case SEG_END_COMMAND:
+        case SEG_INNER_START_COMMAND:
+        case SEG_INNER_SEPARATE_COMMANDS:
+        case SEG_INNER_END_COMMAND:
         case SEG_END:
         case SEG_EXPECTED_QUOTE:
         case SEG_EXPECTED_EXPONENT:
@@ -800,8 +803,9 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
             return -1;
           else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
             {
-              int eol;
-
+              /* We've found BEGIN DATA.  Check whether that's the entire
+                 command (either followed by a new-line or by '.' then a
+                 new-line). */
               ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
               if (ofs2 < 0)
                 return -1;
@@ -815,11 +819,14 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
                     return -1;
                 }
 
-              eol = is_end_of_line (input, n, eof, ofs2);
+              int eol = is_end_of_line (input, n, eof, ofs2);
               if (eol < 0)
                 return -1;
               else if (eol)
                 {
+                  /* BEGIN DATA is indeed the entire command.  We choose a next
+                     state depending on whether it's one line long or two lines
+                     long. */
                   if (memchr (input, '\n', ofs2))
                     s->state = S_BEGIN_DATA_1;
                   else
@@ -1229,14 +1236,30 @@ segmenter_parse_start_of_line__ (struct segmenter *s,
               return 1;
             }
         }
-      /* Fall through. */
+      *type = SEG_START_COMMAND;
+      s->substate = SS_START_OF_COMMAND;
+      return 1;
 
     case '-':
-    case '.':
       *type = SEG_START_COMMAND;
       s->substate = SS_START_OF_COMMAND;
       return 1;
 
+    case '.':
+      /* '.' at the beginning of a line.  If more text follows on the line, it
+         starts a new command, as '+' or '-' in the leftmost column does;
+         otherwise it ends the previous command.  Only command_segmenter cares
+         which, to decide whether the line is part of the previous command.
+         (Braces needed: before C23 a label cannot precede a declaration.) */
+      {
+        int eol = at_end_of_line (input, n, eof, 1);
+        if (eol < 0)
+          return -1;
+        *type = eol ? SEG_END_COMMAND : SEG_START_COMMAND;
+        s->substate = SS_START_OF_COMMAND;
+        return 1;
+      }
+
     default:
       if (lex_uc_is_space (uc))
         {
@@ -1409,12 +1432,16 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s,
     {
       /* We reached a blank line that separates the head from the body. */
       s->state = S_DO_REPEAT_2;
+      *type = SEG_INNER_SEPARATE_COMMANDS;
     }
   else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
     {
       /* We reached the body. */
       s->state = S_DO_REPEAT_3;
       s->substate = 1;
+      *type = (*type == SEG_END_COMMAND
+               ? SEG_INNER_END_COMMAND
+               : SEG_INNER_START_COMMAND);
     }
 
   return ofs;
@@ -1722,6 +1749,13 @@ segmenter_parse_define_5__ (struct segmenter *s,
   return ofs;
 }
 
+/* We're segmenting the first line of a two-line BEGIN DATA command.  Segment
+   up to the first new-line.
+
+   This BEGIN DATA is expressed something like this (weird, but legal):
+
+      BEGIN
+       DATA. */
 static int
 segmenter_parse_begin_data_1__ (struct segmenter *s,
                                 const char *input, size_t n, bool eof,
@@ -1737,6 +1771,8 @@ segmenter_parse_begin_data_1__ (struct segmenter *s,
   return ofs;
 }
 
+/* We're segmenting a one-line BEGIN DATA command, or the second line of a
+   two-line BEGIN DATA command.  Segment up to the new-line. */
 static int
 segmenter_parse_begin_data_2__ (struct segmenter *s,
                                 const char *input, size_t n, bool eof,
@@ -1748,6 +1784,8 @@ segmenter_parse_begin_data_2__ (struct segmenter *s,
 
   if (*type == SEG_NEWLINE)
     s->state = S_BEGIN_DATA_3;
+  else if (*type == SEG_END_COMMAND)
+    *type = SEG_INNER_END_COMMAND;
 
   return ofs;
 }
diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h
index d5f846a900..29bac670c2 100644
--- a/src/language/lexer/segment.h
+++ b/src/language/lexer/segment.h
@@ -88,6 +88,9 @@ enum segmenter_mode
     SEG_TYPE(START_COMMAND)                     \
     SEG_TYPE(SEPARATE_COMMANDS)                 \
     SEG_TYPE(END_COMMAND)                       \
+    SEG_TYPE(INNER_START_COMMAND)               \
+    SEG_TYPE(INNER_SEPARATE_COMMANDS)           \
+    SEG_TYPE(INNER_END_COMMAND)                 \
     SEG_TYPE(END)                               \
                                                 \
     SEG_TYPE(EXPECTED_QUOTE)                    \
diff --git a/tests/automake.mk b/tests/automake.mk
index aa79c2a482..43c2f89078 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -206,6 +206,7 @@ check_PROGRAMS += tests/language/lexer/segment-test
 tests_language_lexer_segment_test_SOURCES = \
 	src/data/identifier.c \
 	src/language/lexer/command-name.c \
+	src/language/lexer/command-segmenter.c \
 	src/language/lexer/segment.c \
 	tests/language/lexer/segment-test.c
 tests_language_lexer_segment_test_CFLAGS = $(AM_CFLAGS)
diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c
index 5977e8fce6..3f5a579d5a 100644
--- a/tests/language/lexer/segment-test.c
+++ b/tests/language/lexer/segment-test.c
@@ -29,6 +29,7 @@
 #include "libpspp/cast.h"
 #include "libpspp/compiler.h"
 #include "libpspp/misc.h"
+#include "language/lexer/command-segmenter.h"
 #include "language/lexer/segment.h"
 
 #include "gl/error.h"
@@ -50,6 +51,9 @@ static bool one_byte;
 /* -0, --truncations: Check that every truncation of input yields a result. */
 static bool check_truncations;
 
+/* -c, --commands: Print segmentation of input into commands. */
+static bool commands;
+
 /* -s, --strip-trailing-newline: Strip trailing newline from last line of
     input. */
 static bool strip_trailing_newline;
@@ -59,6 +63,7 @@ static void usage (void) NO_RETURN;
 
 static void check_segmentation (const char *input, size_t length,
                                 bool print_segments);
+static void check_commands (const char *input, size_t length);
 
 int
 main (int argc, char *argv[])
@@ -72,8 +77,7 @@ main (int argc, char *argv[])
 
   setvbuf (stdout, NULL, _IONBF, 0);
 
-  /* Read from stdin into 'input'.  Ensure that 'input' ends in a new-line
-     followed by a null byte. */
+  /* Read syntax into 'input'. */
   input = (!strcmp (file_name, "-")
            ? fread_file (stdin, 0, &length)
            : read_file (file_name, 0, &length));
@@ -87,9 +91,7 @@ main (int argc, char *argv[])
         length--;
     }
 
-  if (!check_truncations)
-    check_segmentation (input, length, true);
-  else
+  if (check_truncations)
     {
       size_t test_len;
 
@@ -100,11 +102,54 @@ main (int argc, char *argv[])
           free (copy);
         }
     }
+  else if (commands)
+    check_commands (input, length);
+  else
+    check_segmentation (input, length, true);
+
   free (input);
 
   return 0;
 }
 
+/* Prints 0-based line LINE of INPUT, reading at most LENGTH bytes of it. */
+static void
+print_line (const char *input, size_t length, int line)
+{
+  for (int i = 0; i < line; i++)
+    {
+      const char *newline = memchr (input, '\n', length);
+      size_t skip = newline ? (size_t) (newline - input) + 1 : length;
+      input += skip;
+      length -= skip;
+    }
+  const char *end = memchr (input, '\n', length);
+  printf ("%.*s\n", (int) (end ? (size_t) (end - input) : length), input);
+}
+
+/* Divides the LENGTH bytes at INPUT into commands and prints the lines of
+   each command, with a "-----" line between consecutive commands. */
+static void
+check_commands (const char *input, size_t length)
+{
+  struct command_segmenter *segmenter = command_segmenter_create (mode);
+  command_segmenter_push (segmenter, input, length);
+  command_segmenter_eof (segmenter);
+
+  int prev_end = -1;
+  for (int bounds[2]; command_segmenter_get (segmenter, bounds);
+       prev_end = bounds[1])
+    {
+      assert (prev_end == -1 || bounds[0] >= prev_end);
+      assert (bounds[0] < bounds[1]);
+      if (prev_end != -1)
+        printf ("-----\n");
+      for (int i = bounds[0]; i < bounds[1]; i++)
+        print_line (input, length, i);
+    }
+  command_segmenter_destroy (segmenter);
+}
+
 static void
 check_segmentation (const char *input, size_t length, bool print_segments)
 {
@@ -300,12 +345,13 @@ parse_options (int argc, char **argv)
           {"auto", no_argument, NULL, 'a'},
           {"batch", no_argument, NULL, 'b'},
           {"interactive", no_argument, NULL, 'i'},
+          {"commands", no_argument, NULL, 'c'},
           {"verbose", no_argument, NULL, 'v'},
           {"help", no_argument, NULL, 'h'},
           {NULL, 0, NULL, 0},
         };
 
-      int c = getopt_long (argc, argv, "01abivhs", options, NULL);
+      int c = getopt_long (argc, argv, "01abivhsc", options, NULL);
       if (c == -1)
         break;
 
@@ -335,6 +381,10 @@ parse_options (int argc, char **argv)
           mode = SEG_MODE_INTERACTIVE;
           break;
 
+        case 'c':
+          commands = true;
+          break;
+
         case 'v':
           verbose = true;
           break;
@@ -368,9 +418,12 @@ usage (void)
 %s, to test breaking PSPP syntax into lexical segments\n\
 usage: %s [OPTIONS] INPUT\n\
 \n\
+By default, print segmentation of input into PSPP syntax units. Other modes:\n\
+  -0, --truncations   check null truncation of each prefix of input\n\
+  -c, --commands      print segmentation into PSPP commands\n\
+\n\
 Options:\n\
   -1, --one-byte      feed one byte at a time\n\
-  -0, --truncations   check null truncation of each prefix of input\n\
   -s, --strip-trailing-newline  remove newline from end of input\n\
   -a, --auto          use \"auto\" syntax mode\n\
   -b, --batch         use \"batch\" syntax mode\n\
diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at
index abbc08c8cd..0d49147ada 100644
--- a/tests/language/lexer/segment.at
+++ b/tests/language/lexer/segment.at
@@ -178,10 +178,10 @@ identifier      #abcd
 end_command     .
 newline         \n (first)
 
-start_command   .
+end_command     .
 newline         \n (first)
 
-start_command   .    space
+end_command     .    space
 newline         \n (first)
 
 identifier      LMNOP
@@ -610,7 +610,8 @@ AT_CLEANUP
 AT_SETUP([* and COMMENT commands])
 AT_KEYWORDS([segment])
 AT_DATA([input], [dnl
-* Comment commands "don't
+* Comment commands "don't dnl "
+
 have to contain valid tokens.
 
 ** Check ambiguity with ** token.
@@ -626,9 +627,31 @@ com is ambiguous with COMPUTE.
 
 next command.
 
+])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+* Comment commands "don't dnl "
+
+have to contain valid tokens.
+-----
+** Check ambiguity with ** token.
+-----
+****************.
+-----
+comment keyword works too.
+-----
+COMM also.
+-----
+com is ambiguous with COMPUTE.
+-----
+   * Comment need not start at left margin.
+-----
+* Comment ends with blank line
+-----
+next command.
 ])
 AT_DATA([expout-base], [dnl
-comment_command *_Comment_commands_"don't
+comment_command *_Comment_commands_"don't dnl "
+
 newline         \n (COMMENT)
 
 comment_command have_to_contain_valid_tokens
@@ -707,6 +730,20 @@ docu
 first.paragraph
 isn't parsed as tokens
 
+second paragraph.
+])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+DOCUMENT one line.
+-----
+DOC more
+    than
+        one
+            line.
+-----
+docu
+first.paragraph
+isn't parsed as tokens
+
 second paragraph.
 ])
 AT_DATA([expout-base], [dnl
@@ -763,6 +800,15 @@ FILE
 FILE /*
 /**/  lab not quoted here either
 
+])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+FIL label isn't quoted.
+-----
+FILE
+  lab 'is quoted'.
+-----
+FILE /*
+/**/  lab not quoted here either
 ])
 AT_DATA([expout-base], [dnl
 identifier      FIL    space
@@ -825,10 +871,37 @@ begin data "xxx".
 begin data 123.
 not data
 ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+begin data.
+end data.
+-----
+begin data. /*
+123
+xxx
+end data.
+-----
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end  data
+end data
+.
+-----
+begin
+ data.
+data
+end data.
+-----
+begin data "xxx".
+-----
+begin data 123.
+-----
+not data
+])
 AT_DATA([expout-base], [dnl
 identifier      begin    space
 identifier      data
-end_command     .
+inner_end_command .
 newline         \n (data)
 
 identifier      end    space
@@ -841,7 +914,7 @@ newline         \n (first)
 
 identifier      begin    space
 identifier      data
-end_command     .    space
+inner_end_command .    space
 comment         /*
 newline         \n (data)
 
@@ -878,7 +951,7 @@ identifier      end    space
 identifier      data
 newline         \n (later)
 
-start_command   .
+end_command     .
 newline         \n (first)
 
 separate_commands
@@ -888,7 +961,7 @@ identifier      begin
 newline         \n (later)
     space
 identifier      data
-end_command     .
+inner_end_command .
 newline         \n (data)
 
 inline_data     data
@@ -940,6 +1013,22 @@ do
   inner command.
 end repeat.
 ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+do repeat x=a b c
+          y=d e f.
+  do repeat a=1 thru 5.
+another command.
+second command
++ third command.
+end /* x */ /* y */ repeat print.
+end
+ repeat.
+-----
+do
+  repeat #a=1.
+  inner command.
+end repeat.
+])
 AT_DATA([expout-base], [dnl
 identifier      do    space
 identifier      repeat    space
@@ -956,7 +1045,7 @@ punct           =
 identifier      d    space
 identifier      e    space
 identifier      f
-end_command     .
+inner_end_command .
 newline         \n (DO REPEAT)
 
 do_repeat_command __do_repeat_a=1_thru_5.
@@ -989,7 +1078,7 @@ identifier      repeat    space
 identifier      #a
 punct           =
 number          1
-end_command     .
+inner_end_command .
 newline         \n (DO REPEAT)
 
 do_repeat_command __inner_command.
@@ -1023,6 +1112,23 @@ do
   inner command
 end repeat
 ])
+AT_CHECK([segment-test -c -b input], [0], [dnl
+do repeat x=a b c
+          y=d e f
+do repeat a=1 thru 5
+another command
+second command
++ third command
+end /* x */ /* y */ repeat print
+end
+ repeat
+-----
+do
+  repeat #a=1
+
+  inner command
+end repeat
+])
 AT_DATA([expout-base], [dnl
 identifier      do    space
 identifier      repeat    space
@@ -1041,7 +1147,7 @@ identifier      e    space
 identifier      f
 newline         \n (later)
 
-start_command
+inner_start_command
 do_repeat_command do_repeat_a=1_thru_5
 newline         \n (DO REPEAT)
 
@@ -1074,7 +1180,7 @@ punct           =
 number          1
 newline         \n (later)
 
-separate_commands
+inner_separate_commands
 newline         \n (DO REPEAT)
 
 do_repeat_command __inner_command
@@ -1096,6 +1202,7 @@ define !macro1()
 var1 var2 var3 "!enddefine"
 !enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1122,6 +1229,7 @@ AT_DATA([input], [dnl
 define !macro1() var1 var2 var3 /* !enddefine
 !enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1145,6 +1253,7 @@ AT_DATA([input], [dnl
 define !macro1()
 var1 var2 var3!enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1168,6 +1277,7 @@ AT_KEYWORDS([segment])
 AT_DATA([input], [dnl
 define !macro1()var1 var2 var3!enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1189,6 +1299,7 @@ AT_DATA([input], [dnl
 define !macro1()
 !enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1214,6 +1325,7 @@ define !macro1()
 
 !enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1243,6 +1355,7 @@ AT_DATA([input], [dnl
 define !macro1(a(), b(), c())
 !enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1281,6 +1394,7 @@ define !macro1(
 )
 !enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1330,6 +1444,7 @@ content 1
 content 2
 !enddefine.
 ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1368,6 +1483,11 @@ AT_DATA([input], [dnl
 define !macro1.
 data list /x 1.
 ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+define !macro1.
+-----
+data list /x 1.
+])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1394,6 +1514,12 @@ define !macro1
 x.
 data list /x 1.
 ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+define !macro1
+x.
+-----
+data list /x 1.
+])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1423,6 +1549,13 @@ define !macro1(.
 x.
 data list /x 1.
 ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+define !macro1@{:@.
+-----
+x.
+-----
+data list /x 1.
+])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1455,6 +1588,11 @@ dnl which should not be there and ends it early.
 define !macro1.
 data list /x 1.
 ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+define !macro1.
+-----
+data list /x 1.
+])
 AT_DATA([expout-base], [dnl
 identifier      define    space
 macro_name      !macro1
@@ -1643,11 +1781,49 @@ end
 ])
 PSPP_CHECK_SEGMENT([-a])
 AT_CLEANUP
+
+AT_SETUP([empty input])
+AT_KEYWORDS([segment])
+: > input
+AT_DATA([expout-base], [dnl
+end
+])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
+PSPP_CHECK_SEGMENT
+AT_CLEANUP
+
+AT_SETUP([blank lines input])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+
+
+
+
+])
+AT_DATA([expout-base], [dnl
+separate_commands
+newline         \n (first)
+
+separate_commands
+newline         \n (first)
+
+separate_commands
+newline         \n (first)
+
+-separate_commands
+-newline         \n (first)
+-
+end
+])
+AT_CHECK([segment-test -c -i input])
+PSPP_CHECK_SEGMENT
+AT_CLEANUP
 
 # This checks for regression against bug #61253.  To see the read of
 # uninitialized data, run with valgrind.  The test will pass either
 # way.  (The bug report has a more complicated crashing case.)
 AT_SETUP([input ends in carriage return])
+AT_KEYWORDS([segment])
 printf '\r' > input
 AT_DATA([expout-base], [dnl
 separate_commands
-- 
2.30.2