command-segmenter: New library for dividing syntax into individual commands.

author Ben Pfaff <blp@cs.stanford.edu>

Mon, 29 May 2023 20:09:09 +0000 (13:09 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Mon, 29 May 2023 20:23:18 +0000 (13:23 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 29 May 2023 20:09:09 +0000 (13:09 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 29 May 2023 20:23:18 +0000 (13:23 -0700)
diff --git a/src/language/lexer/automake.mk b/src/language/lexer/automake.mk

index 01b3df49c6cb62745a2df2902110a9f652766777..29f0a637f70c91bd4f8aa668dd72afb8d0c4c777 100644 (file)
--- a/src/language/lexer/automake.mk
+++ b/src/language/lexer/automake.mk
@@ -20,6 +20,8 @@
  language_lexer_sources = \
         src/language/lexer/command-name.c \
         src/language/lexer/command-name.h \
+       src/language/lexer/command-segmenter.c \
+       src/language/lexer/command-segmenter.h \
         src/language/lexer/include-path.c \
         src/language/lexer/include-path.h \
         src/language/lexer/lexer.c \
diff --git a/src/language/lexer/command-segmenter.c b/src/language/lexer/command-segmenter.c

new file mode 100644 (file)

index 0000000..8d1036d
--- /dev/null
+++ b/src/language/lexer/command-segmenter.c
@@ -0,0 +1,205 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "language/lexer/command-segmenter.h"
+
+#include "language/lexer/segment.h"
+#include "libpspp/deque.h"
+#include "libpspp/str.h"
+
+struct lines                    /* Range of lines that one command occupies. */
+  {
+    int first;                  /* First line of the command, counting from 0. */
+    int last;                   /* One past the last line of the command. */
+  };
+
+struct command_segmenter   /* State for dividing syntax into commands. */
+  {
+    struct segmenter segmenter; /* Underlying lexical segmenter. */
+    struct string input;        /* Input received but not yet segmented. */
+
+    int command_first_line;     /* First line of the command in progress. */
+    int line;                   /* Number of new-lines segmented so far. */
+    enum segment_type prev_segment; /* Type of the most recent segment. */
+
+    struct deque deque;         /* Queue bookkeeping for 'lines'. */
+    struct lines *lines;        /* Finished commands not yet fetched. */
+  };
+
+/* Allocates a command segmenter that interprets its input according to syntax
+   MODE and returns it.  The caller owns the result and should eventually
+   pass it to command_segmenter_destroy(). */
+struct command_segmenter *
+command_segmenter_create (enum segmenter_mode mode)
+{
+  /* 'prev_segment' starts out as SEG_NEWLINE so that, if the input turns out
+     to be empty, end of input is not counted as a partial final line.
+     Members not named here, including the line counters, start out zero. */
+  const struct command_segmenter initial = {
+    .deque = DEQUE_EMPTY_INITIALIZER,
+    .prev_segment = SEG_NEWLINE,
+    .input = DS_EMPTY_INITIALIZER,
+    .segmenter = segmenter_init (mode, false),
+  };
+
+  struct command_segmenter *new_cs = xmalloc (sizeof *new_cs);
+  *new_cs = initial;
+  return new_cs;
+}
+
+/* Frees CS, along with the input and the command bounds that it has buffered.
+   Does nothing if CS is null. */
+void
+command_segmenter_destroy (struct command_segmenter *cs)
+{
+  if (!cs)
+    return;
+
+  free (cs->lines);
+  ds_destroy (&cs->input);
+  free (cs);
+}
+
+/* Appends the range of lines from FIRST up to, but not including, LAST to
+   CS's queue of finished commands.  An empty range is silently dropped. */
+static void
+emit (struct command_segmenter *cs, int first, int last)
+{
+  if (first >= last)
+    return;
+
+  /* Make room before claiming a slot at the back of the queue. */
+  if (deque_is_full (&cs->deque))
+    cs->lines = deque_expand (&cs->deque, cs->lines, sizeof *cs->lines);
+
+  struct lines *slot = &cs->lines[deque_push_back (&cs->deque)];
+  slot->first = first;
+  slot->last = last;
+}
+
+static void
+command_segmenter_push__ (struct command_segmenter *cs,
+                          const char *input, size_t n, bool eof)
+{ /* Segments the N bytes at INPUT, preceded by any input left over from
+     earlier calls, and queues the range of lines of each command that this
+     completes.  EOF is passed along to the segmenter; the callers set it only
+     when no more input will follow. */
+  if (!ds_is_empty (&cs->input))
+    { /* Leftover input from earlier calls goes ahead of the new input. */
+      ds_put_substring (&cs->input, ss_buffer (input, n));
+      input = ds_cstr (&cs->input);
+      n = ds_length (&cs->input);
+    }
+
+  for (;;)
+    {
+      enum segment_type type;
+      int retval = segmenter_push (&cs->segmenter, input, n, eof, &type);
+      if (retval < 0)
+        break;                  /* Presumably more input is needed first. */
+
+      switch (type)
+        {
+        case SEG_NUMBER:
+        case SEG_QUOTED_STRING:
+        case SEG_HEX_STRING:
+        case SEG_UNICODE_STRING:
+        case SEG_UNQUOTED_STRING:
+        case SEG_RESERVED_WORD:
+        case SEG_IDENTIFIER:
+        case SEG_PUNCT:
+        case SEG_SHBANG:
+        case SEG_SPACES:
+        case SEG_COMMENT:
+        case SEG_COMMENT_COMMAND:
+        case SEG_DO_REPEAT_COMMAND:
+        case SEG_INLINE_DATA:
+        case SEG_INNER_START_COMMAND:
+        case SEG_INNER_SEPARATE_COMMANDS:
+        case SEG_INNER_END_COMMAND:
+        case SEG_MACRO_ID:
+        case SEG_MACRO_NAME:
+        case SEG_MACRO_BODY:
+        case SEG_START_DOCUMENT:
+        case SEG_DOCUMENT:
+        case SEG_EXPECTED_QUOTE:
+        case SEG_EXPECTED_EXPONENT:
+        case SEG_UNEXPECTED_CHAR:
+          break;                /* No effect on command boundaries. */
+
+        case SEG_NEWLINE:
+          cs->line++;           /* The current line is now the next one. */
+          break;
+
+        case SEG_START_COMMAND: /* Current line begins a new command. */
+          if (cs->line > cs->command_first_line)
+            emit (cs, cs->command_first_line, cs->line);
+          cs->command_first_line = cs->line;
+          break;
+
+        case SEG_SEPARATE_COMMANDS: /* Current line is in neither command. */
+          if (cs->line > cs->command_first_line)
+            emit (cs, cs->command_first_line, cs->line);
+          cs->command_first_line = cs->line + 1;
+          break;
+
+        case SEG_END_COMMAND:   /* Current line is the command's last line. */
+          emit (cs, cs->command_first_line, cs->line + 1);
+          cs->command_first_line = cs->line + 1;
+          break;
+
+        case SEG_END:           /* Include a last line that lacks a new-line. */
+          emit (cs, cs->command_first_line, cs->line + (cs->prev_segment != SEG_NEWLINE));
+          break;
+        }
+
+      cs->prev_segment = type;
+      input += retval;
+      n -= retval;
+      if (type == SEG_END)
+        break;
+    }
+  /* Save whatever was not consumed so that the next call can resume there. */
+  ds_assign_substring (&cs->input, ss_buffer (input, n));
+}
+
+/* Adds the N bytes of UTF-8 encoded syntax INPUT to CS.  Any trailing part of
+   INPUT that can't be segmented yet is saved in CS and processed together
+   with the input passed to the next call. */
+void
+command_segmenter_push (struct command_segmenter *cs,
+                        const char *input, size_t n)
+{
+  command_segmenter_push__ (cs, input, n, false);
+}
+
+/* Tells CS that no more input is coming.  Afterward, the bounds of every
+   command in the input, including the last one, can be fetched with
+   command_segmenter_get().  The caller shouldn't call
+   command_segmenter_push() again. */
+void
+command_segmenter_eof (struct command_segmenter *cs)
+{
+  command_segmenter_push__ (cs, "", 0, true);
+}
+
+/* Fetches the next pair of line numbers that bound a command in the input
+   given to CS.  If one is available, stores the command's first line in
+   LINES[0] and one past its last line in LINES[1], and returns true.
+   Otherwise, returns false and leaves LINES alone.
+
+   Line numbers count from 0.
+
+   Command bounds can become available as early as the first call to
+   command_segmenter_push(), but the output often lags a little behind the
+   input because some lookahead is needed.  Once command_segmenter_eof() has
+   been called, all of the output is available.
+
+   Command bounds are always reported in order and commands never overlap.
+   Some lines, such as blank lines, might not be part of any command.  Empty
+   input, or input that consists only of blank lines, contains no
+   commands. */
+bool
+command_segmenter_get (struct command_segmenter *cs, int lines[2])
+{
+  if (!deque_is_empty (&cs->deque))
+    {
+      const struct lines *next = &cs->lines[deque_pop_front (&cs->deque)];
+      lines[0] = next->first;
+      lines[1] = next->last;
+      return true;
+    }
+
+  return false;
+}
diff --git a/src/language/lexer/command-segmenter.h b/src/language/lexer/command-segmenter.h

new file mode 100644 (file)

index 0000000..52c2b02
--- /dev/null
+++ b/src/language/lexer/command-segmenter.h
@@ -0,0 +1,44 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef COMMAND_SEGMENTER_H
+#define COMMAND_SEGMENTER_H 1
+
+#include "language/lexer/segment.h"
+
+/* Divides syntax lines into individual commands.
+
+   This is for use by the GUI, which has a feature to run an individual command
+   in a syntax window.
+
+   This groups together some kinds of commands that the PSPP tokenizer would
+   put T_ENDCMD inside.  For example, it always considers BEGIN DATA...END DATA
+   as a single command, even though the tokenizer will emit T_ENDCMD after
+   BEGIN DATA if it has a command terminator.  That's because it's the behavior
+   most useful for the GUI feature.
+
+   To use a command segmenter, create it with command_segmenter_create(), feed
+   it syntax with one or more calls to command_segmenter_push(), and then call
+   command_segmenter_eof().  Call command_segmenter_get() repeatedly, until it
+   returns false, to obtain the range of lines that each command occupies.
+   Finally, free it with command_segmenter_destroy().
+*/
+
+struct command_segmenter;
+
+struct command_segmenter *command_segmenter_create (enum segmenter_mode);
+void command_segmenter_destroy (struct command_segmenter *);
+
+void command_segmenter_push (struct command_segmenter *,
+                             const char *input, size_t n);
+void command_segmenter_eof (struct command_segmenter *);
+bool command_segmenter_get (struct command_segmenter *, int lines[2]);
+
+#endif /* command-segmenter.h */
diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c

index e4fe405d47c340963666604f3c43a4059fd2a456..43f79df4eeee73e4318b1d7c7412189f4562de2b 100644 (file)
--- a/src/language/lexer/scan.c
+++ b/src/language/lexer/scan.c
@@ -372,6 +372,9 @@ token_from_segment (enum segment_type type, struct substring s,
      case SEG_START_COMMAND:
      case SEG_SEPARATE_COMMANDS:
      case SEG_END_COMMAND:
+    case SEG_INNER_START_COMMAND:
+    case SEG_INNER_SEPARATE_COMMANDS:
+    case SEG_INNER_END_COMMAND:
        *token = (struct token) { .type = T_ENDCMD };
        return TOKENIZE_TOKEN;
  
diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c

index a7bce8b6e7b9ef96440e07e0cf89cdbedbc38eb1..58d9af503157797268e77e24398f91e702c76fdc 100644 (file)
--- a/src/language/lexer/segment.c
+++ b/src/language/lexer/segment.c
@@ -688,6 +688,9 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
          case SEG_START_COMMAND:
          case SEG_SEPARATE_COMMANDS:
          case SEG_END_COMMAND:
+        case SEG_INNER_START_COMMAND:
+        case SEG_INNER_SEPARATE_COMMANDS:
+        case SEG_INNER_END_COMMAND:
          case SEG_END:
          case SEG_EXPECTED_QUOTE:
          case SEG_EXPECTED_EXPONENT:
@@ -800,8 +803,9 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
              return -1;
            else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
              {
-              int eol;
-
+              /* We've found BEGIN DATA.  Check whether that's the entire
+                 command (either followed by a new-line or by '.' then a
+                 new-line). */
                ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
                if (ofs2 < 0)
                  return -1;
@@ -815,11 +819,14 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
                      return -1;
                  }
  
-              eol = is_end_of_line (input, n, eof, ofs2);
+              int eol = is_end_of_line (input, n, eof, ofs2);
                if (eol < 0)
                  return -1;
                else if (eol)
                  {
+                  /* BEGIN DATA is indeed the entire command.  We choose a next
+                     state depending on whether it's one line long or two lines
+                     long. */
                    if (memchr (input, '\n', ofs2))
                      s->state = S_BEGIN_DATA_1;
                    else
@@ -1229,14 +1236,30 @@ segmenter_parse_start_of_line__ (struct segmenter *s,
                return 1;
              }
          }
-      /* Fall through. */
+      *type = SEG_START_COMMAND;
+      s->substate = SS_START_OF_COMMAND;
+      return 1;
  
      case '-':
-    case '.':
        *type = SEG_START_COMMAND;
        s->substate = SS_START_OF_COMMAND;
        return 1;
  
+    case '.':
+      {
+        /* We've found '.' at the beginning of a line.  If there's more text
+           on the line, then it starts a new command, because '+' or '-' or
+           '.' in the leftmost column does that.  If the line is otherwise
+           blank, then it ends the previous command.  The difference only
+           matters for deciding whether the line is part of the previous
+           command in command_segmenter.
+
+           The braces are needed because, before C23, a declaration cannot
+           directly follow a 'case' label. */
+        int eol = at_end_of_line (input, n, eof, 1);
+        if (eol < 0)
+          return -1;
+
+        *type = eol ? SEG_END_COMMAND : SEG_START_COMMAND;
+        s->substate = SS_START_OF_COMMAND;
+        return 1;
+      }
+
      default:
        if (lex_uc_is_space (uc))
          {
@@ -1409,12 +1432,16 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s,
      {
        /* We reached a blank line that separates the head from the body. */
        s->state = S_DO_REPEAT_2;
+      *type = SEG_INNER_SEPARATE_COMMANDS;
      }
    else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
      {
        /* We reached the body. */
        s->state = S_DO_REPEAT_3;
        s->substate = 1;
+      *type = (*type == SEG_END_COMMAND
+               ? SEG_INNER_END_COMMAND
+               : SEG_INNER_START_COMMAND);
      }
  
    return ofs;
@@ -1722,6 +1749,13 @@ segmenter_parse_define_5__ (struct segmenter *s,
    return ofs;
  }
  
+/* We're segmenting the first line of a two-line BEGIN DATA command.  Segment
+   up to the first new-line.
+
+   This BEGIN DATA is expressed something like this (weird, but legal):
+
+      BEGIN
+       DATA. */
  static int
  segmenter_parse_begin_data_1__ (struct segmenter *s,
                                  const char *input, size_t n, bool eof,
@@ -1737,6 +1771,8 @@ segmenter_parse_begin_data_1__ (struct segmenter *s,
    return ofs;
  }
  
+/* We're segmenting a one-line BEGIN DATA command, or the second line of a
+   two-line BEGIN DATA command.  Segment up to the new-line. */
  static int
  segmenter_parse_begin_data_2__ (struct segmenter *s,
                                  const char *input, size_t n, bool eof,
@@ -1748,6 +1784,8 @@ segmenter_parse_begin_data_2__ (struct segmenter *s,
  
    if (*type == SEG_NEWLINE)
      s->state = S_BEGIN_DATA_3;
+  else if (*type == SEG_END_COMMAND)
+    *type = SEG_INNER_END_COMMAND;
  
    return ofs;
  }
diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h

index d5f846a900db8b55d463e07996232a89e6401b39..29bac670c29f14f3064e9fbe5c5d2c2757de640a 100644 (file)
--- a/src/language/lexer/segment.h
+++ b/src/language/lexer/segment.h
@@ -88,6 +88,9 @@ enum segmenter_mode
      SEG_TYPE(START_COMMAND)                     \
      SEG_TYPE(SEPARATE_COMMANDS)                 \
      SEG_TYPE(END_COMMAND)                       \
+    SEG_TYPE(INNER_START_COMMAND)               \
+    SEG_TYPE(INNER_SEPARATE_COMMANDS)           \
+    SEG_TYPE(INNER_END_COMMAND)                 \
      SEG_TYPE(END)                               \
                                                  \
      SEG_TYPE(EXPECTED_QUOTE)                    \
diff --git a/tests/automake.mk b/tests/automake.mk

index aa79c2a482be381f87dc96af1c09fe1faf8464bc..43c2f890786968aec4887e0cadd73f4f03342596 100644 (file)
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -206,6 +206,7 @@ check_PROGRAMS += tests/language/lexer/segment-test
  tests_language_lexer_segment_test_SOURCES = \
         src/data/identifier.c \
         src/language/lexer/command-name.c \
+       src/language/lexer/command-segmenter.c \
         src/language/lexer/segment.c \
         tests/language/lexer/segment-test.c
  tests_language_lexer_segment_test_CFLAGS = $(AM_CFLAGS)
diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c

index 5977e8fce69236c868d4035f14515350d385afaf..3f5a579d5a12995209abee494279df5c8bd97e80 100644 (file)
--- a/tests/language/lexer/segment-test.c
+++ b/tests/language/lexer/segment-test.c
@@ -29,6 +29,7 @@
  #include "libpspp/cast.h"
  #include "libpspp/compiler.h"
  #include "libpspp/misc.h"
+#include "language/lexer/command-segmenter.h"
  #include "language/lexer/segment.h"
  
  #include "gl/error.h"
@@ -50,6 +51,9 @@ static bool one_byte;
  /* -0, --truncations: Check that every truncation of input yields a result. */
  static bool check_truncations;
  
+/* -c, --commands: Print segmentation of input into commands. */
+static bool commands;
+
  /* -s, --strip-trailing-newline: Strip trailing newline from last line of
      input. */
  static bool strip_trailing_newline;
@@ -59,6 +63,7 @@ static void usage (void) NO_RETURN;
  
  static void check_segmentation (const char *input, size_t length,
                                  bool print_segments);
+static void check_commands (const char *input, size_t length);
  
  int
  main (int argc, char *argv[])
@@ -72,8 +77,7 @@ main (int argc, char *argv[])
  
    setvbuf (stdout, NULL, _IONBF, 0);
  
-  /* Read from stdin into 'input'.  Ensure that 'input' ends in a new-line
-     followed by a null byte. */
+  /* Read syntax into 'input'. */
    input = (!strcmp (file_name, "-")
             ? fread_file (stdin, 0, &length)
             : read_file (file_name, 0, &length));
@@ -87,9 +91,7 @@ main (int argc, char *argv[])
          length--;
      }
  
-  if (!check_truncations)
-    check_segmentation (input, length, true);
-  else
+  if (check_truncations)
      {
        size_t test_len;
  
@@ -100,11 +102,54 @@ main (int argc, char *argv[])
            free (copy);
          }
      }
+  else if (commands)
+    check_commands (input, length);
+  else
+    check_segmentation (input, length, true);
+
    free (input);
  
    return 0;
  }
  
+/* Prints line number LINE, counting from 0, of the LENGTH bytes of syntax in
+   INPUT, followed by a new-line.  Prints an empty line if INPUT has fewer
+   lines than that. */
+static void
+print_line (const char *input, size_t length, int line)
+{
+  /* Skip the lines that precede LINE.  Only the LENGTH bytes that remain are
+     ever examined, so this doesn't rely on INPUT being null-terminated and
+     LENGTH can't wrap around if INPUT runs out of lines early. */
+  for (int i = 0; i < line; i++)
+    {
+      const char *newline = memchr (input, '\n', length);
+      size_t skip = newline ? newline - input + 1 : length;
+      input += skip;
+      length -= skip;
+    }
+
+  /* The line to print ends just before the next new-line or, if there is
+     none, at the end of the input. */
+  const char *end = memchr (input, '\n', length);
+  size_t line_len = end ? end - input : length;
+  printf ("%.*s\n", (int) line_len, input);
+}
+
+/* Divides the LENGTH bytes of syntax in INPUT into commands and prints the
+   lines of each command, with a line of dashes between one command and the
+   next. */
+static void
+check_commands (const char *input, size_t length)
+{
+  struct command_segmenter *cs = command_segmenter_create (mode);
+  command_segmenter_push (cs, input, length);
+  command_segmenter_eof (cs);
+
+  /* 'prev_end' is the end of the previous command, or -1 before the first
+     one. */
+  int prev_end = -1;
+  for (int bounds[2]; command_segmenter_get (cs, bounds); prev_end = bounds[1])
+    {
+      /* Commands must come in order, without overlap, and be nonempty. */
+      assert (prev_end == -1 || bounds[0] >= prev_end);
+      assert (bounds[0] < bounds[1]);
+
+      if (prev_end != -1)
+        printf ("-----\n");
+
+      for (int i = bounds[0]; i < bounds[1]; i++)
+        print_line (input, length, i);
+    }
+
+  command_segmenter_destroy (cs);
+}
+
  static void
  check_segmentation (const char *input, size_t length, bool print_segments)
  {
@@ -300,12 +345,13 @@ parse_options (int argc, char **argv)
            {"auto", no_argument, NULL, 'a'},
            {"batch", no_argument, NULL, 'b'},
            {"interactive", no_argument, NULL, 'i'},
+          {"commands", no_argument, NULL, 'c'},
            {"verbose", no_argument, NULL, 'v'},
            {"help", no_argument, NULL, 'h'},
            {NULL, 0, NULL, 0},
          };
  
-      int c = getopt_long (argc, argv, "01abivhs", options, NULL);
+      int c = getopt_long (argc, argv, "01abivhsc", options, NULL);
        if (c == -1)
          break;
  
@@ -335,6 +381,10 @@ parse_options (int argc, char **argv)
            mode = SEG_MODE_INTERACTIVE;
            break;
  
+        case 'c':
+          commands = true;
+          break;
+
          case 'v':
            verbose = true;
            break;
@@ -368,9 +418,12 @@ usage (void)
  %s, to test breaking PSPP syntax into lexical segments\n\
  usage: %s [OPTIONS] INPUT\n\
  \n\
+By default, print segmentation of input into PSPP syntax units. Other modes:\n\
+  -0, --truncations   check null truncation of each prefix of input\n\
+  -c, --commands      print segmentation into PSPP commands\n\
+\n\
  Options:\n\
    -1, --one-byte      feed one byte at a time\n\
-  -0, --truncations   check null truncation of each prefix of input\n\
    -s, --strip-trailing-newline  remove newline from end of input\n\
    -a, --auto          use \"auto\" syntax mode\n\
    -b, --batch         use \"batch\" syntax mode\n\
diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at

index abbc08c8cd4b4a037b7f7155ccc799b0cdde6593..0d49147ada5e27f459d35d7f9527e32b11d503e6 100644 (file)
--- a/tests/language/lexer/segment.at
+++ b/tests/language/lexer/segment.at
@@ -178,10 +178,10 @@ identifier      #abcd
  end_command     .
  newline         \n (first)
  
-start_command   .
+end_command     .
  newline         \n (first)
  
-start_command   .    space
+end_command     .    space
  newline         \n (first)
  
  identifier      LMNOP
@@ -610,7 +610,8 @@ AT_CLEANUP
  AT_SETUP([* and COMMENT commands])
  AT_KEYWORDS([segment])
  AT_DATA([input], [dnl
-* Comment commands "don't
+* Comment commands "don't dnl "
+
  have to contain valid tokens.
  
  ** Check ambiguity with ** token.
@@ -626,9 +627,31 @@ com is ambiguous with COMPUTE.
  
  next command.
  
+])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+* Comment commands "don't dnl "
+
+have to contain valid tokens.
+-----
+** Check ambiguity with ** token.
+-----
+****************.
+-----
+comment keyword works too.
+-----
+COMM also.
+-----
+com is ambiguous with COMPUTE.
+-----
+   * Comment need not start at left margin.
+-----
+* Comment ends with blank line
+-----
+next command.
  ])
  AT_DATA([expout-base], [dnl
-comment_command *_Comment_commands_"don't
+comment_command *_Comment_commands_"don't dnl "
+
  newline         \n (COMMENT)
  
  comment_command have_to_contain_valid_tokens
@@ -707,6 +730,20 @@ docu
  first.paragraph
  isn't parsed as tokens
  
+second paragraph.
+])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+DOCUMENT one line.
+-----
+DOC more
+    than
+        one
+            line.
+-----
+docu
+first.paragraph
+isn't parsed as tokens
+
  second paragraph.
  ])
  AT_DATA([expout-base], [dnl
@@ -763,6 +800,15 @@ FILE
  FILE /*
  /**/  lab not quoted here either
  
+])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+FIL label isn't quoted.
+-----
+FILE
+  lab 'is quoted'.
+-----
+FILE /*
+/**/  lab not quoted here either
  ])
  AT_DATA([expout-base], [dnl
  identifier      FIL    space
@@ -825,10 +871,37 @@ begin data "xxx".
  begin data 123.
  not data
  ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+begin data.
+end data.
+-----
+begin data. /*
+123
+xxx
+end data.
+-----
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end  data
+end data
+.
+-----
+begin
+ data.
+data
+end data.
+-----
+begin data "xxx".
+-----
+begin data 123.
+-----
+not data
+])
  AT_DATA([expout-base], [dnl
  identifier      begin    space
  identifier      data
-end_command     .
+inner_end_command .
  newline         \n (data)
  
  identifier      end    space
@@ -841,7 +914,7 @@ newline         \n (first)
  
  identifier      begin    space
  identifier      data
-end_command     .    space
+inner_end_command .    space
  comment         /*
  newline         \n (data)
  
@@ -878,7 +951,7 @@ identifier      end    space
  identifier      data
  newline         \n (later)
  
-start_command   .
+end_command     .
  newline         \n (first)
  
  separate_commands
@@ -888,7 +961,7 @@ identifier      begin
  newline         \n (later)
      space
  identifier      data
-end_command     .
+inner_end_command .
  newline         \n (data)
  
  inline_data     data
@@ -940,6 +1013,22 @@ do
    inner command.
  end repeat.
  ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+do repeat x=a b c
+          y=d e f.
+  do repeat a=1 thru 5.
+another command.
+second command
++ third command.
+end /* x */ /* y */ repeat print.
+end
+ repeat.
+-----
+do
+  repeat #a=1.
+  inner command.
+end repeat.
+])
  AT_DATA([expout-base], [dnl
  identifier      do    space
  identifier      repeat    space
@@ -956,7 +1045,7 @@ punct           =
  identifier      d    space
  identifier      e    space
  identifier      f
-end_command     .
+inner_end_command .
  newline         \n (DO REPEAT)
  
  do_repeat_command __do_repeat_a=1_thru_5.
@@ -989,7 +1078,7 @@ identifier      repeat    space
  identifier      #a
  punct           =
  number          1
-end_command     .
+inner_end_command .
  newline         \n (DO REPEAT)
  
  do_repeat_command __inner_command.
@@ -1023,6 +1112,23 @@ do
    inner command
  end repeat
  ])
+AT_CHECK([segment-test -c -b input], [0], [dnl
+do repeat x=a b c
+          y=d e f
+do repeat a=1 thru 5
+another command
+second command
++ third command
+end /* x */ /* y */ repeat print
+end
+ repeat
+-----
+do
+  repeat #a=1
+
+  inner command
+end repeat
+])
  AT_DATA([expout-base], [dnl
  identifier      do    space
  identifier      repeat    space
@@ -1041,7 +1147,7 @@ identifier      e    space
  identifier      f
  newline         \n (later)
  
-start_command
+inner_start_command
  do_repeat_command do_repeat_a=1_thru_5
  newline         \n (DO REPEAT)
  
@@ -1074,7 +1180,7 @@ punct           =
  number          1
  newline         \n (later)
  
-separate_commands
+inner_separate_commands
  newline         \n (DO REPEAT)
  
  do_repeat_command __inner_command
@@ -1096,6 +1202,7 @@ define !macro1()
  var1 var2 var3 "!enddefine"
  !enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1122,6 +1229,7 @@ AT_DATA([input], [dnl
  define !macro1() var1 var2 var3 /* !enddefine
  !enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1145,6 +1253,7 @@ AT_DATA([input], [dnl
  define !macro1()
  var1 var2 var3!enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1168,6 +1277,7 @@ AT_KEYWORDS([segment])
  AT_DATA([input], [dnl
  define !macro1()var1 var2 var3!enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1189,6 +1299,7 @@ AT_DATA([input], [dnl
  define !macro1()
  !enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1214,6 +1325,7 @@ define !macro1()
  
  !enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1243,6 +1355,7 @@ AT_DATA([input], [dnl
  define !macro1(a(), b(), c())
  !enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1281,6 +1394,7 @@ define !macro1(
  )
  !enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1330,6 +1444,7 @@ content 1
  content 2
  !enddefine.
  ])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1368,6 +1483,11 @@ AT_DATA([input], [dnl
  define !macro1.
  data list /x 1.
  ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+define !macro1.
+-----
+data list /x 1.
+])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1394,6 +1514,12 @@ define !macro1
  x.
  data list /x 1.
  ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+define !macro1
+x.
+-----
+data list /x 1.
+])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1423,6 +1549,13 @@ define !macro1(.
  x.
  data list /x 1.
  ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+define !macro1@{:@.
+-----
+x.
+-----
+data list /x 1.
+])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1455,6 +1588,11 @@ dnl which should not be there and ends it early.
  define !macro1.
  data list /x 1.
  ])
+AT_CHECK([segment-test -c -i input], [0], [dnl
+define !macro1.
+-----
+data list /x 1.
+])
  AT_DATA([expout-base], [dnl
  identifier      define    space
  macro_name      !macro1
@@ -1643,11 +1781,49 @@ end
  ])
  PSPP_CHECK_SEGMENT([-a])
  AT_CLEANUP
+
+AT_SETUP([empty input])
+AT_KEYWORDS([segment])
+: > input
+AT_DATA([expout-base], [dnl
+end
+])
+AT_CHECK([cp input expout && segment-test -c -i input], [0], [expout])
+PSPP_CHECK_SEGMENT
+AT_CLEANUP
+
+AT_SETUP([blank lines input])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+
+
+
+
+])
+AT_DATA([expout-base], [dnl
+separate_commands
+newline         \n (first)
+
+separate_commands
+newline         \n (first)
+
+separate_commands
+newline         \n (first)
+
+-separate_commands
+-newline         \n (first)
+-
+end
+])
+AT_CHECK([segment-test -c -i input])
+PSPP_CHECK_SEGMENT
+AT_CLEANUP
  \f
  # This checks for regression against bug #61253.  To see the read of
  # uninitialized data, run with valgrind.  The test will pass either
  # way.  (The bug report has a more complicated crashing case.)
  AT_SETUP([input ends in carriage return])
+AT_KEYWORDS([segment])
  printf '\r' > input
  AT_DATA([expout-base], [dnl
  separate_commands
author	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 29 May 2023 20:09:09 +0000 (13:09 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 29 May 2023 20:23:18 +0000 (13:23 -0700)
src/language/lexer/automake.mk		patch \| blob \| history
src/language/lexer/command-segmenter.c	[new file with mode: 0644]	patch \| blob
src/language/lexer/command-segmenter.h	[new file with mode: 0644]	patch \| blob
src/language/lexer/scan.c		patch \| blob \| history
src/language/lexer/segment.c		patch \| blob \| history
src/language/lexer/segment.h		patch \| blob \| history
tests/automake.mk		patch \| blob \| history
tests/language/lexer/segment-test.c		patch \| blob \| history
tests/language/lexer/segment.at		patch \| blob \| history