segment: Refine treatment of start of macro body.

[pspp] / src / language / lexer / segment.c
diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c

index c0a09973ce8059535ea7c1e487c5fa2016ff8134..a4fea0b213118559d474b94a2bc4efa4008ff0d0 100644 (file)
--- a/src/language/lexer/segment.c
+++ b/src/language/lexer/segment.c
@@ -42,6 +42,10 @@ enum segmenter_state
      S_DO_REPEAT_1,
      S_DO_REPEAT_2,
      S_DO_REPEAT_3,
+    S_DEFINE_1,
+    S_DEFINE_2,
+    S_DEFINE_3,
+    S_DEFINE_4,
      S_BEGIN_DATA_1,
      S_BEGIN_DATA_2,
      S_BEGIN_DATA_3,
@@ -92,21 +96,26 @@ segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
          {
            if (input[1] == '!')
              {
-              int ofs;
-
-              for (ofs = 2; ofs < n; ofs++)
-                if (input[ofs] == '\n')
-                  {
-                    if (input[ofs] == '\n' && input[ofs - 1] == '\r')
-                      ofs--;
-
-                    s->state = S_GENERAL;
-                    s->substate = SS_START_OF_COMMAND;
-                    *type = SEG_SHBANG;
-                    return ofs;
-                  }
+              for (int ofs = 2; ; ofs++)
+                {
+                  if (ofs >= n)
+                    {
+                      if (!eof)
+                        return -1;
+                    }
+                  else if (input[ofs] == '\n')
+                    {
+                      if (input[ofs - 1] == '\r')
+                        ofs--;
+                    }
+                  else
+                    continue;
  
-              return eof ? ofs : -1;
+                  s->state = S_GENERAL;
+                  s->substate = SS_START_OF_COMMAND;
+                  *type = SEG_SHBANG;
+                  return ofs;
+                }
              }
          }
        else if (!eof)
@@ -209,6 +218,22 @@ at_end_of_line (const char *input, size_t n, bool eof, int ofs)
    return is_end_of_line (input, n, eof, ofs);
  }
  
+static bool
+is_all_spaces (const char *input_, size_t n)
+{
+  const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
+
+  int mblen;
+  for (int ofs = 0; ofs < n; ofs += mblen)
+    {
+      ucs4_t uc;
+      mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
+      if (!lex_uc_is_space (uc))
+        return false;
+    }
+  return true;
+}
+
  static int
  segmenter_parse_newline__ (const char *input, size_t n, bool eof,
                             enum segment_type *type)
@@ -281,20 +306,23 @@ segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
        if (!eof)
          return -1;
        goto number;
-    };
+    }
    if (input[ofs] == '.')
      {
+      if (ofs + 1 >= n)
+        {
+          if (!eof)
+            return -1;
+          goto number;
+        }
+
        ofs = skip_digits (input, n, eof, ofs + 1);
        if (ofs < 0)
          return -1;
+      else if (ofs >= n)
+        goto number;
      }
  
-  if (ofs >= n)
-    {
-      if (!eof)
-        return -1;
-      goto number;
-    }
    if (input[ofs] == 'e' || input[ofs] == 'E')
      {
        ofs++;
@@ -653,6 +681,8 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
          case SEG_COMMENT_COMMAND:
          case SEG_DO_REPEAT_COMMAND:
          case SEG_INLINE_DATA:
+        case SEG_MACRO_ID:
+        case SEG_MACRO_BODY:
          case SEG_START_DOCUMENT:
          case SEG_DOCUMENT:
          case SEG_START_COMMAND:
@@ -661,7 +691,6 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
          case SEG_END:
          case SEG_EXPECTED_QUOTE:
          case SEG_EXPECTED_EXPONENT:
-        case SEG_UNEXPECTED_DOT:
          case SEG_UNEXPECTED_CHAR:
            id[0] = '\0';
            return ofs + retval;
@@ -711,10 +740,9 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
          ofs--;
      }
  
-  if (is_reserved_word (input, ofs))
-    *type = SEG_RESERVED_WORD;
-  else
-    *type = SEG_IDENTIFIER;
+  *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
+           : input[0] == '!' ? SEG_MACRO_ID
+           : SEG_IDENTIFIER);
  
    if (s->substate & SS_START_OF_COMMAND)
      {
@@ -743,6 +771,11 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
                return ofs;
              }
          }
+      else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
+        {
+          s->state = S_DEFINE_1;
+          return ofs;
+        }
        else if (lex_id_match (ss_cstr ("FILE"), word))
          {
            char id[16];
@@ -962,7 +995,7 @@ segmenter_parse_mid_command__ (struct segmenter *s,
            s->substate = SS_START_OF_COMMAND;
          }
        else
-        *type = SEG_UNEXPECTED_DOT;
+        *type = SEG_PUNCT;
        return 1;
  
      case '0': case '1': case '2': case '3': case '4':
@@ -981,6 +1014,9 @@ segmenter_parse_mid_command__ (struct segmenter *s,
        return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
                                         s, input, n, eof, type);
  
+    case '!':
+      return segmenter_parse_id__ (s, input, n, eof, type);
+
      default:
        if (lex_uc_is_space (uc))
          {
@@ -1004,6 +1040,12 @@ segmenter_parse_mid_command__ (struct segmenter *s,
          }
        else if (lex_uc_is_id1 (uc))
          return segmenter_parse_id__ (s, input, n, eof, type);
+      else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
+        {
+          *type = SEG_PUNCT;
+          s->substate = 0;
+          return 1;
+        }
        else
          {
            *type = SEG_UNEXPECTED_CHAR;
@@ -1269,6 +1311,9 @@ segmenter_subparse (struct segmenter *s,
    return ofs;
  }
  
+/* We are segmenting a DO REPEAT command, currently reading the syntax that
+   defines the stand-in variables (the head) before the lines of syntax to be
+   repeated (the body). */
  static int
  segmenter_parse_do_repeat_1__ (struct segmenter *s,
                                 const char *input, size_t n, bool eof,
@@ -1278,10 +1323,14 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s,
    if (ofs < 0)
      return -1;
  
-  if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
-    s->state = S_DO_REPEAT_2;
-  else if (*type == SEG_END_COMMAND)
+  if (*type == SEG_SEPARATE_COMMANDS)
+    {
+      /* We reached a blank line that separates the head from the body. */
+      s->state = S_DO_REPEAT_2;
+    }
+  else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
      {
+      /* We reached the body. */
        s->state = S_DO_REPEAT_3;
        s->substate = 1;
      }
@@ -1289,6 +1338,8 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s,
    return ofs;
  }
  
+/* We are segmenting a DO REPEAT command, currently reading a blank line that
+   separates the head from the body. */
  static int
  segmenter_parse_do_repeat_2__ (struct segmenter *s,
                                 const char *input, size_t n, bool eof,
@@ -1300,6 +1351,7 @@ segmenter_parse_do_repeat_2__ (struct segmenter *s,
  
    if (*type == SEG_NEWLINE)
      {
+      /* We reached the body. */
        s->state = S_DO_REPEAT_3;
        s->substate = 1;
      }
@@ -1356,6 +1408,12 @@ segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
      return ofs - (input[ofs - 1] == '\r');
  }
  
+/* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
+   be repeated.  Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
+
+   DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
+   the lines we're segmenting.  s->substate counts the nesting level, starting
+   at 1. */
  static int
  segmenter_parse_do_repeat_3__ (struct segmenter *s,
                                 const char *input, size_t n, bool eof,
@@ -1370,6 +1428,8 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s,
      return -1;
    else if (s->substate == 0)
      {
+      /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
+         body. */
        s->state = S_GENERAL;
        s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
        return segmenter_push (s, input, n, eof, type);
@@ -1381,6 +1441,173 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s,
      }
  }
  
+/* We are segmenting a DEFINE command, which consists of:
+
+  - The DEFINE keyword.
+
+  - Any number of tokens other than "(".
+
+  - "(" followed by a sequence of tokens possibly including balanced parentheses
+    up to a final ")".
+
+  - A sequence of any number of lines, one string per line, ending with
+    "!ENDDEFINE".  The first line is usually blank (that is, a newline follows
+    the "(").  The last line usually just has "!ENDDEFINE." on it, but it can
+    start with other tokens.  The whole DEFINE...!ENDDEFINE can be on a single
+    line, even.
+   */
+static int
+segmenter_parse_define_1__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int ofs = segmenter_subparse (s, input, n, eof, type);
+  if (ofs < 0)
+    return -1;
+
+  if (*type == SEG_SEPARATE_COMMANDS
+      || *type == SEG_END_COMMAND
+      || *type == SEG_START_COMMAND)
+    {
+      /* The DEFINE command is malformed because we reached its end without
+         ever hitting a "(" token.  Transition back to general parsing. */
+      s->state = S_GENERAL;
+      return ofs;
+    }
+  else if (*type == SEG_PUNCT && input[0] == '(')
+    {
+      s->state = S_DEFINE_2;
+      s->nest = 1;
+      return ofs;
+    }
+
+  return ofs;
+}
+
+static int
+segmenter_parse_define_2__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int ofs = segmenter_subparse (s, input, n, eof, type);
+  if (ofs < 0)
+    return -1;
+
+  if (*type == SEG_SEPARATE_COMMANDS
+      || *type == SEG_END_COMMAND
+      || *type == SEG_START_COMMAND)
+    {
+      /* The DEFINE command is malformed because we reached its end before
+         closing the set of parentheses.  Transition back to general
+         parsing. */
+      s->state = S_GENERAL;
+      return ofs;
+    }
+  else if (*type == SEG_PUNCT && input[0] == '(')
+    {
+      s->nest++;
+      return ofs;
+    }
+  else if (*type == SEG_PUNCT && input[0] == ')')
+    {
+      s->nest--;
+      if (!s->nest)
+        {
+          s->state = S_DEFINE_3;
+          s->substate = 0;
+        }
+      return ofs;
+    }
+
+  return ofs;
+}
+
+static size_t
+find_enddefine (struct substring input)
+{
+  size_t n = input.length;
+  const struct substring enddefine = ss_cstr ("!ENDDEFINE");
+  for (size_t i = 0; i + enddefine.length <= n; i++)
+    if (input.string[i] == '!'
+        && ss_equals_case (ss_substr (input, i, enddefine.length), enddefine))
+      return i;
+  return SIZE_MAX;
+}
+
+/* We are in the body of a macro definition, looking for additional lines of
+   the body or !ENDDEFINE. */
+static int
+segmenter_parse_define_3__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  /* Gather a whole line. */
+  const char *newline = memchr (input, '\n', n);
+  int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
+             : eof ? n
+             : -1);
+  if (ofs < 0)
+    return -1;
+
+  /* Does the line contain !ENDDEFINE? */
+  size_t end = find_enddefine (ss_buffer (input, ofs));
+  if (end == SIZE_MAX)
+    {
+      /* No !ENDDEFINE.  We have a full line of macro body.
+
+         The line might be blank, whether completely empty or just spaces and
+         comments.  That's OK: we need to report blank lines because they can
+         have significance.
+
+         However, if the first line of the macro body (the same line as the
+         closing parenthesis in the argument definition) is blank, we just
+         report it as spaces because it's not significant. */
+      *type = (s->substate == 0 && is_all_spaces (input, ofs)
+               ? SEG_SPACES : SEG_MACRO_BODY);
+      s->state = S_DEFINE_4;
+      s->substate = 1;
+      return ofs;
+    }
+  else
+    {
+      /* Macro ends at the !ENDDEFINE on this line. */
+      s->state = S_GENERAL;
+      s->substate = 0;
+      if (!end)
+        {
+          /* Line starts with !ENDDEFINE. */
+          return segmenter_push (s, input, n, eof, type);
+        }
+      else
+        {
+          if (is_all_spaces (input, end))
+            {
+              /* Line starts with spaces followed by !ENDDEFINE. */
+              *type = SEG_SPACES;
+            }
+          else
+            {
+              /* Line starts with some content followed by !ENDDEFINE. */
+              *type = SEG_MACRO_BODY;
+            }
+          return end;
+        }
+    }
+}
+
+static int
+segmenter_parse_define_4__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int ofs = segmenter_parse_newline__ (input, n, eof, type);
+  if (ofs < 0)
+    return -1;
+
+  s->state = S_DEFINE_3;
+  return ofs;
+}
+
  static int
  segmenter_parse_begin_data_1__ (struct segmenter *s,
                                  const char *input, size_t n, bool eof,
@@ -1664,6 +1891,15 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
      case S_DO_REPEAT_3:
        return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
  
+    case S_DEFINE_1:
+      return segmenter_parse_define_1__ (s, input, n, eof, type);
+    case S_DEFINE_2:
+      return segmenter_parse_define_2__ (s, input, n, eof, type);
+    case S_DEFINE_3:
+      return segmenter_parse_define_3__ (s, input, n, eof, type);
+    case S_DEFINE_4:
+      return segmenter_parse_define_4__ (s, input, n, eof, type);
+
      case S_BEGIN_DATA_1:
        return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
      case S_BEGIN_DATA_2:
@@ -1716,6 +1952,13 @@ segmenter_get_prompt (const struct segmenter *s)
      case S_DO_REPEAT_3:
        return PROMPT_DO_REPEAT;
  
+    case S_DEFINE_1:
+    case S_DEFINE_2:
+      return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
+    case S_DEFINE_3:
+    case S_DEFINE_4:
+      return PROMPT_DEFINE;
+
      case S_BEGIN_DATA_1:
        return PROMPT_FIRST;
      case S_BEGIN_DATA_2: