X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=src%2Flanguage%2Flexer%2Fsegment.c;h=2689811593f4b515682e21c8c315d4a44562d9b4;hb=dc25c013a7ce573b87fd27cbefddc732733c837f;hp=c607c4bd1ffc52061ab7cf7d6d6632d3e4a8b249;hpb=fe94912b9c8682c4666873b84c83cda88f4c135d;p=pspp

diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c
index c607c4bd1f..2689811593 100644
--- a/src/language/lexer/segment.c
+++ b/src/language/lexer/segment.c
@@ -38,16 +38,20 @@ enum segmenter_state
     S_DOCUMENT_1,
     S_DOCUMENT_2,
     S_DOCUMENT_3,
-    S_FILE_LABEL,
+    S_FILE_LABEL_1,
+    S_FILE_LABEL_2,
+    S_FILE_LABEL_3,
     S_DO_REPEAT_1,
     S_DO_REPEAT_2,
     S_DO_REPEAT_3,
+    S_DEFINE_1,
+    S_DEFINE_2,
+    S_DEFINE_3,
+    S_DEFINE_4,
     S_BEGIN_DATA_1,
     S_BEGIN_DATA_2,
     S_BEGIN_DATA_3,
     S_BEGIN_DATA_4,
-    S_TITLE_1,
-    S_TITLE_2
   };
 
 #define SS_START_OF_LINE (1u << 0)
@@ -214,6 +218,22 @@ at_end_of_line (const char *input, size_t n, bool eof, int ofs)
   return is_end_of_line (input, n, eof, ofs);
 }
 
+static bool
+is_all_spaces (const char *input_, size_t n)
+{
+  const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
+
+  int mblen;
+  for (int ofs = 0; ofs < n; ofs += mblen)
+    {
+      ucs4_t uc;
+      mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
+      if (!lex_uc_is_space (uc))
+        return false;
+    }
+  return true;
+}
+
 static int
 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
                            enum segment_type *type)
@@ -286,20 +306,23 @@ segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
       if (!eof)
         return -1;
       goto number;
-    };
+    }
   if (input[ofs] == '.')
     {
+      if (ofs + 1 >= n)
+        {
+          if (!eof)
+            return -1;
+          goto number;
+        }
+
       ofs = skip_digits (input, n, eof, ofs + 1);
       if (ofs < 0)
         return -1;
+      else if (ofs >= n)
+        goto number;
     }
 
-  if (ofs >= n)
-    {
-      if (!eof)
-        return -1;
-      goto number;
-    }
   if (input[ofs] == 'e' || input[ofs] == 'E')
     {
       ofs++;
@@ -658,6 +681,8 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
         case SEG_COMMENT_COMMAND:
         case SEG_DO_REPEAT_COMMAND:
         case SEG_INLINE_DATA:
+        case SEG_MACRO_ID:
+        case SEG_MACRO_BODY:
         case SEG_START_DOCUMENT:
         case SEG_DOCUMENT:
         case SEG_START_COMMAND:
@@ -666,7 +691,6 @@ next_id_in_command (const struct segmenter *s, const char *input, size_t n,
         case SEG_END:
         case SEG_EXPECTED_QUOTE:
         case SEG_EXPECTED_EXPONENT:
-        case SEG_UNEXPECTED_DOT:
         case SEG_UNEXPECTED_CHAR:
           id[0] = '\0';
           return ofs + retval;
@@ -716,10 +740,9 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
         ofs--;
     }
 
-  if (is_reserved_word (input, ofs))
-    *type = SEG_RESERVED_WORD;
-  else
-    *type = SEG_IDENTIFIER;
+  *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
+           : input[0] == '!' ? SEG_MACRO_ID
+           : SEG_IDENTIFIER);
 
   if (s->substate & SS_START_OF_COMMAND)
     {
@@ -736,17 +759,10 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
           *type = SEG_START_DOCUMENT;
           return 0;
         }
-      else if (lex_id_match (ss_cstr ("TITLE"), word)
-               || lex_id_match (ss_cstr ("SUBTITLE"), word))
+      else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
         {
-          int result = segmenter_unquoted (input, n, eof, ofs);
-          if (result < 0)
-            return -1;
-          else if (result)
-            {
-              s->state = S_TITLE_1;
-              return ofs;
-            }
+          s->state = S_DEFINE_1;
+          return ofs;
         }
       else if (lex_id_match (ss_cstr ("FILE"), word))
         {
@@ -756,7 +772,7 @@ segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
             return -1;
           else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
             {
-              s->state = S_FILE_LABEL;
+              s->state = S_FILE_LABEL_1;
               s->substate = 0;
               return ofs;
             }
@@ -967,7 +983,7 @@ segmenter_parse_mid_command__ (struct segmenter *s,
           s->substate = SS_START_OF_COMMAND;
         }
       else
-        *type = SEG_UNEXPECTED_DOT;
+        *type = SEG_PUNCT;
       return 1;
 
     case '0': case '1': case '2': case '3': case '4':
@@ -986,6 +1002,9 @@ segmenter_parse_mid_command__ (struct segmenter *s,
       return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
                                        s, input, n, eof, type);
 
+    case '!':
+      return segmenter_parse_id__ (s, input, n, eof, type);
+
     default:
       if (lex_uc_is_space (uc))
         {
@@ -1009,6 +1028,12 @@ segmenter_parse_mid_command__ (struct segmenter *s,
         }
       else if (lex_uc_is_id1 (uc))
         return segmenter_parse_id__ (s, input, n, eof, type);
+      else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
+        {
+          *type = SEG_PUNCT;
+          s->substate = 0;
+          return 1;
+        }
       else
         {
           *type = SEG_UNEXPECTED_CHAR;
@@ -1220,9 +1245,9 @@ segmenter_parse_start_of_line__ (struct segmenter *s,
 }
 
 static int
-segmenter_parse_file_label__ (struct segmenter *s,
-                              const char *input, size_t n, bool eof,
-                              enum segment_type *type)
+segmenter_parse_file_label_1__ (struct segmenter *s,
+                                const char *input, size_t n, bool eof,
+                                enum segment_type *type)
 {
   struct segmenter sub;
   int ofs;
@@ -1245,7 +1270,7 @@ segmenter_parse_file_label__ (struct segmenter *s,
       else
         {
           if (result)
-            s->state = S_TITLE_1;
+            s->state = S_FILE_LABEL_2;
           else
             *s = sub;
           return ofs;
@@ -1258,6 +1283,70 @@ segmenter_parse_file_label__ (struct segmenter *s,
     }
 }
 
+static int
+segmenter_parse_file_label_2__ (struct segmenter *s,
+                                const char *input, size_t n, bool eof,
+                                enum segment_type *type)
+{
+  int ofs;
+
+  ofs = skip_spaces (input, n, eof, 0);
+  if (ofs < 0)
+    return -1;
+  s->state = S_FILE_LABEL_3;
+  *type = SEG_SPACES;
+  return ofs;
+}
+
+static int
+segmenter_parse_file_label_3__ (struct segmenter *s,
+                                const char *input, size_t n, bool eof,
+                                enum segment_type *type)
+{
+  int endcmd;
+  int ofs;
+
+  endcmd = -1;
+  ofs = 0;
+  while (ofs < n)
+    {
+      ucs4_t uc;
+      int mblen;
+
+      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
+      if (mblen < 0)
+        return -1;
+
+      switch (uc)
+        {
+        case '\n':
+          goto end_of_line;
+
+        case '.':
+          endcmd = ofs;
+          break;
+
+        default:
+          if (!lex_uc_is_space (uc))
+            endcmd = -1;
+          break;
+        }
+
+      ofs += mblen;
+    }
+
+  if (eof)
+    {
+    end_of_line:
+      s->state = S_GENERAL;
+      s->substate = 0;
+      *type = SEG_UNQUOTED_STRING;
+      return endcmd >= 0 ? endcmd : ofs;
+    }
+
+  return -1;
+}
+
 static int
 segmenter_subparse (struct segmenter *s,
                     const char *input, size_t n, bool eof,
@@ -1274,6 +1363,9 @@ segmenter_subparse (struct segmenter *s,
   return ofs;
 }
 
+/* We are segmenting a DO REPEAT command, currently reading the syntax that
+   defines the stand-in variables (the head) before the lines of syntax to be
+   repeated (the body). */
 static int
 segmenter_parse_do_repeat_1__ (struct segmenter *s,
                                const char *input, size_t n, bool eof,
@@ -1283,10 +1375,14 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s,
   if (ofs < 0)
     return -1;
 
-  if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
-    s->state = S_DO_REPEAT_2;
-  else if (*type == SEG_END_COMMAND)
+  if (*type == SEG_SEPARATE_COMMANDS)
+    {
+      /* We reached a blank line that separates the head from the body. */
+      s->state = S_DO_REPEAT_2;
+    }
+  else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
     {
+      /* We reached the body. */
       s->state = S_DO_REPEAT_3;
       s->substate = 1;
     }
@@ -1294,6 +1390,8 @@ segmenter_parse_do_repeat_1__ (struct segmenter *s,
   return ofs;
 }
 
+/* We are segmenting a DO REPEAT command, currently reading a blank line that
+   separates the head from the body. */
 static int
 segmenter_parse_do_repeat_2__ (struct segmenter *s,
                                const char *input, size_t n, bool eof,
@@ -1305,6 +1403,7 @@ segmenter_parse_do_repeat_2__ (struct segmenter *s,
 
   if (*type == SEG_NEWLINE)
     {
+      /* We reached the body. */
       s->state = S_DO_REPEAT_3;
       s->substate = 1;
     }
@@ -1361,6 +1460,12 @@ segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
     return ofs - (input[ofs - 1] == '\r');
 }
 
+/* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
+   be repeated.  Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
+
+   DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
+   the lines we're segmenting.  s->substate counts the nesting level, starting
+   at 1. */
 static int
 segmenter_parse_do_repeat_3__ (struct segmenter *s,
                                const char *input, size_t n, bool eof,
@@ -1375,6 +1480,8 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s,
     return -1;
   else if (s->substate == 0)
     {
+      /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
+         body. */
       s->state = S_GENERAL;
       s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
       return segmenter_push (s, input, n, eof, type);
@@ -1386,6 +1493,173 @@ segmenter_parse_do_repeat_3__ (struct segmenter *s,
     }
 }
 
+/* We are segmenting a DEFINE command, which consists of:
+
+  - The DEFINE keyword.
+
+  - Anything but "(".
+
+  - "(" followed by a sequence of tokens possibly including balanced parentheses
+    up to a final ")".
+
+  - A sequence of any number of lines, one string per line, ending with
+    "!ENDDEFINE".  The first line is usually blank (that is, a newline follows
+    the "(").  The last line usually just has "!ENDDEFINE." on it, but it can
+    start with other tokens.  The whole DEFINE...!ENDDEFINE can be on a single
+    line, even.
+   */
+static int
+segmenter_parse_define_1__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int ofs = segmenter_subparse (s, input, n, eof, type);
+  if (ofs < 0)
+    return -1;
+
+  if (*type == SEG_SEPARATE_COMMANDS
+      || *type == SEG_END_COMMAND
+      || *type == SEG_START_COMMAND)
+    {
+      /* The DEFINE command is malformed because we reached its end without
+         ever hitting a "(" token.  Transition back to general parsing. */
+      s->state = S_GENERAL;
+      return ofs;
+    }
+  else if (*type == SEG_PUNCT && input[0] == '(')
+    {
+      s->state = S_DEFINE_2;
+      s->nest = 1;
+      return ofs;
+    }
+
+  return ofs;
+}
+
+static int
+segmenter_parse_define_2__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int ofs = segmenter_subparse (s, input, n, eof, type);
+  if (ofs < 0)
+    return -1;
+
+  if (*type == SEG_SEPARATE_COMMANDS
+      || *type == SEG_END_COMMAND
+      || *type == SEG_START_COMMAND)
+    {
+      /* The DEFINE command is malformed because we reached its end before
+         closing the set of parentheses.  Transition back to general
+         parsing. */
+      s->state = S_GENERAL;
+      return ofs;
+    }
+  else if (*type == SEG_PUNCT && input[0] == '(')
+    {
+      s->nest++;
+      return ofs;
+    }
+  else if (*type == SEG_PUNCT && input[0] == ')')
+    {
+      s->nest--;
+      if (!s->nest)
+        {
+          s->state = S_DEFINE_3;
+          s->substate = 0;
+        }
+      return ofs;
+    }
+
+  return ofs;
+}
+
+static size_t
+find_enddefine (struct substring input)
+{
+  size_t n = input.length;
+  const struct substring enddefine = ss_cstr ("!ENDDEFINE");
+  for (size_t i = 0; i + enddefine.length <= n; i++)
+    if (input.string[i] == '!'
+        && ss_equals_case (ss_substr (input, i, enddefine.length), enddefine))
+      return i;
+  return SIZE_MAX;
+}
+
+/* We are in the body of a macro definition, looking for additional lines of
+   the body or !ENDDEFINE. */
+static int
+segmenter_parse_define_3__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  /* Gather a whole line. */
+  const char *newline = memchr (input, '\n', n);
+  int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
+             : eof ? n
+             : -1);
+  if (ofs < 0)
+    return -1;
+
+  /* Does the line contain !ENDDEFINE? */
+  size_t end = find_enddefine (ss_buffer (input, ofs));
+  if (end == SIZE_MAX)
+    {
+      /* No !ENDDEFINE.  We have a full line of macro body.
+
+         The line might be blank, whether completely empty or just spaces and
+         comments.  That's OK: we need to report blank lines because they can
+         have significance.
+
+         However, if the first line of the macro body (the same line as the
+         closing parenthesis in the argument definition) is blank, we just
+         report it as spaces because it's not significant. */
+      *type = (s->substate == 0 && is_all_spaces (input, ofs)
+               ? SEG_SPACES : SEG_MACRO_BODY);
+      s->state = S_DEFINE_4;
+      s->substate = 1;
+      return ofs;
+    }
+  else
+    {
+      /* Macro ends at the !ENDDEFINE on this line. */
+      s->state = S_GENERAL;
+      s->substate = 0;
+      if (!end)
+        {
+          /* Line starts with !ENDDEFINE. */
+          return segmenter_push (s, input, n, eof, type);
+        }
+      else
+        {
+          if (is_all_spaces (input, end))
+            {
+              /* Line starts with spaces followed by !ENDDEFINE. */
+              *type = SEG_SPACES;
+            }
+          else
+            {
+              /* Line starts with some content followed by !ENDDEFINE. */
+              *type = SEG_MACRO_BODY;
+            }
+          return end;
+        }
+    }
+}
+
+static int
+segmenter_parse_define_4__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int ofs = segmenter_parse_newline__ (input, n, eof, type);
+  if (ofs < 0)
+    return -1;
+
+  s->state = S_DEFINE_3;
+  return ofs;
+}
+
 static int
 segmenter_parse_begin_data_1__ (struct segmenter *s,
                                 const char *input, size_t n, bool eof,
@@ -1495,70 +1769,6 @@ segmenter_parse_begin_data_4__ (struct segmenter *s,
   return ofs;
 }
 
-static int
-segmenter_parse_title_1__ (struct segmenter *s,
-                           const char *input, size_t n, bool eof,
-                           enum segment_type *type)
-{
-  int ofs;
-
-  ofs = skip_spaces (input, n, eof, 0);
-  if (ofs < 0)
-    return -1;
-  s->state = S_TITLE_2;
-  *type = SEG_SPACES;
-  return ofs;
-}
-
-static int
-segmenter_parse_title_2__ (struct segmenter *s,
-                           const char *input, size_t n, bool eof,
-                           enum segment_type *type)
-{
-  int endcmd;
-  int ofs;
-
-  endcmd = -1;
-  ofs = 0;
-  while (ofs < n)
-    {
-      ucs4_t uc;
-      int mblen;
-
-      mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
-      if (mblen < 0)
-        return -1;
-
-      switch (uc)
-        {
-        case '\n':
-          goto end_of_line;
-
-        case '.':
-          endcmd = ofs;
-          break;
-
-        default:
-          if (!lex_uc_is_space (uc))
-            endcmd = -1;
-          break;
-        }
-
-      ofs += mblen;
-    }
-
-  if (eof)
-    {
-    end_of_line:
-      s->state = S_GENERAL;
-      s->substate = 0;
-      *type = SEG_UNQUOTED_STRING;
-      return endcmd >= 0 ? endcmd : ofs;
-    }
-
-  return -1;
-}
-
 /* Returns the name of segment TYPE as a string.  The caller must not modify
    or free the returned string.
 
@@ -1576,17 +1786,28 @@ segment_type_to_string (enum segment_type type)
     }
 }
 
-/* Initializes S as a segmenter with the given syntax MODE.
+/* Returns a segmenter with the given syntax MODE.
+
+   If IS_SNIPPET is false, then the segmenter will parse as if it's being given
+   a whole file.  This means, for example, that it will interpret - or + at the
+   beginning of the syntax as a separator between commands (since - or + at the
+   beginning of a line has this meaning).
+
+   If IS_SNIPPET is true, then the segmenter will parse as if it's being given
+   an isolated piece of syntax.  This means that, for example, that it will
+   interpret - or + at the beginning of the syntax as an operator token or (if
+   followed by a digit) as part of a number.
 
    A segmenter does not contain any external references, so nothing needs to be
    done to destroy one.  For the same reason, segmenters may be copied with
    plain struct assignment (or memcpy). */
-void
-segmenter_init (struct segmenter *s, enum segmenter_mode mode)
+struct segmenter
+segmenter_init (enum segmenter_mode mode, bool is_snippet)
 {
-  s->state = S_SHBANG;
-  s->substate = 0;
-  s->mode = mode;
+  return (struct segmenter) {
+    .state = is_snippet ? S_GENERAL : S_SHBANG,
+    .mode = mode,
+  };
 }
 
 /* Returns the mode passed to segmenter_init() for S. */
@@ -1659,8 +1880,12 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
     case S_DOCUMENT_3:
       return segmenter_parse_document_3__ (s, type);
 
-    case S_FILE_LABEL:
-      return segmenter_parse_file_label__ (s, input, n, eof, type);
+    case S_FILE_LABEL_1:
+      return segmenter_parse_file_label_1__ (s, input, n, eof, type);
+    case S_FILE_LABEL_2:
+      return segmenter_parse_file_label_2__ (s, input, n, eof, type);
+    case S_FILE_LABEL_3:
+      return segmenter_parse_file_label_3__ (s, input, n, eof, type);
 
     case S_DO_REPEAT_1:
       return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
@@ -1669,6 +1894,15 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
     case S_DO_REPEAT_3:
       return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
 
+    case S_DEFINE_1:
+      return segmenter_parse_define_1__ (s, input, n, eof, type);
+    case S_DEFINE_2:
+      return segmenter_parse_define_2__ (s, input, n, eof, type);
+    case S_DEFINE_3:
+      return segmenter_parse_define_3__ (s, input, n, eof, type);
+    case S_DEFINE_4:
+      return segmenter_parse_define_4__ (s, input, n, eof, type);
+
     case S_BEGIN_DATA_1:
       return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
     case S_BEGIN_DATA_2:
@@ -1677,11 +1911,6 @@ segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
       return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
     case S_BEGIN_DATA_4:
       return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
-
-    case S_TITLE_1:
-      return segmenter_parse_title_1__ (s, input, n, eof, type);
-    case S_TITLE_2:
-      return segmenter_parse_title_2__ (s, input, n, eof, type);
     }
 
   NOT_REACHED ();
@@ -1712,8 +1941,11 @@ segmenter_get_prompt (const struct segmenter *s)
     case S_DOCUMENT_3:
       return PROMPT_FIRST;
 
-    case S_FILE_LABEL:
+    case S_FILE_LABEL_1:
       return PROMPT_LATER;
+    case S_FILE_LABEL_2:
+    case S_FILE_LABEL_3:
+      return PROMPT_FIRST;
 
     case S_DO_REPEAT_1:
     case S_DO_REPEAT_2:
@@ -1721,6 +1953,13 @@ segmenter_get_prompt (const struct segmenter *s)
     case S_DO_REPEAT_3:
       return PROMPT_DO_REPEAT;
 
+    case S_DEFINE_1:
+    case S_DEFINE_2:
+      return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
+    case S_DEFINE_3:
+    case S_DEFINE_4:
+      return PROMPT_DEFINE;
+
     case S_BEGIN_DATA_1:
       return PROMPT_FIRST;
     case S_BEGIN_DATA_2:
@@ -1729,9 +1968,6 @@ segmenter_get_prompt (const struct segmenter *s)
     case S_BEGIN_DATA_4:
       return PROMPT_DATA;
 
-    case S_TITLE_1:
-    case S_TITLE_2:
-      return PROMPT_FIRST;
     }
 
   NOT_REACHED ();