From 106e3340befcc44a43dffcdd08d92ef70678712f Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Sun, 13 Feb 2022 11:07:22 -0800
Subject: [PATCH] lexer: New function lex_ofs_representation().

This interface is more convenient for situations where it's easier to
consume tokens before getting their representations.
---
 src/language/lexer/lexer.c     | 68 +++++++++++++++++++++++-----------
 src/language/lexer/lexer.h     |  1 +
 src/language/stats/matrix.c    | 19 ++--------
 src/language/utilities/title.c | 20 +++-------
 4 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c
index 908556a7ee..1fadf53b30 100644
--- a/src/language/lexer/lexer.c
+++ b/src/language/lexer/lexer.c
@@ -282,8 +282,8 @@ struct lexer
   };
 
 static struct lex_source *lex_source__ (const struct lexer *);
-static char *lex_source_get_syntax__ (const struct lex_source *,
-                                      int n0, int n1);
+static char *lex_source_syntax__ (const struct lex_source *,
+                                  int ofs0, int ofs1);
 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 static void lex_source_push_endcmd__ (struct lex_source *);
 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
@@ -1451,15 +1451,37 @@ lex_ofs_end_point (const struct lexer *lexer, int ofs)
 
 /* Returns the text of the syntax in tokens N0 ahead of the current one,
    through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
-   are both zero, this requests the syntax for the current token.)  The caller
-   must eventually free the returned string (with free()).  The syntax is
-   encoded in UTF-8 and in the original form supplied to the lexer so that, for
-   example, it may include comments, spaces, and new-lines if it spans multiple
-   tokens.  Macro expansion, however, has already been performed. */
+   are both zero, this requests the syntax for the current token.)
+
+   The caller must eventually free the returned string (with free()).  The
+   syntax is encoded in UTF-8 and in the original form supplied to the lexer so
+   that, for example, it may include comments, spaces, and new-lines if it
+   spans multiple tokens.  Macro expansion, however, has already been
+   performed.  If LEXER has no source, returns an empty string. */
 char *
 lex_next_representation (const struct lexer *lexer, int n0, int n1)
 {
-  return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
+  const struct lex_source *src = lex_source__ (lexer);
+  return (src
+          ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
+          : xstrdup (""));
+}
+
+
+/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
+   inclusive.  (For example, if OFS0 and OFS1 are both zero, this requests the
+   syntax for the first token in the current command.)
+
+   The caller must eventually free the returned string (with free()).  The
+   syntax is encoded in UTF-8 and in the original form supplied to the lexer so
+   that, for example, it may include comments, spaces, and new-lines if it
+   spans multiple tokens.  Macro expansion, however, has already been
+   performed.  If LEXER has no source, returns an empty string. */
+char *
+lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
+{
+  const struct lex_source *src = lex_source__ (lexer);
+  return src ? lex_source_syntax__ (src, ofs0, ofs1) : xstrdup ("");
 }
 
 /* Returns true if the token N ahead of the current one was produced by macro
@@ -1787,32 +1809,33 @@ lex_source__ (const struct lexer *lexer)
           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
 }
 
-/* Returns the text of the syntax in SRC for tokens N0 ahead of the current
-   one, through N1 ahead of the current one, inclusive.  (For example, if N0
-   and N1 are both zero, this requests the syntax for the current token.)  The
-   caller must eventually free the returned string (with free()).  The syntax
-   is encoded in UTF-8 and in the original form supplied to the lexer so that,
-   for example, it may include comments, spaces, and new-lines if it spans
-   multiple tokens.  Macro expansion, however, has already been performed. */
+/* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
+   OFS1 in the current command, inclusive.  (For example, if OFS0 and OFS1 are
+   both zero, this requests the syntax for the first token in the current
+   command.)  The caller must eventually free the returned string (with
+   free()).  The syntax is encoded in UTF-8 and in the original form supplied
+   to the lexer so that, for example, it may include comments, spaces, and
+   new-lines if it spans multiple tokens.  Macro expansion, however, has
+   already been performed. */
 static char *
-lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
+lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
 {
   struct string s = DS_EMPTY_INITIALIZER;
-  for (size_t i = n0; i <= n1; )
+  for (size_t i = ofs0; i <= ofs1; )
     {
       /* Find [I,J) as the longest sequence of tokens not produced by macro
          expansion, or otherwise the longest sequence expanded from a single
          macro call. */
-      const struct lex_token *first = lex_source_next__ (src, i);
+      const struct lex_token *first = lex_source_ofs__ (src, i);
       size_t j;
-      for (j = i + 1; j <= n1; j++)
+      for (j = i + 1; j <= ofs1; j++)
         {
-          const struct lex_token *cur = lex_source_next__ (src, j);
+          const struct lex_token *cur = lex_source_ofs__ (src, j);
           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
               || first->macro_rep != cur->macro_rep)
             break;
         }
-      const struct lex_token *last = lex_source_next__ (src, j - 1);
+      const struct lex_token *last = lex_source_ofs__ (src, j - 1);
 
       /* Now add the syntax for this sequence of tokens to SRC. */
       if (!ds_is_empty (&s))
@@ -1883,7 +1906,8 @@ lex_source_error_valist (struct lex_source *src, int n0, int n1,
   else
     {
       /* Get the syntax that caused the error. */
-      char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
+      char *raw_syntax = lex_source_syntax__ (src, n0 + src->parse_ofs,
+                                              n1 + src->parse_ofs);
       char syntax[64];
       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
       free (raw_syntax);
diff --git a/src/language/lexer/lexer.h b/src/language/lexer/lexer.h
index 764da74b19..bb9d6a954d 100644
--- a/src/language/lexer/lexer.h
+++ b/src/language/lexer/lexer.h
@@ -166,6 +166,7 @@ struct msg_point lex_ofs_end_point (const struct lexer *, int ofs);
 
 /* Token representation. */
 char *lex_next_representation (const struct lexer *, int n0, int n1);
+char *lex_ofs_representation (const struct lexer *, int ofs0, int ofs1);
 bool lex_next_is_from_macro (const struct lexer *, int n);
 
 /* Current position. */
diff --git a/src/language/stats/matrix.c b/src/language/stats/matrix.c
index 9bbc37f9a2..b1537cd92e 100644
--- a/src/language/stats/matrix.c
+++ b/src/language/stats/matrix.c
@@ -5367,25 +5367,12 @@ matrix_print_parse (struct matrix_state *s)
 
   if (lex_token (s->lexer) != T_SLASH && lex_token (s->lexer) != T_ENDCMD)
     {
-      size_t depth = 0;
-      for (size_t i = 0; ; i++)
-        {
-          enum token_type t = lex_next_token (s->lexer, i);
-          if (t == T_LPAREN || t == T_LBRACK || t == T_LCURLY)
-            depth++;
-          else if ((t == T_RPAREN || t == T_RBRACK || t == T_RCURLY) && depth)
-            depth--;
-          else if ((t == T_SLASH && !depth) || t == T_ENDCMD || t == T_STOP)
-            {
-              if (i > 0)
-                cmd->print.title = lex_next_representation (s->lexer, 0, i - 1);
-              break;
-            }
-        }
-
+      int start_ofs = lex_ofs (s->lexer);
       cmd->print.expression = matrix_parse_exp (s);
       if (!cmd->print.expression)
         goto error;
+      cmd->print.title = lex_ofs_representation (s->lexer, start_ofs,
+                                                 lex_ofs (s->lexer) - 1);
     }
 
   while (lex_match (s->lexer, T_SLASH))
diff --git a/src/language/utilities/title.c b/src/language/utilities/title.c
index 323bdf3fa5..8ad6a4acb3 100644
--- a/src/language/utilities/title.c
+++ b/src/language/utilities/title.c
@@ -44,28 +44,18 @@ parse_title (struct lexer *lexer, void (*set_title) (const char *))
       set_title (lex_tokcstr (lexer));
       lex_get (lexer);
     }
-  else if (lex_token (lexer) == T_ENDCMD)
-    {
-      /* This would be a bad special case below because n-1 would be
-         SIZE_MAX. */
-      set_title ("");
-    }
   else
     {
-      /* Count the tokens in the title. */
-      size_t n = 0;
-      while (lex_next (lexer, n)->type != T_ENDCMD)
-        n++;
+      int start_ofs = lex_ofs (lexer);
+      while (lex_token (lexer) != T_ENDCMD)
+        lex_get (lexer);
 
       /* Get the raw representation of all the tokens, including any space
          between them, and use it as the title. */
-      char *title = lex_next_representation (lexer, 0, n - 1);
+      char *title = lex_ofs_representation (lexer, start_ofs,
+                                            lex_ofs (lexer) - 1);
       set_title (title);
       free (title);
-
-      /* Skip past the tokens. */
-      for (size_t i = 0; i < n; i++)
-        lex_get (lexer);
     }
   return CMD_SUCCESS;
 }
-- 
2.30.2