Work on DEFINE command.

author Ben Pfaff <blp@cs.stanford.edu>

Tue, 23 Mar 2021 14:14:48 +0000 (07:14 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 30 May 2021 22:50:57 +0000 (15:50 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Tue, 23 Mar 2021 14:14:48 +0000 (07:14 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 30 May 2021 22:50:57 +0000 (15:50 -0700)
diff --git a/src/language/command.def b/src/language/command.def

index a97f9b83e70fd1c7e021188eb6a84107a6c04627..12f30c7c037283dd5b76f56b49299208bd2b11c3 100644 (file)
--- a/src/language/command.def
+++ b/src/language/command.def
@@ -18,6 +18,7 @@
  DEF_CMD (S_ANY, F_ENHANCED, "CLOSE FILE HANDLE", cmd_close_file_handle)
  DEF_CMD (S_ANY, 0, "CACHE", cmd_cache)
  DEF_CMD (S_ANY, 0, "CD", cmd_cd)
+DEF_CMD (S_ANY, 0, "DEFINE", cmd_define)
  DEF_CMD (S_ANY, 0, "DO REPEAT", cmd_do_repeat)
  DEF_CMD (S_ANY, 0, "END REPEAT", cmd_end_repeat)
  DEF_CMD (S_ANY, 0, "ECHO", cmd_echo)
@@ -188,7 +189,6 @@ UNIMPL_CMD ("CSTABULATE", "Tabulate complex samples")
  UNIMPL_CMD ("CTABLES", "Display complex samples")
  UNIMPL_CMD ("CURVEFIT", "Fit curve to line plot")
  UNIMPL_CMD ("DATE", "Create time series data")
-UNIMPL_CMD ("DEFINE", "Syntax macros")
  UNIMPL_CMD ("DETECTANOMALY", "Find unusual cases")
  UNIMPL_CMD ("DISCRIMINANT", "Linear discriminant analysis")
  UNIMPL_CMD ("EDIT", "obsolete")
diff --git a/src/language/control/automake.mk b/src/language/control/automake.mk

index 909acd13db4106bfd0872a265bbb02397e11d3bc..9d09687c81e38330552f5f23c5d6f3b01385edf4 100644 (file)
--- a/src/language/control/automake.mk
+++ b/src/language/control/automake.mk
@@ -20,6 +20,7 @@
  language_control_sources = \
         src/language/control/control-stack.c \
         src/language/control/control-stack.h \
+       src/language/control/define.c \
         src/language/control/do-if.c \
         src/language/control/loop.c \
         src/language/control/repeat.c \
diff --git a/src/language/control/define.c b/src/language/control/define.c

new file mode 100644 (file)

index 0000000..686685f
--- /dev/null
+++ b/src/language/control/define.c
@@ -0,0 +1,223 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <limits.h>
+
+#include "language/command.h"
+#include "language/lexer/lexer.h"
+#include "language/lexer/macro.h"
+#include "language/lexer/scan.h"
+#include "language/lexer/token.h"
+
+#include "gl/xalloc.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+/* Returns true if the current token in LEXER is a T_MACRO_ID token.
+   Otherwise, returns whatever lex_force_id() returns for LEXER. */
+static bool
+force_macro_id (struct lexer *lexer)
+{
+  if (lex_token (lexer) == T_MACRO_ID)
+    return true;
+
+  return lex_force_id (lexer);
+}
+
+/* Tries to match identifier ID at the current token in LEXER.
+
+   If ID does not begin with '!', this is simply lex_match_id().  Otherwise
+   the current token must be a T_MACRO_ID token whose text equals ID according
+   to ss_equals_case(); in that case the token is skipped and true is
+   returned.  In every other case returns false without advancing LEXER. */
+static bool
+match_macro_id (struct lexer *lexer, const char *id)
+{
+  if (id[0] != '!')
+    return lex_match_id (lexer, id);
+
+  if (lex_token (lexer) != T_MACRO_ID
+      || !ss_equals_case (lex_tokss (lexer), ss_cstr (id)))
+    return false;
+
+  lex_get (lexer);
+  return true;
+}
+
+/* Requires the current token in LEXER to be a string whose contents scan as
+   exactly one token.  On success, stores that token in *TOKEN, advances LEXER
+   past the string, and returns true.
+
+   Returns false if the current token is not a string (in which case *TOKEN is
+   not touched) or if the string does not contain exactly one token (in which
+   case an error is reported and *TOKEN is reset to an empty T_STOP token, so
+   that a caller that later uninitializes *TOKEN again does not free anything
+   twice). */
+static bool
+parse_quoted_token (struct lexer *lexer, struct token *token)
+{
+  if (!lex_force_string (lexer))
+    return false;
+
+  struct substring s = lex_tokss (lexer);
+  struct string_lexer slex;
+  string_lexer_init (&slex, s.string, s.length, SEG_MODE_INTERACTIVE);
+
+  /* ANOTHER_TOKEN must start out initialized: when the first
+     string_lexer_next() call fails, the second one is short-circuited and
+     never writes it, yet the error path below still uninitializes it. */
+  struct token another_token = { .type = T_STOP };
+  if (!string_lexer_next (&slex, token)
+      || string_lexer_next (&slex, &another_token))
+    {
+      token_uninit (token);
+      token_uninit (&another_token);
+
+      /* The caller's cleanup uninitializes *TOKEN again, so leave it in a
+         state that owns no storage. */
+      *token = (struct token) { .type = T_STOP };
+
+      lex_error (lexer, _("String must contain exactly one token."));
+      return false;
+    }
+  lex_get (lexer);
+  return true;
+}
+
+/* Parses the DEFINE command from LEXER: a macro name, a parenthesized list of
+   parameter declarations separated by slashes, and a macro body terminated by
+   !ENDDEFINE.  On success, hands the new macro to lex_define_macro() and
+   returns CMD_SUCCESS.  On any parse error, destroys the partially built macro
+   and returns CMD_FAILURE.  DS is not used. */
+int
+cmd_define (struct lexer *lexer, struct dataset *ds UNUSED)
+{
+  if (!force_macro_id (lexer))
+    return CMD_FAILURE;
+
+  /* Parse macro name. */
+  struct macro *m = xmalloc (sizeof *m);
+  *m = (struct macro) { .name = ss_xstrdup (lex_tokss (lexer)) };
+  lex_get (lexer);
+
+  if (!lex_force_match (lexer, T_LPAREN))
+    goto error;
+
+  /* Parse parameter declarations until the closing parenthesis. */
+  size_t allocated_params = 0;
+  while (!lex_match (lexer, T_RPAREN))
+    {
+      if (m->n_params >= allocated_params)
+        m->params = x2nrealloc (m->params, &allocated_params,
+                                sizeof *m->params);
+
+      /* The new parameter is counted in m->n_params right away, so the error
+         path's macro_destroy() also releases whatever it accumulates. */
+      size_t param_index = m->n_params++;
+      struct macro_param *p = &m->params[param_index];
+      *p = (struct macro_param) { .expand_arg = true };
+
+      /* Parse parameter name. */
+      if (match_macro_id (lexer, "!POSITIONAL"))
+        {
+          if (param_index > 0 && !m->params[param_index - 1].positional)
+            {
+              lex_error (lexer, _("Positional parameters must precede "
+                                  "keyword parameters."));
+              goto error;
+            }
+
+          /* Positional parameters are named !1, !2, ... by position. */
+          p->positional = true;
+          p->name = xasprintf ("!%zu", param_index + 1);
+        }
+      else
+        {
+          if (!lex_force_id (lexer))
+            goto error;
+
+          /* Keyword parameters are named after the keyword, prefixed by
+             '!'. */
+          p->positional = false;
+          p->name = xasprintf ("!%s", lex_tokcstr (lexer));
+          lex_get (lexer);
+
+          if (!lex_force_match (lexer, T_EQUALS))
+            goto error;
+        }
+
+      /* Parse default value. */
+      if (match_macro_id (lexer, "!DEFAULT"))
+        {
+          if (!lex_force_match (lexer, T_LPAREN))
+            goto error;
+
+          /* XXX Should this handle balanced inner parentheses? */
+          while (!lex_match (lexer, T_RPAREN))
+            {
+              if (lex_token (lexer) == T_ENDCMD)
+                {
+                  lex_error_expecting (lexer, ")");
+                  goto error;
+                }
+              /* MT only borrows the lexer's token and syntax;
+                 macro_tokens_add() stores its own copy in p->def. */
+              const struct macro_token mt = {
+                .token = *lex_next (lexer, 0),
+                .representation = lex_next_representation (lexer, 0, 0),
+              };
+              macro_tokens_add (&p->def, &mt);
+              lex_get (lexer);
+            }
+        }
+
+      if (match_macro_id (lexer, "!NOEXPAND"))
+        p->expand_arg = false;
+
+      /* Parse the argument type, which is mandatory. */
+      if (match_macro_id (lexer, "!TOKENS"))
+        {
+          if (!lex_force_match (lexer, T_LPAREN)
+              || !lex_force_int_range (lexer, "!TOKENS", 1, INT_MAX))
+            goto error;
+          p->arg_type = ARG_N_TOKENS;
+          p->n_tokens = lex_integer (lexer);
+          lex_get (lexer);
+          if (!lex_force_match (lexer, T_RPAREN))
+            goto error;
+        }
+      else if (match_macro_id (lexer, "!CHAREND"))
+        {
+          /* Initialize the delimiter before parsing so that macro_destroy()
+             has a well-defined token to uninitialize if parsing fails. */
+          p->arg_type = ARG_CHAREND;
+          p->charend = (struct token) { .type = T_STOP };
+
+          if (!lex_force_match (lexer, T_LPAREN)
+              || !parse_quoted_token (lexer, &p->charend)
+              || !lex_force_match (lexer, T_RPAREN))
+            goto error;
+        }
+      else if (match_macro_id (lexer, "!ENCLOSE"))
+        {
+          /* As for !CHAREND, initialize both delimiters up front. */
+          p->arg_type = ARG_ENCLOSE;
+          p->enclose[0] = p->enclose[1] = (struct token) { .type = T_STOP };
+
+          if (!lex_force_match (lexer, T_LPAREN)
+              || !parse_quoted_token (lexer, &p->enclose[0])
+              || !lex_force_match (lexer, T_COMMA)
+              || !parse_quoted_token (lexer, &p->enclose[1])
+              || !lex_force_match (lexer, T_RPAREN))
+            goto error;
+        }
+      else if (match_macro_id (lexer, "!CMDEND"))
+        p->arg_type = ARG_CMDEND;
+      else
+        {
+          lex_error_expecting (lexer, "!TOKENS", "!CHAREND",
+                               "!ENCLOSE", "!CMDEND");
+          goto error;
+        }
+
+      /* Declarations are separated by '/'; the closing parenthesis itself is
+         consumed by the loop condition. */
+      if (lex_token (lexer) != T_RPAREN && !lex_force_match (lexer, T_SLASH))
+        goto error;
+    }
+
+  /* Collect the macro body: every T_STRING token up to !ENDDEFINE is appended
+     to BODY, followed by a new-line. */
+  struct string body = DS_EMPTY_INITIALIZER;
+  while (!match_macro_id (lexer, "!ENDDEFINE"))
+    {
+      if (lex_token (lexer) != T_STRING)
+        {
+          lex_error (lexer, _("Expecting macro body or !ENDDEFINE"));
+          ds_destroy (&body);
+          goto error;
+        }
+
+      ds_put_substring (&body, lex_tokss (lexer));
+      ds_put_byte (&body, '\n');
+      lex_get (lexer);
+    }
+
+  /* Tokenize the collected body text into m->body. */
+  macro_tokens_from_string (&m->body, body.ss, lex_get_syntax_mode (lexer));
+  ds_destroy (&body);
+
+  /* The lexer takes ownership of M. */
+  lex_define_macro (lexer, m);
+
+  return CMD_SUCCESS;
+
+error:
+  macro_destroy (m);
+  return CMD_FAILURE;
+}
diff --git a/src/language/control/repeat.c b/src/language/control/repeat.c

index 118e8d3ccd4fd8c56c075c9512d735945f4e6cd9..0438fa1bd8bbaf8a27337e89501eb13fc73f0244 100644 (file)
--- a/src/language/control/repeat.c
+++ b/src/language/control/repeat.c
@@ -201,10 +201,7 @@ do_parse_commands (struct substring s, enum segmenter_mode mode,
                     struct hmap *dummies,
                     struct string *outputs, size_t n_outputs)
  {
-  struct segmenter segmenter;
-
-  segmenter_init (&segmenter, mode);
-
+  struct segmenter segmenter = SEGMENTER_INIT (mode);
    while (!ss_is_empty (s))
      {
        enum segment_type type;
diff --git a/src/language/lexer/automake.mk b/src/language/lexer/automake.mk

index 4387c3dd223b77e879a57b99bfc3541100ee7475..01b3df49c6cb62745a2df2902110a9f652766777 100644 (file)
--- a/src/language/lexer/automake.mk
+++ b/src/language/lexer/automake.mk
@@ -24,6 +24,8 @@ language_lexer_sources = \
         src/language/lexer/include-path.h \
         src/language/lexer/lexer.c \
         src/language/lexer/lexer.h \
+       src/language/lexer/macro.c \
+       src/language/lexer/macro.h \
         src/language/lexer/format-parser.c \
         src/language/lexer/format-parser.h \
         src/language/lexer/scan.c \
diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c

index 7f2d0290a64e0c4f9e1d34ad249491ec35e9a967..5ff5099652ac0405eebb02c45bd94622e216239b 100644 (file)
--- a/src/language/lexer/lexer.c
+++ b/src/language/lexer/lexer.c
@@ -31,6 +31,7 @@
  #include <uniwidth.h>
  
  #include "language/command.h"
+#include "language/lexer/macro.h"
  #include "language/lexer/scan.h"
  #include "language/lexer/segment.h"
  #include "language/lexer/token.h"
@@ -67,6 +68,7 @@ struct lex_token
      size_t token_len;           /* Length of source for token in bytes. */
      size_t line_pos;            /* Start of line containing token_pos. */
      int first_line;             /* Line number at token_pos. */
+    bool from_macro;
    };
  
  /* A source of tokens, corresponding to a syntax file.
@@ -77,6 +79,7 @@ struct lex_source
    {
      struct ll ll;               /* In lexer's list of sources. */
      struct lex_reader *reader;
+    struct lexer *lexer;
      struct segmenter segmenter;
      bool eof;                   /* True if T_STOP was read from 'reader'. */
  
@@ -99,21 +102,25 @@ struct lex_source
      struct lex_token *tokens;   /* Lookahead tokens for parser. */
    };
  
-static struct lex_source *lex_source_create (struct lex_reader *);
+static struct lex_source *lex_source_create (struct lexer *,
+                                             struct lex_reader *);
  static void lex_source_destroy (struct lex_source *);
  
  /* Lexer. */
  struct lexer
    {
      struct ll_list sources;     /* Contains "struct lex_source"s. */
+    struct macro_set *macros;
    };
  
  static struct lex_source *lex_source__ (const struct lexer *);
+static struct substring lex_source_get_syntax__ (const struct lex_source *,
+                                                 int n0, int n1);
  static const struct lex_token *lex_next__ (const struct lexer *, int n);
  static void lex_source_push_endcmd__ (struct lex_source *);
  
  static void lex_source_pop__ (struct lex_source *);
-static bool lex_source_get__ (const struct lex_source *);
+static bool lex_source_get (const struct lex_source *);
  static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                       const char *format, va_list)
     PRINTF_FORMAT (4, 0);
@@ -148,8 +155,11 @@ lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
  struct lexer *
  lex_create (void)
  {
-  struct lexer *lexer = xzalloc (sizeof *lexer);
-  ll_init (&lexer->sources);
+  struct lexer *lexer = xmalloc (sizeof *lexer);
+  *lexer = (struct lexer) {
+    .sources = LL_INITIALIZER (lexer->sources),
+    .macros = macro_set_create (),
+  };
    return lexer;
  }
  
@@ -163,10 +173,19 @@ lex_destroy (struct lexer *lexer)
  
        ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
          lex_source_destroy (source);
+      macro_set_destroy (lexer->macros);
        free (lexer);
      }
  }
  
+/* Adds M to LEXER's set of macros by passing it to macro_set_add().  M
+   replaces any existing macro with the same name.  Takes ownership of M, so
+   the caller must not use or free M afterward. */
+void
+lex_define_macro (struct lexer *lexer, struct macro *m)
+{
+  macro_set_add (lexer->macros, m);
+}
+
  /* Inserts READER into LEXER so that the next token read by LEXER comes from
     READER.  Before the call, LEXER must either be empty or at a T_ENDCMD
     token. */
@@ -174,7 +193,7 @@ void
  lex_include (struct lexer *lexer, struct lex_reader *reader)
  {
    assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
-  ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
+  ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
  }
  
  /* Appends READER to LEXER, so that it will be read after all other current
@@ -182,7 +201,7 @@ lex_include (struct lexer *lexer, struct lex_reader *reader)
  void
  lex_append (struct lexer *lexer, struct lex_reader *reader)
  {
-  ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
+  ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
  }
  \f
  /* Advancing. */
@@ -196,7 +215,7 @@ lex_push_token__ (struct lex_source *src)
      src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
  
    token = &src->tokens[deque_push_front (&src->deque)];
-  token_init (&token->token);
+  token->token = (struct token) { .type = T_STOP };
    return token;
  }
  
@@ -226,7 +245,7 @@ lex_get (struct lexer *lexer)
      lex_source_pop__ (src);
  
    while (deque_is_empty (&src->deque))
-    if (!lex_source_get__ (src))
+    if (!lex_source_get (src))
        {
          lex_source_destroy (src);
          src = lex_source__ (lexer);
@@ -859,13 +878,17 @@ lex_next__ (const struct lexer *lexer_, int n)
      return lex_source_next__ (src, n);
    else
      {
-      static const struct lex_token stop_token =
-        { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
-
+      static const struct lex_token stop_token = { .token = { .type = T_STOP } };
        return &stop_token;
      }
  }
  
+/* Returns the token at the front of SRC's deque, which is the end at which
+   lex_push_token__() adds new tokens.
+   NOTE(review): presumably SRC's deque must be nonempty -- confirm against
+   deque_front(). */
+static const struct lex_token *
+lex_source_front (const struct lex_source *src)
+{
+  return &src->tokens[deque_front (&src->deque, 0)];
+}
+
  static const struct lex_token *
  lex_source_next__ (const struct lex_source *src, int n)
  {
@@ -873,14 +896,12 @@ lex_source_next__ (const struct lex_source *src, int n)
      {
        if (!deque_is_empty (&src->deque))
          {
-          struct lex_token *front;
-
-          front = &src->tokens[deque_front (&src->deque, 0)];
+          const struct lex_token *front = lex_source_front (src);
            if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
              return front;
          }
  
-      lex_source_get__ (src);
+      lex_source_get (src);
      }
  
    return &src->tokens[deque_back (&src->deque, n)];
@@ -945,6 +966,12 @@ lex_next_tokss (const struct lexer *lexer, int n)
    return lex_next (lexer, n)->string;
  }
  
+/* Returns the syntax in LEXER's current source that spans the tokens N0
+   through N1 ahead of the current token (or just token N0, if N1 is less than
+   N0).  The returned substring points into the source's own buffer; the caller
+   does not own it and must copy it to keep it.
+
+   Returns an empty substring if LEXER has no current source. */
+struct substring
+lex_next_representation (const struct lexer *lexer, int n0, int n1)
+{
+  const struct lex_source *src = lex_source__ (lexer);
+  if (src == NULL)
+    return (struct substring) { .string = NULL, .length = 0 };
+
+  return lex_source_get_syntax__ (src, n0, n1);
+}
+
  static bool
  lex_tokens_match (const struct token *actual, const struct token *expected)
  {
@@ -1160,7 +1187,6 @@ lex_get_encoding (const struct lexer *lexer)
    return src == NULL ? NULL : src->reader->encoding;
  }
  
-
  /* Returns the syntax mode for the syntax file from which the current command is
     drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
     does not have line numbers.
@@ -1320,16 +1346,24 @@ lex_source__ (const struct lexer *lexer)
  }
  
  static struct substring
-lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
+lex_tokens_get_syntax__ (const struct lex_source *src,
+                         const struct lex_token *token0,
+                         const struct lex_token *token1)
  {
-  const struct lex_token *token0 = lex_source_next__ (src, n0);
-  const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
    size_t start = token0->token_pos;
    size_t end = token1->token_pos + token1->token_len;
  
    return ss_buffer (&src->buffer[start - src->tail], end - start);
  }
  
+/* Returns the syntax in SRC that spans the tokens N0 through N1 ahead of the
+   current token (or just token N0, if N1 is less than N0).  The returned
+   substring points into SRC's buffer. */
+static struct substring
+lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
+{
+  /* Look up the farther token first.  lex_source_next__() may read more
+     tokens, and adding a token can reallocate SRC's token array, which would
+     invalidate a token pointer obtained earlier.  Once the farther token is
+     available, looking up the nearer one does not need to read anything.
+     (Calling both as function arguments would leave the order unspecified.) */
+  const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
+  const struct lex_token *token0 = lex_source_next__ (src, n0);
+
+  return lex_tokens_get_syntax__ (src, token0, token1);
+}
+
  static void
  lex_ellipsize__ (struct substring in, char *out, size_t out_size)
  {
@@ -1377,6 +1411,16 @@ lex_source_error_valist (struct lex_source *src, int n0, int n1,
    token = lex_source_next__ (src, n0);
    if (token->token.type == T_ENDCMD)
      ds_put_cstr (&s, _("Syntax error at end of command"));
+  else if (token->from_macro)
+    {
+      /* XXX this isn't ideal, we should get the actual syntax */
+      char *syntax = token_to_string (&token->token);
+      if (syntax)
+        ds_put_format (&s, _("Syntax error at `%s'"), syntax);
+      else
+        ds_put_cstr (&s, _("Syntax error"));
+      free (syntax);
+    }
    else
      {
        struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
@@ -1428,16 +1472,11 @@ lex_get_error (struct lex_source *src, const char *format, ...)
  }
  
  /* Attempts to append an additional token into SRC's deque, reading more from
-   the underlying lex_reader if necessary.  Returns true if successful, false
-   if the deque already represents (a suffix of) the whole lex_reader's
-   contents, */
+   the underlying lex_reader if necessary.  Returns true if a new token was
+   added to SRC's deque, false otherwise. */
  static bool
-lex_source_get__ (const struct lex_source *src_)
+lex_source_try_get (struct lex_source *src)
  {
-  struct lex_source *src = CONST_CAST (struct lex_source *, src_);
-  if (src->eof)
-    return false;
-
    /* State maintained while scanning tokens.  Usually we only need a single
       state, but scanner_push() can return SCAN_SAVE to indicate that the state
       needs to be saved and possibly restored later with SCAN_BACK. */
@@ -1568,57 +1607,133 @@ lex_source_get__ (const struct lex_source *src_)
    switch (token->token.type)
      {
      default:
-      break;
+      return true;
  
      case T_STOP:
        token->token.type = T_ENDCMD;
        src->eof = true;
-      break;
+      return true;
  
      case SCAN_BAD_HEX_LENGTH:
        lex_get_error (src, _("String of hex digits has %d characters, which "
                              "is not a multiple of 2"),
                       (int) token->token.number);
-      break;
+      return false;
  
      case SCAN_BAD_HEX_DIGIT:
      case SCAN_BAD_UNICODE_DIGIT:
        lex_get_error (src, _("`%c' is not a valid hex digit"),
                       (int) token->token.number);
-      break;
+      return false;
  
      case SCAN_BAD_UNICODE_LENGTH:
        lex_get_error (src, _("Unicode string contains %d bytes, which is "
                              "not in the valid range of 1 to 8 bytes"),
                       (int) token->token.number);
-      break;
+      return false;
  
      case SCAN_BAD_UNICODE_CODE_POINT:
        lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
                       (int) token->token.number);
-      break;
+      return false;
  
      case SCAN_EXPECTED_QUOTE:
        lex_get_error (src, _("Unterminated string constant"));
-      break;
+      return false;
  
      case SCAN_EXPECTED_EXPONENT:
        lex_get_error (src, _("Missing exponent following `%s'"),
                       token->token.string.string);
-      break;
+      return false;
  
      case SCAN_UNEXPECTED_CHAR:
        {
          char c_name[16];
          lex_get_error (src, _("Bad character %s in input"),
                         uc_name (token->token.number, c_name));
+        return false;
        }
-      break;
  
      case SCAN_SKIP:
        lex_source_pop_front (src);
-      break;
+      return false;
+    }
+
+  NOT_REACHED ();
+}
+
+/* Keeps calling lex_source_try_get() on SRC until it reports that a token was
+   added, then returns true.  Returns false, without trying further, as soon as
+   SRC's 'eof' flag is found set. */
+static bool
+lex_source_get__ (struct lex_source *src)
+{
+  while (!src->eof)
+    if (lex_source_try_get (src))
+      return true;
+
+  return false;
+}
+
+/* Obtains one more token for SRC with lex_source_get__() and, if macro
+   expansion is enabled (settings_get_mexpand()), checks whether that token
+   begins a macro invocation.  If it does, reads as many further tokens as the
+   macro expander asks for, then replaces all of the tokens read by this call
+   with the macro's expansion.
+
+   Returns false if lex_source_get__() could not supply a first token, true
+   otherwise. */
+static bool
+lex_source_get (const struct lex_source *src_)
+{
+  struct lex_source *src = CONST_CAST (struct lex_source *, src_);
+
+  /* Remember how many tokens were queued beforehand, so that the tokens that
+     make up a macro invocation can be removed again below. */
+  size_t old_count = deque_count (&src->deque);
+  if (!lex_source_get__ (src))
+    return false;
+
+  if (!settings_get_mexpand ())
+    return true;
+
+  /* A zero RETVAL means that the expander wants another token; the loop feeds
+     it tokens until RETVAL becomes nonzero. */
+  struct macro_expander *me;
+  int retval = macro_expander_create (src->lexer->macros,
+                                      &lex_source_front (src)->token,
+                                      &me);
+  while (!retval)
+    {
+      if (!lex_source_get__ (src))
+        {
+          /* This should not be reachable because we always get a T_STOP at the
+             end of input and the macro_expander should always terminate
+             expansion on T_STOP. */
+          NOT_REACHED ();
+        }
+
+      const struct lex_token *front = lex_source_front (src);
+      const struct macro_token mt = {
+        .token = front->token,
+        .representation = lex_tokens_get_syntax__ (src, front, front)
+      };
+      retval = macro_expander_add (me, &mt);
+    }
+  if (retval < 0)
+    {
+      /* Negative RETVAL: no expansion takes place, so the tokens read so far
+         stay in the deque unchanged. */
+      /* XXX handle case where there's a macro invocation starting from some
+         later token we've already obtained */
+      macro_expander_destroy (me);
+      return true;
+    }
+
+  /* Discard every token that this call added to the deque. */
+  /* XXX handle case where the macro invocation doesn't use all the tokens */
+  while (deque_count (&src->deque) > old_count)
+    lex_source_pop_front (src);
+
+  struct macro_tokens expansion = { .n = 0 };
+  macro_expander_get_expansion (me, &expansion);
+  macro_expander_destroy (me);
+
+  /* Move each expanded token into the deque.  The token itself is transferred
+     to the new lex_token, so only its representation is freed here. */
+  for (size_t i = 0; i < expansion.n; i++)
+    {
+      *lex_push_token__ (src) = (struct lex_token) {
+        .token = expansion.mts[i].token,
+        .from_macro = true,
+        /* XXX the rest */
+      };
+
+      ss_dealloc (&expansion.mts[i].representation); /* XXX should feed into lexer */
     }
+  free (expansion.mts);
  
    return true;
  }
@@ -1635,13 +1750,14 @@ lex_source_push_endcmd__ (struct lex_source *src)
  }
  
  static struct lex_source *
-lex_source_create (struct lex_reader *reader)
+lex_source_create (struct lexer *lexer, struct lex_reader *reader)
  {
    struct lex_source *src;
  
    src = xzalloc (sizeof *src);
    src->reader = reader;
    segmenter_init (&src->segmenter, reader->syntax);
+  src->lexer = lexer;
    src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
  
    lex_source_push_endcmd__ (src);
diff --git a/src/language/lexer/lexer.h b/src/language/lexer/lexer.h

index caf57503317973788122afd42367d0a4eeaf1b1f..86bb4f2b5dadd2089d0c6ec4d175b5fe67fb648d 100644 (file)
--- a/src/language/lexer/lexer.h
+++ b/src/language/lexer/lexer.h
@@ -29,6 +29,7 @@
  #include "libpspp/prompt.h"
  
  struct lexer;
+struct macro;
  
  /* Handling of errors. */
  enum lex_error_mode
@@ -90,6 +91,9 @@ struct lex_reader *lex_reader_for_substring_nocopy (struct substring, const char
  struct lexer *lex_create (void);
  void lex_destroy (struct lexer *);
  
+/* Macros. */
+void lex_define_macro (struct lexer *, struct macro *);
+
  /* Files. */
  void lex_include (struct lexer *, struct lex_reader *);
  void lex_append (struct lexer *, struct lex_reader *);
@@ -142,6 +146,10 @@ const char *lex_next_tokcstr (const struct lexer *, int n);
  double lex_next_tokval (const struct lexer *, int n);
  struct substring lex_next_tokss (const struct lexer *, int n);
  
+/* Token representation. */
+struct substring lex_next_representation (const struct lexer *,
+                                          int n0, int n1);
+
  /* Current position. */
  int lex_get_first_line_number (const struct lexer *, int n);
  int lex_get_last_line_number (const struct lexer *, int n);
diff --git a/src/language/lexer/macro.c b/src/language/lexer/macro.c

new file mode 100644 (file)

index 0000000..a15b706
--- /dev/null
+++ b/src/language/lexer/macro.c
@@ -0,0 +1,931 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "language/lexer/macro.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "data/settings.h"
+#include "language/lexer/segment.h"
+#include "language/lexer/scan.h"
+#include "libpspp/assertion.h"
+#include "libpspp/i18n.h"
+#include "libpspp/message.h"
+#include "libpspp/str.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+/* Initializes DST as a copy of SRC: the token is copied with token_copy() and
+   the representation with ss_alloc_substring().  The copy should eventually
+   be released with macro_token_uninit(). */
+void
+macro_token_copy (struct macro_token *dst, const struct macro_token *src)
+{
+  token_copy (&dst->token, &src->token);
+  ss_alloc_substring (&dst->representation, src->representation);
+}
+
+/* Releases the storage held by MT's token and representation.  Does not free
+   MT itself. */
+void
+macro_token_uninit (struct macro_token *mt)
+{
+  token_uninit (&mt->token);
+  ss_dealloc (&mt->representation);
+}
+
+/* Initializes DST as a copy of SRC.  DST receives a newly allocated array
+   with room for exactly SRC's tokens, each of them copied with
+   macro_token_copy().  DST must not be the same object as SRC. */
+void
+macro_tokens_copy (struct macro_tokens *dst, const struct macro_tokens *src)
+{
+  size_t n = src->n;
+  struct macro_token *copies = xmalloc (n * sizeof *copies);
+  for (size_t i = 0; i < n; i++)
+    macro_token_copy (&copies[i], &src->mts[i]);
+
+  *dst = (struct macro_tokens) { .mts = copies, .n = n, .allocated = n };
+}
+
+/* Releases every token in MTS and then the token array itself.  Does not
+   free MTS, and does not reset its members. */
+void
+macro_tokens_uninit (struct macro_tokens *mts)
+{
+  for (size_t i = 0; i < mts->n; i++)
+    macro_token_uninit (&mts->mts[i]);
+  free (mts->mts);
+}
+
+/* Appends one element to MTS, growing the array with x2nrealloc() if it is
+   full, and returns a pointer to the new element.  The element's contents are
+   not initialized; the caller must fill it in. */
+struct macro_token *
+macro_tokens_add_uninit (struct macro_tokens *mts)
+{
+  if (mts->n >= mts->allocated)
+    mts->mts = x2nrealloc (mts->mts, &mts->allocated, sizeof *mts->mts);
+
+  struct macro_token *slot = &mts->mts[mts->n];
+  mts->n++;
+  return slot;
+}
+
+/* Appends a copy of MT, made with macro_token_copy(), to MTS.  MT itself
+   remains owned by the caller. */
+void
+macro_tokens_add (struct macro_tokens *mts, const struct macro_token *mt)
+{
+  macro_token_copy (macro_tokens_add_uninit (mts), mt);
+}
+
+/* Divides SRC into tokens, using a segmenter in syntax mode MODE, and appends
+   each token to MTS along with the part of SRC that it was scanned from as
+   its representation.  Tokens whose type is a scan type (see is_scan_type())
+   are not added; no error is reported for them yet. */
+void
+macro_tokens_from_string (struct macro_tokens *mts, const struct substring src,
+                          enum segmenter_mode mode)
+{
+  /* Scanning position: the segmenter plus the text not yet consumed.  Kept
+     together so that both can be saved and restored as one unit. */
+  struct state
+    {
+      struct segmenter segmenter;
+      struct substring body;
+    };
+
+  struct state state = {
+    .segmenter = SEGMENTER_INIT (mode),
+    .body = src,
+  };
+  struct state saved = state;
+
+  while (state.body.length > 0)
+    {
+      /* The representation starts where this token's scan starts; its length
+         is filled in below, once the end of the token is known. */
+      struct macro_token mt = {
+        .token = { .type = T_STOP },
+        .representation = { .string = state.body.string },
+      };
+      struct token *token = &mt.token;
+
+      struct scanner scanner;
+      scanner_init (&scanner, token);
+
+      /* Feed segments to the scanner until it completes a token. */
+      for (;;)
+        {
+          enum segment_type type;
+          int seg_len = segmenter_push (&state.segmenter, state.body.string,
+                                        state.body.length, true, &type);
+          assert (seg_len >= 0);
+
+          struct substring segment = ss_head (state.body, seg_len);
+          ss_advance (&state.body, seg_len);
+
+          /* SCAN_SAVE asks for the position to be remembered; SCAN_BACK ends
+             the token and rewinds to the remembered position. */
+          enum scan_result result = scanner_push (&scanner, type, segment, token);
+          if (result == SCAN_SAVE)
+            saved = state;
+          else if (result == SCAN_BACK)
+            {
+              state = saved;
+              break;
+            }
+          else if (result == SCAN_DONE)
+            break;
+        }
+
+      /* We have a token in 'token'. */
+      if (is_scan_type (token->type))
+        {
+          if (token->type != SCAN_SKIP)
+            {
+              /* XXX report error */
+            }
+        }
+      else
+        {
+          mt.representation.length = state.body.string - mt.representation.string;
+          macro_tokens_add (mts, &mt);
+        }
+      /* macro_tokens_add() stored a copy, so release the scanned original. */
+      token_uninit (token);
+    }
+}
+
+/* Passes each token in MTS, in order, to token_print() for output on STREAM.
+   The tokens' representations are not printed. */
+void
+macro_tokens_print (const struct macro_tokens *mts, FILE *stream)
+{
+  for (size_t i = 0; i < mts->n; i++)
+    token_print (&mts->mts[i].token, stream);
+}
+
+/* Frees M along with its name, its parameters (including their names, default
+   values, and delimiter tokens), and its body.  Does nothing if M is null. */
+void
+macro_destroy (struct macro *m)
+{
+  if (m == NULL)
+    return;
+
+  for (size_t i = 0; i < m->n_params; i++)
+    {
+      struct macro_param *param = &m->params[i];
+
+      free (param->name);
+      macro_tokens_uninit (&param->def);
+
+      /* Only some argument types carry delimiter tokens to release. */
+      switch (param->arg_type)
+        {
+        case ARG_CHAREND:
+          token_uninit (&param->charend);
+          break;
+
+        case ARG_ENCLOSE:
+          token_uninit (&param->enclose[0]);
+          token_uninit (&param->enclose[1]);
+          break;
+
+        case ARG_N_TOKENS:
+        case ARG_CMDEND:
+          break;
+        }
+    }
+  free (m->params);
+  free (m->name);
+  macro_tokens_uninit (&m->body);
+  free (m);
+}
+\f
+/* Creates and returns a new, empty set of macros.  The caller should
+   eventually destroy it with macro_set_destroy(). */
+struct macro_set *
+macro_set_create (void)
+{
+  struct macro_set *set = xmalloc (sizeof *set);
+  *set = (struct macro_set) {
+    .macros = HMAP_INITIALIZER (set->macros),
+  };
+  return set;
+}
+
+/* Destroys SET along with every macro in it.  Does nothing if SET is null. */
+void
+macro_set_destroy (struct macro_set *set)
+{
+  if (!set)
+    return;
+
+  /* The "safe" iteration form is used because each macro is removed and
+     freed while the loop is still running. */
+  struct macro *macro, *next;
+  HMAP_FOR_EACH_SAFE (macro, next, struct macro, hmap_node, &set->macros)
+    {
+      hmap_delete (&set->macros, &macro->hmap_node);
+      macro_destroy (macro);
+    }
+  hmap_destroy (&set->macros);
+  free (set);
+}
+
+/* Returns the hash value used for macro NAME in a macro set's hash table,
+   computed with utf8_hash_case_string() and a basis of 0. */
+static unsigned int
+hash_macro_name (const char *name)
+{
+  return utf8_hash_case_string (name, 0);
+}
+
+/* Returns the macro in SET whose name matches NAME according to
+   utf8_strcasecmp(), or a null pointer if there is none. */
+static struct macro *
+macro_set_find__ (struct macro_set *set, const char *name)
+{
+  /* Only entries whose hash equals NAME's hash are visited, so the name
+     comparison is still needed to reject collisions. */
+  struct macro *macro;
+  HMAP_FOR_EACH_WITH_HASH (macro, struct macro, hmap_node,
+                           hash_macro_name (name), &set->macros)
+    if (!utf8_strcasecmp (macro->name, name))
+      return macro;
+
+  return NULL;
+}
+
+/* Returns the macro in SET whose name matches NAME, or a null pointer if
+   there is none.  This is the const-qualified front end to
+   macro_set_find__(). */
+const struct macro *
+macro_set_find (const struct macro_set *set, const char *name)
+{
+  return macro_set_find__ (CONST_CAST (struct macro_set *, set), name);
+}
+
+/* Inserts M into SET, transferring ownership of M to SET.  If SET already
+   holds a macro whose name matches M's, that macro is first removed from SET
+   and destroyed. */
+void
+macro_set_add (struct macro_set *set, struct macro *m)
+{
+  struct macro *existing = macro_set_find__ (set, m->name);
+  if (existing != NULL)
+    {
+      hmap_delete (&set->macros, &existing->hmap_node);
+      macro_destroy (existing);
+    }
+
+  unsigned int hash = hash_macro_name (m->name);
+  hmap_insert (&set->macros, &m->hmap_node, hash);
+}
+\f
+/* State of a macro expander while it collects the arguments of a macro
+   invocation.  Each state other than ME_ERROR says what the expander expects
+   the next token to be. */
+enum me_state
+  {
+    /* Error state. */
+    ME_ERROR,
+
+    /* Accumulating tokens in me->params toward the end of any type of
+       argument. */
+    ME_ARG,
+
+    /* Expecting the opening delimiter of an ARG_ENCLOSE argument. */
+    ME_ENCLOSE,
+
+    /* Expecting a keyword for a keyword argument. */
+    ME_KEYWORD,
+
+    /* Expecting an equal sign for a keyword argument. */
+    ME_EQUALS,
+  };
+
+
+/* State for recognizing a single macro invocation and collecting its
+   arguments, one token at a time. */
+struct macro_expander
+  {
+    /* Set in which 'macro' was found; also passed along for expansion. */
+    const struct macro_set *macros;
+
+    /* What macro_expander_add() expects next. */
+    enum me_state state;
+    /* Number of tokens counted as part of the invocation so far.
+       macro_expander_create() starts it at 1. */
+    size_t n_tokens;
+
+    /* The macro being invoked. */
+    const struct macro *macro;
+    /* Array of macro->n_params pointers, allocated only if the macro has
+       parameters.  An element is null until its argument has been supplied
+       or, in me_finished(), defaulted. */
+    struct macro_tokens **args;
+    /* Parameter whose argument is currently being parsed. */
+    const struct macro_param *param;
+  };
+
+/* Completes argument collection for ME: every parameter that has not been
+   given an argument receives a copy of its default.  Returns the number of
+   tokens counted as part of the invocation. */
+static int
+me_finished (struct macro_expander *me)
+{
+  const struct macro *macro = me->macro;
+
+  size_t idx = 0;
+  while (idx < macro->n_params)
+    {
+      if (me->args[idx] == NULL)
+        {
+          struct macro_tokens *dflt = xmalloc (sizeof *dflt);
+          me->args[idx] = dflt;
+          macro_tokens_copy (dflt, &macro->params[idx].def);
+        }
+      idx++;
+    }
+
+  return me->n_tokens;
+}
+
+/* Called when the argument for ME's current parameter is complete.  Either
+   sets up ME to parse the next argument and returns 0, or, if no arguments
+   remain to be parsed, finishes the invocation and returns the value of
+   me_finished(). */
+static int
+me_next_arg (struct macro_expander *me)
+{
+  if (!me->param)
+    {
+      assert (!me->macro->n_params);
+      return me_finished (me);
+    }
+  else if (me->param->positional)
+    {
+      /* Positional parameters are taken in declaration order. */
+      me->param++;
+      if (me->param >= &me->macro->params[me->macro->n_params])
+        return me_finished (me);
+      else
+        {
+          /* An ARG_ENCLOSE argument has to begin with its opening
+             delimiter, which only ME_ENCLOSE checks for and consumes. */
+          me->state = (!me->param->positional ? ME_KEYWORD
+                       : me->param->arg_type == ARG_ENCLOSE ? ME_ENCLOSE
+                       : ME_ARG);
+          return 0;
+        }
+    }
+  else
+    {
+      /* Keyword parameters may come in any order, so keep looking for
+         keywords as long as any parameter still lacks an argument. */
+      for (size_t i = 0; i < me->macro->n_params; i++)
+        if (!me->args[i])
+          {
+            me->state = ME_KEYWORD;
+            return 0;
+          }
+      return me_finished (me);
+    }
+}
+
+/* Puts ME into the error state.  Always returns -1, so that callers can write
+   "return me_error (me);". */
+static int
+me_error (struct macro_expander *me)
+{
+  const int failure = -1;
+  me->state = ME_ERROR;
+  return failure;
+}
+
+/* Handles MT in state ME_ARG, that is, as a token of the argument for ME's
+   current parameter.
+
+   Returns 0 if the argument needs more tokens, the value of me_next_arg() if
+   MT completes the argument, or -1 (after reporting an error) if MT is the
+   end of input. */
+static int
+me_add_arg (struct macro_expander *me, const struct macro_token *mt)
+{
+  const struct token *token = &mt->token;
+  if (token->type == T_STOP)
+    {
+      msg (SE, _("Unexpected end of file reading argument %s "
+                 "to macro %s."), me->param->name, me->macro->name);
+
+      return me_error (me);
+    }
+
+  me->n_tokens++;
+
+  /* Create the argument on its first token, so that an empty argument is
+     distinguishable from a missing one. */
+  const struct macro_param *p = me->param;
+  struct macro_tokens **argp = &me->args[p - me->macro->params];
+  if (!*argp)
+    *argp = xzalloc (sizeof **argp);
+  struct macro_tokens *arg = *argp;
+  if (p->arg_type == ARG_N_TOKENS)
+    {
+      macro_tokens_add (arg, mt);
+      if (arg->n >= p->n_tokens)
+        return me_next_arg (me);
+      return 0;
+    }
+  else if (p->arg_type == ARG_CMDEND)
+    {
+      /* T_STOP cannot actually get this far because it was rejected
+         above. */
+      if (token->type == T_ENDCMD || token->type == T_STOP)
+        return me_next_arg (me);
+      macro_tokens_add (arg, mt);
+      return 0;
+    }
+  else
+    {
+      /* Only ARG_CHAREND and ARG_ENCLOSE remain.  The former ends at its
+         terminator token, the latter at its closing delimiter.  The
+         terminating token itself is not added to the argument. */
+      const struct token *end
+        = p->arg_type == ARG_CHAREND ? &p->charend : &p->enclose[1];
+      if (token_equal (token, end))
+        return me_next_arg (me);
+      macro_tokens_add (arg, mt);
+      return 0;
+    }
+}
+
+/* Reports that ACTUAL was found where EXPECTED was required while reading
+   the argument for ME's current parameter, then puts ME into the error state.
+   An ACTUAL with an empty representation is described as the end of input.
+   Always returns -1. */
+static int
+me_expected (struct macro_expander *me, const struct macro_token *actual,
+             const struct token *expected)
+{
+  struct substring found = actual->representation;
+  if (!found.length)
+    found = ss_cstr (_("<end of input>"));
+
+  char *wanted = token_to_string (expected);
+  msg (SE, _("Found `%.*s' while expecting `%s' reading argument %s "
+             "to macro %s."),
+       (int) found.length, found.string, wanted,
+       me->param->name, me->macro->name);
+  free (wanted);
+
+  return me_error (me);
+}
+
+/* Handles MT in state ME_ENCLOSE.  MT must be the opening delimiter of the
+   current ARG_ENCLOSE parameter, in which case ME moves on to ME_ARG and 0 is
+   returned; otherwise an error is reported and -1 returned.  MT is counted
+   as part of the invocation either way. */
+static int
+me_enclose (struct macro_expander *me, const struct macro_token *mt)
+{
+  const struct token *opener = &me->param->enclose[0];
+
+  me->n_tokens++;
+  if (!token_equal (opener, &mt->token))
+    return me_expected (me, mt, opener);
+
+  me->state = ME_ARG;
+  return 0;
+}
+
+/* Returns the first parameter of M whose name matches NAME according to
+   utf8_strncasecmp(), or a null pointer if M has no such parameter. */
+static const struct macro_param *
+macro_find_parameter_by_name (const struct macro *m, struct substring name)
+{
+  size_t idx = 0;
+  while (idx < m->n_params)
+    {
+      const struct macro_param *candidate = &m->params[idx++];
+      const struct substring own = ss_cstr (candidate->name);
+      if (utf8_strncasecmp (own.string, own.length,
+                            name.string, name.length) == 0)
+        return candidate;
+    }
+
+  return NULL;
+}
+
+/* Handles MT in state ME_KEYWORD.
+
+   If MT is an identifier that names one of the macro's parameters, makes
+   that parameter current and returns 0 with ME expecting an equals sign, or
+   reports an error and returns -1 if the parameter already has an argument.
+   Any other token ends the invocation without being counted as part of it;
+   the return value is then that of me_finished(). */
+static int
+me_keyword (struct macro_expander *me, const struct macro_token *mt)
+{
+  const struct token *token = &mt->token;
+  const struct macro_param *p
+    = (token->type == T_ID
+       ? macro_find_parameter_by_name (me->macro, token->string)
+       : NULL);
+  if (p == NULL)
+    return me_finished (me);
+
+  me->param = p;
+  if (me->args[p - me->macro->params] != NULL)
+    {
+      msg (SE,
+           _("Argument %s multiply specified in call to macro %s."),
+           p->name, me->macro->name);
+      return me_error (me);
+    }
+
+  me->n_tokens++;
+  me->state = ME_EQUALS;
+  return 0;
+}
+
+/* Handles MT in state ME_EQUALS.  MT must be the equals sign that follows
+   the keyword of a keyword argument.  If it is, ME moves on to reading the
+   argument itself and 0 is returned; otherwise an error is reported and -1
+   returned.  MT is counted as part of the invocation either way. */
+static int
+me_equals (struct macro_expander *me, const struct macro_token *mt)
+{
+  const struct token *token = &mt->token;
+  me->n_tokens++;
+
+  if (token->type == T_EQUALS)
+    {
+      /* An ARG_ENCLOSE argument has to begin with its opening delimiter,
+         which only ME_ENCLOSE checks for and consumes. */
+      me->state = me->param->arg_type == ARG_ENCLOSE ? ME_ENCLOSE : ME_ARG;
+      return 0;
+    }
+
+  return me_expected (me, mt, &(struct token) { .type = T_EQUALS });
+}
+
+/* Checks whether TOKEN begins an invocation of one of the macros in MACROS.
+
+   Returns -1 and stores a null pointer in *MEP if TOKEN is not an identifier
+   or macro identifier that names a macro in MACROS.
+
+   Otherwise stores a new macro expander in *MEP, which the caller must
+   eventually free with macro_expander_destroy(), and returns:
+
+     - 1 if the macro has no parameters, so that the invocation consists of
+       TOKEN alone.
+
+     - 0 if the macro has parameters.  The caller should feed the tokens
+       that follow TOKEN to macro_expander_add(). */
+int
+macro_expander_create (const struct macro_set *macros,
+                       const struct token *token,
+                       struct macro_expander **mep)
+{
+  *mep = NULL;
+  if (macro_set_is_empty (macros))
+    return -1;
+  if (token->type != T_ID && token->type != T_MACRO_ID)
+    return -1;
+
+  /* NOTE(review): this passes the token's string as a C string, which
+     assumes that it is null-terminated -- confirm. */
+  const struct macro *macro = macro_set_find (macros, token->string.string);
+  if (!macro)
+    return -1;
+
+  struct macro_expander *me = xmalloc (sizeof *me);
+  *me = (struct macro_expander) {
+    .macros = macros,
+    .n_tokens = 1,
+    .macro = macro,
+  };
+  *mep = me;
+
+  if (!macro->n_params)
+    return 1;
+
+  /* An ARG_ENCLOSE argument has to begin with its opening delimiter, which
+     only ME_ENCLOSE checks for and consumes. */
+  const struct macro_param *first = &macro->params[0];
+  me->state = (!first->positional ? ME_KEYWORD
+               : first->arg_type == ARG_ENCLOSE ? ME_ENCLOSE
+               : ME_ARG);
+  me->args = xcalloc (macro->n_params, sizeof *me->args);
+  me->param = macro->params;
+  return 0;
+}
+
+/* Frees ME and the arguments collected in it.  Does nothing if ME is a null
+   pointer.  The macro and the macro set that ME refers to are not freed. */
+void
+macro_expander_destroy (struct macro_expander *me)
+{
+  if (me != NULL)
+    {
+      for (size_t idx = 0; idx < me->macro->n_params; idx++)
+        {
+          struct macro_tokens *arg = me->args[idx];
+          if (arg == NULL)
+            continue;
+
+          macro_tokens_uninit (arg);
+          free (arg);
+        }
+
+      free (me->args);
+      free (me);
+    }
+}
+
+/* Feeds MT, the next token of input, to ME, which decides whether the tokens
+   seen so far invoke a macro.
+
+   A return value of -1 means that the tokens do not actually invoke a macro.
+   The caller should consume the first token without expanding it.
+
+   A return value of 0 means that ME needs more tokens, either for macro
+   arguments or to decide whether this is a macro invocation at all.  The
+   caller should call macro_expander_add() again with the next token.
+
+   A positive return value is the number of tokens that make up a macro
+   invocation.  It may be less than the number of tokens fed to ME, because
+   some lookahead can be needed to find the end of an invocation.  The caller
+   should call macro_expander_get_expansion() to obtain the expansion. */
+int
+macro_expander_add (struct macro_expander *me, const struct macro_token *mt)
+{
+  switch (me->state)
+    {
+    case ME_KEYWORD:
+      return me_keyword (me, mt);
+
+    case ME_EQUALS:
+      return me_equals (me, mt);
+
+    case ME_ENCLOSE:
+      return me_enclose (me, mt);
+
+    case ME_ARG:
+      return me_add_arg (me, mt);
+
+    case ME_ERROR:
+      /* Once an error has occurred, every further token is rejected. */
+      return -1;
+
+    default:
+      NOT_REACHED ();
+    }
+}
+
+/* Each argument to a macro function is one of:
+
+       - A quoted string or other single literal token.
+
+       - An argument to the macro being expanded, e.g. !1 or a named argument.
+
+       - !*.
+
+       - A function invocation.
+
+   Each function invocation yields a character sequence to be turned into a
+   sequence of tokens.  The case where that character sequence is a single
+   quoted string is an important special case.
+*/
+struct parse_macro_function_ctx
+  {
+    /* Tokens to parse.  The first one is the candidate function name. */
+    struct macro_token *input;
+    /* Number of tokens in 'input'. */
+    size_t n_input;
+    /* The remaining members are the corresponding arguments of the
+       macro_expand() call in progress.  'me' supplies the values of macro
+       parameters that appear as function arguments. */
+    int nesting_countdown;
+    const struct macro_set *macros;
+    const struct macro_expander *me;
+    bool *expand;
+  };
+
+static void
+macro_expand (const struct macro_tokens *,
+              int nesting_countdown, const struct macro_set *,
+              const struct macro_expander *, bool *expand, struct macro_tokens *exp);
+
+static bool
+expand_macro_function (struct parse_macro_function_ctx *ctx,
+                       struct macro_token *output,
+                       size_t *input_consumed);
+
+/* Parses one argument to a macro function, starting at token index I within
+   CTX->input, and stores it in *FARG, which the caller owns afterward.
+   Returns the number of input tokens that the argument occupies.
+
+   The argument is, in order of preference:
+
+     - The value of a parameter of the macro being expanded.  A value that
+       is not exactly one token becomes a single T_MACRO_ID token whose string
+       joins the representations of the value's tokens with spaces.
+
+     - The result of a nested macro function invocation.
+
+     - Otherwise, a copy of the token at index I. */
+static size_t
+parse_function_arg (struct parse_macro_function_ctx *ctx,
+                    size_t i, struct macro_token *farg)
+{
+  struct macro_token *tokens = ctx->input;
+  const struct token *token = &tokens[i].token;
+  if (token->type == T_MACRO_ID)
+    {
+      /* CTX->me is null when macro_expand() is processing tokens outside
+         the context of a macro invocation, such as a macro argument being
+         expanded.  There are no parameters to refer to in that case. */
+      const struct macro_param *param
+        = (ctx->me
+           ? macro_find_parameter_by_name (ctx->me->macro, token->string)
+           : NULL);
+      if (param)
+        {
+          size_t param_idx = param - ctx->me->macro->params;
+          const struct macro_tokens *marg = ctx->me->args[param_idx];
+          if (marg->n == 1)
+            macro_token_copy (farg, &marg->mts[0]);
+          else
+            {
+              struct string s = DS_EMPTY_INITIALIZER;
+              for (size_t j = 0; j < marg->n; j++)
+                {
+                  if (j)
+                    ds_put_byte (&s, ' ');
+                  ds_put_substring (&s, marg->mts[j].representation);
+                }
+
+              /* The token takes over S's storage, so the representation
+                 needs a copy of its own. */
+              struct substring s_copy;
+              ss_alloc_substring (&s_copy, s.ss);
+
+              *farg = (struct macro_token) {
+                .token = { .type = T_MACRO_ID, .string = s.ss },
+                .representation = s_copy,
+              };
+            }
+          return 1;
+        }
+
+      struct parse_macro_function_ctx subctx = {
+        .input = &ctx->input[i],
+        .n_input = ctx->n_input - i,
+        .nesting_countdown = ctx->nesting_countdown,
+        .macros = ctx->macros,
+        .me = ctx->me,
+        .expand = ctx->expand,
+      };
+      size_t subinput_consumed;
+      if (expand_macro_function (&subctx, farg, &subinput_consumed))
+        return subinput_consumed;
+    }
+
+  macro_token_copy (farg, &tokens[i]);
+  return 1;
+}
+
+/* Tries to parse the tokens in CTX->input as an invocation of the macro
+   function named FUNCTION, that is, as FUNCTION followed by a parenthesized,
+   comma-separated list of arguments.
+
+   On success, returns true, stores the arguments in *ARGS (which the caller
+   must free with macro_tokens_uninit()), and stores the number of tokens in
+   the invocation in *INPUT_CONSUMED.  The number of arguments is between
+   MIN_ARGS and MAX_ARGS, inclusive.
+
+   Returns false, leaving nothing in *ARGS for the caller to free, if the
+   input does not begin with FUNCTION or if the invocation is malformed.  A
+   diagnostic is printed in the latter case. */
+static bool
+parse_macro_function (struct parse_macro_function_ctx *ctx,
+                      struct macro_tokens *args,
+                      struct substring function,
+                      int min_args, int max_args,
+                      size_t *input_consumed)
+{
+  struct macro_token *tokens = ctx->input;
+  size_t n_tokens = ctx->n_input;
+
+  if (!n_tokens
+      || tokens[0].token.type != T_MACRO_ID
+      || !ss_equals_case (tokens[0].token.string, function))
+    return false;
+
+  if (n_tokens < 2 || tokens[1].token.type != T_LPAREN)
+    {
+      printf ("`(' expected following %s.\n", function.string);
+      return false;
+    }
+
+  *args = (struct macro_tokens) { .n = 0 };
+
+  /* Token 0 is the function name and token 1 the left parenthesis, so the
+     arguments start at token 2. */
+  for (size_t i = 2;; )
+    {
+      if (i >= n_tokens)
+        goto unexpected_end;
+      if (tokens[i].token.type == T_RPAREN)
+        {
+          *input_consumed = i + 1;
+          if (args->n < min_args || args->n > max_args)
+            {
+              printf ("Wrong number of arguments to %s.\n", function.string);
+              goto error;
+            }
+          return true;
+        }
+
+      i += parse_function_arg (ctx, i, macro_tokens_add_uninit (args));
+      if (i >= n_tokens)
+        goto unexpected_end;
+
+      if (tokens[i].token.type == T_COMMA)
+        i++;
+      else if (tokens[i].token.type != T_RPAREN)
+        {
+          printf ("Expecting `,' or `)' in %s invocation.\n",
+                  function.string);
+          goto error;
+        }
+    }
+
+unexpected_end:
+  printf ("Missing closing parenthesis in arguments to %s.\n",
+          function.string);
+  /* Fall through. */
+error:
+  macro_tokens_uninit (args);
+  return false;
+}
+
+/* Tries to evaluate the tokens in CTX->input as an invocation of one of the
+   macro functions !LENGTH, !BLANKS, !CONCAT, !QUOTE, or !UNQUOTE.
+
+   On success, returns true, stores the function's result in *OUTPUT (which
+   the caller must free with macro_token_uninit()), and stores the number of
+   tokens in the invocation in *INPUT_CONSUMED.  Otherwise returns false. */
+static bool
+expand_macro_function (struct parse_macro_function_ctx *ctx,
+                       struct macro_token *output,
+                       size_t *input_consumed)
+{
+  struct macro_tokens args;
+
+  if (parse_macro_function (ctx, &args, ss_cstr ("!length"), 1, 1,
+                            input_consumed))
+    {
+      size_t length = args.mts[0].representation.length;
+      *output = (struct macro_token) {
+        .token = { .type = T_POS_NUM, .number = length },
+        .representation = ss_cstr (xasprintf ("%zu", length)),
+      };
+    }
+  else if (parse_macro_function (ctx, &args, ss_cstr ("!blanks"), 1, 1,
+                                 input_consumed))
+    {
+      /* XXX this isn't right, it might be a character string containing a
+         positive integer, e.g. via !CONCAT. */
+      if (args.mts[0].token.type != T_POS_NUM)
+        {
+          printf ("argument to !BLANKS must be positive integer\n");
+          macro_tokens_uninit (&args);
+          return false;
+        }
+
+      struct string s = DS_EMPTY_INITIALIZER;
+      ds_put_byte_multiple (&s, ' ', args.mts[0].token.number);
+
+      /* The token takes over S's storage, so the representation needs a
+         copy of its own. */
+      struct substring s_copy;
+      ss_alloc_substring (&s_copy, s.ss);
+
+      *output = (struct macro_token) {
+        .token = { .type = T_ID, .string = s.ss },
+        .representation = s_copy,
+      };
+    }
+  else if (parse_macro_function (ctx, &args, ss_cstr ("!concat"), 1, INT_MAX,
+                                 input_consumed))
+    {
+      /* S must start out empty because the arguments are appended to it. */
+      struct string s = DS_EMPTY_INITIALIZER;
+      bool all_strings = true;
+      for (size_t i = 0; i < args.n; i++)
+        {
+          if (args.mts[i].token.type == T_STRING)
+            ds_put_substring (&s, args.mts[i].token.string);
+          else
+            {
+              all_strings = false;
+              ds_put_substring (&s, args.mts[i].representation);
+            }
+        }
+
+      /* Concatenating only quoted strings yields a quoted string. */
+      if (all_strings)
+        {
+          *output = (struct macro_token) {
+            .token = { .type = T_STRING, .string = s.ss },
+          };
+          output->representation = ss_cstr (token_to_string (&output->token));
+        }
+      else
+        {
+          *output = (struct macro_token) {
+            .token = { .type = T_MACRO_ID /*XXX*/, .string = s.ss },
+          };
+          ss_alloc_substring (&output->representation, s.ss);
+        }
+    }
+  else if (parse_macro_function (ctx, &args, ss_cstr ("!quote"), 1, 1,
+                                 input_consumed))
+    {
+      if (args.mts[0].token.type == T_STRING)
+        macro_token_copy (output, &args.mts[0]);
+      else
+        {
+          *output = (struct macro_token) { .token = { .type = T_STRING } };
+          ss_alloc_substring (&output->token.string, args.mts[0].representation);
+          output->representation = ss_cstr (token_to_string (&output->token));
+        }
+    }
+  else if (parse_macro_function (ctx, &args, ss_cstr ("!unquote"), 1, 1,
+                                 input_consumed))
+    {
+      if (args.mts[0].token.type == T_STRING)
+        {
+          *output = (struct macro_token) { .token = { .type = T_MACRO_ID } };
+          ss_alloc_substring (&output->token.string, args.mts[0].token.string);
+          output->representation = ss_cstr (token_to_string (&output->token));
+        }
+      else
+        macro_token_copy (output, &args.mts[0]);
+    }
+  else
+    return false;
+
+  macro_tokens_uninit (&args);
+  return true;
+}
+
+/* Expands the tokens in MTS, adding the result to EXP.
+
+   NESTING_COUNTDOWN is the number of levels of nested macro invocation that
+   may still be expanded.  If it is zero or less, a diagnostic is printed and
+   MTS is added to EXP without expansion.
+
+   MACROS is the set of macros that MTS may invoke.  ME, if nonnull, is the
+   invocation whose arguments replace references in MTS to its macro's
+   parameters.
+
+   *EXPAND says whether macro invocations, and arguments to parameters with
+   'expand_arg' set, get expanded.  !ONEXPAND and !OFFEXPAND tokens in MTS set
+   and clear it. */
+static void
+macro_expand (const struct macro_tokens *mts,
+              int nesting_countdown, const struct macro_set *macros,
+              const struct macro_expander *me, bool *expand,
+              struct macro_tokens *exp)
+{
+  if (nesting_countdown <= 0)
+    {
+      printf ("maximum nesting level exceeded\n");
+      for (size_t i = 0; i < mts->n; i++)
+        macro_tokens_add (exp, &mts->mts[i]);
+      return;
+    }
+
+  for (size_t i = 0; i < mts->n; i++)
+    {
+      const struct macro_token *mt = &mts->mts[i];
+      const struct token *token = &mt->token;
+      /* A reference to one of the macro's parameters is replaced by the
+         corresponding argument. */
+      if (token->type == T_MACRO_ID && me)
+        {
+          const struct macro_param *param = macro_find_parameter_by_name (
+            me->macro, token->string);
+          if (param)
+            {
+              const struct macro_tokens *arg = me->args[param - me->macro->params];
+              //macro_tokens_print (arg, stdout);
+              if (*expand && param->expand_arg)
+                macro_expand (arg, nesting_countdown, macros, NULL, expand, exp);
+              else
+                for (size_t i = 0; i < arg->n; i++)
+                  macro_tokens_add (exp, &arg->mts[i]);
+              continue;
+            }
+        }
+
+      /* Check whether the tokens starting here invoke a macro, by feeding
+         the tokens that follow (and then T_STOP, once MTS runs out) to a
+         macro expander until it comes to a decision. */
+      if (*expand)
+        {
+          struct macro_expander *subme;
+          int retval = macro_expander_create (macros, token, &subme);
+          for (size_t j = 1; !retval; j++)
+            {
+              const struct macro_token stop = { .token = { .type = T_STOP } };
+              retval = macro_expander_add (
+                subme, i + j < mts->n ? &mts->mts[i + j] : &stop);
+            }
+          if (retval > 0)
+            {
+              /* Skip the RETVAL tokens of the invocation.  The loop itself
+                 accounts for one of them. */
+              i += retval - 1;
+              macro_expand (&subme->macro->body, nesting_countdown - 1, macros,
+                            subme, expand, exp);
+              macro_expander_destroy (subme);
+              continue;
+            }
+
+          macro_expander_destroy (subme);
+        }
+
+      /* Only T_MACRO_ID tokens can be macro functions or expansion
+         directives.  Everything else is passed through. */
+      if (token->type != T_MACRO_ID)
+        {
+          macro_tokens_add (exp, mt);
+          continue;
+        }
+
+      /* Maybe each arg should just be a string, either a quoted string or a
+         non-quoted string containing tokens. */
+      struct parse_macro_function_ctx ctx = {
+        .input = &mts->mts[i],
+        .n_input = mts->n - i,
+        .nesting_countdown = nesting_countdown,
+        .macros = macros,
+        .me = me,
+        .expand = expand,
+      };
+      struct macro_token function_output;
+      size_t function_consumed;
+      if (expand_macro_function (&ctx, &function_output, &function_consumed))
+        {
+          i += function_consumed - 1;
+
+          /* A T_MACRO_ID result carries text that has to be turned back into
+             tokens.  Any other result is already a finished token. */
+          if (function_output.token.type == T_MACRO_ID)
+            macro_tokens_from_string (exp, function_output.token.string,
+                                      SEG_MODE_INTERACTIVE /* XXX */);
+          else
+            macro_tokens_add (exp, &function_output);
+          macro_token_uninit (&function_output);
+
+          continue;
+        }
+
+      /* The expansion directives produce no output of their own. */
+      if (ss_equals_case (token->string, ss_cstr ("!onexpand")))
+        *expand = true;
+      else if (ss_equals_case (token->string, ss_cstr ("!offexpand")))
+        *expand = false;
+      else
+        macro_tokens_add (exp, mt);
+    }
+}
+
+/* Expands the body of the macro invoked through ME, substituting the
+   arguments collected in ME, and adds the resulting tokens to EXP.  Expansion
+   starts out enabled, and macro invocations may nest as many levels deep as
+   settings_get_mnest() allows. */
+void
+macro_expander_get_expansion (struct macro_expander *me, struct macro_tokens *exp)
+{
+  const int max_nesting = settings_get_mnest ();
+  bool expand = true;
+
+  macro_expand (&me->macro->body, max_nesting, me->macros, me, &expand, exp);
+}
+
diff --git a/src/language/lexer/macro.h b/src/language/lexer/macro.h

new file mode 100644 (file)

index 0000000..23ae1d9
--- /dev/null
+++ b/src/language/lexer/macro.h
@@ -0,0 +1,120 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef MACRO_H
+#define MACRO_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "libpspp/hmap.h"
+#include "libpspp/str.h"
+#include "language/lexer/segment.h"
+#include "language/lexer/token.h"
+
+struct macro_expander;
+
+/* A token together with its textual form. */
+struct macro_token
+  {
+    struct token token;
+    /* Text for the token.  Presumably the syntax it was scanned from, or
+       equivalent syntax for a token that was synthesized -- confirm. */
+    struct substring representation;
+  };
+
+void macro_token_copy (struct macro_token *, const struct macro_token *);
+void macro_token_uninit (struct macro_token *);
+
+/* An array of macro tokens. */
+struct macro_tokens
+  {
+    struct macro_token *mts;    /* The tokens. */
+    size_t n;                   /* Number of tokens in 'mts'. */
+    size_t allocated;           /* Presumably the capacity of 'mts'. */
+  };
+
+void macro_tokens_copy (struct macro_tokens *, const struct macro_tokens *);
+void macro_tokens_uninit (struct macro_tokens *);
+struct macro_token *macro_tokens_add_uninit (struct macro_tokens *);
+void macro_tokens_add (struct macro_tokens *, const struct macro_token *);
+
+void macro_tokens_from_string (struct macro_tokens *, const struct substring,
+                               enum segmenter_mode);
+
+void macro_tokens_print (const struct macro_tokens *, FILE *);
+
+/* A parameter of a macro. */
+struct macro_param
+  {
+    bool positional;            /* Is this a positional parameter? */
+    char *name;                 /* "!1" or "!name". */
+    struct macro_tokens def;    /* Default expansion. */
+    bool expand_arg;            /* Macro-expand the argument? */
+
+    /* How the extent of an argument for this parameter is determined. */
+    enum
+      {
+        ARG_N_TOKENS,           /* A fixed number of tokens. */
+        ARG_CHAREND,            /* Presumably up to a given token; confirm. */
+        ARG_ENCLOSE,            /* Between a pair of delimiter tokens. */
+        ARG_CMDEND              /* Up to the end of the command. */
+      }
+    arg_type;
+    /* Details for 'arg_type'.  Being a union, only one member is in use. */
+    union
+      {
+        int n_tokens;           /* ARG_N_TOKENS: number of tokens. */
+        struct token charend;   /* ARG_CHAREND: presumably the terminator. */
+        struct token enclose[2]; /* ARG_ENCLOSE: opening, closing delimiter. */
+      };
+  };
+
+/* A macro: its name, parameters, and body. */
+struct macro
+  {
+    struct hmap_node hmap_node; /* Indexed by 'name'. */
+    char *name;                 /* Name by which the macro is invoked. */
+
+    struct macro_param *params; /* Array of 'n_params' parameters. */
+    size_t n_params;            /* Number of parameters; may be zero. */
+
+    struct macro_tokens body;   /* Tokens that an invocation expands into. */
+  };
+
+void macro_destroy (struct macro *);
+
+/* A collection of macros that can be looked up by name. */
+struct macro_set
+  {
+    struct hmap macros;         /* Contains "struct macro"s. */
+  };
+
+struct macro_set *macro_set_create (void);
+void macro_set_destroy (struct macro_set *);
+const struct macro *macro_set_find (const struct macro_set *,
+                                    const char *);
+void macro_set_add (struct macro_set *, struct macro *);
+
+/* Returns true if SET's hash map contains no macros, false otherwise. */
+static inline bool
+macro_set_is_empty (const struct macro_set *set)
+{
+  const struct hmap *map = &set->macros;
+  return hmap_is_empty (map);
+}
+\f
+/* Macro expansion. */
+
+int macro_expander_create (const struct macro_set *,
+                           const struct token *,
+                           struct macro_expander **);
+void macro_expander_destroy (struct macro_expander *);
+
+int macro_expander_add (struct macro_expander *, const struct macro_token *);
+
+void macro_expander_get_expansion (struct macro_expander *, struct macro_tokens *);
+
+#endif /* macro.h */
diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c

index 86ebb7d00675cd6c89d223d49b1924cb278473e9..0e29dc9e71be9702c3213ab1ef32f85fba9836c5 100644 (file)
--- a/src/language/lexer/scan.c
+++ b/src/language/lexer/scan.c
@@ -548,7 +548,7 @@ void
  scanner_init (struct scanner *scanner, struct token *token)
  {
    scanner->state = S_START;
-  token_init (token);
+  *token = (struct token) { .type = T_STOP };
  }
  
  /* Adds the segment with type TYPE and UTF-8 text S to SCANNER.  TOKEN must be
diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c

index a4fea0b213118559d474b94a2bc4efa4008ff0d0..ac88117ff5270e1a8b43cdf38172af5aaf431f16 100644 (file)
--- a/src/language/lexer/segment.c
+++ b/src/language/lexer/segment.c
@@ -28,6 +28,7 @@
  
  #include "gl/c-ctype.h"
  #include "gl/c-strcase.h"
+#include "gl/verify.h"
  
  enum segmenter_state
    {
@@ -54,6 +55,9 @@ enum segmenter_state
      S_TITLE_2
    };
  
+/* S_SHBANG is the start state that SEGMENTER_INIT refers to as just 0. */
+verify (S_SHBANG == 0);
+
  #define SS_START_OF_LINE (1u << 0)
  #define SS_START_OF_COMMAND (1u << 1)
  
@@ -1806,9 +1810,7 @@ segment_type_to_string (enum segment_type type)
  void
  segmenter_init (struct segmenter *s, enum segmenter_mode mode)
  {
-  s->state = S_SHBANG;
-  s->substate = 0;
-  s->mode = mode;
+  *s = (struct segmenter) SEGMENTER_INIT (mode);
  }
  
  /* Returns the mode passed to segmenter_init() for S. */
diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h

index 02a269bdd2779b53a0f0bddd00e2641ddaf184b9..10551066b0ec31230194077c91d6f2149047eef9 100644 (file)
--- a/src/language/lexer/segment.h
+++ b/src/language/lexer/segment.h
@@ -117,6 +117,8 @@ struct segmenter
      unsigned char mode;
    };
  
+/* Initializer for a struct segmenter in mode MODE.  Every member other than
+   'mode' is left at zero.  MODE is parenthesized so that any expression may
+   be passed as the argument. */
+#define SEGMENTER_INIT(MODE) { .mode = (MODE) }
+
  void segmenter_init (struct segmenter *, enum segmenter_mode);
  
  enum segmenter_mode segmenter_get_mode (const struct segmenter *);
diff --git a/src/language/lexer/token.c b/src/language/lexer/token.c

index 718f3d07f3d480a1580d59714ee8348e63a38e5e..9fa5bbb6ba81804534c809f4a2b90bb552c607cf 100644 (file)
--- a/src/language/lexer/token.c
+++ b/src/language/lexer/token.c
@@ -27,17 +27,17 @@
  #include "libpspp/cast.h"
  #include "libpspp/misc.h"
  
-
  #include "gl/ftoastr.h"
  #include "gl/xalloc.h"
  
-/* Initializes TOKEN with an arbitrary type, number 0, and a null string. */
  void
-token_init (struct token *token)
+token_copy (struct token *dst, const struct token *src)
  {
-  token->type = 0;
-  token->number = 0.0;
-  token->string = ss_empty ();
+  *dst = (struct token) {
+    .type = src->type,
+    .number = src->number,
+  };
+  ss_alloc_substring (&dst->string, src->string);
  }
  
  /* Frees the string that TOKEN contains. */
@@ -45,7 +45,33 @@ void
  token_uninit (struct token *token)
  {
    if (token != NULL)
-    ss_dealloc (&token->string);
+    {
+      ss_dealloc (&token->string);
+      *token = (struct token) { .type = T_STOP };
+    }
+}
+
+/* Returns true if A and B have the same type and, for the types that carry a
+   value, the same value: numeric tokens compare by number, while identifier,
+   macro identifier, macro punctuation, and string tokens compare by string
+   using ss_equals().  Tokens of any other type are equal whenever their types
+   are. */
+bool
+token_equal (const struct token *a, const struct token *b)
+{
+  if (a->type != b->type)
+    return false;
+
+  if (a->type == T_POS_NUM || a->type == T_NEG_NUM)
+    return a->number == b->number;
+
+  if (a->type == T_ID
+      || a->type == T_MACRO_ID
+      || a->type == T_MACRO_PUNCT
+      || a->type == T_STRING)
+    return ss_equals (a->string, b->string);
+
+  return true;
+}
  
  static char *
@@ -150,7 +176,7 @@ token_to_string (const struct token *token)
        return string_representation (token->string);
  
      default:
-      return xstrdup_if_nonnull (token_type_to_name (token->type));
+      return xstrdup_if_nonnull (token_type_to_string (token->type));
      }
  }
  
@@ -172,3 +198,41 @@ token_print (const struct token *token, FILE *stream)
               (int) token->string.length, token->string.string);
    putc ('\n', stream);
  }
+\f
+/* Initializes DST as a copy of SRC.  Each token is duplicated with
+   token_copy(), and DST gets room for exactly as many tokens as SRC
+   contains.  Whatever DST held before is overwritten without being freed. */
+void
+tokens_copy (struct tokens *dst, const struct tokens *src)
+{
+  const size_t n = src->n;
+
+  dst->tokens = xnmalloc (n, sizeof *dst->tokens);
+  dst->n = n;
+  dst->allocated = n;
+
+  for (size_t idx = 0; idx < n; idx++)
+    token_copy (&dst->tokens[idx], &src->tokens[idx]);
+}
+
+/* Frees the tokens in TOKENS and the array that holds them.  TOKENS itself
+   is not freed. */
+void
+tokens_uninit (struct tokens *tokens)
+{
+  size_t idx = 0;
+  while (idx < tokens->n)
+    {
+      token_uninit (&tokens->tokens[idx]);
+      idx++;
+    }
+
+  free (tokens->tokens);
+}
+
+/* Appends a copy of T to TOKENS, expanding the array if it is full. */
+void
+tokens_add (struct tokens *tokens, const struct token *t)
+{
+  /* Grow only when every allocated slot is in use.  Growing on every call
+     would make the capacity increase geometrically with the number of tokens
+     added. */
+  if (tokens->n >= tokens->allocated)
+    tokens->tokens = x2nrealloc (tokens->tokens, &tokens->allocated,
+                                 sizeof *tokens->tokens);
+
+  token_copy (&tokens->tokens[tokens->n++], t);
+}
+
+/* Prints each of the tokens in TOKENS, in order, on STREAM using
+   token_print(). */
+void
+tokens_print (const struct tokens *tokens, FILE *stream)
+{
+  size_t idx = 0;
+  while (idx < tokens->n)
+    token_print (&tokens->tokens[idx++], stream);
+}
diff --git a/src/language/lexer/token.h b/src/language/lexer/token.h

index cab1a8cf9c63d011b3788e8a0123ba7acf19dd34..b334edfef6c385cdc8a28482198dbb8ce8088a59 100644 (file)
--- a/src/language/lexer/token.h
+++ b/src/language/lexer/token.h
@@ -17,6 +17,7 @@
  #ifndef TOKEN_H
  #define TOKEN_H 1
  
+#include <stdbool.h>
  #include <stdio.h>
  #include "libpspp/str.h"
  #include "data/identifier.h"
@@ -32,14 +33,26 @@ struct token
      struct substring string;
    };
  
-#define TOKEN_INITIALIZER(TYPE, NUMBER, STRING) \
-        { TYPE, NUMBER, SS_LITERAL_INITIALIZER (STRING) }
-
-void token_init (struct token *);
+void token_copy (struct token *, const struct token *);
  void token_uninit (struct token *);
  
+bool token_equal (const struct token *, const struct token *);
+
  char *token_to_string (const struct token *);
  
  void token_print (const struct token *, FILE *);
+\f
+/* An array of tokens. */
+struct tokens
+  {
+    struct token *tokens;       /* The tokens. */
+    size_t n;                   /* Number of tokens in use. */
+    size_t allocated;           /* Number of tokens allocated. */
+  };
+
+void tokens_copy (struct tokens *, const struct tokens *);
+void tokens_uninit (struct tokens *);
+void tokens_add (struct tokens *, const struct token *);
+
+void tokens_print (const struct tokens *, FILE *);
  
  #endif /* token.h */
diff --git a/tests/automake.mk b/tests/automake.mk

index ec81e5288140196c5dbd368faf8c00d80e8480d7..4de61417b2ef7cf95b8898bd50abded0f386abfd 100644 (file)
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -339,6 +339,7 @@ TESTSUITE_AT = \
         tests/data/sys-file.at \
         tests/data/encrypted-file.at \
         tests/language/command.at \
+       tests/language/control/define.at \
         tests/language/control/do-if.at \
         tests/language/control/do-repeat.at \
         tests/language/control/loop.at \
diff --git a/tests/language/control/define.at b/tests/language/control/define.at

new file mode 100644 (file)

index 0000000..d187b04
--- /dev/null
+++ b/tests/language/control/define.at
@@ -0,0 +1,26 @@
+dnl PSPP - a program for statistical analysis.
+dnl Copyright (C) 2017 Free Software Foundation, Inc.
+dnl
+dnl This program is free software: you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation, either version 3 of the License, or
+dnl (at your option) any later version.
+dnl
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program.  If not, see <http://www.gnu.org/licenses/>.
+dnl
+AT_BANNER([DEFINE])
+
+AT_SETUP([DEFINE])
+AT_DATA([define.sps], [dnl
+DEFINE !variables()
+  brand model license color
+!ENDDEFINE.
+])
+AT_CHECK([pspp define.sps])
+AT_CLEANUP
diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c

index a3b67b89b24b2cd4eb59eb73617d1bd3682b7f4a..cb46401b34df2326a2858375cfd7826878b8a014 100644 (file)
--- a/tests/language/lexer/segment-test.c
+++ b/tests/language/lexer/segment-test.c
@@ -108,8 +108,7 @@ main (int argc, char *argv[])
  static void
  check_segmentation (const char *input, size_t length, bool print_segments)
  {
-  struct segmenter s;
-  segmenter_init (&s, mode);
+  struct segmenter s = SEGMENTER_INIT (mode);
  
    size_t line_number = 1;
    size_t line_offset = 0;
author	Ben Pfaff <blp@cs.stanford.edu>
	Tue, 23 Mar 2021 14:14:48 +0000 (07:14 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 30 May 2021 22:50:57 +0000 (15:50 -0700)
src/language/command.def		patch \| blob \| history
src/language/control/automake.mk		patch \| blob \| history
src/language/control/define.c	[new file with mode: 0644]	patch \| blob
src/language/control/repeat.c		patch \| blob \| history
src/language/lexer/automake.mk		patch \| blob \| history
src/language/lexer/lexer.c		patch \| blob \| history
src/language/lexer/lexer.h		patch \| blob \| history
src/language/lexer/macro.c	[new file with mode: 0644]	patch \| blob
src/language/lexer/macro.h	[new file with mode: 0644]	patch \| blob
src/language/lexer/scan.c		patch \| blob \| history
src/language/lexer/segment.c		patch \| blob \| history
src/language/lexer/segment.h		patch \| blob \| history
src/language/lexer/token.c		patch \| blob \| history
src/language/lexer/token.h		patch \| blob \| history
tests/automake.mk		patch \| blob \| history
tests/language/control/define.at	[new file with mode: 0644]	patch \| blob
tests/language/lexer/segment-test.c		patch \| blob \| history