From 5a6b751888278c8c849ab0f4adf99f1be610e610 Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Thu, 18 Nov 2010 22:19:58 -0800
Subject: [PATCH] command: Factor command name matching out of command.c.

Making command name parsing into a library will allow other code to use
the same functionality, which will be useful later in the lexer.
It also simplifies parsing command names and allows us to easily
add tests for command name parsing.

The new command name parsing code supports UTF-8.  This is not useful yet,
because its only client does not feed it UTF-8 strings, but it will be
useful later when the rest of the lexer is rewritten, where it will gain
another client that does feed it UTF-8.
---
 src/language/command.c                   | 385 ++++-------------------
 src/language/expressions/operations.def  |   2 +-
 src/language/lexer/automake.mk           |   5 +-
 src/language/lexer/command-name.c        | 235 ++++++++++++++
 src/language/lexer/command-name.h        |  46 +++
 tests/automake.mk                        |  17 +-
 tests/language/command.at                |   2 +-
 tests/language/data-io/inpt-pgm.at       |   4 +-
 tests/language/lexer/command-name-test.c | 149 +++++++++
 tests/language/lexer/command-name.at     | 234 ++++++++++++++
 10 files changed, 753 insertions(+), 326 deletions(-)
 create mode 100644 src/language/lexer/command-name.c
 create mode 100644 src/language/lexer/command-name.h
 create mode 100644 tests/language/lexer/command-name-test.c
 create mode 100644 tests/language/lexer/command-name.at

diff --git a/src/language/command.c b/src/language/command.c
index a6805898..80df9143 100644
--- a/src/language/command.c
+++ b/src/language/command.c
@@ -28,6 +28,7 @@
 #include "data/procedure.h"
 #include "data/settings.h"
 #include "data/variable.h"
+#include "language/lexer/command-name.h"
 #include "language/lexer/lexer.h"
 #include "language/prompt.h"
 #include "libpspp/assertion.h"
@@ -115,7 +116,6 @@ static const size_t command_cnt = sizeof commands / sizeof *commands;
 
 static bool in_correct_state (const struct command *, enum cmd_state);
 static bool report_state_mismatch (const struct command *, enum cmd_state);
-static const struct command *find_command (const char *name);
 static void set_completion_state (enum cmd_state);
 
 /* Command parser. */
@@ -240,257 +240,23 @@ do_parse_command (struct lexer *lexer,
   return result;
 }
 
-static size_t
-match_strings (const char *a, size_t a_len,
-               const char *b, size_t b_len)
-{
-  size_t match_len = 0;
-
-  while (a_len > 0 && b_len > 0)
-    {
-      /* Mismatch always returns zero. */
-      if (toupper ((unsigned char) *a++) != toupper ((unsigned char) *b++))
-        return 0;
-
-      /* Advance. */
-      a_len--;
-      b_len--;
-      match_len++;
-    }
-
-  return match_len;
-}
-
-/* Returns the first character in the first word in STRING,
-   storing the word's length in *WORD_LEN.  If no words remain,
-   returns a null pointer and stores 0 in *WORD_LEN.  Words are
-   sequences of alphanumeric characters or single
-   non-alphanumeric characters.  Words are delimited by
-   spaces. */
-static const char *
-find_word (const char *string, size_t *word_len)
-{
-  /* Skip whitespace and asterisks. */
-  while (isspace ((unsigned char) *string))
-    string++;
-
-  /* End of string? */
-  if (*string == '\0')
-    {
-      *word_len = 0;
-      return NULL;
-    }
-
-  /* Special one-character word? */
-  if (!isalnum ((unsigned char) *string))
-    {
-      *word_len = 1;
-      return string;
-    }
-
-  /* Alphanumeric word. */
-  *word_len = 1;
-  while (isalnum ((unsigned char) string[*word_len]))
-    (*word_len)++;
-
-  return string;
-}
-
-/* Returns true if strings A and B can be confused based on
-   their first three letters. */
-static bool
-conflicting_3char_prefixes (const char *a, const char *b)
-{
-  size_t aw_len, bw_len;
-  const char *aw, *bw;
-
-  aw = find_word (a, &aw_len);
-  bw = find_word (b, &bw_len);
-  assert (aw != NULL && bw != NULL);
-
-  /* Words that are the same don't conflict. */
-  if (aw_len == bw_len && !buf_compare_case (aw, bw, aw_len))
-    return false;
-
-  /* Words that are otherwise the same in the first three letters
-     do conflict. */
-  return ((aw_len > 3 && bw_len > 3)
-          || (aw_len == 3 && bw_len > 3)
-          || (bw_len == 3 && aw_len > 3)) && !buf_compare_case (aw, bw, 3);
-}
-
-/* Returns true if CMD can be confused with another command
-   based on the first three letters of its first word. */
-static bool
-conflicting_3char_prefix_command (const struct command *cmd)
-{
-  assert (cmd >= commands && cmd < commands + command_cnt);
-
-  return ((cmd > commands
-           && conflicting_3char_prefixes (cmd[-1].name, cmd[0].name))
-          || (cmd < commands + command_cnt
-              && conflicting_3char_prefixes (cmd[0].name, cmd[1].name)));
-}
-
-/* Ways that a set of words can match a command name. */
-enum command_match
-  {
-    MISMATCH,           /* Not a match. */
-    PARTIAL_MATCH,      /* The words begin the command name. */
-    COMPLETE_MATCH      /* The words are the command name. */
-  };
-
-/* Figures out how well the WORD_CNT words in WORDS match CMD,
-   and returns the appropriate enum value.  If WORDS are a
-   partial match for CMD and the next word in CMD is a dash, then
-   *DASH_POSSIBLE is set to 1 if DASH_POSSIBLE is non-null;
-   otherwise, *DASH_POSSIBLE is unchanged. */
-static enum command_match
-cmd_match_words (const struct command *cmd,
-                 char *const words[], size_t word_cnt,
-                 int *dash_possible)
-{
-  const char *word;
-  size_t word_len;
-  size_t word_idx;
-
-  for (word = find_word (cmd->name, &word_len), word_idx = 0;
-       word != NULL && word_idx < word_cnt;
-       word = find_word (word + word_len, &word_len), word_idx++)
-    if (word_len != strlen (words[word_idx])
-        || buf_compare_case (word, words[word_idx], word_len))
-      {
-        size_t match_chars = match_strings (word, word_len,
-                                            words[word_idx],
-                                            strlen (words[word_idx]));
-        if (match_chars == 0)
-          {
-            /* Mismatch. */
-            return MISMATCH;
-          }
-        else if (match_chars == 1 || match_chars == 2)
-          {
-            /* One- and two-character abbreviations are not
-               acceptable. */
-            return MISMATCH;
-          }
-        else if (match_chars == 3)
-          {
-            /* Three-character abbreviations are acceptable
-               in the first word of a command if there are
-               no name conflicts.  They are always
-               acceptable after the first word. */
-            if (word_idx == 0 && conflicting_3char_prefix_command (cmd))
-              return MISMATCH;
-          }
-        else /* match_chars > 3 */
-          {
-            /* Four-character and longer abbreviations are
-               always acceptable.  */
-          }
-      }
-
-  if (word == NULL && word_idx == word_cnt)
-    {
-      /* cmd->name = "FOO BAR", words[] = {"FOO", "BAR"}. */
-      return COMPLETE_MATCH;
-    }
-  else if (word == NULL)
-    {
-      /* cmd->name = "FOO BAR", words[] = {"FOO", "BAR", "BAZ"}. */
-      return MISMATCH;
-    }
-  else
-    {
-      /* cmd->name = "FOO BAR BAZ", words[] = {"FOO", "BAR"}. */
-      if (word[0] == '-' && dash_possible != NULL)
-        *dash_possible = 1;
-      return PARTIAL_MATCH;
-    }
-}
-
-/* Returns the number of commands for which the WORD_CNT words in
-   WORDS are a partial or complete match.  If some partial match
-   has a dash as the next word, then *DASH_POSSIBLE is set to 1,
-   otherwise it is set to 0. */
 static int
-count_matching_commands (char *const words[], size_t word_cnt,
-                         int *dash_possible)
+find_best_match (struct substring s, const struct command **matchp)
 {
   const struct command *cmd;
-  int cmd_match_count;
+  struct command_matcher cm;
+  int missing_words;
 
-  cmd_match_count = 0;
-  *dash_possible = 0;
-  for (cmd = commands; cmd < commands + command_cnt; cmd++)
-    if (cmd_match_words (cmd, words, word_cnt, dash_possible) != MISMATCH)
-      cmd_match_count++;
+  command_matcher_init (&cm, s);
+  for (cmd = commands; cmd < &commands[command_cnt]; cmd++)
+    command_matcher_add (&cm, ss_cstr (cmd->name), CONST_CAST (void *, cmd));
 
-  return cmd_match_count;
-}
-
-/* Returns the command for which the WORD_CNT words in WORDS are
-   a complete match.  Returns a null pointer if no such command
-   exists. */
-static const struct command *
-get_complete_match (char *const words[], size_t word_cnt)
-{
-  const struct command *cmd;
+  *matchp = command_matcher_get_match (&cm);
+  missing_words = command_matcher_get_missing_words (&cm);
 
-  for (cmd = commands; cmd < commands + command_cnt; cmd++)
-    if (cmd_match_words (cmd, words, word_cnt, NULL) == COMPLETE_MATCH)
-      return cmd;
+  command_matcher_destroy (&cm);
 
-  return NULL;
-}
-
-/* Returns the command with the given exact NAME.
-   Aborts if no such command exists. */
-static const struct command *
-find_command (const char *name)
-{
-  const struct command *cmd;
-
-  for (cmd = commands; cmd < commands + command_cnt; cmd++)
-    if (!strcmp (cmd->name, name))
-      return cmd;
-  NOT_REACHED ();
-}
-
-/* Frees the WORD_CNT words in WORDS. */
-static void
-free_words (char *words[], size_t word_cnt)
-{
-  size_t idx;
-
-  for (idx = 0; idx < word_cnt; idx++)
-    free (words[idx]);
-}
-
-/* Flags an error that the command whose name is given by the
-   WORD_CNT words in WORDS is unknown. */
-static void
-unknown_command_error (struct lexer *lexer, char *const words[], size_t word_cnt)
-{
-  if (word_cnt == 0)
-    lex_error (lexer, _("expecting command name"));
-  else
-    {
-      struct string s;
-      size_t i;
-
-      ds_init_empty (&s);
-      for (i = 0; i < word_cnt; i++)
-        {
-          if (i != 0)
-            ds_put_byte (&s, ' ');
-          ds_put_cstr (&s, words[i]);
-        }
-
-      msg (SE, _("Unknown command %s."), ds_cstr (&s));
-
-      ds_destroy (&s);
-    }
+  return missing_words;
 }
 
 /* Parse the command name and return a pointer to the corresponding
@@ -499,93 +265,74 @@ unknown_command_error (struct lexer *lexer, char *const words[], size_t word_cnt
 static const struct command *
 parse_command_name (struct lexer *lexer)
 {
-  char *words[16];
-  int word_cnt;
-  int complete_word_cnt;
-  int dash_possible;
-
-  if (lex_token (lexer) == T_EXP ||
-		  lex_token (lexer) == '*' || lex_token (lexer) == '[')
-    return find_command ("COMMENT");
-
-  dash_possible = 0;
-  word_cnt = complete_word_cnt = 0;
-  while (lex_token (lexer) == T_ID || (dash_possible && lex_token (lexer) == '-'))
-    {
-      int cmd_match_cnt;
+  const struct command *command;
+  int missing_words;
+  struct string s;
 
-      assert (word_cnt < sizeof words / sizeof *words);
-      if (lex_token (lexer) == T_ID)
-        {
-          words[word_cnt] = ds_xstrdup (lex_tokstr (lexer));
-          str_uppercase (words[word_cnt]);
-        }
-      else if (lex_token (lexer) == '-')
-        words[word_cnt] = xstrdup ("-");
-      word_cnt++;
+  if (lex_token (lexer) == T_EXP
+      || lex_token (lexer) == '*'
+      || lex_token (lexer) == '[')
+    {
+      static const struct command c = { S_ANY, 0, "COMMENT", cmd_comment };
+      return &c;
+    }
 
-      cmd_match_cnt = count_matching_commands (words, word_cnt,
-                                               &dash_possible);
-      if (cmd_match_cnt == 0)
-        break;
-      else if (cmd_match_cnt == 1)
+  command = NULL;
+  missing_words = 0;
+  ds_init_empty (&s);
+  for (;;)
+    {
+      if (lex_token (lexer) == '-')
+        ds_put_byte (&s, '-');
+      else if (lex_token (lexer) == T_ID)
         {
-          const struct command *command = get_complete_match (words, word_cnt);
-          if (command != NULL)
-            {
-              if (!(command->flags & F_KEEP_FINAL_TOKEN))
-                lex_get (lexer);
-              free_words (words, word_cnt);
-              return command;
-            }
+          if (!ds_is_empty (&s) && ds_last (&s) != '-')
+            ds_put_byte (&s, ' ');
+          ds_put_cstr (&s, lex_tokid (lexer));
         }
-      else /* cmd_match_cnt > 1 */
+      else if (lex_is_integer (lexer) && lex_integer (lexer) >= 0)
         {
-          /* Do we have a complete command name so far? */
-          if (get_complete_match (words, word_cnt) != NULL)
-            complete_word_cnt = word_cnt;
+          if (!ds_is_empty (&s) && ds_last (&s) != '-')
+            ds_put_byte (&s, ' ');
+          ds_put_format (&s, "%ld", lex_integer (lexer));
         }
+      else
+        break;
+
+      missing_words = find_best_match (ds_ss (&s), &command);
+      if (missing_words <= 0)
+        break;
+
       lex_get (lexer);
     }
 
-  /* If we saw a complete command name earlier, drop back to
-     it. */
-  if (complete_word_cnt)
+  if (command == NULL && missing_words > 0)
     {
-      int pushback_word_cnt;
-      const struct command *command;
-
-      /* Get the command. */
-      command = get_complete_match (words, complete_word_cnt);
-      assert (command != NULL);
-
-      /* Figure out how many words we want to keep.
-         We normally want to swallow the entire command. */
-      pushback_word_cnt = complete_word_cnt + 1;
-      if (command->flags & F_KEEP_FINAL_TOKEN)
-        pushback_word_cnt--;
-
-      /* FIXME: We only support one-token pushback. */
-      assert (pushback_word_cnt + 1 >= word_cnt);
-
-      while (word_cnt > pushback_word_cnt)
-        {
-          word_cnt--;
-          if (strcmp (words[word_cnt], "-"))
-            lex_put_back_id (lexer, words[word_cnt]);
-          else
-            lex_put_back (lexer, '-');
-          free (words[word_cnt]);
-        }
+      ds_put_cstr (&s, " .");
+      missing_words = find_best_match (ds_ss (&s), &command);
+      ds_truncate (&s, ds_length (&s) - 2);
+    }
 
-      free_words (words, word_cnt);
-      return command;
+  if (command == NULL)
+    {
+      if (ds_is_empty (&s))
+        lex_error (lexer, _("expecting command name"));
+      else
+        msg (SE, _("Unknown command `%s'."), ds_cstr (&s));
+    }
+  else if (missing_words == 0)
+    {
+      if (!(command->flags & F_KEEP_FINAL_TOKEN))
+        lex_get (lexer);
+    }
+  else if (missing_words < 0)
+    {
+      assert (missing_words == -1);
+      assert (!(command->flags & F_KEEP_FINAL_TOKEN));
     }
 
-  /* We didn't get a valid command name. */
-  unknown_command_error (lexer, words, word_cnt);
-  free_words (words, word_cnt);
-  return NULL;
+  ds_destroy (&s);
+  return command;
 }
 
 /* Returns true if COMMAND is allowed in STATE,
diff --git a/src/language/expressions/operations.def b/src/language/expressions/operations.def
index 0286acf9..5d1ee46e 100644
--- a/src/language/expressions/operations.def
+++ b/src/language/expressions/operations.def
@@ -590,7 +590,7 @@ function NUMBER (string s, ni_format f)
   data_in_imply_decimals (s, LEGACY_NATIVE, f->type, f->d, &out);
   else
     {
-      msg (SE, "Cannot parse \"%.*s\" as format %s: %s",
+      msg (SE, "Cannot parse `%.*s' as format %s: %s",
            (int) s.length, s.string, fmt_name (f->type), error);
       free (error);
     }
diff --git a/src/language/lexer/automake.mk b/src/language/lexer/automake.mk
index aff3f2a2..71f6b413 100644
--- a/src/language/lexer/automake.mk
+++ b/src/language/lexer/automake.mk
@@ -2,7 +2,10 @@
 
 
 language_lexer_sources = \
-	src/language/lexer/lexer.c  src/language/lexer/lexer.h \
+	src/language/lexer/command-name.c \
+	src/language/lexer/command-name.h \
+	src/language/lexer/lexer.c \
+	src/language/lexer/lexer.h \
 	src/language/lexer/subcommand-list.c  \
 	src/language/lexer/subcommand-list.h \
 	src/language/lexer/format-parser.c \
diff --git a/src/language/lexer/command-name.c b/src/language/lexer/command-name.c
new file mode 100644
index 00000000..8ef64d9f
--- /dev/null
+++ b/src/language/lexer/command-name.c
@@ -0,0 +1,235 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "language/lexer/command-name.h"
+
+#include <assert.h>
+#include <limits.h>
+
+#include "data/identifier.h"
+
+#include "gl/c-ctype.h"
+
+/* Stores the first word in S into WORD and advances S past that word.  Returns
+   true if successful, false if no word remained in S (WORD is then empty).
+
+   A word is a sequence of digits, a letter possibly followed by a sequence of
+   letters or digits, or one character of another type.  Words may be delimited
+   by spaces.  The digit scan is bounded by S's length, not by a null byte. */
+static bool
+find_word (struct substring *s, struct substring *word)
+{
+  size_t ofs;
+  ucs4_t c;
+
+  /* Skip whitespace. */
+  for (;;)
+    {
+      c = ss_first_mb (*s);
+      if (c == UINT32_MAX)
+        {
+          *word = ss_empty ();
+          return false;
+        }
+      else if (lex_uc_is_space (c))
+        ss_get_mb (s);
+      else
+        break;
+    }
+
+  ofs = ss_first_mblen (*s);
+  if (lex_uc_is_id1 (c))
+    {
+      while (lex_uc_is_idn (ss_at_mb (*s, ofs)))
+        ofs += ss_at_mblen (*s, ofs);
+    }
+  else if (c_isdigit (c))
+    {
+      while (ofs < s->length && c_isdigit (s->string[ofs]))
+        ofs++;
+    }
+  ss_get_bytes (s, ofs, word);
+  return true;
+}
+
+/* Returns the number of words in S, as extracted by find_word(). */
+static int
+count_words (struct substring s)
+{
+  struct substring word;
+  int n;
+
+  n = 0;
+  while (find_word (&s, &word))
+    n++;
+  return n;
+}
+
+/* Compares STRING obtained from the user against the full name of a COMMAND,
+   using this algorithm:
+
+   1. Divide COMMAND into words C[0] through C[n - 1].
+
+   2. Divide STRING into words S[0] through S[m - 1].
+
+   3. Compare word C[i] against S[i] for 0 <= i < min(n, m), using the keyword
+      matching algorithm implemented by lex_id_match().  If any of them fail to
+      match, then STRING does not match COMMAND and the function returns false,
+      leaving *MISSING_WORDS unset.
+
+   4. Otherwise, STRING and COMMAND match.  Set *MISSING_WORDS to n - m.  Set
+      *EXACT to false if any S[i] was abbreviated in step 3, or to true if
+      they were all exactly equal (modulo case).  Return true. */
+bool
+command_match (struct substring command, struct substring string,
+               bool *exact, int *missing_words)
+{
+  *exact = true;
+  for (;;)
+    {
+      struct substring cw, sw;
+      int match;
+
+      if (!find_word (&command, &cw))
+        {
+          *missing_words = -count_words (string);
+          return true;
+        }
+      else if (!find_word (&string, &sw))
+        {
+          *missing_words = 1 + count_words (command);
+          return true;
+        }
+
+      match = lex_id_match (cw, sw);
+      if (sw.length < cw.length)
+        *exact = false;
+      if (match == 0)
+        return false;
+    }
+}
+
+/* Initializes CM for matching STRING against a table of command names.
+
+   STRING may be ASCII or UTF-8.
+
+   For sample use, see command.c.  Here's a usage outline:
+
+      // Try each possible command.
+      command_matcher_init (&cm, string);
+      for (cmd = commands; cmd < &commands[command_cnt]; cmd++)
+        command_matcher_add (&cm, ss_cstr (cmd->name), cmd);
+
+      // Get the result.
+      match = command_matcher_get_match (&cm);
+      missing_words = command_matcher_get_missing_words (&cm);
+
+      if (missing_words > 0)
+        {
+          // Incomplete command name.  Add another word to the string
+          // and start over.  Or if there are no more words to be added,
+          // add " ." to the string as a sentinel and start over.
+        }
+      else if (match == NULL)
+        {
+          // No valid command with this name.
+        }
+      else if (missing_words == 0)
+        {
+          // The full, correct command name is 'match'.
+        }
+      else if (missing_words < 0)
+        {
+          // The abs(missing_words) last words of 'string' are actually
+          // part of the command's body, not part of its name; they
+          // were only needed to resolve ambiguities.  'match' is the
+          // correct command but those extra words should be put back
+          // for later re-parsing.
+        }
+*/
+void
+command_matcher_init (struct command_matcher *cm, struct substring string)
+{
+  cm->string = string;
+  cm->extensible = false;
+  cm->exact_match = NULL;
+  cm->n_matches = 0;
+  cm->match = NULL;
+  cm->match_missing_words = 0;
+}
+
+/* Destroys CM's state. */
+void
+command_matcher_destroy (struct command_matcher *cm UNUSED)
+{
+  /* Nothing to do. */
+}
+
+/* Considers COMMAND (ASCII) as a candidate for the command name parsed by CM.
+   If COMMAND is the correct command name, command_matcher_get_match() will
+   return AUX later; AUX must be nonnull.  A candidate that needs more words
+   marks CM extensible; a full exact match is kept separately; of the rest,
+   only those leaving the fewest words of CM's string unused are counted. */
+void
+command_matcher_add (struct command_matcher *cm, struct substring command,
+                     void *aux)
+{
+  int missing_words;
+  bool exact;
+
+  assert (aux != NULL);
+  if (command_match (command, cm->string, &exact, &missing_words))
+    {
+      if (missing_words > 0)
+        cm->extensible = true;
+      else if (exact && missing_words == 0)
+        cm->exact_match = aux;
+      else
+        {
+          if (missing_words > cm->match_missing_words)
+            cm->n_matches = 0;
+
+          if (missing_words >= cm->match_missing_words || cm->n_matches == 0)
+            {
+              cm->n_matches++;
+              cm->match = aux;
+              cm->match_missing_words = missing_words;
+            }
+        }
+    }
+}
+
+/* Returns the command name matched by CM. */
+void *
+command_matcher_get_match (const struct command_matcher *cm)
+{
+  return (cm->extensible ? NULL
+          : cm->exact_match != NULL ? cm->exact_match
+          : cm->n_matches == 1 ? cm->match
+          : NULL);
+}
+
+/* Returns the difference between the number of words in the matched command
+   name and the string provided to command_matcher_init(). */
+int
+command_matcher_get_missing_words (const struct command_matcher *cm)
+{
+  return (cm->extensible ? 1
+          : cm->exact_match != NULL ? 0
+          : cm->match_missing_words);
+}
diff --git a/src/language/lexer/command-name.h b/src/language/lexer/command-name.h
new file mode 100644
index 00000000..51751aa7
--- /dev/null
+++ b/src/language/lexer/command-name.h
@@ -0,0 +1,46 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef COMMAND_NAME_H
+#define COMMAND_NAME_H 1
+
+#include <stdbool.h>
+#include "libpspp/str.h"
+
+bool command_match (struct substring command, struct substring string,
+                    bool *exact, int *missing_words);
+
+/* Allows matching a string against a table of command names. */
+struct command_matcher
+  {
+    struct substring string;   /* String being matched. */
+    bool extensible;           /* Some matching command needs more words. */
+    void *exact_match;         /* AUX of full exact match, or NULL. */
+    int n_matches;             /* Number of best other matches. */
+    void *match;               /* AUX of latest best other match. */
+    int match_missing_words;   /* Missing words for MATCH (<= 0). */
+  };
+
+void command_matcher_init (struct command_matcher *, struct substring string);
+void command_matcher_destroy (struct command_matcher *);
+
+void command_matcher_add (struct command_matcher *, struct substring command,
+                          void *aux);
+
+void *command_matcher_get_match (const struct command_matcher *);
+int command_matcher_get_missing_words (const struct command_matcher *);
+
+#endif /* command-name.h */
diff --git a/tests/automake.mk b/tests/automake.mk
index 8be58e18..e17efd15 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -3,6 +3,7 @@
 check_PROGRAMS += \
 	tests/data/datasheet-test \
 	tests/data/inexactify \
+	tests/language/lexer/command-name-test \
 	tests/libpspp/abt-test \
 	tests/libpspp/bt-test \
 	tests/libpspp/heap-test \
@@ -181,6 +182,17 @@ tests_dissect_sysfile_SOURCES = \
 tests_dissect_sysfile_LDADD = gl/libgl.la $(LIBINTL) 
 tests_dissect_sysfile_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=\"$(bindir)\"
 
+check_PROGRAMS += tests/language/lexer/command-name-test
+tests_language_lexer_command_name_test_SOURCES = \
+	src/data/identifier.c \
+	src/language/lexer/command-name.c \
+	tests/language/lexer/command-name-test.c
+tests_language_lexer_command_name_test_LDADD = \
+	src/libpspp/libpspp.la \
+	gl/libgl.la \
+	$(LIBINTL) 
+tests_language_lexer_command_name_test_CFLAGS = $(AM_CFLAGS)
+
 check_PROGRAMS += tests/output/render-test
 tests_output_render_test_SOURCES = tests/output/render-test.c
 tests_output_render_test_LDADD = \
@@ -257,6 +269,7 @@ TESTSUITE_AT = \
 	tests/language/dictionary/weight.at \
 	tests/language/expressions/evaluate.at \
 	tests/language/expressions/parse.at \
+	tests/language/lexer/command-name.at \
 	tests/language/lexer/lexer.at \
 	tests/language/lexer/q2c.at \
 	tests/language/lexer/variable-parser.at \
@@ -328,7 +341,7 @@ EXTRA_DIST += tests/testsuite.at
 
 CHECK_LOCAL += tests_check
 tests_check: tests/atconfig tests/atlocal $(TESTSUITE) $(check_PROGRAMS)
-	$(SHELL) '$(TESTSUITE)' -C tests AUTOTEST_PATH=tests/data:tests/libpspp:tests/output:src/ui/terminal $(TESTSUITEFLAGS)
+	$(SHELL) '$(TESTSUITE)' -C tests AUTOTEST_PATH=tests/data:tests/language/lexer:tests/libpspp:tests/output:src/ui/terminal $(TESTSUITEFLAGS)
 
 CLEAN_LOCAL += tests_clean
 tests_clean:
@@ -337,7 +350,7 @@ tests_clean:
 AUTOM4TE = $(SHELL) $(srcdir)/build-aux/missing --run autom4te
 AUTOTEST = $(AUTOM4TE) --language=autotest
 $(TESTSUITE): package.m4 $(srcdir)/tests/testsuite.at $(TESTSUITE_AT) 
-	$(AUTOTEST) -I '$(srcdir)' -o $@.tmp $@.at
+	$(AUTOTEST) -I '$(srcdir)' $@.at | sed 's/@<00A0>@/Â /g' > $@.tmp
 	mv $@.tmp $@
 
 # The `:;' works around a Bash 3.2 bug when the output is not writeable.
diff --git a/tests/language/command.at b/tests/language/command.at
index 1bd6ccf2..aa4eff53 100644
--- a/tests/language/command.at
+++ b/tests/language/command.at
@@ -7,7 +7,7 @@ DATA rubbish.
 EXECUTE.
 ])
 AT_CHECK([pspp -O format=csv command.sps], [1], [dnl
-command.sps:1: error: Unknown command DATA RUBBISH.
+command.sps:1: error: Unknown command `DATA rubbish'.
 
 command.sps:2: error: EXECUTE: EXECUTE is allowed only after the active file has been defined.
 ])
diff --git a/tests/language/data-io/inpt-pgm.at b/tests/language/data-io/inpt-pgm.at
index 7f2e1435..f0ce9395 100644
--- a/tests/language/data-io/inpt-pgm.at
+++ b/tests/language/data-io/inpt-pgm.at
@@ -14,9 +14,9 @@ END INPUT PROGRAM.
 AT_CHECK([pspp -O format=csv input-program.sps], [1], [dnl
 input-program.sps:3: error: BEGIN DATA: BEGIN DATA is not allowed inside INPUT PROGRAM.
 
-input-program.sps:4: error: Syntax error at `123456789': expecting command name.
+input-program.sps:4: error: Unknown command `123456789'.
 
-input-program.sps:5: error: Unknown command END DATA.
+input-program.sps:5: error: Unknown command `END DATA'.
 ])
 AT_CLEANUP
 
diff --git a/tests/language/lexer/command-name-test.c b/tests/language/lexer/command-name-test.c
new file mode 100644
index 00000000..d63b8a20
--- /dev/null
+++ b/tests/language/lexer/command-name-test.c
@@ -0,0 +1,149 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/compiler.h"
+#include "language/lexer/command-name.h"
+
+#include "gl/error.h"
+#include "gl/progname.h"
+
+static char **commands, **strings;
+static size_t n_commands, n_strings;
+
+static void parse_options (int argc, char **argv);
+static void usage (void) NO_RETURN;
+
+int
+main (int argc, char *argv[])
+{
+  size_t i;
+
+  set_program_name (argv[0]);
+  parse_options (argc, argv);
+
+  for (i = 0; i < n_strings; i++)
+    {
+      const char *string = strings[i];
+      struct command_matcher cm;
+      const char *best;
+      size_t j;
+
+      if (i > 0)
+        putchar ('\n');
+      printf ("string=\"%s\":\n", string);
+      for (j = 0; j < n_commands; j++)
+        {
+          const char *command = commands[j];
+          int missing_words;
+          bool match, exact;
+
+          match = command_match (ss_cstr (command), ss_cstr (string),
+                                 &exact, &missing_words);
+          printf ("\tcommand=\"%s\" match=%s",
+                  command, match ? "yes" : "no");
+          if (match)
+            printf (" exact=%s missing_words=%d",
+                    exact ? "yes" : "no", missing_words);
+          putchar ('\n');
+        }
+
+      command_matcher_init (&cm, ss_cstr (string));
+      for (j = 0; j < n_commands; j++)
+        command_matcher_add (&cm, ss_cstr (commands[j]), commands[j]);
+      best = command_matcher_get_match (&cm);
+      printf ("match: %s, missing_words=%d\n",
+              best ? best : "none", command_matcher_get_missing_words (&cm));
+      command_matcher_destroy (&cm);
+    }
+
+  return 0;
+}
+
+static void
+parse_options (int argc, char **argv)
+{
+  int breakpoint;
+
+  for (;;)
+    {
+      static const struct option options[] =
+        {
+          {"help", no_argument, NULL, 'h'},
+          {NULL, 0, NULL, 0},
+        };
+
+      int c = getopt_long (argc, argv, "h", options, NULL);
+      if (c == -1)
+        break;
+
+      switch (c)
+        {
+        case 'h':
+          usage ();
+
+        case 0:
+          break;
+
+        case '?':
+          exit (EXIT_FAILURE);
+          break;
+
+        default:
+          NOT_REACHED ();
+        }
+
+    }
+
+  for (breakpoint = optind; ; breakpoint++)
+    if (breakpoint >= argc)
+      error (1, 0, "missing ',' on command line; use --help for help");
+    else if (!strcmp (argv[breakpoint], ","))
+      break;
+
+  commands = &argv[optind];
+  n_commands = breakpoint - optind;
+
+  strings = &argv[breakpoint + 1];
+  n_strings = argc - (breakpoint + 1);
+
+  if (n_commands == 0 || n_strings == 0)
+    error (1, 0, "must specify at least one command and one string; "
+           "use --help for help");
+}
+
+static void
+usage (void)
+{
+  printf ("\
+%s, to match PSPP command names\n\
+usage: %s [OPTIONS] COMMAND... , STRING...\n\
+\n\
+Options:\n\
+  -h, --help          print this help message\n",
+          program_name, program_name);
+  exit (EXIT_SUCCESS);
+}
diff --git a/tests/language/lexer/command-name.at b/tests/language/lexer/command-name.at
new file mode 100644
index 00000000..e0ecd59a
--- /dev/null
+++ b/tests/language/lexer/command-name.at
@@ -0,0 +1,234 @@
+AT_BANNER([command name matching])
+
+AT_SETUP([single words])
+AT_KEYWORDS([command name matching])
+AT_CHECK([command-name-test DESCRIPTIVES , DESCRIPTIVESX DESCRIPTIVES descr Des DEX DE '' 'DESCRIPTIVES MORE' 'DESCRIPTIVES@<00A0>@MORE'],
+  [0], [dnl
+string="DESCRIPTIVESX":
+	command="DESCRIPTIVES" match=no
+match: none, missing_words=0
+
+string="DESCRIPTIVES":
+	command="DESCRIPTIVES" match=yes exact=yes missing_words=0
+match: DESCRIPTIVES, missing_words=0
+
+string="descr":
+	command="DESCRIPTIVES" match=yes exact=no missing_words=0
+match: DESCRIPTIVES, missing_words=0
+
+string="Des":
+	command="DESCRIPTIVES" match=yes exact=no missing_words=0
+match: DESCRIPTIVES, missing_words=0
+
+string="DEX":
+	command="DESCRIPTIVES" match=no
+match: none, missing_words=0
+
+string="DE":
+	command="DESCRIPTIVES" match=no
+match: none, missing_words=0
+
+string="":
+	command="DESCRIPTIVES" match=yes exact=yes missing_words=1
+match: none, missing_words=1
+
+string="DESCRIPTIVES MORE":
+	command="DESCRIPTIVES" match=yes exact=yes missing_words=-1
+match: DESCRIPTIVES, missing_words=-1
+
+string="DESCRIPTIVES@<00A0>@MORE":
+	command="DESCRIPTIVES" match=yes exact=yes missing_words=-1
+match: DESCRIPTIVES, missing_words=-1
+])
+AT_CLEANUP
+
+AT_SETUP([two words without prefix match])
+AT_KEYWORDS([command name matching])
+AT_CHECK([command-name-test 'DO IF' 'DO REPEAT' , 'DO@<00A0>@IF' 'DO REPEAT' 'DO REP' 'DO OTHER' 'D IF' 'DO I' DO],
+  [0], [dnl
+string="DO@<00A0>@IF":
+	command="DO IF" match=yes exact=yes missing_words=0
+	command="DO REPEAT" match=no
+match: DO IF, missing_words=0
+
+string="DO REPEAT":
+	command="DO IF" match=no
+	command="DO REPEAT" match=yes exact=yes missing_words=0
+match: DO REPEAT, missing_words=0
+
+string="DO REP":
+	command="DO IF" match=no
+	command="DO REPEAT" match=yes exact=no missing_words=0
+match: DO REPEAT, missing_words=0
+
+string="DO OTHER":
+	command="DO IF" match=no
+	command="DO REPEAT" match=no
+match: none, missing_words=0
+
+string="D IF":
+	command="DO IF" match=no
+	command="DO REPEAT" match=no
+match: none, missing_words=0
+
+string="DO I":
+	command="DO IF" match=no
+	command="DO REPEAT" match=no
+match: none, missing_words=0
+
+string="DO":
+	command="DO IF" match=yes exact=yes missing_words=1
+	command="DO REPEAT" match=yes exact=yes missing_words=1
+match: none, missing_words=1
+])
+AT_CLEANUP
+
+AT_SETUP([two words with prefix match])
+AT_KEYWORDS([command name matching])
+AT_CHECK([command-name-test GET 'GET DATA' , GET 'GET TYPE' 'GET DAT' 'GET DATA'],
+  [0], [dnl
+string="GET":
+	command="GET" match=yes exact=yes missing_words=0
+	command="GET DATA" match=yes exact=yes missing_words=1
+match: none, missing_words=1
+
+string="GET TYPE":
+	command="GET" match=yes exact=yes missing_words=-1
+	command="GET DATA" match=no
+match: GET, missing_words=-1
+
+string="GET DAT":
+	command="GET" match=yes exact=yes missing_words=-1
+	command="GET DATA" match=yes exact=no missing_words=0
+match: GET DATA, missing_words=0
+
+string="GET DATA":
+	command="GET" match=yes exact=yes missing_words=-1
+	command="GET DATA" match=yes exact=yes missing_words=0
+match: GET DATA, missing_words=0
+])
+AT_CLEANUP
+
+AT_SETUP([ambiguous single-word names])
+AT_KEYWORDS([command name matching])
+AT_CHECK([command-name-test CASEPLOT CASESTOVARS , CAS Case CaseP CaseS], [0],
+  [dnl
+string="CAS":
+	command="CASEPLOT" match=yes exact=no missing_words=0
+	command="CASESTOVARS" match=yes exact=no missing_words=0
+match: none, missing_words=0
+
+string="Case":
+	command="CASEPLOT" match=yes exact=no missing_words=0
+	command="CASESTOVARS" match=yes exact=no missing_words=0
+match: none, missing_words=0
+
+string="CaseP":
+	command="CASEPLOT" match=yes exact=no missing_words=0
+	command="CASESTOVARS" match=no
+match: CASEPLOT, missing_words=0
+
+string="CaseS":
+	command="CASEPLOT" match=no
+	command="CASESTOVARS" match=yes exact=no missing_words=0
+match: CASESTOVARS, missing_words=0
+])
+AT_CLEANUP
+
+AT_SETUP([ambiguous two-word names])
+AT_KEYWORDS([command name matching])
+AT_CHECK([command-name-test VARCOMP VARSTOCASES 'VARIABLE ATTRIBUTE' , VAR VARC VARS VARI 'VAR@<00A0>@ATT'],
+  [0], [dnl
+string="VAR":
+	command="VARCOMP" match=yes exact=no missing_words=0
+	command="VARSTOCASES" match=yes exact=no missing_words=0
+	command="VARIABLE ATTRIBUTE" match=yes exact=no missing_words=1
+match: none, missing_words=1
+
+string="VARC":
+	command="VARCOMP" match=yes exact=no missing_words=0
+	command="VARSTOCASES" match=no
+	command="VARIABLE ATTRIBUTE" match=no
+match: VARCOMP, missing_words=0
+
+string="VARS":
+	command="VARCOMP" match=no
+	command="VARSTOCASES" match=yes exact=no missing_words=0
+	command="VARIABLE ATTRIBUTE" match=no
+match: VARSTOCASES, missing_words=0
+
+string="VARI":
+	command="VARCOMP" match=no
+	command="VARSTOCASES" match=no
+	command="VARIABLE ATTRIBUTE" match=yes exact=no missing_words=1
+match: none, missing_words=1
+
+string="VAR@<00A0>@ATT":
+	command="VARCOMP" match=yes exact=no missing_words=-1
+	command="VARSTOCASES" match=yes exact=no missing_words=-1
+	command="VARIABLE ATTRIBUTE" match=yes exact=no missing_words=0
+match: VARIABLE ATTRIBUTE, missing_words=0
+])
+AT_CLEANUP
+
+AT_SETUP([numbers and punctuation])
+AT_KEYWORDS([command name matching])
+AT_CHECK([command-name-test T-TEST 2SLS LIST , T-TEST 'T - Test' 2SLS '2 SLS' List],
+  [0], [dnl
+string="T-TEST":
+	command="T-TEST" match=yes exact=yes missing_words=0
+	command="2SLS" match=no
+	command="LIST" match=no
+match: T-TEST, missing_words=0
+
+string="T - Test":
+	command="T-TEST" match=yes exact=yes missing_words=0
+	command="2SLS" match=no
+	command="LIST" match=no
+match: T-TEST, missing_words=0
+
+string="2SLS":
+	command="T-TEST" match=no
+	command="2SLS" match=yes exact=yes missing_words=0
+	command="LIST" match=no
+match: 2SLS, missing_words=0
+
+string="2 SLS":
+	command="T-TEST" match=no
+	command="2SLS" match=yes exact=yes missing_words=0
+	command="LIST" match=no
+match: 2SLS, missing_words=0
+
+string="List":
+	command="T-TEST" match=no
+	command="2SLS" match=no
+	command="LIST" match=yes exact=yes missing_words=0
+match: LIST, missing_words=0
+])
+AT_CLEANUP
+
+AT_SETUP([off by more than one word])
+AT_KEYWORDS([command name matching])
+AT_CHECK([command-name-test 'a@<00A0>@b c' , a 'a b' 'a b c' 'a@<00A0>@b c d' 'a b c@<00A0>@d e'],
+  [0], [dnl
+string="a":
+	command="a@<00A0>@b c" match=yes exact=yes missing_words=2
+match: none, missing_words=1
+
+string="a b":
+	command="a@<00A0>@b c" match=yes exact=yes missing_words=1
+match: none, missing_words=1
+
+string="a b c":
+	command="a@<00A0>@b c" match=yes exact=yes missing_words=0
+match: a@<00A0>@b c, missing_words=0
+
+string="a@<00A0>@b c d":
+	command="a@<00A0>@b c" match=yes exact=yes missing_words=-1
+match: a@<00A0>@b c, missing_words=-1
+
+string="a b c@<00A0>@d e":
+	command="a@<00A0>@b c" match=yes exact=yes missing_words=-2
+match: a@<00A0>@b c, missing_words=-2
+])
+AT_CLEANUP
-- 
2.30.2