Segmentation code is coming along.

author Ben Pfaff <blp@cs.stanford.edu>

Mon, 8 Jul 2024 00:48:38 +0000 (17:48 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Mon, 8 Jul 2024 00:48:38 +0000 (17:48 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 8 Jul 2024 00:48:38 +0000 (17:48 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 8 Jul 2024 00:48:38 +0000 (17:48 -0700)
diff --git a/Makefile.am b/Makefile.am

index fcc4fe4856db381e4ad9f39406cc807f090f3638..b82bb6bb6f5c417e2af2eb7f10d4fb73e52b4593 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -131,7 +131,7 @@ dist-hook-git: distfiles
              (git --version) >/dev/null 2>&1; then                           \
           (cd $(srcdir) && git ls-files)                                    \
             | grep -vE '\.gitignore|README.Git|Smake|Bug-administration'    \
-           | grep -vE '\.gitattributes'                                    \
+           | grep -vE '\.gitattributes|rust'                                    \
             | LC_ALL=C sort -u > gitfiles;                                  \
           LC_ALL=C comm -1 -3 distfiles gitfiles > missing-distfiles;       \
           if test -s missing-distfiles; then                                \
diff --git a/rust/Cargo.lock b/rust/Cargo.lock

index 8b89b6fb95bc7e1b659885472c3aae11cd20da64..4569faca60f24c14460a69d721866b8827f3cecb 100644 (file)
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -52,6 +52,12 @@ version = "1.3.2"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
  
+[[package]]
+name = "bitflags"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
+
  [[package]]
  name = "bumpalo"
  version = "3.13.0"
@@ -91,7 +97,7 @@ version = "4.1.7"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "2f3061d6db6d8fcbbd4b05e057f2acace52e64e96b498c08c2d7a4e65addd340"
  dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
   "clap_derive",
   "clap_lex",
   "is-terminal",
@@ -493,6 +499,7 @@ name = "pspp"
  version = "1.0.0"
  dependencies = [
   "anyhow",
+ "bitflags 2.5.0",
   "chrono",
   "clap",
   "encoding_rs",
@@ -509,6 +516,7 @@ dependencies = [
   "ordered-float",
   "thiserror",
   "unicase",
+ "utf8-decode",
   "windows-sys 0.48.0",
  ]
  
@@ -527,7 +535,7 @@ version = "0.36.8"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
  dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
   "errno 0.2.8",
   "io-lifetimes",
   "libc",
@@ -541,7 +549,7 @@ version = "0.37.3"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "62b24138615de35e32031d041a09032ef3487a616d901ca4db224e7d557efae2"
  dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
   "errno 0.3.1",
   "io-lifetimes",
   "libc",
@@ -651,6 +659,12 @@ version = "1.0.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
  
+[[package]]
+name = "utf8-decode"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
+
  [[package]]
  name = "version_check"
  version = "0.9.4"
diff --git a/rust/Cargo.toml b/rust/Cargo.toml

index 5131409ac01d631ca5144640c128c159585fddb8..371ac6dff22f7835d6da8644b8dd4b95d261b02b 100644 (file)
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -22,6 +22,8 @@ finl_unicode = "1.2.0"
  unicase = "2.6.0"
  libc = "0.2.147"
  indexmap = "2.1.0"
+utf8-decode = "1.0.1"
+bitflags = "2.5.0"
  
  [target.'cfg(windows)'.dependencies]
  windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs

index 1108a46a7e1f5463a32b1277672b9bf05351de89..8e37e64ecb3ffac0889621121228d9f3e08d10f0 100644 (file)
--- a/rust/src/identifier.rs
+++ b/rust/src/identifier.rs
@@ -12,29 +12,57 @@ use thiserror::Error as ThisError;
  use unicase::UniCase;
  
  pub trait IdentifierChar {
+    /// Returns true if `self` is an ASCII character that may be the first
+    /// character in an identifier.
+    fn ascii_may_start_id(self) -> bool;
+
      /// Returns true if `self` may be the first character in an identifier.
      fn may_start_id(self) -> bool;
  
+    /// Returns true if `self` is an ASCII character that may be a second or
+    /// subsequent character in an identifier.
+    fn ascii_may_continue_id(self) -> bool;
+
      /// Returns true if `self` may be a second or subsequent character in an
      /// identifier.
      fn may_continue_id(self) -> bool;
  }
  
  impl IdentifierChar for char {
+    fn ascii_may_start_id(self) -> bool {
+        matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$')
+    }
+
      fn may_start_id(self) -> bool {
-        use MajorCategory::*;
+        if self < '\u{0080}' {
+            self.ascii_may_start_id()
+        } else {
+            use MajorCategory::*;
  
-        ([L, M, S].contains(&self.get_major_category()) || "@#$".contains(self))
-            && self != char::REPLACEMENT_CHARACTER
+            ([L, M, S].contains(&self.get_major_category()) || "@#$".contains(self))
+                && self != char::REPLACEMENT_CHARACTER
+        }
+    }
+
+    fn ascii_may_continue_id(self) -> bool {
+        self.ascii_may_start_id() || matches!(self, '0'..='9' | '.' | '_')
      }
  
      fn may_continue_id(self) -> bool {
-        use MajorCategory::*;
+        if self < '\u{0080}' {
+            self.ascii_may_continue_id()
+        } else {
+            use MajorCategory::*;
  
-        ([L, M, S, N].contains(&self.get_major_category()) || "@#$._".contains(self))
-            && self != char::REPLACEMENT_CHARACTER
+            ([L, M, S, N].contains(&self.get_major_category()) || "@#$._".contains(self))
+                && self != char::REPLACEMENT_CHARACTER
+        }
      }
  }
+// Ad-hoc debugging probe: prints whatever `get_major_category()` reports for
+// `<` so that the value can be inspected in the test output.  It makes no
+// assertions of its own.
+#[test]
+fn gc() {
+    println!("{:?}", '<'.get_major_category());
+}
  
  #[derive(Clone, Debug, ThisError)]
  pub enum Error {
@@ -66,7 +94,7 @@ pub enum Error {
      },
  }
  
-fn is_reserved_word(s: &str) -> bool {
+pub fn is_reserved_word(s: &str) -> bool {
      for word in [
          "and", "or", "not", "eq", "ge", "gt", "le", "ne", "all", "by", "to", "with",
      ] {
@@ -151,6 +179,47 @@ impl Identifier {
      }
  }
  
+
+
+/// Returns true if `token` is a case-insensitive match for `keyword`.
+///
+/// `token` matches if it is identical to `keyword` apart from ASCII case, or
+/// if it is at least 3 bytes long, shorter than `keyword`, and identical apart
+/// from ASCII case to the prefix of `keyword` that has the same length.
+///
+/// `keyword` must be ASCII.  It's normally a constant string, so it's declared
+/// as `&'static str` to make it harder to reverse the argument order. But
+/// there's no reason that a non-static string won't work, so use
+/// [`id_match_n_nonstatic`] instead if you need it.
+pub fn id_match(keyword: &'static str, token: &str) -> bool {
+    // 3 is the minimum abbreviation length accepted for keywords.
+    id_match_n(keyword, token, 3)
+}
+
+/// Returns true if `token` is a case-insensitive match for at least the first
+/// `n` characters of `keyword`.
+///
+/// That is, `token` must either equal `keyword` apart from ASCII case, or be
+/// an abbreviation of it that is at least `n` bytes long.
+///
+/// `keyword` must be ASCII.  It's normally a constant string, so it's declared
+/// as `&'static str` to make it harder to reverse the argument order. But
+/// there's no reason that a non-static string won't work, so use
+/// [`id_match_n_nonstatic`] instead if you need it.
+pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool {
+    // The `'static` bound exists only to catch swapped arguments; the work is
+    // done by the unrestricted variant.
+    id_match_n_nonstatic(keyword, token, n)
+}
+
+/// Tests whether `token` is `keyword`, or an abbreviation of `keyword` that is
+/// at least `n` bytes long, ignoring ASCII case.
+///
+/// A `token` shorter than `n` bytes, or at least as long as `keyword`, only
+/// matches if it equals the whole of `keyword` (apart from ASCII case).
+///
+/// `keyword` must be ASCII; this is checked in debug builds only.
+pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool {
+    debug_assert!(keyword.is_ascii());
+    let token_len = token.len();
+    // An abbreviation is long enough to be unambiguous (`>= n`) but strictly
+    // shorter than the keyword it stands for.
+    let is_abbreviation = token_len >= n && token_len < keyword.len();
+    let expected = match is_abbreviation {
+        // Slicing by byte count is safe because `keyword` is ASCII.
+        true => &keyword[..token_len],
+        false => keyword,
+    };
+    expected.eq_ignore_ascii_case(token)
+}
+
  impl Display for Identifier {
      fn fmt(&self, f: &mut Formatter) -> FmtResult {
          write!(f, "{}", self.0)
diff --git a/rust/src/lex/command_name.rs b/rust/src/lex/command_name.rs

new file mode 100644 (file)

index 0000000..208bd45
--- /dev/null
+++ b/rust/src/lex/command_name.rs
@@ -0,0 +1,298 @@
+use crate::identifier::id_match_n_nonstatic;
+
+/// The outcome of a successful [`command_match`].
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub struct Match {
+    /// True if no word of the user's string was shorter than the command word
+    /// it was compared against, that is, nothing was abbreviated.
+    pub exact: bool,
+
+    /// How many words the user's string falls short of the command name:
+    /// positive if the string has fewer words than the command, zero or
+    /// negative otherwise.
+    pub missing_words: isize,
+}
+
+/// Counts the words that `words` has not yet yielded.
+fn count_words<'a>(words: impl Iterator<Item = &'a str>) -> isize {
+    words.count() as isize
+}
+
+/// Compares `string` obtained from the user against the full name of a `command`,
+/// using this algorithm:
+///
+///   1. Divide `command` into words `c[0]` through `c[n - 1]`.
+///
+///   2. Divide `string` into words `s[0]` through `s[m - 1]`.
+///
+///   3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the
+///      keyword matching algorithm implemented by [`id_match_n_nonstatic`] with
+///      a minimum abbreviation length of 3.  If any of them fail to match, then
+///      `string` does not match `command` and the function returns `None`.
+///
+///   4. Otherwise, `string` and `command` match and the function returns
+///      `Some(Match { exact, missing_words })`, where `missing_words` is
+///      `n - m` and `exact` is false if any of the `s[i]` was shorter than the
+///      `c[i]` it was compared against in step 3, or true otherwise.
+pub fn command_match(command: &str, string: &str) -> Option<Match> {
+    let mut command_words = command.split_whitespace();
+    let mut string_words = string.split_whitespace();
+    let mut exact = true;
+    loop {
+        let Some(cw) = command_words.next() else {
+            // Every command word was matched.  Only the words of `string`
+            // that have not been consumed yet count as surplus, so this is
+            // 0 for an exact word count and negative for extra words.
+            return Some(Match {
+                exact,
+                missing_words: -count_words(string_words),
+            });
+        };
+        let Some(sw) = string_words.next() else {
+            // `string` ran out first: `cw` is missing, and so is every
+            // command word that has not been consumed yet.
+            return Some(Match {
+                exact,
+                missing_words: 1 + count_words(command_words),
+            });
+        };
+        if !id_match_n_nonstatic(cw, sw, 3) {
+            return None;
+        }
+        if sw.len() < cw.len() {
+            exact = false;
+        }
+    }
+}
+
+/// The full names of the commands known to the segmenter.
+///
+/// Entries must stay grouped in nondecreasing order of their first character,
+/// because lookups locate the candidates for a given initial with
+/// `partition_point`.  The list is not strictly lexicographic beyond that
+/// (for example, `DISPLAY MACROS` precedes `DISPLAY`).
+pub const COMMAND_NAMES: &[&str] = &[
+    "2SLS",
+    "ACF",
+    "ADD DOCUMENT",
+    "ADD FILES",
+    "ADD VALUE LABELS",
+    "AGGREGATE",
+    "ALSCAL",
+    "ANACOR",
+    "ANOVA",
+    "APPLY DICTIONARY",
+    "AUTORECODE",
+    "BEGIN DATA",
+    "BREAK",
+    "CACHE",
+    "CASEPLOT",
+    "CASESTOVARS",
+    "CATPCA",
+    "CATREG",
+    "CCF",
+    "CD",
+    "CLEAR TRANSFORMATIONS",
+    "CLOSE FILE HANDLE",
+    "CLUSTER",
+    "COMPUTE",
+    "CONJOINT",
+    "CORRELATIONS",
+    "CORRESPONDENCE",
+    "COUNT",
+    "COXREG",
+    "CREATE",
+    "CROSSTABS",
+    "CSDESCRIPTIVES",
+    "CSGLM",
+    "CSLOGISTIC",
+    "CSPLAN",
+    "CSSELECT",
+    "CSTABULATE",
+    "CTABLES",
+    "CURVEFIT",
+    "DATA LIST",
+    "DATAFILE ATTRIBUTE",
+    "DATASET ACTIVATE",
+    "DATASET CLOSE",
+    "DATASET COPY",
+    "DATASET DECLARE",
+    "DATASET DISPLAY",
+    "DATASET NAME",
+    "DATE",
+    "DEBUG EVALUATE",
+    "DEBUG EXPAND",
+    "DEBUG FLOAT FORMAT",
+    "DEBUG FORMAT GUESSER",
+    "DEBUG MATRIX READ",
+    "DEBUG MOMENTS",
+    "DEBUG PAPER SIZE",
+    "DEBUG POOL",
+    "DEBUG XFORM FAIL",
+    "DEFINE",
+    "DELETE VARIABLES",
+    "DESCRIPTIVES",
+    "DETECTANOMALY",
+    "DISCRIMINANT",
+    "DISPLAY MACROS",
+    "DISPLAY VARIABLE SETS",
+    "DISPLAY",
+    "DO IF",
+    "DO REPEAT",
+    "DOCUMENT",
+    "DROP DOCUMENTS",
+    "ECHO",
+    "EDIT",
+    "ELSE IF",
+    "ELSE",
+    "END CASE",
+    "END FILE TYPE",
+    "END FILE",
+    "END IF",
+    "END LOOP",
+    "END REPEAT",
+    "ERASE",
+    "EXAMINE",
+    "EXECUTE",
+    "EXIT",
+    "EXPORT",
+    "FACTOR",
+    "FILE HANDLE",
+    "FILE LABEL",
+    "FILE TYPE",
+    "FILTER",
+    "FINISH",
+    "FIT",
+    "FLIP",
+    "FORMATS",
+    "FREQUENCIES",
+    "GENLOG",
+    "GET DATA",
+    "GET TRANSLATE",
+    "GET",
+    "GGRAPH",
+    "GLM",
+    "GRAPH",
+    "HILOGLINEAR",
+    "HOMALS",
+    "HOST",
+    "IF",
+    "IGRAPH",
+    "IMPORT",
+    "INCLUDE",
+    "INFO",
+    "INPUT PROGRAM",
+    "INSERT",
+    "KEYED DATA LIST",
+    "KM",
+    "LEAVE",
+    "LIST",
+    "LOGISTIC REGRESSION",
+    "LOGLINEAR",
+    "LOOP",
+    "MANOVA",
+    "MAPS",
+    "MATCH FILES",
+    "MATRIX DATA",
+    "MATRIX",
+    "MCONVERT",
+    "MEANS",
+    "MISSING VALUES",
+    "MIXED",
+    "MODEL CLOSE",
+    "MODEL HANDLE",
+    "MODEL LIST",
+    "MODEL NAME",
+    "MRSETS",
+    "MULT RESPONSE",
+    "MULTIPLE CORRESPONDENCE",
+    "MVA",
+    "N OF CASES",
+    "N",
+    "NAIVEBAYES",
+    "NEW FILE",
+    "NLR",
+    "NOMREG",
+    "NONPAR CORR",
+    "NPAR TESTS",
+    "NUMBERED",
+    "NUMERIC",
+    "OLAP CUBES",
+    "OMS",
+    "ONEWAY",
+    "ORTHOPLAN",
+    "OUTPUT MODIFY",
+    "OVERALS",
+    "PACF",
+    "PARTIAL CORR",
+    "PEARSON CORRELATIONS",
+    "PERMISSIONS",
+    "PLANCARDS",
+    "PLUM",
+    "POINT",
+    "PPLOT",
+    "PREDICT",
+    "PREFSCAL",
+    "PRESERVE",
+    "PRINCALS",
+    "PRINT EJECT",
+    "PRINT FORMATS",
+    "PRINT SPACE",
+    "PRINT",
+    "PROBIT",
+    "PROCEDURE OUTPUT",
+    "PROXIMITIES",
+    "PROXSCAL",
+    "Q",
+    "QUICK CLUSTER",
+    "QUIT",
+    "RANK",
+    "RATIO STATISTICS",
+    "READ MODEL",
+    "RECODE",
+    "RECORD TYPE",
+    "REFORMAT",
+    "REGRESSION",
+    "RELIABILITY",
+    "RENAME VARIABLES",
+    "REPEATING DATA",
+    "REPORT",
+    "REREAD",
+    "RESTORE",
+    "RMV",
+    "ROC",
+    "SAMPLE",
+    "SAVE DATA COLLECTION",
+    "SAVE TRANSLATE",
+    "SAVE",
+    "SCRIPT",
+    "SEASON",
+    "SELECT IF",
+    "SELECTPRED",
+    "SET",
+    "SHOW",
+    "SORT CASES",
+    "SORT VARIABLES",
+    "SPCHART",
+    "SPECTRA",
+    "SPLIT FILE",
+    "STEMLEAF",
+    "STRING",
+    "SUBTITLE",
+    "SUMMARIZE",
+    "SURVIVAL",
+    "SYSFILE INFO",
+    "T-TEST",
+    "TDISPLAY",
+    "TEMPORARY",
+    "TITLE",
+    "TREE",
+    "TSAPPLY",
+    "TSET",
+    "TSHOW",
+    "TSMODEL",
+    "TSPLOT",
+    "TWOSTEP CLUSTER",
+    "UNIANOVA",
+    "UNNUMBERED",
+    "UPDATE",
+    "USE",
+    "VALIDATEDATA",
+    "VALUE LABELS",
+    "VARCOMP",
+    "VARIABLE ALIGNMENT",
+    "VARIABLE ATTRIBUTE",
+    "VARIABLE LABELS",
+    "VARIABLE LEVEL",
+    "VARIABLE ROLE",
+    "VARIABLE WIDTH",
+    "VARSTOCASES",
+    "VECTOR",
+    "VERIFY",
+    "WEIGHT",
+    "WLS",
+    "WRITE FORMATS",
+    "WRITE",
+    "XEXPORT",
+    "XGRAPH",
+    "XSAVE",
+];
diff --git a/rust/src/lex/mod.rs b/rust/src/lex/mod.rs

index c5780e0db5baea9fa0452af22f8db7eda06939fc..2047837df405a9dc656362a524983e13fe10fc6d 100644 (file)
--- a/rust/src/lex/mod.rs
+++ b/rust/src/lex/mod.rs
@@ -1 +1,2 @@
  pub mod segment;
+pub mod command_name;
diff --git a/rust/src/lex/segment.rs b/rust/src/lex/segment.rs

index 597e5766c56f8372df288ff9860381d110bb133f..d52dd888fc48ad97f0e5ac40aea9ddeb69946e85 100644 (file)
--- a/rust/src/lex/segment.rs
+++ b/rust/src/lex/segment.rs
@@ -19,6 +19,14 @@
  //! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior
  //! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE).
  
+use crate::{
+    identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar},
+    prompt::PromptStyle,
+};
+use bitflags::bitflags;
+
+use super::command_name::{command_match, COMMAND_NAMES};
+
  /// Segmentation mode.
  ///
  /// PSPP syntax is written in one of two modes which are broadly defined as
@@ -47,7 +55,7 @@ pub enum Mode {
  }
  
  /// The type of a segment.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
  pub enum Type {
      Number,
      QuotedString,
@@ -75,9 +83,1456 @@ pub enum Type {
      End,
      ExpectedQuote,
      ExpectedExponent,
-    UnexpectedChar
+    UnexpectedChar,
  }
  
+bitflags! {
+    /// Flags that refine the segmenter's major state.
+    #[derive(Copy, Clone)]
+    pub struct Substate: u8 {
+        /// The segmenter is positioned at the beginning of a line.  It is set
+        /// after a new-line has been consumed in the general state, and makes
+        /// `push` route the next input through start-of-line parsing.
+        const START_OF_LINE = 1 << 0;
+
+        /// The segmenter is positioned where a command begins; while it is set
+        /// in the general state, `prompt` reports `PromptStyle::First`.
+        const START_OF_COMMAND = 1 << 1;
+    }
+}
+
+#[derive(Copy, Clone)]
  pub struct Segmenter {
-    state: State
+    state: (State, Substate),
+    nest: u8,
+    mode: Mode,
+}
+
+/// Error returned when the input supplied so far is too short to decide what
+/// segment it begins with.  The caller should retry with more input, or with
+/// `eof` set to true once the input is exhausted.
+#[derive(Copy, Clone, Debug)]
+pub struct Incomplete;
+
+impl Segmenter {
+    /// Returns a segmenter with the given syntax `mode`.
+    ///
+    /// If `is_snippet` is false, then the segmenter will parse as if it's being
+    /// given a whole file.  This means, for example, that it will interpret `-`
+    /// or `+` at the beginning of the syntax as a separator between commands
+    /// (since `-` or `+` at the beginning of a line has this meaning).
+    ///
+    /// If `is_snippet` is true, then the segmenter will parse as if it's being
+    /// given an isolated piece of syntax.  This means, for example, that it
+    /// will interpret `-` or `+` at the beginning of the syntax as an operator
+    /// token or (if followed by a digit) as part of a number.
+    pub fn new(mode: Mode, is_snippet: bool) -> Self {
+        Self {
+            // A whole file starts in `State::Shbang` so that a leading `#!`
+            // line can be recognized; a snippet goes straight to the general
+            // state with no start-of-line or start-of-command flag set.
+            state: if is_snippet {
+                (State::General, Substate::empty())
+            } else {
+                (State::Shbang, Substate::empty())
+            },
+            mode,
+            nest: 0,
+        }
+    }
+
+    /// Returns the syntax mode that this segmenter was created with.
+    pub fn mode(&self) -> Mode {
+        self.mode
+    }
+
+    /// Returns true if the `START_OF_LINE` substate flag is set.
+    fn start_of_line(&self) -> bool {
+        self.state.1.contains(Substate::START_OF_LINE)
+    }
+
+    /// Returns true if the `START_OF_COMMAND` substate flag is set.
+    fn start_of_command(&self) -> bool {
+        self.state.1.contains(Substate::START_OF_COMMAND)
+    }
+
+    /// Returns the style of command prompt to display to an interactive user
+    /// for input in the current state.  The return value is most accurate in
+    /// mode `Mode::Interactive` and at the beginning of a line (that is, if
+    /// [`Segmenter::push`] consumed as much as possible of the input up to a
+    /// new-line).
+    pub fn prompt(&self) -> PromptStyle {
+        match self.state.0 {
+            State::Shbang => PromptStyle::First,
+            State::General => {
+                if self.start_of_command() {
+                    PromptStyle::First
+                } else {
+                    PromptStyle::Later
+                }
+            }
+            State::Comment1 | State::Comment2 => PromptStyle::Comment,
+            State::Document1 | State::Document2 => PromptStyle::Document,
+            State::Document3 => PromptStyle::First,
+            State::FileLabel1 => PromptStyle::Later,
+            State::FileLabel2 | State::FileLabel3 => PromptStyle::First,
+            State::DoRepeat1 | State::DoRepeat2 => {
+                if self.start_of_command() {
+                    PromptStyle::First
+                } else {
+                    PromptStyle::Later
+                }
+            }
+            State::DoRepeat3 => PromptStyle::DoRepeat,
+            State::Define1 | State::Define2 | State::Define3 => {
+                if self.start_of_command() {
+                    PromptStyle::First
+                } else {
+                    PromptStyle::Later
+                }
+            }
+            State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
+            State::BeginData1 => PromptStyle::First,
+            State::BeginData2 => PromptStyle::Later,
+            State::BeginData3 | State::BeginData4 => PromptStyle::Data,
+        }
+    }
+
+    /// Attempts to label a prefix of the remaining input with a segment type.
+    /// The caller supplies a prefix of the remaining input as `input`.  If
+    /// `eof` is true, then `input` is the entire (remainder) of the input; if
+    /// `eof` is false, then further input is potentially available.
+    ///
+    /// The input may contain '\n' or '\r\n' line ends in any combination.
+    ///
+    /// If successful, returns `Ok((rest, type))`, where `type` is the type of
+    /// the segment at the beginning of `input` and `rest` is the part of
+    /// `input` that follows that segment.  The next call should supply input
+    /// starting at `rest`, because the bytes before it have (figuratively)
+    /// been consumed by the segmenter.
+    ///
+    /// Segments can have zero length (in which case `rest` is all of `input`),
+    /// including segment types `Type::End`, `Type::SeparateCommands`,
+    /// `Type::StartDocument`, `Type::InlineData`, and `Type::Spaces`.
+    ///
+    /// Failure occurs only if the segment type of the bytes in `input` cannot
+    /// yet be determined.  In this case, this function returns `Err(Incomplete)`.  If
+    /// more input is available, the caller should obtain some more, then call
+    /// again with a longer `input`.  If this is not enough, the process might
+    /// need to repeat again and again.  If input is exhausted, then the caller
+    /// may call again setting `eof` to true.  This function will never return
+    /// `Err(Incomplete)` when `eof` is true.
+    ///
+    /// The caller must not, in a sequence of calls, supply contradictory input.
+    /// That is, bytes provided as part of `input` in one call, but not
+    /// consumed, must not be provided with *different* values on subsequent
+    /// calls.  This is because the function must often make decisions based on
+    /// looking ahead beyond the bytes that it consumes.
+    pub fn push<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        // With nothing to look at, either the input is finished or more is
+        // needed before anything can be decided.
+        if input.is_empty() {
+            if eof {
+                return Ok((input, Type::End));
+            } else {
+                return Err(Incomplete);
+            };
+        }
+
+        // Dispatch on the major state; in the general state the substate
+        // selects between start-of-line and mid-command parsing.
+        match self.state.0 {
+            State::Shbang => return self.parse_shbang(input, eof),
+            State::General => {
+                if self.start_of_line() {
+                    self.parse_start_of_line(input, eof)
+                } else {
+                    self.parse_mid_command(input, eof)
+                }
+            }
+            State::Comment1 => self.parse_comment_1(input, eof),
+            State::Comment2 => self.parse_comment_2(input, eof),
+            State::Document1 => self.parse_document_1(input, eof),
+            State::Document2 => self.parse_document_2(input, eof),
+            State::Document3 => self.parse_document_3(input, eof),
+            State::FileLabel1 => self.parse_file_label_1(input, eof),
+            State::FileLabel2 => self.parse_file_label_2(input, eof),
+            State::FileLabel3 => self.parse_file_label_3(input, eof),
+            State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
+            State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
+            State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
+            State::Define1 => self.parse_define_1_2(input, eof),
+            State::Define2 => self.parse_define_1_2(input, eof),
+            State::Define3 => self.parse_define_3(input, eof),
+            State::Define4 => self.parse_define_4_5(input, eof),
+            State::Define5 => self.parse_define_4_5(input, eof),
+            State::Define6 => self.parse_define_6(input, eof),
+            State::BeginData1 => self.parse_begin_data_1(input, eof),
+            State::BeginData2 => self.parse_begin_data_2(input, eof),
+            State::BeginData3 => self.parse_begin_data_3(input, eof),
+            State::BeginData4 => self.parse_begin_data_4(input, eof),
+        }
+    }
+}
+
+/// The segmenter's major state.
+///
+/// `Segmenter::push` dispatches on this value to the parsing method for the
+/// state; the numbered variants of one family (for example `Define1` through
+/// `Define6`) are successive stages of handling the same construct.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum State {
+    Shbang,
+    General,
+    Comment1,
+    Comment2,
+    Document1,
+    Document2,
+    Document3,
+    FileLabel1,
+    FileLabel2,
+    FileLabel3,
+    DoRepeat1,
+    DoRepeat2,
+    DoRepeat3,
+    Define1,
+    Define2,
+    Define3,
+    Define4,
+    Define5,
+    Define6,
+    BeginData1,
+    BeginData2,
+    BeginData3,
+    BeginData4,
+}
+
+/// Splits the first character off `input`.
+///
+/// Returns `Ok((Some(c), rest))` if `input` starts with `c`, or
+/// `Ok((None, rest))` with an empty `rest` if `input` is empty and `eof` is
+/// true.  If `input` is empty and `eof` is false, the next character is not
+/// known yet, so the result is `Err(Incomplete)`.
+fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
+    let mut chars = input.chars();
+    let head = chars.next();
+    if head.is_none() && !eof {
+        return Err(Incomplete);
+    }
+    Ok((head, chars.as_str()))
+}
+
+/// Skips the body of a `/* ... */` comment whose opening `/*` has already been
+/// consumed.
+///
+/// Returns the input that follows the closing `*/`.  A comment does not extend
+/// past the end of its line: if a line end (or the end of the input) comes
+/// first, the returned input starts there.
+fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    loop {
+        let (head, tail) = take(input, eof)?;
+        let Some(ch) = head else {
+            return Ok(input);
+        };
+        if matches!(ch, '\n' | '\r') && is_end_of_line(input, eof)? {
+            return Ok(input);
+        }
+        if ch == '*' {
+            // Only `*/` closes the comment; a lone `*` is ordinary content.
+            let (next, after_close) = take(tail, eof)?;
+            if next == Some('/') {
+                return Ok(after_close);
+            }
+        }
+        input = tail;
+    }
+}
+
+/// Skips the leading characters of `input` for which `f` returns true and
+/// returns what is left.
+///
+/// If everything was skipped and `eof` is false, more matching characters
+/// might still arrive, so the result is `Err(Incomplete)`.
+fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
+where
+    F: Fn(char) -> bool,
+{
+    let remainder = input.trim_start_matches(f);
+    match (remainder.is_empty(), eof) {
+        (true, false) => Err(Incomplete),
+        _ => Ok(remainder),
+    }
+}
+
+/// Tests the first character of `input` against `f`.
+///
+/// Returns `Ok(Some(rest))`, with `rest` being the input after that character,
+/// if `f` accepts it, and `Ok(None)` if `f` rejects it or the input has ended.
+fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
+where
+    F: Fn(char) -> bool,
+{
+    match take(input, eof)? {
+        (Some(c), rest) if f(c) => Ok(Some(rest)),
+        _ => Ok(None),
+    }
+}
+
+/// Skips white space at the beginning of `input`, stopping at (not after) a
+/// line end, and returns the rest of the input.
+fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    loop {
+        let (head, tail) = take(input, eof)?;
+        let Some(ch) = head else {
+            return Ok(input);
+        };
+        // A line end is white space too, but it must be left in place for the
+        // caller to see.
+        let at_line_end = matches!(ch, '\r' | '\n') && is_end_of_line(input, eof)?;
+        if at_line_end || !ch.is_whitespace() {
+            return Ok(input);
+        }
+        input = tail;
+    }
+}
+
+/// Skips ASCII decimal digits at the beginning of `input` and returns the rest
+/// of the input.
+fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
+    let is_digit = |c: char| c.is_ascii_digit();
+    skip_matching(is_digit, input, eof)
+}
+
+/// Skips white space and `/* ... */` comments at the beginning of `input`,
+/// stopping at (not after) a line end, and returns the rest of the input.
+///
+/// Returns `Err(Incomplete)` if more input is needed to tell where the white
+/// space and comments end.
+fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    loop {
+        let (Some(c), rest) = take(input, eof)? else {
+            return Ok(input);
+        };
+        match c {
+            '/' => match take(rest, eof)? {
+                (Some('*'), body) => input = skip_comment(body, eof)?,
+                // A `/` that does not open a comment is neither white space
+                // nor a comment, so stop in front of it rather than
+                // swallowing it.
+                _ => return Ok(input),
+            },
+            '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
+            c if c.is_whitespace() => input = rest,
+            _ => return Ok(input),
+        };
+    }
+}
+
+/// Returns true if `input` begins with the start of a string: a single or
+/// double quote, optionally preceded by one of the prefix letters `x`, `X`,
+/// `u`, or `U`.
+///
+/// A new-line is also reported as a string start.
+/// NOTE(review): the reason for accepting a new-line is not evident from this
+/// function alone -- confirm against the caller's needs.
+fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let (Some(c), rest) = take(input, eof)? else {
+        return Ok(false);
+    };
+    match c {
+        'x' | 'X' | 'u' | 'U' => {
+            // A prefix letter only starts a string if a quote follows it, so
+            // look at the character *after* the prefix.
+            let (next, _) = take(rest, eof)?;
+            Ok(matches!(next, Some('\'' | '"')))
+        }
+        '\'' | '"' | '\n' => Ok(true),
+        _ => Ok(false),
+    }
+}
+
+/// Returns true if `input` is positioned at a line end: a `\n`, a `\r\n`
+/// pair, or the end of the input.  A `\r` that is not followed by `\n` does
+/// not count.
+fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    match take(input, eof)? {
+        (None, _) | (Some('\n'), _) => Ok(true),
+        (Some('\r'), tail) => Ok(take(tail, eof)?.0 == Some('\n')),
+        (Some(_), _) => Ok(false),
+    }
+}
+
+/// Returns true if nothing but white space and comments separates the start
+/// of `input` from the end of its line.
+fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    skip_spaces_and_comments(input, eof).and_then(|rest| is_end_of_line(rest, eof))
+}
+
+/// Returns the first character of `s`.
+///
+/// # Panics
+///
+/// Panics if `s` is empty; callers must rule that out beforehand.
+fn first(s: &str) -> char {
+    s.chars()
+        .next()
+        .expect("first() requires a non-empty string")
+}
+/// Returns the run of [`COMMAND_NAMES`] entries whose first character equals
+/// the first character of `target`, compared after converting the latter to
+/// ASCII upper case.  Returns an empty slice if `target` is empty.
+///
+/// This relies on `COMMAND_NAMES` being ordered by first character.
+fn get_command_name_candidates(target: &str) -> &[&'static str] {
+    let Some(initial) = target.chars().next() else {
+        return &[];
+    };
+    let initial = initial.to_ascii_uppercase();
+    let start = COMMAND_NAMES.partition_point(|name| first(name) < initial);
+    let end = COMMAND_NAMES.partition_point(|name| first(name) <= initial);
+    &COMMAND_NAMES[start..end]
+}
+
+/// Returns true if `input` begins with the name of a known command.
+///
+/// The candidate name is the longest prefix of the first line of `input` that
+/// consists only of white space, identifier characters, and `-`.  Trailing
+/// white space and one trailing `.` (a command terminator) are ignored.  The
+/// candidate is accepted if it matches some entry of [`COMMAND_NAMES`] without
+/// any of that command's words missing.
+///
+/// Returns `Err(Incomplete)` if the candidate runs to the end of `input` and
+/// more input may follow, since that input could extend the name.
+fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    // Stop at a new-line as well: a command name cannot continue onto the
+    // next line, and scanning across it would keep a terminating `.` from
+    // being the last character of the candidate.
+    let command_name = input
+        .split(|c: char| c == '\n' || !(c.is_whitespace() || c.may_continue_id() || c == '-'))
+        .next()
+        .unwrap_or("");
+    if !eof && command_name.len() == input.len() {
+        return Err(Incomplete);
+    }
+
+    // Drop trailing blanks (including the `\r` of a `\r\n` line end) so that
+    // a terminating `.` is found whether or not white space follows it.
+    let trimmed = command_name.trim_end();
+    let string = trimmed.strip_suffix('.').unwrap_or(trimmed);
+
+    let matched = get_command_name_candidates(command_name)
+        .iter()
+        .filter_map(|command| command_match(command, string))
+        .any(|m| m.missing_words <= 0);
+    Ok(matched)
+}
+
+impl Segmenter {
+    /// Handles the very beginning of a syntax file, where a `#!` line may
+    /// appear.
+    ///
+    /// If `input` starts with `#!`, the segment is labeled `Type::Shbang` and
+    /// extends as far as `parse_full_line` says.  Otherwise the segmenter
+    /// switches to the general state, at the start of a line and of a command,
+    /// and parses `input` from there.
+    fn parse_shbang<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (c, rest) = take(input, eof)?;
+        if c == Some('#') {
+            if let (Some('!'), rest) = take(rest, eof)? {
+                let previous = self.state;
+                self.state = (State::General, Substate::START_OF_COMMAND);
+                return match self.parse_full_line(rest, eof) {
+                    Ok(rest) => Ok((rest, Type::Shbang)),
+                    Err(incomplete) => {
+                        // Nothing was consumed, so the retry with more input
+                        // must see the `#!` from the shbang state again.
+                        self.state = previous;
+                        Err(incomplete)
+                    }
+                };
+            }
+        }
+
+        // Not a shbang line.  Changing state before re-dispatching is safe
+        // even if `push` reports `Incomplete`, because a retry then parses the
+        // same input from the same general state.
+        self.state = (
+            State::General,
+            Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+        );
+        self.push(input, eof)
+    }
+    /// Decides whether ordinary text at the beginning of a line starts a new
+    /// command: always in batch mode, never in interactive mode, and in auto
+    /// mode only if the text begins with a known command name.
+    fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
+        let starts_command = match self.mode {
+            Mode::Batch => true,
+            Mode::Interactive => false,
+            Mode::Auto => return detect_command_name(input, eof),
+        };
+        Ok(starts_command)
+    }
+    /// Parses the beginning of a line in the general state.
+    ///
+    /// Depending on the first character, this yields:
+    ///
+    /// - `Type::Punct` for a `+` that is followed (after optional white space
+    ///   and comments) by the start of a string.
+    ///
+    /// - `Type::StartCommand` for any other `+`, or for `-` or `.`.
+    ///
+    /// - A zero-length `Type::SeparateCommands` for a line that holds nothing
+    ///   but white space and comments.
+    ///
+    /// - A zero-length `Type::StartCommand` if `at_command_start` says that
+    ///   the line begins a command.
+    ///
+    /// Anything else is handed to `parse_mid_command`.  `input` must not be
+    /// empty.
+    fn parse_start_of_line<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        debug_assert_eq!(self.state.0, State::General);
+        debug_assert!(self.start_of_line());
+        debug_assert!(!input.is_empty());
+
+        let (Some(c), rest) = take(input, eof)? else {
+            unreachable!()
+        };
+        match c {
+            // Look past the `+` itself (that is, at `rest`) to see whether a
+            // string follows.
+            '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
+                // This `+` is punctuation that may separate pieces of a string.
+                self.state = (State::General, Substate::empty());
+                return Ok((rest, Type::Punct));
+            }
+            '+' | '-' | '.' => {
+                self.state = (State::General, Substate::START_OF_COMMAND);
+                return Ok((rest, Type::StartCommand));
+            }
+            c if c.is_whitespace() => {
+                // Test the line starting at `input`, not `rest`: if `c` is
+                // itself the line end, `rest` already belongs to the next line.
+                if at_end_of_line(input, eof)? {
+                    self.state = (State::General, Substate::START_OF_COMMAND);
+                    return Ok((input, Type::SeparateCommands));
+                }
+            }
+            _ => {
+                if self.at_command_start(input, eof)? {
+                    self.state = (State::General, Substate::START_OF_COMMAND);
+                    return Ok((input, Type::StartCommand));
+                }
+            }
+        }
+        // Clearing START_OF_LINE here is what lets `parse_mid_command` take
+        // over for the rest of the line.
+        self.state.1 = Substate::START_OF_COMMAND;
+        self.parse_mid_command(input, eof)
+    }
+    fn parse_mid_command<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        debug_assert!(self.state.0 == State::General);
+        debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
+        let (Some(c), rest) = take(input, eof)? else {
+            unreachable!()
+        };
+        match c {
+            '\r' | '\n' if is_end_of_line(input, eof)? => {
+                self.state.1 |= Substate::START_OF_LINE;
+                Ok((
+                    self.parse_newline(input, eof).unwrap().unwrap(),
+                    Type::Newline,
+                ))
+            }
+            '/' => {
+                if let (Some('*'), rest) = take(rest, eof)? {
+                    let rest = skip_comment(rest, eof)?;
+                    return Ok((rest, Type::Comment));
+                } else {
+                    self.state.1 = Substate::empty();
+                    return Ok((rest, Type::Punct));
+                }
+            }
+            '-' => {
+                let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?;
+                match c {
+                    Some(c) if c.is_ascii_digit() => {
+                        return self.parse_number(rest, eof);
+                    }
+                    Some('.') => {
+                        if let (Some(c), _rest) = take(rest2, eof)? {
+                            if c.is_ascii_digit() {
+                                return self.parse_number(rest, eof);
+                            }
+                        }
+                    }
+                    None | Some(_) => (),
+                }
+                self.state.1 = Substate::empty();
+                return Ok((rest, Type::Punct));
+            }
+            '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
+                self.state.1 = Substate::empty();
+                return Ok((rest, Type::Punct));
+            }
+            '*' => {
+                if self.state.1.contains(Substate::START_OF_COMMAND) {
+                    self.state.0 = State::Comment1;
+                    self.parse_comment_1(input, eof)
+                } else {
+                    self.parse_digraph(&['*'], input, eof)
+                }
+            }
+            '<' => self.parse_digraph(&['=', '>'], rest, eof),
+            '>' => self.parse_digraph(&['='], rest, eof),
+            '~' => self.parse_digraph(&['='], rest, eof),
+            '.' => match take(rest, eof)? {
+                (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
+                (Some('\r' | '\n'), _) if is_end_of_line(rest, eof)? => {
+                    self.state.1 = Substate::START_OF_COMMAND;
+                    Ok((rest, Type::EndCommand))
+                }
+                _ => Ok((rest, Type::Punct)),
+            },
+            '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => {
+                self.parse_number(input, eof)
+            }
+            'u' | 'U' => self.maybe_parse_string(Type::UnicodeString, (input, rest), eof),
+            'x' | 'X' => self.maybe_parse_string(Type::HexString, (input, rest), eof),
+            '\'' | '"' => self.parse_string(Type::QuotedString, c, rest, eof),
+            '!' => {
+                let (c, rest2) = take(rest, eof)?;
+                match c {
+                    Some('*') => Ok((rest2, Type::MacroId)),
+                    Some(_) => self.parse_id(input, eof),
+                    None => Ok((rest, Type::Punct)),
+                }
+            }
+            c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Type::Spaces)),
+            c if c.may_start_id() => self.parse_id(input, eof),
+            '!'..='~' if c != '\\' && c != '^' => {
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::Punct))
+            }
+            _ => {
+                println!("unexpected {c:?} {:?}", c.is_whitespace());
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::UnexpectedChar))
+            }
+        }
+    }
+    fn parse_string<'a>(
+        &mut self,
+        type_: Type,
+        quote: char,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        loop {
+            let (Some(c), rest) = take(input, eof)? else {
+                break;
+            };
+            if c == quote {
+                if take(rest, eof)?.0 == Some(quote) {
+                    input = rest;
+                    continue;
+                } else {
+                    return Ok((rest, type_));
+                }
+            } else if is_end_of_line(input, eof)? {
+                break;
+            }
+            input = rest;
+        }
+        self.state.1 = Substate::empty();
+        Ok((input, Type::ExpectedQuote))
+    }
+    fn maybe_parse_string<'a>(
+        &mut self,
+        type_: Type,
+        input: (&'a str, &'a str),
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        match take(input.1, eof)? {
+            (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(type_, c, rest, eof),
+            _ => self.parse_id(input.0, eof),
+        }
+    }
+    fn next_id_in_command<'a>(
+        &self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, &'a str), Incomplete> {
+        let mut sub = Segmenter::new(self.mode, true);
+        loop {
+            let (rest, type_) = sub.push(input, eof)?;
+            match type_ {
+                Type::Shbang | Type::Spaces | Type::Comment | Type::Newline => (),
+
+                Type::Identifier => return Ok((&input[..input.len() - rest.len()], rest)),
+
+                Type::Number
+                | Type::QuotedString
+                | Type::HexString
+                | Type::UnicodeString
+                | Type::UnquotedString
+                | Type::ReservedWord
+                | Type::Punct
+                | Type::CommentCommand
+                | Type::DoRepeatCommand
+                | Type::InlineData
+                | Type::MacroId
+                | Type::MacroName
+                | Type::MacroBody
+                | Type::StartDocument
+                | Type::Document
+                | Type::StartCommand
+                | Type::SeparateCommands
+                | Type::EndCommand
+                | Type::End
+                | Type::ExpectedQuote
+                | Type::ExpectedExponent
+                | Type::UnexpectedChar => return Ok(("", rest)),
+            }
+            input = rest;
+        }
+    }
+    fn parse_id<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        let (Some(_), mut end) = take(input, eof).unwrap() else {
+            unreachable!()
+        };
+        while let (Some(c), rest) = take(end, eof)? {
+            if !c.may_continue_id() {
+                break;
+            };
+            end = rest;
+        }
+        let identifier = &input[..input.len() - end.len()];
+        let identifier = match identifier.strip_suffix('.') {
+            Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
+            _ => identifier,
+        };
+        let rest = &input[identifier.len()..];
+
+        if self.state.1.contains(Substate::START_OF_COMMAND) {
+            if id_match_n("COMMENT", identifier, 4) {
+                self.state.0 = State::Comment1;
+                return self.parse_comment_1(input, eof);
+            } else if id_match("DOCUMENT", identifier) {
+                self.state.0 = State::Document1;
+                return Ok((input, Type::StartDocument));
+            } else if id_match_n("DEFINE", identifier, 6) {
+                self.state.0 = State::Define1;
+            } else if id_match("FILE", identifier) {
+                println!("next={:?}", self.next_id_in_command(rest, eof)?.0);
+                if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
+                    self.state = (State::FileLabel1, Substate::empty());
+                    return Ok((rest, Type::Identifier));
+                }
+            } else if id_match("DO", identifier) {
+                if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
+                    self.state = (State::DoRepeat1, Substate::empty());
+                    return Ok((rest, Type::Identifier));
+                }
+            } else if id_match("BEGIN", identifier) {
+                let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
+                if id_match("DATA", next_id) {
+                    let rest2 = skip_spaces_and_comments(rest2, eof)?;
+                    let rest2 = if let Some(s) = rest2.strip_prefix('.') {
+                        skip_spaces(s, eof)?
+                    } else {
+                        rest2
+                    };
+                    if is_end_of_line(rest2, eof)? {
+                        let s = &input[..input.len() - rest2.len()];
+                        self.state = (
+                            if s.contains('\n') {
+                                State::BeginData1
+                            } else {
+                                State::BeginData2
+                            },
+                            Substate::empty(),
+                        );
+                        return Ok((rest, Type::Identifier));
+                    }
+                }
+            }
+        }
+
+        self.state.1 = Substate::empty();
+        let type_ = if is_reserved_word(identifier) {
+            Type::ReservedWord
+        } else if identifier.starts_with('!') {
+            Type::MacroId
+        } else {
+            Type::Identifier
+        };
+        Ok((rest, type_))
+    }
+    fn parse_digraph<'a>(
+        &mut self,
+        seconds: &[char],
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (c, rest) = take(input, eof)?;
+        Ok((
+            match c {
+                Some(c) if seconds.contains(&c) => rest,
+                _ => input,
+            },
+            Type::Punct,
+        ))
+    }
+    fn parse_number<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut input = skip_digits(input, eof)?;
+        if let Some(rest) = match_char(|c| c == '.', input, eof)? {
+            let rest2 = skip_digits(rest, eof)?;
+            if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
+                input = rest2;
+            }
+        };
+        if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
+            let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
+            let rest2 = skip_digits(rest, eof)?;
+            if rest2.len() == rest.len() {
+                self.state.1 = Substate::empty();
+                return Ok((rest, Type::ExpectedExponent));
+            }
+            input = rest2;
+        }
+        Ok((input, Type::Number))
+    }
+    fn parse_comment_1<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        enum CommentState<'a> {
+            Blank,
+            NotBlank,
+            Period(&'a str),
+        }
+        let mut state = CommentState::Blank;
+        loop {
+            let (Some(c), rest) = take(input, eof)? else {
+                // End of file.
+                self.state = (State::General, Substate::START_OF_COMMAND);
+                return Ok((input, Type::SeparateCommands));
+            };
+            match c {
+                '.' => state = CommentState::Period(input),
+                '\n' | '\r' if is_end_of_line(input, eof)? => {
+                    match state {
+                        CommentState::Blank => {
+                            // Blank line ends comment command.
+                            self.state = (State::General, Substate::START_OF_COMMAND);
+                            return Ok((input, Type::SeparateCommands));
+                        }
+                        CommentState::Period(period) => {
+                            // '.' at end of line ends comment command.
+                            self.state = (State::General, Substate::empty());
+                            return Ok((period, Type::CommentCommand));
+                        }
+                        CommentState::NotBlank => {
+                            // Comment continues onto next line.
+                            self.state = (State::Comment2, Substate::empty());
+                            return Ok((input, Type::CommentCommand));
+                        }
+                    }
+                }
+                c if c.is_whitespace() => (),
+                _ => state = CommentState::NotBlank,
+            }
+            input = rest;
+        }
+    }
+    fn parse_comment_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_newline(input, eof)?.unwrap();
+
+        let new_command = match take(rest, eof)?.0 {
+            Some('+') | Some('-') | Some('.') => true,
+            Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?,
+            None | Some(_) => false,
+        };
+        if new_command {
+            self.state = (
+                State::General,
+                Substate::START_OF_LINE | Substate::START_OF_COMMAND,
+            );
+        } else {
+            self.state.0 = State::Comment1;
+        }
+        Ok((rest, Type::Newline))
+    }
+    fn parse_document_1<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut end_cmd = false;
+        loop {
+            let (Some(c), rest) = take(input, eof)? else {
+                self.state.0 = State::Document3;
+                return Ok((input, Type::Document));
+            };
+            match c {
+                '.' => end_cmd = true,
+                '\n' | '\r' if is_end_of_line(input, eof)? => {
+                    self.state.0 = if end_cmd {
+                        State::Document3
+                    } else {
+                        State::Document2
+                    };
+                    return Ok((input, Type::Document));
+                }
+                c if !c.is_whitespace() => end_cmd = false,
+                _ => (),
+            }
+            input = rest;
+        }
+    }
+    fn parse_document_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::Document1;
+        Ok((rest, Type::Newline))
+    }
+    fn parse_document_3<'a>(
+        &mut self,
+        input: &'a str,
+        _eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        self.state = (
+            State::General,
+            Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+        );
+        Ok((input, Type::EndCommand))
+    }
+    fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
+        let input = skip_spaces_and_comments(input, eof)?;
+        match take(input, eof)?.0 {
+            Some('\'') | Some('"') | Some('\n') => Ok(true),
+            _ => Ok(false),
+        }
+    }
+    fn parse_file_label_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut sub = Segmenter {
+            state: (State::General, self.state.1),
+            ..*self
+        };
+        let (rest, type_) = sub.push(input, eof)?;
+        if type_ == Type::Identifier {
+            let id = &input[..input.len() - rest.len()];
+            debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
+            if Self::quoted_file_label(rest, eof)? {
+                *self = sub;
+            } else {
+                self.state.0 = State::FileLabel2;
+            }
+        } else {
+            self.state.1 = sub.state.1;
+        }
+        Ok((rest, type_))
+    }
+    fn parse_file_label_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let input = skip_spaces(input, eof)?;
+        self.state.0 = State::FileLabel3;
+        Ok((input, Type::Spaces))
+    }
+    fn parse_file_label_3<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut end_cmd = None;
+        loop {
+            let (c, rest) = take(input, eof)?;
+            match c {
+                None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => {
+                    self.state = (State::General, Substate::empty());
+                    return Ok((end_cmd.unwrap_or(input), Type::UnquotedString));
+                }
+                None => unreachable!(),
+                Some('.') => end_cmd = Some(input),
+                Some(c) if !c.is_whitespace() => end_cmd = None,
+                Some(_) => (),
+            }
+            input = rest;
+        }
+    }
+    fn subparse<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        let mut sub = Segmenter {
+            mode: self.mode,
+            state: (State::General, self.state.1),
+            nest: 0,
+        };
+        let result = sub.push(input, eof)?;
+        self.state.1 = sub.state.1;
+        Ok(result)
+    }
+    /// We are segmenting a `DO REPEAT` command, currently reading the syntax
+    /// that defines the stand-in variables (the head) before the lines of
+    /// syntax to be repeated (the body).
+    fn parse_do_repeat_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if type_ == Type::SeparateCommands {
+            // We reached a blank line that separates the head from the body.
+            self.state.0 = State::DoRepeat2;
+        } else if type_ == Type::EndCommand || type_ == Type::StartCommand {
+            // We reached the body.
+            self.state.0 = State::DoRepeat3;
+            self.nest = 1;
+        }
+        Ok((rest, type_))
+    }
+    /// We are segmenting a `DO REPEAT` command, currently reading a blank line
+    /// that separates the head from the body.
+    fn parse_do_repeat_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if type_ == Type::Newline {
+            // We reached the body.
+            self.state.0 = State::DoRepeat3;
+            self.nest = 1;
+        }
+        Ok((rest, type_))
+    }
+    fn parse_newline<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<&'a str>, Incomplete> {
+        let (Some(c), rest) = take(input, eof)? else {
+            return Ok(None);
+        };
+        match c {
+            '\n' => Ok(Some(rest)),
+            '\r' => {
+                if let (Some('\n'), rest) = take(rest, eof)? {
+                    Ok(Some(rest))
+                } else {
+                    Ok(None)
+                }
+            }
+            _ => Ok(None),
+        }
+    }
+
+    fn parse_full_line<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<&'a str, Incomplete> {
+        loop {
+            if is_end_of_line(input, eof)? {
+                return Ok(input);
+            }
+            input = take(input, eof).unwrap().1;
+        }
+    }
+    fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result<(), Incomplete> {
+        let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
+        let (id1, input) = self.next_id_in_command(input, eof)?;
+        let up = if id_match("DO", id1) {
+            true
+        } else if id_match("END", id1) {
+            false
+        } else {
+            return Ok(());
+        };
+
+        let (id2, _) = self.next_id_in_command(input, eof)?;
+        if id_match("REPEAT", id2) {
+            if up {
+                self.nest += 1
+            } else {
+                self.nest -= 1
+            };
+        }
+        Ok(())
+    }
+    /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that
+    /// are to be repeated.  Report each line of syntax as a single
+    /// [`Type::DoRepeatCommand`].
+    ///
+    /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT`
+    /// blocks inside the lines we're segmenting.  `self.nest` counts the
+    /// nesting level, starting at 1.
+    fn parse_do_repeat_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        if let Some(rest) = self.parse_newline(input, eof)? {
+            return Ok((rest, Type::Newline));
+        }
+        let rest = self.parse_full_line(input, eof)?;
+        self.check_repeat_command(input, eof)?;
+        if self.nest == 0 {
+            // Nesting level dropped to 0, so we've finished reading the `DO
+            // REPEAT` body.
+            self.state = (
+                State::General,
+                Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+            );
+            self.push(input, eof)
+        } else {
+            Ok((rest, Type::DoRepeatCommand))
+        }
+    }
+    /// We are segmenting a `DEFINE` command, which consists of:
+    ///
+    ///   - The `DEFINE` keyword.
+    ///
+    ///   - An identifier.  We transform this into `Type::MacroName` instead of
+    ///     `Type::Identifier` or `Type::MacroId` because this identifier must
+    ///     never be macro-expanded.
+    ///
+    ///   - Anything but `(`.
+    ///
+    ///   - `(` followed by a sequence of tokens possibly including balanced
+    ///     parentheses up to a final `)`.
+    ///
+    ///   - A sequence of any number of lines, one string per line, ending with
+    ///     `!ENDDEFINE`.  The first line is usually blank (that is, a newline
+    ///     follows the `(`).  The last line usually just has `!ENDDEFINE.` on
+    ///     it, but it can start with other tokens.  The whole
+    ///     DEFINE...!ENDDEFINE can be on a single line, even.
+    fn parse_define_1_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        match type_ {
+            Type::Identifier | Type::MacroId if self.state.0 == State::Define1 => {
+                self.state.0 = State::Define2;
+                return Ok((rest, Type::MacroName));
+            }
+            Type::SeparateCommands | Type::EndCommand | Type::StartCommand => {
+                // The DEFINE command is malformed because we reached its end
+                // without ever hitting a `(` token.  Transition back to general
+                // parsing.
+                self.state.0 = State::General;
+            }
+            Type::Punct if rest.starts_with('(') => {
+                self.state.0 = State::Define3;
+                self.nest = 1;
+            }
+            _ => (),
+        }
+        Ok((rest, type_))
+    }
+    fn parse_define_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        match type_ {
+            Type::SeparateCommands | Type::EndCommand | Type::StartCommand => {
+                // The DEFINE command is malformed because we reached its end
+                // without ever hitting a `(` token.  Transition back to general
+                // parsing.
+                self.state.0 = State::General;
+            }
+            Type::Punct if rest.starts_with('(') => {
+                self.nest += 1;
+            }
+            Type::Punct if rest.starts_with(')') => {
+                self.nest -= 1;
+                if self.nest == 0 {
+                    self.state = (State::Define4, Substate::empty());
+                }
+            }
+            _ => (),
+        }
+        Ok((rest, type_))
+    }
+    fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> {
+        loop {
+            input = skip_spaces_and_comments(input, true).unwrap();
+            let (Some(c), rest) = take(input, true).unwrap() else {
+                return None;
+            };
+            match c {
+                '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => {
+                    return Some(input)
+                }
+                '\'' | '"' => {
+                    let index = rest.find(c)?;
+                    input = &rest[index + 1..];
+                }
+                _ => input = rest,
+            }
+        }
+    }
+    /// We are in the body of a macro definition, looking for additional lines
+    /// of the body or `!ENDDEFINE`.
+    ///
+    /// In `State::Define4`, we're parsing the first line of the macro body (the
+    /// same line as the closing parenthesis in the argument definition).  In
+    /// `State::Define5`, we're on a later line.
+    fn parse_define_4_5<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_full_line(input, eof)?;
+        let line = &input[..input.len() - rest.len()];
+        if let Some(end) = Self::find_enddefine(line) {
+            // Macro ends at the !ENDDEFINE on this line.
+            self.state = (State::General, Substate::empty());
+            let prefix = &input[..input.len() - end.len()];
+            if prefix.is_empty() {
+                // Line starts with `!ENDDEFINE`.
+                self.push(input, eof)
+            } else if input.trim().is_empty() {
+                // Line starts with spaces followed by `!ENDDEFINE`.
+                Ok((end, Type::Spaces))
+            } else {
+                // Line starts with some content followed by `!ENDDEFINE`.
+                Ok((end, Type::MacroBody))
+            }
+        } else {
+            // No `!ENDDEFINE`.  We have a full line of macro body.
+            //
+            // The line might be blank, whether completely empty or just spaces
+            // and comments.  That's OK: we need to report blank lines because
+            // they can have significance.
+            //
+            // However, if the first line of the macro body is blank, we just
+            // report it as spaces because it's not significant.
+            let type_ = if self.state.0 == State::Define4 && line.trim().is_empty() {
+                Type::Spaces
+            } else {
+                Type::MacroBody
+            };
+            self.state.0 = State::Define6;
+            Ok((rest, type_))
+        }
+    }
+    fn parse_define_6<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::Define5;
+        Ok((rest, Type::Newline))
+    }
+    fn parse_begin_data_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if type_ == Type::Newline {
+            self.state.0 = State::BeginData2;
+        }
+        Ok((rest, type_))
+    }
+    fn parse_begin_data_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if type_ == Type::Newline {
+            self.state.0 = State::BeginData3;
+        }
+        Ok((rest, type_))
+    }
+    fn is_end_data(line: &str) -> bool {
+        let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else {
+            return false;
+        };
+        let (Some(c), rest) = take(rest, true).unwrap() else {
+            return false;
+        };
+        if !c.is_whitespace() {
+            return false;
+        };
+        let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else {
+            return false;
+        };
+
+        let mut endcmd = false;
+        for c in rest.chars() {
+            match c {
+                '.' if endcmd => return false,
+                '.' => endcmd = true,
+                c if c.is_whitespace() => (),
+                _ => return false,
+            }
+        }
+        endcmd
+    }
+    fn parse_begin_data_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_full_line(input, eof)?;
+        let line = &input[..input.len() - rest.len()];
+        if Self::is_end_data(line) {
+            self.state = (
+                State::General,
+                Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+            );
+            self.push(input, eof)
+        } else {
+            self.state.0 = State::BeginData4;
+            Ok((rest, Type::InlineData))
+        }
+    }
+    fn parse_begin_data_4<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::BeginData3;
+        Ok((rest, Type::Newline))
+    }
+}
+
+fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
+    line.get(..pattern.len())
+        .map(|prefix| {
+            prefix
+                .eq_ignore_ascii_case(pattern)
+                .then(|| &line[pattern.len()..])
+        })
+        .flatten()
+}
+
+#[cfg(test)]
+mod test {
+    use super::{Mode, Segmenter, Type};
+
+    /*
+        fn check_segmentation(mut input: &str, output: &[(Type, &str)]) {
+            let mut segmenter = Segmenter::new(Mode::Auto, false);
+            for (&exp_type, &exp_s) in output {
+                let (rest, type_) = segmenter.push(input, true).unwrap();
+
+            }
+    }*/
+
+    fn print_segmentation(mut input: &str) {
+        let mut segmenter = Segmenter::new(Mode::Auto, false);
+        loop {
+            let (rest, type_) = segmenter.push(input, true).unwrap();
+            let token = &input[..input.len() - rest.len()];
+            println!("{type_:?} {token:?}");
+            if type_ == Type::End {
+                break;
+            }
+            input = rest;
+        }
+    }
+
+    #[test]
+    fn test_identifiers() {
+        print_segmentation(
+            r#"a ab abc abcd !abcd
+A AB ABC ABCD !ABCD
+aB aBC aBcD !aBcD
+$x $y $z !$z
+grève Ângstrom poté
+#a #b #c ## #d !#d
+@efg @ @@. @#@ !@ 
+## # #12345 #.#
+f@#_.#6
+GhIjK
+.x 1y _z
+"#,
+        );
+    }
+
+    #[test]
+    fn test_identifiers_ending_in_dot() {
+        print_segmentation(
+            r#"abcd. abcd.
+ABCD. ABCD.
+aBcD. aBcD. 
+$y. $z. あいうえお.
+#c. #d..
+@@. @@....
+#.#.
+#abcd.
+.
+. 
+LMNOP. 
+QRSTUV./* end of line comment */
+qrstuv. /* end of line comment */
+QrStUv./* end of line comment */ 
+wxyz./* unterminated end of line comment
+WXYZ. /* unterminated end of line comment
+WxYz./* unterminated end of line comment 
+"#,
+        );
+    }
+
+    #[test]
+    fn test_reserved_words() {
+        print_segmentation(
+            r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+        );
+    }
+
+    #[test]
+    fn test_punctuation() {
+        print_segmentation(
+            r#"~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
+~&|=>=><=<~=<>(),-+*/[[]]**!*
+% : ; ? _ ` { } ~ !*
+"#,
+        );
+    }
+
+    #[test]
+    fn test_positive_numbers() {
+        print_segmentation(
+            r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e- 1.
+"#,
+        );
+    }
+
+    #[test]
+    fn test_negative_numbers() {
+        print_segmentation(
+            r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+        );
+    }
+
+    #[test]
+    fn test_strings() {
+        print_segmentation(
+            r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' ""
+'missing end quote
+"missing double quote
+x"4142" X'5152'
+u'fffd' U"041"
++ new command
++ /* comment */ 'string continuation'
++ /* also a punctuator on blank line
+- 'new command'
+"#,
+        );
+    }
+
+    #[test]
+    fn test_shbang() {
+        print_segmentation(
+            r#"#! /usr/bin/pspp
+title my title.
+#! /usr/bin/pspp
+"#,
+        );
+    }
+
+    #[test]
+    fn test_comment_command() {
+        print_segmentation(
+            r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+   * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+        );
+    }
+
+    #[test]
+    fn test_document_command() {
+        print_segmentation(
+            r#"DOCUMENT one line.
+DOC more
+    than
+        one
+            line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+
+"#,
+        );
+    }
+
+    #[test]
+    fn test_file_label_command() {
+        print_segmentation(
+            r#"FIL label isn't quoted.
+FILE
+  lab 'is quoted'.
+FILE /*
+/**/  lab not quoted here either
+
+"#,
+        );
+    }
+
+        #[test]
+    fn test_begin_data() {
+        print_segmentation(r#"begin data.
+end data.
+
+begin data. /*
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end  data
+end data
+.
+
+begin
+ data.
+data
+end data.
+
+begin data "xxx".
+begin data 123.
+not data
+
+"#);
+    }
+
  }
diff --git a/rust/src/lib.rs b/rust/src/lib.rs

index 933c74ad096911d1db982336f2924b3b22f3f4df..404ac18b67224e4f69db264a6a079363cb5f2130 100644 (file)
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -10,3 +10,4 @@ pub mod locale_charset;
  pub mod raw;
  pub mod sack;
  pub mod lex;
+pub mod prompt;
diff --git a/rust/src/prompt.rs b/rust/src/prompt.rs

new file mode 100644 (file)

index 0000000..71a6b7c
--- /dev/null
+++ b/rust/src/prompt.rs
@@ -0,0 +1,36 @@
+/// Classifies a line of input by where it falls within a command.
+///
+/// NOTE(review): the name suggests this selects the prompt shown when the
+/// line is read interactively; confirm against callers.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum PromptStyle {
+    /// First line of command.
+    First,
+
+    /// Second or later line of command.
+    Later,
+
+    /// Between `BEGIN DATA` and `END DATA`.
+    Data,
+
+    /// `COMMENT` or `*` command.
+    Comment,
+
+    /// `DOCUMENT` command.
+    Document,
+
+    /// `DO REPEAT` command.
+    DoRepeat,
+
+    /// `DEFINE` command.
+    Define,
+}
+
+impl PromptStyle {
+    /// Returns the fixed name of this style.
+    ///
+    /// The positional styles map to lowercase words (`"first"`, `"later"`,
+    /// `"data"`); the command styles map to the uppercase command name
+    /// (`"COMMENT"`, `"DOCUMENT"`, `"DO REPEAT"`, `"DEFINE"`).
+    ///
+    /// This is an inherent method returning a borrowed `&'static str`, so it
+    /// does not allocate.
+    pub fn to_string(&self) -> &'static str {
+        match self {
+            Self::First => "first",
+            Self::Later => "later",
+            Self::Data => "data",
+            Self::Comment => "COMMENT",
+            Self::Document => "DOCUMENT",
+            Self::DoRepeat => "DO REPEAT",
+            Self::Define => "DEFINE",
+        }
+    }
+}
diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at

index abbc08c8cd4b4a037b7f7155ccc799b0cdde6593..80c09779a8732999a97e1d7295fe6b13e01b0250 100644 (file)
--- a/tests/language/lexer/segment.at
+++ b/tests/language/lexer/segment.at
@@ -1648,6 +1648,7 @@ AT_CLEANUP
  # uninitialized data, run with valgrind.  The test will pass either
  # way.  (The bug report has a more complicated crashing case.)
  AT_SETUP([input ends in carriage return])
+AT_KEYWORDS([segment])
  printf '\r' > input
  AT_DATA([expout-base], [dnl
  separate_commands
author	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 8 Jul 2024 00:48:38 +0000 (17:48 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 8 Jul 2024 00:48:38 +0000 (17:48 -0700)
Makefile.am		patch \| blob \| history
rust/Cargo.lock		patch \| blob \| history
rust/Cargo.toml		patch \| blob \| history
rust/src/identifier.rs		patch \| blob \| history
rust/src/lex/command_name.rs	[new file with mode: 0644]	patch \| blob
rust/src/lex/mod.rs		patch \| blob \| history
rust/src/lex/segment.rs		patch \| blob \| history
rust/src/lib.rs		patch \| blob \| history
rust/src/prompt.rs	[new file with mode: 0644]	patch \| blob
tests/language/lexer/segment.at		patch \| blob \| history