From: Ben Pfaff Date: Mon, 8 Jul 2024 00:48:38 +0000 (-0700) Subject: Segmentation code is coming along. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=808bb0b6ee7b60a67f7fd1463df4b0daa73fc4f3;p=pspp Segmentation code is coming along. --- diff --git a/Makefile.am b/Makefile.am index fcc4fe4856..b82bb6bb6f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -131,7 +131,7 @@ dist-hook-git: distfiles (git --version) >/dev/null 2>&1; then \ (cd $(srcdir) && git ls-files) \ | grep -vE '\.gitignore|README.Git|Smake|Bug-administration' \ - | grep -vE '\.gitattributes' \ + | grep -vE '\.gitattributes|rust' \ | LC_ALL=C sort -u > gitfiles; \ LC_ALL=C comm -1 -3 distfiles gitfiles > missing-distfiles; \ if test -s missing-distfiles; then \ diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 8b89b6fb95..4569faca60 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -52,6 +52,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + [[package]] name = "bumpalo" version = "3.13.0" @@ -91,7 +97,7 @@ version = "4.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f3061d6db6d8fcbbd4b05e057f2acace52e64e96b498c08c2d7a4e65addd340" dependencies = [ - "bitflags", + "bitflags 1.3.2", "clap_derive", "clap_lex", "is-terminal", @@ -493,6 +499,7 @@ name = "pspp" version = "1.0.0" dependencies = [ "anyhow", + "bitflags 2.5.0", "chrono", "clap", "encoding_rs", @@ -509,6 +516,7 @@ dependencies = [ "ordered-float", "thiserror", "unicase", + "utf8-decode", "windows-sys 0.48.0", ] @@ -527,7 +535,7 @@ version = "0.36.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644" dependencies = [ - "bitflags", + "bitflags 1.3.2", "errno 0.2.8", "io-lifetimes", "libc", @@ -541,7 +549,7 @@ version = "0.37.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b24138615de35e32031d041a09032ef3487a616d901ca4db224e7d557efae2" dependencies = [ - "bitflags", + "bitflags 1.3.2", "errno 0.3.1", "io-lifetimes", "libc", @@ -651,6 +659,12 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +[[package]] +name = "utf8-decode" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498" + [[package]] name = "version_check" version = "0.9.4" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 5131409ac0..371ac6dff2 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -22,6 +22,8 @@ finl_unicode = "1.2.0" unicase = "2.6.0" libc = "0.2.147" indexmap = "2.1.0" +utf8-decode = "1.0.1" +bitflags = "2.5.0" [target.'cfg(windows)'.dependencies] windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index 1108a46a7e..8e37e64ecb 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -12,29 +12,57 @@ use thiserror::Error as ThisError; use unicase::UniCase; pub trait IdentifierChar { + /// Returns true if `self` is an ASCII character that may be the first + /// character in an identifier. + fn ascii_may_start_id(self) -> bool; + /// Returns true if `self` may be the first character in an identifier. fn may_start_id(self) -> bool; + /// Returns true if `self` is an ASCII character that may be a second or + /// subsequent character in an identifier. 
+ fn ascii_may_continue_id(self) -> bool; + /// Returns true if `self` may be a second or subsequent character in an /// identifier. fn may_continue_id(self) -> bool; } impl IdentifierChar for char { + fn ascii_may_start_id(self) -> bool { + matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$') + } + fn may_start_id(self) -> bool { - use MajorCategory::*; + if self < '\u{0080}' { + self.ascii_may_start_id() + } else { + use MajorCategory::*; - ([L, M, S].contains(&self.get_major_category()) || "@#$".contains(self)) - && self != char::REPLACEMENT_CHARACTER + ([L, M, S].contains(&self.get_major_category()) || "@#$".contains(self)) + && self != char::REPLACEMENT_CHARACTER + } + } + + fn ascii_may_continue_id(self) -> bool { + self.ascii_may_start_id() || matches!(self, '0'..='9' | '.' | '_') } fn may_continue_id(self) -> bool { - use MajorCategory::*; + if self < '\u{0080}' { + self.ascii_may_continue_id() + } else { + use MajorCategory::*; - ([L, M, S, N].contains(&self.get_major_category()) || "@#$._".contains(self)) - && self != char::REPLACEMENT_CHARACTER + ([L, M, S, N].contains(&self.get_major_category()) || "@#$._".contains(self)) + && self != char::REPLACEMENT_CHARACTER + } } } +#[test] +fn gc() { + println!("{:?}", '<'.get_major_category()); +} #[derive(Clone, Debug, ThisError)] pub enum Error { @@ -66,7 +94,7 @@ pub enum Error { }, } -fn is_reserved_word(s: &str) -> bool { +pub fn is_reserved_word(s: &str) -> bool { for word in [ "and", "or", "not", "eq", "ge", "gt", "le", "ne", "all", "by", "to", "with", ] { @@ -151,6 +179,47 @@ impl Identifier { } } + + +/// Returns true if `token` is a case-insensitive match for `keyword`. +/// +/// Keywords match `keyword` and `token` are identical, or `token` is at least 3 +/// characters long and those characters are identical to `keyword` or differ +/// only in case. +/// +/// `keyword` must be ASCII. 
It's normally a constant string, so it's declared +/// as `&'static str` to make it harder to reverse the argument order. But +/// there's no reason that a non-static string won't work, so use +/// [`id_match_n_nonstatic`] instead if you need it. +pub fn id_match(keyword: &'static str, token: &str) -> bool { + id_match_n(keyword, token, 3) +} + +/// Returns true if `token` is a case-insensitive match for at least the first +/// `n` characters of `keyword`. +/// +/// `keyword` must be ASCII. It's normally a constant string, so it's declared +/// as `&'static str` to make it harder to reverse the argument order. But +/// there's no reason that a non-static string won't work, so use +/// [`id_match_n_nonstatic`] instead if you need it. +pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool { + id_match_n_nonstatic(keyword, token, n) +} + +/// Returns true if `token` is a case-insensitive match for at least the first +/// `n` characters of `keyword`. +/// +/// `keyword` must be ASCII. +pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool { + debug_assert!(keyword.is_ascii()); + let keyword_prefix = if (n..keyword.len()).contains(&token.len()) { + &keyword[..token.len()] + } else { + keyword + }; + keyword_prefix.eq_ignore_ascii_case(token) +} + impl Display for Identifier { fn fmt(&self, f: &mut Formatter) -> FmtResult { write!(f, "{}", self.0) diff --git a/rust/src/lex/command_name.rs b/rust/src/lex/command_name.rs new file mode 100644 index 0000000000..208bd457b9 --- /dev/null +++ b/rust/src/lex/command_name.rs @@ -0,0 +1,298 @@ +use crate::identifier::id_match_n_nonstatic; + +pub struct Match { + pub exact: bool, + pub missing_words: isize, +} + +fn count_words(s: &str) -> isize { + s.split_whitespace().count() as isize +} + +/// Compares `string` obtained from the user against the full name of a `command`, +/// using this algorithm: +/// +/// 1. Divide `command` into words `c[0]` through `c[n - 1]`. +/// +/// 2. 
Divide `string` into words `s[0]` through `s[m - 1]`. +/// +/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the keyword +/// matching algorithm implemented by `id_match_n_nonstatic`. If any of them fail to +/// match, then `string` does not match `command` and the function returns `None`. +/// +/// 4. Otherwise, `string` and `command` match. Set `missing_words` to `n - m`. Set +/// `exact` to false if any of the `s[i]` were found to be abbreviated in the +/// comparisons done in step 3, or to true if they were all exactly equal +/// (modulo case). Return `Some(Match)`. +pub fn command_match(command: &str, string: &str) -> Option<Match> { + let mut command_words = command.split_whitespace(); + let mut string_words = string.split_whitespace(); + let mut exact = true; + loop { + let Some(cw) = command_words.next() else { + return Some(Match { + exact, + missing_words: -(string_words.count() as isize), // words of `string` left over + }); + }; + let Some(sw) = string_words.next() else { + return Some(Match { + exact, + missing_words: 1 + command_words.count() as isize, // `cw` plus the rest of `command` + }); + }; + if !id_match_n_nonstatic(cw, sw, 3) { + return None; + } + if sw.len() < cw.len() { + exact = false; + } + } +} + +pub const COMMAND_NAMES: &'static [&'static str] = &[ + "2SLS", + "ACF", + "ADD DOCUMENT", + "ADD FILES", + "ADD VALUE LABELS", + "AGGREGATE", + "ALSCAL", + "ANACOR", + "ANOVA", + "APPLY DICTIONARY", + "AUTORECODE", + "BEGIN DATA", + "BREAK", + "CACHE", + "CASEPLOT", + "CASESTOVARS", + "CATPCA", + "CATREG", + "CCF", + "CD", + "CLEAR TRANSFORMATIONS", + "CLOSE FILE HANDLE", + "CLUSTER", + "COMPUTE", + "CONJOINT", + "CORRELATIONS", + "CORRESPONDENCE", + "COUNT", + "COXREG", + "CREATE", + "CROSSTABS", + "CSDESCRIPTIVES", + "CSGLM", + "CSLOGISTIC", + "CSPLAN", + "CSSELECT", + "CSTABULATE", + "CTABLES", + "CURVEFIT", + "DATA LIST", + "DATAFILE ATTRIBUTE", + "DATASET ACTIVATE", + "DATASET CLOSE", + "DATASET COPY", + "DATASET DECLARE", + "DATASET DISPLAY", + "DATASET NAME", + "DATE", + "DEBUG EVALUATE", + "DEBUG EXPAND", + "DEBUG FLOAT 
FORMAT", + "DEBUG FORMAT GUESSER", + "DEBUG MATRIX READ", + "DEBUG MOMENTS", + "DEBUG PAPER SIZE", + "DEBUG POOL", + "DEBUG XFORM FAIL", + "DEFINE", + "DELETE VARIABLES", + "DESCRIPTIVES", + "DETECTANOMALY", + "DISCRIMINANT", + "DISPLAY MACROS", + "DISPLAY VARIABLE SETS", + "DISPLAY", + "DO IF", + "DO REPEAT", + "DOCUMENT", + "DROP DOCUMENTS", + "ECHO", + "EDIT", + "ELSE IF", + "ELSE", + "END CASE", + "END FILE TYPE", + "END FILE", + "END IF", + "END LOOP", + "END REPEAT", + "ERASE", + "EXAMINE", + "EXECUTE", + "EXIT", + "EXPORT", + "FACTOR", + "FILE HANDLE", + "FILE LABEL", + "FILE TYPE", + "FILTER", + "FINISH", + "FIT", + "FLIP", + "FORMATS", + "FREQUENCIES", + "GENLOG", + "GET DATA", + "GET TRANSLATE", + "GET", + "GGRAPH", + "GLM", + "GRAPH", + "HILOGLINEAR", + "HOMALS", + "HOST", + "IF", + "IGRAPH", + "IMPORT", + "INCLUDE", + "INFO", + "INPUT PROGRAM", + "INSERT", + "KEYED DATA LIST", + "KM", + "LEAVE", + "LIST", + "LOGISTIC REGRESSION", + "LOGLINEAR", + "LOOP", + "MANOVA", + "MAPS", + "MATCH FILES", + "MATRIX DATA", + "MATRIX", + "MCONVERT", + "MEANS", + "MISSING VALUES", + "MIXED", + "MODEL CLOSE", + "MODEL HANDLE", + "MODEL LIST", + "MODEL NAME", + "MRSETS", + "MULT RESPONSE", + "MULTIPLE CORRESPONDENCE", + "MVA", + "N OF CASES", + "N", + "NAIVEBAYES", + "NEW FILE", + "NLR", + "NOMREG", + "NONPAR CORR", + "NPAR TESTS", + "NUMBERED", + "NUMERIC", + "OLAP CUBES", + "OMS", + "ONEWAY", + "ORTHOPLAN", + "OUTPUT MODIFY", + "OVERALS", + "PACF", + "PARTIAL CORR", + "PEARSON CORRELATIONS", + "PERMISSIONS", + "PLANCARDS", + "PLUM", + "POINT", + "PPLOT", + "PREDICT", + "PREFSCAL", + "PRESERVE", + "PRINCALS", + "PRINT EJECT", + "PRINT FORMATS", + "PRINT SPACE", + "PRINT", + "PROBIT", + "PROCEDURE OUTPUT", + "PROXIMITIES", + "PROXSCAL", + "Q", + "QUICK CLUSTER", + "QUIT", + "RANK", + "RATIO STATISTICS", + "READ MODEL", + "RECODE", + "RECORD TYPE", + "REFORMAT", + "REGRESSION", + "RELIABILITY", + "RENAME VARIABLES", + "REPEATING DATA", + "REPORT", + "REREAD", + "RESTORE", 
+ "RMV", + "ROC", + "SAMPLE", + "SAVE DATA COLLECTION", + "SAVE TRANSLATE", + "SAVE", + "SCRIPT", + "SEASON", + "SELECT IF", + "SELECTPRED", + "SET", + "SHOW", + "SORT CASES", + "SORT VARIABLES", + "SPCHART", + "SPECTRA", + "SPLIT FILE", + "STEMLEAF", + "STRING", + "SUBTITLE", + "SUMMARIZE", + "SURVIVAL", + "SYSFILE INFO", + "T-TEST", + "TDISPLAY", + "TEMPORARY", + "TITLE", + "TREE", + "TSAPPLY", + "TSET", + "TSHOW", + "TSMODEL", + "TSPLOT", + "TWOSTEP CLUSTER", + "UNIANOVA", + "UNNUMBERED", + "UPDATE", + "USE", + "VALIDATEDATA", + "VALUE LABELS", + "VARCOMP", + "VARIABLE ALIGNMENT", + "VARIABLE ATTRIBUTE", + "VARIABLE LABELS", + "VARIABLE LEVEL", + "VARIABLE ROLE", + "VARIABLE WIDTH", + "VARSTOCASES", + "VECTOR", + "VERIFY", + "WEIGHT", + "WLS", + "WRITE FORMATS", + "WRITE", + "XEXPORT", + "XGRAPH", + "XSAVE", +]; diff --git a/rust/src/lex/mod.rs b/rust/src/lex/mod.rs index c5780e0db5..2047837df4 100644 --- a/rust/src/lex/mod.rs +++ b/rust/src/lex/mod.rs @@ -1 +1,2 @@ pub mod segment; +pub mod command_name; diff --git a/rust/src/lex/segment.rs b/rust/src/lex/segment.rs index 597e5766c5..d52dd888fc 100644 --- a/rust/src/lex/segment.rs +++ b/rust/src/lex/segment.rs @@ -19,6 +19,14 @@ //! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior //! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE). +use crate::{ + identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar}, + prompt::PromptStyle, +}; +use bitflags::bitflags; + +use super::command_name::{command_match, COMMAND_NAMES}; + /// Segmentation mode. /// /// PSPP syntax is written in one of two modes which are broadly defined as @@ -47,7 +55,7 @@ pub enum Mode { } /// The type of a segment. -#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum Type { Number, QuotedString, @@ -75,9 +83,1456 @@ pub enum Type { End, ExpectedQuote, ExpectedExponent, - UnexpectedChar + UnexpectedChar, } +bitflags! 
{ + #[derive(Copy, Clone)] + pub struct Substate: u8 { + const START_OF_LINE = 1; + const START_OF_COMMAND = 2; + } +} + +#[derive(Copy, Clone)] pub struct Segmenter { - state: State + state: (State, Substate), + nest: u8, + mode: Mode, +} + +#[derive(Copy, Clone, Debug)] +pub struct Incomplete; + +impl Segmenter { + /// Returns a segmenter with the given syntax `mode`. + /// + /// If `is_snippet` is false, then the segmenter will parse as if it's being + /// given a whole file. This means, for example, that it will interpret `-` + /// or `+` at the beginning of the syntax as a separator between commands + /// (since `-` or `+` at the beginning of a line has this meaning). + /// + /// If `is_snippet` is true, then the segmenter will parse as if it's being + /// given an isolated piece of syntax. This means that, for example, that + /// it will interpret `-` or `+` at the beginning of the syntax as an + /// operator token or (if followed by a digit) as part of a number. + pub fn new(mode: Mode, is_snippet: bool) -> Self { + Self { + state: if is_snippet { + (State::General, Substate::empty()) + } else { + (State::Shbang, Substate::empty()) + }, + mode, + nest: 0, + } + } + + pub fn mode(&self) -> Mode { + self.mode + } + + fn start_of_line(&self) -> bool { + self.state.1.contains(Substate::START_OF_LINE) + } + + fn start_of_command(&self) -> bool { + self.state.1.contains(Substate::START_OF_COMMAND) + } + + /// Returns the style of command prompt to display to an interactive user + /// for input in the current state.. The return value is most accurate in + /// mode `Mode::Interactive` and at the beginning of a line (that is, if + /// [`Segmenter::push`] consumed as much as possible of the input up to a + /// new-line). 
+ pub fn prompt(&self) -> PromptStyle { + match self.state.0 { + State::Shbang => PromptStyle::First, + State::General => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::Comment1 | State::Comment2 => PromptStyle::Comment, + State::Document1 | State::Document2 => PromptStyle::Document, + State::Document3 => PromptStyle::First, + State::FileLabel1 => PromptStyle::Later, + State::FileLabel2 | State::FileLabel3 => PromptStyle::First, + State::DoRepeat1 | State::DoRepeat2 => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::DoRepeat3 => PromptStyle::DoRepeat, + State::Define1 | State::Define2 | State::Define3 => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define, + State::BeginData1 => PromptStyle::First, + State::BeginData2 => PromptStyle::Later, + State::BeginData3 | State::BeginData4 => PromptStyle::Data, + } + } + + /// Attempts to label a prefix of the remaining input with a segment type. + /// The caller supplies a prefix of the remaining input as `input`. If + /// `eof` is true, then `input` is the entire (remainder) of the input; if + /// `eof` is false, then further input is potentially available. + /// + /// The input may contain '\n' or '\r\n' line ends in any combination. + /// + /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes + /// in the segment at the beginning of `input` (a number in + /// `0..=input.len()`) and the type of that segment. The next call should + /// not include those bytes in `input`, because they have (figuratively) + /// been consumed by the segmenter. + /// + /// Segments can have zero length, including segment types `Type::End`, + /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and + /// `Type::Spaces`. 
+ /// + /// Failure occurs only if the segment type of the bytes in `input` cannot + /// yet be determined. In this case, this function returns `Err(Incomplete)`. If + /// more input is available, the caller should obtain some more, then call + /// again with a longer `input`. If this is not enough, the process might + /// need to repeat again and again. If input is exhausted, then the caller + /// may call again setting `eof` to true. This function will never return + /// `Err(Incomplete)` when `eof` is true. + /// + /// The caller must not, in a sequence of calls, supply contradictory input. + /// That is, bytes provided as part of `input` in one call, but not + /// consumed, must not be provided with *different* values on subsequent + /// calls. This is because the function must often make decisions based on + /// looking ahead beyond the bytes that it consumes. + pub fn push<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { + if input.is_empty() { + if eof { + return Ok((input, Type::End)); + } else { + return Err(Incomplete); + }; + } + + match self.state.0 { + State::Shbang => return self.parse_shbang(input, eof), + State::General => { + if self.start_of_line() { + self.parse_start_of_line(input, eof) + } else { + self.parse_mid_command(input, eof) + } + } + State::Comment1 => self.parse_comment_1(input, eof), + State::Comment2 => self.parse_comment_2(input, eof), + State::Document1 => self.parse_document_1(input, eof), + State::Document2 => self.parse_document_2(input, eof), + State::Document3 => self.parse_document_3(input, eof), + State::FileLabel1 => self.parse_file_label_1(input, eof), + State::FileLabel2 => self.parse_file_label_2(input, eof), + State::FileLabel3 => self.parse_file_label_3(input, eof), + State::DoRepeat1 => self.parse_do_repeat_1(input, eof), + State::DoRepeat2 => self.parse_do_repeat_2(input, eof), + State::DoRepeat3 => self.parse_do_repeat_3(input, eof), + State::Define1 => self.parse_define_1_2(input, 
eof), + State::Define2 => self.parse_define_1_2(input, eof), + State::Define3 => self.parse_define_3(input, eof), + State::Define4 => self.parse_define_4_5(input, eof), + State::Define5 => self.parse_define_4_5(input, eof), + State::Define6 => self.parse_define_6(input, eof), + State::BeginData1 => self.parse_begin_data_1(input, eof), + State::BeginData2 => self.parse_begin_data_2(input, eof), + State::BeginData3 => self.parse_begin_data_3(input, eof), + State::BeginData4 => self.parse_begin_data_4(input, eof), + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum State { + Shbang, + General, + Comment1, + Comment2, + Document1, + Document2, + Document3, + FileLabel1, + FileLabel2, + FileLabel3, + DoRepeat1, + DoRepeat2, + DoRepeat3, + Define1, + Define2, + Define3, + Define4, + Define5, + Define6, + BeginData1, + BeginData2, + BeginData3, + BeginData4, +} + +fn take(input: &str, eof: bool) -> Result<(Option, &str), Incomplete> { + let mut iter = input.chars(); + match iter.next() { + None if !eof => Err(Incomplete), + c => Ok((c, iter.as_str())), + } +} + +fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input), + '*' => { + if let (Some('/'), rest) = take(rest, eof)? { + return Ok(rest); + } + } + _ => (), + }; + input = rest; + } +} + +fn skip_matching(f: F, input: &str, eof: bool) -> Result<&str, Incomplete> +where + F: Fn(char) -> bool, +{ + let input = input.trim_start_matches(f); + if input.is_empty() && !eof { + Err(Incomplete) + } else { + Ok(input) + } +} + +fn match_char(f: F, input: &str, eof: bool) -> Result, Incomplete> +where + F: Fn(char) -> bool, +{ + if let (Some(c), rest) = take(input, eof)? 
{ + if f(c) { + return Ok(Some(rest)); + } + } + Ok(None) +} + +fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), + c if c.is_whitespace() => (), + _ => return Ok(input), + } + input = rest; + } +} + +fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> { + skip_matching(|c| c.is_ascii_digit(), input, eof) +} + +fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '/' => { + let (c, rest2) = take(rest, eof)?; + match c { + Some('*') => input = skip_comment(rest2, eof)?, + Some(_) | None => return Ok(input), // a lone `/` is not a comment: leave it unconsumed + } + } + '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), + c if c.is_whitespace() => input = rest, + _ => return Ok(input), + }; + } +} + +fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> { + let (Some(c), rest) = take(input, eof)? else { + return Ok(false); + }; + match c { + 'x' | 'X' | 'u' | 'U' => Ok({ + let (c, _rest) = take(rest, eof)?; // look at the character after the x/u prefix + c == Some('\'') || c == Some('"') + }), + '\'' | '"' | '\n' => Ok(true), + _ => Ok(false), + } +} + +fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> { + let (Some(c), rest) = take(input, eof)? 
else { + return Ok(true); + }; + Ok(match c { + '\n' => true, + '\r' => take(rest, eof)?.0 == Some('\n'), + _ => false, + }) +} + +fn at_end_of_line(input: &str, eof: bool) -> Result { + let input = skip_spaces_and_comments(input, eof)?; + is_end_of_line(input, eof) +} + +fn first(s: &str) -> char { + s.chars().next().unwrap() +} +fn get_command_name_candidates(target: &str) -> &[&'static str] { + if target.is_empty() { + return &[]; + } + let target_first = first(target).to_ascii_uppercase(); + let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first); + let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first); + &COMMAND_NAMES[low..high] +} + +fn detect_command_name(input: &str, eof: bool) -> Result { + let command_name = input + .split(|c: char| !(c.is_whitespace() || c.may_continue_id() || c == '-')) + .next() + .unwrap(); + if !eof && command_name.len() == input.len() { + return Err(Incomplete); + } + let string = command_name.strip_suffix('.').unwrap_or(command_name); + for command in get_command_name_candidates(command_name) { + if let Some(m) = command_match(command, string) { + if m.missing_words <= 0 { + return Ok(true); + } + } + } + Ok(false) +} + +impl Segmenter { + fn parse_shbang<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let (c, rest) = take(input, eof)?; + if c == Some('#') { + if let (Some('!'), rest) = take(rest, eof)? 
{ + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((self.parse_full_line(rest, eof)?, Type::Shbang)); + } + } + + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + self.push(input, eof) + } + fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> { + match self.mode { + Mode::Auto => detect_command_name(input, eof), + Mode::Interactive => Ok(false), + Mode::Batch => Ok(true), + } + } + fn parse_start_of_line<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + debug_assert_eq!(self.state.0, State::General); + debug_assert!(self.start_of_line()); + debug_assert!(!input.is_empty()); + + let (Some(c), rest) = take(input, eof).unwrap() else { + unreachable!() + }; + match c { + '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => { + // This `+` is punctuation that may separate pieces of a string. + self.state = (State::General, Substate::empty()); + return Ok((rest, Type::Punct)); + } + '+' | '-' | '.' => { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((rest, Type::StartCommand)); + } + c if c.is_whitespace() => { + if at_end_of_line(input, eof)? { // scan from `input`: `c` itself may be the newline + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Type::SeparateCommands)); + } + } + _ => { + if self.at_command_start(input, eof)? { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Type::StartCommand)); + } + } + } + self.state.1 = Substate::START_OF_COMMAND; + self.parse_mid_command(input, eof) + } + fn parse_mid_command<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + debug_assert!(self.state.0 == State::General); + debug_assert!(!self.state.1.contains(Substate::START_OF_LINE)); + let (Some(c), rest) = take(input, eof)? else { + unreachable!() + }; + match c { + '\r' | '\n' if is_end_of_line(input, eof)? 
=> { + self.state.1 |= Substate::START_OF_LINE; + Ok(( + self.parse_newline(input, eof).unwrap().unwrap(), + Type::Newline, + )) + } + '/' => { + if let (Some('*'), rest) = take(rest, eof)? { + let rest = skip_comment(rest, eof)?; + return Ok((rest, Type::Comment)); + } else { + self.state.1 = Substate::empty(); + return Ok((rest, Type::Punct)); + } + } + '-' => { + let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?; + match c { + Some(c) if c.is_ascii_digit() => { + return self.parse_number(skip_spaces(rest, eof)?, eof); // the number starts after any spaces + } + Some('.') => { + if let (Some(c), _rest) = take(rest2, eof)? { + if c.is_ascii_digit() { + return self.parse_number(skip_spaces(rest, eof)?, eof); + } + } + } + None | Some(_) => (), + } + self.state.1 = Substate::empty(); + return Ok((rest, Type::Punct)); + } + '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => { + self.state.1 = Substate::empty(); + return Ok((rest, Type::Punct)); + } + '*' => { + if self.state.1.contains(Substate::START_OF_COMMAND) { + self.state.0 = State::Comment1; + self.parse_comment_1(input, eof) + } else { + self.parse_digraph(&['*'], rest, eof) // pass `rest` so that `**` is a single token + } + } + '<' => self.parse_digraph(&['=', '>'], rest, eof), + '>' => self.parse_digraph(&['='], rest, eof), + '~' => self.parse_digraph(&['='], rest, eof), + '.' => match take(rest, eof)? { + (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof), + (Some('\r' | '\n'), _) if is_end_of_line(rest, eof)? => { + self.state.1 = Substate::START_OF_COMMAND; + Ok((rest, Type::EndCommand)) + } + _ => Ok((rest, Type::Punct)), + }, + '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => { + self.parse_number(input, eof) + } + 'u' | 'U' => self.maybe_parse_string(Type::UnicodeString, (input, rest), eof), + 'x' | 'X' => self.maybe_parse_string(Type::HexString, (input, rest), eof), + '\'' | '"' => self.parse_string(Type::QuotedString, c, rest, eof), + '!' 
=> { + let (c, rest2) = take(rest, eof)?; + match c { + Some('*') => Ok((rest2, Type::MacroId)), + Some(_) => self.parse_id(input, eof), + None => Ok((rest, Type::Punct)), + } + } + c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Type::Spaces)), + c if c.may_start_id() => self.parse_id(input, eof), + '!'..='~' if c != '\\' && c != '^' => { + self.state.1 = Substate::empty(); + Ok((rest, Type::Punct)) + } + _ => { + println!("unexpected {c:?} {:?}", c.is_whitespace()); + self.state.1 = Substate::empty(); + Ok((rest, Type::UnexpectedChar)) + } + } + } + fn parse_string<'a>( + &mut self, + type_: Type, + quote: char, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + break; + }; + if c == quote { + if take(rest, eof)?.0 == Some(quote) { + input = rest; + continue; + } else { + return Ok((rest, type_)); + } + } else if is_end_of_line(input, eof)? { + break; + } + input = rest; + } + self.state.1 = Substate::empty(); + Ok((input, Type::ExpectedQuote)) + } + fn maybe_parse_string<'a>( + &mut self, + type_: Type, + input: (&'a str, &'a str), + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + match take(input.1, eof)? 
{ + (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(type_, c, rest, eof), + _ => self.parse_id(input.0, eof), + } + } + fn next_id_in_command<'a>( + &self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, &'a str), Incomplete> { + let mut sub = Segmenter::new(self.mode, true); + loop { + let (rest, type_) = sub.push(input, eof)?; + match type_ { + Type::Shbang | Type::Spaces | Type::Comment | Type::Newline => (), + + Type::Identifier => return Ok((&input[..input.len() - rest.len()], rest)), + + Type::Number + | Type::QuotedString + | Type::HexString + | Type::UnicodeString + | Type::UnquotedString + | Type::ReservedWord + | Type::Punct + | Type::CommentCommand + | Type::DoRepeatCommand + | Type::InlineData + | Type::MacroId + | Type::MacroName + | Type::MacroBody + | Type::StartDocument + | Type::Document + | Type::StartCommand + | Type::SeparateCommands + | Type::EndCommand + | Type::End + | Type::ExpectedQuote + | Type::ExpectedExponent + | Type::UnexpectedChar => return Ok(("", rest)), + } + input = rest; + } + } + fn parse_id<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { + let (Some(_), mut end) = take(input, eof).unwrap() else { + unreachable!() + }; + while let (Some(c), rest) = take(end, eof)? { + if !c.may_continue_id() { + break; + }; + end = rest; + } + let identifier = &input[..input.len() - end.len()]; + let identifier = match identifier.strip_suffix('.') { + Some(without_dot) if at_end_of_line(end, eof)? 
=> without_dot,
+            _ => identifier,
+        };
+        let rest = &input[identifier.len()..];
+
+        if self.state.1.contains(Substate::START_OF_COMMAND) {
+            if id_match_n("COMMENT", identifier, 4) {
+                self.state.0 = State::Comment1;
+                return self.parse_comment_1(input, eof);
+            } else if id_match("DOCUMENT", identifier) {
+                self.state.0 = State::Document1;
+                return Ok((input, Type::StartDocument));
+            } else if id_match_n("DEFINE", identifier, 6) {
+                self.state.0 = State::Define1;
+            } else if id_match("FILE", identifier) {
+                // FIXME(review): leftover debugging output; it writes to stdout
+                // whenever a command starts with FILE and should be removed
+                // before merging.
+                println!("next={:?}", self.next_id_in_command(rest, eof)?.0);
+                if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
+                    self.state = (State::FileLabel1, Substate::empty());
+                    return Ok((rest, Type::Identifier));
+                }
+            } else if id_match("DO", identifier) {
+                if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
+                    self.state = (State::DoRepeat1, Substate::empty());
+                    return Ok((rest, Type::Identifier));
+                }
+            } else if id_match("BEGIN", identifier) {
+                let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
+                if id_match("DATA", next_id) {
+                    let rest2 = skip_spaces_and_comments(rest2, eof)?;
+                    let rest2 = if let Some(s) = rest2.strip_prefix('.') {
+                        skip_spaces(s, eof)?
+                    } else {
+                        rest2
+                    };
+                    if is_end_of_line(rest2, eof)? {
+                        let s = &input[..input.len() - rest2.len()];
+                        self.state = (
+                            if s.contains('\n') {
+                                State::BeginData1
+                            } else {
+                                State::BeginData2
+                            },
+                            Substate::empty(),
+                        );
+                        return Ok((rest, Type::Identifier));
+                    }
+                }
+            }
+        }
+
+        self.state.1 = Substate::empty();
+        let type_ = if is_reserved_word(identifier) {
+            Type::ReservedWord
+        } else if identifier.starts_with('!') {
+            Type::MacroId
+        } else {
+            Type::Identifier
+        };
+        Ok((rest, type_))
+    }
+
+    /// Segments the optional second character of a punctuator.
+    ///
+    /// If the next character of `input` is one of `seconds`, it is consumed;
+    /// otherwise nothing is consumed.  Either way the segment is reported as
+    /// [`Type::Punct`].
+    ///
+    /// NOTE(review): this reads as if the caller has already consumed the
+    /// punctuator's first character -- confirm at the call sites.
+    fn parse_digraph<'a>(
+        &mut self,
+        seconds: &[char],
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (c, rest) = take(input, eof)?;
+        Ok((
+            match c {
+                Some(c) if seconds.contains(&c) => rest,
+                _ => input,
+            },
+            Type::Punct,
+        ))
+    }
+
+    /// Segments a number.
+    ///
+    /// Consumes a run of digits, then optionally a decimal point followed by
+    /// more digits, then optionally an exponent (`e` or `E`, an optional sign,
+    /// and at least one digit).
+    ///
+    /// A decimal point that has no digits after it is consumed only when
+    /// `at_end_of_line` says that it is not at the end of the line; otherwise
+    /// it is left in the input (presumably so that it can be segmented as the
+    /// end of the command).
+    ///
+    /// An exponent marker that is not followed by any digit yields
+    /// [`Type::ExpectedExponent`] instead of [`Type::Number`].
+    fn parse_number<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut input = skip_digits(input, eof)?;
+        if let Some(rest) = match_char(|c| c == '.', input, eof)? {
+            let rest2 = skip_digits(rest, eof)?;
+            if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
+                input = rest2;
+            }
+        };
+        if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
+            let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
+            let rest2 = skip_digits(rest, eof)?;
+            if rest2.len() == rest.len() {
+                // No digits followed the exponent marker (and optional sign).
+                self.state.1 = Substate::empty();
+                return Ok((rest, Type::ExpectedExponent));
+            }
+            input = rest2;
+        }
+        Ok((input, Type::Number))
+    }
+
+    /// Segments one line of a `COMMENT` command.
+    ///
+    /// Scans to the end of the current line, tracking whether the line is
+    /// blank and where its trailing `.` (if any) is, then reports:
+    ///
+    /// - [`Type::SeparateCommands`] for a blank line or at end of input.  This
+    ///   ends the comment and returns to general segmentation at the start
+    ///   of a command.
+    ///
+    /// - [`Type::CommentCommand`] up to, but not including, a `.` that is the
+    ///   last non-blank character on the line.  This also ends the comment.
+    ///
+    /// - [`Type::CommentCommand`] for the whole line otherwise.  The comment
+    ///   may continue on the next line, which `State::Comment2` decides.
+    fn parse_comment_1<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        /// What has been seen so far on the current line.
+        enum CommentState<'a> {
+            /// Only white space.
+            Blank,
+            /// Something other than white space, not ending in `.`.
+            NotBlank,
+            /// The last non-blank character was a `.`, which starts the
+            /// contained slice of the input.
+            Period(&'a str),
+        }
+        let mut state = CommentState::Blank;
+        loop {
+            let (Some(c), rest) = take(input, eof)? else {
+                // End of file.
+                self.state = (State::General, Substate::START_OF_COMMAND);
+                return Ok((input, Type::SeparateCommands));
+            };
+            match c {
+                '.' => state = CommentState::Period(input),
+                '\n' | '\r' if is_end_of_line(input, eof)? => {
+                    match state {
+                        CommentState::Blank => {
+                            // Blank line ends comment command.
+                            self.state = (State::General, Substate::START_OF_COMMAND);
+                            return Ok((input, Type::SeparateCommands));
+                        }
+                        CommentState::Period(period) => {
+                            // '.' at end of line ends comment command.
+                            self.state = (State::General, Substate::empty());
+                            return Ok((period, Type::CommentCommand));
+                        }
+                        CommentState::NotBlank => {
+                            // Comment continues onto next line.
+                            self.state = (State::Comment2, Substate::empty());
+                            return Ok((input, Type::CommentCommand));
+                        }
+                    }
+                }
+                c if c.is_whitespace() => (),
+                _ => state = CommentState::NotBlank,
+            }
+            input = rest;
+        }
+    }
+
+    /// Segments the newline that follows a line of a `COMMENT` command and
+    /// decides whether the comment continues on the next line.
+    ///
+    /// The next line starts a new command if its first character is `+`, `-`,
+    /// or `.`, or if it is not white space and `at_command_start` says so.
+    /// Otherwise the next line is more of the comment.
+    fn parse_comment_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self
+            .parse_newline(input, eof)?
+            .expect("State::Comment2 is only entered with a newline next in the input");
+
+        let new_command = match take(rest, eof)?.0 {
+            Some('+') | Some('-') | Some('.') => true,
+            Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?,
+            None | Some(_) => false,
+        };
+        if new_command {
+            self.state = (
+                State::General,
+                Substate::START_OF_LINE | Substate::START_OF_COMMAND,
+            );
+        } else {
+            self.state.0 = State::Comment1;
+        }
+        Ok((rest, Type::Newline))
+    }
+
+    /// Segments one line of a `DOCUMENT` command as [`Type::Document`].
+    ///
+    /// `end_cmd` tracks whether the last non-blank character seen on the line
+    /// was a `.`.  If it was, or if the input ends, the command is over and
+    /// `State::Document3` comes next; otherwise the document continues on the
+    /// next line via `State::Document2`.
+    fn parse_document_1<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut end_cmd = false;
+        loop {
+            let (Some(c), rest) = take(input, eof)? else {
+                // End of file.
+                self.state.0 = State::Document3;
+                return Ok((input, Type::Document));
+            };
+            match c {
+                '.' => end_cmd = true,
+                '\n' | '\r' if is_end_of_line(input, eof)? => {
+                    self.state.0 = if end_cmd {
+                        State::Document3
+                    } else {
+                        State::Document2
+                    };
+                    return Ok((input, Type::Document));
+                }
+                c if !c.is_whitespace() => end_cmd = false,
+                _ => (),
+            }
+            input = rest;
+        }
+    }
+
+    /// Segments the newline between two lines of a `DOCUMENT` command, then
+    /// goes back to `State::Document1` for the next line.
+    fn parse_document_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self
+            .parse_newline(input, eof)?
+            .expect("State::Document2 is only entered with a newline next in the input");
+        self.state.0 = State::Document1;
+        Ok((rest, Type::Newline))
+    }
+
+    /// Ends a `DOCUMENT` command by emitting a zero-length
+    /// [`Type::EndCommand`] and returning to general segmentation at the
+    /// start of a line and of a command.
+    fn parse_document_3<'a>(
+        &mut self,
+        input: &'a str,
+        _eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        self.state = (
+            State::General,
+            Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+        );
+        Ok((input, Type::EndCommand))
+    }
+
+    /// Returns true if, after skipping spaces and comments, `input` starts
+    /// with a single quote, a double quote, or a newline, which means that the
+    /// label of a `FILE LABEL` command is to be segmented as ordinary syntax
+    /// rather than as an unquoted string.
+    fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
+        let input = skip_spaces_and_comments(input, eof)?;
+        match take(input, eof)?.0 {
+            Some('\'') | Some('"') | Some('\n') => Ok(true),
+            _ => Ok(false),
+        }
+    }
+
+    /// We are segmenting a `FILE LABEL` command and have reported `FILE`, but
+    /// not yet `LABEL`.
+    ///
+    /// Segments are produced by a general-state copy of the segmenter.  Once
+    /// the `LABEL` identifier shows up, what follows it decides what happens
+    /// next: if the label is quoted, this segmenter simply continues in the
+    /// general state, and otherwise it moves on to `State::FileLabel2`.
+    fn parse_file_label_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut sub = Segmenter {
+            state: (State::General, self.state.1),
+            ..*self
+        };
+        let (rest, type_) = sub.push(input, eof)?;
+        if type_ == Type::Identifier {
+            let id = &input[..input.len() - rest.len()];
+            debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
+            if Self::quoted_file_label(rest, eof)? {
+                *self = sub;
+            } else {
+                self.state.0 = State::FileLabel2;
+            }
+        } else {
+            // Spaces or a comment between FILE and LABEL: only keep the
+            // substate that the sub-segmenter computed.
+            self.state.1 = sub.state.1;
+        }
+        Ok((rest, type_))
+    }
+
+    /// We are segmenting a `FILE LABEL` command with an unquoted label and
+    /// have just reported `LABEL`.  Reports the spaces before the label.
+    fn parse_file_label_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let input = skip_spaces(input, eof)?;
+        self.state.0 = State::FileLabel3;
+        Ok((input, Type::Spaces))
+    }
+
+    /// Segments the unquoted label of a `FILE LABEL` command as
+    /// [`Type::UnquotedString`].
+    ///
+    /// The label runs to the end of the line, except that a `.` that is the
+    /// last non-blank character on the line (and anything after it) is left
+    /// in the input.
+    fn parse_file_label_3<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        // Start of the trailing `.`, if the last non-blank character so far
+        // was one.
+        let mut end_cmd = None;
+        loop {
+            let (c, rest) = take(input, eof)?;
+            match c {
+                None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => {
+                    self.state = (State::General, Substate::empty());
+                    return Ok((end_cmd.unwrap_or(input), Type::UnquotedString));
+                }
+                // NOTE(review): relies on `is_end_of_line` reporting true at
+                // end of input -- confirm.
+                None => unreachable!(),
+                Some('.') => end_cmd = Some(input),
+                Some(c) if !c.is_whitespace() => end_cmd = None,
+                Some(_) => (),
+            }
+            input = rest;
+        }
+    }
+
+    /// Segments `input` as general syntax on behalf of a state that wraps
+    /// general syntax.
+    ///
+    /// The work is done by a temporary segmenter in `State::General` that
+    /// shares this segmenter's mode and substate and has a nesting level of
+    /// 0.  Afterward, only its substate is copied back, so that the caller
+    /// decides which state comes next.
+    fn subparse<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        let mut sub = Segmenter {
+            mode: self.mode,
+            state: (State::General, self.state.1),
+            nest: 0,
+        };
+        let result = sub.push(input, eof)?;
+        self.state.1 = sub.state.1;
+        Ok(result)
+    }
+
+    /// We are segmenting a `DO REPEAT` command, currently reading the syntax
+    /// that defines the stand-in variables (the head) before the lines of
+    /// syntax to be repeated (the body).
+    fn parse_do_repeat_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if type_ == Type::SeparateCommands {
+            // We reached a blank line that separates the head from the body.
+            self.state.0 = State::DoRepeat2;
+        } else if type_ == Type::EndCommand || type_ == Type::StartCommand {
+            // We reached the body.
+            self.state.0 = State::DoRepeat3;
+            self.nest = 1;
+        }
+        Ok((rest, type_))
+    }
+
+    /// We are segmenting a `DO REPEAT` command, currently reading a blank line
+    /// that separates the head from the body.
+    fn parse_do_repeat_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if type_ == Type::Newline {
+            // We reached the body.
+            self.state.0 = State::DoRepeat3;
+            self.nest = 1;
+        }
+        Ok((rest, type_))
+    }
+
+    /// If `input` begins with a newline, either `\n` or `\r\n`, returns the
+    /// input that follows the newline.  Otherwise, including when `input` is
+    /// empty or begins with a `\r` that is not followed by `\n`, returns
+    /// `None`.
+    fn parse_newline<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<&'a str>, Incomplete> {
+        let (Some(c), rest) = take(input, eof)? else {
+            return Ok(None);
+        };
+        match c {
+            '\n' => Ok(Some(rest)),
+            '\r' => {
+                if let (Some('\n'), rest) = take(rest, eof)? {
+                    Ok(Some(rest))
+                } else {
+                    Ok(None)
+                }
+            }
+            _ => Ok(None),
+        }
+    }
+
+    /// Skips to the end of the current line, as judged by `is_end_of_line`,
+    /// and returns the input that starts there.  The line ending itself is
+    /// not consumed.
+    fn parse_full_line<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<&'a str, Incomplete> {
+        loop {
+            if is_end_of_line(input, eof)? {
+                return Ok(input);
+            }
+            // Propagate `Incomplete` instead of panicking on it, as the rest
+            // of the segmenter does.
+            input = take(input, eof)?.1;
+        }
+    }
+
+    /// Adjusts `self.nest` for the line of `DO REPEAT` body that starts
+    /// `input`: `DO REPEAT` increments the nesting level and `END REPEAT`
+    /// decrements it.  A single leading `+` or `-` is ignored.  Any other line
+    /// leaves the nesting level alone.
+    fn check_repeat_command(&mut self, input: &str, eof: bool) -> Result<(), Incomplete> {
+        let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
+        let (id1, input) = self.next_id_in_command(input, eof)?;
+        let up = if id_match("DO", id1) {
+            true
+        } else if id_match("END", id1) {
+            false
+        } else {
+            return Ok(());
+        };
+
+        let (id2, _) = self.next_id_in_command(input, eof)?;
+        if id_match("REPEAT", id2) {
+            if up {
+                self.nest += 1
+            } else {
+                self.nest -= 1
+            };
+        }
+        Ok(())
+    }
+
+    /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that
+    /// are to be repeated.  Report each line of syntax as a single
+    /// [`Type::DoRepeatCommand`].
+    ///
+    /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT`
+    /// blocks inside the lines we're segmenting.  `self.nest` counts the
+    /// nesting level, starting at 1.
+    fn parse_do_repeat_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        if let Some(rest) = self.parse_newline(input, eof)? {
+            return Ok((rest, Type::Newline));
+        }
+        let rest = self.parse_full_line(input, eof)?;
+        self.check_repeat_command(input, eof)?;
+        if self.nest == 0 {
+            // Nesting level dropped to 0, so we've finished reading the `DO
+            // REPEAT` body.  Segment the `END REPEAT` line itself as general
+            // syntax.
+            self.state = (
+                State::General,
+                Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+            );
+            self.push(input, eof)
+        } else {
+            Ok((rest, Type::DoRepeatCommand))
+        }
+    }
+
+    /// We are segmenting a `DEFINE` command, which consists of:
+    ///
+    /// - The `DEFINE` keyword.
+    ///
+    /// - An identifier.  We transform this into `Type::MacroName` instead of
+    ///   `Type::Identifier` or `Type::MacroId` because this identifier must
+    ///   never be macro-expanded.
+    ///
+    /// - Anything but `(`.
+    ///
+    /// - `(` followed by a sequence of tokens possibly including balanced
+    ///   parentheses up to a final `)`.
+    ///
+    /// - A sequence of any number of lines, one string per line, ending with
+    ///   `!ENDDEFINE`.  The first line is usually blank (that is, a newline
+    ///   follows the `(`).  The last line usually just has `!ENDDEFINE.` on
+    ///   it, but it can start with other tokens.  The whole
+    ///   DEFINE...!ENDDEFINE can be on a single line, even.
+    fn parse_define_1_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        match type_ {
+            Type::Identifier | Type::MacroId if self.state.0 == State::Define1 => {
+                self.state.0 = State::Define2;
+                return Ok((rest, Type::MacroName));
+            }
+            Type::SeparateCommands | Type::EndCommand | Type::StartCommand => {
+                // The DEFINE command is malformed because we reached its end
+                // without ever hitting a `(` token.  Transition back to general
+                // parsing.
+                self.state.0 = State::General;
+            }
+            // The segment just produced starts at `input`, so that is what
+            // must be inspected; `rest` is whatever follows the segment.
+            Type::Punct if input.starts_with('(') => {
+                self.state.0 = State::Define3;
+                self.nest = 1;
+            }
+            _ => (),
+        }
+        Ok((rest, type_))
+    }
+
+    /// We are segmenting a `DEFINE` command, inside the parenthesized
+    /// argument definitions.  `self.nest` counts the unclosed parentheses,
+    /// starting at 1.  When it drops to 0, the macro body follows.
+    fn parse_define_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        match type_ {
+            Type::SeparateCommands | Type::EndCommand | Type::StartCommand => {
+                // The DEFINE command is malformed because we reached its end
+                // before the parentheses were balanced.  Transition back to
+                // general parsing.
+                self.state.0 = State::General;
+            }
+            // As in `parse_define_1_2`, the punctuator itself is at the start
+            // of `input`, not of `rest`.
+            Type::Punct if input.starts_with('(') => {
+                self.nest += 1;
+            }
+            Type::Punct if input.starts_with(')') => {
+                self.nest -= 1;
+                if self.nest == 0 {
+                    self.state = (State::Define4, Substate::empty());
+                }
+            }
+            _ => (),
+        }
+        Ok((rest, type_))
+    }
+
+    /// Searches `input`, which is treated as complete, for `!ENDDEFINE` (in
+    /// any case) outside of comments and quoted strings.  Returns the part of
+    /// `input` that starts with `!ENDDEFINE`, or `None` if there is none or if
+    /// a quoted string is unterminated.
+    fn find_enddefine(mut input: &str) -> Option<&str> {
+        loop {
+            input = skip_spaces_and_comments(input, true).unwrap();
+            let (Some(c), rest) = take(input, true).unwrap() else {
+                return None;
+            };
+            match c {
+                '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => {
+                    return Some(input)
+                }
+                '\'' | '"' => {
+                    // Skip past the matching closing quote.
+                    let index = rest.find(c)?;
+                    input = &rest[index + 1..];
+                }
+                _ => input = rest,
+            }
+        }
+    }
+
+    /// We are in the body of a macro definition, looking for additional lines
+    /// of the body or `!ENDDEFINE`.
+    ///
+    /// In `State::Define4`, we're parsing the first line of the macro body (the
+    /// same line as the closing parenthesis in the argument definition).  In
+    /// `State::Define5`, we're on a later line.
+    fn parse_define_4_5<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_full_line(input, eof)?;
+        let line = &input[..input.len() - rest.len()];
+        if let Some(end) = Self::find_enddefine(line) {
+            // Macro ends at the !ENDDEFINE on this line.
+            self.state = (State::General, Substate::empty());
+
+            // `end` is a suffix of `line`, not of `input`, so the length of
+            // what precedes it has to be measured against `line`.  The
+            // remainder that we return must be a suffix of `input`, so that
+            // the caller does not lose the input that follows this line.
+            let (prefix, rest) = input.split_at(line.len() - end.len());
+            if prefix.is_empty() {
+                // Line starts with `!ENDDEFINE`.
+                self.push(input, eof)
+            } else if prefix.trim().is_empty() {
+                // Line starts with spaces followed by `!ENDDEFINE`.
+                Ok((rest, Type::Spaces))
+            } else {
+                // Line starts with some content followed by `!ENDDEFINE`.
+                Ok((rest, Type::MacroBody))
+            }
+        } else {
+            // No `!ENDDEFINE`.  We have a full line of macro body.
+            //
+            // The line might be blank, whether completely empty or just spaces
+            // and comments.  That's OK: we need to report blank lines because
+            // they can have significance.
+            //
+            // However, if the first line of the macro body is blank, we just
+            // report it as spaces because it's not significant.
+            let type_ = if self.state.0 == State::Define4 && line.trim().is_empty() {
+                Type::Spaces
+            } else {
+                Type::MacroBody
+            };
+            self.state.0 = State::Define6;
+            Ok((rest, type_))
+        }
+    }
+
+    /// Segments the newline that ends a line of macro body, then goes on to
+    /// the next line in `State::Define5`.
+    fn parse_define_6<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self
+            .parse_newline(input, eof)?
+            .expect("State::Define6 expects the input to start with a newline");
+        self.state.0 = State::Define5;
+        Ok((rest, Type::Newline))
+    }
+
+    /// We are segmenting a `BEGIN DATA` command whose syntax spans more than
+    /// one line.  Segments it as general syntax until the first newline, then
+    /// moves on to `State::BeginData2`.
+    fn parse_begin_data_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if type_ == Type::Newline {
+            self.state.0 = State::BeginData2;
+        }
+        Ok((rest, type_))
+    }
+
+    /// We are segmenting the last line of a `BEGIN DATA` command.  Segments
+    /// it as general syntax through the newline that ends it, after which the
+    /// inline data starts in `State::BeginData3`.
+    fn parse_begin_data_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if type_ == Type::Newline {
+            self.state.0 = State::BeginData3;
+        }
+        Ok((rest, type_))
+    }
+
+    /// Returns true if `line` ends inline data, that is, if it consists of
+    /// `END`, exactly one white-space character, and `DATA` (in any case),
+    /// optionally followed by white space and at most one `.`.
+    ///
+    /// The period is optional: `END DATA` alone on a line also ends the data.
+    fn is_end_data(line: &str) -> bool {
+        let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else {
+            return false;
+        };
+        let (Some(c), rest) = take(rest, true).unwrap() else {
+            return false;
+        };
+        if !c.is_whitespace() {
+            return false;
+        };
+        let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else {
+            return false;
+        };
+
+        // Only needed to reject a second period.
+        let mut endcmd = false;
+        for c in rest.chars() {
+            match c {
+                '.' if endcmd => return false,
+                '.' => endcmd = true,
+                c if c.is_whitespace() => (),
+                _ => return false,
+            }
+        }
+        true
+    }
+
+    /// We are between `BEGIN DATA` and `END DATA`, at the start of a line.
+    /// Reports the line as [`Type::InlineData`], unless it is `END DATA`, in
+    /// which case it is segmented as general syntax that starts a new
+    /// command.
+    fn parse_begin_data_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_full_line(input, eof)?;
+        let line = &input[..input.len() - rest.len()];
+        if Self::is_end_data(line) {
+            self.state = (
+                State::General,
+                Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+            );
+            self.push(input, eof)
+        } else {
+            self.state.0 = State::BeginData4;
+            Ok((rest, Type::InlineData))
+        }
+    }
+
+    /// Segments the newline that ends a line of inline data, then goes on to
+    /// the next line in `State::BeginData3`.
+    fn parse_begin_data_4<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self
+            .parse_newline(input, eof)?
+            .expect("State::BeginData4 expects the input to start with a newline");
+        self.state.0 = State::BeginData3;
+        Ok((rest, Type::Newline))
+    }
+}
+
+/// If `line` starts with `pattern`, ignoring ASCII case, returns the rest of
+/// `line` after the matched prefix.  Otherwise, including when `line` is
+/// shorter than `pattern` or when `pattern.len()` is not a character boundary
+/// in `line`, returns `None`.
+fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
+    let prefix = line.get(..pattern.len())?;
+    prefix
+        .eq_ignore_ascii_case(pattern)
+        .then(|| &line[pattern.len()..])
+}
+
+#[cfg(test)]
+mod test {
+    use super::{Mode, Segmenter, Type};
+
+    /*
+    fn check_segmentation(mut input: &str, output: &[(Type, &str)]) {
+        let mut segmenter = Segmenter::new(Mode::Auto, false);
+        for (&exp_type, &exp_s) in output {
+            let (rest, type_) = segmenter.push(input, true).unwrap();
+
+        }
+    }*/
+
+    fn print_segmentation(mut input: &str) {
+        let mut segmenter = Segmenter::new(Mode::Auto, false);
+        loop {
+            let (rest, type_) = segmenter.push(input, true).unwrap();
+            let token = &input[..input.len() - rest.len()];
+            println!("{type_:?} {token:?}");
+            if type_ == Type::End {
+                break;
+            }
+            input = rest;
+        }
+    }
+
+    #[test]
+    fn test_identifiers() {
+        print_segmentation(
+            r#"a ab abc abcd !abcd
+A AB ABC ABCD !ABCD
+aB aBC aBcD !aBcD
+$x $y $z !$z
+grève Ângstrom poté
+#a #b #c ## #d !#d
+@efg @ @@. @#@ !@
+## # #12345 #.#
+f@#_.#6
+GhIjK
+.x 1y _z
+"#,
+        );
+    }
+
+    #[test]
+    fn test_identifiers_ending_in_dot() {
+        print_segmentation(
+            r#"abcd. abcd.
+ABCD. ABCD.
+aBcD. aBcD.
+$y. $z. あいうえお.
+#c. #d..
+@@. @@....
+#.#.
+#abcd.
+.
+.
+LMNOP.
+QRSTUV./* end of line comment */
+qrstuv. /* end of line comment */
+QrStUv./* end of line comment */
+wxyz./* unterminated end of line comment
+WXYZ. /* unterminated end of line comment
+WxYz./* unterminated end of line comment
+"#,
+        );
+    }
+
+    #[test]
+    fn test_reserved_words() {
+        print_segmentation(
+            r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+        );
+    }
+
+    #[test]
+    fn test_punctuation() {
+        print_segmentation(
+            r#"~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
+~&|=>=><=<~=<>(),-+*/[[]]**!*
+% : ; ? _ ` { } ~ !*
+"#,
+        );
+    }
+
+    #[test]
+    fn test_positive_numbers() {
+        print_segmentation(
+            r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e- 1.
+"#,
+        );
+    }
+
+    #[test]
+    fn test_negative_numbers() {
+        print_segmentation(
+            r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+        );
+    }
+
+    #[test]
+    fn test_strings() {
+        print_segmentation(
+            r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' ""
+'missing end quote
+"missing double quote
+x"4142" X'5152'
+u'fffd' U"041"
++ new command
++ /* comment */ 'string continuation'
++ /* also a punctuator on blank line
+- 'new command'
+"#,
+        );
+    }
+
+    #[test]
+    fn test_shbang() {
+        print_segmentation(
+            r#"#! /usr/bin/pspp
+title my title.
+#! /usr/bin/pspp
+"#,
+        );
+    }
+
+    #[test]
+    fn test_comment_command() {
+        print_segmentation(
+            r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+ * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+        );
+    }
+
+    #[test]
+    fn test_document_command() {
+        print_segmentation(
+            r#"DOCUMENT one line.
+DOC more
+ than
+ one
+ line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+
+"#,
+        );
+    }
+
+    #[test]
+    fn test_file_label_command() {
+        print_segmentation(
+            r#"FIL label isn't quoted.
+FILE
+ lab 'is quoted'.
+FILE /*
+/**/ lab not quoted here either
+
+"#,
+        );
+    }
+
+    #[test]
+    fn test_begin_data() {
+        print_segmentation(r#"begin data.
+end data.
+
+begin data. /*
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end data
+end data
+.
+
+begin
+ data.
+data
+end data.
+
+begin data "xxx".
+begin data 123.
+not data
+
+"#);
+    }
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 933c74ad09..404ac18b67 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -10,3 +10,4 @@ pub mod locale_charset;
 pub mod raw;
 pub mod sack;
 pub mod lex;
+pub mod prompt;
diff --git a/rust/src/prompt.rs b/rust/src/prompt.rs
new file mode 100644
index 0000000000..71a6b7ce2a
--- /dev/null
+++ b/rust/src/prompt.rs
@@ -0,0 +1,36 @@
+pub enum PromptStyle {
+    /// First line of command.
+    First,
+
+    /// Second or later line of command.
+    Later,
+
+    /// Between `BEGIN DATA` and `END DATA`.
+    Data,
+
+    /// `COMMENT` or `*` command.
+    Comment,
+
+    /// DOCUMENT command.
+    Document,
+
+    /// `DO REPEAT` command.
+    DoRepeat,
+
+    /// `DEFINE` command.
+    Define,
+}
+
+impl PromptStyle {
+    pub fn to_string(&self) -> &'static str {
+        match self {
+            PromptStyle::First => "first",
+            PromptStyle::Later => "later",
+            PromptStyle::Data => "data",
+            PromptStyle::Comment => "COMMENT",
+            PromptStyle::Document => "DOCUMENT",
+            PromptStyle::DoRepeat => "DO REPEAT",
+            PromptStyle::Define => "DEFINE",
+        }
+    }
+}
diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at
index abbc08c8cd..80c09779a8 100644
--- a/tests/language/lexer/segment.at
+++ b/tests/language/lexer/segment.at
@@ -1648,6 +1648,7 @@ AT_CLEANUP
 # uninitialized data, run with valgrind.  The test will pass either
 # way.  (The bug report has a more complicated crashing case.)
 AT_SETUP([input ends in carriage return])
+AT_KEYWORDS([segment])
 printf '\r' > input
 AT_DATA([expout-base], [dnl
 separate_commands