//! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior
//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE).
+use crate::{
+ identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar},
+ prompt::PromptStyle,
+};
+use bitflags::bitflags;
+
+use super::command_name::{command_match, COMMAND_NAMES};
+
/// Segmentation mode.
///
/// PSPP syntax is written in one of two modes which are broadly defined as
}
/// The type of a segment.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Type {
Number,
QuotedString,
End,
ExpectedQuote,
ExpectedExponent,
- UnexpectedChar
+ UnexpectedChar,
}
+bitflags! {
+ /// Position flags stored next to [`State`] in the segmenter's state pair.
+ #[derive(Copy, Clone)]
+ pub struct Substate: u8 {
+ /// The segmenter is positioned at the start of a line. In
+ /// `State::General`, [`Segmenter::push`] checks this flag to choose
+ /// start-of-line handling.
+ const START_OF_LINE = 1;
+ /// The segmenter is positioned at the start of a command.
+ /// [`Segmenter::prompt`] reads this flag to choose between the first-line
+ /// and continuation prompts.
+ const START_OF_COMMAND = 2;
+ }
+}
+
+/// Incremental segmenter for syntax input; see [`Segmenter::push`].
+///
+/// The type is `Copy`, which the implementation uses to run throwaway
+/// sub-segmenters for lookahead.
+#[derive(Copy, Clone)]
pub struct Segmenter {
- state: State
+ /// Major state plus the start-of-line / start-of-command flags.
+ state: (State, Substate),
+ /// Nesting counter, used for nested `DO REPEAT` blocks and for balanced
+ /// parentheses in `DEFINE`.
+ nest: u8,
+ /// Syntax mode supplied to [`Segmenter::new`].
+ mode: Mode,
+}
+
+/// Error value meaning that the segment at the start of the input cannot be
+/// classified until more input is supplied (or end of input is declared).
+#[derive(Copy, Clone, Debug)]
+pub struct Incomplete;
+
+impl Segmenter {
+    /// Creates a segmenter for syntax written in `mode`.
+    ///
+    /// When `is_snippet` is false, the input is treated as a whole file: the
+    /// segmenter begins by checking for a `#!` line, and a `-` or `+` at the
+    /// very beginning of the syntax is handled as it would be at the
+    /// beginning of any line, that is, as a separator between commands.
+    ///
+    /// When `is_snippet` is true, the input is treated as an isolated piece of
+    /// syntax: a leading `-` or `+` is segmented exactly as it would be in the
+    /// middle of a command.
+    pub fn new(mode: Mode, is_snippet: bool) -> Self {
+        let initial = if is_snippet {
+            State::General
+        } else {
+            State::Shbang
+        };
+        Self {
+            state: (initial, Substate::empty()),
+            nest: 0,
+            mode,
+        }
+    }
+
+    /// Returns the syntax mode this segmenter was created with.
+    pub fn mode(&self) -> Mode {
+        self.mode
+    }
+
+    /// True if the segmenter is positioned at the start of a line.
+    fn start_of_line(&self) -> bool {
+        let (_, flags) = self.state;
+        flags.contains(Substate::START_OF_LINE)
+    }
+
+    /// True if the segmenter is positioned at the start of a command.
+    fn start_of_command(&self) -> bool {
+        let (_, flags) = self.state;
+        flags.contains(Substate::START_OF_COMMAND)
+    }
+
+    /// Returns the style of command prompt to display to an interactive user
+    /// for input in the current state. The return value is most accurate in
+    /// mode `Mode::Interactive` and at the beginning of a line (that is, if
+    /// [`Segmenter::push`] consumed as much as possible of the input up to a
+    /// new-line).
+    pub fn prompt(&self) -> PromptStyle {
+        match self.state.0 {
+            // States in which the prompt depends on whether a command has
+            // been started yet.
+            State::General
+            | State::DoRepeat1
+            | State::DoRepeat2
+            | State::Define1
+            | State::Define2
+            | State::Define3 => {
+                if self.start_of_command() {
+                    PromptStyle::First
+                } else {
+                    PromptStyle::Later
+                }
+            }
+            State::Shbang
+            | State::Document3
+            | State::FileLabel2
+            | State::FileLabel3
+            | State::BeginData1 => PromptStyle::First,
+            State::FileLabel1 | State::BeginData2 => PromptStyle::Later,
+            State::Comment1 | State::Comment2 => PromptStyle::Comment,
+            State::Document1 | State::Document2 => PromptStyle::Document,
+            State::DoRepeat3 => PromptStyle::DoRepeat,
+            State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
+            State::BeginData3 | State::BeginData4 => PromptStyle::Data,
+        }
+    }
+
+    /// Attempts to label a prefix of the remaining input with a segment type.
+    /// The caller supplies a prefix of the remaining input as `input`. If
+    /// `eof` is true, then `input` is the entire remainder of the input; if
+    /// `eof` is false, then further input is potentially available.
+    ///
+    /// The input may contain '\n' or '\r\n' line ends in any combination.
+    ///
+    /// If successful, returns `Ok((rest, type))`, where `rest` is the part of
+    /// `input` that follows the segment and `type` is the segment's type. The
+    /// segment itself is `&input[..input.len() - rest.len()]`. The next call
+    /// should pass input starting at `rest`, because the segment's bytes have
+    /// (figuratively) been consumed by the segmenter.
+    ///
+    /// Segments can have zero length, including segment types `Type::End`,
+    /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and
+    /// `Type::Spaces`.
+    ///
+    /// Failure occurs only if the segment type of the bytes in `input` cannot
+    /// yet be determined. In this case, this function returns
+    /// `Err(Incomplete)`. If more input is available, the caller should obtain
+    /// some more, then call again with a longer `input`. If this is not
+    /// enough, the process might need to repeat again and again. If input is
+    /// exhausted, then the caller may call again setting `eof` to true. This
+    /// function will never return `Err(Incomplete)` when `eof` is true.
+    ///
+    /// The caller must not, in a sequence of calls, supply contradictory input.
+    /// That is, bytes provided as part of `input` in one call, but not
+    /// consumed, must not be provided with *different* values on subsequent
+    /// calls. This is because the function must often make decisions based on
+    /// looking ahead beyond the bytes that it consumes.
+    pub fn push<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        if input.is_empty() {
+            // Nothing to look at: either we are done or we need more input.
+            return if eof {
+                Ok((input, Type::End))
+            } else {
+                Err(Incomplete)
+            };
+        }
+
+        match self.state.0 {
+            State::Shbang => self.parse_shbang(input, eof),
+            State::General if self.start_of_line() => self.parse_start_of_line(input, eof),
+            State::General => self.parse_mid_command(input, eof),
+            State::Comment1 => self.parse_comment_1(input, eof),
+            State::Comment2 => self.parse_comment_2(input, eof),
+            State::Document1 => self.parse_document_1(input, eof),
+            State::Document2 => self.parse_document_2(input, eof),
+            State::Document3 => self.parse_document_3(input, eof),
+            State::FileLabel1 => self.parse_file_label_1(input, eof),
+            State::FileLabel2 => self.parse_file_label_2(input, eof),
+            State::FileLabel3 => self.parse_file_label_3(input, eof),
+            State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
+            State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
+            State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
+            State::Define1 | State::Define2 => self.parse_define_1_2(input, eof),
+            State::Define3 => self.parse_define_3(input, eof),
+            State::Define4 | State::Define5 => self.parse_define_4_5(input, eof),
+            State::Define6 => self.parse_define_6(input, eof),
+            State::BeginData1 => self.parse_begin_data_1(input, eof),
+            State::BeginData2 => self.parse_begin_data_2(input, eof),
+            State::BeginData3 => self.parse_begin_data_3(input, eof),
+            State::BeginData4 => self.parse_begin_data_4(input, eof),
+        }
+    }
+}
+
+/// Major state of a [`Segmenter`]. [`Segmenter::push`] dispatches on it to
+/// choose the routine that parses the next segment.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum State {
+ /// Initial state for non-snippet input; handled by `parse_shbang`.
+ Shbang,
+ /// Ordinary command syntax.
+ General,
+ // Comment command (`*` or `COMMENT`): 1 scans one line of the comment,
+ // 2 consumes the newline that follows it.
+ Comment1,
+ Comment2,
+ // `DOCUMENT` command: 1 scans one line, 2 consumes the newline that
+ // follows it, 3 emits the end of the command.
+ Document1,
+ Document2,
+ Document3,
+ // `FILE LABEL` command: 1 segments tokens up to the `LABEL` keyword,
+ // 2 skips spaces before the label, 3 reads an unquoted label.
+ FileLabel1,
+ FileLabel2,
+ FileLabel3,
+ // `DO REPEAT` command: 1 is the head, 2 a blank line after the head,
+ // 3 the body.
+ DoRepeat1,
+ DoRepeat2,
+ DoRepeat3,
+ // `DEFINE` command: 1 expects the macro name, 2 runs up to the first `(`,
+ // 3 is inside the parentheses, 4 is the first line of the macro body,
+ // 5 a later body line, 6 the newline between body lines.
+ Define1,
+ Define2,
+ Define3,
+ Define4,
+ Define5,
+ Define6,
+ // `BEGIN DATA` command: 1 and 2 segment the rest of the command line(s)
+ // with the general rules; 3 and 4 handle the inline data (their handlers
+ // are only partly visible from this part of the file).
+ BeginData1,
+ BeginData2,
+ BeginData3,
+ BeginData4,
+}
+
+/// Splits the first character off `input`.
+///
+/// Returns the character (or `None` if `input` is empty) together with the
+/// remainder of `input`. An empty `input` is an error unless `eof` is true,
+/// because the next character is not known yet.
+fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
+    let mut chars = input.chars();
+    let head = chars.next();
+    if head.is_none() && !eof {
+        return Err(Incomplete);
+    }
+    Ok((head, chars.as_str()))
+}
+
+/// Skips the body of a `/* ... */` comment, given `input` positioned just
+/// after the opening `/*`.
+///
+/// Returns the input following the closing `*/`. If the line or the input ends
+/// first, returns the input positioned at that line end (or at the end of
+/// input) instead.
+fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    while let (Some(c), rest) = take(input, eof)? {
+        if matches!(c, '\n' | '\r') && is_end_of_line(input, eof)? {
+            // The comment is cut off by the end of the line.
+            return Ok(input);
+        }
+        if c == '*' {
+            if let (Some('/'), after) = take(rest, eof)? {
+                return Ok(after);
+            }
+        }
+        input = rest;
+    }
+    Ok(input)
+}
+
+/// Skips the leading characters of `input` for which `f` returns true and
+/// returns what is left.
+///
+/// If everything was skipped and `eof` is false, more matching characters
+/// might still arrive, so the result is `Err(Incomplete)`.
+fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
+where
+    F: Fn(char) -> bool,
+{
+    let remainder = input.trim_start_matches(f);
+    match (remainder.is_empty(), eof) {
+        (true, false) => Err(Incomplete),
+        _ => Ok(remainder),
+    }
+}
+
+/// If the first character of `input` satisfies `f`, returns the input after
+/// that character; otherwise returns `None`.
+///
+/// Fails with `Err(Incomplete)` if `input` is empty and `eof` is false.
+fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
+where
+    F: Fn(char) -> bool,
+{
+    let (head, rest) = take(input, eof)?;
+    Ok(head.filter(|&c| f(c)).map(|_| rest))
+}
+
+/// Skips white space at the start of `input`, stopping at (not past) a line
+/// end, and returns what is left.
+fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    while let (Some(c), rest) = take(input, eof)? {
+        if !c.is_whitespace() {
+            break;
+        }
+        // A '\n' or "\r\n" line end is white space too, but it must be left
+        // in place for the caller.
+        if matches!(c, '\r' | '\n') && is_end_of_line(input, eof)? {
+            break;
+        }
+        input = rest;
+    }
+    Ok(input)
+}
+
+/// Skips leading ASCII decimal digits; see [`skip_matching`] for the
+/// end-of-input behavior.
+fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
+    skip_matching(|c| matches!(c, '0'..='9'), input, eof)
+}
+
+/// Skips white space and `/* ... */` comments at the start of `input`,
+/// stopping at (not past) a line end, and returns what is left.
+///
+/// A `/` that does not begin a comment is ordinary punctuation, so the result
+/// starts at that `/`.
+fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    loop {
+        let (Some(c), rest) = take(input, eof)? else {
+            return Ok(input);
+        };
+        match c {
+            '/' => match take(rest, eof)? {
+                (Some('*'), body) => input = skip_comment(body, eof)?,
+                // Not a comment: stop *at* the slash rather than after it,
+                // so that callers still see it.
+                _ => return Ok(input),
+            },
+            '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
+            c if c.is_whitespace() => input = rest,
+            _ => return Ok(input),
+        };
+    }
+}
+
+/// Returns true if `input` begins like a string literal: a quote character,
+/// or an `x`/`X`/`u`/`U` prefix immediately followed by a quote character.
+///
+/// A leading `'\n'` is also reported as true.
+fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let (Some(c), rest) = take(input, eof)? else {
+        return Ok(false);
+    };
+    match c {
+        'x' | 'X' | 'u' | 'U' => {
+            // Look at the character *after* the prefix letter.
+            let (next, _) = take(rest, eof)?;
+            Ok(matches!(next, Some('\'' | '"')))
+        }
+        '\'' | '"' | '\n' => Ok(true),
+        _ => Ok(false),
+    }
+}
+
+/// Returns true if `input` is positioned at a line end: `'\n'`, `"\r\n"`, or
+/// the end of the input.
+///
+/// A `'\r'` that is not followed by `'\n'` is not a line end.
+fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    match take(input, eof)? {
+        (None, _) | (Some('\n'), _) => Ok(true),
+        (Some('\r'), rest) => Ok(take(rest, eof)?.0 == Some('\n')),
+        (Some(_), _) => Ok(false),
+    }
+}
+
+/// Returns true if only white space and comments separate the start of `input`
+/// from the end of its line.
+fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    is_end_of_line(skip_spaces_and_comments(input, eof)?, eof)
+}
+
+/// Returns the first character of `s`.
+///
+/// # Panics
+///
+/// Panics if `s` is empty.
+fn first(s: &str) -> char {
+    let mut chars = s.chars();
+    chars.next().unwrap()
+}
+/// Returns the slice of `COMMAND_NAMES` whose entries start with the same
+/// letter as `target`, compared after ASCII-uppercasing `target`'s first
+/// character. Returns an empty slice if `target` is empty.
+///
+/// NOTE(review): the binary search presumes `COMMAND_NAMES` is ordered by
+/// first character and that its entries are non-empty; confirm against its
+/// definition.
+fn get_command_name_candidates(target: &str) -> &[&'static str] {
+    let Some(initial) = target.chars().next() else {
+        return &[];
+    };
+    let initial = initial.to_ascii_uppercase();
+    let start = COMMAND_NAMES.partition_point(|name| first(name) < initial);
+    let end = COMMAND_NAMES.partition_point(|name| first(name) <= initial);
+    &COMMAND_NAMES[start..end]
+}
+
+/// Returns true if `input` begins with something that matches a known command
+/// name with no words missing.
+///
+/// The candidate text is the longest prefix of `input` made up of white space,
+/// identifier characters, and `-`. If that prefix is all of `input` and `eof`
+/// is false, the name might continue, so the result is `Err(Incomplete)`.
+fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let is_name_char = |c: char| c.is_whitespace() || c.may_continue_id() || c == '-';
+    let end = input
+        .find(|c: char| !is_name_char(c))
+        .unwrap_or(input.len());
+    let command_name = &input[..end];
+    if command_name.len() == input.len() && !eof {
+        return Err(Incomplete);
+    }
+
+    // A trailing period ends the command; it is not part of the name.
+    let string = command_name.strip_suffix('.').unwrap_or(command_name);
+    let matched = get_command_name_candidates(command_name)
+        .iter()
+        .any(|command| matches!(command_match(command, string), Some(m) if m.missing_words <= 0));
+    Ok(matched)
+}
+
+impl Segmenter {
+    /// Handles the very start of non-snippet input.
+    ///
+    /// If the input begins with `#!`, the whole first line is returned as a
+    /// `Type::Shbang` segment. Otherwise the segmenter switches to general
+    /// parsing at the start of a line and of a command, and segments `input`
+    /// from there.
+    fn parse_shbang<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        if let (Some('#'), rest) = take(input, eof)? {
+            if let (Some('!'), rest) = take(rest, eof)? {
+                // Find the end of the line *before* leaving `State::Shbang`:
+                // if the line is not complete yet, the caller retries with
+                // more input and must land here again.
+                let rest = self.parse_full_line(rest, eof)?;
+                self.state = (State::General, Substate::START_OF_COMMAND);
+                return Ok((rest, Type::Shbang));
+            }
+        }
+
+        self.state = (
+            State::General,
+            Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+        );
+        self.push(input, eof)
+    }
+    /// Decides whether a line that starts with `input` begins a new command,
+    /// according to the syntax mode: never in interactive mode, always in
+    /// batch mode, and by looking for a command name in auto mode.
+    fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
+        match self.mode {
+            Mode::Interactive => Ok(false),
+            Mode::Batch => Ok(true),
+            Mode::Auto => detect_command_name(input, eof),
+        }
+    }
+    /// Segments the beginning of a line in `State::General`.
+    ///
+    /// Recognizes the constructs that are special only at the start of a
+    /// line: a `+` that joins pieces of a string, a leading `+`, `-`, or `.`
+    /// that starts a command, a blank line that separates commands, and (per
+    /// the syntax mode) a line that begins a new command. Anything else is
+    /// handed to [`Self::parse_mid_command`].
+    ///
+    /// `input` must be nonempty.
+    fn parse_start_of_line<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        debug_assert_eq!(self.state.0, State::General);
+        debug_assert!(self.start_of_line());
+        debug_assert!(!input.is_empty());
+
+        let (Some(c), rest) = take(input, eof).unwrap() else {
+            unreachable!()
+        };
+        match c {
+            // Look past the `+` itself (hence `rest`) for the start of a
+            // string.
+            '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
+                // This `+` is punctuation that may separate pieces of a string.
+                self.state = (State::General, Substate::empty());
+                return Ok((rest, Type::Punct));
+            }
+            '+' | '-' | '.' => {
+                self.state = (State::General, Substate::START_OF_COMMAND);
+                return Ok((rest, Type::StartCommand));
+            }
+            c if c.is_whitespace() => {
+                if at_end_of_line(rest, eof)? {
+                    // Blank line: report a zero-length command separator.
+                    self.state = (State::General, Substate::START_OF_COMMAND);
+                    return Ok((input, Type::SeparateCommands));
+                }
+            }
+            _ => {
+                if self.at_command_start(input, eof)? {
+                    // Zero-length marker; the command text itself is
+                    // segmented by later calls.
+                    self.state = (State::General, Substate::START_OF_COMMAND);
+                    return Ok((input, Type::StartCommand));
+                }
+            }
+        }
+        self.state.1 = Substate::START_OF_COMMAND;
+        self.parse_mid_command(input, eof)
+    }
+    /// Segments one token in `State::General` when not at the start of a
+    /// line.
+    ///
+    /// Dispatches on the first character of `input` to recognize newlines,
+    /// comments, punctuation (including the two-character operators `**`,
+    /// `<=`, `<>`, `>=`, and `~=`), numbers, strings, identifiers, and white
+    /// space. A `*` at the start of a command begins a comment command.
+    ///
+    /// `input` must be nonempty.
+    fn parse_mid_command<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        debug_assert!(self.state.0 == State::General);
+        debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
+        let (Some(c), rest) = take(input, eof)? else {
+            unreachable!()
+        };
+        match c {
+            '\r' | '\n' if is_end_of_line(input, eof)? => {
+                self.state.1 |= Substate::START_OF_LINE;
+                // `is_end_of_line` already succeeded, so neither unwrap can
+                // fail.
+                Ok((
+                    self.parse_newline(input, eof).unwrap().unwrap(),
+                    Type::Newline,
+                ))
+            }
+            '/' => {
+                if let (Some('*'), rest) = take(rest, eof)? {
+                    let rest = skip_comment(rest, eof)?;
+                    Ok((rest, Type::Comment))
+                } else {
+                    self.state.1 = Substate::empty();
+                    Ok((rest, Type::Punct))
+                }
+            }
+            '-' => {
+                // A `-` followed (possibly after spaces) by a digit, or by
+                // `.` and a digit, is the sign of a number.
+                let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?;
+                match c {
+                    Some(c) if c.is_ascii_digit() => {
+                        return self.parse_number(rest, eof);
+                    }
+                    Some('.') => {
+                        if let (Some(c), _rest) = take(rest2, eof)? {
+                            if c.is_ascii_digit() {
+                                return self.parse_number(rest, eof);
+                            }
+                        }
+                    }
+                    None | Some(_) => (),
+                }
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::Punct))
+            }
+            '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::Punct))
+            }
+            '*' => {
+                if self.state.1.contains(Substate::START_OF_COMMAND) {
+                    self.state.0 = State::Comment1;
+                    self.parse_comment_1(input, eof)
+                } else {
+                    // Like the other operators, look for the second
+                    // character *after* the first one.
+                    self.parse_digraph(&['*'], rest, eof)
+                }
+            }
+            '<' => self.parse_digraph(&['=', '>'], rest, eof),
+            '>' => self.parse_digraph(&['='], rest, eof),
+            '~' => self.parse_digraph(&['='], rest, eof),
+            '.' => match take(rest, eof)? {
+                (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
+                (Some('\r' | '\n'), _) if is_end_of_line(rest, eof)? => {
+                    self.state.1 = Substate::START_OF_COMMAND;
+                    Ok((rest, Type::EndCommand))
+                }
+                _ => Ok((rest, Type::Punct)),
+            },
+            '0'..='9' => self.parse_number(input, eof),
+            'u' | 'U' => self.maybe_parse_string(Type::UnicodeString, (input, rest), eof),
+            'x' | 'X' => self.maybe_parse_string(Type::HexString, (input, rest), eof),
+            '\'' | '"' => self.parse_string(Type::QuotedString, c, rest, eof),
+            '!' => {
+                let (c, rest2) = take(rest, eof)?;
+                match c {
+                    Some('*') => Ok((rest2, Type::MacroId)),
+                    Some(_) => self.parse_id(input, eof),
+                    None => Ok((rest, Type::Punct)),
+                }
+            }
+            c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Type::Spaces)),
+            c if c.may_start_id() => self.parse_id(input, eof),
+            '!'..='~' if c != '\\' && c != '^' => {
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::Punct))
+            }
+            _ => {
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::UnexpectedChar))
+            }
+        }
+    }
+    /// Segments the rest of a string literal, given `input` positioned just
+    /// after the opening `quote`.
+    ///
+    /// Inside the string, two `quote` characters in a row stand for one
+    /// literal quote and do not end the string. On success the segment ends
+    /// just after the closing quote and has type `type_`. If the line or the
+    /// input ends first, the segment ends there and has type
+    /// `Type::ExpectedQuote`.
+    ///
+    /// NOTE(review): the success path leaves `self.state.1` untouched while
+    /// the error path clears it; confirm whether that asymmetry is intended.
+    fn parse_string<'a>(
+        &mut self,
+        type_: Type,
+        quote: char,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        loop {
+            let (Some(c), rest) = take(input, eof)? else {
+                break;
+            };
+            if c == quote {
+                match take(rest, eof)? {
+                    // Doubled quote: skip *both* characters and keep going.
+                    (Some(next), after_pair) if next == quote => {
+                        input = after_pair;
+                        continue;
+                    }
+                    _ => return Ok((rest, type_)),
+                }
+            } else if is_end_of_line(input, eof)? {
+                break;
+            }
+            input = rest;
+        }
+        self.state.1 = Substate::empty();
+        Ok((input, Type::ExpectedQuote))
+    }
+    /// Segments input that starts with a string-prefix letter (`u` or `x`).
+    ///
+    /// `input.0` is the input starting at the letter and `input.1` is the
+    /// input just after it. If a quote follows the letter, the token is a
+    /// string of type `type_`; otherwise it is segmented as an identifier.
+    fn maybe_parse_string<'a>(
+        &mut self,
+        type_: Type,
+        input: (&'a str, &'a str),
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (whole, after_prefix) = input;
+        if let (Some(quote @ ('\'' | '"')), rest) = take(after_prefix, eof)? {
+            self.parse_string(type_, quote, rest, eof)
+        } else {
+            self.parse_id(whole, eof)
+        }
+    }
+ /// Looks ahead in `input` for the next identifier, without changing this
+ /// segmenter's state.
+ ///
+ /// Runs a fresh snippet-mode segmenter over `input`, skipping shbang,
+ /// space, comment, and newline segments. If the first other segment is an
+ /// identifier, returns `(identifier, rest)`, where `rest` is the input that
+ /// follows it. For any other segment type, returns `("", rest)`, where
+ /// `rest` is the input after that segment.
+ fn next_id_in_command<'a>(
+ &self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, &'a str), Incomplete> {
+ // Scratch segmenter: `is_snippet` is true so it starts in general state.
+ let mut sub = Segmenter::new(self.mode, true);
+ loop {
+ let (rest, type_) = sub.push(input, eof)?;
+ match type_ {
+ // Insignificant segments: keep looking.
+ Type::Shbang | Type::Spaces | Type::Comment | Type::Newline => (),
+
+ Type::Identifier => return Ok((&input[..input.len() - rest.len()], rest)),
+
+ // Anything else means there is no identifier next. The list is
+ // spelled out (no `_` arm) so every `Type` variant is considered.
+ Type::Number
+ | Type::QuotedString
+ | Type::HexString
+ | Type::UnicodeString
+ | Type::UnquotedString
+ | Type::ReservedWord
+ | Type::Punct
+ | Type::CommentCommand
+ | Type::DoRepeatCommand
+ | Type::InlineData
+ | Type::MacroId
+ | Type::MacroName
+ | Type::MacroBody
+ | Type::StartDocument
+ | Type::Document
+ | Type::StartCommand
+ | Type::SeparateCommands
+ | Type::EndCommand
+ | Type::End
+ | Type::ExpectedQuote
+ | Type::ExpectedExponent
+ | Type::UnexpectedChar => return Ok(("", rest)),
+ }
+ input = rest;
+ }
+ }
+    /// Segments an identifier (or reserved word, or macro identifier) at the
+    /// start of `input`, which must be nonempty.
+    ///
+    /// A trailing `.` is left out of the identifier when only white space and
+    /// comments follow it on the line, so that it can end the command.
+    ///
+    /// At the start of a command, the identifier may switch the segmenter into
+    /// a command-specific state: `COMMENT`, `DOCUMENT`, `DEFINE`,
+    /// `FILE LABEL`, `DO REPEAT`, or `BEGIN DATA`.
+    fn parse_id<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        let (Some(_), mut end) = take(input, eof).unwrap() else {
+            unreachable!()
+        };
+        while let (Some(c), rest) = take(end, eof)? {
+            if !c.may_continue_id() {
+                break;
+            };
+            end = rest;
+        }
+        let identifier = &input[..input.len() - end.len()];
+        let identifier = match identifier.strip_suffix('.') {
+            Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
+            _ => identifier,
+        };
+        let rest = &input[identifier.len()..];
+
+        if self.state.1.contains(Substate::START_OF_COMMAND) {
+            if id_match_n("COMMENT", identifier, 4) {
+                self.state.0 = State::Comment1;
+                return self.parse_comment_1(input, eof);
+            } else if id_match("DOCUMENT", identifier) {
+                // Zero-length marker; the document text follows.
+                self.state.0 = State::Document1;
+                return Ok((input, Type::StartDocument));
+            } else if id_match_n("DEFINE", identifier, 6) {
+                // Falls through to report `DEFINE` itself as an identifier.
+                self.state.0 = State::Define1;
+            } else if id_match("FILE", identifier) {
+                if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
+                    self.state = (State::FileLabel1, Substate::empty());
+                    return Ok((rest, Type::Identifier));
+                }
+            } else if id_match("DO", identifier) {
+                if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
+                    self.state = (State::DoRepeat1, Substate::empty());
+                    return Ok((rest, Type::Identifier));
+                }
+            } else if id_match("BEGIN", identifier) {
+                let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
+                if id_match("DATA", next_id) {
+                    // `BEGIN DATA` only counts when nothing but an optional
+                    // `.` follows it on the line.
+                    let rest2 = skip_spaces_and_comments(rest2, eof)?;
+                    let rest2 = if let Some(s) = rest2.strip_prefix('.') {
+                        skip_spaces(s, eof)?
+                    } else {
+                        rest2
+                    };
+                    if is_end_of_line(rest2, eof)? {
+                        // If `BEGIN` and `DATA` are on different lines, one
+                        // extra newline has to be passed before the data.
+                        let s = &input[..input.len() - rest2.len()];
+                        self.state = (
+                            if s.contains('\n') {
+                                State::BeginData1
+                            } else {
+                                State::BeginData2
+                            },
+                            Substate::empty(),
+                        );
+                        return Ok((rest, Type::Identifier));
+                    }
+                }
+            }
+        }
+
+        self.state.1 = Substate::empty();
+        let type_ = if is_reserved_word(identifier) {
+            Type::ReservedWord
+        } else if identifier.starts_with('!') {
+            Type::MacroId
+        } else {
+            Type::Identifier
+        };
+        Ok((rest, type_))
+    }
+    /// Finishes a punctuation token that may be one or two characters long.
+    ///
+    /// `input` is positioned just after the first character. If the next
+    /// character is one of `seconds`, it is included in the token. Either way
+    /// the segment type is `Type::Punct`.
+    ///
+    /// NOTE(review): unlike most other punctuation paths, this does not clear
+    /// `self.state.1`; confirm whether that is intended.
+    fn parse_digraph<'a>(
+        &mut self,
+        seconds: &[char],
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let end = match take(input, eof)? {
+            (Some(second), rest) if seconds.contains(&second) => rest,
+            _ => input,
+        };
+        Ok((end, Type::Punct))
+    }
+ /// Segments a number: digits, an optional `.` with more digits, and an
+ /// optional exponent (`e` or `E`, an optional sign, and digits).
+ ///
+ /// If the exponent marker is not followed by any digit, the segment ends
+ /// after the marker and optional sign and has type
+ /// `Type::ExpectedExponent`.
+ ///
+ /// NOTE(review): one caller passes input that follows a `-` sign and may
+ /// begin with spaces, which `skip_digits` does not skip; confirm that case
+ /// yields the intended segment.
+ fn parse_number<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ let mut input = skip_digits(input, eof)?;
+ if let Some(rest) = match_char(|c| c == '.', input, eof)? {
+ let rest2 = skip_digits(rest, eof)?;
+ // Take the `.` as a decimal point if digits follow it, or if it is
+ // not the last thing on the line (where it would end the command).
+ if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
+ input = rest2;
+ }
+ };
+ if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
+ let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
+ let rest2 = skip_digits(rest, eof)?;
+ if rest2.len() == rest.len() {
+ // No digits in the exponent.
+ self.state.1 = Substate::empty();
+ return Ok((rest, Type::ExpectedExponent));
+ }
+ input = rest2;
+ }
+ Ok((input, Type::Number))
+ }
+ /// Segments one line of a comment command (`State::Comment1`).
+ ///
+ /// Scans to the end of the line, tracking whether the line is blank so far
+ /// and whether the most recent non-space character was a `.`. The line end
+ /// then decides the outcome; see the comments on each case below. Reaching
+ /// the end of the input ends the comment like a blank line does.
+ fn parse_comment_1<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ enum CommentState<'a> {
+ // Only white space seen on this line so far.
+ Blank,
+ // The last non-space character seen was not a `.`.
+ NotBlank,
+ // The last non-space character seen was a `.`; holds the input
+ // positioned at that `.`.
+ Period(&'a str),
+ }
+ let mut state = CommentState::Blank;
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ // End of file.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Type::SeparateCommands));
+ };
+ match c {
+ '.' => state = CommentState::Period(input),
+ '\n' | '\r' if is_end_of_line(input, eof)? => {
+ match state {
+ CommentState::Blank => {
+ // Blank line ends comment command.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Type::SeparateCommands));
+ }
+ CommentState::Period(period) => {
+ // '.' at end of line ends comment command. The
+ // segment stops before the '.', which is segmented
+ // separately afterward.
+ self.state = (State::General, Substate::empty());
+ return Ok((period, Type::CommentCommand));
+ }
+ CommentState::NotBlank => {
+ // Comment continues onto next line.
+ self.state = (State::Comment2, Substate::empty());
+ return Ok((input, Type::CommentCommand));
+ }
+ }
+ }
+ c if c.is_whitespace() => (),
+ _ => state = CommentState::NotBlank,
+ }
+ input = rest;
+ }
+ }
+    /// Segments the newline between two lines of a comment command
+    /// (`State::Comment2`), then decides from the start of the next line
+    /// whether the comment continues or a new command begins.
+    ///
+    /// `input` must start with a newline.
+    fn parse_comment_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_newline(input, eof)?.unwrap();
+
+        let starts_command = match take(rest, eof)?.0 {
+            None => false,
+            Some('+' | '-' | '.') => true,
+            Some(c) if c.is_whitespace() => false,
+            Some(_) => self.at_command_start(rest, eof)?,
+        };
+        if !starts_command {
+            self.state.0 = State::Comment1;
+        } else {
+            self.state = (
+                State::General,
+                Substate::START_OF_LINE | Substate::START_OF_COMMAND,
+            );
+        }
+        Ok((rest, Type::Newline))
+    }
+    /// Segments one line of a `DOCUMENT` command (`State::Document1`) as a
+    /// `Type::Document` segment that stops before the line end.
+    ///
+    /// If the last non-space character on the line is a `.`, or the input
+    /// ends, the command is over and the next state is `State::Document3`;
+    /// otherwise it is `State::Document2`.
+    fn parse_document_1<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut ends_with_period = false;
+        while let (Some(c), rest) = take(input, eof)? {
+            if c == '.' {
+                ends_with_period = true;
+            } else if matches!(c, '\n' | '\r') && is_end_of_line(input, eof)? {
+                self.state.0 = if ends_with_period {
+                    State::Document3
+                } else {
+                    State::Document2
+                };
+                return Ok((input, Type::Document));
+            } else if !c.is_whitespace() {
+                ends_with_period = false;
+            }
+            input = rest;
+        }
+
+        // End of input.
+        self.state.0 = State::Document3;
+        Ok((input, Type::Document))
+    }
+    /// Segments the newline between two lines of a `DOCUMENT` command
+    /// (`State::Document2`) and returns to `State::Document1`.
+    ///
+    /// `input` must start with a newline.
+    fn parse_document_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let after_newline = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::Document1;
+        Ok((after_newline, Type::Newline))
+    }
+    /// Ends a `DOCUMENT` command (`State::Document3`): emits a zero-length
+    /// `Type::EndCommand` segment and returns to general parsing at the start
+    /// of a line and of a command.
+    fn parse_document_3<'a>(
+        &mut self,
+        input: &'a str,
+        _eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let flags = Substate::START_OF_COMMAND | Substate::START_OF_LINE;
+        self.state = (State::General, flags);
+        Ok((input, Type::EndCommand))
+    }
+    /// Returns true if, after white space and comments, `input` continues
+    /// with a quote character or a `'\n'`.
+    fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
+        let input = skip_spaces_and_comments(input, eof)?;
+        let (next, _) = take(input, eof)?;
+        Ok(matches!(next, Some('\'' | '"' | '\n')))
+    }
+ /// Segments the tokens between `FILE` and the label text of a `FILE LABEL`
+ /// command (`State::FileLabel1`).
+ ///
+ /// Each token is segmented with the general rules by a scratch copy of this
+ /// segmenter. When the identifier (expected to be `LABEL`) is reached, the
+ /// next step depends on whether a quoted label follows: if so, general
+ /// parsing takes over; if not, the segmenter moves on to
+ /// `State::FileLabel2`.
+ fn parse_file_label_1<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ // Scratch copy in general state; `self` is only updated once the result
+ // is known.
+ let mut sub = Segmenter {
+ state: (State::General, self.state.1),
+ ..*self
+ };
+ let (rest, type_) = sub.push(input, eof)?;
+ if type_ == Type::Identifier {
+ let id = &input[..input.len() - rest.len()];
+ debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
+ if Self::quoted_file_label(rest, eof)? {
+ // Quoted label: adopt the scratch segmenter's general state.
+ *self = sub;
+ } else {
+ self.state.0 = State::FileLabel2;
+ }
+ } else {
+ // Spaces, comments, etc.: stay in this state, keeping the flags.
+ self.state.1 = sub.state.1;
+ }
+ Ok((rest, type_))
+ }
+    /// Segments the white space before an unquoted file label
+    /// (`State::FileLabel2`) as a possibly zero-length `Type::Spaces` segment
+    /// and moves on to `State::FileLabel3`.
+    fn parse_file_label_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let after_spaces = skip_spaces(input, eof)?;
+        self.state.0 = State::FileLabel3;
+        Ok((after_spaces, Type::Spaces))
+    }
+ /// Segments an unquoted file label (`State::FileLabel3`) as a
+ /// `Type::UnquotedString` segment that runs to the end of the line or of
+ /// the input.
+ ///
+ /// If the last non-space character on the line is a `.`, the segment stops
+ /// before that `.`.
+ fn parse_file_label_3<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ // Input positioned at a `.` that has only been followed by white space
+ // so far, if any.
+ let mut end_cmd = None;
+ loop {
+ let (c, rest) = take(input, eof)?;
+ match c {
+ None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => {
+ self.state = (State::General, Substate::empty());
+ return Ok((end_cmd.unwrap_or(input), Type::UnquotedString));
+ }
+ // `is_end_of_line` is always true at the end of input, so the arm
+ // above has already handled `None`.
+ None => unreachable!(),
+ Some('.') => end_cmd = Some(input),
+ Some(c) if !c.is_whitespace() => end_cmd = None,
+ Some(_) => (),
+ }
+ input = rest;
+ }
+ }
+    /// Segments one token with the general rules on behalf of a
+    /// command-specific state.
+    ///
+    /// Runs a scratch segmenter in `State::General` that shares this
+    /// segmenter's mode and flags. On success the scratch segmenter's flags
+    /// are copied back; the major state of `self` is left alone.
+    fn subparse<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        let mut sub = Segmenter {
+            state: (State::General, self.state.1),
+            nest: 0,
+            mode: self.mode,
+        };
+        let outcome = sub.push(input, eof);
+        if outcome.is_ok() {
+            self.state.1 = sub.state.1;
+        }
+        outcome
+    }
+    /// Segments the head of a `DO REPEAT` command, that is, the syntax that
+    /// defines the stand-in variables, which comes before the lines of syntax
+    /// to be repeated (the body).
+    fn parse_do_repeat_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        match type_ {
+            Type::SeparateCommands => {
+                // A blank line separates the head from the body.
+                self.state.0 = State::DoRepeat2;
+            }
+            Type::EndCommand | Type::StartCommand => {
+                // The body starts here.
+                self.state.0 = State::DoRepeat3;
+                self.nest = 1;
+            }
+            _ => (),
+        }
+        Ok((rest, type_))
+    }
+    /// Segments the blank line that separates the head of a `DO REPEAT`
+    /// command from its body. The body begins after the newline.
+    fn parse_do_repeat_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if matches!(type_, Type::Newline) {
+            // The body starts on the next line.
+            self.state.0 = State::DoRepeat3;
+            self.nest = 1;
+        }
+        Ok((rest, type_))
+    }
+    /// If `input` starts with a line end (`'\n'` or `"\r\n"`), returns the
+    /// input that follows it; otherwise returns `None`.
+    fn parse_newline<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<&'a str>, Incomplete> {
+        match take(input, eof)? {
+            (Some('\n'), rest) => Ok(Some(rest)),
+            (Some('\r'), rest) => match take(rest, eof)? {
+                (Some('\n'), rest) => Ok(Some(rest)),
+                _ => Ok(None),
+            },
+            _ => Ok(None),
+        }
+    }
+
+    /// Skips to the end of the current line and returns the input positioned
+    /// at the line end (or at the end of the input).
+    fn parse_full_line<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<&'a str, Incomplete> {
+        while !is_end_of_line(input, eof)? {
+            // Not at a line end, so there is a character to step over.
+            input = take(input, eof).unwrap().1;
+        }
+        Ok(input)
+    }
+    /// Adjusts the `DO REPEAT` nesting level for one line of a `DO REPEAT`
+    /// body: a line that starts with `DO REPEAT` raises `self.nest` by one and
+    /// a line that starts with `END REPEAT` lowers it by one. A single leading
+    /// `-` or `+` on the line is ignored. Other lines change nothing.
+    fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result<(), Incomplete> {
+        let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
+        let (first_word, after_first) = self.next_id_in_command(input, eof)?;
+        let opens = id_match("DO", first_word);
+        if !opens && !id_match("END", first_word) {
+            return Ok(());
+        }
+
+        let (second_word, _) = self.next_id_in_command(after_first, eof)?;
+        if id_match("REPEAT", second_word) {
+            if opens {
+                self.nest += 1;
+            } else {
+                self.nest -= 1;
+            }
+        }
+        Ok(())
+    }
+ /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that
+ /// are to be repeated. Report each line of syntax as a single
+ /// [`Type::DoRepeatCommand`], and each line end between them as a
+ /// `Type::Newline`.
+ ///
+ /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT`
+ /// blocks inside the lines we're segmenting. `self.nest` counts the
+ /// nesting level, starting at 1.
+ fn parse_do_repeat_3<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ if let Some(rest) = self.parse_newline(input, eof)? {
+ return Ok((rest, Type::Newline));
+ }
+ // Find the end of the line first, so that the whole line is available
+ // before its effect on the nesting level is applied.
+ let rest = self.parse_full_line(input, eof)?;
+ self.check_repeat_command(input, eof)?;
+ if self.nest == 0 {
+ // Nesting level dropped to 0, so we've finished reading the `DO
+ // REPEAT` body. The final `END REPEAT` line is segmented as
+ // ordinary syntax, not as part of the body.
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ self.push(input, eof)
+ } else {
+ Ok((rest, Type::DoRepeatCommand))
+ }
+ }
+    /// We are segmenting a `DEFINE` command, which consists of:
+    ///
+    ///   - The `DEFINE` keyword.
+    ///
+    ///   - An identifier.  We transform this into `Type::MacroName` instead of
+    ///     `Type::Identifier` or `Type::MacroId` because this identifier must
+    ///     never be macro-expanded.
+    ///
+    ///   - Anything but `(`.
+    ///
+    ///   - `(` followed by a sequence of tokens possibly including balanced
+    ///     parentheses up to a final `)`.
+    ///
+    ///   - A sequence of any number of lines, one string per line, ending with
+    ///     `!ENDDEFINE`.  The first line is usually blank (that is, a newline
+    ///     follows the `(`).  The last line usually just has `!ENDDEFINE.` on
+    ///     it, but it can start with other tokens.  The whole
+    ///     DEFINE...!ENDDEFINE can be on a single line, even.
+    ///
+    /// This function handles the part before the first `(`:
+    /// `State::Define1` is waiting for the macro name and `State::Define2`
+    /// comes after it.
+    fn parse_define_1_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        match type_ {
+            Type::Identifier | Type::MacroId if self.state.0 == State::Define1 => {
+                self.state.0 = State::Define2;
+                return Ok((rest, Type::MacroName));
+            }
+            Type::SeparateCommands | Type::EndCommand | Type::StartCommand => {
+                // The DEFINE command is malformed because we reached its end
+                // without ever hitting a `(` token.  Transition back to general
+                // parsing.
+                self.state.0 = State::General;
+            }
+            // The segment just produced begins at the start of `input`, so
+            // that is where to look for the `(`.
+            Type::Punct if input.starts_with('(') => {
+                self.state.0 = State::Define3;
+                self.nest = 1;
+            }
+            _ => (),
+        }
+        Ok((rest, type_))
+    }
+    /// Segments the parenthesized part of a `DEFINE` command
+    /// (`State::Define3`), tracking the parenthesis nesting level in
+    /// `self.nest`. The `)` that brings the level back to 0 ends this part,
+    /// and the macro body follows (`State::Define4`).
+    fn parse_define_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        match type_ {
+            Type::SeparateCommands | Type::EndCommand | Type::StartCommand => {
+                // The DEFINE command is malformed because we reached its end
+                // before the parentheses were balanced.  Transition back to
+                // general parsing.
+                self.state.0 = State::General;
+            }
+            // The segment just produced begins at the start of `input`, so
+            // that is where to look for the parenthesis.
+            Type::Punct if input.starts_with('(') => {
+                self.nest += 1;
+            }
+            Type::Punct if input.starts_with(')') => {
+                self.nest -= 1;
+                if self.nest == 0 {
+                    self.state = (State::Define4, Substate::empty());
+                }
+            }
+            _ => (),
+        }
+        Ok((rest, type_))
+    }
+    /// Searches `input` (treated as complete) for `!ENDDEFINE`, matched
+    /// without regard to ASCII case, skipping over white space, comments, and
+    /// quoted strings.
+    ///
+    /// Returns the input positioned at `!ENDDEFINE`, or `None` if it is not
+    /// found or if a quoted string is not closed.
+    fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> {
+        loop {
+            input = skip_spaces_and_comments(input, true).unwrap();
+            let mut chars = input.chars();
+            let c = chars.next()?;
+            let rest = chars.as_str();
+
+            if c == '!' && strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() {
+                return Some(input);
+            }
+            input = if c == '\'' || c == '"' {
+                // Jump past the matching closing quote.
+                let index = rest.find(c)?;
+                &rest[index + 1..]
+            } else {
+                rest
+            };
+        }
+    }
+    /// We are in the body of a macro definition, looking for additional lines
+    /// of the body or `!ENDDEFINE`.
+    ///
+    /// In `State::Define4`, we're parsing the first line of the macro body (the
+    /// same line as the closing parenthesis in the argument definition).  In
+    /// `State::Define5`, we're on a later line.
+    ///
+    /// The segment covers the line up to `!ENDDEFINE` if the line contains it,
+    /// and otherwise the whole line. It is `Type::MacroBody`, or `Type::Spaces`
+    /// where the text is blank and not significant.
+    fn parse_define_4_5<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_full_line(input, eof)?;
+        let line = &input[..input.len() - rest.len()];
+        if let Some(end) = Self::find_enddefine(line) {
+            // Macro ends at the !ENDDEFINE on this line.
+            self.state = (State::General, Substate::empty());
+            let prefix = &input[..input.len() - end.len()];
+            if prefix.is_empty() {
+                // Line starts with `!ENDDEFINE`.
+                self.push(input, eof)
+            } else if prefix.trim().is_empty() {
+                // Line starts with spaces followed by `!ENDDEFINE`.
+                Ok((end, Type::Spaces))
+            } else {
+                // Line starts with some content followed by `!ENDDEFINE`.
+                Ok((end, Type::MacroBody))
+            }
+        } else {
+            // No `!ENDDEFINE`.  We have a full line of macro body.
+            //
+            // The line might be blank, whether completely empty or just spaces
+            // and comments.  That's OK: we need to report blank lines because
+            // they can have significance.
+            //
+            // However, if the first line of the macro body is blank, we just
+            // report it as spaces because it's not significant.
+            let type_ = if self.state.0 == State::Define4 && line.trim().is_empty() {
+                Type::Spaces
+            } else {
+                Type::MacroBody
+            };
+            self.state.0 = State::Define6;
+            Ok((rest, type_))
+        }
+    }
+    /// Consumes the newline that follows a line of macro body.
+    ///
+    /// On success the state becomes `State::Define5` and the segment is
+    /// reported as `Type::Newline`.  Propagates `Incomplete` from
+    /// `parse_newline`; panics if `parse_newline` finds no newline.
+    fn parse_define_6<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        self.parse_newline(input, eof).map(|after_newline| {
+            let after_newline = after_newline.unwrap();
+            self.state.0 = State::Define5;
+            (after_newline, Type::Newline)
+        })
+    }
+    /// Segments input through `subparse` while in `State::BeginData1`.
+    ///
+    /// The segment is passed through unchanged; when it is a
+    /// `Type::Newline`, the state advances to `State::BeginData2`.
+    fn parse_begin_data_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let segment = self.subparse(input, eof)?;
+        if matches!(segment.1, Type::Newline) {
+            self.state.0 = State::BeginData2;
+        }
+        Ok(segment)
+    }
+    /// Segments input through `subparse` while in `State::BeginData2`.
+    ///
+    /// The segment is passed through unchanged; when it is a
+    /// `Type::Newline`, the state advances to `State::BeginData3`.
+    fn parse_begin_data_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let segment = self.subparse(input, eof)?;
+        if matches!(segment.1, Type::Newline) {
+            self.state.0 = State::BeginData3;
+        }
+        Ok(segment)
+    }
+    /// Returns true if `line` is an `END DATA` line, which terminates inline
+    /// data.
+    ///
+    /// The line must begin with `END` (so leading white space disqualifies
+    /// it), followed by exactly one white space character and then `DATA`.
+    /// Both words are compared without regard to ASCII case.  After `DATA`,
+    /// only white space and at most one `.` may appear.
+    ///
+    /// The `.` is optional: `end data` alone on a line ends the data just as
+    /// `end data.` does.
+    fn is_end_data(line: &str) -> bool {
+        let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else {
+            return false;
+        };
+        let (Some(c), rest) = take(rest, true).unwrap() else {
+            return false;
+        };
+        if !c.is_whitespace() {
+            return false;
+        };
+        let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else {
+            return false;
+        };
+
+        // `endcmd` only guards against a second `.`; it must not decide the
+        // result, or a line without `.` would never end the data.
+        let mut endcmd = false;
+        for c in rest.chars() {
+            match c {
+                '.' if endcmd => return false,
+                '.' => endcmd = true,
+                c if c.is_whitespace() => (),
+                _ => return false,
+            }
+        }
+        true
+    }
+    /// Handles one full line while reading inline data.
+    ///
+    /// If `is_end_data` rejects the line, it is reported as
+    /// `Type::InlineData` and the state becomes `State::BeginData4`.
+    /// Otherwise the segmenter switches to `State::General`, flagged as
+    /// being at the start of both a line and a command, and `input` is
+    /// handed to `push` to be segmented in that state.
+    ///
+    /// Propagates `Incomplete` from `parse_full_line`.
+    fn parse_begin_data_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let after_line = self.parse_full_line(input, eof)?;
+        let line_len = input.len() - after_line.len();
+        if !Self::is_end_data(&input[..line_len]) {
+            self.state.0 = State::BeginData4;
+            return Ok((after_line, Type::InlineData));
+        }
+
+        let flags = Substate::START_OF_COMMAND | Substate::START_OF_LINE;
+        self.state = (State::General, flags);
+        self.push(input, eof)
+    }
+    /// Consumes the newline that follows a line of inline data.
+    ///
+    /// On success the state returns to `State::BeginData3` and the segment
+    /// is reported as `Type::Newline`.  Propagates `Incomplete` from
+    /// `parse_newline`; panics if `parse_newline` finds no newline.
+    fn parse_begin_data_4<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        self.parse_newline(input, eof).map(|after_newline| {
+            let after_newline = after_newline.unwrap();
+            self.state.0 = State::BeginData3;
+            (after_newline, Type::Newline)
+        })
+    }
+}
+
+/// Like [`str::strip_prefix`], but compares without regard to ASCII case.
+///
+/// Returns the part of `line` after `pattern` if `line` begins with
+/// `pattern`, and `None` otherwise.  `None` is also returned when `line` is
+/// shorter than `pattern` or when `pattern.len()` does not fall on a
+/// character boundary in `line`.
+fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
+    let prefix = line.get(..pattern.len())?;
+    prefix
+        .eq_ignore_ascii_case(pattern)
+        .then(|| &line[pattern.len()..])
+}
+
+// Smoke tests for the segmenter.
+//
+// Each test feeds a fixed piece of syntax to `print_segmentation`, which
+// prints every segment.  Nothing is compared against expected output, so a
+// test fails only if segmentation panics.
+#[cfg(test)]
+mod test {
+ use super::{Mode, Segmenter, Type};
+
+ // TODO: unfinished sketch of a helper that would compare the segmenter's
+ // output against a list of expected (type, text) pairs.
+ /*
+ fn check_segmentation(mut input: &str, output: &[(Type, &str)]) {
+ let mut segmenter = Segmenter::new(Mode::Auto, false);
+ for (&exp_type, &exp_s) in output {
+ let (rest, type_) = segmenter.push(input, true).unwrap();
+
+ }
+ }*/
+
+ /// Segments `input` as a whole file in `Mode::Auto` and prints each
+ /// segment's type and text, stopping after the `Type::End` segment.
+ ///
+ /// All of `input` is supplied at once with `eof` set to true.  Panics if
+ /// `push` returns an error.
+ fn print_segmentation(mut input: &str) {
+ let mut segmenter = Segmenter::new(Mode::Auto, false);
+ loop {
+ let (rest, type_) = segmenter.push(input, true).unwrap();
+ // The segment is whatever `push` consumed from the front of `input`.
+ let token = &input[..input.len() - rest.len()];
+ println!("{type_:?} {token:?}");
+ if type_ == Type::End {
+ break;
+ }
+ input = rest;
+ }
+ }
+
+ /// Identifier-like input: plain, upper- and mixed-case names, names with
+ /// `!`, `$`, `#`, and `@` prefixes, and names with non-ASCII letters.
+ #[test]
+ fn test_identifiers() {
+ print_segmentation(
+ r#"a ab abc abcd !abcd
+A AB ABC ABCD !ABCD
+aB aBC aBcD !aBcD
+$x $y $z !$z
+grève Ângstrom poté
+#a #b #c ## #d !#d
+@efg @ @@. @#@ !@
+## # #12345 #.#
+f@#_.#6
+GhIjK
+.x 1y _z
+"#,
+ );
+ }
+
+ /// Identifiers followed by `.`, in the middle and at the end of a line,
+ /// including lines that end in a comment.
+ #[test]
+ fn test_identifiers_ending_in_dot() {
+ print_segmentation(
+ r#"abcd. abcd.
+ABCD. ABCD.
+aBcD. aBcD.
+$y. $z. あいうえお.
+#c. #d..
+@@. @@....
+#.#.
+#abcd.
+.
+.
+LMNOP.
+QRSTUV./* end of line comment */
+qrstuv. /* end of line comment */
+QrStUv./* end of line comment */
+wxyz./* unterminated end of line comment
+WXYZ. /* unterminated end of line comment
+WxYz./* unterminated end of line comment
+"#,
+ );
+ }
+
+ /// Reserved words in lower and upper case, the same words with a trailing
+ /// `x`, and reserved words followed by `.`.
+ #[test]
+ fn test_reserved_words() {
+ print_segmentation(
+ r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+ );
+ }
+
+ /// Punctuation separated by spaces, the same punctuation run together,
+ /// and a line of other symbol characters.
+ #[test]
+ fn test_punctuation() {
+ print_segmentation(
+ r#"~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
+~&|=>=><=<~=<>(),-+*/[[]]**!*
+% : ; ? _ ` { } ~ !*
+"#,
+ );
+ }
+
+ /// Unsigned numbers: integers, decimals, exponents, and malformed forms
+ /// such as `1e` and `1e+`.
+ #[test]
+ fn test_positive_numbers() {
+ print_segmentation(
+ r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e- 1.
+"#,
+ );
+ }
+
+ /// The same number forms as `test_positive_numbers` with a leading `-`,
+ /// plus a `-` separated from its digit by a comment.
+ #[test]
+ fn test_negative_numbers() {
+ print_segmentation(
+ r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+ );
+ }
+
+ /// Quoted strings: both quote styles, doubled quotes, empty strings,
+ /// missing closing quotes, `x`/`u` prefixed strings, and lines that start
+ /// with `+` or `-`.
+ #[test]
+ fn test_strings() {
+ print_segmentation(
+ r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' ""
+'missing end quote
+"missing double quote
+x"4142" X'5152'
+u'fffd' U"041"
++ new command
++ /* comment */ 'string continuation'
++ /* also a punctuator on blank line
+- 'new command'
+"#,
+ );
+ }
+
+ /// A `#!` line at the very start of the input and another one later on.
+ #[test]
+ fn test_shbang() {
+ print_segmentation(
+ r#"#! /usr/bin/pspp
+title my title.
+#! /usr/bin/pspp
+"#,
+ );
+ }
+
+ /// Comment commands introduced by `*` and by `COMMENT` and its
+ /// abbreviations, ended by `.` or by a blank line.
+ #[test]
+ fn test_comment_command() {
+ print_segmentation(
+ r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+ * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+ );
+ }
+
+ /// `DOCUMENT` and its abbreviations, on one line and across several
+ /// lines.
+ #[test]
+ fn test_document_command() {
+ print_segmentation(
+ r#"DOCUMENT one line.
+DOC more
+ than
+ one
+ line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+
+"#,
+ );
+ }
+
+ /// `FILE LABEL` and its abbreviations, with the two words on one line,
+ /// on separate lines, and separated by comments.
+ #[test]
+ fn test_file_label_command() {
+ print_segmentation(
+ r#"FIL label isn't quoted.
+FILE
+ lab 'is quoted'.
+FILE /*
+/**/ lab not quoted here either
+
+"#,
+ );
+ }
+
+ /// `BEGIN DATA` ... `END DATA` blocks: empty and non-empty data,
+ /// abbreviated keywords with comments between them, `end data` lines with
+ /// and without `.`, and `begin data` followed by other tokens.
+ #[test]
+ fn test_begin_data() {
+ print_segmentation(r#"begin data.
+end data.
+
+begin data. /*
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end data
+end data
+.
+
+begin
+ data.
+data
+end data.
+
+begin data "xxx".
+begin data 123.
+not data
+
+"#);
+ }
+
 }