From: Ben Pfaff Date: Thu, 11 Jul 2024 19:01:13 +0000 (-0700) Subject: more tests X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=40a8f09189737f5ca44f60c55d7b23e9d77e9a1d;p=pspp more tests --- diff --git a/rust/src/lex/segment.rs b/rust/src/lex/segment.rs deleted file mode 100644 index 53bc26d5ea..0000000000 --- a/rust/src/lex/segment.rs +++ /dev/null @@ -1,3421 +0,0 @@ -//! Syntax segmentation. -//! -//! PSPP divides traditional "lexical analysis" or "tokenization" into two -//! phases: a lower-level phase called "segmentation" and a higher-level phase -//! called "scanning". This module implements the segmentation phase. -//! [`super::scan`] contains declarations for the scanning phase. -//! -//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label -//! (a segment type) for each byte or contiguous sequence of bytes in the input. -//! It also, in a few corner cases, outputs zero-width segments that label the -//! boundary between a pair of bytes in the input. -//! -//! Some segment types correspond directly to tokens; for example, an -//! "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID) -//! later in lexical analysis. Other segments contribute to tokens but do not -//! correspond directly; for example, multiple quoted string segments -//! (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators -//! (SEG_PUNCT) may be combined to form a single string token (T_STRING). Still -//! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior -//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE). - -use crate::{ - identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar}, - prompt::PromptStyle, -}; -use bitflags::bitflags; - -use super::command_name::{command_match, COMMAND_NAMES}; - -/// Segmentation mode. 
-/// -/// PSPP syntax is written in one of two modes which are broadly defined as -/// follows: -/// -/// - In interactive mode, commands end with a period at the end of the line -/// or with a blank line. -/// -/// - In batch mode, the second and subsequent lines of a command are indented -/// from the left margin. -/// -/// The segmenter can also try to automatically detect the mode in use, using a -/// heuristic that is usually correct. -#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)] -pub enum Mode { - /// Try to interpret input correctly regardless of whether it is written - /// for interactive or batch mode. - #[default] - Auto, - - /// Interactive syntax mode. - Interactive, - - /// Batch syntax mode. - Batch, -} - -/// The type of a segment. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Type { - Number, - QuotedString, - HexString, - UnicodeString, - UnquotedString, - ReservedWord, - Identifier, - Punct, - Shbang, - Spaces, - Comment, - Newline, - CommentCommand, - DoRepeatCommand, - DoRepeatOverflow, - InlineData, - MacroId, - MacroName, - MacroBody, - StartDocument, - Document, - StartCommand, - SeparateCommands, - EndCommand, - End, - ExpectedQuote, - ExpectedExponent, - UnexpectedChar, -} - -bitflags! { - #[derive(Copy, Clone, Debug)] - pub struct Substate: u8 { - const START_OF_LINE = 1; - const START_OF_COMMAND = 2; - } -} - -#[derive(Copy, Clone)] -pub struct Segmenter { - state: (State, Substate), - nest: u8, - mode: Mode, -} - -#[derive(Copy, Clone, Debug)] -pub struct Incomplete; - -impl Segmenter { - /// Returns a segmenter with the given syntax `mode`. - /// - /// If `is_snippet` is false, then the segmenter will parse as if it's being - /// given a whole file. This means, for example, that it will interpret `-` - /// or `+` at the beginning of the syntax as a separator between commands - /// (since `-` or `+` at the beginning of a line has this meaning). 
- /// - /// If `is_snippet` is true, then the segmenter will parse as if it's being - /// given an isolated piece of syntax. This means that, for example, that - /// it will interpret `-` or `+` at the beginning of the syntax as an - /// operator token or (if followed by a digit) as part of a number. - pub fn new(mode: Mode, is_snippet: bool) -> Self { - Self { - state: if is_snippet { - (State::General, Substate::empty()) - } else { - (State::Shbang, Substate::empty()) - }, - mode, - nest: 0, - } - } - - pub fn mode(&self) -> Mode { - self.mode - } - - fn start_of_line(&self) -> bool { - self.state.1.contains(Substate::START_OF_LINE) - } - - fn start_of_command(&self) -> bool { - self.state.1.contains(Substate::START_OF_COMMAND) - } - - /// Returns the style of command prompt to display to an interactive user - /// for input in the current state.. The return value is most accurate in - /// mode `Mode::Interactive` and at the beginning of a line (that is, if - /// [`Segmenter::push`] consumed as much as possible of the input up to a - /// new-line). 
- pub fn prompt(&self) -> PromptStyle { - match self.state.0 { - State::Shbang => PromptStyle::First, - State::General => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::Comment1 | State::Comment2 => PromptStyle::Comment, - State::Document1 | State::Document2 => PromptStyle::Document, - State::Document3 => PromptStyle::First, - State::FileLabel1 => PromptStyle::Later, - State::FileLabel2 | State::FileLabel3 => PromptStyle::First, - State::DoRepeat1 | State::DoRepeat2 => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::DoRepeat3 => PromptStyle::DoRepeat, - State::DoRepeat4 => PromptStyle::DoRepeat, - State::Define1 | State::Define2 | State::Define3 => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define, - State::BeginData1 => PromptStyle::First, - State::BeginData2 => PromptStyle::Later, - State::BeginData3 | State::BeginData4 => PromptStyle::Data, - } - } - - /// Attempts to label a prefix of the remaining input with a segment type. - /// The caller supplies a prefix of the remaining input as `input`. If - /// `eof` is true, then `input` is the entire (remainder) of the input; if - /// `eof` is false, then further input is potentially available. - /// - /// The input may contain '\n' or '\r\n' line ends in any combination. - /// - /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes - /// in the segment at the beginning of `input` (a number in - /// `0..=input.len()`) and the type of that segment. The next call should - /// not include those bytes in `input`, because they have (figuratively) - /// been consumed by the segmenter. - /// - /// Segments can have zero length, including segment types `Type::End`, - /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and - /// `Type::Spaces`. 
- /// - /// Failure occurs only if the segment type of the bytes in `input` cannot - /// yet be determined. In this case, this function returns `Err(Incomplete)`. If - /// more input is available, the caller should obtain some more, then call - /// again with a longer `input`. If this is not enough, the process might - /// need to repeat again and again. If input is exhausted, then the caller - /// may call again setting `eof` to true. This function will never return - /// `Err(Incomplete)` when `eof` is true. - /// - /// The caller must not, in a sequence of calls, supply contradictory input. - /// That is, bytes provided as part of `input` in one call, but not - /// consumed, must not be provided with *different* values on subsequent - /// calls. This is because the function must often make decisions based on - /// looking ahead beyond the bytes that it consumes. - pub fn push<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { - if input.is_empty() { - if eof { - return Ok((input, Type::End)); - } else { - return Err(Incomplete); - }; - } - - match self.state.0 { - State::Shbang => return self.parse_shbang(input, eof), - State::General => { - if self.start_of_line() { - self.parse_start_of_line(input, eof) - } else { - self.parse_mid_line(input, eof) - } - } - State::Comment1 => self.parse_comment_1(input, eof), - State::Comment2 => self.parse_comment_2(input, eof), - State::Document1 => self.parse_document_1(input, eof), - State::Document2 => self.parse_document_2(input, eof), - State::Document3 => self.parse_document_3(input, eof), - State::FileLabel1 => self.parse_file_label_1(input, eof), - State::FileLabel2 => self.parse_file_label_2(input, eof), - State::FileLabel3 => self.parse_file_label_3(input, eof), - State::DoRepeat1 => self.parse_do_repeat_1(input, eof), - State::DoRepeat2 => self.parse_do_repeat_2(input, eof), - State::DoRepeat3 => self.parse_do_repeat_3(input, eof), - State::DoRepeat4 => 
self.parse_do_repeat_4(input), - State::Define1 => self.parse_define_1_2(input, eof), - State::Define2 => self.parse_define_1_2(input, eof), - State::Define3 => self.parse_define_3(input, eof), - State::Define4 => self.parse_define_4_5(input, eof), - State::Define5 => self.parse_define_4_5(input, eof), - State::Define6 => self.parse_define_6(input, eof), - State::BeginData1 => self.parse_begin_data_1(input, eof), - State::BeginData2 => self.parse_begin_data_2(input, eof), - State::BeginData3 => self.parse_begin_data_3(input, eof), - State::BeginData4 => self.parse_begin_data_4(input, eof), - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -enum State { - Shbang, - General, - Comment1, - Comment2, - Document1, - Document2, - Document3, - FileLabel1, - FileLabel2, - FileLabel3, - DoRepeat1, - DoRepeat2, - DoRepeat3, - DoRepeat4, - Define1, - Define2, - Define3, - Define4, - Define5, - Define6, - BeginData1, - BeginData2, - BeginData3, - BeginData4, -} - -fn take(input: &str, eof: bool) -> Result<(Option, &str), Incomplete> { - let mut iter = input.chars(); - match iter.next() { - None if !eof => Err(Incomplete), - c => Ok((c, iter.as_str())), - } -} - -fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - return Ok(input); - }; - match c { - '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input), - '*' => { - if let (Some('/'), rest) = take(rest, eof)? { - return Ok(rest); - } - } - _ => (), - }; - input = rest; - } -} - -fn skip_matching(f: F, input: &str, eof: bool) -> Result<&str, Incomplete> -where - F: Fn(char) -> bool, -{ - let input = input.trim_start_matches(f); - if input.is_empty() && !eof { - Err(Incomplete) - } else { - Ok(input) - } -} - -fn match_char(f: F, input: &str, eof: bool) -> Result, Incomplete> -where - F: Fn(char) -> bool, -{ - if let (Some(c), rest) = take(input, eof)? 
{ - if f(c) { - return Ok(Some(rest)); - } - } - Ok(None) -} - -fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - return Ok(input); - }; - match c { - '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), - c if c.is_whitespace() => (), - _ => return Ok(input), - } - input = rest; - } -} - -fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> { - skip_matching(|c| c.is_ascii_digit(), input, eof) -} - -fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - return Ok(input); - }; - match c { - '/' => { - let (c, rest2) = take(rest, eof)?; - match c { - Some('*') => input = skip_comment(rest2, eof)?, - Some(_) | None => return Ok(rest), - } - } - '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), - c if c.is_whitespace() => input = rest, - _ => return Ok(input), - }; - } -} - -fn is_start_of_string(input: &str, eof: bool) -> Result { - let (Some(c), _rest) = take(input, eof)? else { - return Ok(false); - }; - match c { - 'x' | 'X' | 'u' | 'U' => Ok({ - let (c, _rest) = take(input, eof)?; - c == Some('\'') || c == Some('"') - }), - '\'' | '"' | '\n' => Ok(true), - _ => Ok(false), - } -} - -fn is_end_of_line(input: &str, eof: bool) -> Result { - let (Some(c), rest) = take(input, eof)? 
else { - return Ok(true); - }; - Ok(match c { - '\n' => true, - '\r' => take(rest, eof)?.0 == Some('\n'), - _ => false, - }) -} - -fn at_end_of_line(input: &str, eof: bool) -> Result { - is_end_of_line(skip_spaces_and_comments(input, eof)?, eof) -} - -fn first(s: &str) -> char { - s.chars().next().unwrap() -} -fn get_command_name_candidates(target: &str) -> &[&'static str] { - if target.is_empty() { - return &[]; - } - let target_first = first(target).to_ascii_uppercase(); - let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first); - let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first); - &COMMAND_NAMES[low..high] -} - -fn detect_command_name(input: &str, eof: bool) -> Result { - let command_name = input - .split(|c: char| { - !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-') - }) - .next() - .unwrap(); - if !eof && command_name.len() == input.len() { - return Err(Incomplete); - } - let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.'); - for command in get_command_name_candidates(command_name) { - if let Some(m) = command_match(command, command_name) { - if m.missing_words <= 0 { - return Ok(true); - } - } - } - Ok(false) -} - -impl Segmenter { - fn parse_shbang<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - if let (Some('#'), rest) = take(input, eof)? { - if let (Some('!'), rest) = take(rest, eof)? 
{ - let rest = self.parse_full_line(rest, eof)?; - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((rest, Type::Shbang)); - } - } - - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - self.push(input, eof) - } - fn at_command_start(&self, input: &str, eof: bool) -> Result { - match self.mode { - Mode::Auto => detect_command_name(input, eof), - Mode::Interactive => Ok(false), - Mode::Batch => Ok(true), - } - } - fn parse_start_of_line<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - debug_assert_eq!(self.state.0, State::General); - debug_assert!(self.start_of_line()); - debug_assert!(!input.is_empty()); - - let (Some(c), rest) = take(input, eof).unwrap() else { - unreachable!() - }; - match c { - '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => { - // This `+` is punctuation that may separate pieces of a string. - self.state = (State::General, Substate::empty()); - return Ok((rest, Type::Punct)); - } - '+' | '-' | '.' => { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((rest, Type::StartCommand)); - } - _ if c.is_whitespace() => { - if at_end_of_line(input, eof)? { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((input, Type::SeparateCommands)); - } - } - _ => { - if self.at_command_start(input, eof)? - && !self.state.1.contains(Substate::START_OF_COMMAND) - { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((input, Type::StartCommand)); - } - } - } - self.state.1 = Substate::START_OF_COMMAND; - self.parse_mid_line(input, eof) - } - fn parse_mid_line<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - debug_assert!(self.state.0 == State::General); - debug_assert!(!self.state.1.contains(Substate::START_OF_LINE)); - let (Some(c), rest) = take(input, eof)? 
else { - unreachable!() - }; - match c { - '\r' | '\n' if is_end_of_line(input, eof)? => { - self.state.1 |= Substate::START_OF_LINE; - Ok(( - self.parse_newline(input, eof).unwrap().unwrap(), - Type::Newline, - )) - } - '/' => { - if let (Some('*'), rest) = take(rest, eof)? { - let rest = skip_comment(rest, eof)?; - return Ok((rest, Type::Comment)); - } else { - self.state.1 = Substate::empty(); - return Ok((rest, Type::Punct)); - } - } - '-' => { - let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?; - match c { - Some(c) if c.is_ascii_digit() => { - return self.parse_number(rest, eof); - } - Some('.') => { - if let (Some(c), _rest) = take(rest2, eof)? { - if c.is_ascii_digit() { - return self.parse_number(rest, eof); - } - } - } - None | Some(_) => (), - } - self.state.1 = Substate::empty(); - return Ok((rest, Type::Punct)); - } - '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => { - self.state.1 = Substate::empty(); - return Ok((rest, Type::Punct)); - } - '*' => { - if self.state.1.contains(Substate::START_OF_COMMAND) { - self.state.0 = State::Comment1; - self.parse_comment_1(input, eof) - } else { - self.parse_digraph(&['*'], rest, eof) - } - } - '<' => self.parse_digraph(&['=', '>'], rest, eof), - '>' => self.parse_digraph(&['='], rest, eof), - '~' => self.parse_digraph(&['='], rest, eof), - '.' if at_end_of_line(rest, eof)? => { - self.state.1 = Substate::START_OF_COMMAND; - Ok((rest, Type::EndCommand)) - } - '.' => match take(rest, eof)? { - (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof), - _ => Ok((rest, Type::Punct)), - }, - '0'..='9' => self.parse_number(input, eof), - 'u' | 'U' => self.maybe_parse_string(Type::UnicodeString, (input, rest), eof), - 'x' | 'X' => self.maybe_parse_string(Type::HexString, (input, rest), eof), - '\'' | '"' => self.parse_string(Type::QuotedString, c, rest, eof), - '!' 
=> { - let (c, rest2) = take(rest, eof)?; - match c { - Some('*') => Ok((rest2, Type::MacroId)), - Some(_) => self.parse_id(input, eof), - None => Ok((rest, Type::Punct)), - } - } - c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Type::Spaces)), - c if c.may_start_id() => self.parse_id(input, eof), - '!'..='~' if c != '\\' && c != '^' => { - self.state.1 = Substate::empty(); - Ok((rest, Type::Punct)) - } - _ => { - self.state.1 = Substate::empty(); - Ok((rest, Type::UnexpectedChar)) - } - } - } - fn parse_string<'a>( - &mut self, - type_: Type, - quote: char, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - while let (Some(c), rest) = take(input, eof)? { - match c { - _ if c == quote => { - let (c, rest2) = take(rest, eof)?; - if c != Some(quote) { - self.state.1 = Substate::empty(); - return Ok((rest, type_)); - } - input = rest2; - } - '\r' | '\n' if is_end_of_line(input, eof)? => break, - _ => input = rest, - } - } - self.state.1 = Substate::empty(); - Ok((input, Type::ExpectedQuote)) - } - fn maybe_parse_string<'a>( - &mut self, - type_: Type, - input: (&'a str, &'a str), - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - match take(input.1, eof)? 
{ - (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(type_, c, rest, eof), - _ => self.parse_id(input.0, eof), - } - } - fn next_id_in_command<'a>( - &self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, &'a str), Incomplete> { - let mut sub = Segmenter::new(self.mode, true); - loop { - let (rest, type_) = sub.push(input, eof)?; - match type_ { - Type::Shbang | Type::Spaces | Type::Comment | Type::Newline => (), - - Type::Identifier => return Ok((&input[..input.len() - rest.len()], rest)), - - Type::Number - | Type::QuotedString - | Type::HexString - | Type::UnicodeString - | Type::UnquotedString - | Type::ReservedWord - | Type::Punct - | Type::CommentCommand - | Type::DoRepeatCommand - | Type::DoRepeatOverflow - | Type::InlineData - | Type::MacroId - | Type::MacroName - | Type::MacroBody - | Type::StartDocument - | Type::Document - | Type::StartCommand - | Type::SeparateCommands - | Type::EndCommand - | Type::End - | Type::ExpectedQuote - | Type::ExpectedExponent - | Type::UnexpectedChar => return Ok(("", rest)), - } - input = rest; - } - } - fn parse_id<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { - let (Some(_), mut end) = take(input, eof).unwrap() else { - unreachable!() - }; - while let (Some(c), rest) = take(end, eof)? { - if !c.may_continue_id() { - break; - }; - end = rest; - } - let identifier = &input[..input.len() - end.len()]; - let identifier = match identifier.strip_suffix('.') { - Some(without_dot) if at_end_of_line(end, eof)? 
=> without_dot, - _ => identifier, - }; - let rest = &input[identifier.len()..]; - - if self.state.1.contains(Substate::START_OF_COMMAND) { - if id_match_n("COMMENT", identifier, 4) { - self.state.0 = State::Comment1; - return self.parse_comment_1(input, eof); - } else if id_match("DOCUMENT", identifier) { - self.state.0 = State::Document1; - return Ok((input, Type::StartDocument)); - } else if id_match_n("DEFINE", identifier, 6) { - self.state.0 = State::Define1; - } else if id_match("FILE", identifier) { - if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) { - self.state = (State::FileLabel1, Substate::empty()); - return Ok((rest, Type::Identifier)); - } - } else if id_match("DO", identifier) { - if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) { - self.state = (State::DoRepeat1, Substate::empty()); - return Ok((rest, Type::Identifier)); - } - } else if id_match("BEGIN", identifier) { - let (next_id, rest2) = self.next_id_in_command(rest, eof)?; - if id_match("DATA", next_id) { - let rest2 = skip_spaces_and_comments(rest2, eof)?; - let rest2 = if let Some(s) = rest2.strip_prefix('.') { - skip_spaces_and_comments(s, eof)? - } else { - rest2 - }; - if is_end_of_line(rest2, eof)? 
{ - let s = &input[..input.len() - rest2.len()]; - self.state = ( - if s.contains('\n') { - State::BeginData1 - } else { - State::BeginData2 - }, - Substate::empty(), - ); - return Ok((rest, Type::Identifier)); - } - } - } - } - - self.state.1 = Substate::empty(); - let type_ = if is_reserved_word(identifier) { - Type::ReservedWord - } else if identifier.starts_with('!') { - Type::MacroId - } else { - Type::Identifier - }; - Ok((rest, type_)) - } - fn parse_digraph<'a>( - &mut self, - seconds: &[char], - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let (c, rest) = take(input, eof)?; - self.state.1 = Substate::empty(); - Ok(( - match c { - Some(c) if seconds.contains(&c) => rest, - _ => input, - }, - Type::Punct, - )) - } - fn parse_number<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let mut input = skip_digits(input, eof)?; - if let Some(rest) = match_char(|c| c == '.', input, eof)? { - let rest2 = skip_digits(rest, eof)?; - if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? { - input = rest2; - } - }; - if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? { - let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest); - let rest2 = skip_digits(rest, eof)?; - if rest2.len() == rest.len() { - self.state.1 = Substate::empty(); - return Ok((rest, Type::ExpectedExponent)); - } - input = rest2; - } - self.state.1 = Substate::empty(); - Ok((input, Type::Number)) - } - fn parse_comment_1<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - enum CommentState<'a> { - Blank, - NotBlank, - Period(&'a str), - } - let mut state = CommentState::Blank; - loop { - let (Some(c), rest) = take(input, eof)? else { - // End of file. - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((input, Type::SeparateCommands)); - }; - match c { - '.' 
=> state = CommentState::Period(input), - '\n' | '\r' if is_end_of_line(input, eof)? => { - match state { - CommentState::Blank => { - // Blank line ends comment command. - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((input, Type::SeparateCommands)); - } - CommentState::Period(period) => { - // '.' at end of line ends comment command. - self.state = (State::General, Substate::empty()); - return Ok((period, Type::CommentCommand)); - } - CommentState::NotBlank => { - // Comment continues onto next line. - self.state = (State::Comment2, Substate::empty()); - return Ok((input, Type::CommentCommand)); - } - } - } - c if c.is_whitespace() => (), - _ => state = CommentState::NotBlank, - } - input = rest; - } - } - fn parse_comment_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - - let new_command = match take(rest, eof)?.0 { - Some('+') | Some('-') | Some('.') => true, - Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?, - None | Some(_) => false, - }; - if new_command { - self.state = ( - State::General, - Substate::START_OF_LINE | Substate::START_OF_COMMAND, - ); - } else { - self.state.0 = State::Comment1; - } - Ok((rest, Type::Newline)) - } - fn parse_document_1<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let mut end_cmd = false; - loop { - let (Some(c), rest) = take(input, eof)? else { - self.state.0 = State::Document3; - return Ok((input, Type::Document)); - }; - match c { - '.' => end_cmd = true, - '\n' | '\r' if is_end_of_line(input, eof)? 
=> { - self.state.0 = if end_cmd { - State::Document3 - } else { - State::Document2 - }; - return Ok((input, Type::Document)); - } - c if !c.is_whitespace() => end_cmd = false, - _ => (), - } - input = rest; - } - } - fn parse_document_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state.0 = State::Document1; - Ok((rest, Type::Newline)) - } - fn parse_document_3<'a>( - &mut self, - input: &'a str, - _eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - Ok((input, Type::EndCommand)) - } - fn quoted_file_label(input: &str, eof: bool) -> Result { - let input = skip_spaces_and_comments(input, eof)?; - match take(input, eof)?.0 { - Some('\'') | Some('"') | Some('\n') => Ok(true), - _ => Ok(false), - } - } - fn parse_file_label_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let mut sub = Segmenter { - state: (State::General, self.state.1), - ..*self - }; - let (rest, type_) = sub.push(input, eof)?; - if type_ == Type::Identifier { - let id = &input[..input.len() - rest.len()]; - debug_assert!(id_match("LABEL", id), "{id} should be LABEL"); - if Self::quoted_file_label(rest, eof)? { - *self = sub; - } else { - self.state.0 = State::FileLabel2; - } - } else { - self.state.1 = sub.state.1; - } - Ok((rest, type_)) - } - fn parse_file_label_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let input = skip_spaces(input, eof)?; - self.state.0 = State::FileLabel3; - Ok((input, Type::Spaces)) - } - fn parse_file_label_3<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let mut end_cmd = None; - loop { - let (c, rest) = take(input, eof)?; - match c { - None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? 
=> { - self.state = (State::General, Substate::empty()); - return Ok((end_cmd.unwrap_or(input), Type::UnquotedString)); - } - None => unreachable!(), - Some('.') => end_cmd = Some(input), - Some(c) if !c.is_whitespace() => end_cmd = None, - Some(_) => (), - } - input = rest; - } - } - fn subparse<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { - let mut sub = Segmenter { - mode: self.mode, - state: (State::General, self.state.1), - nest: 0, - }; - let result = sub.push(input, eof)?; - self.state.1 = sub.state.1; - Ok(result) - } - /// We are segmenting a `DO REPEAT` command, currently reading the syntax - /// that defines the stand-in variables (the head) before the lines of - /// syntax to be repeated (the body). - fn parse_do_repeat_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let (rest, type_) = self.subparse(input, eof)?; - if type_ == Type::SeparateCommands { - // We reached a blank line that separates the head from the body. - self.state.0 = State::DoRepeat2; - } else if type_ == Type::EndCommand || type_ == Type::StartCommand { - // We reached the body. - self.state.0 = State::DoRepeat3; - self.nest = 1; - } - Ok((rest, type_)) - } - /// We are segmenting a `DO REPEAT` command, currently reading a blank line - /// that separates the head from the body. - fn parse_do_repeat_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let (rest, type_) = self.subparse(input, eof)?; - if type_ == Type::Newline { - // We reached the body. - self.state.0 = State::DoRepeat3; - self.nest = 1; - } - Ok((rest, type_)) - } - fn parse_newline<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (Some(c), rest) = take(input, eof)? else { - return Ok(None); - }; - match c { - '\n' => Ok(Some(rest)), - '\r' => { - if let (Some('\n'), rest) = take(rest, eof)? 
{ - Ok(Some(rest)) - } else { - Ok(None) - } - } - _ => Ok(None), - } - } - - fn parse_full_line<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<&'a str, Incomplete> { - loop { - if is_end_of_line(input, eof)? { - return Ok(input); - } - input = take(input, eof).unwrap().1; - } - } - fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result { - let input = input.strip_prefix(&['-', '+']).unwrap_or(input); - let (id1, input) = self.next_id_in_command(input, eof)?; - if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) { - Ok(1) - } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) - { - Ok(-1) - } else { - Ok(0) - } - } - /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that - /// are to be repeated. Report each line of syntax as a single - /// [`Type::DoRepeatCommand`]. - /// - /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT` - /// blocks inside the lines we're segmenting. `self.nest` counts the - /// nesting level, starting at 1. - fn parse_do_repeat_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - if let Some(rest) = self.parse_newline(input, eof)? { - return Ok((rest, Type::Newline)); - } - let rest = self.parse_full_line(input, eof)?; - let direction = self.check_repeat_command(input, eof)?; - if direction > 0 { - if let Some(nest) = self.nest.checked_add(1) { - self.nest = nest; - } else { - self.state.0 = State::DoRepeat4; - } - } else if direction < 0 { - self.nest -= 1; - if self.nest == 0 { - // Nesting level dropped to 0, so we've finished reading the `DO - // REPEAT` body. 
- self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - return self.push(input, eof); - } - } - return Ok((rest, Type::DoRepeatCommand)); - } - fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Type), Incomplete> { - self.state.0 = State::DoRepeat3; - Ok((input, Type::DoRepeatOverflow)) - } - /// We are segmenting a `DEFINE` command, which consists of: - /// - /// - The `DEFINE` keyword. - /// - /// - An identifier. We transform this into `Type::MacroName` instead of - /// `Type::Identifier` or `Type::MacroId` because this identifier must - /// never be macro-expanded. - /// - /// - Anything but `(`. - /// - /// - `(` followed by a sequence of tokens possibly including balanced - /// parentheses up to a final `)`. - /// - /// - A sequence of any number of lines, one string per line, ending with - /// `!ENDDEFINE`. The first line is usually blank (that is, a newline - /// follows the `(`). The last line usually just has `!ENDDEFINE.` on - /// it, but it can start with other tokens. The whole - /// DEFINE...!ENDDEFINE can be on a single line, even. - fn parse_define_1_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let (rest, type_) = self.subparse(input, eof)?; - match type_ { - Type::Identifier | Type::MacroId if self.state.0 == State::Define1 => { - self.state.0 = State::Define2; - return Ok((rest, Type::MacroName)); - } - Type::SeparateCommands | Type::EndCommand | Type::StartCommand => { - // The DEFINE command is malformed because we reached its end - // without ever hitting a `(` token. Transition back to general - // parsing. 
- self.state.0 = State::General; - } - Type::Punct if input.starts_with('(') => { - self.state.0 = State::Define3; - self.nest = 1; - } - _ => (), - } - Ok((rest, type_)) - } - fn parse_define_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let (rest, type_) = self.subparse(input, eof)?; - match type_ { - Type::SeparateCommands | Type::EndCommand | Type::StartCommand => { - // The DEFINE command is malformed because we reached its end - // without ever hitting a `(` token. Transition back to general - // parsing. - self.state.0 = State::General; - } - Type::Punct if input.starts_with('(') => { - self.nest += 1; - } - Type::Punct if input.starts_with(')') => { - self.nest -= 1; - if self.nest == 0 { - self.state = (State::Define4, Substate::empty()); - } - } - _ => (), - } - Ok((rest, type_)) - } - fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> { - loop { - input = skip_spaces_and_comments(input, true).unwrap(); - let (Some(c), rest) = take(input, true).unwrap() else { - return None; - }; - match c { - '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => { - return Some(input) - } - '\'' | '"' => { - let index = rest.find(c)?; - input = &rest[index + 1..]; - } - _ => input = rest, - } - } - } - - /// We are in the body of a macro definition, looking for additional lines - /// of the body or `!ENDDEFINE`. - /// - /// In `State::Define4`, we're parsing the first line of the macro body (the - /// same line as the closing parenthesis in the argument definition). In - /// `State::Define5`, we're on a later line. - fn parse_define_4_5<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let rest = self.parse_full_line(input, eof)?; - let line = &input[..input.len() - rest.len()]; - if let Some(end) = Self::find_enddefine(line) { - // Macro ends at the !ENDDEFINE on this line. 
- self.state = (State::General, Substate::empty()); - let (prefix, rest) = input.split_at(line.len() - end.len()); - if prefix.is_empty() { - // Line starts with `!ENDDEFINE`. - self.push(input, eof) - } else if prefix.trim_start().is_empty() { - // Line starts with spaces followed by `!ENDDEFINE`. - Ok((rest, Type::Spaces)) - } else { - // Line starts with some content followed by `!ENDDEFINE`. - Ok((rest, Type::MacroBody)) - } - } else { - // No `!ENDDEFINE`. We have a full line of macro body. - // - // If the first line of the macro body is blank, we just report it - // as spaces, or not at all if there are no spaces, because it's not - // significant. - // - // However, if it's a later line, we need to report it because blank - // lines can have significance. - let type_ = if self.state.0 == State::Define4 && line.trim_start().is_empty() { - if line.is_empty() { - return self.parse_define_6(input, eof); - } - Type::Spaces - } else { - Type::MacroBody - }; - self.state.0 = State::Define6; - Ok((rest, type_)) - } - } - fn parse_define_6<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state.0 = State::Define5; - Ok((rest, Type::Newline)) - } - fn parse_begin_data_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let (rest, type_) = self.subparse(input, eof)?; - if type_ == Type::Newline { - self.state.0 = State::BeginData2; - } - Ok((rest, type_)) - } - fn parse_begin_data_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let (rest, type_) = self.subparse(input, eof)?; - if type_ == Type::Newline { - self.state.0 = State::BeginData3; - } - Ok((rest, type_)) - } - fn is_end_data(line: &str) -> bool { - let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else { - return false; - }; - let (Some(c), rest) = take(rest, true).unwrap() else { - return false; - 
}; - if !c.is_whitespace() { - return false; - }; - let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else { - return false; - }; - - let mut endcmd = false; - for c in rest.chars() { - match c { - '.' if endcmd => return false, - '.' => endcmd = true, - c if c.is_whitespace() => (), - _ => return false, - } - } - true - } - fn parse_begin_data_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let rest = self.parse_full_line(input, eof)?; - let line = &input[..input.len() - rest.len()]; - if Self::is_end_data(line) { - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - self.push(input, eof) - } else { - self.state.0 = State::BeginData4; - Ok((rest, Type::InlineData)) - } - } - fn parse_begin_data_4<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Type), Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state.0 = State::BeginData3; - Ok((rest, Type::Newline)) - } -} - -fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> { - line.get(..pattern.len()) - .map(|prefix| { - prefix - .eq_ignore_ascii_case(pattern) - .then(|| &line[pattern.len()..]) - }) - .flatten() -} - -#[cfg(test)] -mod test { - use crate::prompt::PromptStyle; - - use super::{Mode, Segmenter, Type}; - - fn check_segmentation( - mut input: &str, - mode: Mode, - expect_segments: &[(Type, &str)], - expect_prompts: &[PromptStyle], - ) { - let mut segments = Vec::with_capacity(expect_segments.len()); - let mut prompts = Vec::new(); - let mut segmenter = Segmenter::new(mode, false); - loop { - let (rest, type_) = segmenter.push(input, true).unwrap(); - let len = input.len() - rest.len(); - let token = &input[..len]; - segments.push((type_, token)); - match type_ { - Type::End => break, - Type::Newline => prompts.push(segmenter.prompt()), - _ => (), - } - input = rest; - } - - if &segments != expect_segments { - 
eprintln!("segments differ from expected:"); - let difference = diff::slice(expect_segments, &segments); - for result in difference { - match result { - diff::Result::Left(left) => eprintln!("-{left:?}"), - diff::Result::Both(left, _right) => eprintln!(" {left:?}"), - diff::Result::Right(right) => eprintln!("+{right:?}"), - } - } - panic!(); - } - - if &prompts != expect_prompts { - eprintln!("prompts differ from expected:"); - let difference = diff::slice(expect_prompts, &prompts); - for result in difference { - match result { - diff::Result::Left(left) => eprintln!("-{left:?}"), - diff::Result::Both(left, _right) => eprintln!(" {left:?}"), - diff::Result::Right(right) => eprintln!("+{right:?}"), - } - } - panic!(); - } - } - - fn print_segmentation(mut input: &str) { - let mut segmenter = Segmenter::new(Mode::Auto, false); - loop { - let (rest, type_) = segmenter.push(input, true).unwrap(); - let len = input.len() - rest.len(); - let token = &input[..len]; - print!("{type_:?} {token:?}"); - match type_ { - Type::Newline => print!(" ({:?})", segmenter.prompt()), - Type::End => break, - _ => (), - } - println!(); - input = rest; - } - } - - #[test] - fn test_identifiers() { - check_segmentation( - r#"a ab abc abcd !abcd -A AB ABC ABCD !ABCD -aB aBC aBcD !aBcD -$x $y $z !$z -grève Ângstrom poté -#a #b #c ## #d !#d -@efg @ @@. 
@#@ !@ -## # #12345 #.# -f@#_.#6 -GhIjK -.x 1y _z -"#, - Mode::Auto, - &[ - (Type::Identifier, "a"), - (Type::Spaces, " "), - (Type::Identifier, "ab"), - (Type::Spaces, " "), - (Type::Identifier, "abc"), - (Type::Spaces, " "), - (Type::Identifier, "abcd"), - (Type::Spaces, " "), - (Type::MacroId, "!abcd"), - (Type::Newline, "\n"), - (Type::Identifier, "A"), - (Type::Spaces, " "), - (Type::Identifier, "AB"), - (Type::Spaces, " "), - (Type::Identifier, "ABC"), - (Type::Spaces, " "), - (Type::Identifier, "ABCD"), - (Type::Spaces, " "), - (Type::MacroId, "!ABCD"), - (Type::Newline, "\n"), - (Type::Identifier, "aB"), - (Type::Spaces, " "), - (Type::Identifier, "aBC"), - (Type::Spaces, " "), - (Type::Identifier, "aBcD"), - (Type::Spaces, " "), - (Type::MacroId, "!aBcD"), - (Type::Newline, "\n"), - (Type::Identifier, "$x"), - (Type::Spaces, " "), - (Type::Identifier, "$y"), - (Type::Spaces, " "), - (Type::Identifier, "$z"), - (Type::Spaces, " "), - (Type::MacroId, "!$z"), - (Type::Newline, "\n"), - (Type::Identifier, "grève"), - (Type::Spaces, "\u{00a0}"), - (Type::Identifier, "Ângstrom"), - (Type::Spaces, "\u{00a0}"), - (Type::Identifier, "poté"), - (Type::Newline, "\n"), - (Type::Identifier, "#a"), - (Type::Spaces, " "), - (Type::Identifier, "#b"), - (Type::Spaces, " "), - (Type::Identifier, "#c"), - (Type::Spaces, " "), - (Type::Identifier, "##"), - (Type::Spaces, " "), - (Type::Identifier, "#d"), - (Type::Spaces, " "), - (Type::MacroId, "!#d"), - (Type::Newline, "\n"), - (Type::Identifier, "@efg"), - (Type::Spaces, " "), - (Type::Identifier, "@"), - (Type::Spaces, " "), - (Type::Identifier, "@@."), - (Type::Spaces, " "), - (Type::Identifier, "@#@"), - (Type::Spaces, " "), - (Type::MacroId, "!@"), - (Type::Spaces, " "), - (Type::Newline, "\n"), - (Type::Identifier, "##"), - (Type::Spaces, " "), - (Type::Identifier, "#"), - (Type::Spaces, " "), - (Type::Identifier, "#12345"), - (Type::Spaces, " "), - (Type::Identifier, "#.#"), - (Type::Newline, "\n"), - 
(Type::Identifier, "f@#_.#6"), - (Type::Newline, "\n"), - (Type::Identifier, "GhIjK"), - (Type::Newline, "\n"), - (Type::StartCommand, "."), - (Type::Identifier, "x"), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::Identifier, "y"), - (Type::Spaces, " "), - (Type::Punct, "_"), - (Type::Identifier, "z"), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - ], - ); - } - - #[test] - fn test_identifiers_ending_in_dot() { - check_segmentation( - r#"abcd. abcd. -ABCD. ABCD. -aBcD. aBcD. -$y. $z. あいうえお. -#c. #d.. -@@. @@.... -#.#. -#abcd. -. -. -LMNOP. -QRSTUV./* end of line comment */ -qrstuv. /* end of line comment */ -QrStUv./* end of line comment */ -wxyz./* unterminated end of line comment -WXYZ. /* unterminated end of line comment -WxYz./* unterminated end of line comment -"#, - Mode::Auto, - &[ - (Type::Identifier, "abcd."), - (Type::Spaces, " "), - (Type::Identifier, "abcd"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "ABCD."), - (Type::Spaces, " "), - (Type::Identifier, "ABCD"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "aBcD."), - (Type::Spaces, " "), - (Type::Identifier, "aBcD"), - (Type::EndCommand, "."), - (Type::Spaces, " "), - (Type::Newline, "\n"), - (Type::Identifier, "$y."), - (Type::Spaces, " "), - (Type::Identifier, "$z."), - (Type::Spaces, " "), - (Type::Identifier, "あいうえお"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "#c."), - (Type::Spaces, " "), - (Type::Identifier, "#d."), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "@@."), - (Type::Spaces, " "), - (Type::Identifier, "@@..."), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "#.#"), - (Type::EndCommand, 
"."), - (Type::Newline, "\n"), - (Type::Identifier, "#abcd"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::StartCommand, "."), - (Type::Newline, "\n"), - (Type::StartCommand, "."), - (Type::Spaces, " "), - (Type::Newline, "\n"), - (Type::Identifier, "LMNOP"), - (Type::EndCommand, "."), - (Type::Spaces, " "), - (Type::Newline, "\n"), - (Type::Identifier, "QRSTUV"), - (Type::EndCommand, "."), - (Type::Comment, "/* end of line comment */"), - (Type::Newline, "\n"), - (Type::Identifier, "qrstuv"), - (Type::EndCommand, "."), - (Type::Spaces, " "), - (Type::Comment, "/* end of line comment */"), - (Type::Newline, "\n"), - (Type::Identifier, "QrStUv"), - (Type::EndCommand, "."), - (Type::Comment, "/* end of line comment */"), - (Type::Spaces, " "), - (Type::Newline, "\n"), - (Type::Identifier, "wxyz"), - (Type::EndCommand, "."), - (Type::Comment, "/* unterminated end of line comment"), - (Type::Newline, "\n"), - (Type::Identifier, "WXYZ"), - (Type::EndCommand, "."), - (Type::Spaces, " "), - (Type::Comment, "/* unterminated end of line comment"), - (Type::Newline, "\n"), - (Type::Identifier, "WxYz"), - (Type::EndCommand, "."), - (Type::Comment, "/* unterminated end of line comment "), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_reserved_words() { - check_segmentation( - r#"and or not eq ge gt le lt ne all by to with -AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH -andx orx notx eqx gex gtx lex ltx nex allx byx tox withx -and. with. 
-"#, - Mode::Auto, - &[ - (Type::ReservedWord, "and"), - (Type::Spaces, " "), - (Type::ReservedWord, "or"), - (Type::Spaces, " "), - (Type::ReservedWord, "not"), - (Type::Spaces, " "), - (Type::ReservedWord, "eq"), - (Type::Spaces, " "), - (Type::ReservedWord, "ge"), - (Type::Spaces, " "), - (Type::ReservedWord, "gt"), - (Type::Spaces, " "), - (Type::ReservedWord, "le"), - (Type::Spaces, " "), - (Type::ReservedWord, "lt"), - (Type::Spaces, " "), - (Type::ReservedWord, "ne"), - (Type::Spaces, " "), - (Type::ReservedWord, "all"), - (Type::Spaces, " "), - (Type::ReservedWord, "by"), - (Type::Spaces, " "), - (Type::ReservedWord, "to"), - (Type::Spaces, " "), - (Type::ReservedWord, "with"), - (Type::Newline, "\n"), - (Type::ReservedWord, "AND"), - (Type::Spaces, " "), - (Type::ReservedWord, "OR"), - (Type::Spaces, " "), - (Type::ReservedWord, "NOT"), - (Type::Spaces, " "), - (Type::ReservedWord, "EQ"), - (Type::Spaces, " "), - (Type::ReservedWord, "GE"), - (Type::Spaces, " "), - (Type::ReservedWord, "GT"), - (Type::Spaces, " "), - (Type::ReservedWord, "LE"), - (Type::Spaces, " "), - (Type::ReservedWord, "LT"), - (Type::Spaces, " "), - (Type::ReservedWord, "NE"), - (Type::Spaces, " "), - (Type::ReservedWord, "ALL"), - (Type::Spaces, " "), - (Type::ReservedWord, "BY"), - (Type::Spaces, " "), - (Type::ReservedWord, "TO"), - (Type::Spaces, " "), - (Type::ReservedWord, "WITH"), - (Type::Newline, "\n"), - (Type::Identifier, "andx"), - (Type::Spaces, " "), - (Type::Identifier, "orx"), - (Type::Spaces, " "), - (Type::Identifier, "notx"), - (Type::Spaces, " "), - (Type::Identifier, "eqx"), - (Type::Spaces, " "), - (Type::Identifier, "gex"), - (Type::Spaces, " "), - (Type::Identifier, "gtx"), - (Type::Spaces, " "), - (Type::Identifier, "lex"), - (Type::Spaces, " "), - (Type::Identifier, "ltx"), - (Type::Spaces, " "), - (Type::Identifier, "nex"), - (Type::Spaces, " "), - (Type::Identifier, "allx"), - (Type::Spaces, " "), - (Type::Identifier, "byx"), - (Type::Spaces, " "), - 
(Type::Identifier, "tox"), - (Type::Spaces, " "), - (Type::Identifier, "withx"), - (Type::Newline, "\n"), - (Type::Identifier, "and."), - (Type::Spaces, " "), - (Type::ReservedWord, "with"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_punctuation() { - check_segmentation( - r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] ** -~&|=>=><=<~=<>(),-+*/[]**!* -% : ; ? _ ` { } ~ !* -"#, - Mode::Auto, - &[ - (Type::Punct, "~"), - (Type::Spaces, " "), - (Type::Punct, "&"), - (Type::Spaces, " "), - (Type::Punct, "|"), - (Type::Spaces, " "), - (Type::Punct, "="), - (Type::Spaces, " "), - (Type::Punct, ">="), - (Type::Spaces, " "), - (Type::Punct, ">"), - (Type::Spaces, " "), - (Type::Punct, "<="), - (Type::Spaces, " "), - (Type::Punct, "<"), - (Type::Spaces, " "), - (Type::Punct, "~="), - (Type::Spaces, " "), - (Type::Punct, "<>"), - (Type::Spaces, " "), - (Type::Punct, "("), - (Type::Spaces, " "), - (Type::Punct, ")"), - (Type::Spaces, " "), - (Type::Punct, ","), - (Type::Spaces, " "), - (Type::Punct, "-"), - (Type::Spaces, " "), - (Type::Punct, "+"), - (Type::Spaces, " "), - (Type::Punct, "*"), - (Type::Spaces, " "), - (Type::Punct, "/"), - (Type::Spaces, " "), - (Type::Punct, "["), - (Type::Spaces, " "), - (Type::Punct, "]"), - (Type::Spaces, " "), - (Type::Punct, "**"), - (Type::Newline, "\n"), - (Type::Punct, "~"), - (Type::Punct, "&"), - (Type::Punct, "|"), - (Type::Punct, "="), - (Type::Punct, ">="), - (Type::Punct, ">"), - (Type::Punct, "<="), - (Type::Punct, "<"), - (Type::Punct, "~="), - (Type::Punct, "<>"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Punct, ","), - (Type::Punct, "-"), - (Type::Punct, "+"), - (Type::Punct, "*"), - (Type::Punct, "/"), - (Type::Punct, "["), - (Type::Punct, "]"), - (Type::Punct, "**"), - (Type::MacroId, "!*"), - (Type::Newline, "\n"), - (Type::Punct, "%"), - (Type::Spaces, 
" "), - (Type::Punct, ":"), - (Type::Spaces, " "), - (Type::Punct, ";"), - (Type::Spaces, " "), - (Type::Punct, "?"), - (Type::Spaces, " "), - (Type::Punct, "_"), - (Type::Spaces, " "), - (Type::Punct, "`"), - (Type::Spaces, " "), - (Type::Punct, "{"), - (Type::Spaces, " "), - (Type::Punct, "}"), - (Type::Spaces, " "), - (Type::Punct, "~"), - (Type::Spaces, " "), - (Type::MacroId, "!*"), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later], - ); - } - - #[test] - fn test_positive_numbers() { - check_segmentation( - r#"0 1 01 001. 1. -123. /* comment 1 */ /* comment 2 */ -.1 0.1 00.1 00.10 -5e1 6E-1 7e+1 6E+01 6e-03 -.3E1 .4e-1 .5E+1 .6e+01 .7E-03 -1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 -. 1e e1 1e+ 1e- 1. -"#, - Mode::Auto, - &[ - (Type::Number, "0"), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::Spaces, " "), - (Type::Number, "01"), - (Type::Spaces, " "), - (Type::Number, "001."), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Number, "123"), - (Type::EndCommand, "."), - (Type::Spaces, " "), - (Type::Comment, "/* comment 1 */"), - (Type::Spaces, " "), - (Type::Comment, "/* comment 2 */"), - (Type::Newline, "\n"), - (Type::StartCommand, "."), - (Type::Number, "1"), - (Type::Spaces, " "), - (Type::Number, "0.1"), - (Type::Spaces, " "), - (Type::Number, "00.1"), - (Type::Spaces, " "), - (Type::Number, "00.10"), - (Type::Newline, "\n"), - (Type::Number, "5e1"), - (Type::Spaces, " "), - (Type::Number, "6E-1"), - (Type::Spaces, " "), - (Type::Number, "7e+1"), - (Type::Spaces, " "), - (Type::Number, "6E+01"), - (Type::Spaces, " "), - (Type::Number, "6e-03"), - (Type::Newline, "\n"), - (Type::StartCommand, "."), - (Type::Number, "3E1"), - (Type::Spaces, " "), - (Type::Number, ".4e-1"), - (Type::Spaces, " "), - (Type::Number, ".5E+1"), - (Type::Spaces, " "), - (Type::Number, ".6e+01"), - (Type::Spaces, " "), - (Type::Number, ".7E-03"), - 
(Type::Newline, "\n"), - (Type::Number, "1.23e1"), - (Type::Spaces, " "), - (Type::Number, "45.6E-1"), - (Type::Spaces, " "), - (Type::Number, "78.9e+1"), - (Type::Spaces, " "), - (Type::Number, "99.9E+01"), - (Type::Spaces, " "), - (Type::Number, "11.2e-03"), - (Type::Newline, "\n"), - (Type::StartCommand, "."), - (Type::Spaces, " "), - (Type::ExpectedExponent, "1e"), - (Type::Spaces, " "), - (Type::Identifier, "e1"), - (Type::Spaces, " "), - (Type::ExpectedExponent, "1e+"), - (Type::Spaces, " "), - (Type::ExpectedExponent, "1e-"), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_negative_numbers() { - check_segmentation( - r#" -0 -1 -01 -001. -1. - -123. /* comment 1 */ /* comment 2 */ - -.1 -0.1 -00.1 -00.10 - -5e1 -6E-1 -7e+1 -6E+01 -6e-03 - -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03 - -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03 - -/**/1 - -. -1e -e1 -1e+ -1e- -1. 
-"#, - Mode::Auto, - &[ - (Type::Spaces, " "), - (Type::Number, "-0"), - (Type::Spaces, " "), - (Type::Number, "-1"), - (Type::Spaces, " "), - (Type::Number, "-01"), - (Type::Spaces, " "), - (Type::Number, "-001."), - (Type::Spaces, " "), - (Type::Number, "-1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Number, "-123"), - (Type::EndCommand, "."), - (Type::Spaces, " "), - (Type::Comment, "/* comment 1 */"), - (Type::Spaces, " "), - (Type::Comment, "/* comment 2 */"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Number, "-.1"), - (Type::Spaces, " "), - (Type::Number, "-0.1"), - (Type::Spaces, " "), - (Type::Number, "-00.1"), - (Type::Spaces, " "), - (Type::Number, "-00.10"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Number, "-5e1"), - (Type::Spaces, " "), - (Type::Number, "-6E-1"), - (Type::Spaces, " "), - (Type::Number, "-7e+1"), - (Type::Spaces, " "), - (Type::Number, "-6E+01"), - (Type::Spaces, " "), - (Type::Number, "-6e-03"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Number, "-.3E1"), - (Type::Spaces, " "), - (Type::Number, "-.4e-1"), - (Type::Spaces, " "), - (Type::Number, "-.5E+1"), - (Type::Spaces, " "), - (Type::Number, "-.6e+01"), - (Type::Spaces, " "), - (Type::Number, "-.7E-03"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Number, "-1.23e1"), - (Type::Spaces, " "), - (Type::Number, "-45.6E-1"), - (Type::Spaces, " "), - (Type::Number, "-78.9e+1"), - (Type::Spaces, " "), - (Type::Number, "-99.9E+01"), - (Type::Spaces, " "), - (Type::Number, "-11.2e-03"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Punct, "-"), - (Type::Comment, "/**/"), - (Type::Number, "1"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Punct, "-"), - (Type::Punct, "."), - (Type::Spaces, " "), - (Type::ExpectedExponent, "-1e"), - (Type::Spaces, " "), - (Type::Punct, "-"), - (Type::Identifier, "e1"), - (Type::Spaces, " "), - (Type::ExpectedExponent, "-1e+"), - 
(Type::Spaces, " "), - (Type::ExpectedExponent, "-1e-"), - (Type::Spaces, " "), - (Type::Number, "-1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_strings() { - check_segmentation( - r#"'x' "y" 'abc' -'Don''t' "Can't" 'Won''t' -"""quoted""" '"quoted"' -'' "" -'missing end quote -"missing double quote -x"4142" X'5152' -u'fffd' U"041" -+ new command -+ /* comment */ 'string continuation' -+ /* also a punctuator on blank line -- 'new command' -"#, - Mode::Auto, - &[ - (Type::QuotedString, "'x'"), - (Type::Spaces, " "), - (Type::QuotedString, "\"y\""), - (Type::Spaces, " "), - (Type::QuotedString, "'abc'"), - (Type::Newline, "\n"), - (Type::QuotedString, "'Don''t'"), - (Type::Spaces, " "), - (Type::QuotedString, "\"Can't\""), - (Type::Spaces, " "), - (Type::QuotedString, "'Won''t'"), - (Type::Newline, "\n"), - (Type::QuotedString, "\"\"\"quoted\"\"\""), - (Type::Spaces, " "), - (Type::QuotedString, "'\"quoted\"'"), - (Type::Newline, "\n"), - (Type::QuotedString, "''"), - (Type::Spaces, " "), - (Type::QuotedString, "\"\""), - (Type::Newline, "\n"), - (Type::ExpectedQuote, "'missing end quote"), - (Type::Newline, "\n"), - (Type::ExpectedQuote, "\"missing double quote"), - (Type::Newline, "\n"), - (Type::HexString, "x\"4142\""), - (Type::Spaces, " "), - (Type::HexString, "X'5152'"), - (Type::Newline, "\n"), - (Type::UnicodeString, "u'fffd'"), - (Type::Spaces, " "), - (Type::UnicodeString, "U\"041\""), - (Type::Newline, "\n"), - (Type::StartCommand, "+"), - (Type::Spaces, " "), - (Type::Identifier, "new"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::Punct, "+"), - (Type::Spaces, " "), - (Type::Comment, "/* comment */"), - (Type::Spaces, " "), - (Type::QuotedString, "'string 
continuation'"), - (Type::Newline, "\n"), - (Type::Punct, "+"), - (Type::Spaces, " "), - (Type::Comment, "/* also a punctuator on blank line"), - (Type::Newline, "\n"), - (Type::StartCommand, "-"), - (Type::Spaces, " "), - (Type::QuotedString, "'new command'"), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - ], - ); - } - - #[test] - fn test_shbang() { - check_segmentation( - r#"#! /usr/bin/pspp -title my title. -#! /usr/bin/pspp -"#, - Mode::Interactive, - &[ - (Type::Shbang, "#! /usr/bin/pspp"), - (Type::Newline, "\n"), - (Type::Identifier, "title"), - (Type::Spaces, " "), - (Type::Identifier, "my"), - (Type::Spaces, " "), - (Type::Identifier, "title"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "#"), - (Type::MacroId, "!"), - (Type::Spaces, " "), - (Type::Punct, "/"), - (Type::Identifier, "usr"), - (Type::Punct, "/"), - (Type::Identifier, "bin"), - (Type::Punct, "/"), - (Type::Identifier, "pspp"), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::First, PromptStyle::First, PromptStyle::Later], - ); - } - - #[test] - fn test_comment_command() { - check_segmentation( - r#"* Comment commands "don't -have to contain valid tokens. - -** Check ambiguity with ** token. -****************. - -comment keyword works too. -COMM also. -com is ambiguous with COMPUTE. - - * Comment need not start at left margin. - -* Comment ends with blank line - -next command. 
- -"#, - Mode::Interactive, - &[ - (Type::CommentCommand, "* Comment commands \"don't"), - (Type::Newline, "\n"), - (Type::CommentCommand, "have to contain valid tokens"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::CommentCommand, "** Check ambiguity with ** token"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::CommentCommand, "****************"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::CommentCommand, "comment keyword works too"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::CommentCommand, "COMM also"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "com"), - (Type::Spaces, " "), - (Type::Identifier, "is"), - (Type::Spaces, " "), - (Type::Identifier, "ambiguous"), - (Type::Spaces, " "), - (Type::ReservedWord, "with"), - (Type::Spaces, " "), - (Type::Identifier, "COMPUTE"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::Spaces, " "), - ( - Type::CommentCommand, - "* Comment need not start at left margin", - ), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::CommentCommand, "* Comment ends with blank line"), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::Identifier, "next"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Comment, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Comment, - PromptStyle::First, - 
PromptStyle::First, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_document_command() { - check_segmentation( - r#"DOCUMENT one line. -DOC more - than - one - line. -docu -first.paragraph -isn't parsed as tokens - -second paragraph. -"#, - Mode::Interactive, - &[ - (Type::StartDocument, ""), - (Type::Document, "DOCUMENT one line."), - (Type::EndCommand, ""), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::StartDocument, ""), - (Type::Document, "DOC more"), - (Type::Newline, "\n"), - (Type::Document, " than"), - (Type::Newline, "\n"), - (Type::Document, " one"), - (Type::Newline, "\n"), - (Type::Document, " line."), - (Type::EndCommand, ""), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::StartDocument, ""), - (Type::Document, "docu"), - (Type::Newline, "\n"), - (Type::Document, "first.paragraph"), - (Type::Newline, "\n"), - (Type::Document, "isn't parsed as tokens"), - (Type::Newline, "\n"), - (Type::Document, ""), - (Type::Newline, "\n"), - (Type::Document, "second paragraph."), - (Type::EndCommand, ""), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::First, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_file_label_command() { - check_segmentation( - r#"FIL label isn't quoted. -FILE - lab 'is quoted'. 
-FILE /* -/**/ lab not quoted here either - -"#, - Mode::Interactive, - &[ - (Type::Identifier, "FIL"), - (Type::Spaces, " "), - (Type::Identifier, "label"), - (Type::Spaces, " "), - (Type::UnquotedString, "isn't quoted"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "FILE"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "lab"), - (Type::Spaces, " "), - (Type::QuotedString, "'is quoted'"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "FILE"), - (Type::Spaces, " "), - (Type::Comment, "/*"), - (Type::Newline, "\n"), - (Type::Comment, "/**/"), - (Type::Spaces, " "), - (Type::Identifier, "lab"), - (Type::Spaces, " "), - (Type::UnquotedString, "not quoted here either"), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_begin_data() { - check_segmentation( - r#"begin data. -end data. - -begin data. /* -123 -xxx -end data. - -BEG /**/ DAT /* -5 6 7 /* x - -end data -end data -. - -begin - data. -data -end data. - -begin data "xxx". -begin data 123. 
-not data -"#, - Mode::Interactive, - &[ - (Type::Identifier, "begin"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "end"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::Identifier, "begin"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::EndCommand, "."), - (Type::Spaces, " "), - (Type::Comment, "/*"), - (Type::Newline, "\n"), - (Type::InlineData, "123"), - (Type::Newline, "\n"), - (Type::InlineData, "xxx"), - (Type::Newline, "\n"), - (Type::Identifier, "end"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::Identifier, "BEG"), - (Type::Spaces, " "), - (Type::Comment, "/**/"), - (Type::Spaces, " "), - (Type::Identifier, "DAT"), - (Type::Spaces, " "), - (Type::Comment, "/*"), - (Type::Newline, "\n"), - (Type::InlineData, "5 6 7 /* x"), - (Type::Newline, "\n"), - (Type::InlineData, ""), - (Type::Newline, "\n"), - (Type::InlineData, "end data"), - (Type::Newline, "\n"), - (Type::Identifier, "end"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::Newline, "\n"), - (Type::StartCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::Identifier, "begin"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::InlineData, "data"), - (Type::Newline, "\n"), - (Type::Identifier, "end"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::Identifier, "begin"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::Spaces, " "), - (Type::QuotedString, "\"xxx\""), - 
(Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "begin"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::Spaces, " "), - (Type::Number, "123"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::ReservedWord, "not"), - (Type::Spaces, " "), - (Type::Identifier, "data"), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Data, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - ], - ); - } - - #[test] - fn test_do_repeat() { - check_segmentation( - r#"do repeat x=a b c - y=d e f. - do repeat a=1 thru 5. -another command. -second command -+ third command. -end /* x */ /* y */ repeat print. -end - repeat. -do - repeat #a=1. - inner command. -end repeat. 
-"#, - Mode::Interactive, - &[ - (Type::Identifier, "do"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::Spaces, " "), - (Type::Identifier, "x"), - (Type::Punct, "="), - (Type::Identifier, "a"), - (Type::Spaces, " "), - (Type::Identifier, "b"), - (Type::Spaces, " "), - (Type::Identifier, "c"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "y"), - (Type::Punct, "="), - (Type::Identifier, "d"), - (Type::Spaces, " "), - (Type::Identifier, "e"), - (Type::Spaces, " "), - (Type::Identifier, "f"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, " do repeat a=1 thru 5."), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, "another command."), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, "second command"), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, "+ third command."), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print."), - (Type::Newline, "\n"), - (Type::Identifier, "end"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "do"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::Spaces, " "), - (Type::Identifier, "#a"), - (Type::Punct, "="), - (Type::Number, "1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, " inner command."), - (Type::Newline, "\n"), - (Type::Identifier, "end"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_do_repeat_overflow() { - const 
N: usize = 257; - let do_repeat: Vec = (0..N) - .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5)) - .collect(); - let end_repeat: Vec = (0..N) - .rev() - .map(|i| format!("end repeat. /* {i}\n")) - .collect(); - - let s: String = do_repeat - .iter() - .chain(end_repeat.iter()) - .map(|s| s.as_str()) - .collect(); - let mut expect_output = vec![ - (Type::Identifier, "do"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::Spaces, " "), - (Type::Identifier, "v0"), - (Type::Punct, "="), - (Type::Number, "0"), - (Type::Spaces, " "), - (Type::Identifier, "thru"), - (Type::Spaces, " "), - (Type::Number, "5"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - ]; - for i in 1..N { - expect_output.push((Type::DoRepeatCommand, &do_repeat[i].trim_end())); - if i >= 255 { - expect_output.push((Type::DoRepeatOverflow, "")); - } - expect_output.push((Type::Newline, "\n")); - } - for i in 0..254 { - expect_output.push((Type::DoRepeatCommand, &end_repeat[i].trim_end())); - expect_output.push((Type::Newline, "\n")); - } - let comments: Vec = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect(); - for comment in &comments { - expect_output.extend([ - (Type::Identifier, "end"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::EndCommand, "."), - (Type::Spaces, " "), - (Type::Comment, comment), - (Type::Newline, "\n"), - ]); - } - expect_output.push((Type::End, "")); - - let expect_prompts: Vec<_> = (0..N * 2 - 3) - .map(|_| PromptStyle::DoRepeat) - .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First]) - .collect(); - check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts); - } - - #[test] - fn test_do_repeat_batch() { - check_segmentation( - r#"do repeat x=a b c - y=d e f -do repeat a=1 thru 5 -another command -second command -+ third command -end /* x */ /* y */ repeat print -end - repeat -do - repeat #a=1 - - inner command -end repeat -"#, - Mode::Batch, - &[ - (Type::Identifier, "do"), - (Type::Spaces, 
" "), - (Type::Identifier, "repeat"), - (Type::Spaces, " "), - (Type::Identifier, "x"), - (Type::Punct, "="), - (Type::Identifier, "a"), - (Type::Spaces, " "), - (Type::Identifier, "b"), - (Type::Spaces, " "), - (Type::Identifier, "c"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "y"), - (Type::Punct, "="), - (Type::Identifier, "d"), - (Type::Spaces, " "), - (Type::Identifier, "e"), - (Type::Spaces, " "), - (Type::Identifier, "f"), - (Type::Newline, "\n"), - (Type::StartCommand, ""), - (Type::DoRepeatCommand, "do repeat a=1 thru 5"), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, "another command"), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, "second command"), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, "+ third command"), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print"), - (Type::Newline, "\n"), - (Type::Identifier, "end"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::Newline, "\n"), - (Type::StartCommand, ""), - (Type::Identifier, "do"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::Spaces, " "), - (Type::Identifier, "#a"), - (Type::Punct, "="), - (Type::Number, "1"), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::DoRepeatCommand, " inner command"), - (Type::Newline, "\n"), - (Type::Identifier, "end"), - (Type::Spaces, " "), - (Type::Identifier, "repeat"), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::Later, - ], - ); - } - - mod define { - use crate::{ - lex::segment::{Mode, Type}, - prompt::PromptStyle, - }; - - use 
super::check_segmentation; - - #[test] - fn test_simple() { - check_segmentation( - r#"define !macro1() -var1 var2 var3 "!enddefine" -!enddefine. -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::MacroBody, "var1 var2 var3 \"!enddefine\""), - (Type::Newline, "\n"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_no_newline_after_parentheses() { - check_segmentation( - r#"define !macro1() var1 var2 var3 /* !enddefine -!enddefine. -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::MacroBody, " var1 var2 var3 /* !enddefine"), - (Type::Newline, "\n"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_no_newline_before_enddefine() { - check_segmentation( - r#"define !macro1() -var1 var2 var3!enddefine. -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::MacroBody, "var1 var2 var3"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_all_on_one_line() { - check_segmentation( - r#"define !macro1()var1 var2 var3!enddefine. 
-"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::MacroBody, "var1 var2 var3"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::First], - ); - } - - #[test] - fn test_empty() { - check_segmentation( - r#"define !macro1() -!enddefine. -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_blank_lines() { - check_segmentation( - r#"define !macro1() - - -!enddefine. -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::MacroBody, ""), - (Type::Newline, "\n"), - (Type::MacroBody, ""), - (Type::Newline, "\n"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_arguments() { - check_segmentation( - r#"define !macro1(a(), b(), c()) -!enddefine. 
-"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Identifier, "a"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Punct, ","), - (Type::Spaces, " "), - (Type::Identifier, "b"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Punct, ","), - (Type::Spaces, " "), - (Type::Identifier, "c"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_multiline_arguments() { - check_segmentation( - r#"define !macro1( - a(), b( - ), - c() -) -!enddefine. -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "a"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Punct, ","), - (Type::Spaces, " "), - (Type::Identifier, "b"), - (Type::Punct, "("), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Punct, ")"), - (Type::Punct, ","), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "c"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Define, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_arguments_start_on_second_line() { - check_segmentation( - r#"define !macro1 -(x,y,z -) -content 1 -content 2 -!enddefine. 
-"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Newline, "\n"), - (Type::Punct, "("), - (Type::Identifier, "x"), - (Type::Punct, ","), - (Type::Identifier, "y"), - (Type::Punct, ","), - (Type::Identifier, "z"), - (Type::Newline, "\n"), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::MacroBody, "content 1"), - (Type::Newline, "\n"), - (Type::MacroBody, "content 2"), - (Type::Newline, "\n"), - (Type::MacroId, "!enddefine"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_early_end_of_command_1() { - check_segmentation( - r#"define !macro1. -data list /x 1. -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "data"), - (Type::Spaces, " "), - (Type::Identifier, "list"), - (Type::Spaces, " "), - (Type::Punct, "/"), - (Type::Identifier, "x"), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::First, PromptStyle::First], - ); - } - - #[test] - fn test_early_end_of_command_2() { - check_segmentation( - r#"define !macro1 -x. -data list /x 1. 
-"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Newline, "\n"), - (Type::Identifier, "x"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "data"), - (Type::Spaces, " "), - (Type::Identifier, "list"), - (Type::Spaces, " "), - (Type::Punct, "/"), - (Type::Identifier, "x"), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::Later, PromptStyle::First, PromptStyle::First], - ); - } - - #[test] - fn test_early_end_of_command_3() { - check_segmentation( - r#"define !macro1(. -x. -data list /x 1. -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "x"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "data"), - (Type::Spaces, " "), - (Type::Identifier, "list"), - (Type::Spaces, " "), - (Type::Punct, "/"), - (Type::Identifier, "x"), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::First, PromptStyle::First, PromptStyle::First], - ); - } - - #[test] - fn test_early_end_of_command_4() { - // Notice the command terminator at the end of the `DEFINE` command, - // which should not be there and ends it early. - check_segmentation( - r#"define !macro1. -data list /x 1. 
-"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "data"), - (Type::Spaces, " "), - (Type::Identifier, "list"), - (Type::Spaces, " "), - (Type::Punct, "/"), - (Type::Identifier, "x"), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::First, PromptStyle::First], - ); - } - - #[test] - fn test_missing_enddefine() { - check_segmentation( - r#"define !macro1() -content line 1 -content line 2 -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::MacroBody, "content line 1"), - (Type::Newline, "\n"), - (Type::MacroBody, "content line 2"), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::Define, - ], - ); - } - - #[test] - fn test_missing_enddefine_2() { - check_segmentation( - r#"define !macro1() -"#, - Mode::Interactive, - &[ - (Type::Identifier, "define"), - (Type::Spaces, " "), - (Type::MacroName, "!macro1"), - (Type::Punct, "("), - (Type::Punct, ")"), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[PromptStyle::Define], - ); - } - } - - #[test] - fn test_batch_mode() { - check_segmentation( - r#"first command - another line of first command -+ second command -third command - -fourth command. - fifth command. 
-"#, - Mode::Batch, - &[ - (Type::Identifier, "first"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "another"), - (Type::Spaces, " "), - (Type::Identifier, "line"), - (Type::Spaces, " "), - (Type::Identifier, "of"), - (Type::Spaces, " "), - (Type::Identifier, "first"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::StartCommand, "+"), - (Type::Spaces, " "), - (Type::Identifier, "second"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::StartCommand, ""), - (Type::Identifier, "third"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::Identifier, "fourth"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "fifth"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_auto_mode() { - check_segmentation( - r#"command - another line of command -2sls -+ another command -another line of second command -data list /x 1 -aggregate. -print eject. -twostep cluster - - -fourth command. - fifth command. 
-"#, - Mode::Auto, - &[ - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "another"), - (Type::Spaces, " "), - (Type::Identifier, "line"), - (Type::Spaces, " "), - (Type::Identifier, "of"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::StartCommand, ""), - (Type::Number, "2"), - (Type::Identifier, "sls"), - (Type::Newline, "\n"), - (Type::StartCommand, "+"), - (Type::Spaces, " "), - (Type::Identifier, "another"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::Identifier, "another"), - (Type::Spaces, " "), - (Type::Identifier, "line"), - (Type::Spaces, " "), - (Type::Identifier, "of"), - (Type::Spaces, " "), - (Type::Identifier, "second"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::Newline, "\n"), - (Type::StartCommand, ""), - (Type::Identifier, "data"), - (Type::Spaces, " "), - (Type::Identifier, "list"), - (Type::Spaces, " "), - (Type::Punct, "/"), - (Type::Identifier, "x"), - (Type::Spaces, " "), - (Type::Number, "1"), - (Type::Newline, "\n"), - (Type::StartCommand, ""), - (Type::Identifier, "aggregate"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "print"), - (Type::Spaces, " "), - (Type::Identifier, "eject"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Identifier, "twostep"), - (Type::Spaces, " "), - (Type::Identifier, "cluster"), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::SeparateCommands, ""), - (Type::Newline, "\n"), - (Type::Identifier, "fourth"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::Spaces, " "), - (Type::Identifier, "fifth"), - (Type::Spaces, " "), - (Type::Identifier, "command"), - (Type::EndCommand, "."), - (Type::Newline, "\n"), - (Type::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - 
PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - ], - ); - } -} diff --git a/rust/src/lex/segment/mod.rs b/rust/src/lex/segment/mod.rs new file mode 100644 index 0000000000..6bf30ba2e5 --- /dev/null +++ b/rust/src/lex/segment/mod.rs @@ -0,0 +1,1322 @@ +//! Syntax segmentation. +//! +//! PSPP divides traditional "lexical analysis" or "tokenization" into two +//! phases: a lower-level phase called "segmentation" and a higher-level phase +//! called "scanning". This module implements the segmentation phase. +//! [`super::scan`] contains declarations for the scanning phase. +//! +//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label +//! (a segment type) for each byte or contiguous sequence of bytes in the input. +//! It also, in a few corner cases, outputs zero-width segments that label the +//! boundary between a pair of bytes in the input. +//! +//! Some segment types correspond directly to tokens; for example, an +//! "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID) +//! later in lexical analysis. Other segments contribute to tokens but do not +//! correspond directly; for example, multiple quoted string segments +//! (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators +//! (SEG_PUNCT) may be combined to form a single string token (T_STRING). Still +//! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior +//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE). + +use crate::{ + identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar}, + prompt::PromptStyle, +}; +use bitflags::bitflags; + +use super::command_name::{command_match, COMMAND_NAMES}; + +/// Segmentation mode. 
+/// +/// PSPP syntax is written in one of two modes which are broadly defined as +/// follows: +/// +/// - In interactive mode, commands end with a period at the end of the line +/// or with a blank line. +/// +/// - In batch mode, the second and subsequent lines of a command are indented +/// from the left margin. +/// +/// The segmenter can also try to automatically detect the mode in use, using a +/// heuristic that is usually correct. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)] +pub enum Mode { + /// Try to interpret input correctly regardless of whether it is written + /// for interactive or batch mode. + #[default] + Auto, + + /// Interactive syntax mode. + Interactive, + + /// Batch syntax mode. + Batch, +} + +/// The type of a segment. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Type { + Number, + QuotedString, + HexString, + UnicodeString, + UnquotedString, + ReservedWord, + Identifier, + Punct, + Shbang, + Spaces, + Comment, + Newline, + CommentCommand, + DoRepeatCommand, + DoRepeatOverflow, + InlineData, + MacroId, + MacroName, + MacroBody, + StartDocument, + Document, + StartCommand, + SeparateCommands, + EndCommand, + End, + ExpectedQuote, + ExpectedExponent, + UnexpectedChar, +} + +bitflags! { + #[derive(Copy, Clone, Debug)] + pub struct Substate: u8 { + const START_OF_LINE = 1; + const START_OF_COMMAND = 2; + } +} + +#[derive(Copy, Clone)] +pub struct Segmenter { + state: (State, Substate), + nest: u8, + mode: Mode, +} + +#[derive(Copy, Clone, Debug)] +pub struct Incomplete; + +impl Segmenter { + /// Returns a segmenter with the given syntax `mode`. + /// + /// If `is_snippet` is false, then the segmenter will parse as if it's being + /// given a whole file. This means, for example, that it will interpret `-` + /// or `+` at the beginning of the syntax as a separator between commands + /// (since `-` or `+` at the beginning of a line has this meaning). 
+ /// + /// If `is_snippet` is true, then the segmenter will parse as if it's being + /// given an isolated piece of syntax. This means that, for example, that + /// it will interpret `-` or `+` at the beginning of the syntax as an + /// operator token or (if followed by a digit) as part of a number. + pub fn new(mode: Mode, is_snippet: bool) -> Self { + Self { + state: if is_snippet { + (State::General, Substate::empty()) + } else { + (State::Shbang, Substate::empty()) + }, + mode, + nest: 0, + } + } + + pub fn mode(&self) -> Mode { + self.mode + } + + fn start_of_line(&self) -> bool { + self.state.1.contains(Substate::START_OF_LINE) + } + + fn start_of_command(&self) -> bool { + self.state.1.contains(Substate::START_OF_COMMAND) + } + + /// Returns the style of command prompt to display to an interactive user + /// for input in the current state.. The return value is most accurate in + /// mode `Mode::Interactive` and at the beginning of a line (that is, if + /// [`Segmenter::push`] consumed as much as possible of the input up to a + /// new-line). 
+ pub fn prompt(&self) -> PromptStyle { + match self.state.0 { + State::Shbang => PromptStyle::First, + State::General => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::Comment1 | State::Comment2 => PromptStyle::Comment, + State::Document1 | State::Document2 => PromptStyle::Document, + State::Document3 => PromptStyle::First, + State::FileLabel1 => PromptStyle::Later, + State::FileLabel2 | State::FileLabel3 => PromptStyle::First, + State::DoRepeat1 | State::DoRepeat2 => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::DoRepeat3 => PromptStyle::DoRepeat, + State::DoRepeat4 => PromptStyle::DoRepeat, + State::Define1 | State::Define2 | State::Define3 => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define, + State::BeginData1 => PromptStyle::First, + State::BeginData2 => PromptStyle::Later, + State::BeginData3 | State::BeginData4 => PromptStyle::Data, + } + } + + /// Attempts to label a prefix of the remaining input with a segment type. + /// The caller supplies a prefix of the remaining input as `input`. If + /// `eof` is true, then `input` is the entire (remainder) of the input; if + /// `eof` is false, then further input is potentially available. + /// + /// The input may contain '\n' or '\r\n' line ends in any combination. + /// + /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes + /// in the segment at the beginning of `input` (a number in + /// `0..=input.len()`) and the type of that segment. The next call should + /// not include those bytes in `input`, because they have (figuratively) + /// been consumed by the segmenter. + /// + /// Segments can have zero length, including segment types `Type::End`, + /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and + /// `Type::Spaces`. 
+ /// + /// Failure occurs only if the segment type of the bytes in `input` cannot + /// yet be determined. In this case, this function returns `Err(Incomplete)`. If + /// more input is available, the caller should obtain some more, then call + /// again with a longer `input`. If this is not enough, the process might + /// need to repeat again and again. If input is exhausted, then the caller + /// may call again setting `eof` to true. This function will never return + /// `Err(Incomplete)` when `eof` is true. + /// + /// The caller must not, in a sequence of calls, supply contradictory input. + /// That is, bytes provided as part of `input` in one call, but not + /// consumed, must not be provided with *different* values on subsequent + /// calls. This is because the function must often make decisions based on + /// looking ahead beyond the bytes that it consumes. + pub fn push<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { + if input.is_empty() { + if eof { + return Ok((input, Type::End)); + } else { + return Err(Incomplete); + }; + } + + match self.state.0 { + State::Shbang => return self.parse_shbang(input, eof), + State::General => { + if self.start_of_line() { + self.parse_start_of_line(input, eof) + } else { + self.parse_mid_line(input, eof) + } + } + State::Comment1 => self.parse_comment_1(input, eof), + State::Comment2 => self.parse_comment_2(input, eof), + State::Document1 => self.parse_document_1(input, eof), + State::Document2 => self.parse_document_2(input, eof), + State::Document3 => self.parse_document_3(input, eof), + State::FileLabel1 => self.parse_file_label_1(input, eof), + State::FileLabel2 => self.parse_file_label_2(input, eof), + State::FileLabel3 => self.parse_file_label_3(input, eof), + State::DoRepeat1 => self.parse_do_repeat_1(input, eof), + State::DoRepeat2 => self.parse_do_repeat_2(input, eof), + State::DoRepeat3 => self.parse_do_repeat_3(input, eof), + State::DoRepeat4 => 
self.parse_do_repeat_4(input), + State::Define1 => self.parse_define_1_2(input, eof), + State::Define2 => self.parse_define_1_2(input, eof), + State::Define3 => self.parse_define_3(input, eof), + State::Define4 => self.parse_define_4_5(input, eof), + State::Define5 => self.parse_define_4_5(input, eof), + State::Define6 => self.parse_define_6(input, eof), + State::BeginData1 => self.parse_begin_data_1(input, eof), + State::BeginData2 => self.parse_begin_data_2(input, eof), + State::BeginData3 => self.parse_begin_data_3(input, eof), + State::BeginData4 => self.parse_begin_data_4(input, eof), + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum State { + Shbang, + General, + Comment1, + Comment2, + Document1, + Document2, + Document3, + FileLabel1, + FileLabel2, + FileLabel3, + DoRepeat1, + DoRepeat2, + DoRepeat3, + DoRepeat4, + Define1, + Define2, + Define3, + Define4, + Define5, + Define6, + BeginData1, + BeginData2, + BeginData3, + BeginData4, +} + +fn take(input: &str, eof: bool) -> Result<(Option, &str), Incomplete> { + let mut iter = input.chars(); + match iter.next() { + None if !eof => Err(Incomplete), + c => Ok((c, iter.as_str())), + } +} + +fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input), + '*' => { + if let (Some('/'), rest) = take(rest, eof)? { + return Ok(rest); + } + } + _ => (), + }; + input = rest; + } +} + +fn skip_matching(f: F, input: &str, eof: bool) -> Result<&str, Incomplete> +where + F: Fn(char) -> bool, +{ + let input = input.trim_start_matches(f); + if input.is_empty() && !eof { + Err(Incomplete) + } else { + Ok(input) + } +} + +fn match_char(f: F, input: &str, eof: bool) -> Result, Incomplete> +where + F: Fn(char) -> bool, +{ + if let (Some(c), rest) = take(input, eof)? 
{ + if f(c) { + return Ok(Some(rest)); + } + } + Ok(None) +} + +fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), + c if c.is_whitespace() => (), + _ => return Ok(input), + } + input = rest; + } +} + +fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> { + skip_matching(|c| c.is_ascii_digit(), input, eof) +} + +fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '/' => { + let (c, rest2) = take(rest, eof)?; + match c { + Some('*') => input = skip_comment(rest2, eof)?, + Some(_) | None => return Ok(rest), + } + } + '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), + c if c.is_whitespace() => input = rest, + _ => return Ok(input), + }; + } +} + +fn is_start_of_string(input: &str, eof: bool) -> Result { + let (Some(c), _rest) = take(input, eof)? else { + return Ok(false); + }; + match c { + 'x' | 'X' | 'u' | 'U' => Ok({ + let (c, _rest) = take(input, eof)?; + c == Some('\'') || c == Some('"') + }), + '\'' | '"' => Ok(true), + '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true), + _ => Ok(false), + } +} + +fn is_end_of_line(input: &str, eof: bool) -> Result { + let (Some(c), rest) = take(input, eof)? 
else { + return Ok(true); + }; + Ok(match c { + '\n' => true, + '\r' => take(rest, eof)?.0 == Some('\n'), + _ => false, + }) +} + +fn at_end_of_line(input: &str, eof: bool) -> Result { + is_end_of_line(skip_spaces_and_comments(input, eof)?, eof) +} + +fn first(s: &str) -> char { + s.chars().next().unwrap() +} +fn get_command_name_candidates(target: &str) -> &[&'static str] { + if target.is_empty() { + return &[]; + } + let target_first = first(target).to_ascii_uppercase(); + let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first); + let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first); + &COMMAND_NAMES[low..high] +} + +fn detect_command_name(input: &str, eof: bool) -> Result { + let command_name = input + .split(|c: char| { + !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-') + }) + .next() + .unwrap(); + if !eof && command_name.len() == input.len() { + return Err(Incomplete); + } + let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.'); + for command in get_command_name_candidates(command_name) { + if let Some(m) = command_match(command, command_name) { + if m.missing_words <= 0 { + return Ok(true); + } + } + } + Ok(false) +} + +impl Segmenter { + fn parse_shbang<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + if let (Some('#'), rest) = take(input, eof)? { + if let (Some('!'), rest) = take(rest, eof)? 
{ + let rest = self.parse_full_line(rest, eof)?; + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((rest, Type::Shbang)); + } + } + + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + self.push(input, eof) + } + fn at_command_start(&self, input: &str, eof: bool) -> Result { + match self.mode { + Mode::Auto => detect_command_name(input, eof), + Mode::Interactive => Ok(false), + Mode::Batch => Ok(true), + } + } + fn parse_start_of_line<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + debug_assert_eq!(self.state.0, State::General); + debug_assert!(self.start_of_line()); + debug_assert!(!input.is_empty()); + + let (Some(c), rest) = take(input, eof).unwrap() else { + unreachable!() + }; + match c { + '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => { + // This `+` is punctuation that may separate pieces of a string. + self.state = (State::General, Substate::empty()); + return Ok((rest, Type::Punct)); + } + '+' | '-' | '.' => { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((rest, Type::StartCommand)); + } + _ if c.is_whitespace() => { + if at_end_of_line(input, eof)? { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Type::SeparateCommands)); + } + } + _ => { + if self.at_command_start(input, eof)? + && !self.state.1.contains(Substate::START_OF_COMMAND) + { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Type::StartCommand)); + } + } + } + self.state.1 = Substate::START_OF_COMMAND; + self.parse_mid_line(input, eof) + } + fn parse_mid_line<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + debug_assert!(self.state.0 == State::General); + debug_assert!(!self.state.1.contains(Substate::START_OF_LINE)); + let (Some(c), rest) = take(input, eof)? 
else { + unreachable!() + }; + match c { + '\r' | '\n' if is_end_of_line(input, eof)? => { + self.state.1 |= Substate::START_OF_LINE; + Ok(( + self.parse_newline(input, eof).unwrap().unwrap(), + Type::Newline, + )) + } + '/' => { + if let (Some('*'), rest) = take(rest, eof)? { + let rest = skip_comment(rest, eof)?; + return Ok((rest, Type::Comment)); + } else { + self.state.1 = Substate::empty(); + return Ok((rest, Type::Punct)); + } + } + '-' => { + let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?; + match c { + Some(c) if c.is_ascii_digit() => { + return self.parse_number(rest, eof); + } + Some('.') => { + if let (Some(c), _rest) = take(rest2, eof)? { + if c.is_ascii_digit() { + return self.parse_number(rest, eof); + } + } + } + None | Some(_) => (), + } + self.state.1 = Substate::empty(); + return Ok((rest, Type::Punct)); + } + '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => { + self.state.1 = Substate::empty(); + return Ok((rest, Type::Punct)); + } + '*' => { + if self.state.1.contains(Substate::START_OF_COMMAND) { + self.state.0 = State::Comment1; + self.parse_comment_1(input, eof) + } else { + self.parse_digraph(&['*'], rest, eof) + } + } + '<' => self.parse_digraph(&['=', '>'], rest, eof), + '>' => self.parse_digraph(&['='], rest, eof), + '~' => self.parse_digraph(&['='], rest, eof), + '.' if at_end_of_line(rest, eof)? => { + self.state.1 = Substate::START_OF_COMMAND; + Ok((rest, Type::EndCommand)) + } + '.' => match take(rest, eof)? { + (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof), + _ => Ok((rest, Type::Punct)), + }, + '0'..='9' => self.parse_number(input, eof), + 'u' | 'U' => self.maybe_parse_string(Type::UnicodeString, (input, rest), eof), + 'x' | 'X' => self.maybe_parse_string(Type::HexString, (input, rest), eof), + '\'' | '"' => self.parse_string(Type::QuotedString, c, rest, eof), + '!' 
=> { + let (c, rest2) = take(rest, eof)?; + match c { + Some('*') => Ok((rest2, Type::MacroId)), + Some(_) => self.parse_id(input, eof), + None => Ok((rest, Type::Punct)), + } + } + c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Type::Spaces)), + c if c.may_start_id() => self.parse_id(input, eof), + '!'..='~' if c != '\\' && c != '^' => { + self.state.1 = Substate::empty(); + Ok((rest, Type::Punct)) + } + _ => { + self.state.1 = Substate::empty(); + Ok((rest, Type::UnexpectedChar)) + } + } + } + fn parse_string<'a>( + &mut self, + type_: Type, + quote: char, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + println!("{quote:?} {input:?}"); + while let (Some(c), rest) = take(input, eof)? { + match c { + _ if c == quote => { + let (c, rest2) = take(rest, eof)?; + if c != Some(quote) { + self.state.1 = Substate::empty(); + return Ok((rest, type_)); + } + input = rest2; + } + '\r' | '\n' if is_end_of_line(input, eof)? => break, + _ => input = rest, + } + } + self.state.1 = Substate::empty(); + Ok((input, Type::ExpectedQuote)) + } + fn maybe_parse_string<'a>( + &mut self, + type_: Type, + input: (&'a str, &'a str), + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + match take(input.1, eof)? 
{ + (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(type_, c, rest, eof), + _ => self.parse_id(input.0, eof), + } + } + fn next_id_in_command<'a>( + &self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, &'a str), Incomplete> { + let mut sub = Segmenter::new(self.mode, true); + loop { + let (rest, type_) = sub.push(input, eof)?; + match type_ { + Type::Shbang | Type::Spaces | Type::Comment | Type::Newline => (), + + Type::Identifier => return Ok((&input[..input.len() - rest.len()], rest)), + + Type::Number + | Type::QuotedString + | Type::HexString + | Type::UnicodeString + | Type::UnquotedString + | Type::ReservedWord + | Type::Punct + | Type::CommentCommand + | Type::DoRepeatCommand + | Type::DoRepeatOverflow + | Type::InlineData + | Type::MacroId + | Type::MacroName + | Type::MacroBody + | Type::StartDocument + | Type::Document + | Type::StartCommand + | Type::SeparateCommands + | Type::EndCommand + | Type::End + | Type::ExpectedQuote + | Type::ExpectedExponent + | Type::UnexpectedChar => return Ok(("", rest)), + } + input = rest; + } + } + fn parse_id<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { + let (Some(_), mut end) = take(input, eof).unwrap() else { + unreachable!() + }; + while let (Some(c), rest) = take(end, eof)? { + if !c.may_continue_id() { + break; + }; + end = rest; + } + let identifier = &input[..input.len() - end.len()]; + let identifier = match identifier.strip_suffix('.') { + Some(without_dot) if at_end_of_line(end, eof)? 
=> without_dot, + _ => identifier, + }; + let rest = &input[identifier.len()..]; + + if self.state.1.contains(Substate::START_OF_COMMAND) { + if id_match_n("COMMENT", identifier, 4) { + self.state.0 = State::Comment1; + return self.parse_comment_1(input, eof); + } else if id_match("DOCUMENT", identifier) { + self.state.0 = State::Document1; + return Ok((input, Type::StartDocument)); + } else if id_match_n("DEFINE", identifier, 6) { + self.state.0 = State::Define1; + } else if id_match("FILE", identifier) { + if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) { + self.state = (State::FileLabel1, Substate::empty()); + return Ok((rest, Type::Identifier)); + } + } else if id_match("DO", identifier) { + if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) { + self.state = (State::DoRepeat1, Substate::empty()); + return Ok((rest, Type::Identifier)); + } + } else if id_match("BEGIN", identifier) { + let (next_id, rest2) = self.next_id_in_command(rest, eof)?; + if id_match("DATA", next_id) { + let rest2 = skip_spaces_and_comments(rest2, eof)?; + let rest2 = if let Some(s) = rest2.strip_prefix('.') { + skip_spaces_and_comments(s, eof)? + } else { + rest2 + }; + if is_end_of_line(rest2, eof)? 
{ + let s = &input[..input.len() - rest2.len()]; + self.state = ( + if s.contains('\n') { + State::BeginData1 + } else { + State::BeginData2 + }, + Substate::empty(), + ); + return Ok((rest, Type::Identifier)); + } + } + } + } + + self.state.1 = Substate::empty(); + let type_ = if is_reserved_word(identifier) { + Type::ReservedWord + } else if identifier.starts_with('!') { + Type::MacroId + } else { + Type::Identifier + }; + Ok((rest, type_)) + } + fn parse_digraph<'a>( + &mut self, + seconds: &[char], + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let (c, rest) = take(input, eof)?; + self.state.1 = Substate::empty(); + Ok(( + match c { + Some(c) if seconds.contains(&c) => rest, + _ => input, + }, + Type::Punct, + )) + } + fn parse_number<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let mut input = skip_digits(input, eof)?; + if let Some(rest) = match_char(|c| c == '.', input, eof)? { + let rest2 = skip_digits(rest, eof)?; + if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? { + input = rest2; + } + }; + if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? { + let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest); + let rest2 = skip_digits(rest, eof)?; + if rest2.len() == rest.len() { + self.state.1 = Substate::empty(); + return Ok((rest, Type::ExpectedExponent)); + } + input = rest2; + } + self.state.1 = Substate::empty(); + Ok((input, Type::Number)) + } + fn parse_comment_1<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + enum CommentState<'a> { + Blank, + NotBlank, + Period(&'a str), + } + let mut state = CommentState::Blank; + loop { + let (Some(c), rest) = take(input, eof)? else { + // End of file. + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Type::SeparateCommands)); + }; + match c { + '.' 
=> state = CommentState::Period(input), + '\n' | '\r' if is_end_of_line(input, eof)? => { + match state { + CommentState::Blank => { + // Blank line ends comment command. + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Type::SeparateCommands)); + } + CommentState::Period(period) => { + // '.' at end of line ends comment command. + self.state = (State::General, Substate::empty()); + return Ok((period, Type::CommentCommand)); + } + CommentState::NotBlank => { + // Comment continues onto next line. + self.state = (State::Comment2, Substate::empty()); + return Ok((input, Type::CommentCommand)); + } + } + } + c if c.is_whitespace() => (), + _ => state = CommentState::NotBlank, + } + input = rest; + } + } + fn parse_comment_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + + let new_command = match take(rest, eof)?.0 { + Some('+') | Some('-') | Some('.') => true, + Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?, + None | Some(_) => false, + }; + if new_command { + self.state = ( + State::General, + Substate::START_OF_LINE | Substate::START_OF_COMMAND, + ); + } else { + self.state.0 = State::Comment1; + } + Ok((rest, Type::Newline)) + } + fn parse_document_1<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let mut end_cmd = false; + loop { + let (Some(c), rest) = take(input, eof)? else { + self.state.0 = State::Document3; + return Ok((input, Type::Document)); + }; + match c { + '.' => end_cmd = true, + '\n' | '\r' if is_end_of_line(input, eof)? 
=> { + self.state.0 = if end_cmd { + State::Document3 + } else { + State::Document2 + }; + return Ok((input, Type::Document)); + } + c if !c.is_whitespace() => end_cmd = false, + _ => (), + } + input = rest; + } + } + fn parse_document_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + self.state.0 = State::Document1; + Ok((rest, Type::Newline)) + } + fn parse_document_3<'a>( + &mut self, + input: &'a str, + _eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + Ok((input, Type::EndCommand)) + } + fn quoted_file_label(input: &str, eof: bool) -> Result { + let input = skip_spaces_and_comments(input, eof)?; + match take(input, eof)?.0 { + Some('\'') | Some('"') | Some('\n') => Ok(true), + _ => Ok(false), + } + } + fn parse_file_label_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let mut sub = Segmenter { + state: (State::General, self.state.1), + ..*self + }; + let (rest, type_) = sub.push(input, eof)?; + if type_ == Type::Identifier { + let id = &input[..input.len() - rest.len()]; + debug_assert!(id_match("LABEL", id), "{id} should be LABEL"); + if Self::quoted_file_label(rest, eof)? { + *self = sub; + } else { + self.state.0 = State::FileLabel2; + } + } else { + self.state.1 = sub.state.1; + } + Ok((rest, type_)) + } + fn parse_file_label_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let input = skip_spaces(input, eof)?; + self.state.0 = State::FileLabel3; + Ok((input, Type::Spaces)) + } + fn parse_file_label_3<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let mut end_cmd = None; + loop { + let (c, rest) = take(input, eof)?; + match c { + None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? 
=> { + self.state = (State::General, Substate::empty()); + return Ok((end_cmd.unwrap_or(input), Type::UnquotedString)); + } + None => unreachable!(), + Some('.') => end_cmd = Some(input), + Some(c) if !c.is_whitespace() => end_cmd = None, + Some(_) => (), + } + input = rest; + } + } + fn subparse<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { + let mut sub = Segmenter { + mode: self.mode, + state: (State::General, self.state.1), + nest: 0, + }; + let result = sub.push(input, eof)?; + self.state.1 = sub.state.1; + Ok(result) + } + /// We are segmenting a `DO REPEAT` command, currently reading the syntax + /// that defines the stand-in variables (the head) before the lines of + /// syntax to be repeated (the body). + fn parse_do_repeat_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let (rest, type_) = self.subparse(input, eof)?; + if type_ == Type::SeparateCommands { + // We reached a blank line that separates the head from the body. + self.state.0 = State::DoRepeat2; + } else if type_ == Type::EndCommand || type_ == Type::StartCommand { + // We reached the body. + self.state.0 = State::DoRepeat3; + self.nest = 1; + } + Ok((rest, type_)) + } + /// We are segmenting a `DO REPEAT` command, currently reading a blank line + /// that separates the head from the body. + fn parse_do_repeat_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let (rest, type_) = self.subparse(input, eof)?; + if type_ == Type::Newline { + // We reached the body. + self.state.0 = State::DoRepeat3; + self.nest = 1; + } + Ok((rest, type_)) + } + fn parse_newline<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (Some(c), rest) = take(input, eof)? else { + return Ok(None); + }; + match c { + '\n' => Ok(Some(rest)), + '\r' => { + if let (Some('\n'), rest) = take(rest, eof)? 
{ + Ok(Some(rest)) + } else { + Ok(None) + } + } + _ => Ok(None), + } + } + + fn parse_full_line<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<&'a str, Incomplete> { + loop { + if is_end_of_line(input, eof)? { + return Ok(input); + } + input = take(input, eof).unwrap().1; + } + } + fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result { + let input = input.strip_prefix(&['-', '+']).unwrap_or(input); + let (id1, input) = self.next_id_in_command(input, eof)?; + if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) { + Ok(1) + } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) + { + Ok(-1) + } else { + Ok(0) + } + } + /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that + /// are to be repeated. Report each line of syntax as a single + /// [`Type::DoRepeatCommand`]. + /// + /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT` + /// blocks inside the lines we're segmenting. `self.nest` counts the + /// nesting level, starting at 1. + fn parse_do_repeat_3<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + if let Some(rest) = self.parse_newline(input, eof)? { + return Ok((rest, Type::Newline)); + } + let rest = self.parse_full_line(input, eof)?; + let direction = self.check_repeat_command(input, eof)?; + if direction > 0 { + if let Some(nest) = self.nest.checked_add(1) { + self.nest = nest; + } else { + self.state.0 = State::DoRepeat4; + } + } else if direction < 0 { + self.nest -= 1; + if self.nest == 0 { + // Nesting level dropped to 0, so we've finished reading the `DO + // REPEAT` body. 
+ self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + return self.push(input, eof); + } + } + return Ok((rest, Type::DoRepeatCommand)); + } + fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Type), Incomplete> { + self.state.0 = State::DoRepeat3; + Ok((input, Type::DoRepeatOverflow)) + } + /// We are segmenting a `DEFINE` command, which consists of: + /// + /// - The `DEFINE` keyword. + /// + /// - An identifier. We transform this into `Type::MacroName` instead of + /// `Type::Identifier` or `Type::MacroId` because this identifier must + /// never be macro-expanded. + /// + /// - Anything but `(`. + /// + /// - `(` followed by a sequence of tokens possibly including balanced + /// parentheses up to a final `)`. + /// + /// - A sequence of any number of lines, one string per line, ending with + /// `!ENDDEFINE`. The first line is usually blank (that is, a newline + /// follows the `(`). The last line usually just has `!ENDDEFINE.` on + /// it, but it can start with other tokens. The whole + /// DEFINE...!ENDDEFINE can be on a single line, even. + fn parse_define_1_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let (rest, type_) = self.subparse(input, eof)?; + match type_ { + Type::Identifier | Type::MacroId if self.state.0 == State::Define1 => { + self.state.0 = State::Define2; + return Ok((rest, Type::MacroName)); + } + Type::SeparateCommands | Type::EndCommand | Type::StartCommand => { + // The DEFINE command is malformed because we reached its end + // without ever hitting a `(` token. Transition back to general + // parsing. 
+ self.state.0 = State::General; + } + Type::Punct if input.starts_with('(') => { + self.state.0 = State::Define3; + self.nest = 1; + } + _ => (), + } + Ok((rest, type_)) + } + fn parse_define_3<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let (rest, type_) = self.subparse(input, eof)?; + match type_ { + Type::SeparateCommands | Type::EndCommand | Type::StartCommand => { + // The DEFINE command is malformed because we reached its end + // without ever hitting a `(` token. Transition back to general + // parsing. + self.state.0 = State::General; + } + Type::Punct if input.starts_with('(') => { + self.nest += 1; + } + Type::Punct if input.starts_with(')') => { + self.nest -= 1; + if self.nest == 0 { + self.state = (State::Define4, Substate::empty()); + } + } + _ => (), + } + Ok((rest, type_)) + } + fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> { + loop { + input = skip_spaces_and_comments(input, true).unwrap(); + let (Some(c), rest) = take(input, true).unwrap() else { + return None; + }; + match c { + '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => { + return Some(input) + } + '\'' | '"' => { + let index = rest.find(c)?; + input = &rest[index + 1..]; + } + _ => input = rest, + } + } + } + + /// We are in the body of a macro definition, looking for additional lines + /// of the body or `!ENDDEFINE`. + /// + /// In `State::Define4`, we're parsing the first line of the macro body (the + /// same line as the closing parenthesis in the argument definition). In + /// `State::Define5`, we're on a later line. + fn parse_define_4_5<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let rest = self.parse_full_line(input, eof)?; + let line = &input[..input.len() - rest.len()]; + if let Some(end) = Self::find_enddefine(line) { + // Macro ends at the !ENDDEFINE on this line. 
+ self.state = (State::General, Substate::empty()); + let (prefix, rest) = input.split_at(line.len() - end.len()); + if prefix.is_empty() { + // Line starts with `!ENDDEFINE`. + self.push(input, eof) + } else if prefix.trim_start().is_empty() { + // Line starts with spaces followed by `!ENDDEFINE`. + Ok((rest, Type::Spaces)) + } else { + // Line starts with some content followed by `!ENDDEFINE`. + Ok((rest, Type::MacroBody)) + } + } else { + // No `!ENDDEFINE`. We have a full line of macro body. + // + // If the first line of the macro body is blank, we just report it + // as spaces, or not at all if there are no spaces, because it's not + // significant. + // + // However, if it's a later line, we need to report it because blank + // lines can have significance. + let type_ = if self.state.0 == State::Define4 && line.trim_start().is_empty() { + if line.is_empty() { + return self.parse_define_6(input, eof); + } + Type::Spaces + } else { + Type::MacroBody + }; + self.state.0 = State::Define6; + Ok((rest, type_)) + } + } + fn parse_define_6<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + self.state.0 = State::Define5; + Ok((rest, Type::Newline)) + } + fn parse_begin_data_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let (rest, type_) = self.subparse(input, eof)?; + if type_ == Type::Newline { + self.state.0 = State::BeginData2; + } + Ok((rest, type_)) + } + fn parse_begin_data_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Type), Incomplete> { + let (rest, type_) = self.subparse(input, eof)?; + if type_ == Type::Newline { + self.state.0 = State::BeginData3; + } + Ok((rest, type_)) + } + fn is_end_data(line: &str) -> bool { + let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else { + return false; + }; + let (Some(c), rest) = take(rest, true).unwrap() else { + return false; + 
/// Like [`str::strip_prefix`], but compares ASCII letters without regard to
/// case.
///
/// Returns the part of `line` after `pattern` if `line` starts with `pattern`
/// (ignoring ASCII case), and `None` otherwise.  `None` is also returned when
/// `line` is shorter than `pattern` or when `pattern.len()` does not fall on a
/// character boundary in `line`, so this never panics on multibyte input.
fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
    // `get` checks both the length and the character boundary.
    let prefix = line.get(..pattern.len())?;
    prefix
        .eq_ignore_ascii_case(pattern)
        .then(|| &line[pattern.len()..])
}
+ expect_prompts: &[PromptStyle], + one_byte: bool, +) { + let mut segments = Vec::with_capacity(expect_segments.len()); + let mut prompts = Vec::new(); + let mut segmenter = Segmenter::new(mode, false); + loop { + let (rest, type_) = push_segment(&mut segmenter, input, one_byte); + let len = input.len() - rest.len(); + let token = &input[..len]; + segments.push((type_, token)); + match type_ { + Type::End => break, + Type::Newline => prompts.push(segmenter.prompt()), + _ => (), + } + input = rest; + } + + if &segments != expect_segments { + eprintln!("segments differ from expected:"); + let difference = diff::slice(expect_segments, &segments); + for result in difference { + match result { + diff::Result::Left(left) => eprintln!("-{left:?}"), + diff::Result::Both(left, _right) => eprintln!(" {left:?}"), + diff::Result::Right(right) => eprintln!("+{right:?}"), + } + } + panic!(); + } + + if &prompts != expect_prompts { + eprintln!("prompts differ from expected:"); + let difference = diff::slice(expect_prompts, &prompts); + for result in difference { + match result { + diff::Result::Left(left) => eprintln!("-{left:?}"), + diff::Result::Both(left, _right) => eprintln!(" {left:?}"), + diff::Result::Right(right) => eprintln!("+{right:?}"), + } + } + panic!(); + } +} + +fn check_segmentation( + input: &str, + mode: Mode, + expect_segments: &[(Type, &str)], + expect_prompts: &[PromptStyle], +) { + for (one_byte, one_byte_name) in [(false, "full-string"), (true, "byte-by-byte")] { + println!("running {one_byte_name} segmentation test with LF newlines..."); + _check_segmentation(input, mode, expect_segments, expect_prompts, one_byte); + + println!("running {one_byte_name} segmentation test with CRLF newlines..."); + _check_segmentation( + &input.replace('\n', "\r\n"), + mode, + &expect_segments + .iter() + .map(|(type_, s)| match *type_ { + Type::Newline => (Type::Newline, "\r\n"), + _ => (*type_, *s), + }) + .collect::>(), + expect_prompts, + one_byte, + ); + + if let 
Some(input) = input.strip_suffix('\n') { + println!("running {one_byte_name} segmentation test without final newline..."); + let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect(); + assert_eq!(expect_segments.pop(), Some((Type::End, ""))); + assert_eq!(expect_segments.pop(), Some((Type::Newline, "\n"))); + while let Some((Type::SeparateCommands | Type::EndCommand, "")) = expect_segments.last() + { + expect_segments.pop(); + } + expect_segments.push((Type::End, "")); + _check_segmentation( + input, + mode, + &expect_segments, + &expect_prompts[..expect_prompts.len() - 1], + one_byte, + ); + } + } +} + +fn print_segmentation(mut input: &str) { + let mut segmenter = Segmenter::new(Mode::Auto, false); + loop { + let (rest, type_) = segmenter.push(input, true).unwrap(); + let len = input.len() - rest.len(); + let token = &input[..len]; + print!("{type_:?} {token:?}"); + match type_ { + Type::Newline => print!(" ({:?})", segmenter.prompt()), + Type::End => break, + _ => (), + } + println!(); + input = rest; + } +} + +#[test] +fn test_identifiers() { + check_segmentation( + r#"a ab abc abcd !abcd +A AB ABC ABCD !ABCD +aB aBC aBcD !aBcD +$x $y $z !$z +grève Ângstrom poté +#a #b #c ## #d !#d +@efg @ @@. 
@#@ !@ +## # #12345 #.# +f@#_.#6 +GhIjK +.x 1y _z +"#, + Mode::Auto, + &[ + (Type::Identifier, "a"), + (Type::Spaces, " "), + (Type::Identifier, "ab"), + (Type::Spaces, " "), + (Type::Identifier, "abc"), + (Type::Spaces, " "), + (Type::Identifier, "abcd"), + (Type::Spaces, " "), + (Type::MacroId, "!abcd"), + (Type::Newline, "\n"), + (Type::Identifier, "A"), + (Type::Spaces, " "), + (Type::Identifier, "AB"), + (Type::Spaces, " "), + (Type::Identifier, "ABC"), + (Type::Spaces, " "), + (Type::Identifier, "ABCD"), + (Type::Spaces, " "), + (Type::MacroId, "!ABCD"), + (Type::Newline, "\n"), + (Type::Identifier, "aB"), + (Type::Spaces, " "), + (Type::Identifier, "aBC"), + (Type::Spaces, " "), + (Type::Identifier, "aBcD"), + (Type::Spaces, " "), + (Type::MacroId, "!aBcD"), + (Type::Newline, "\n"), + (Type::Identifier, "$x"), + (Type::Spaces, " "), + (Type::Identifier, "$y"), + (Type::Spaces, " "), + (Type::Identifier, "$z"), + (Type::Spaces, " "), + (Type::MacroId, "!$z"), + (Type::Newline, "\n"), + (Type::Identifier, "grève"), + (Type::Spaces, "\u{00a0}"), + (Type::Identifier, "Ângstrom"), + (Type::Spaces, "\u{00a0}"), + (Type::Identifier, "poté"), + (Type::Newline, "\n"), + (Type::Identifier, "#a"), + (Type::Spaces, " "), + (Type::Identifier, "#b"), + (Type::Spaces, " "), + (Type::Identifier, "#c"), + (Type::Spaces, " "), + (Type::Identifier, "##"), + (Type::Spaces, " "), + (Type::Identifier, "#d"), + (Type::Spaces, " "), + (Type::MacroId, "!#d"), + (Type::Newline, "\n"), + (Type::Identifier, "@efg"), + (Type::Spaces, " "), + (Type::Identifier, "@"), + (Type::Spaces, " "), + (Type::Identifier, "@@."), + (Type::Spaces, " "), + (Type::Identifier, "@#@"), + (Type::Spaces, " "), + (Type::MacroId, "!@"), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "##"), + (Type::Spaces, " "), + (Type::Identifier, "#"), + (Type::Spaces, " "), + (Type::Identifier, "#12345"), + (Type::Spaces, " "), + (Type::Identifier, "#.#"), + (Type::Newline, "\n"), + 
(Type::Identifier, "f@#_.#6"), + (Type::Newline, "\n"), + (Type::Identifier, "GhIjK"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::Identifier, "y"), + (Type::Spaces, " "), + (Type::Punct, "_"), + (Type::Identifier, "z"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + ], + ); +} + +#[test] +fn test_identifiers_ending_in_dot() { + check_segmentation( + r#"abcd. abcd. +ABCD. ABCD. +aBcD. aBcD. +$y. $z. あいうえお. +#c. #d.. +@@. @@.... +#.#. +#abcd. +. +. +LMNOP. +QRSTUV./* end of line comment */ +qrstuv. /* end of line comment */ +QrStUv./* end of line comment */ +wxyz./* unterminated end of line comment +WXYZ. /* unterminated end of line comment +WxYz./* unterminated end of line comment +"#, + Mode::Auto, + &[ + (Type::Identifier, "abcd."), + (Type::Spaces, " "), + (Type::Identifier, "abcd"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "ABCD."), + (Type::Spaces, " "), + (Type::Identifier, "ABCD"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "aBcD."), + (Type::Spaces, " "), + (Type::Identifier, "aBcD"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "$y."), + (Type::Spaces, " "), + (Type::Identifier, "$z."), + (Type::Spaces, " "), + (Type::Identifier, "あいうえお"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "#c."), + (Type::Spaces, " "), + (Type::Identifier, "#d."), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "@@."), + (Type::Spaces, " "), + (Type::Identifier, "@@..."), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "#.#"), + (Type::EndCommand, "."), 
+ (Type::Newline, "\n"), + (Type::Identifier, "#abcd"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "LMNOP"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "QRSTUV"), + (Type::EndCommand, "."), + (Type::Comment, "/* end of line comment */"), + (Type::Newline, "\n"), + (Type::Identifier, "qrstuv"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/* end of line comment */"), + (Type::Newline, "\n"), + (Type::Identifier, "QrStUv"), + (Type::EndCommand, "."), + (Type::Comment, "/* end of line comment */"), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "wxyz"), + (Type::EndCommand, "."), + (Type::Comment, "/* unterminated end of line comment"), + (Type::Newline, "\n"), + (Type::Identifier, "WXYZ"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/* unterminated end of line comment"), + (Type::Newline, "\n"), + (Type::Identifier, "WxYz"), + (Type::EndCommand, "."), + (Type::Comment, "/* unterminated end of line comment "), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_reserved_words() { + check_segmentation( + r#"and or not eq ge gt le lt ne all by to with +AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH +andx orx notx eqx gex gtx lex ltx nex allx byx tox withx +and. with. 
+"#, + Mode::Auto, + &[ + (Type::ReservedWord, "and"), + (Type::Spaces, " "), + (Type::ReservedWord, "or"), + (Type::Spaces, " "), + (Type::ReservedWord, "not"), + (Type::Spaces, " "), + (Type::ReservedWord, "eq"), + (Type::Spaces, " "), + (Type::ReservedWord, "ge"), + (Type::Spaces, " "), + (Type::ReservedWord, "gt"), + (Type::Spaces, " "), + (Type::ReservedWord, "le"), + (Type::Spaces, " "), + (Type::ReservedWord, "lt"), + (Type::Spaces, " "), + (Type::ReservedWord, "ne"), + (Type::Spaces, " "), + (Type::ReservedWord, "all"), + (Type::Spaces, " "), + (Type::ReservedWord, "by"), + (Type::Spaces, " "), + (Type::ReservedWord, "to"), + (Type::Spaces, " "), + (Type::ReservedWord, "with"), + (Type::Newline, "\n"), + (Type::ReservedWord, "AND"), + (Type::Spaces, " "), + (Type::ReservedWord, "OR"), + (Type::Spaces, " "), + (Type::ReservedWord, "NOT"), + (Type::Spaces, " "), + (Type::ReservedWord, "EQ"), + (Type::Spaces, " "), + (Type::ReservedWord, "GE"), + (Type::Spaces, " "), + (Type::ReservedWord, "GT"), + (Type::Spaces, " "), + (Type::ReservedWord, "LE"), + (Type::Spaces, " "), + (Type::ReservedWord, "LT"), + (Type::Spaces, " "), + (Type::ReservedWord, "NE"), + (Type::Spaces, " "), + (Type::ReservedWord, "ALL"), + (Type::Spaces, " "), + (Type::ReservedWord, "BY"), + (Type::Spaces, " "), + (Type::ReservedWord, "TO"), + (Type::Spaces, " "), + (Type::ReservedWord, "WITH"), + (Type::Newline, "\n"), + (Type::Identifier, "andx"), + (Type::Spaces, " "), + (Type::Identifier, "orx"), + (Type::Spaces, " "), + (Type::Identifier, "notx"), + (Type::Spaces, " "), + (Type::Identifier, "eqx"), + (Type::Spaces, " "), + (Type::Identifier, "gex"), + (Type::Spaces, " "), + (Type::Identifier, "gtx"), + (Type::Spaces, " "), + (Type::Identifier, "lex"), + (Type::Spaces, " "), + (Type::Identifier, "ltx"), + (Type::Spaces, " "), + (Type::Identifier, "nex"), + (Type::Spaces, " "), + (Type::Identifier, "allx"), + (Type::Spaces, " "), + (Type::Identifier, "byx"), + (Type::Spaces, " "), + 
(Type::Identifier, "tox"), + (Type::Spaces, " "), + (Type::Identifier, "withx"), + (Type::Newline, "\n"), + (Type::Identifier, "and."), + (Type::Spaces, " "), + (Type::ReservedWord, "with"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_punctuation() { + check_segmentation( + r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] ** +~&|=>=><=<~=<>(),-+*/[]**!* +% : ; ? _ ` { } ~ !* +"#, + Mode::Auto, + &[ + (Type::Punct, "~"), + (Type::Spaces, " "), + (Type::Punct, "&"), + (Type::Spaces, " "), + (Type::Punct, "|"), + (Type::Spaces, " "), + (Type::Punct, "="), + (Type::Spaces, " "), + (Type::Punct, ">="), + (Type::Spaces, " "), + (Type::Punct, ">"), + (Type::Spaces, " "), + (Type::Punct, "<="), + (Type::Spaces, " "), + (Type::Punct, "<"), + (Type::Spaces, " "), + (Type::Punct, "~="), + (Type::Spaces, " "), + (Type::Punct, "<>"), + (Type::Spaces, " "), + (Type::Punct, "("), + (Type::Spaces, " "), + (Type::Punct, ")"), + (Type::Spaces, " "), + (Type::Punct, ","), + (Type::Spaces, " "), + (Type::Punct, "-"), + (Type::Spaces, " "), + (Type::Punct, "+"), + (Type::Spaces, " "), + (Type::Punct, "*"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Spaces, " "), + (Type::Punct, "["), + (Type::Spaces, " "), + (Type::Punct, "]"), + (Type::Spaces, " "), + (Type::Punct, "**"), + (Type::Newline, "\n"), + (Type::Punct, "~"), + (Type::Punct, "&"), + (Type::Punct, "|"), + (Type::Punct, "="), + (Type::Punct, ">="), + (Type::Punct, ">"), + (Type::Punct, "<="), + (Type::Punct, "<"), + (Type::Punct, "~="), + (Type::Punct, "<>"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Punct, "-"), + (Type::Punct, "+"), + (Type::Punct, "*"), + (Type::Punct, "/"), + (Type::Punct, "["), + (Type::Punct, "]"), + (Type::Punct, "**"), + (Type::MacroId, "!*"), + (Type::Newline, "\n"), + (Type::Punct, "%"), + (Type::Spaces, " 
"), + (Type::Punct, ":"), + (Type::Spaces, " "), + (Type::Punct, ";"), + (Type::Spaces, " "), + (Type::Punct, "?"), + (Type::Spaces, " "), + (Type::Punct, "_"), + (Type::Spaces, " "), + (Type::Punct, "`"), + (Type::Spaces, " "), + (Type::Punct, "{"), + (Type::Spaces, " "), + (Type::Punct, "}"), + (Type::Spaces, " "), + (Type::Punct, "~"), + (Type::Spaces, " "), + (Type::MacroId, "!*"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later], + ); +} + +#[test] +fn test_positive_numbers() { + check_segmentation( + r#"0 1 01 001. 1. +123. /* comment 1 */ /* comment 2 */ +.1 0.1 00.1 00.10 +5e1 6E-1 7e+1 6E+01 6e-03 +.3E1 .4e-1 .5E+1 .6e+01 .7E-03 +1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 +. 1e e1 1e+ 1e- 1. +"#, + Mode::Auto, + &[ + (Type::Number, "0"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::Spaces, " "), + (Type::Number, "01"), + (Type::Spaces, " "), + (Type::Number, "001."), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Number, "123"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/* comment 1 */"), + (Type::Spaces, " "), + (Type::Comment, "/* comment 2 */"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Number, "1"), + (Type::Spaces, " "), + (Type::Number, "0.1"), + (Type::Spaces, " "), + (Type::Number, "00.1"), + (Type::Spaces, " "), + (Type::Number, "00.10"), + (Type::Newline, "\n"), + (Type::Number, "5e1"), + (Type::Spaces, " "), + (Type::Number, "6E-1"), + (Type::Spaces, " "), + (Type::Number, "7e+1"), + (Type::Spaces, " "), + (Type::Number, "6E+01"), + (Type::Spaces, " "), + (Type::Number, "6e-03"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Number, "3E1"), + (Type::Spaces, " "), + (Type::Number, ".4e-1"), + (Type::Spaces, " "), + (Type::Number, ".5E+1"), + (Type::Spaces, " "), + (Type::Number, ".6e+01"), + (Type::Spaces, " "), + (Type::Number, ".7E-03"), + 
(Type::Newline, "\n"), + (Type::Number, "1.23e1"), + (Type::Spaces, " "), + (Type::Number, "45.6E-1"), + (Type::Spaces, " "), + (Type::Number, "78.9e+1"), + (Type::Spaces, " "), + (Type::Number, "99.9E+01"), + (Type::Spaces, " "), + (Type::Number, "11.2e-03"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Spaces, " "), + (Type::ExpectedExponent, "1e"), + (Type::Spaces, " "), + (Type::Identifier, "e1"), + (Type::Spaces, " "), + (Type::ExpectedExponent, "1e+"), + (Type::Spaces, " "), + (Type::ExpectedExponent, "1e-"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_negative_numbers() { + check_segmentation( + r#" -0 -1 -01 -001. -1. + -123. /* comment 1 */ /* comment 2 */ + -.1 -0.1 -00.1 -00.10 + -5e1 -6E-1 -7e+1 -6E+01 -6e-03 + -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03 + -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03 + -/**/1 + -. -1e -e1 -1e+ -1e- -1. 
+"#, + Mode::Auto, + &[ + (Type::Spaces, " "), + (Type::Number, "-0"), + (Type::Spaces, " "), + (Type::Number, "-1"), + (Type::Spaces, " "), + (Type::Number, "-01"), + (Type::Spaces, " "), + (Type::Number, "-001."), + (Type::Spaces, " "), + (Type::Number, "-1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-123"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/* comment 1 */"), + (Type::Spaces, " "), + (Type::Comment, "/* comment 2 */"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-.1"), + (Type::Spaces, " "), + (Type::Number, "-0.1"), + (Type::Spaces, " "), + (Type::Number, "-00.1"), + (Type::Spaces, " "), + (Type::Number, "-00.10"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-5e1"), + (Type::Spaces, " "), + (Type::Number, "-6E-1"), + (Type::Spaces, " "), + (Type::Number, "-7e+1"), + (Type::Spaces, " "), + (Type::Number, "-6E+01"), + (Type::Spaces, " "), + (Type::Number, "-6e-03"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-.3E1"), + (Type::Spaces, " "), + (Type::Number, "-.4e-1"), + (Type::Spaces, " "), + (Type::Number, "-.5E+1"), + (Type::Spaces, " "), + (Type::Number, "-.6e+01"), + (Type::Spaces, " "), + (Type::Number, "-.7E-03"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-1.23e1"), + (Type::Spaces, " "), + (Type::Number, "-45.6E-1"), + (Type::Spaces, " "), + (Type::Number, "-78.9e+1"), + (Type::Spaces, " "), + (Type::Number, "-99.9E+01"), + (Type::Spaces, " "), + (Type::Number, "-11.2e-03"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Punct, "-"), + (Type::Comment, "/**/"), + (Type::Number, "1"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Punct, "-"), + (Type::Punct, "."), + (Type::Spaces, " "), + (Type::ExpectedExponent, "-1e"), + (Type::Spaces, " "), + (Type::Punct, "-"), + (Type::Identifier, "e1"), + (Type::Spaces, " "), + (Type::ExpectedExponent, "-1e+"), + 
(Type::Spaces, " "), + (Type::ExpectedExponent, "-1e-"), + (Type::Spaces, " "), + (Type::Number, "-1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_strings() { + check_segmentation( + r#"'x' "y" 'abc' +'Don''t' "Can't" 'Won''t' +"""quoted""" '"quoted"' +'' "" +'missing end quote +"missing double quote +x"4142" X'5152' +u'fffd' U"041" ++ new command ++ /* comment */ 'string continuation' ++ /* also a punctuator on blank line +- 'new command' +"#, + Mode::Auto, + &[ + (Type::QuotedString, "'x'"), + (Type::Spaces, " "), + (Type::QuotedString, "\"y\""), + (Type::Spaces, " "), + (Type::QuotedString, "'abc'"), + (Type::Newline, "\n"), + (Type::QuotedString, "'Don''t'"), + (Type::Spaces, " "), + (Type::QuotedString, "\"Can't\""), + (Type::Spaces, " "), + (Type::QuotedString, "'Won''t'"), + (Type::Newline, "\n"), + (Type::QuotedString, "\"\"\"quoted\"\"\""), + (Type::Spaces, " "), + (Type::QuotedString, "'\"quoted\"'"), + (Type::Newline, "\n"), + (Type::QuotedString, "''"), + (Type::Spaces, " "), + (Type::QuotedString, "\"\""), + (Type::Newline, "\n"), + (Type::ExpectedQuote, "'missing end quote"), + (Type::Newline, "\n"), + (Type::ExpectedQuote, "\"missing double quote"), + (Type::Newline, "\n"), + (Type::HexString, "x\"4142\""), + (Type::Spaces, " "), + (Type::HexString, "X'5152'"), + (Type::Newline, "\n"), + (Type::UnicodeString, "u'fffd'"), + (Type::Spaces, " "), + (Type::UnicodeString, "U\"041\""), + (Type::Newline, "\n"), + (Type::StartCommand, "+"), + (Type::Spaces, " "), + (Type::Identifier, "new"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::Punct, "+"), + (Type::Spaces, " "), + (Type::Comment, "/* comment */"), + (Type::Spaces, " "), + (Type::QuotedString, "'string 
continuation'"), + (Type::Newline, "\n"), + (Type::Punct, "+"), + (Type::Spaces, " "), + (Type::Comment, "/* also a punctuator on blank line"), + (Type::Newline, "\n"), + (Type::StartCommand, "-"), + (Type::Spaces, " "), + (Type::QuotedString, "'new command'"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + ], + ); +} + +#[test] +fn test_shbang() { + check_segmentation( + r#"#! /usr/bin/pspp +title my title. +#! /usr/bin/pspp +"#, + Mode::Interactive, + &[ + (Type::Shbang, "#! /usr/bin/pspp"), + (Type::Newline, "\n"), + (Type::Identifier, "title"), + (Type::Spaces, " "), + (Type::Identifier, "my"), + (Type::Spaces, " "), + (Type::Identifier, "title"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "#"), + (Type::MacroId, "!"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "usr"), + (Type::Punct, "/"), + (Type::Identifier, "bin"), + (Type::Punct, "/"), + (Type::Identifier, "pspp"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First, PromptStyle::First, PromptStyle::Later], + ); +} + +#[test] +fn test_comment_command() { + check_segmentation( + r#"* Comment commands "don't +have to contain valid tokens. + +** Check ambiguity with ** token. +****************. + +comment keyword works too. +COMM also. +com is ambiguous with COMPUTE. + + * Comment need not start at left margin. + +* Comment ends with blank line + +next command. 
+ +"#, + Mode::Interactive, + &[ + (Type::CommentCommand, "* Comment commands \"don't"), + (Type::Newline, "\n"), + (Type::CommentCommand, "have to contain valid tokens"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::CommentCommand, "** Check ambiguity with ** token"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::CommentCommand, "****************"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::CommentCommand, "comment keyword works too"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::CommentCommand, "COMM also"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "com"), + (Type::Spaces, " "), + (Type::Identifier, "is"), + (Type::Spaces, " "), + (Type::Identifier, "ambiguous"), + (Type::Spaces, " "), + (Type::ReservedWord, "with"), + (Type::Spaces, " "), + (Type::Identifier, "COMPUTE"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Spaces, " "), + ( + Type::CommentCommand, + "* Comment need not start at left margin", + ), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::CommentCommand, "* Comment ends with blank line"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "next"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Comment, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Comment, + PromptStyle::First, + 
PromptStyle::First, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_document_command() { + check_segmentation( + r#"DOCUMENT one line. +DOC more + than + one + line. +docu +first.paragraph +isn't parsed as tokens + +second paragraph. +"#, + Mode::Interactive, + &[ + (Type::StartDocument, ""), + (Type::Document, "DOCUMENT one line."), + (Type::EndCommand, ""), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::StartDocument, ""), + (Type::Document, "DOC more"), + (Type::Newline, "\n"), + (Type::Document, " than"), + (Type::Newline, "\n"), + (Type::Document, " one"), + (Type::Newline, "\n"), + (Type::Document, " line."), + (Type::EndCommand, ""), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::StartDocument, ""), + (Type::Document, "docu"), + (Type::Newline, "\n"), + (Type::Document, "first.paragraph"), + (Type::Newline, "\n"), + (Type::Document, "isn't parsed as tokens"), + (Type::Newline, "\n"), + (Type::Document, ""), + (Type::Newline, "\n"), + (Type::Document, "second paragraph."), + (Type::EndCommand, ""), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::First, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_file_label_command() { + check_segmentation( + r#"FIL label isn't quoted. +FILE + lab 'is quoted'. 
+FILE /* +/**/ lab not quoted here either + +"#, + Mode::Interactive, + &[ + (Type::Identifier, "FIL"), + (Type::Spaces, " "), + (Type::Identifier, "label"), + (Type::Spaces, " "), + (Type::UnquotedString, "isn't quoted"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "FILE"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "lab"), + (Type::Spaces, " "), + (Type::QuotedString, "'is quoted'"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "FILE"), + (Type::Spaces, " "), + (Type::Comment, "/*"), + (Type::Newline, "\n"), + (Type::Comment, "/**/"), + (Type::Spaces, " "), + (Type::Identifier, "lab"), + (Type::Spaces, " "), + (Type::UnquotedString, "not quoted here either"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_begin_data() { + check_segmentation( + r#"begin data. +end data. + +begin data. /* +123 +xxx +end data. + +BEG /**/ DAT /* +5 6 7 /* x + +end data +end data +. + +begin + data. +data +end data. + +begin data "xxx". +begin data 123. 
+not data +"#, + Mode::Interactive, + &[ + (Type::Identifier, "begin"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "begin"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/*"), + (Type::Newline, "\n"), + (Type::InlineData, "123"), + (Type::Newline, "\n"), + (Type::InlineData, "xxx"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "BEG"), + (Type::Spaces, " "), + (Type::Comment, "/**/"), + (Type::Spaces, " "), + (Type::Identifier, "DAT"), + (Type::Spaces, " "), + (Type::Comment, "/*"), + (Type::Newline, "\n"), + (Type::InlineData, "5 6 7 /* x"), + (Type::Newline, "\n"), + (Type::InlineData, ""), + (Type::Newline, "\n"), + (Type::InlineData, "end data"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "begin"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::InlineData, "data"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "begin"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::QuotedString, "\"xxx\""), + 
(Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "begin"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Number, "123"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::ReservedWord, "not"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + ], + ); +} + +#[test] +fn test_do_repeat() { + check_segmentation( + r#"do repeat x=a b c + y=d e f. + do repeat a=1 thru 5. +another command. +second command ++ third command. +end /* x */ /* y */ repeat print. +end + repeat. +do + repeat #a=1. + inner command. +end repeat. 
+"#, + Mode::Interactive, + &[ + (Type::Identifier, "do"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "x"), + (Type::Punct, "="), + (Type::Identifier, "a"), + (Type::Spaces, " "), + (Type::Identifier, "b"), + (Type::Spaces, " "), + (Type::Identifier, "c"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "y"), + (Type::Punct, "="), + (Type::Identifier, "d"), + (Type::Spaces, " "), + (Type::Identifier, "e"), + (Type::Spaces, " "), + (Type::Identifier, "f"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, " do repeat a=1 thru 5."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "another command."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "second command"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "+ third command."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print."), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "do"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "#a"), + (Type::Punct, "="), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, " inner command."), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_do_repeat_overflow() { + const N: 
usize = 257; + let do_repeat: Vec = (0..N) + .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5)) + .collect(); + let end_repeat: Vec = (0..N) + .rev() + .map(|i| format!("end repeat. /* {i}\n")) + .collect(); + + let s: String = do_repeat + .iter() + .chain(end_repeat.iter()) + .map(|s| s.as_str()) + .collect(); + let mut expect_output = vec![ + (Type::Identifier, "do"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "v0"), + (Type::Punct, "="), + (Type::Number, "0"), + (Type::Spaces, " "), + (Type::Identifier, "thru"), + (Type::Spaces, " "), + (Type::Number, "5"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + ]; + for i in 1..N { + expect_output.push((Type::DoRepeatCommand, &do_repeat[i].trim_end())); + if i >= 255 { + expect_output.push((Type::DoRepeatOverflow, "")); + } + expect_output.push((Type::Newline, "\n")); + } + for i in 0..254 { + expect_output.push((Type::DoRepeatCommand, &end_repeat[i].trim_end())); + expect_output.push((Type::Newline, "\n")); + } + let comments: Vec = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect(); + for comment in &comments { + expect_output.extend([ + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, comment), + (Type::Newline, "\n"), + ]); + } + expect_output.push((Type::End, "")); + + let expect_prompts: Vec<_> = (0..N * 2 - 3) + .map(|_| PromptStyle::DoRepeat) + .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First]) + .collect(); + check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts); +} + +#[test] +fn test_do_repeat_batch() { + check_segmentation( + r#"do repeat x=a b c + y=d e f +do repeat a=1 thru 5 +another command +second command ++ third command +end /* x */ /* y */ repeat print +end + repeat +do + repeat #a=1 + + inner command +end repeat +"#, + Mode::Batch, + &[ + (Type::Identifier, "do"), + (Type::Spaces, " "), 
+ (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "x"), + (Type::Punct, "="), + (Type::Identifier, "a"), + (Type::Spaces, " "), + (Type::Identifier, "b"), + (Type::Spaces, " "), + (Type::Identifier, "c"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "y"), + (Type::Punct, "="), + (Type::Identifier, "d"), + (Type::Spaces, " "), + (Type::Identifier, "e"), + (Type::Spaces, " "), + (Type::Identifier, "f"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::DoRepeatCommand, "do repeat a=1 thru 5"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "another command"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "second command"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "+ third command"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Identifier, "do"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "#a"), + (Type::Punct, "="), + (Type::Number, "1"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, " inner command"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + ], + ); +} + +mod define { + use crate::{ + lex::segment::{Mode, Type}, + prompt::PromptStyle, + }; + + use 
super::check_segmentation; + + #[test] + fn test_simple() { + check_segmentation( + r#"define !macro1() +var1 var2 var3 "!enddefine" +!enddefine. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, "var1 var2 var3 \"!enddefine\""), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_no_newline_after_parentheses() { + check_segmentation( + r#"define !macro1() var1 var2 var3 /* !enddefine +!enddefine. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::MacroBody, " var1 var2 var3 /* !enddefine"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_no_newline_before_enddefine() { + check_segmentation( + r#"define !macro1() +var1 var2 var3!enddefine. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, "var1 var2 var3"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_all_on_one_line() { + check_segmentation( + r#"define !macro1()var1 var2 var3!enddefine. 
+"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::MacroBody, "var1 var2 var3"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First], + ); + } + + #[test] + fn test_empty() { + check_segmentation( + r#"define !macro1() +!enddefine. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_blank_lines() { + check_segmentation( + r#"define !macro1() + + +!enddefine. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, ""), + (Type::Newline, "\n"), + (Type::MacroBody, ""), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } + + #[test] + fn test_arguments() { + check_segmentation( + r#"define !macro1(a(), b(), c()) +!enddefine. 
+"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Identifier, "a"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Spaces, " "), + (Type::Identifier, "b"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Spaces, " "), + (Type::Identifier, "c"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_multiline_arguments() { + check_segmentation( + r#"define !macro1( + a(), b( + ), + c() +) +!enddefine. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "a"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Spaces, " "), + (Type::Identifier, "b"), + (Type::Punct, "("), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "c"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } + + #[test] + fn test_arguments_start_on_second_line() { + check_segmentation( + r#"define !macro1 +(x,y,z +) +content 1 +content 2 +!enddefine. 
+"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Newline, "\n"), + (Type::Punct, "("), + (Type::Identifier, "x"), + (Type::Punct, ","), + (Type::Identifier, "y"), + (Type::Punct, ","), + (Type::Identifier, "z"), + (Type::Newline, "\n"), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, "content 1"), + (Type::Newline, "\n"), + (Type::MacroBody, "content 2"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } + + #[test] + fn test_early_end_of_command_1() { + check_segmentation( + r#"define !macro1. +data list /x 1. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First, PromptStyle::First], + ); + } + + #[test] + fn test_early_end_of_command_2() { + check_segmentation( + r#"define !macro1 +x. +data list /x 1. 
+"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Newline, "\n"), + (Type::Identifier, "x"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Later, PromptStyle::First, PromptStyle::First], + ); + } + + #[test] + fn test_early_end_of_command_3() { + check_segmentation( + r#"define !macro1(. +x. +data list /x 1. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "x"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First, PromptStyle::First, PromptStyle::First], + ); + } + + #[test] + fn test_early_end_of_command_4() { + // Notice the command terminator at the end of the `DEFINE` command, + // which should not be there and ends it early. + check_segmentation( + r#"define !macro1. +data list /x 1. 
+"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First, PromptStyle::First], + ); + } + + #[test] + fn test_missing_enddefine() { + check_segmentation( + r#"define !macro1() +content line 1 +content line 2 +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, "content line 1"), + (Type::Newline, "\n"), + (Type::MacroBody, "content line 2"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + ], + ); + } + + #[test] + fn test_missing_enddefine_2() { + check_segmentation( + r#"define !macro1() +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define], + ); + } +} + +#[test] +fn test_batch_mode() { + check_segmentation( + r#"first command + another line of first command ++ second command +third command + +fourth command. + fifth command. 
+"#, + Mode::Batch, + &[ + (Type::Identifier, "first"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "another"), + (Type::Spaces, " "), + (Type::Identifier, "line"), + (Type::Spaces, " "), + (Type::Identifier, "of"), + (Type::Spaces, " "), + (Type::Identifier, "first"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::StartCommand, "+"), + (Type::Spaces, " "), + (Type::Identifier, "second"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Identifier, "third"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "fourth"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "fifth"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_auto_mode() { + check_segmentation( + r#"command + another line of command +2sls ++ another command +another line of second command +data list /x 1 +aggregate. +print eject. +twostep cluster + + +fourth command. + fifth command. 
+"#, + Mode::Auto, + &[ + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "another"), + (Type::Spaces, " "), + (Type::Identifier, "line"), + (Type::Spaces, " "), + (Type::Identifier, "of"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Number, "2"), + (Type::Identifier, "sls"), + (Type::Newline, "\n"), + (Type::StartCommand, "+"), + (Type::Spaces, " "), + (Type::Identifier, "another"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::Identifier, "another"), + (Type::Spaces, " "), + (Type::Identifier, "line"), + (Type::Spaces, " "), + (Type::Identifier, "of"), + (Type::Spaces, " "), + (Type::Identifier, "second"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Identifier, "aggregate"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "print"), + (Type::Spaces, " "), + (Type::Identifier, "eject"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "twostep"), + (Type::Spaces, " "), + (Type::Identifier, "cluster"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "fourth"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "fifth"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + 
PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], + ); +}