+++ /dev/null
-//! Syntax segmentation.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". This module implements the segmentation phase.
-//! [`super::scan`] contains declarations for the scanning phase.
-//!
-//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
-//! (a segment type) for each byte or contiguous sequence of bytes in the input.
-//! It also, in a few corner cases, outputs zero-width segments that label the
-//! boundary between a pair of bytes in the input.
-//!
-//! Some segment types correspond directly to tokens; for example, an
-//! "identifier" segment ([`Type::Identifier`]) becomes an identifier token
-//! (T_ID) later in lexical analysis. Other segments contribute to tokens but
-//! do not correspond directly; for example, multiple quoted string segments
-//! ([`Type::QuotedString`]) separated by spaces ([`Type::Spaces`]) and "+"
-//! punctuators ([`Type::Punct`]) may be combined to form a single string token
-//! (T_STRING). Still other segments are ignored (e.g. [`Type::Spaces`]) or
-//! trigger special behavior such as error messages later in tokenization
-//! (e.g. [`Type::ExpectedQuote`]).
-
-use crate::{
- identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar},
- prompt::PromptStyle,
-};
-use bitflags::bitflags;
-
-use super::command_name::{command_match, COMMAND_NAMES};
-
-/// Segmentation mode, which controls how the segmenter decides that a line begins a new command.
-///
-/// PSPP syntax is written in one of two modes which are broadly defined as
-/// follows:
-///
-/// - In interactive mode, commands end with a period at the end of the line
-/// or with a blank line.
-///
-/// - In batch mode, the second and subsequent lines of a command are indented
-/// from the left margin.
-///
-/// The segmenter can also try to automatically detect the mode in use, using a
-/// heuristic that is usually correct.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
-pub enum Mode {
- /// Try to interpret input correctly regardless of whether it is written
- /// for interactive or batch mode. This is the default. A line whose first character is not whitespace is checked against the known command names.
- #[default]
- Auto,
-
- /// Interactive syntax mode. A line is never treated as starting a new command just because of how it begins.
- Interactive,
-
- /// Batch syntax mode. A line whose first character is not whitespace is treated as the start of a command.
- Batch,
-}
-
-/// The type of a segment, as reported by [`Segmenter::push`].
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Type {
- Number, // A numeric literal, optionally with a leading `-`, fraction, and exponent.
- QuotedString, // A string delimited by `'` or `"`.
- HexString, // A quoted string with an `x` or `X` prefix.
- UnicodeString, // A quoted string with a `u` or `U` prefix.
- UnquotedString, // The unquoted label text that follows `FILE LABEL`.
- ReservedWord, // An identifier for which `is_reserved_word` returns true.
- Identifier, // Any other identifier.
- Punct, // A punctuator such as `(`, `=`, `<=`, or `**`.
- Shbang, // A `#!` line at the very start of a whole file.
- Spaces, // A run of whitespace that does not include a line end.
- Comment, // A `/* ... */` comment (it also ends at the end of the line).
- Newline, // A `\n` or `\r\n` line end.
- CommentCommand, // A line of text inside a `COMMENT` or `*` command.
- DoRepeatCommand, // One full line of syntax in the body of `DO REPEAT`.
- DoRepeatOverflow, // Zero-width: the `DO REPEAT` nesting counter overflowed.
- InlineData, // NOTE(review): producer is outside this view; presumably a line between BEGIN DATA and END DATA.
- MacroId, // An identifier starting with `!`, or the `!*` token.
- MacroName, // The identifier naming the macro right after `DEFINE`.
- MacroBody, // NOTE(review): producer is outside this view; presumably a line of a `DEFINE` body.
- StartDocument, // Zero-width: emitted when a `DOCUMENT` command is recognized.
- Document, // One line of `DOCUMENT` text.
- StartCommand, // `+`, `-`, or `.` in the first column, or zero-width where a command start is detected.
- SeparateCommands, // Zero-width: a blank line that separates commands.
- EndCommand, // A `.` at the end of a line (or zero-width after `DOCUMENT`).
- End, // Zero-width: end of input.
- ExpectedQuote, // A string whose closing quote is missing before the end of the line.
- ExpectedExponent, // A number with an `e`/`E` that is not followed by digits.
- UnexpectedChar, // A character that cannot start any other segment.
-}
-
-bitflags! {
- #[derive(Copy, Clone, Debug)]
- pub struct Substate: u8 { // Flags that refine the segmenter's `State`.
- const START_OF_LINE = 1; // The next byte of input is the first on its line.
- const START_OF_COMMAND = 2; // No token of the current command has been seen yet.
- }
-}
-
-/// Incremental segmenter state.
-///
-/// The value is small and `Copy`, so a caller can snapshot it, or run a
-/// throwaway copy ahead of the real one for lookahead.
-#[derive(Copy, Clone, Debug)]
-pub struct Segmenter {
-    /// Current state of the state machine plus its start-of-line and
-    /// start-of-command flags.
-    state: (State, Substate),
-
-    /// Nesting depth, used while inside a `DO REPEAT` body and while inside
-    /// the parenthesized part of `DEFINE`.
-    nest: u8,
-
-    /// Syntax mode chosen at construction.
-    mode: Mode,
-}
-
-#[derive(Copy, Clone, Debug)]
-pub struct Incomplete; // Error: the segment type cannot be determined until more input is supplied.
-
-impl Segmenter {
-    /// Returns a segmenter with the given syntax `mode`.
-    ///
-    /// If `is_snippet` is false, then the segmenter will parse as if it's being
-    /// given a whole file. This means, for example, that it will interpret `-`
-    /// or `+` at the beginning of the syntax as a separator between commands
-    /// (since `-` or `+` at the beginning of a line has this meaning).
-    ///
-    /// If `is_snippet` is true, then the segmenter will parse as if it's being
-    /// given an isolated piece of syntax. This means, for example, that it
-    /// will interpret `-` or `+` at the beginning of the syntax as an operator
-    /// token or (if followed by a digit) as part of a number.
-    pub fn new(mode: Mode, is_snippet: bool) -> Self {
-        let initial = if is_snippet {
-            State::General
-        } else {
-            State::Shbang
-        };
-        Self {
-            state: (initial, Substate::empty()),
-            nest: 0,
-            mode,
-        }
-    }
-
-    /// Returns the syntax mode this segmenter was created with.
-    pub fn mode(&self) -> Mode {
-        self.mode
-    }
-
-    /// Returns true if the next input byte is the first one on its line.
-    fn start_of_line(&self) -> bool {
-        self.state.1.contains(Substate::START_OF_LINE)
-    }
-
-    /// Returns true if no token of the current command has been seen yet.
-    fn start_of_command(&self) -> bool {
-        self.state.1.contains(Substate::START_OF_COMMAND)
-    }
-
-    /// Returns the style of command prompt to display to an interactive user
-    /// for input in the current state. The return value is most accurate in
-    /// mode `Mode::Interactive` and at the beginning of a line (that is, if
-    /// [`Segmenter::push`] consumed as much as possible of the input up to a
-    /// new-line).
-    pub fn prompt(&self) -> PromptStyle {
-        match self.state.0 {
-            State::Shbang
-            | State::Document3
-            | State::FileLabel2
-            | State::FileLabel3
-            | State::BeginData1 => PromptStyle::First,
-
-            // In these states the prompt depends on whether a command is
-            // already in progress.
-            State::General
-            | State::DoRepeat1
-            | State::DoRepeat2
-            | State::Define1
-            | State::Define2
-            | State::Define3 => {
-                if self.start_of_command() {
-                    PromptStyle::First
-                } else {
-                    PromptStyle::Later
-                }
-            }
-
-            State::FileLabel1 | State::BeginData2 => PromptStyle::Later,
-            State::Comment1 | State::Comment2 => PromptStyle::Comment,
-            State::Document1 | State::Document2 => PromptStyle::Document,
-            State::DoRepeat3 | State::DoRepeat4 => PromptStyle::DoRepeat,
-            State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
-            State::BeginData3 | State::BeginData4 => PromptStyle::Data,
-        }
-    }
-
-    /// Attempts to label a prefix of the remaining input with a segment type.
-    /// The caller supplies a prefix of the remaining input as `input`. If
-    /// `eof` is true, then `input` is the entire remainder of the input; if
-    /// `eof` is false, then further input is potentially available.
-    ///
-    /// The input may contain '\n' or '\r\n' line ends in any combination.
-    ///
-    /// If successful, returns `Ok((rest, type))`, where `rest` is the suffix
-    /// of `input` that follows the segment and `type` is the segment's type.
-    /// The segment itself is `&input[..input.len() - rest.len()]`. The next
-    /// call should pass input that starts at `rest`, because the segment's
-    /// bytes have (figuratively) been consumed by the segmenter.
-    ///
-    /// Segments can have zero length, including segment types `Type::End`,
-    /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and
-    /// `Type::Spaces`.
-    ///
-    /// Failure occurs only if the segment type of the bytes in `input` cannot
-    /// yet be determined. In this case, this function returns
-    /// `Err(Incomplete)`. If more input is available, the caller should obtain
-    /// some more, then call again with a longer `input`. If this is not
-    /// enough, the process might need to repeat again and again. If input is
-    /// exhausted, then the caller may call again setting `eof` to true. This
-    /// function will never return `Err(Incomplete)` when `eof` is true.
-    ///
-    /// The caller must not, in a sequence of calls, supply contradictory input.
-    /// That is, bytes provided as part of `input` in one call, but not
-    /// consumed, must not be provided with *different* values on subsequent
-    /// calls. This is because the function must often make decisions based on
-    /// looking ahead beyond the bytes that it consumes.
-    pub fn push<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
-        if input.is_empty() {
-            return if eof {
-                Ok((input, Type::End))
-            } else {
-                Err(Incomplete)
-            };
-        }
-
-        match self.state.0 {
-            State::Shbang => self.parse_shbang(input, eof),
-            State::General => {
-                if self.start_of_line() {
-                    self.parse_start_of_line(input, eof)
-                } else {
-                    self.parse_mid_line(input, eof)
-                }
-            }
-            State::Comment1 => self.parse_comment_1(input, eof),
-            State::Comment2 => self.parse_comment_2(input, eof),
-            State::Document1 => self.parse_document_1(input, eof),
-            State::Document2 => self.parse_document_2(input, eof),
-            State::Document3 => self.parse_document_3(input, eof),
-            State::FileLabel1 => self.parse_file_label_1(input, eof),
-            State::FileLabel2 => self.parse_file_label_2(input, eof),
-            State::FileLabel3 => self.parse_file_label_3(input, eof),
-            State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
-            State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
-            State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
-            State::DoRepeat4 => self.parse_do_repeat_4(input),
-            State::Define1 | State::Define2 => self.parse_define_1_2(input, eof),
-            State::Define3 => self.parse_define_3(input, eof),
-            State::Define4 | State::Define5 => self.parse_define_4_5(input, eof),
-            State::Define6 => self.parse_define_6(input, eof),
-            State::BeginData1 => self.parse_begin_data_1(input, eof),
-            State::BeginData2 => self.parse_begin_data_2(input, eof),
-            State::BeginData3 => self.parse_begin_data_3(input, eof),
-            State::BeginData4 => self.parse_begin_data_4(input, eof),
-        }
-    }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-enum State { // Major state of the segmenter; `Segmenter::push` dispatches on it.
- Shbang, // Initial state for a whole file: look for a `#!` first line.
- General, // Ordinary syntax.
- Comment1, // In a `COMMENT` or `*` command, scanning a line of its text.
- Comment2, // In a comment command, at the line end between two lines.
- Document1, // In `DOCUMENT`, scanning a line of text.
- Document2, // In `DOCUMENT`, at the line end between two lines.
- Document3, // After `DOCUMENT` text: emit a zero-width `Type::EndCommand`.
- FileLabel1, // After `FILE`, until the `LABEL` identifier has been emitted.
- FileLabel2, // After `LABEL`, skipping spaces before an unquoted label.
- FileLabel3, // Scanning the unquoted label text.
- DoRepeat1, // In the head of `DO REPEAT`.
- DoRepeat2, // At the blank line that separates the head from the body.
- DoRepeat3, // In the body of `DO REPEAT`.
- DoRepeat4, // Report nesting overflow, then return to `DoRepeat3`.
- Define1, // After `DEFINE`, before the macro name.
- Define2, // After the macro name, before `(`.
- Define3, // Inside the parenthesized part of `DEFINE`.
- Define4, // First line of the macro body.
- Define5, // A later line of the macro body.
- Define6, // NOTE(review): handled by `parse_define_6`, which is outside this view.
- BeginData1, // NOTE(review): the `BeginData*` handlers are outside this view.
- BeginData2,
- BeginData3,
- BeginData4,
-}
-
-/// Splits the first character off `input`.
-///
-/// Returns the character (or `None` at end of input) together with the rest of
-/// the string. An empty `input` is only accepted when `eof` is true; otherwise
-/// more input is needed and `Err(Incomplete)` is returned.
-fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
-    let mut chars = input.chars();
-    let head = chars.next();
-    if head.is_none() && !eof {
-        return Err(Incomplete);
-    }
-    Ok((head, chars.as_str()))
-}
-
-/// Skips the body of a `/* ... */` comment, where `input` starts just after
-/// the opening `/*`.
-///
-/// Returns the input following the closing `*/`. A comment also stops at the
-/// end of the line or the end of input; in that case the returned slice starts
-/// at the line end (or is empty).
-fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
-    while let (Some(ch), after) = take(input, eof)? {
-        if matches!(ch, '\n' | '\r') && is_end_of_line(input, eof)? {
-            break;
-        }
-        if ch == '*' {
-            if let (Some('/'), past_close) = take(after, eof)? {
-                return Ok(past_close);
-            }
-        }
-        input = after;
-    }
-    Ok(input)
-}
-
-/// Skips the leading characters of `input` for which `f` returns true.
-///
-/// If that consumes all of `input` and more input may follow (`eof` is false),
-/// the extent of the run is not yet known, so `Err(Incomplete)` is returned.
-fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
-where
-    F: Fn(char) -> bool,
-{
-    match input.trim_start_matches(f) {
-        "" if !eof => Err(Incomplete),
-        remainder => Ok(remainder),
-    }
-}
-
-/// If the first character of `input` satisfies `f`, returns the input that
-/// follows it; otherwise returns `None`.
-fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
-where
-    F: Fn(char) -> bool,
-{
-    match take(input, eof)? {
-        (Some(c), rest) if f(c) => Ok(Some(rest)),
-        _ => Ok(None),
-    }
-}
-
-/// Skips leading whitespace in `input`, stopping before a line end (`\n` or
-/// `\r\n`), before the first non-whitespace character, or at end of input.
-fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
-    while let (Some(ch), after) = take(input, eof)? {
-        let line_end = matches!(ch, '\r' | '\n') && is_end_of_line(input, eof)?;
-        if line_end || !ch.is_whitespace() {
-            break;
-        }
-        input = after;
-    }
-    Ok(input)
-}
-
-/// Skips leading ASCII digits in `input`.
-fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
-    let is_digit = |c: char| c.is_ascii_digit();
-    skip_matching(is_digit, input, eof)
-}
-
-/// Skips leading whitespace and `/* ... */` comments in `input`.
-///
-/// Stops before a line end (`\n` or `\r\n`), before the first character that
-/// is neither whitespace nor the start of a comment, or at end of input. A `/`
-/// that is not followed by `*` is ordinary syntax and is left in place.
-fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
-    loop {
-        let (Some(c), rest) = take(input, eof)? else {
-            return Ok(input);
-        };
-        match c {
-            '/' => match take(rest, eof)? {
-                (Some('*'), body) => input = skip_comment(body, eof)?,
-                // Not a comment opener. Return the slice that still starts
-                // with the `/` so that the caller sees it.
-                _ => return Ok(input),
-            },
-            '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
-            c if c.is_whitespace() => input = rest,
-            _ => return Ok(input),
-        }
-    }
-}
-
-/// Returns true if `input` begins with something that starts a string: a
-/// quote character, an `x`/`X`/`u`/`U` prefix immediately followed by a quote
-/// character, or a new-line.
-///
-/// Returns `Err(Incomplete)` if the answer depends on input that has not been
-/// supplied yet.
-fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
-    let (Some(c), rest) = take(input, eof)? else {
-        return Ok(false);
-    };
-    match c {
-        'x' | 'X' | 'u' | 'U' => {
-            // The prefix letter only introduces a string if the character
-            // after it (the head of `rest`, not of `input`) is a quote.
-            let (next, _) = take(rest, eof)?;
-            Ok(matches!(next, Some('\'' | '"')))
-        }
-        '\'' | '"' | '\n' => Ok(true),
-        _ => Ok(false),
-    }
-}
-
-/// Returns true if `input` is empty (at end of input) or begins with a line
-/// end, that is, `\n` or `\r\n`. A `\r` not followed by `\n` is not a line end.
-fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
-    match take(input, eof)? {
-        (None, _) | (Some('\n'), _) => Ok(true),
-        (Some('\r'), rest) => Ok(matches!(take(rest, eof)?, (Some('\n'), _))),
-        _ => Ok(false),
-    }
-}
-
-/// Returns true if only whitespace and comments separate the start of `input`
-/// from the end of the line (or the end of input).
-fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
-    let trimmed = skip_spaces_and_comments(input, eof)?;
-    is_end_of_line(trimmed, eof)
-}
-
-/// Returns the first character of `s`.
-///
-/// # Panics
-///
-/// Panics if `s` is empty.
-fn first(s: &str) -> char {
-    let mut chars = s.chars();
-    chars.next().unwrap()
-}
-/// Returns the slice of `COMMAND_NAMES` whose entries start with the same
-/// letter (compared in ASCII upper case) as `target`, or an empty slice if
-/// `target` is empty.
-///
-/// Uses binary search, so it relies on `COMMAND_NAMES` being sorted by first
-/// character.
-fn get_command_name_candidates(target: &str) -> &[&'static str] {
-    let Some(initial) = target.chars().next() else {
-        return &[];
-    };
-    let initial = initial.to_ascii_uppercase();
-    let start = COMMAND_NAMES.partition_point(|name| first(name) < initial);
-    let end = COMMAND_NAMES.partition_point(|name| first(name) <= initial);
-    &COMMAND_NAMES[start..end]
-}
-
-/// Returns true if `input` begins with the name of a known command.
-///
-/// The candidate name is the longest prefix of `input` made up of identifier
-/// characters other than `.`, `-`, and whitespace other than `\n`. If that
-/// prefix runs to the end of `input` and more input may follow, the name might
-/// continue, so `Err(Incomplete)` is returned.
-fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
-    let is_name_char = |c: char| {
-        (c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-'
-    };
-    let name_len = input
-        .find(|c: char| !is_name_char(c))
-        .unwrap_or(input.len());
-    if !eof && name_len == input.len() {
-        return Err(Incomplete);
-    }
-
-    let command_name =
-        input[..name_len].trim_end_matches(|c: char| c.is_whitespace() || c == '.');
-    let found = get_command_name_candidates(command_name)
-        .iter()
-        .any(|command| {
-            matches!(command_match(command, command_name), Some(m) if m.missing_words <= 0)
-        });
-    Ok(found)
-}
-
-impl Segmenter {
- /// Handles the very start of a whole file. If the input begins with `#!`,
- /// the whole first line is one `Type::Shbang` segment. Otherwise the
- /// segmenter switches to general parsing at the start of a line and a
- /// command, and segments `input` from there.
- fn parse_shbang<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let shbang_body = match take(input, eof)? {
-         (Some('#'), after_hash) => match take(after_hash, eof)? {
-             (Some('!'), after_bang) => Some(after_bang),
-             _ => None,
-         },
-         _ => None,
-     };
-
-     match shbang_body {
-         Some(body) => {
-             let rest = self.parse_full_line(body, eof)?;
-             self.state = (State::General, Substate::START_OF_COMMAND);
-             Ok((rest, Type::Shbang))
-         }
-         None => {
-             self.state = (
-                 State::General,
-                 Substate::START_OF_COMMAND | Substate::START_OF_LINE,
-             );
-             self.push(input, eof)
-         }
-     }
- }
- /// Decides, according to the syntax mode, whether a line beginning with
- /// `input` starts a new command: always in batch mode, never in
- /// interactive mode, and by looking for a command name in auto mode.
- fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
-     if self.mode == Mode::Auto {
-         detect_command_name(input, eof)
-     } else {
-         Ok(self.mode == Mode::Batch)
-     }
- }
- fn parse_start_of_line<'a>( // Labels the first segment of a line while in `State::General`.
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
- debug_assert_eq!(self.state.0, State::General);
- debug_assert!(self.start_of_line());
- debug_assert!(!input.is_empty());
-
- let (Some(c), rest) = take(input, eof).unwrap() else { // `take` cannot fail here: `input` is non-empty (asserted above).
- unreachable!()
- };
- match c {
- '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
- // This `+` is punctuation that may separate pieces of a string.
- self.state = (State::General, Substate::empty());
- return Ok((rest, Type::Punct));
- }
- '+' | '-' | '.' => { // In the first column these characters mark the start of a command.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((rest, Type::StartCommand));
- }
- _ if c.is_whitespace() => {
- if at_end_of_line(input, eof)? { // A blank line: emit a zero-width separator and clear START_OF_LINE.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Type::SeparateCommands));
- }
- }
- _ => {
- if self.at_command_start(input, eof)? // Mode-dependent check; see `at_command_start`.
- && !self.state.1.contains(Substate::START_OF_COMMAND) // Only needed if a command is already in progress.
- {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Type::StartCommand)); // Zero-width: nothing is consumed.
- }
- }
- }
- self.state.1 = Substate::START_OF_COMMAND; // Clears START_OF_LINE, sets START_OF_COMMAND, then segments as mid-line.
- self.parse_mid_line(input, eof)
- }
- /// Labels the next segment in `State::General` when not at the start of a
- /// line. Dispatches on the first character of `input`.
- ///
- /// Most arms clear the substate, because after a real token the segmenter
- /// is no longer at the start of a command. The arms for comments and
- /// spaces leave it alone.
- fn parse_mid_line<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     debug_assert!(self.state.0 == State::General);
-     debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
-     let (Some(c), rest) = take(input, eof)? else {
-         unreachable!()
-     };
-     match c {
-         '\r' | '\n' if is_end_of_line(input, eof)? => {
-             self.state.1 |= Substate::START_OF_LINE;
-             Ok((
-                 self.parse_newline(input, eof).unwrap().unwrap(),
-                 Type::Newline,
-             ))
-         }
-         '/' => {
-             if let (Some('*'), rest) = take(rest, eof)? {
-                 let rest = skip_comment(rest, eof)?;
-                 Ok((rest, Type::Comment))
-             } else {
-                 self.state.1 = Substate::empty();
-                 Ok((rest, Type::Punct))
-             }
-         }
-         '-' => {
-             // A `-` followed (after optional spaces) by a digit, or by `.`
-             // and a digit, is the sign of a negative number. The number
-             // parser must start at the digits, after the spaces, so that
-             // the segment covers the whole number and not just the sign.
-             let digits = skip_spaces(rest, eof)?;
-             let (c, rest2) = take(digits, eof)?;
-             match c {
-                 Some(c) if c.is_ascii_digit() => {
-                     return self.parse_number(digits, eof);
-                 }
-                 Some('.') => {
-                     if let (Some(c), _rest) = take(rest2, eof)? {
-                         if c.is_ascii_digit() {
-                             return self.parse_number(digits, eof);
-                         }
-                     }
-                 }
-                 None | Some(_) => (),
-             }
-             self.state.1 = Substate::empty();
-             Ok((rest, Type::Punct))
-         }
-         '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
-             self.state.1 = Substate::empty();
-             Ok((rest, Type::Punct))
-         }
-         '*' => {
-             if self.state.1.contains(Substate::START_OF_COMMAND) {
-                 // `*` at the start of a command begins a comment command.
-                 self.state.0 = State::Comment1;
-                 self.parse_comment_1(input, eof)
-             } else {
-                 self.parse_digraph(&['*'], rest, eof)
-             }
-         }
-         '<' => self.parse_digraph(&['=', '>'], rest, eof),
-         '>' => self.parse_digraph(&['='], rest, eof),
-         '~' => self.parse_digraph(&['='], rest, eof),
-         '.' if at_end_of_line(rest, eof)? => {
-             self.state.1 = Substate::START_OF_COMMAND;
-             Ok((rest, Type::EndCommand))
-         }
-         '.' => match take(rest, eof)? {
-             (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
-             _ => Ok((rest, Type::Punct)),
-         },
-         '0'..='9' => self.parse_number(input, eof),
-         'u' | 'U' => self.maybe_parse_string(Type::UnicodeString, (input, rest), eof),
-         'x' | 'X' => self.maybe_parse_string(Type::HexString, (input, rest), eof),
-         '\'' | '"' => self.parse_string(Type::QuotedString, c, rest, eof),
-         '!' => {
-             let (c, rest2) = take(rest, eof)?;
-             match c {
-                 Some('*') => Ok((rest2, Type::MacroId)),
-                 Some(_) => self.parse_id(input, eof),
-                 None => Ok((rest, Type::Punct)),
-             }
-         }
-         c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Type::Spaces)),
-         c if c.may_start_id() => self.parse_id(input, eof),
-         '!'..='~' if c != '\\' && c != '^' => {
-             self.state.1 = Substate::empty();
-             Ok((rest, Type::Punct))
-         }
-         _ => {
-             self.state.1 = Substate::empty();
-             Ok((rest, Type::UnexpectedChar))
-         }
-     }
- }
- /// Scans the body of a quoted string. `input` starts just after the
- /// opening `quote`. A doubled quote character inside the string stands for
- /// one literal quote and does not end it.
- ///
- /// Returns a segment of type `type_` that ends after the closing quote, or
- /// `Type::ExpectedQuote` if the line or the input ends first.
- fn parse_string<'a>(
-     &mut self,
-     type_: Type,
-     quote: char,
-     mut input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     loop {
-         let (Some(c), rest) = take(input, eof)? else {
-             break;
-         };
-         if c == quote {
-             match take(rest, eof)? {
-                 // Doubled quote: skip both and keep scanning.
-                 (Some(next), after_pair) if next == quote => input = after_pair,
-                 _ => {
-                     self.state.1 = Substate::empty();
-                     return Ok((rest, type_));
-                 }
-             }
-         } else if matches!(c, '\r' | '\n') && is_end_of_line(input, eof)? {
-             break;
-         } else {
-             input = rest;
-         }
-     }
-     self.state.1 = Substate::empty();
-     Ok((input, Type::ExpectedQuote))
- }
- /// Handles a leading `x`/`X`/`u`/`U`, which is a string prefix if a quote
- /// follows directly and the start of an identifier otherwise.
- ///
- /// `input.0` is the input including the prefix letter and `input.1` is the
- /// input after it.
- fn maybe_parse_string<'a>(
-     &mut self,
-     type_: Type,
-     input: (&'a str, &'a str),
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let (whole, after_prefix) = input;
-     if let (Some(quote @ ('\'' | '"')), body) = take(after_prefix, eof)? {
-         self.parse_string(type_, quote, body, eof)
-     } else {
-         self.parse_id(whole, eof)
-     }
- }
- /// Looks ahead in `input` for the next identifier, skipping spaces,
- /// comments, and line ends, using a throwaway snippet-mode segmenter.
- ///
- /// Returns `(identifier, rest)`. If any other kind of segment comes first,
- /// the identifier is the empty string and `rest` is the input after that
- /// segment.
- fn next_id_in_command<'a>(
-     &self,
-     mut input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, &'a str), Incomplete> {
-     let mut sub = Segmenter::new(self.mode, true);
-     loop {
-         let (rest, type_) = sub.push(input, eof)?;
-         match type_ {
-             Type::Identifier => {
-                 let len = input.len() - rest.len();
-                 return Ok((&input[..len], rest));
-             }
-
-             // Insignificant segments: keep looking.
-             Type::Shbang | Type::Spaces | Type::Comment | Type::Newline => input = rest,
-
-             // Anything else means there is no identifier next.
-             Type::Number
-             | Type::QuotedString
-             | Type::HexString
-             | Type::UnicodeString
-             | Type::UnquotedString
-             | Type::ReservedWord
-             | Type::Punct
-             | Type::CommentCommand
-             | Type::DoRepeatCommand
-             | Type::DoRepeatOverflow
-             | Type::InlineData
-             | Type::MacroId
-             | Type::MacroName
-             | Type::MacroBody
-             | Type::StartDocument
-             | Type::Document
-             | Type::StartCommand
-             | Type::SeparateCommands
-             | Type::EndCommand
-             | Type::End
-             | Type::ExpectedQuote
-             | Type::ExpectedExponent
-             | Type::UnexpectedChar => return Ok(("", rest)),
-         }
-     }
- }
- fn parse_id<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> { // Labels an identifier-like segment and recognizes commands that need special segmentation.
- let (Some(_), mut end) = take(input, eof).unwrap() else { // Assumes `input` is non-empty; callers get here after seeing its first character.
- unreachable!()
- };
- while let (Some(c), rest) = take(end, eof)? { // Advance `end` past every character that may continue an identifier.
- if !c.may_continue_id() {
- break;
- };
- end = rest;
- }
- let identifier = &input[..input.len() - end.len()];
- let identifier = match identifier.strip_suffix('.') { // A trailing `.` is dropped from the identifier only if it ends the line.
- Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
- _ => identifier,
- };
- let rest = &input[identifier.len()..];
-
- if self.state.1.contains(Substate::START_OF_COMMAND) { // Special commands are only recognized at the start of a command.
- if id_match_n("COMMENT", identifier, 4) {
- self.state.0 = State::Comment1;
- return self.parse_comment_1(input, eof); // The comment segment starts at the keyword itself.
- } else if id_match("DOCUMENT", identifier) {
- self.state.0 = State::Document1;
- return Ok((input, Type::StartDocument)); // Zero-width: the keyword is consumed later as document text.
- } else if id_match_n("DEFINE", identifier, 6) {
- self.state.0 = State::Define1; // Falls through: the keyword itself is labeled by the code below.
- } else if id_match("FILE", identifier) {
- if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
- self.state = (State::FileLabel1, Substate::empty());
- return Ok((rest, Type::Identifier));
- }
- } else if id_match("DO", identifier) {
- if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
- self.state = (State::DoRepeat1, Substate::empty());
- return Ok((rest, Type::Identifier));
- }
- } else if id_match("BEGIN", identifier) {
- let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
- if id_match("DATA", next_id) {
- let rest2 = skip_spaces_and_comments(rest2, eof)?;
- let rest2 = if let Some(s) = rest2.strip_prefix('.') { // An optional `.` may follow `BEGIN DATA`.
- skip_spaces_and_comments(s, eof)?
- } else {
- rest2
- };
- if is_end_of_line(rest2, eof)? { // Only counts as `BEGIN DATA` if nothing else follows on the line.
- let s = &input[..input.len() - rest2.len()];
- self.state = (
- if s.contains('\n') { // `BEGIN` and `DATA` are on different lines.
- State::BeginData1
- } else {
- State::BeginData2
- },
- Substate::empty(),
- );
- return Ok((rest, Type::Identifier));
- }
- }
- }
- }
-
- self.state.1 = Substate::empty(); // An ordinary identifier: no longer at the start of a command.
- let type_ = if is_reserved_word(identifier) {
- Type::ReservedWord
- } else if identifier.starts_with('!') {
- Type::MacroId
- } else {
- Type::Identifier
- };
- Ok((rest, type_))
- }
- /// Finishes a punctuator that may be one or two characters long. `input`
- /// starts after the first character; if its next character is one of
- /// `seconds`, that character is part of the punctuator too.
- fn parse_digraph<'a>(
-     &mut self,
-     seconds: &[char],
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let (next, after) = take(input, eof)?;
-     self.state.1 = Substate::empty();
-     let is_digraph = matches!(next, Some(c) if seconds.contains(&c));
-     let rest = if is_digraph { after } else { input };
-     Ok((rest, Type::Punct))
- }
- /// Scans a number: digits, an optional `.` with fraction digits, and an
- /// optional exponent (`e` or `E`, optional sign, digits).
- ///
- /// A `.` with no digits after it that ends the line is left out of the
- /// number. An exponent marker without digits yields
- /// `Type::ExpectedExponent`.
- fn parse_number<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let mut end = skip_digits(input, eof)?;
-     if let Some(after_dot) = match_char(|c| c == '.', end, eof)? {
-         let after_fraction = skip_digits(after_dot, eof)?;
-         let has_fraction = after_fraction.len() < after_dot.len();
-         if has_fraction || !at_end_of_line(after_fraction, eof)? {
-             end = after_fraction;
-         }
-     }
-
-     let segment = match match_char(|c| matches!(c, 'e' | 'E'), end, eof)? {
-         None => (end, Type::Number),
-         Some(after_e) => {
-             let after_sign = match match_char(|c| matches!(c, '+' | '-'), after_e, eof)? {
-                 Some(after_sign) => after_sign,
-                 None => after_e,
-             };
-             let after_exponent = skip_digits(after_sign, eof)?;
-             if after_exponent.len() == after_sign.len() {
-                 (after_sign, Type::ExpectedExponent)
-             } else {
-                 (after_exponent, Type::Number)
-             }
-         }
-     };
-     self.state.1 = Substate::empty();
-     Ok(segment)
- }
- fn parse_comment_1<'a>( // Scans one line of a comment command and decides how the line ends it or continues it.
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
- enum CommentState<'a> { // What has been seen so far on the current line.
- Blank, // Only whitespace so far.
- NotBlank, // The last non-space character was not a period.
- Period(&'a str), // The last non-space character was a period; holds the input starting at it.
- }
- let mut state = CommentState::Blank;
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- // End of file.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Type::SeparateCommands));
- };
- match c {
- '.' => state = CommentState::Period(input),
- '\n' | '\r' if is_end_of_line(input, eof)? => {
- match state {
- CommentState::Blank => {
- // Blank line ends comment command.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Type::SeparateCommands));
- }
- CommentState::Period(period) => {
- // '.' at end of line ends comment command. The segment stops just before the period.
- self.state = (State::General, Substate::empty());
- return Ok((period, Type::CommentCommand));
- }
- CommentState::NotBlank => {
- // Comment continues onto next line.
- self.state = (State::Comment2, Substate::empty());
- return Ok((input, Type::CommentCommand));
- }
- }
- }
- c if c.is_whitespace() => (), // Whitespace leaves the line state as it is.
- _ => state = CommentState::NotBlank,
- }
- input = rest;
- }
- }
- /// Consumes the line end between two lines of a comment command, then
- /// peeks at the next line to see whether it starts a new command (which
- /// ends the comment) or continues the comment.
- fn parse_comment_2<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let rest = self.parse_newline(input, eof)?.unwrap();
-
-     let (next, _) = take(rest, eof)?;
-     let new_command = match next {
-         None => false,
-         Some('+' | '-' | '.') => true,
-         Some(c) if c.is_whitespace() => false,
-         Some(_) => self.at_command_start(rest, eof)?,
-     };
-
-     self.state = if new_command {
-         (
-             State::General,
-             Substate::START_OF_LINE | Substate::START_OF_COMMAND,
-         )
-     } else {
-         (State::Comment1, self.state.1)
-     };
-     Ok((rest, Type::Newline))
- }
- /// Scans one line of `DOCUMENT` text and returns it as a `Type::Document`
- /// segment. The document ends if the last non-space character on the line
- /// is a period, or at end of input.
- fn parse_document_1<'a>(
-     &mut self,
-     mut input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let mut ends_with_period = false;
-     let mut hit_line_end = false;
-     while let (Some(c), rest) = take(input, eof)? {
-         if matches!(c, '\n' | '\r') && is_end_of_line(input, eof)? {
-             hit_line_end = true;
-             break;
-         }
-         if c == '.' {
-             ends_with_period = true;
-         } else if !c.is_whitespace() {
-             ends_with_period = false;
-         }
-         input = rest;
-     }
-
-     self.state.0 = if !hit_line_end || ends_with_period {
-         State::Document3
-     } else {
-         State::Document2
-     };
-     Ok((input, Type::Document))
- }
- /// Consumes the line end between two lines of `DOCUMENT` text and goes
- /// back to scanning document text.
- ///
- /// Panics if `input` does not start with a line end.
- fn parse_document_2<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let newline = self.parse_newline(input, eof)?;
-     let after_newline = newline.unwrap();
-     self.state.0 = State::Document1;
-     Ok((after_newline, Type::Newline))
- }
- /// Emits the zero-width `Type::EndCommand` that ends a `DOCUMENT` command
- /// and returns to general parsing at the start of a line and a command.
- fn parse_document_3<'a>(
-     &mut self,
-     input: &'a str,
-     _eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let flags = Substate::START_OF_COMMAND | Substate::START_OF_LINE;
-     self.state = (State::General, flags);
-     Ok((input, Type::EndCommand))
- }
- /// Returns true if, after spaces and comments, `input` continues with a
- /// quote character or a `\n`. In that case the file label is segmented
- /// as ordinary syntax, not as an unquoted string.
- fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
-     let trimmed = skip_spaces_and_comments(input, eof)?;
-     let (next, _) = take(trimmed, eof)?;
-     Ok(matches!(next, Some('\'' | '"' | '\n')))
- }
- /// Segments the syntax between `FILE` and `LABEL` as general syntax. Once
- /// the `LABEL` identifier is reached, looks at what follows: a quoted label
- /// is left to general parsing, an unquoted one is handled by the
- /// `FileLabel2` and `FileLabel3` states.
- fn parse_file_label_1<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let mut sub = Segmenter {
-         state: (State::General, self.state.1),
-         nest: self.nest,
-         mode: self.mode,
-     };
-     let (rest, type_) = sub.push(input, eof)?;
-
-     if type_ != Type::Identifier {
-         // Not at `LABEL` yet: only carry the substate forward.
-         self.state.1 = sub.state.1;
-         return Ok((rest, type_));
-     }
-
-     let id = &input[..input.len() - rest.len()];
-     debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
-     if Self::quoted_file_label(rest, eof)? {
-         *self = sub;
-     } else {
-         self.state.0 = State::FileLabel2;
-     }
-     Ok((rest, type_))
- }
- /// Emits the (possibly empty) run of spaces between `LABEL` and an
- /// unquoted label, then moves on to scanning the label text.
- fn parse_file_label_2<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let after_spaces = skip_spaces(input, eof)?;
-     self.state.0 = State::FileLabel3;
-     Ok((after_spaces, Type::Spaces))
- }
- /// Scans an unquoted file label, which runs to the end of the line. If the
- /// last non-space character on the line is a period, the label stops just
- /// before it, so that the period can end the command.
- fn parse_file_label_3<'a>(
-     &mut self,
-     mut input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let mut trailing_period = None;
-     loop {
-         let (c, rest) = take(input, eof)?;
-         let at_end = match c {
-             None | Some('\n' | '\r') => is_end_of_line(input, eof)?,
-             Some(_) => false,
-         };
-         if at_end {
-             self.state = (State::General, Substate::empty());
-             return Ok((trailing_period.unwrap_or(input), Type::UnquotedString));
-         }
-
-         match c {
-             // End of input always counts as end of line above.
-             None => unreachable!(),
-             Some('.') => trailing_period = Some(input),
-             Some(c) if !c.is_whitespace() => trailing_period = None,
-             Some(_) => (),
-         }
-         input = rest;
-     }
- }
- /// Segments `input` as general syntax using a temporary segmenter that
- /// starts from this one's substate. On success, copies the resulting
- /// substate back; this segmenter's major state is left unchanged.
- fn subparse<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
-     let mut sub = Segmenter {
-         state: (State::General, self.state.1),
-         nest: 0,
-         ..*self
-     };
-     let outcome = sub.push(input, eof);
-     if outcome.is_ok() {
-         self.state.1 = sub.state.1;
-     }
-     outcome
- }
- /// Segments the head of a `DO REPEAT` command: the syntax that defines the
- /// stand-in variables, which comes before the lines of syntax to be
- /// repeated (the body).
- fn parse_do_repeat_1<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let (rest, type_) = self.subparse(input, eof)?;
-     match type_ {
-         // A blank line separates the head from the body.
-         Type::SeparateCommands => self.state.0 = State::DoRepeat2,
-         // The head is over, so the body starts here.
-         Type::EndCommand | Type::StartCommand => {
-             self.state.0 = State::DoRepeat3;
-             self.nest = 1;
-         }
-         _ => (),
-     }
-     Ok((rest, type_))
- }
- /// Segments the blank line that separates the head of `DO REPEAT` from its
- /// body. The body begins after the line end.
- fn parse_do_repeat_2<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let segment = self.subparse(input, eof)?;
-     if let (_, Type::Newline) = segment {
-         // We reached the body.
-         self.state.0 = State::DoRepeat3;
-         self.nest = 1;
-     }
-     Ok(segment)
- }
- /// If `input` starts with a line end (`\n` or `\r\n`), returns the input
- /// that follows it; otherwise returns `None`.
- fn parse_newline<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<Option<&'a str>, Incomplete> {
-     Ok(match take(input, eof)? {
-         (Some('\n'), rest) => Some(rest),
-         (Some('\r'), after_cr) => match take(after_cr, eof)? {
-             (Some('\n'), rest) => Some(rest),
-             _ => None,
-         },
-         _ => None,
-     })
- }
-
- /// Skips to the end of the current line. The returned slice starts at the
- /// line end, or is empty at end of input.
- fn parse_full_line<'a>(
-     &mut self,
-     mut input: &'a str,
-     eof: bool,
- ) -> Result<&'a str, Incomplete> {
-     while !is_end_of_line(input, eof)? {
-         // Not at a line end, so `input` is non-empty and `take` succeeds.
-         input = take(input, eof).unwrap().1;
-     }
-     Ok(input)
- }
- /// Classifies the line beginning at `input`, ignoring one leading `-` or
- /// `+`: returns 1 for `DO REPEAT`, -1 for `END REPEAT`, and 0 for anything
- /// else.
- fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result<isize, Incomplete> {
-     let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
-     let (id1, input) = self.next_id_in_command(input, eof)?;
-     let direction = if id_match("DO", id1) {
-         1
-     } else if id_match("END", id1) {
-         -1
-     } else {
-         return Ok(0);
-     };
-
-     let (id2, _) = self.next_id_in_command(input, eof)?;
-     Ok(if id_match("REPEAT", id2) { direction } else { 0 })
- }
- /// Segments the body of `DO REPEAT`, that is, the lines of syntax to be
- /// repeated. Each line is reported as a single [`Type::DoRepeatCommand`].
- ///
- /// `DO REPEAT` can be nested, so this watches for inner
- /// `DO REPEAT...END REPEAT` blocks. `self.nest` holds the nesting level,
- /// which is 1 in the outermost body.
- fn parse_do_repeat_3<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     if let Some(after_newline) = self.parse_newline(input, eof)? {
-         return Ok((after_newline, Type::Newline));
-     }
-
-     let rest = self.parse_full_line(input, eof)?;
-     match self.check_repeat_command(input, eof)? {
-         direction if direction > 0 => match self.nest.checked_add(1) {
-             Some(nest) => self.nest = nest,
-             // Too deeply nested: report the overflow next.
-             None => self.state.0 = State::DoRepeat4,
-         },
-         direction if direction < 0 => {
-             self.nest -= 1;
-             if self.nest == 0 {
-                 // The nesting level dropped to 0, so the body is over and
-                 // this line is segmented as an ordinary command.
-                 self.state = (
-                     State::General,
-                     Substate::START_OF_COMMAND | Substate::START_OF_LINE,
-                 );
-                 return self.push(input, eof);
-             }
-         }
-         _ => (),
-     }
-     Ok((rest, Type::DoRepeatCommand))
- }
- /// Reports that `DO REPEAT` nesting overflowed, as a zero-width
- /// `Type::DoRepeatOverflow` segment, then resumes segmenting the body.
- fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Type), Incomplete> {
-     let segment = (input, Type::DoRepeatOverflow);
-     self.state.0 = State::DoRepeat3;
-     Ok(segment)
- }
- /// Segments the part of a `DEFINE` command that comes before its `(`.
- ///
- /// A `DEFINE` command consists of:
- ///
- /// - The `DEFINE` keyword.
- ///
- /// - An identifier. It is reported as `Type::MacroName` rather than
- ///   `Type::Identifier` or `Type::MacroId` because it must never be
- ///   macro-expanded.
- ///
- /// - Anything but `(`.
- ///
- /// - `(` followed by a sequence of tokens possibly including balanced
- ///   parentheses up to a final `)`.
- ///
- /// - A sequence of any number of lines, one string per line, ending with
- ///   `!ENDDEFINE`. The first line is usually blank (that is, a newline
- ///   follows the `(`). The last line usually just has `!ENDDEFINE.` on
- ///   it, but it can start with other tokens. The whole
- ///   DEFINE...!ENDDEFINE can even be on a single line.
- fn parse_define_1_2<'a>(
-     &mut self,
-     input: &'a str,
-     eof: bool,
- ) -> Result<(&'a str, Type), Incomplete> {
-     let (rest, type_) = self.subparse(input, eof)?;
-
-     let names_macro = matches!(type_, Type::Identifier | Type::MacroId)
-         && self.state.0 == State::Define1;
-     if names_macro {
-         self.state.0 = State::Define2;
-         return Ok((rest, Type::MacroName));
-     }
-
-     if matches!(
-         type_,
-         Type::SeparateCommands | Type::EndCommand | Type::StartCommand
-     ) {
-         // The DEFINE command is malformed because it ended before any `(`
-         // token. Go back to general parsing.
-         self.state.0 = State::General;
-     } else if type_ == Type::Punct && input.starts_with('(') {
-         self.state.0 = State::Define3;
-         self.nest = 1;
-     }
-     Ok((rest, type_))
- }
-    /// Segments the parenthesized part of a `DEFINE` command, tracking the
-    /// parenthesis depth in `self.nest`.
-    ///
-    /// Once the depth returns to zero the state becomes `State::Define4` with an
-    /// empty substate.
-    fn parse_define_3<'a>(
-        &mut self,
-        input: &'a str,
-        eof: bool,
-    ) -> Result<(&'a str, Type), Incomplete> {
-        let (rest, type_) = self.subparse(input, eof)?;
-        if matches!(
-            type_,
-            Type::SeparateCommands | Type::EndCommand | Type::StartCommand
-        ) {
-            // Malformed DEFINE: the command ended while parentheses were still
-            // open.  Fall back to general parsing.
-            self.state.0 = State::General;
-        } else if type_ == Type::Punct {
-            match input.chars().next() {
-                // NOTE(review): unlike the `DO REPEAT` counter, this increment
-                // has no overflow check.
-                Some('(') => self.nest += 1,
-                Some(')') => {
-                    self.nest -= 1;
-                    if self.nest == 0 {
-                        self.state = (State::Define4, Substate::empty());
-                    }
-                }
-                _ => (),
-            }
-        }
-        Ok((rest, type_))
-    }
-    /// Searches `input` for `!ENDDEFINE` (ASCII case-insensitive), passing over
-    /// whatever `skip_spaces_and_comments` skips and over quoted strings.
-    ///
-    /// Returns the suffix of `input` that begins at `!ENDDEFINE`, or `None` if
-    /// the input runs out first or a quoted string is never closed.
-    fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> {
-        loop {
-            input = skip_spaces_and_comments(input, true).unwrap();
-            let (first, after_first) = take(input, true).unwrap();
-            let c = first?;
-            if c == '!' && strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() {
-                return Some(input);
-            }
-            input = if matches!(c, '\'' | '"') {
-                // Resume just past the matching close quote.
-                let (_, after_quote) = after_first.split_once(c)?;
-                after_quote
-            } else {
-                after_first
-            };
-        }
-    }
-
-    /// Segments one line of a macro body, watching for `!ENDDEFINE`.
-    ///
-    /// `State::Define4` means this is the first line of the body (the line that
-    /// also holds the closing parenthesis of the argument definition);
-    /// `State::Define5` means it is a later line.
-    fn parse_define_4_5<'a>(
-        &mut self,
-        input: &'a str,
-        eof: bool,
-    ) -> Result<(&'a str, Type), Incomplete> {
-        let rest = self.parse_full_line(input, eof)?;
-        let line = &input[..input.len() - rest.len()];
-
-        let Some(end) = Self::find_enddefine(line) else {
-            // No `!ENDDEFINE`: the whole line belongs to the macro body.
-            //
-            // A blank first line carries no meaning, so it is reported as
-            // spaces, or skipped entirely if it has no spaces.  Blank lines
-            // later in the body can be significant and are always reported.
-            let blank_first_line =
-                self.state.0 == State::Define4 && line.trim_start().is_empty();
-            if blank_first_line && line.is_empty() {
-                return self.parse_define_6(input, eof);
-            }
-            self.state.0 = State::Define6;
-            let type_ = if blank_first_line {
-                Type::Spaces
-            } else {
-                Type::MacroBody
-            };
-            return Ok((rest, type_));
-        };
-
-        // The macro ends at the `!ENDDEFINE` on this line.
-        self.state = (State::General, Substate::empty());
-        let body_len = line.len() - end.len();
-        if body_len == 0 {
-            // `!ENDDEFINE` is the first thing on the line.
-            return self.push(input, eof);
-        }
-        let (prefix, tail) = input.split_at(body_len);
-        let type_ = if prefix.trim_start().is_empty() {
-            // Only spaces precede `!ENDDEFINE`.
-            Type::Spaces
-        } else {
-            // Real content precedes `!ENDDEFINE`.
-            Type::MacroBody
-        };
-        Ok((tail, type_))
-    }
-    /// Consumes the newline that ends a macro body line, reports it as
-    /// `Type::Newline`, and moves to `State::Define5`.
-    ///
-    /// Panics if `parse_newline` finds no newline at the start of `input`.
-    fn parse_define_6<'a>(
-        &mut self,
-        input: &'a str,
-        eof: bool,
-    ) -> Result<(&'a str, Type), Incomplete> {
-        let after_newline = self.parse_newline(input, eof)?.unwrap();
-        self.state.0 = State::Define5;
-        Ok((after_newline, Type::Newline))
-    }
-    /// Passes through whatever `subparse` yields; a `Type::Newline` segment
-    /// advances the state to `State::BeginData2`.
-    fn parse_begin_data_1<'a>(
-        &mut self,
-        input: &'a str,
-        eof: bool,
-    ) -> Result<(&'a str, Type), Incomplete> {
-        let segment = self.subparse(input, eof)?;
-        if let (_, Type::Newline) = segment {
-            self.state.0 = State::BeginData2;
-        }
-        Ok(segment)
-    }
-    /// Passes through whatever `subparse` yields; a `Type::Newline` segment
-    /// advances the state to `State::BeginData3`.
-    fn parse_begin_data_2<'a>(
-        &mut self,
-        input: &'a str,
-        eof: bool,
-    ) -> Result<(&'a str, Type), Incomplete> {
-        let segment = self.subparse(input, eof)?;
-        if let (_, Type::Newline) = segment {
-            self.state.0 = State::BeginData3;
-        }
-        Ok(segment)
-    }
-    /// Returns true if `line` is an `END DATA` line.
-    ///
-    /// Such a line is `END` and `DATA` (ASCII case-insensitive) separated by
-    /// exactly one whitespace character, followed only by whitespace and at
-    /// most one `.`.
-    fn is_end_data(line: &str) -> bool {
-        let after_end = match strip_prefix_ignore_ascii_case(line, "END") {
-            Some(tail) => tail,
-            None => return false,
-        };
-        let after_space = match take(after_end, true).unwrap() {
-            (Some(separator), tail) if separator.is_whitespace() => tail,
-            _ => return false,
-        };
-        let trailer = match strip_prefix_ignore_ascii_case(after_space, "DATA") {
-            Some(tail) => tail,
-            None => return false,
-        };
-
-        let dots = trailer.chars().filter(|&c| c == '.').count();
-        dots <= 1 && trailer.chars().all(|c| c == '.' || c.is_whitespace())
-    }
-    /// Segments one line of inline data.
-    ///
-    /// A line accepted by `is_end_data` switches back to general parsing at the
-    /// start of a command and is re-segmented from its beginning.  Any other
-    /// line is reported whole as `Type::InlineData`, and `State::BeginData4`
-    /// follows.
-    fn parse_begin_data_3<'a>(
-        &mut self,
-        input: &'a str,
-        eof: bool,
-    ) -> Result<(&'a str, Type), Incomplete> {
-        let rest = self.parse_full_line(input, eof)?;
-        let consumed = input.len() - rest.len();
-        if !Self::is_end_data(&input[..consumed]) {
-            self.state.0 = State::BeginData4;
-            return Ok((rest, Type::InlineData));
-        }
-
-        self.state = (
-            State::General,
-            Substate::START_OF_COMMAND | Substate::START_OF_LINE,
-        );
-        self.push(input, eof)
-    }
-    /// Consumes the newline that ends an inline data line, reports it as
-    /// `Type::Newline`, and returns to `State::BeginData3`.
-    ///
-    /// Panics if `parse_newline` finds no newline at the start of `input`.
-    fn parse_begin_data_4<'a>(
-        &mut self,
-        input: &'a str,
-        eof: bool,
-    ) -> Result<(&'a str, Type), Incomplete> {
-        let after_newline = self.parse_newline(input, eof)?.unwrap();
-        self.state.0 = State::BeginData3;
-        Ok((after_newline, Type::Newline))
-    }
-}
-
-/// Strips `pattern` from the start of `line`, comparing ASCII letters
-/// case-insensitively.
-///
-/// Returns the remainder of `line` after the matched prefix.  Returns `None` if
-/// `line` is shorter than `pattern`, if `pattern.len()` does not fall on a
-/// character boundary of `line`, or if the prefix does not match.
-fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
-    // `get` (rather than indexing) keeps this panic-free on short input and
-    // on multi-byte characters that straddle `pattern.len()`.
-    let prefix = line.get(..pattern.len())?;
-    prefix
-        .eq_ignore_ascii_case(pattern)
-        .then(|| &line[pattern.len()..])
-}
-
-#[cfg(test)]
-mod test {
- use crate::prompt::PromptStyle;
-
- use super::{Mode, Segmenter, Type};
-
-    /// Segments `input` to completion in `mode` and checks the outcome.
-    ///
-    /// The segmenter is driven with `eof` set until it yields `Type::End`.
-    /// Every `(type, text)` pair it yields is collected, and the prompt style
-    /// from `Segmenter::prompt` is sampled right after each `Type::Newline`.
-    ///
-    /// # Panics
-    ///
-    /// Panics, after printing a diff to stderr, if the collected segments
-    /// differ from `expect_segments` or the collected prompts differ from
-    /// `expect_prompts`.
-    fn check_segmentation(
-        mut input: &str,
-        mode: Mode,
-        expect_segments: &[(Type, &str)],
-        expect_prompts: &[PromptStyle],
-    ) {
-        /// Panics, after printing a diff to stderr, unless `actual` equals
-        /// `expected`.  `what` names the compared items in the message.
-        fn assert_same<T>(what: &str, expected: &[T], actual: &[T])
-        where
-            T: PartialEq + std::fmt::Debug,
-        {
-            if actual == expected {
-                return;
-            }
-            eprintln!("{what} differ from expected:");
-            for result in diff::slice(expected, actual) {
-                match result {
-                    diff::Result::Left(left) => eprintln!("-{left:?}"),
-                    diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
-                    diff::Result::Right(right) => eprintln!("+{right:?}"),
-                }
-            }
-            panic!();
-        }
-
-        let mut segmenter = Segmenter::new(mode, false);
-        let mut segments = Vec::with_capacity(expect_segments.len());
-        let mut prompts = Vec::new();
-        loop {
-            let (rest, type_) = segmenter.push(input, true).unwrap();
-            let token = &input[..input.len() - rest.len()];
-            segments.push((type_, token));
-            match type_ {
-                Type::End => break,
-                Type::Newline => prompts.push(segmenter.prompt()),
-                _ => (),
-            }
-            input = rest;
-        }
-
-        assert_same("segments", expect_segments, &segments);
-        assert_same("prompts", expect_prompts, &prompts);
-    }
-
-    /// Segments `input` in `Mode::Auto` and prints each segment's type and text
-    /// to stdout, one per line.  Newline segments are followed by the prompt
-    /// style in effect; the final `Type::End` segment gets no line terminator.
-    fn print_segmentation(mut input: &str) {
-        let mut segmenter = Segmenter::new(Mode::Auto, false);
-        loop {
-            let (rest, type_) = segmenter.push(input, true).unwrap();
-            let (token, _) = input.split_at(input.len() - rest.len());
-            print!("{type_:?} {token:?}");
-            if let Type::End = type_ {
-                break;
-            }
-            if let Type::Newline = type_ {
-                print!(" ({:?})", segmenter.prompt());
-            }
-            println!();
-            input = rest;
-        }
-    }
-
-    /// Identifiers and `!`-prefixed macro identifiers in `Mode::Auto`, including
-    /// non-ASCII names and characters that cannot start an identifier.
-    #[test]
-    fn test_identifiers() {
-        let source = r#"a ab abc abcd !abcd
-A AB ABC ABCD !ABCD
-aB aBC aBcD !aBcD
-$x $y $z !$z
-grève Ângstrom poté
-#a #b #c ## #d !#d
-@efg @ @@. @#@ !@ 
-## # #12345 #.#
-f@#_.#6
-GhIjK
-.x 1y _z
-"#;
-        let expected_segments = [
-            (Type::Identifier, "a"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "ab"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "abc"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "abcd"),
-            (Type::Spaces, " "),
-            (Type::MacroId, "!abcd"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "A"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "AB"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "ABC"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "ABCD"),
-            (Type::Spaces, " "),
-            (Type::MacroId, "!ABCD"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "aB"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "aBC"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "aBcD"),
-            (Type::Spaces, " "),
-            (Type::MacroId, "!aBcD"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "$x"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "$y"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "$z"),
-            (Type::Spaces, " "),
-            (Type::MacroId, "!$z"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "grève"),
-            (Type::Spaces, "\u{00a0}"),
-            (Type::Identifier, "Ângstrom"),
-            (Type::Spaces, "\u{00a0}"),
-            (Type::Identifier, "poté"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "#a"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "#b"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "#c"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "##"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "#d"),
-            (Type::Spaces, " "),
-            (Type::MacroId, "!#d"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "@efg"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "@"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "@@."),
-            (Type::Spaces, " "),
-            (Type::Identifier, "@#@"),
-            (Type::Spaces, " "),
-            (Type::MacroId, "!@"),
-            (Type::Spaces, " "),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "##"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "#"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "#12345"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "#.#"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "f@#_.#6"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "GhIjK"),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "."),
-            (Type::Identifier, "x"),
-            (Type::Spaces, " "),
-            (Type::Number, "1"),
-            (Type::Identifier, "y"),
-            (Type::Spaces, " "),
-            (Type::Punct, "_"),
-            (Type::Identifier, "z"),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-        ];
-        check_segmentation(source, Mode::Auto, &expected_segments, &expected_prompts);
-    }
-
-    /// Identifiers followed by `.`: the dot stays part of the identifier in
-    /// mid-line but becomes an end-of-command marker at the end of a line, even
-    /// when spaces or a comment follow it.
-    #[test]
-    fn test_identifiers_ending_in_dot() {
-        let source = r#"abcd. abcd.
-ABCD. ABCD.
-aBcD. aBcD. 
-$y. $z. あいうえお.
-#c. #d..
-@@. @@....
-#.#.
-#abcd.
-.
-. 
-LMNOP. 
-QRSTUV./* end of line comment */
-qrstuv. /* end of line comment */
-QrStUv./* end of line comment */ 
-wxyz./* unterminated end of line comment
-WXYZ. /* unterminated end of line comment
-WxYz./* unterminated end of line comment 
-"#;
-        let expected_segments = [
-            (Type::Identifier, "abcd."),
-            (Type::Spaces, " "),
-            (Type::Identifier, "abcd"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "ABCD."),
-            (Type::Spaces, " "),
-            (Type::Identifier, "ABCD"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "aBcD."),
-            (Type::Spaces, " "),
-            (Type::Identifier, "aBcD"),
-            (Type::EndCommand, "."),
-            (Type::Spaces, " "),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "$y."),
-            (Type::Spaces, " "),
-            (Type::Identifier, "$z."),
-            (Type::Spaces, " "),
-            (Type::Identifier, "あいうえお"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "#c."),
-            (Type::Spaces, " "),
-            (Type::Identifier, "#d."),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "@@."),
-            (Type::Spaces, " "),
-            (Type::Identifier, "@@..."),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "#.#"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "#abcd"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "."),
-            (Type::Spaces, " "),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "LMNOP"),
-            (Type::EndCommand, "."),
-            (Type::Spaces, " "),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "QRSTUV"),
-            (Type::EndCommand, "."),
-            (Type::Comment, "/* end of line comment */"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "qrstuv"),
-            (Type::EndCommand, "."),
-            (Type::Spaces, " "),
-            (Type::Comment, "/* end of line comment */"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "QrStUv"),
-            (Type::EndCommand, "."),
-            (Type::Comment, "/* end of line comment */"),
-            (Type::Spaces, " "),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "wxyz"),
-            (Type::EndCommand, "."),
-            (Type::Comment, "/* unterminated end of line comment"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "WXYZ"),
-            (Type::EndCommand, "."),
-            (Type::Spaces, " "),
-            (Type::Comment, "/* unterminated end of line comment"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "WxYz"),
-            (Type::EndCommand, "."),
-            (Type::Comment, "/* unterminated end of line comment "),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-        ];
-        check_segmentation(source, Mode::Auto, &expected_segments, &expected_prompts);
-    }
-
-    /// Reserved words in lower and upper case, identifiers that merely start
-    /// with a reserved word, and reserved words followed by `.`.
-    #[test]
-    fn test_reserved_words() {
-        let source = r#"and or not eq ge gt le lt ne all by to with
-AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
-andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
-and. with.
-"#;
-        let expected_segments = [
-            (Type::ReservedWord, "and"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "or"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "not"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "eq"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "ge"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "gt"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "le"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "lt"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "ne"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "all"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "by"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "to"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "with"),
-            (Type::Newline, "\n"),
-            (Type::ReservedWord, "AND"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "OR"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "NOT"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "EQ"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "GE"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "GT"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "LE"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "LT"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "NE"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "ALL"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "BY"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "TO"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "WITH"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "andx"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "orx"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "notx"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "eqx"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "gex"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "gtx"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "lex"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "ltx"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "nex"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "allx"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "byx"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "tox"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "withx"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "and."),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "with"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::First,
-        ];
-        check_segmentation(source, Mode::Auto, &expected_segments, &expected_prompts);
-    }
-
-    /// Punctuators, both space-separated and run together, plus the `!*` macro
-    /// identifier.
-    #[test]
-    fn test_punctuation() {
-        let source = r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
-~&|=>=><=<~=<>(),-+*/[]**!*
-% : ; ? _ ` { } ~ !*
-"#;
-        let expected_segments = [
-            (Type::Punct, "~"),
-            (Type::Spaces, " "),
-            (Type::Punct, "&"),
-            (Type::Spaces, " "),
-            (Type::Punct, "|"),
-            (Type::Spaces, " "),
-            (Type::Punct, "="),
-            (Type::Spaces, " "),
-            (Type::Punct, ">="),
-            (Type::Spaces, " "),
-            (Type::Punct, ">"),
-            (Type::Spaces, " "),
-            (Type::Punct, "<="),
-            (Type::Spaces, " "),
-            (Type::Punct, "<"),
-            (Type::Spaces, " "),
-            (Type::Punct, "~="),
-            (Type::Spaces, " "),
-            (Type::Punct, "<>"),
-            (Type::Spaces, " "),
-            (Type::Punct, "("),
-            (Type::Spaces, " "),
-            (Type::Punct, ")"),
-            (Type::Spaces, " "),
-            (Type::Punct, ","),
-            (Type::Spaces, " "),
-            (Type::Punct, "-"),
-            (Type::Spaces, " "),
-            (Type::Punct, "+"),
-            (Type::Spaces, " "),
-            (Type::Punct, "*"),
-            (Type::Spaces, " "),
-            (Type::Punct, "/"),
-            (Type::Spaces, " "),
-            (Type::Punct, "["),
-            (Type::Spaces, " "),
-            (Type::Punct, "]"),
-            (Type::Spaces, " "),
-            (Type::Punct, "**"),
-            (Type::Newline, "\n"),
-            (Type::Punct, "~"),
-            (Type::Punct, "&"),
-            (Type::Punct, "|"),
-            (Type::Punct, "="),
-            (Type::Punct, ">="),
-            (Type::Punct, ">"),
-            (Type::Punct, "<="),
-            (Type::Punct, "<"),
-            (Type::Punct, "~="),
-            (Type::Punct, "<>"),
-            (Type::Punct, "("),
-            (Type::Punct, ")"),
-            (Type::Punct, ","),
-            (Type::Punct, "-"),
-            (Type::Punct, "+"),
-            (Type::Punct, "*"),
-            (Type::Punct, "/"),
-            (Type::Punct, "["),
-            (Type::Punct, "]"),
-            (Type::Punct, "**"),
-            (Type::MacroId, "!*"),
-            (Type::Newline, "\n"),
-            (Type::Punct, "%"),
-            (Type::Spaces, " "),
-            (Type::Punct, ":"),
-            (Type::Spaces, " "),
-            (Type::Punct, ";"),
-            (Type::Spaces, " "),
-            (Type::Punct, "?"),
-            (Type::Spaces, " "),
-            (Type::Punct, "_"),
-            (Type::Spaces, " "),
-            (Type::Punct, "`"),
-            (Type::Spaces, " "),
-            (Type::Punct, "{"),
-            (Type::Spaces, " "),
-            (Type::Punct, "}"),
-            (Type::Spaces, " "),
-            (Type::Punct, "~"),
-            (Type::Spaces, " "),
-            (Type::MacroId, "!*"),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [PromptStyle::Later, PromptStyle::Later, PromptStyle::Later];
-        check_segmentation(source, Mode::Auto, &expected_segments, &expected_prompts);
-    }
-
-    /// Unsigned numbers: integers, decimals, exponents, and malformed exponents
-    /// that yield `Type::ExpectedExponent`.
-    #[test]
-    fn test_positive_numbers() {
-        let source = r#"0 1 01 001. 1.
-123. /* comment 1 */ /* comment 2 */
-.1 0.1 00.1 00.10
-5e1 6E-1 7e+1 6E+01 6e-03
-.3E1 .4e-1 .5E+1 .6e+01 .7E-03
-1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
-. 1e e1 1e+ 1e- 1.
-"#;
-        let expected_segments = [
-            (Type::Number, "0"),
-            (Type::Spaces, " "),
-            (Type::Number, "1"),
-            (Type::Spaces, " "),
-            (Type::Number, "01"),
-            (Type::Spaces, " "),
-            (Type::Number, "001."),
-            (Type::Spaces, " "),
-            (Type::Number, "1"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Number, "123"),
-            (Type::EndCommand, "."),
-            (Type::Spaces, " "),
-            (Type::Comment, "/* comment 1 */"),
-            (Type::Spaces, " "),
-            (Type::Comment, "/* comment 2 */"),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "."),
-            (Type::Number, "1"),
-            (Type::Spaces, " "),
-            (Type::Number, "0.1"),
-            (Type::Spaces, " "),
-            (Type::Number, "00.1"),
-            (Type::Spaces, " "),
-            (Type::Number, "00.10"),
-            (Type::Newline, "\n"),
-            (Type::Number, "5e1"),
-            (Type::Spaces, " "),
-            (Type::Number, "6E-1"),
-            (Type::Spaces, " "),
-            (Type::Number, "7e+1"),
-            (Type::Spaces, " "),
-            (Type::Number, "6E+01"),
-            (Type::Spaces, " "),
-            (Type::Number, "6e-03"),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "."),
-            (Type::Number, "3E1"),
-            (Type::Spaces, " "),
-            (Type::Number, ".4e-1"),
-            (Type::Spaces, " "),
-            (Type::Number, ".5E+1"),
-            (Type::Spaces, " "),
-            (Type::Number, ".6e+01"),
-            (Type::Spaces, " "),
-            (Type::Number, ".7E-03"),
-            (Type::Newline, "\n"),
-            (Type::Number, "1.23e1"),
-            (Type::Spaces, " "),
-            (Type::Number, "45.6E-1"),
-            (Type::Spaces, " "),
-            (Type::Number, "78.9e+1"),
-            (Type::Spaces, " "),
-            (Type::Number, "99.9E+01"),
-            (Type::Spaces, " "),
-            (Type::Number, "11.2e-03"),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "."),
-            (Type::Spaces, " "),
-            (Type::ExpectedExponent, "1e"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "e1"),
-            (Type::Spaces, " "),
-            (Type::ExpectedExponent, "1e+"),
-            (Type::Spaces, " "),
-            (Type::ExpectedExponent, "1e-"),
-            (Type::Spaces, " "),
-            (Type::Number, "1"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::First,
-        ];
-        check_segmentation(source, Mode::Auto, &expected_segments, &expected_prompts);
-    }
-
-    /// Numbers with a leading `-`, including cases where the `-` is split off
-    /// as a separate punctuator.
-    #[test]
-    fn test_negative_numbers() {
-        let source = r#" -0 -1 -01 -001. -1.
- -123. /* comment 1 */ /* comment 2 */
- -.1 -0.1 -00.1 -00.10
- -5e1 -6E-1 -7e+1 -6E+01 -6e-03
- -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
- -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
- -/**/1
- -. -1e -e1 -1e+ -1e- -1.
-"#;
-        let expected_segments = [
-            (Type::Spaces, " "),
-            (Type::Number, "-0"),
-            (Type::Spaces, " "),
-            (Type::Number, "-1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-01"),
-            (Type::Spaces, " "),
-            (Type::Number, "-001."),
-            (Type::Spaces, " "),
-            (Type::Number, "-1"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Number, "-123"),
-            (Type::EndCommand, "."),
-            (Type::Spaces, " "),
-            (Type::Comment, "/* comment 1 */"),
-            (Type::Spaces, " "),
-            (Type::Comment, "/* comment 2 */"),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Number, "-.1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-0.1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-00.1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-00.10"),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Number, "-5e1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-6E-1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-7e+1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-6E+01"),
-            (Type::Spaces, " "),
-            (Type::Number, "-6e-03"),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Number, "-.3E1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-.4e-1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-.5E+1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-.6e+01"),
-            (Type::Spaces, " "),
-            (Type::Number, "-.7E-03"),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Number, "-1.23e1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-45.6E-1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-78.9e+1"),
-            (Type::Spaces, " "),
-            (Type::Number, "-99.9E+01"),
-            (Type::Spaces, " "),
-            (Type::Number, "-11.2e-03"),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Punct, "-"),
-            (Type::Comment, "/**/"),
-            (Type::Number, "1"),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Punct, "-"),
-            (Type::Punct, "."),
-            (Type::Spaces, " "),
-            (Type::ExpectedExponent, "-1e"),
-            (Type::Spaces, " "),
-            (Type::Punct, "-"),
-            (Type::Identifier, "e1"),
-            (Type::Spaces, " "),
-            (Type::ExpectedExponent, "-1e+"),
-            (Type::Spaces, " "),
-            (Type::ExpectedExponent, "-1e-"),
-            (Type::Spaces, " "),
-            (Type::Number, "-1"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::First,
-        ];
-        check_segmentation(source, Mode::Auto, &expected_segments, &expected_prompts);
-    }
-
-    /// Quoted, hex, and Unicode strings, unterminated strings, and `+`/`-` at
-    /// the start of a line.
-    #[test]
-    fn test_strings() {
-        let source = r#"'x' "y" 'abc'
-'Don''t' "Can't" 'Won''t'
-"""quoted""" '"quoted"'
-'' ""
-'missing end quote
-"missing double quote
-x"4142" X'5152'
-u'fffd' U"041"
-+ new command
-+ /* comment */ 'string continuation'
-+ /* also a punctuator on blank line
-- 'new command'
-"#;
-        let expected_segments = [
-            (Type::QuotedString, "'x'"),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "\"y\""),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "'abc'"),
-            (Type::Newline, "\n"),
-            (Type::QuotedString, "'Don''t'"),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "\"Can't\""),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "'Won''t'"),
-            (Type::Newline, "\n"),
-            (Type::QuotedString, "\"\"\"quoted\"\"\""),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "'\"quoted\"'"),
-            (Type::Newline, "\n"),
-            (Type::QuotedString, "''"),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "\"\""),
-            (Type::Newline, "\n"),
-            (Type::ExpectedQuote, "'missing end quote"),
-            (Type::Newline, "\n"),
-            (Type::ExpectedQuote, "\"missing double quote"),
-            (Type::Newline, "\n"),
-            (Type::HexString, "x\"4142\""),
-            (Type::Spaces, " "),
-            (Type::HexString, "X'5152'"),
-            (Type::Newline, "\n"),
-            (Type::UnicodeString, "u'fffd'"),
-            (Type::Spaces, " "),
-            (Type::UnicodeString, "U\"041\""),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "+"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "new"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "command"),
-            (Type::Newline, "\n"),
-            (Type::Punct, "+"),
-            (Type::Spaces, " "),
-            (Type::Comment, "/* comment */"),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "'string continuation'"),
-            (Type::Newline, "\n"),
-            (Type::Punct, "+"),
-            (Type::Spaces, " "),
-            (Type::Comment, "/* also a punctuator on blank line"),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "-"),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "'new command'"),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::Later,
-        ];
-        check_segmentation(source, Mode::Auto, &expected_segments, &expected_prompts);
-    }
-
-    /// A `#!` line is a `Type::Shbang` segment only on the first line of input;
-    /// later on it is segmented as ordinary tokens.
-    #[test]
-    fn test_shbang() {
-        let source = r#"#! /usr/bin/pspp
-title my title.
-#! /usr/bin/pspp
-"#;
-        let expected_segments = [
-            (Type::Shbang, "#! /usr/bin/pspp"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "title"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "my"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "title"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "#"),
-            (Type::MacroId, "!"),
-            (Type::Spaces, " "),
-            (Type::Punct, "/"),
-            (Type::Identifier, "usr"),
-            (Type::Punct, "/"),
-            (Type::Identifier, "bin"),
-            (Type::Punct, "/"),
-            (Type::Identifier, "pspp"),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [PromptStyle::First, PromptStyle::First, PromptStyle::Later];
-        check_segmentation(
-            source,
-            Mode::Interactive,
-            &expected_segments,
-            &expected_prompts,
-        );
-    }
-
-    /// Comment commands introduced by `*` or the `COMMENT` keyword, ended by a
-    /// period or by a blank line, in interactive mode.
-    #[test]
-    fn test_comment_command() {
-        let source = r#"* Comment commands "don't
-have to contain valid tokens.
-
-** Check ambiguity with ** token.
-****************.
-
-comment keyword works too.
-COMM also.
-com is ambiguous with COMPUTE.
-
- * Comment need not start at left margin.
-
-* Comment ends with blank line
-
-next command.
-
-"#;
-        let expected_segments = [
-            (Type::CommentCommand, "* Comment commands \"don't"),
-            (Type::Newline, "\n"),
-            (Type::CommentCommand, "have to contain valid tokens"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::CommentCommand, "** Check ambiguity with ** token"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::CommentCommand, "****************"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::CommentCommand, "comment keyword works too"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::CommentCommand, "COMM also"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "com"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "is"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "ambiguous"),
-            (Type::Spaces, " "),
-            (Type::ReservedWord, "with"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "COMPUTE"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::CommentCommand, "* Comment need not start at left margin"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::CommentCommand, "* Comment ends with blank line"),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "next"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "command"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::Comment,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::Comment,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-        ];
-        check_segmentation(
-            source,
-            Mode::Interactive,
-            &expected_segments,
-            &expected_prompts,
-        );
-    }
-
-    /// `DOCUMENT` commands (and abbreviations of the keyword): each line is one
-    /// `Type::Document` segment, and zero-width segments bracket the command.
-    #[test]
-    fn test_document_command() {
-        let source = r#"DOCUMENT one line.
-DOC more
- than
- one
- line.
-docu
-first.paragraph
-isn't parsed as tokens
-
-second paragraph.
-"#;
-        let expected_segments = [
-            (Type::StartDocument, ""),
-            (Type::Document, "DOCUMENT one line."),
-            (Type::EndCommand, ""),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::StartDocument, ""),
-            (Type::Document, "DOC more"),
-            (Type::Newline, "\n"),
-            (Type::Document, " than"),
-            (Type::Newline, "\n"),
-            (Type::Document, " one"),
-            (Type::Newline, "\n"),
-            (Type::Document, " line."),
-            (Type::EndCommand, ""),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::StartDocument, ""),
-            (Type::Document, "docu"),
-            (Type::Newline, "\n"),
-            (Type::Document, "first.paragraph"),
-            (Type::Newline, "\n"),
-            (Type::Document, "isn't parsed as tokens"),
-            (Type::Newline, "\n"),
-            (Type::Document, ""),
-            (Type::Newline, "\n"),
-            (Type::Document, "second paragraph."),
-            (Type::EndCommand, ""),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::First,
-            PromptStyle::Document,
-            PromptStyle::Document,
-            PromptStyle::Document,
-            PromptStyle::First,
-            PromptStyle::Document,
-            PromptStyle::Document,
-            PromptStyle::Document,
-            PromptStyle::Document,
-            PromptStyle::First,
-        ];
-        check_segmentation(
-            source,
-            Mode::Interactive,
-            &expected_segments,
-            &expected_prompts,
-        );
-    }
-
-    /// `FILE LABEL` (and abbreviations): an unquoted label is one
-    /// `Type::UnquotedString` segment, while a quoted label stays an ordinary
-    /// quoted string.
-    #[test]
-    fn test_file_label_command() {
-        let source = r#"FIL label isn't quoted.
-FILE
- lab 'is quoted'.
-FILE /*
-/**/ lab not quoted here either
-
-"#;
-        let expected_segments = [
-            (Type::Identifier, "FIL"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "label"),
-            (Type::Spaces, " "),
-            (Type::UnquotedString, "isn't quoted"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "FILE"),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "lab"),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "'is quoted'"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "FILE"),
-            (Type::Spaces, " "),
-            (Type::Comment, "/*"),
-            (Type::Newline, "\n"),
-            (Type::Comment, "/**/"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "lab"),
-            (Type::Spaces, " "),
-            (Type::UnquotedString, "not quoted here either"),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::First,
-            PromptStyle::Later,
-            PromptStyle::First,
-            PromptStyle::Later,
-            PromptStyle::Later,
-            PromptStyle::First,
-        ];
-        check_segmentation(
-            source,
-            Mode::Interactive,
-            &expected_segments,
-            &expected_prompts,
-        );
-    }
-
-    /// `BEGIN DATA`...`END DATA`: lines in between are `Type::InlineData`, and a
-    /// `BEGIN DATA` followed by other tokens does not start inline data.
-    #[test]
-    fn test_begin_data() {
-        let source = r#"begin data.
-end data.
-
-begin data. /*
-123
-xxx
-end data.
-
-BEG /**/ DAT /*
-5 6 7 /* x
-
-end data
-end data
-.
-
-begin
- data.
-data
-end data.
-
-begin data "xxx".
-begin data 123.
-not data
-"#;
-        let expected_segments = [
-            (Type::Identifier, "begin"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "end"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "begin"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::EndCommand, "."),
-            (Type::Spaces, " "),
-            (Type::Comment, "/*"),
-            (Type::Newline, "\n"),
-            (Type::InlineData, "123"),
-            (Type::Newline, "\n"),
-            (Type::InlineData, "xxx"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "end"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "BEG"),
-            (Type::Spaces, " "),
-            (Type::Comment, "/**/"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "DAT"),
-            (Type::Spaces, " "),
-            (Type::Comment, "/*"),
-            (Type::Newline, "\n"),
-            (Type::InlineData, "5 6 7 /* x"),
-            (Type::Newline, "\n"),
-            (Type::InlineData, ""),
-            (Type::Newline, "\n"),
-            (Type::InlineData, "end data"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "end"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::Newline, "\n"),
-            (Type::StartCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "begin"),
-            (Type::Newline, "\n"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::InlineData, "data"),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "end"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::SeparateCommands, ""),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "begin"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::Spaces, " "),
-            (Type::QuotedString, "\"xxx\""),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::Identifier, "begin"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::Spaces, " "),
-            (Type::Number, "123"),
-            (Type::EndCommand, "."),
-            (Type::Newline, "\n"),
-            (Type::ReservedWord, "not"),
-            (Type::Spaces, " "),
-            (Type::Identifier, "data"),
-            (Type::Newline, "\n"),
-            (Type::End, ""),
-        ];
-        let expected_prompts = [
-            PromptStyle::Data,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::Data,
-            PromptStyle::Data,
-            PromptStyle::Data,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::Data,
-            PromptStyle::Data,
-            PromptStyle::Data,
-            PromptStyle::Data,
-            PromptStyle::Later,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::Later,
-            PromptStyle::Data,
-            PromptStyle::Data,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::First,
-            PromptStyle::Later,
-        ];
-        check_segmentation(
-            source,
-            Mode::Interactive,
-            &expected_segments,
-            &expected_prompts,
-        );
-    }
-
- #[test]
- fn test_do_repeat() {
- check_segmentation(
- r#"do repeat x=a b c
- y=d e f.
- do repeat a=1 thru 5.
-another command.
-second command
-+ third command.
-end /* x */ /* y */ repeat print.
-end
- repeat.
-do
- repeat #a=1.
- inner command.
-end repeat.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "do"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::Spaces, " "),
- (Type::Identifier, "x"),
- (Type::Punct, "="),
- (Type::Identifier, "a"),
- (Type::Spaces, " "),
- (Type::Identifier, "b"),
- (Type::Spaces, " "),
- (Type::Identifier, "c"),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "y"),
- (Type::Punct, "="),
- (Type::Identifier, "d"),
- (Type::Spaces, " "),
- (Type::Identifier, "e"),
- (Type::Spaces, " "),
- (Type::Identifier, "f"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, " do repeat a=1 thru 5."),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, "another command."),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, "second command"),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, "+ third command."),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print."),
- (Type::Newline, "\n"),
- (Type::Identifier, "end"),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Identifier, "do"),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::Spaces, " "),
- (Type::Identifier, "#a"),
- (Type::Punct, "="),
- (Type::Number, "1"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, " inner command."),
- (Type::Newline, "\n"),
- (Type::Identifier, "end"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_do_repeat_overflow() {
- const N: usize = 257;
- let do_repeat: Vec<String> = (0..N)
- .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5))
- .collect();
- let end_repeat: Vec<String> = (0..N)
- .rev()
- .map(|i| format!("end repeat. /* {i}\n"))
- .collect();
-
- let s: String = do_repeat
- .iter()
- .chain(end_repeat.iter())
- .map(|s| s.as_str())
- .collect();
- let mut expect_output = vec![
- (Type::Identifier, "do"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::Spaces, " "),
- (Type::Identifier, "v0"),
- (Type::Punct, "="),
- (Type::Number, "0"),
- (Type::Spaces, " "),
- (Type::Identifier, "thru"),
- (Type::Spaces, " "),
- (Type::Number, "5"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- ];
- for i in 1..N {
- expect_output.push((Type::DoRepeatCommand, &do_repeat[i].trim_end()));
- if i >= 255 {
- expect_output.push((Type::DoRepeatOverflow, ""));
- }
- expect_output.push((Type::Newline, "\n"));
- }
- for i in 0..254 {
- expect_output.push((Type::DoRepeatCommand, &end_repeat[i].trim_end()));
- expect_output.push((Type::Newline, "\n"));
- }
- let comments: Vec<String> = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect();
- for comment in &comments {
- expect_output.extend([
- (Type::Identifier, "end"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::EndCommand, "."),
- (Type::Spaces, " "),
- (Type::Comment, comment),
- (Type::Newline, "\n"),
- ]);
- }
- expect_output.push((Type::End, ""));
-
- let expect_prompts: Vec<_> = (0..N * 2 - 3)
- .map(|_| PromptStyle::DoRepeat)
- .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First])
- .collect();
- check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts);
- }
-
- #[test]
- fn test_do_repeat_batch() {
- check_segmentation(
- r#"do repeat x=a b c
- y=d e f
-do repeat a=1 thru 5
-another command
-second command
-+ third command
-end /* x */ /* y */ repeat print
-end
- repeat
-do
- repeat #a=1
-
- inner command
-end repeat
-"#,
- Mode::Batch,
- &[
- (Type::Identifier, "do"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::Spaces, " "),
- (Type::Identifier, "x"),
- (Type::Punct, "="),
- (Type::Identifier, "a"),
- (Type::Spaces, " "),
- (Type::Identifier, "b"),
- (Type::Spaces, " "),
- (Type::Identifier, "c"),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "y"),
- (Type::Punct, "="),
- (Type::Identifier, "d"),
- (Type::Spaces, " "),
- (Type::Identifier, "e"),
- (Type::Spaces, " "),
- (Type::Identifier, "f"),
- (Type::Newline, "\n"),
- (Type::StartCommand, ""),
- (Type::DoRepeatCommand, "do repeat a=1 thru 5"),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, "another command"),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, "second command"),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, "+ third command"),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print"),
- (Type::Newline, "\n"),
- (Type::Identifier, "end"),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::Newline, "\n"),
- (Type::StartCommand, ""),
- (Type::Identifier, "do"),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::Spaces, " "),
- (Type::Identifier, "#a"),
- (Type::Punct, "="),
- (Type::Number, "1"),
- (Type::Newline, "\n"),
- (Type::SeparateCommands, ""),
- (Type::Newline, "\n"),
- (Type::DoRepeatCommand, " inner command"),
- (Type::Newline, "\n"),
- (Type::Identifier, "end"),
- (Type::Spaces, " "),
- (Type::Identifier, "repeat"),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- ],
- );
- }
-
- mod define {
- use crate::{
- lex::segment::{Mode, Type},
- prompt::PromptStyle,
- };
-
- use super::check_segmentation;
-
- #[test]
- fn test_simple() {
- check_segmentation(
- r#"define !macro1()
-var1 var2 var3 "!enddefine"
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::MacroBody, "var1 var2 var3 \"!enddefine\""),
- (Type::Newline, "\n"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_no_newline_after_parentheses() {
- check_segmentation(
- r#"define !macro1() var1 var2 var3 /* !enddefine
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::MacroBody, " var1 var2 var3 /* !enddefine"),
- (Type::Newline, "\n"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_no_newline_before_enddefine() {
- check_segmentation(
- r#"define !macro1()
-var1 var2 var3!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::MacroBody, "var1 var2 var3"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_all_on_one_line() {
- check_segmentation(
- r#"define !macro1()var1 var2 var3!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::MacroBody, "var1 var2 var3"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::First],
- );
- }
-
- #[test]
- fn test_empty() {
- check_segmentation(
- r#"define !macro1()
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_blank_lines() {
- check_segmentation(
- r#"define !macro1()
-
-
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::MacroBody, ""),
- (Type::Newline, "\n"),
- (Type::MacroBody, ""),
- (Type::Newline, "\n"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_arguments() {
- check_segmentation(
- r#"define !macro1(a(), b(), c())
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Identifier, "a"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Punct, ","),
- (Type::Spaces, " "),
- (Type::Identifier, "b"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Punct, ","),
- (Type::Spaces, " "),
- (Type::Identifier, "c"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_multiline_arguments() {
- check_segmentation(
- r#"define !macro1(
- a(), b(
- ),
- c()
-)
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "a"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Punct, ","),
- (Type::Spaces, " "),
- (Type::Identifier, "b"),
- (Type::Punct, "("),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Punct, ")"),
- (Type::Punct, ","),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "c"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_arguments_start_on_second_line() {
- check_segmentation(
- r#"define !macro1
-(x,y,z
-)
-content 1
-content 2
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Newline, "\n"),
- (Type::Punct, "("),
- (Type::Identifier, "x"),
- (Type::Punct, ","),
- (Type::Identifier, "y"),
- (Type::Punct, ","),
- (Type::Identifier, "z"),
- (Type::Newline, "\n"),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::MacroBody, "content 1"),
- (Type::Newline, "\n"),
- (Type::MacroBody, "content 2"),
- (Type::Newline, "\n"),
- (Type::MacroId, "!enddefine"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_1() {
- check_segmentation(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Identifier, "data"),
- (Type::Spaces, " "),
- (Type::Identifier, "list"),
- (Type::Spaces, " "),
- (Type::Punct, "/"),
- (Type::Identifier, "x"),
- (Type::Spaces, " "),
- (Type::Number, "1"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_2() {
- check_segmentation(
- r#"define !macro1
-x.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Newline, "\n"),
- (Type::Identifier, "x"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Identifier, "data"),
- (Type::Spaces, " "),
- (Type::Identifier, "list"),
- (Type::Spaces, " "),
- (Type::Punct, "/"),
- (Type::Identifier, "x"),
- (Type::Spaces, " "),
- (Type::Number, "1"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::Later, PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_3() {
- check_segmentation(
- r#"define !macro1(.
-x.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Identifier, "x"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Identifier, "data"),
- (Type::Spaces, " "),
- (Type::Identifier, "list"),
- (Type::Spaces, " "),
- (Type::Punct, "/"),
- (Type::Identifier, "x"),
- (Type::Spaces, " "),
- (Type::Number, "1"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_4() {
- // Notice the command terminator at the end of the `DEFINE` command,
- // which should not be there and ends it early.
- check_segmentation(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Identifier, "data"),
- (Type::Spaces, " "),
- (Type::Identifier, "list"),
- (Type::Spaces, " "),
- (Type::Punct, "/"),
- (Type::Identifier, "x"),
- (Type::Spaces, " "),
- (Type::Number, "1"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_missing_enddefine() {
- check_segmentation(
- r#"define !macro1()
-content line 1
-content line 2
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::MacroBody, "content line 1"),
- (Type::Newline, "\n"),
- (Type::MacroBody, "content line 2"),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- ],
- );
- }
-
- #[test]
- fn test_missing_enddefine_2() {
- check_segmentation(
- r#"define !macro1()
-"#,
- Mode::Interactive,
- &[
- (Type::Identifier, "define"),
- (Type::Spaces, " "),
- (Type::MacroName, "!macro1"),
- (Type::Punct, "("),
- (Type::Punct, ")"),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[PromptStyle::Define],
- );
- }
- }
-
- #[test]
- fn test_batch_mode() {
- check_segmentation(
- r#"first command
- another line of first command
-+ second command
-third command
-
-fourth command.
- fifth command.
-"#,
- Mode::Batch,
- &[
- (Type::Identifier, "first"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "another"),
- (Type::Spaces, " "),
- (Type::Identifier, "line"),
- (Type::Spaces, " "),
- (Type::Identifier, "of"),
- (Type::Spaces, " "),
- (Type::Identifier, "first"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::Newline, "\n"),
- (Type::StartCommand, "+"),
- (Type::Spaces, " "),
- (Type::Identifier, "second"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::Newline, "\n"),
- (Type::StartCommand, ""),
- (Type::Identifier, "third"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::Newline, "\n"),
- (Type::SeparateCommands, ""),
- (Type::Newline, "\n"),
- (Type::Identifier, "fourth"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "fifth"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_auto_mode() {
- check_segmentation(
- r#"command
- another line of command
-2sls
-+ another command
-another line of second command
-data list /x 1
-aggregate.
-print eject.
-twostep cluster
-
-
-fourth command.
- fifth command.
-"#,
- Mode::Auto,
- &[
- (Type::Identifier, "command"),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "another"),
- (Type::Spaces, " "),
- (Type::Identifier, "line"),
- (Type::Spaces, " "),
- (Type::Identifier, "of"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::Newline, "\n"),
- (Type::StartCommand, ""),
- (Type::Number, "2"),
- (Type::Identifier, "sls"),
- (Type::Newline, "\n"),
- (Type::StartCommand, "+"),
- (Type::Spaces, " "),
- (Type::Identifier, "another"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::Newline, "\n"),
- (Type::Identifier, "another"),
- (Type::Spaces, " "),
- (Type::Identifier, "line"),
- (Type::Spaces, " "),
- (Type::Identifier, "of"),
- (Type::Spaces, " "),
- (Type::Identifier, "second"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::Newline, "\n"),
- (Type::StartCommand, ""),
- (Type::Identifier, "data"),
- (Type::Spaces, " "),
- (Type::Identifier, "list"),
- (Type::Spaces, " "),
- (Type::Punct, "/"),
- (Type::Identifier, "x"),
- (Type::Spaces, " "),
- (Type::Number, "1"),
- (Type::Newline, "\n"),
- (Type::StartCommand, ""),
- (Type::Identifier, "aggregate"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Identifier, "print"),
- (Type::Spaces, " "),
- (Type::Identifier, "eject"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Identifier, "twostep"),
- (Type::Spaces, " "),
- (Type::Identifier, "cluster"),
- (Type::Newline, "\n"),
- (Type::SeparateCommands, ""),
- (Type::Newline, "\n"),
- (Type::SeparateCommands, ""),
- (Type::Newline, "\n"),
- (Type::Identifier, "fourth"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::Spaces, " "),
- (Type::Identifier, "fifth"),
- (Type::Spaces, " "),
- (Type::Identifier, "command"),
- (Type::EndCommand, "."),
- (Type::Newline, "\n"),
- (Type::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
- }
-}
--- /dev/null
+//! Syntax segmentation.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning". This module implements the segmentation phase.
+//! [`super::scan`] contains declarations for the scanning phase.
+//!
+//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
+//! (a segment type) for each byte or contiguous sequence of bytes in the input.
+//! It also, in a few corner cases, outputs zero-width segments that label the
+//! boundary between a pair of bytes in the input.
+//!
+//! Some segment types correspond directly to tokens; for example, an
+//! "identifier" segment ([`Type::Identifier`]) becomes an identifier token
+//! later in lexical analysis. Other segments contribute to tokens but do not
+//! correspond directly; for example, multiple quoted string segments
+//! ([`Type::QuotedString`]) separated by spaces ([`Type::Spaces`]) and "+"
+//! punctuators ([`Type::Punct`]) may be combined to form a single string
+//! token. Still other segments are ignored (e.g. [`Type::Spaces`]) or trigger
+//! special behavior such as error messages later in tokenization (e.g.
+//! [`Type::ExpectedQuote`]).
+
+use crate::{
+ identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar},
+ prompt::PromptStyle,
+};
+use bitflags::bitflags;
+
+use super::command_name::{command_match, COMMAND_NAMES};
+
+/// Segmentation mode.
+///
+/// PSPP syntax is written in one of two modes which are broadly defined as
+/// follows:
+///
+/// - In interactive mode, commands end with a period at the end of the line
+///   or with a blank line.
+///
+/// - In batch mode, the second and subsequent lines of a command are indented
+///   from the left margin.
+///
+/// The segmenter can also try to automatically detect the mode in use, using a
+/// heuristic that is usually correct.
+///
+/// The default mode is [`Mode::Auto`].
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+pub enum Mode {
+ /// Try to interpret input correctly regardless of whether it is written
+ /// for interactive or batch mode.
+ #[default]
+ Auto,
+
+ /// Interactive syntax mode: a command ends with a period at the end of a
+ /// line or with a blank line.
+ Interactive,
+
+ /// Batch syntax mode: continuation lines of a command are indented from
+ /// the left margin.
+ Batch,
+}
+
+/// The type of a segment.
+///
+/// [`Segmenter::push`] labels each piece of the input with one of these
+/// values. Some of them can label a zero-length segment; see
+/// [`Segmenter::push`] for the list.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Type {
+ /// A number, e.g. `123`.
+ Number,
+ /// A quoted string, including its quotes, e.g. `"xxx"`.
+ QuotedString,
+ /// NOTE(review): presumably a hexadecimal string literal (`X'...'`);
+ /// confirm against the string-parsing code.
+ HexString,
+ /// NOTE(review): presumably a Unicode string literal (`U'...'`); confirm
+ /// against the string-parsing code.
+ UnicodeString,
+ /// Text that is treated as a string even though it is not quoted, e.g.
+ /// the rest of the line in `FILE lab not quoted here either`.
+ UnquotedString,
+ /// A reserved word, e.g. `not`.
+ ReservedWord,
+ /// An identifier, e.g. `begin` or `#a`.
+ Identifier,
+ /// A punctuator, e.g. `=`, `(`, `,` or `/`.
+ Punct,
+ /// NOTE(review): presumably a `#!` line at the start of a file (the
+ /// segmenter starts in a "shbang" state unless parsing a snippet);
+ /// confirm.
+ Shbang,
+ /// White space other than a line end. May be zero-length.
+ Spaces,
+ /// A `/* ... */` comment; it need not be closed before the end of the
+ /// line, e.g. `/*`.
+ Comment,
+ /// A line end, e.g. `"\n"`.
+ Newline,
+ /// NOTE(review): presumably the text of a comment command; confirm.
+ CommentCommand,
+ /// One whole line of the body of `DO REPEAT`, e.g. `another command.`.
+ DoRepeatCommand,
+ /// Zero-length marker reported after a [`Type::DoRepeatCommand`] line
+ /// that nests `DO REPEAT` more deeply than the segmenter tracks.
+ DoRepeatOverflow,
+ /// One line of data between `BEGIN DATA` and `END DATA`. May be
+ /// zero-length.
+ InlineData,
+ /// An identifier that begins with `!`, e.g. `!enddefine`.
+ MacroId,
+ /// The name of the macro being defined by `DEFINE`, e.g. `!macro1`.
+ MacroName,
+ /// One line (or part of a line) of a macro body inside `DEFINE`. May be
+ /// empty for a blank line.
+ MacroBody,
+ /// Zero-length segment. NOTE(review): presumably marks the start of a
+ /// `DOCUMENT` command; confirm.
+ StartDocument,
+ /// NOTE(review): presumably one line of `DOCUMENT` text; confirm.
+ Document,
+ /// Marks the start of a new command: either zero-length, or a `+` (or a
+ /// `.`) at the start of a line.
+ StartCommand,
+ /// Zero-length segment at a blank line that separates commands.
+ SeparateCommands,
+ /// The `.` that terminates a command.
+ EndCommand,
+ /// Zero-length segment reported at the end of the input.
+ End,
+ /// Error segment that triggers an error message later in tokenization.
+ /// NOTE(review): presumably a string missing its closing quote; confirm.
+ ExpectedQuote,
+ /// Error segment. NOTE(review): presumably a number with an incomplete
+ /// exponent; confirm.
+ ExpectedExponent,
+ /// Error segment. NOTE(review): presumably a character that cannot start
+ /// any other segment; confirm.
+ UnexpectedChar,
+}
+
+bitflags! {
+ /// Position flags that accompany the segmenter's main state.
+ #[derive(Copy, Clone, Debug)]
+ pub struct Substate: u8 {
+ /// Tested by `Segmenter::start_of_line`, which selects start-of-line
+ /// parsing in the general state.
+ const START_OF_LINE = 1;
+ /// Tested by `Segmenter::start_of_command`, which selects between the
+ /// "first" and "later" prompt styles.
+ const START_OF_COMMAND = 2;
+ }
+}
+
+/// Incremental syntax segmenter.
+///
+/// Create one with [`Segmenter::new`] and feed it input with
+/// [`Segmenter::push`]. The type is `Copy`, so a segmenter's state can be
+/// saved and restored by copying it.
+#[derive(Copy, Clone)]
+pub struct Segmenter {
+ // Main state plus the start-of-line / start-of-command flags.
+ state: (State, Substate),
+ // NOTE(review): presumably the current nesting depth of a construct such as
+ // DO REPEAT (see `Type::DoRepeatOverflow`); confirm against its users.
+ nest: u8,
+ // Syntax mode supplied to `Segmenter::new`.
+ mode: Mode,
+}
+
+/// Error returned when the supplied input is too short to determine the next
+/// segment and more input might follow (that is, `eof` was false). The caller
+/// should retry with a longer input, or with `eof` set to true.
+#[derive(Copy, Clone, Debug)]
+pub struct Incomplete;
+
+impl Segmenter {
+    /// Creates a segmenter that reads syntax in the given `mode`.
+    ///
+    /// With `is_snippet` false, the input is treated as a whole file. This
+    /// means, for example, that a `-` or `+` at the very beginning of the
+    /// syntax is a separator between commands (since `-` or `+` at the
+    /// beginning of a line has this meaning).
+    ///
+    /// With `is_snippet` true, the input is treated as an isolated piece of
+    /// syntax. This means, for example, that a `-` or `+` at the very
+    /// beginning of the syntax is an operator token or (if followed by a
+    /// digit) part of a number.
+    pub fn new(mode: Mode, is_snippet: bool) -> Self {
+        let initial = if is_snippet {
+            State::General
+        } else {
+            State::Shbang
+        };
+        Self {
+            state: (initial, Substate::empty()),
+            nest: 0,
+            mode,
+        }
+    }
+
+    /// Returns the syntax mode this segmenter was created with.
+    pub fn mode(&self) -> Mode {
+        self.mode
+    }
+
+    /// True if the `START_OF_LINE` flag is set in the current substate.
+    fn start_of_line(&self) -> bool {
+        self.state.1.contains(Substate::START_OF_LINE)
+    }
+
+    /// True if the `START_OF_COMMAND` flag is set in the current substate.
+    fn start_of_command(&self) -> bool {
+        self.state.1.contains(Substate::START_OF_COMMAND)
+    }
+
+    /// Returns the style of command prompt to display to an interactive user
+    /// for input in the current state. The return value is most accurate in
+    /// mode `Mode::Interactive` and at the beginning of a line (that is, if
+    /// [`Segmenter::push`] consumed as much as possible of the input up to a
+    /// new-line).
+    pub fn prompt(&self) -> PromptStyle {
+        match self.state.0 {
+            // In these states the prompt depends on whether a new command is
+            // about to start.
+            State::General
+            | State::DoRepeat1
+            | State::DoRepeat2
+            | State::Define1
+            | State::Define2
+            | State::Define3 => {
+                if self.start_of_command() {
+                    PromptStyle::First
+                } else {
+                    PromptStyle::Later
+                }
+            }
+            State::Shbang
+            | State::Document3
+            | State::FileLabel2
+            | State::FileLabel3
+            | State::BeginData1 => PromptStyle::First,
+            State::FileLabel1 | State::BeginData2 => PromptStyle::Later,
+            State::Comment1 | State::Comment2 => PromptStyle::Comment,
+            State::Document1 | State::Document2 => PromptStyle::Document,
+            State::DoRepeat3 | State::DoRepeat4 => PromptStyle::DoRepeat,
+            State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
+            State::BeginData3 | State::BeginData4 => PromptStyle::Data,
+        }
+    }
+
+    /// Attempts to label a prefix of the remaining input with a segment type.
+    /// The caller supplies a prefix of the remaining input as `input`. If
+    /// `eof` is true, then `input` is the entire (remainder) of the input; if
+    /// `eof` is false, then further input is potentially available.
+    ///
+    /// The input may contain '\n' or '\r\n' line ends in any combination.
+    ///
+    /// On success, returns `Ok` with a string slice borrowed from `input` and
+    /// the type of the segment found at the beginning of `input`. The bytes
+    /// of that segment (between zero and `input.len()` of them) have
+    /// (figuratively) been consumed by the segmenter, so the next call should
+    /// not include them in `input`.
+    ///
+    /// NOTE(review): the returned slice is presumably the part of `input`
+    /// that follows the segment; confirm against the `parse_*` methods.
+    ///
+    /// Segments can have zero length, including segment types `Type::End`,
+    /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and
+    /// `Type::Spaces`.
+    ///
+    /// Failure occurs only if the segment type of the bytes in `input` cannot
+    /// yet be determined. In this case, this function returns
+    /// `Err(Incomplete)`. If more input is available, the caller should obtain
+    /// some more, then call again with a longer `input`. If this is not
+    /// enough, the process might need to repeat again and again. If input is
+    /// exhausted, then the caller may call again setting `eof` to true. This
+    /// function will never return `Err(Incomplete)` when `eof` is true.
+    ///
+    /// The caller must not, in a sequence of calls, supply contradictory input.
+    /// That is, bytes provided as part of `input` in one call, but not
+    /// consumed, must not be provided with *different* values on subsequent
+    /// calls. This is because the function must often make decisions based on
+    /// looking ahead beyond the bytes that it consumes.
+    pub fn push<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        if input.is_empty() {
+            // Nothing to label: either the input is finished or the caller
+            // has to supply more.
+            return if eof {
+                Ok((input, Type::End))
+            } else {
+                Err(Incomplete)
+            };
+        }
+
+        match self.state.0 {
+            State::Shbang => self.parse_shbang(input, eof),
+            State::General if self.start_of_line() => self.parse_start_of_line(input, eof),
+            State::General => self.parse_mid_line(input, eof),
+            State::Comment1 => self.parse_comment_1(input, eof),
+            State::Comment2 => self.parse_comment_2(input, eof),
+            State::Document1 => self.parse_document_1(input, eof),
+            State::Document2 => self.parse_document_2(input, eof),
+            State::Document3 => self.parse_document_3(input, eof),
+            State::FileLabel1 => self.parse_file_label_1(input, eof),
+            State::FileLabel2 => self.parse_file_label_2(input, eof),
+            State::FileLabel3 => self.parse_file_label_3(input, eof),
+            State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
+            State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
+            State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
+            State::DoRepeat4 => self.parse_do_repeat_4(input),
+            State::Define1 | State::Define2 => self.parse_define_1_2(input, eof),
+            State::Define3 => self.parse_define_3(input, eof),
+            State::Define4 | State::Define5 => self.parse_define_4_5(input, eof),
+            State::Define6 => self.parse_define_6(input, eof),
+            State::BeginData1 => self.parse_begin_data_1(input, eof),
+            State::BeginData2 => self.parse_begin_data_2(input, eof),
+            State::BeginData3 => self.parse_begin_data_3(input, eof),
+            State::BeginData4 => self.parse_begin_data_4(input, eof),
+        }
+    }
+}
+
+/// Main state of the segmenter's state machine.
+///
+/// `Segmenter::push` dispatches on this value to the `parse_*` method of the
+/// corresponding name, and `Segmenter::prompt` derives the prompt style from
+/// it. Variants that share a name and differ only in their numeric suffix are
+/// distinct states of the same construct.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum State {
+ /// Initial state when the input is not a snippet.
+ Shbang,
+ /// Ordinary syntax; also the initial state for a snippet.
+ General,
+ Comment1,
+ Comment2,
+ Document1,
+ Document2,
+ Document3,
+ FileLabel1,
+ FileLabel2,
+ FileLabel3,
+ DoRepeat1,
+ DoRepeat2,
+ DoRepeat3,
+ DoRepeat4,
+ Define1,
+ Define2,
+ Define3,
+ Define4,
+ Define5,
+ Define6,
+ BeginData1,
+ BeginData2,
+ BeginData3,
+ BeginData4,
+}
+
+/// Splits the first character off `input`.
+///
+/// Returns the character (or `None` if `input` is empty) together with the
+/// rest of `input`. An empty `input` is only an answer once `eof` is true;
+/// before that, more input is required and the result is `Err(Incomplete)`.
+fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
+    let mut chars = input.chars();
+    let head = chars.next();
+    if head.is_none() && !eof {
+        return Err(Incomplete);
+    }
+    Ok((head, chars.as_str()))
+}
+
+/// Skips the body of a `/* ... */` comment, where `input` starts just after
+/// the opening `/*`.
+///
+/// Returns the input that follows the closing `*/`. A comment does not extend
+/// past the end of its line: if a line end (or the end of the input) comes
+/// first, the returned input starts at that line end.
+fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    while let (Some(c), rest) = take(input, eof)? {
+        if matches!(c, '\n' | '\r') && is_end_of_line(input, eof)? {
+            break;
+        }
+        if c == '*' {
+            if let (Some('/'), after) = take(rest, eof)? {
+                return Ok(after);
+            }
+        }
+        input = rest;
+    }
+    Ok(input)
+}
+
+/// Skips the leading characters of `input` for which `f` returns true and
+/// returns what is left.
+///
+/// If everything was skipped and more input may follow (`eof` is false), the
+/// run of matching characters might continue, so the result is
+/// `Err(Incomplete)`.
+fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
+where
+    F: Fn(char) -> bool,
+{
+    let rest = input.trim_start_matches(f);
+    match (rest.is_empty(), eof) {
+        (true, false) => Err(Incomplete),
+        _ => Ok(rest),
+    }
+}
+
+/// Tests the first character of `input` against `f`.
+///
+/// Returns `Ok(Some(rest))`, where `rest` is the input after that character,
+/// if `f` accepts it, and `Ok(None)` if `f` rejects it or the input has ended.
+fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
+where
+    F: Fn(char) -> bool,
+{
+    match take(input, eof)? {
+        (Some(c), rest) if f(c) => Ok(Some(rest)),
+        _ => Ok(None),
+    }
+}
+
+/// Skips white space at the start of `input`, stopping at a line end, and
+/// returns the remaining input.
+fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    while let (Some(c), rest) = take(input, eof)? {
+        // A line end is white space too, but it is never skipped.
+        let at_line_end = matches!(c, '\r' | '\n') && is_end_of_line(input, eof)?;
+        if at_line_end || !c.is_whitespace() {
+            break;
+        }
+        input = rest;
+    }
+    Ok(input)
+}
+
+/// Skips leading ASCII decimal digits in `input`.
+fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
+    skip_matching(|c| matches!(c, '0'..='9'), input, eof)
+}
+
+/// Skips leading white space and `/* ... */` comments in `input`, stopping at
+/// the end of the line or at the first character of anything else.
+///
+/// Fails with [`Incomplete`] if more input is needed to decide.
+fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    loop {
+        let (Some(c), rest) = take(input, eof)? else {
+            return Ok(input);
+        };
+        match c {
+            '/' => match take(rest, eof)? {
+                (Some('*'), body) => input = skip_comment(body, eof)?,
+                // A `/` that does not open a comment is ordinary syntax, so
+                // stop *at* it rather than after it; otherwise the slash
+                // would be silently skipped.
+                _ => return Ok(input),
+            },
+            '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
+            c if c.is_whitespace() => input = rest,
+            _ => return Ok(input),
+        }
+    }
+}
+
+/// Returns true if `input` begins with something that can continue a string
+/// being concatenated with `+`: a quote, an `x`/`X`/`u`/`U` prefix directly
+/// followed by a quote, or the end of the line.
+///
+/// Fails with [`Incomplete`] if more input is needed to decide.
+fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let (Some(c), rest) = take(input, eof)? else {
+        return Ok(false);
+    };
+    match c {
+        'x' | 'X' | 'u' | 'U' => {
+            // Look at the character *after* the prefix letter.
+            let (next, _) = take(rest, eof)?;
+            Ok(matches!(next, Some('\'' | '"')))
+        }
+        '\'' | '"' => Ok(true),
+        '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true),
+        _ => Ok(false),
+    }
+}
+
+/// Returns true if `input` is at the end of a line: at `\n`, at `\r\n`, or at
+/// the end of input.
+fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    match take(input, eof)? {
+        (None, _) | (Some('\n'), _) => Ok(true),
+        (Some('\r'), rest) => Ok(take(rest, eof)?.0 == Some('\n')),
+        _ => Ok(false),
+    }
+}
+
+/// Returns true if only white space and comments separate `input` from the end
+/// of the line.
+fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let remainder = skip_spaces_and_comments(input, eof)?;
+    is_end_of_line(remainder, eof)
+}
+
+/// Returns the first character of `s`.
+///
+/// # Panics
+///
+/// Panics if `s` is empty.
+fn first(s: &str) -> char {
+    let mut chars = s.chars();
+    chars.next().unwrap()
+}
+/// Returns the run of `COMMAND_NAMES` whose first character equals the first
+/// character of `target`, ASCII-uppercased.  An empty `target` has no
+/// candidates.
+///
+/// NOTE(review): this relies on `COMMAND_NAMES` being ordered by first
+/// character, as `partition_point` requires; confirm where it is defined.
+fn get_command_name_candidates(target: &str) -> &[&'static str] {
+    let Some(initial) = target.chars().next() else {
+        return &[];
+    };
+    let initial = initial.to_ascii_uppercase();
+    let start = COMMAND_NAMES.partition_point(|name| first(name) < initial);
+    let end = COMMAND_NAMES.partition_point(|name| first(name) <= initial);
+    &COMMAND_NAMES[start..end]
+}
+
+/// Returns true if `input` begins with text that matches a known command name
+/// with no words missing.
+///
+/// The candidate text is the longest prefix made of white space other than
+/// `\n`, identifier characters other than `.`, and `-`.  Fails with
+/// [`Incomplete`] if that prefix reaches the end of `input` while `eof` is
+/// false.
+fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let is_name_char = |c: char| {
+        (c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-'
+    };
+    let end = input
+        .find(|c: char| !is_name_char(c))
+        .unwrap_or(input.len());
+    if !eof && end == input.len() {
+        return Err(Incomplete);
+    }
+    let command_name = input[..end].trim_end_matches(|c: char| c.is_whitespace() || c == '.');
+    let found = get_command_name_candidates(command_name)
+        .iter()
+        .any(|command| {
+            matches!(command_match(command, command_name), Some(m) if m.missing_words <= 0)
+        });
+    Ok(found)
+}
+
+impl Segmenter {
+    /// Handles [`State::Shbang`]: if `input` starts with `#!`, consumes the
+    /// rest of that line as a [`Type::Shbang`] segment.  Otherwise switches to
+    /// general parsing at the start of a line and a command, and segments
+    /// `input` from there.
+    fn parse_shbang<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (first, after_hash) = take(input, eof)?;
+        if first == Some('#') {
+            let (second, after_bang) = take(after_hash, eof)?;
+            if second == Some('!') {
+                let line_end = self.parse_full_line(after_bang, eof)?;
+                self.state = (State::General, Substate::START_OF_COMMAND);
+                return Ok((line_end, Type::Shbang));
+            }
+        }
+
+        self.state = (
+            State::General,
+            Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+        );
+        self.push(input, eof)
+    }
+    /// Decides whether a line that begins with `input` starts a new command,
+    /// according to the segmenter's mode: always in batch mode, never in
+    /// interactive mode, and in auto mode only if `input` begins with a
+    /// command name.
+    fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
+        match self.mode {
+            Mode::Batch => Ok(true),
+            Mode::Interactive => Ok(false),
+            Mode::Auto => detect_command_name(input, eof),
+        }
+    }
+ /// Segments the start of a line in [`State::General`].
+ ///
+ /// Emits a command-boundary segment when the line begins a new command
+ /// and otherwise hands `input` to `parse_mid_line`.  Returns the
+ /// remaining input and the segment type, or [`Incomplete`] if more input
+ /// is needed to decide.
+ fn parse_start_of_line<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ debug_assert_eq!(self.state.0, State::General);
+ debug_assert!(self.start_of_line());
+ debug_assert!(!input.is_empty());
+
+ // `input` is non-empty (asserted above), so `take` cannot fail.
+ let (Some(c), rest) = take(input, eof).unwrap() else {
+ unreachable!()
+ };
+ match c {
+ '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
+ // This `+` is punctuation that may separate pieces of a string.
+ self.state = (State::General, Substate::empty());
+ return Ok((rest, Type::Punct));
+ }
+ // A leading `+`, `-`, or `.` is consumed as the start-of-command
+ // marker.
+ '+' | '-' | '.' => {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((rest, Type::StartCommand));
+ }
+ // A line with only spaces and comments yields a zero-width
+ // `SeparateCommands` segment.
+ _ if c.is_whitespace() => {
+ if at_end_of_line(input, eof)? {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Type::SeparateCommands));
+ }
+ }
+ // Otherwise emit a zero-width `StartCommand` if the line starts a
+ // command and we are not already at the start of one.
+ _ => {
+ if self.at_command_start(input, eof)?
+ && !self.state.1.contains(Substate::START_OF_COMMAND)
+ {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Type::StartCommand));
+ }
+ }
+ }
+ // Replace the substate with just `START_OF_COMMAND`, which drops
+ // `START_OF_LINE` as `parse_mid_line` requires.
+ self.state.1 = Substate::START_OF_COMMAND;
+ self.parse_mid_line(input, eof)
+ }
+    /// Segments one token in [`State::General`] when not at the start of a
+    /// line.
+    ///
+    /// Returns the remaining input and the type of the segment consumed, or
+    /// [`Incomplete`] if more input is needed to decide.  Most segment types
+    /// clear the substate; comments, spaces and newlines leave
+    /// `START_OF_COMMAND` alone.
+    fn parse_mid_line<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        debug_assert!(self.state.0 == State::General);
+        debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
+        let (Some(c), rest) = take(input, eof)? else {
+            unreachable!()
+        };
+        match c {
+            '\r' | '\n' if is_end_of_line(input, eof)? => {
+                self.state.1 |= Substate::START_OF_LINE;
+                Ok((
+                    self.parse_newline(input, eof).unwrap().unwrap(),
+                    Type::Newline,
+                ))
+            }
+            '/' => {
+                if let (Some('*'), rest) = take(rest, eof)? {
+                    let rest = skip_comment(rest, eof)?;
+                    Ok((rest, Type::Comment))
+                } else {
+                    self.state.1 = Substate::empty();
+                    Ok((rest, Type::Punct))
+                }
+            }
+            '-' => {
+                // `-` starts a negative number if a digit, or `.` plus a
+                // digit, follows, possibly after spaces.  The number must be
+                // parsed from the text *after* those spaces: `parse_number`
+                // does not skip spaces itself, so handing it the unskipped
+                // text would end the segment right after the `-`.
+                let number = skip_spaces(rest, eof)?;
+                let (c, rest2) = take(number, eof)?;
+                match c {
+                    Some(c) if c.is_ascii_digit() => {
+                        return self.parse_number(number, eof);
+                    }
+                    Some('.') => {
+                        if let (Some(c), _rest) = take(rest2, eof)? {
+                            if c.is_ascii_digit() {
+                                return self.parse_number(number, eof);
+                            }
+                        }
+                    }
+                    None | Some(_) => (),
+                }
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::Punct))
+            }
+            '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::Punct))
+            }
+            '*' => {
+                if self.state.1.contains(Substate::START_OF_COMMAND) {
+                    // `*` at the start of a command begins a comment command.
+                    self.state.0 = State::Comment1;
+                    self.parse_comment_1(input, eof)
+                } else {
+                    self.parse_digraph(&['*'], rest, eof)
+                }
+            }
+            '<' => self.parse_digraph(&['=', '>'], rest, eof),
+            '>' => self.parse_digraph(&['='], rest, eof),
+            '~' => self.parse_digraph(&['='], rest, eof),
+            '.' if at_end_of_line(rest, eof)? => {
+                self.state.1 = Substate::START_OF_COMMAND;
+                Ok((rest, Type::EndCommand))
+            }
+            '.' => match take(rest, eof)? {
+                (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
+                _ => Ok((rest, Type::Punct)),
+            },
+            '0'..='9' => self.parse_number(input, eof),
+            'u' | 'U' => self.maybe_parse_string(Type::UnicodeString, (input, rest), eof),
+            'x' | 'X' => self.maybe_parse_string(Type::HexString, (input, rest), eof),
+            '\'' | '"' => self.parse_string(Type::QuotedString, c, rest, eof),
+            '!' => {
+                let (c, rest2) = take(rest, eof)?;
+                match c {
+                    Some('*') => Ok((rest2, Type::MacroId)),
+                    Some(_) => self.parse_id(input, eof),
+                    None => Ok((rest, Type::Punct)),
+                }
+            }
+            c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Type::Spaces)),
+            c if c.may_start_id() => self.parse_id(input, eof),
+            '!'..='~' if c != '\\' && c != '^' => {
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::Punct))
+            }
+            _ => {
+                self.state.1 = Substate::empty();
+                Ok((rest, Type::UnexpectedChar))
+            }
+        }
+    }
+    /// Segments the body of a string, given the text just after the opening
+    /// `quote`.
+    ///
+    /// A doubled quote character inside the string stands for one literal
+    /// quote.  Returns a segment of type `type_` ending after the closing
+    /// quote, or [`Type::ExpectedQuote`] ending at the end of the line (or of
+    /// the input) if the string is not closed.  The substate is cleared
+    /// either way.
+    fn parse_string<'a>(
+        &mut self,
+        type_: Type,
+        quote: char,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        while let (Some(c), rest) = take(input, eof)? {
+            match c {
+                _ if c == quote => {
+                    let (c, rest2) = take(rest, eof)?;
+                    if c != Some(quote) {
+                        self.state.1 = Substate::empty();
+                        return Ok((rest, type_));
+                    }
+                    // Doubled quote: skip both and keep scanning.
+                    input = rest2;
+                }
+                '\r' | '\n' if is_end_of_line(input, eof)? => break,
+                _ => input = rest,
+            }
+        }
+        self.state.1 = Substate::empty();
+        Ok((input, Type::ExpectedQuote))
+    }
+    /// Segments a token that starts with a string prefix letter.
+    ///
+    /// `input.0` is the text starting at the prefix letter and `input.1` the
+    /// text just after it.  If a quote follows the letter, the token is a
+    /// string of type `type_`; otherwise it is segmented as an identifier.
+    fn maybe_parse_string<'a>(
+        &mut self,
+        type_: Type,
+        input: (&'a str, &'a str),
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (at_prefix, after_prefix) = input;
+        if let (Some(quote @ ('\'' | '"')), body) = take(after_prefix, eof)? {
+            self.parse_string(type_, quote, body, eof)
+        } else {
+            self.parse_id(at_prefix, eof)
+        }
+    }
+ /// Looks ahead in `input` for the next identifier in the current
+ /// command.
+ ///
+ /// Skips shbang, space, comment and newline segments.  Returns
+ /// `(id, rest)`, where `id` is the text of the identifier segment found
+ /// and `rest` is the input after it.  If any other kind of segment comes
+ /// first, `id` is the empty string.
+ ///
+ /// The lookahead runs on a separate segmenter, so `self` is not changed.
+ fn next_id_in_command<'a>(
+ &self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, &'a str), Incomplete> {
+ let mut sub = Segmenter::new(self.mode, true);
+ loop {
+ let (rest, type_) = sub.push(input, eof)?;
+ match type_ {
+ Type::Shbang | Type::Spaces | Type::Comment | Type::Newline => (),
+
+ Type::Identifier => return Ok((&input[..input.len() - rest.len()], rest)),
+
+ // Listed exhaustively (no `_` arm) so that adding a `Type`
+ // variant forces a decision here.
+ Type::Number
+ | Type::QuotedString
+ | Type::HexString
+ | Type::UnicodeString
+ | Type::UnquotedString
+ | Type::ReservedWord
+ | Type::Punct
+ | Type::CommentCommand
+ | Type::DoRepeatCommand
+ | Type::DoRepeatOverflow
+ | Type::InlineData
+ | Type::MacroId
+ | Type::MacroName
+ | Type::MacroBody
+ | Type::StartDocument
+ | Type::Document
+ | Type::StartCommand
+ | Type::SeparateCommands
+ | Type::EndCommand
+ | Type::End
+ | Type::ExpectedQuote
+ | Type::ExpectedExponent
+ | Type::UnexpectedChar => return Ok(("", rest)),
+ }
+ input = rest;
+ }
+ }
+ /// Segments an identifier (or reserved word or macro identifier) at the
+ /// start of `input`, which must be non-empty.
+ ///
+ /// At the start of a command, it also recognizes the commands that need
+ /// special segmentation (`COMMENT`, `DOCUMENT`, `DEFINE`, `FILE LABEL`,
+ /// `DO REPEAT`, `BEGIN DATA`) and switches to the matching state.
+ fn parse_id<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+ // The caller has already vetted the first character; skip it.
+ let (Some(_), mut end) = take(input, eof).unwrap() else {
+ unreachable!()
+ };
+ while let (Some(c), rest) = take(end, eof)? {
+ if !c.may_continue_id() {
+ break;
+ };
+ end = rest;
+ }
+ let identifier = &input[..input.len() - end.len()];
+ // A trailing `.` belongs to the identifier unless it is the last
+ // thing on the line, in which case it is left unconsumed.
+ let identifier = match identifier.strip_suffix('.') {
+ Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
+ _ => identifier,
+ };
+ let rest = &input[identifier.len()..];
+
+ if self.state.1.contains(Substate::START_OF_COMMAND) {
+ if id_match_n("COMMENT", identifier, 4) {
+ self.state.0 = State::Comment1;
+ return self.parse_comment_1(input, eof);
+ } else if id_match("DOCUMENT", identifier) {
+ // Zero-width segment; the `DOCUMENT` keyword itself is then
+ // scanned as part of the first document line.
+ self.state.0 = State::Document1;
+ return Ok((input, Type::StartDocument));
+ } else if id_match_n("DEFINE", identifier, 6) {
+ // Falls through to report `DEFINE` as an ordinary identifier.
+ self.state.0 = State::Define1;
+ } else if id_match("FILE", identifier) {
+ if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
+ self.state = (State::FileLabel1, Substate::empty());
+ return Ok((rest, Type::Identifier));
+ }
+ } else if id_match("DO", identifier) {
+ if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
+ self.state = (State::DoRepeat1, Substate::empty());
+ return Ok((rest, Type::Identifier));
+ }
+ } else if id_match("BEGIN", identifier) {
+ let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
+ if id_match("DATA", next_id) {
+ // `BEGIN DATA` only counts if nothing but an optional `.`,
+ // spaces and comments follow it on the line.
+ let rest2 = skip_spaces_and_comments(rest2, eof)?;
+ let rest2 = if let Some(s) = rest2.strip_prefix('.') {
+ skip_spaces_and_comments(s, eof)?
+ } else {
+ rest2
+ };
+ if is_end_of_line(rest2, eof)? {
+ // If `BEGIN` and `DATA` are on different lines, one
+ // more newline must pass before the data starts.
+ let s = &input[..input.len() - rest2.len()];
+ self.state = (
+ if s.contains('\n') {
+ State::BeginData1
+ } else {
+ State::BeginData2
+ },
+ Substate::empty(),
+ );
+ return Ok((rest, Type::Identifier));
+ }
+ }
+ }
+ }
+
+ self.state.1 = Substate::empty();
+ let type_ = if is_reserved_word(identifier) {
+ Type::ReservedWord
+ } else if identifier.starts_with('!') {
+ Type::MacroId
+ } else {
+ Type::Identifier
+ };
+ Ok((rest, type_))
+ }
+    /// Segments a punctuator that may be one or two characters long.
+    ///
+    /// `input` is the text after the first character.  If its next character
+    /// is one of `seconds`, that character is consumed as well.  The result
+    /// is always [`Type::Punct`], and the substate is cleared.
+    fn parse_digraph<'a>(
+        &mut self,
+        seconds: &[char],
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (next, after) = take(input, eof)?;
+        self.state.1 = Substate::empty();
+        let rest = if matches!(next, Some(c) if seconds.contains(&c)) {
+            after
+        } else {
+            input
+        };
+        Ok((rest, Type::Punct))
+    }
+ /// Segments a number: digits, an optional `.` with optional fraction
+ /// digits, and an optional exponent.
+ ///
+ /// Returns [`Type::Number`], or [`Type::ExpectedExponent`] if an `e` or
+ /// `E` (and optional sign) is not followed by any digit.  The substate is
+ /// cleared either way.
+ fn parse_number<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ let mut input = skip_digits(input, eof)?;
+ if let Some(rest) = match_char(|c| c == '.', input, eof)? {
+ let rest2 = skip_digits(rest, eof)?;
+ // Include the `.` if digits follow it, or if it is not the last
+ // thing on the line.  A bare `.` at end of line is left
+ // unconsumed.
+ if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
+ input = rest2;
+ }
+ };
+ if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
+ let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
+ let rest2 = skip_digits(rest, eof)?;
+ // No digits after the exponent marker and optional sign.
+ if rest2.len() == rest.len() {
+ self.state.1 = Substate::empty();
+ return Ok((rest, Type::ExpectedExponent));
+ }
+ input = rest2;
+ }
+ self.state.1 = Substate::empty();
+ Ok((input, Type::Number))
+ }
+ /// Segments one line of a comment command ([`State::Comment1`]).
+ ///
+ /// Scans to the end of the line and reports [`Type::CommentCommand`] for
+ /// the text, or a zero-width [`Type::SeparateCommands`] at a blank line
+ /// or at the end of input, which ends the comment command.
+ fn parse_comment_1<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ // What has been seen on this line so far.
+ enum CommentState<'a> {
+ // Only white space.
+ Blank,
+ // The last non-space character was not a period.
+ NotBlank,
+ // The last non-space character was a period; holds the input
+ // starting at that period.
+ Period(&'a str),
+ }
+ let mut state = CommentState::Blank;
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ // End of file.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Type::SeparateCommands));
+ };
+ match c {
+ '.' => state = CommentState::Period(input),
+ '\n' | '\r' if is_end_of_line(input, eof)? => {
+ match state {
+ CommentState::Blank => {
+ // Blank line ends comment command.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Type::SeparateCommands));
+ }
+ CommentState::Period(period) => {
+ // '.' at end of line ends comment command.  The
+ // segment stops just before the period.
+ self.state = (State::General, Substate::empty());
+ return Ok((period, Type::CommentCommand));
+ }
+ CommentState::NotBlank => {
+ // Comment continues onto next line.
+ self.state = (State::Comment2, Substate::empty());
+ return Ok((input, Type::CommentCommand));
+ }
+ }
+ }
+ c if c.is_whitespace() => (),
+ _ => state = CommentState::NotBlank,
+ }
+ input = rest;
+ }
+ }
+    /// Handles [`State::Comment2`]: consumes the newline after a line of a
+    /// comment command and decides whether the next line continues the
+    /// comment or starts a new command.
+    ///
+    /// The next line starts a new command if it begins with `+`, `-` or `.`,
+    /// or if it begins with a non-space character and `at_command_start`
+    /// says so.
+    fn parse_comment_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_newline(input, eof)?.unwrap();
+
+        let (next, _) = take(rest, eof)?;
+        let starts_command = match next {
+            Some('+' | '-' | '.') => true,
+            Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?,
+            _ => false,
+        };
+        if starts_command {
+            self.state = (
+                State::General,
+                Substate::START_OF_LINE | Substate::START_OF_COMMAND,
+            );
+        } else {
+            self.state.0 = State::Comment1;
+        }
+        Ok((rest, Type::Newline))
+    }
+    /// Handles [`State::Document1`]: reports one line of a `DOCUMENT` command
+    /// as a [`Type::Document`] segment.
+    ///
+    /// The command ends when the last non-space character on the line is a
+    /// period, or at the end of input.
+    fn parse_document_1<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut ends_with_period = false;
+        while let (Some(c), rest) = take(input, eof)? {
+            match c {
+                '.' => ends_with_period = true,
+                '\n' | '\r' if is_end_of_line(input, eof)? => {
+                    self.state.0 = if ends_with_period {
+                        State::Document3
+                    } else {
+                        State::Document2
+                    };
+                    return Ok((input, Type::Document));
+                }
+                c if !c.is_whitespace() => ends_with_period = false,
+                _ => (),
+            }
+            input = rest;
+        }
+        // End of input ends the command.
+        self.state.0 = State::Document3;
+        Ok((input, Type::Document))
+    }
+    /// Handles [`State::Document2`]: consumes the newline between two lines of
+    /// a `DOCUMENT` command and returns to [`State::Document1`].
+    fn parse_document_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let after_newline = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::Document1;
+        Ok((after_newline, Type::Newline))
+    }
+    /// Handles [`State::Document3`]: emits the zero-width
+    /// [`Type::EndCommand`] that finishes a `DOCUMENT` command and returns to
+    /// general parsing at the start of a line and a command.
+    fn parse_document_3<'a>(
+        &mut self,
+        input: &'a str,
+        _eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let substate = Substate::START_OF_COMMAND | Substate::START_OF_LINE;
+        self.state = (State::General, substate);
+        Ok((input, Type::EndCommand))
+    }
+    /// Returns true if what follows `FILE LABEL` should be segmented as
+    /// ordinary syntax rather than as an unquoted label: after spaces and
+    /// comments, `input` starts with a quote or is at a line break.
+    ///
+    /// Both `\n` and `\r\n` count as line breaks, so input with CRLF line ends
+    /// is segmented the same way as input with LF line ends.
+    fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
+        let input = skip_spaces_and_comments(input, eof)?;
+        match take(input, eof)?.0 {
+            Some('\'') | Some('"') => Ok(true),
+            Some('\n') | Some('\r') => is_end_of_line(input, eof),
+            _ => Ok(false),
+        }
+    }
+ /// Handles [`State::FileLabel1`]: segments the tokens between `FILE` and
+ /// `LABEL` with general-mode rules, then decides how the label itself
+ /// will be segmented.
+ fn parse_file_label_1<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ // Work on a copy in general state so that `self` only changes once
+ // the outcome is known.
+ let mut sub = Segmenter {
+ state: (State::General, self.state.1),
+ ..*self
+ };
+ let (rest, type_) = sub.push(input, eof)?;
+ if type_ == Type::Identifier {
+ let id = &input[..input.len() - rest.len()];
+ debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
+ if Self::quoted_file_label(rest, eof)? {
+ // The label is quoted (or absent): carry on as general syntax.
+ *self = sub;
+ } else {
+ self.state.0 = State::FileLabel2;
+ }
+ } else {
+ // Spaces, comments, etc. before `LABEL`: keep only the substate.
+ self.state.1 = sub.state.1;
+ }
+ Ok((rest, type_))
+ }
+    /// Handles [`State::FileLabel2`]: reports the spaces between `LABEL` and
+    /// an unquoted label as a [`Type::Spaces`] segment, then moves on to
+    /// [`State::FileLabel3`].
+    fn parse_file_label_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let label_start = skip_spaces(input, eof)?;
+        self.state.0 = State::FileLabel3;
+        Ok((label_start, Type::Spaces))
+    }
+    /// Handles [`State::FileLabel3`]: reports an unquoted file label as a
+    /// [`Type::UnquotedString`] segment that runs to the end of the line.
+    ///
+    /// If the last non-space character on the line is a period, the segment
+    /// stops just before that period.
+    fn parse_file_label_3<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let mut trailing_period = None;
+        loop {
+            if is_end_of_line(input, eof)? {
+                self.state = (State::General, Substate::empty());
+                return Ok((trailing_period.unwrap_or(input), Type::UnquotedString));
+            }
+            // Not at end of line, so there is at least one more character.
+            let (Some(c), rest) = take(input, eof)? else {
+                unreachable!()
+            };
+            if c == '.' {
+                trailing_period = Some(input);
+            } else if !c.is_whitespace() {
+                trailing_period = None;
+            }
+            input = rest;
+        }
+    }
+    /// Segments `input` with general-mode rules on behalf of a special state.
+    ///
+    /// Runs a temporary segmenter in [`State::General`] with this segmenter's
+    /// mode and substate, and copies only the resulting substate back, so the
+    /// caller's own major state and nesting level are untouched.
+    fn subparse<'a>(&mut self, input: &'a str, eof: bool) -> Result<(&'a str, Type), Incomplete> {
+        let mut general = Segmenter {
+            mode: self.mode,
+            state: (State::General, self.state.1),
+            nest: 0,
+        };
+        let (rest, type_) = general.push(input, eof)?;
+        self.state.1 = general.state.1;
+        Ok((rest, type_))
+    }
+    /// Handles [`State::DoRepeat1`]: the head of a `DO REPEAT` command, that
+    /// is, the syntax that defines the stand-in variables, which comes before
+    /// the body (the lines of syntax to be repeated).
+    fn parse_do_repeat_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        match type_ {
+            // A blank line separates the head from the body.
+            Type::SeparateCommands => self.state.0 = State::DoRepeat2,
+            // The head is over, so the body starts here.
+            Type::EndCommand | Type::StartCommand => {
+                self.state.0 = State::DoRepeat3;
+                self.nest = 1;
+            }
+            _ => (),
+        }
+        Ok((rest, type_))
+    }
+    /// Handles [`State::DoRepeat2`]: the blank line that separates the head
+    /// of a `DO REPEAT` command from its body.  The body begins after the
+    /// newline.
+    fn parse_do_repeat_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if matches!(type_, Type::Newline) {
+            self.state.0 = State::DoRepeat3;
+            self.nest = 1;
+        }
+        Ok((rest, type_))
+    }
+    /// Consumes a line break (`\n` or `\r\n`) at the start of `input`.
+    ///
+    /// Returns the text after the line break, or `None` if `input` does not
+    /// start with one.
+    fn parse_newline<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<&'a str>, Incomplete> {
+        Ok(match take(input, eof)? {
+            (Some('\n'), rest) => Some(rest),
+            (Some('\r'), rest) => match take(rest, eof)? {
+                (Some('\n'), rest) => Some(rest),
+                _ => None,
+            },
+            _ => None,
+        })
+    }
+
+    /// Skips to the end of the current line.
+    ///
+    /// Returns the text starting at the line break, or the empty remainder at
+    /// the end of input.
+    fn parse_full_line<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<&'a str, Incomplete> {
+        while !is_end_of_line(input, eof)? {
+            // Not at end of line, so a character is available.
+            input = take(input, eof).unwrap().1;
+        }
+        Ok(input)
+    }
+    /// Classifies a line in the body of `DO REPEAT`.
+    ///
+    /// Returns 1 if the line starts a nested `DO REPEAT`, -1 if it is an
+    /// `END REPEAT`, and 0 otherwise.  One leading `-` or `+` is ignored.
+    fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result<isize, Incomplete> {
+        let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
+        let (id1, input) = self.next_id_in_command(input, eof)?;
+        let direction = if id_match("DO", id1) {
+            1
+        } else if id_match("END", id1) {
+            -1
+        } else {
+            return Ok(0);
+        };
+        if id_match("REPEAT", self.next_id_in_command(input, eof)?.0) {
+            Ok(direction)
+        } else {
+            Ok(0)
+        }
+    }
+    /// Handles [`State::DoRepeat3`]: the body of `DO REPEAT`, whose lines of
+    /// syntax are each reported as a single [`Type::DoRepeatCommand`].
+    ///
+    /// `DO REPEAT` can be nested, so nested `DO REPEAT...END REPEAT` blocks
+    /// are tracked as well.  `self.nest` holds the nesting level, starting at
+    /// 1.
+    fn parse_do_repeat_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        if let Some(rest) = self.parse_newline(input, eof)? {
+            return Ok((rest, Type::Newline));
+        }
+        let rest = self.parse_full_line(input, eof)?;
+        match self.check_repeat_command(input, eof)? {
+            direction if direction > 0 => match self.nest.checked_add(1) {
+                Some(nest) => self.nest = nest,
+                None => self.state.0 = State::DoRepeat4,
+            },
+            direction if direction < 0 => {
+                self.nest -= 1;
+                if self.nest == 0 {
+                    // The outermost `END REPEAT`: the body is finished, so
+                    // segment this line as ordinary syntax.
+                    self.state = (
+                        State::General,
+                        Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+                    );
+                    return self.push(input, eof);
+                }
+            }
+            _ => (),
+        }
+        Ok((rest, Type::DoRepeatCommand))
+    }
+    /// Handles [`State::DoRepeat4`]: reports a zero-width
+    /// [`Type::DoRepeatOverflow`] after the nesting counter overflowed, then
+    /// resumes segmenting the body.
+    fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Type), Incomplete> {
+        let overflow = (input, Type::DoRepeatOverflow);
+        self.state.0 = State::DoRepeat3;
+        Ok(overflow)
+    }
+    /// Handles [`State::Define1`] and [`State::Define2`]: the part of a
+    /// `DEFINE` command that comes before the opening `(`.
+    ///
+    /// A `DEFINE` command consists of:
+    ///
+    /// - The `DEFINE` keyword.
+    ///
+    /// - An identifier.  It is reported as `Type::MacroName` rather than
+    ///   `Type::Identifier` or `Type::MacroId`, because this identifier must
+    ///   never be macro-expanded.
+    ///
+    /// - Anything but `(`.
+    ///
+    /// - `(`, then a sequence of tokens that may include balanced
+    ///   parentheses, up to a final `)`.
+    ///
+    /// - Any number of lines, one string per line, ending with `!ENDDEFINE`.
+    ///   The first line is usually blank (a newline follows the `(`).  The
+    ///   last line usually holds just `!ENDDEFINE.`, but it may start with
+    ///   other tokens.  The whole DEFINE...!ENDDEFINE may even fit on a
+    ///   single line.
+    fn parse_define_1_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+
+        let expecting_name = self.state.0 == State::Define1;
+        if expecting_name && matches!(type_, Type::Identifier | Type::MacroId) {
+            self.state.0 = State::Define2;
+            return Ok((rest, Type::MacroName));
+        }
+
+        if matches!(
+            type_,
+            Type::SeparateCommands | Type::EndCommand | Type::StartCommand
+        ) {
+            // Malformed DEFINE: the command ended before any `(` token.
+            // Go back to general parsing.
+            self.state.0 = State::General;
+        } else if type_ == Type::Punct && input.starts_with('(') {
+            self.state.0 = State::Define3;
+            self.nest = 1;
+        }
+        Ok((rest, type_))
+    }
+ /// Handles [`State::Define3`]: the parenthesized part of a `DEFINE`
+ /// command.
+ ///
+ /// `self.nest` counts open parentheses.  When it drops to 0 the macro
+ /// body begins ([`State::Define4`]).
+ fn parse_define_3<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ let (rest, type_) = self.subparse(input, eof)?;
+ match type_ {
+ Type::SeparateCommands | Type::EndCommand | Type::StartCommand => {
+ // The DEFINE command is malformed because we reached its end
+ // before the parentheses were balanced by a closing `)`.
+ // Transition back to general parsing.
+ self.state.0 = State::General;
+ }
+ Type::Punct if input.starts_with('(') => {
+ self.nest += 1;
+ }
+ Type::Punct if input.starts_with(')') => {
+ self.nest -= 1;
+ if self.nest == 0 {
+ self.state = (State::Define4, Substate::empty());
+ }
+ }
+ _ => (),
+ }
+ Ok((rest, type_))
+ }
+    /// Searches `input`, one line of a macro body, for `!ENDDEFINE` (matched
+    /// without regard to ASCII case) outside of comments and quoted strings.
+    ///
+    /// Returns the text starting at `!ENDDEFINE`, or `None` if the line has
+    /// none or contains an unterminated quoted string before it.
+    fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> {
+        loop {
+            input = skip_spaces_and_comments(input, true).unwrap();
+            let (c, rest) = take(input, true).unwrap();
+            let c = c?;
+            if c == '!' && strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() {
+                return Some(input);
+            }
+            input = if c == '\'' || c == '"' {
+                // Jump past the matching closing quote.
+                let close = rest.find(c)?;
+                &rest[close + 1..]
+            } else {
+                rest
+            };
+        }
+    }
+
+ /// We are in the body of a macro definition, looking for additional lines
+ /// of the body or `!ENDDEFINE`.
+ ///
+ /// In `State::Define4`, we're parsing the first line of the macro body (the
+ /// same line as the closing parenthesis in the argument definition). In
+ /// `State::Define5`, we're on a later line.
+ ///
+ /// Returns the remaining input and the segment type, or [`Incomplete`] if
+ /// the whole line is not available yet.
+ fn parse_define_4_5<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Type), Incomplete> {
+ let rest = self.parse_full_line(input, eof)?;
+ let line = &input[..input.len() - rest.len()];
+ if let Some(end) = Self::find_enddefine(line) {
+ // Macro ends at the !ENDDEFINE on this line.
+ self.state = (State::General, Substate::empty());
+ // `prefix` is the part of the line before `!ENDDEFINE`; `rest`
+ // now starts at `!ENDDEFINE`.
+ let (prefix, rest) = input.split_at(line.len() - end.len());
+ if prefix.is_empty() {
+ // Line starts with `!ENDDEFINE`.
+ self.push(input, eof)
+ } else if prefix.trim_start().is_empty() {
+ // Line starts with spaces followed by `!ENDDEFINE`.
+ Ok((rest, Type::Spaces))
+ } else {
+ // Line starts with some content followed by `!ENDDEFINE`.
+ Ok((rest, Type::MacroBody))
+ }
+ } else {
+ // No `!ENDDEFINE`. We have a full line of macro body.
+ //
+ // If the first line of the macro body is blank, we just report it
+ // as spaces, or not at all if there are no spaces, because it's not
+ // significant.
+ //
+ // However, if it's a later line, we need to report it because blank
+ // lines can have significance.
+ let type_ = if self.state.0 == State::Define4 && line.trim_start().is_empty() {
+ if line.is_empty() {
+ // Nothing to report: go straight to the newline.
+ return self.parse_define_6(input, eof);
+ }
+ Type::Spaces
+ } else {
+ Type::MacroBody
+ };
+ self.state.0 = State::Define6;
+ Ok((rest, type_))
+ }
+ }
+    /// Handles [`State::Define6`]: consumes the newline that ends a line of
+    /// the macro body and moves on to [`State::Define5`].
+    fn parse_define_6<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let next_line = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::Define5;
+        Ok((next_line, Type::Newline))
+    }
+    /// Handles [`State::BeginData1`]: segments `BEGIN DATA` syntax with
+    /// general-mode rules until the first newline, then moves on to
+    /// [`State::BeginData2`].
+    fn parse_begin_data_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if matches!(type_, Type::Newline) {
+            self.state.0 = State::BeginData2;
+        }
+        Ok((rest, type_))
+    }
+    /// Handles [`State::BeginData2`]: segments the rest of the `BEGIN DATA`
+    /// line with general-mode rules.  The inline data starts after the
+    /// newline ([`State::BeginData3`]).
+    fn parse_begin_data_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let (rest, type_) = self.subparse(input, eof)?;
+        if matches!(type_, Type::Newline) {
+            self.state.0 = State::BeginData3;
+        }
+        Ok((rest, type_))
+    }
+    /// Returns true if `line` is an `END DATA` command.
+    ///
+    /// That means `END`, one white-space character, `DATA` (both without
+    /// regard to ASCII case), then only white space and at most one period.
+    fn is_end_data(line: &str) -> bool {
+        let Some(after_end) = strip_prefix_ignore_ascii_case(line, "END") else {
+            return false;
+        };
+        let mut chars = after_end.chars();
+        match chars.next() {
+            Some(c) if c.is_whitespace() => (),
+            _ => return false,
+        }
+        let Some(tail) = strip_prefix_ignore_ascii_case(chars.as_str(), "DATA") else {
+            return false;
+        };
+
+        let mut seen_period = false;
+        tail.chars().all(|c| match c {
+            // Only the first period is acceptable.
+            '.' => !std::mem::replace(&mut seen_period, true),
+            c => c.is_whitespace(),
+        })
+    }
+    /// Handles [`State::BeginData3`]: reports one line of inline data as a
+    /// [`Type::InlineData`] segment, or, if the line is `END DATA`, returns to
+    /// general parsing and segments the line as ordinary syntax.
+    fn parse_begin_data_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let rest = self.parse_full_line(input, eof)?;
+        let line = &input[..input.len() - rest.len()];
+        if !Self::is_end_data(line) {
+            self.state.0 = State::BeginData4;
+            return Ok((rest, Type::InlineData));
+        }
+        self.state = (
+            State::General,
+            Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+        );
+        self.push(input, eof)
+    }
+    /// Handles [`State::BeginData4`]: consumes the newline that ends a line of
+    /// inline data and returns to [`State::BeginData3`].
+    fn parse_begin_data_4<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, Type), Incomplete> {
+        let next_line = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::BeginData3;
+        Ok((next_line, Type::Newline))
+    }
+}
+
+/// Like [`str::strip_prefix`], but compares without regard to ASCII case.
+///
+/// Returns the part of `line` after `pattern` if `line` starts with `pattern`,
+/// and `None` otherwise.  That includes the cases where `line` is shorter
+/// than `pattern` or where `pattern.len()` does not fall on a character
+/// boundary in `line`.
+fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
+    let prefix = line.get(..pattern.len())?;
+    prefix
+        .eq_ignore_ascii_case(pattern)
+        .then(|| &line[pattern.len()..])
+}
+
+#[cfg(test)]
+mod test;
--- /dev/null
+use crate::prompt::PromptStyle;
+
+use super::{Mode, Segmenter, Type};
+
+/// Obtains the next segment from `segmenter`.
+///
+/// If `one_byte` is true, first feeds ever-longer prefixes of `input` (cut at
+/// character boundaries, not at end of input) and returns the first result
+/// that is not `Incomplete`.  This exercises the segmenter's handling of
+/// partial input.  Otherwise, or if no prefix yields a segment, pushes all of
+/// `input` as the end of input.
+///
+/// Returns the rest of `input` after the segment, and the segment's type.
+fn push_segment<'a>(segmenter: &mut Segmenter, input: &'a str, one_byte: bool) -> (&'a str, Type) {
+    if one_byte {
+        let found = input.char_indices().find_map(|(len, _c)| {
+            segmenter
+                .push(&input[..len], false)
+                .ok()
+                .map(|(rest, type_)| (len - rest.len(), type_))
+        });
+        if let Some((consumed, type_)) = found {
+            return (&input[consumed..], type_);
+        }
+    }
+    segmenter.push(input, true).unwrap()
+}
+
+/// Segments `input` in `mode` and checks the result against expectations.
+///
+/// Collects every segment up to and including `Type::End`, and the prompt
+/// style reported after each `Type::Newline`.  Panics, after printing a diff
+/// to stderr, if the segments or the prompts differ from `expect_segments` or
+/// `expect_prompts`.  `one_byte` is passed on to `push_segment`.
+fn _check_segmentation(
+    mut input: &str,
+    mode: Mode,
+    expect_segments: &[(Type, &str)],
+    expect_prompts: &[PromptStyle],
+    one_byte: bool,
+) {
+    /// Prints a diff of `expected` against `actual` and panics if they
+    /// differ.
+    fn expect_same<T>(what: &str, expected: &[T], actual: &[T])
+    where
+        T: PartialEq + std::fmt::Debug,
+    {
+        if actual == expected {
+            return;
+        }
+        eprintln!("{what} differ from expected:");
+        for change in diff::slice(expected, actual) {
+            match change {
+                diff::Result::Left(left) => eprintln!("-{left:?}"),
+                diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+                diff::Result::Right(right) => eprintln!("+{right:?}"),
+            }
+        }
+        panic!();
+    }
+
+    let mut segments = Vec::with_capacity(expect_segments.len());
+    let mut prompts = Vec::new();
+    let mut segmenter = Segmenter::new(mode, false);
+    loop {
+        let (rest, type_) = push_segment(&mut segmenter, input, one_byte);
+        let token = &input[..input.len() - rest.len()];
+        segments.push((type_, token));
+        if type_ == Type::End {
+            break;
+        }
+        if type_ == Type::Newline {
+            prompts.push(segmenter.prompt());
+        }
+        input = rest;
+    }
+
+    expect_same("segments", expect_segments, &segments);
+    expect_same("prompts", expect_prompts, &prompts);
+}
+
+/// Checks that segmenting `input` in `mode` yields `expect_segments` and
+/// `expect_prompts`.
+///
+/// The check is run both on the full string and prefix by prefix (see
+/// `push_segment`), each time in up to three variants: as given, with every
+/// `\n` turned into `\r\n`, and, if `input` ends in a newline, with that
+/// final newline removed.
+fn check_segmentation(
+ input: &str,
+ mode: Mode,
+ expect_segments: &[(Type, &str)],
+ expect_prompts: &[PromptStyle],
+) {
+ for (one_byte, one_byte_name) in [(false, "full-string"), (true, "byte-by-byte")] {
+ println!("running {one_byte_name} segmentation test with LF newlines...");
+ _check_segmentation(input, mode, expect_segments, expect_prompts, one_byte);
+
+ println!("running {one_byte_name} segmentation test with CRLF newlines...");
+ // Newline segments are expected to cover `\r\n`; all other expected
+ // segments are unchanged.
+ _check_segmentation(
+ &input.replace('\n', "\r\n"),
+ mode,
+ &expect_segments
+ .iter()
+ .map(|(type_, s)| match *type_ {
+ Type::Newline => (Type::Newline, "\r\n"),
+ _ => (*type_, *s),
+ })
+ .collect::<Vec<_>>(),
+ expect_prompts,
+ one_byte,
+ );
+
+ if let Some(input) = input.strip_suffix('\n') {
+ println!("running {one_byte_name} segmentation test without final newline...");
+ // Without the final newline, the expected output loses its last
+ // `Newline` segment, any zero-width `SeparateCommands` or
+ // `EndCommand` segments just before it, and the last prompt.
+ let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect();
+ assert_eq!(expect_segments.pop(), Some((Type::End, "")));
+ assert_eq!(expect_segments.pop(), Some((Type::Newline, "\n")));
+ while let Some((Type::SeparateCommands | Type::EndCommand, "")) = expect_segments.last()
+ {
+ expect_segments.pop();
+ }
+ expect_segments.push((Type::End, ""));
+ _check_segmentation(
+ input,
+ mode,
+ &expect_segments,
+ &expect_prompts[..expect_prompts.len() - 1],
+ one_byte,
+ );
+ }
+ }
+}
+
+/// Prints the auto-mode segmentation of `input` to stdout: each segment's type
+/// and text, plus the prompt style after each newline segment.
+fn print_segmentation(mut input: &str) {
+    let mut segmenter = Segmenter::new(Mode::Auto, false);
+    loop {
+        let (rest, type_) = segmenter.push(input, true).unwrap();
+        let token = &input[..input.len() - rest.len()];
+        print!("{type_:?} {token:?}");
+        if type_ == Type::End {
+            break;
+        }
+        if type_ == Type::Newline {
+            print!(" ({:?})", segmenter.prompt());
+        }
+        println!();
+        input = rest;
+    }
+}
+
+/// Checks auto-mode segmentation of identifiers: plain, mixed-case,
+/// non-ASCII, and with `$`, `#`, `@` and `!` prefixes, along with a few
+/// tokens that only resemble identifiers (`.x`, `1y`, `_z`).
+#[test]
+fn test_identifiers() {
+ check_segmentation(
+ r#"a ab abc abcd !abcd
+A AB ABC ABCD !ABCD
+aB aBC aBcD !aBcD
+$x $y $z !$z
+grève Ângstrom poté
+#a #b #c ## #d !#d
+@efg @ @@. @#@ !@
+## # #12345 #.#
+f@#_.#6
+GhIjK
+.x 1y _z
+"#,
+ Mode::Auto,
+ // Expected segments, in input order.
+ &[
+ (Type::Identifier, "a"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "ab"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "abc"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "abcd"),
+ (Type::Spaces, " "),
+ (Type::MacroId, "!abcd"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "A"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "AB"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "ABC"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "ABCD"),
+ (Type::Spaces, " "),
+ (Type::MacroId, "!ABCD"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "aB"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "aBC"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "aBcD"),
+ (Type::Spaces, " "),
+ (Type::MacroId, "!aBcD"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "$x"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "$y"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "$z"),
+ (Type::Spaces, " "),
+ (Type::MacroId, "!$z"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "grève"),
+ (Type::Spaces, "\u{00a0}"),
+ (Type::Identifier, "Ângstrom"),
+ (Type::Spaces, "\u{00a0}"),
+ (Type::Identifier, "poté"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "#a"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#b"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#c"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "##"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#d"),
+ (Type::Spaces, " "),
+ (Type::MacroId, "!#d"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "@efg"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "@"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "@@."),
+ (Type::Spaces, " "),
+ (Type::Identifier, "@#@"),
+ (Type::Spaces, " "),
+ (Type::MacroId, "!@"),
+ (Type::Spaces, " "),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "##"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#12345"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#.#"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "f@#_.#6"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "GhIjK"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "."),
+ (Type::Identifier, "x"),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::Identifier, "y"),
+ (Type::Spaces, " "),
+ (Type::Punct, "_"),
+ (Type::Identifier, "z"),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ // Expected prompt style after each of the 11 newlines.
+ &[
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ ],
+ );
+}
+
+#[test] // Fixture for a trailing `.`: kept inside the identifier mid-line, split off as EndCommand at end of line.
+fn test_identifiers_ending_in_dot() {
+ check_segmentation(
+ r#"abcd. abcd.
+ABCD. ABCD.
+aBcD. aBcD.
+$y. $z. あいうえお.
+#c. #d..
+@@. @@....
+#.#.
+#abcd.
+.
+.
+LMNOP.
+QRSTUV./* end of line comment */
+qrstuv. /* end of line comment */
+QrStUv./* end of line comment */
+wxyz./* unterminated end of line comment
+WXYZ. /* unterminated end of line comment
+WxYz./* unterminated end of line comment
+"#,
+ Mode::Auto,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Identifier, "abcd."),
+ (Type::Spaces, " "),
+ (Type::Identifier, "abcd"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "ABCD."),
+ (Type::Spaces, " "),
+ (Type::Identifier, "ABCD"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "aBcD."),
+ (Type::Spaces, " "),
+ (Type::Identifier, "aBcD"),
+ (Type::EndCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "$y."),
+ (Type::Spaces, " "),
+ (Type::Identifier, "$z."),
+ (Type::Spaces, " "),
+ (Type::Identifier, "あいうえお"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "#c."),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#d."),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "@@."),
+ (Type::Spaces, " "),
+ (Type::Identifier, "@@..."),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "#.#"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "#abcd"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "LMNOP"),
+ (Type::EndCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "QRSTUV"),
+ (Type::EndCommand, "."),
+ (Type::Comment, "/* end of line comment */"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "qrstuv"),
+ (Type::EndCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Comment, "/* end of line comment */"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "QrStUv"),
+ (Type::EndCommand, "."),
+ (Type::Comment, "/* end of line comment */"),
+ (Type::Spaces, " "),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "wxyz"),
+ (Type::EndCommand, "."),
+ (Type::Comment, "/* unterminated end of line comment"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "WXYZ"),
+ (Type::EndCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Comment, "/* unterminated end of line comment"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "WxYz"),
+ (Type::EndCommand, "."),
+ (Type::Comment, "/* unterminated end of line comment "),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test] // Reserved-word fixture: either case is ReservedWord; a longer word such as `andx` is an Identifier.
+fn test_reserved_words() {
+ check_segmentation(
+ r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+ Mode::Auto,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::ReservedWord, "and"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "or"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "not"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "eq"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "ge"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "gt"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "le"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "lt"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "ne"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "all"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "by"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "to"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "with"),
+ (Type::Newline, "\n"),
+ (Type::ReservedWord, "AND"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "OR"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "NOT"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "EQ"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "GE"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "GT"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "LE"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "LT"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "NE"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "ALL"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "BY"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "TO"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "WITH"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "andx"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "orx"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "notx"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "eqx"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "gex"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "gtx"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "lex"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "ltx"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "nex"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "allx"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "byx"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "tox"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "withx"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "and."),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "with"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test] // Punctuator fixture: the same operators spaced apart and run together, plus `!*` as a MacroId.
+fn test_punctuation() {
+ check_segmentation(
+ r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
+~&|=>=><=<~=<>(),-+*/[]**!*
+% : ; ? _ ` { } ~ !*
+"#,
+ Mode::Auto,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Punct, "~"),
+ (Type::Spaces, " "),
+ (Type::Punct, "&"),
+ (Type::Spaces, " "),
+ (Type::Punct, "|"),
+ (Type::Spaces, " "),
+ (Type::Punct, "="),
+ (Type::Spaces, " "),
+ (Type::Punct, ">="),
+ (Type::Spaces, " "),
+ (Type::Punct, ">"),
+ (Type::Spaces, " "),
+ (Type::Punct, "<="),
+ (Type::Spaces, " "),
+ (Type::Punct, "<"),
+ (Type::Spaces, " "),
+ (Type::Punct, "~="),
+ (Type::Spaces, " "),
+ (Type::Punct, "<>"),
+ (Type::Spaces, " "),
+ (Type::Punct, "("),
+ (Type::Spaces, " "),
+ (Type::Punct, ")"),
+ (Type::Spaces, " "),
+ (Type::Punct, ","),
+ (Type::Spaces, " "),
+ (Type::Punct, "-"),
+ (Type::Spaces, " "),
+ (Type::Punct, "+"),
+ (Type::Spaces, " "),
+ (Type::Punct, "*"),
+ (Type::Spaces, " "),
+ (Type::Punct, "/"),
+ (Type::Spaces, " "),
+ (Type::Punct, "["),
+ (Type::Spaces, " "),
+ (Type::Punct, "]"),
+ (Type::Spaces, " "),
+ (Type::Punct, "**"),
+ (Type::Newline, "\n"),
+ (Type::Punct, "~"),
+ (Type::Punct, "&"),
+ (Type::Punct, "|"),
+ (Type::Punct, "="),
+ (Type::Punct, ">="),
+ (Type::Punct, ">"),
+ (Type::Punct, "<="),
+ (Type::Punct, "<"),
+ (Type::Punct, "~="),
+ (Type::Punct, "<>"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Punct, ","),
+ (Type::Punct, "-"),
+ (Type::Punct, "+"),
+ (Type::Punct, "*"),
+ (Type::Punct, "/"),
+ (Type::Punct, "["),
+ (Type::Punct, "]"),
+ (Type::Punct, "**"),
+ (Type::MacroId, "!*"),
+ (Type::Newline, "\n"),
+ (Type::Punct, "%"),
+ (Type::Spaces, " "),
+ (Type::Punct, ":"),
+ (Type::Spaces, " "),
+ (Type::Punct, ";"),
+ (Type::Spaces, " "),
+ (Type::Punct, "?"),
+ (Type::Spaces, " "),
+ (Type::Punct, "_"),
+ (Type::Spaces, " "),
+ (Type::Punct, "`"),
+ (Type::Spaces, " "),
+ (Type::Punct, "{"),
+ (Type::Spaces, " "),
+ (Type::Punct, "}"),
+ (Type::Spaces, " "),
+ (Type::Punct, "~"),
+ (Type::Spaces, " "),
+ (Type::MacroId, "!*"),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later], // one prompt style per Type::Newline above
+ );
+}
+
+#[test] // Unsigned-number fixture: integers, decimals, exponents, and incomplete exponents (ExpectedExponent).
+fn test_positive_numbers() {
+ check_segmentation(
+ r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e- 1.
+"#,
+ Mode::Auto,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Number, "0"),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::Spaces, " "),
+ (Type::Number, "01"),
+ (Type::Spaces, " "),
+ (Type::Number, "001."),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Number, "123"),
+ (Type::EndCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Comment, "/* comment 1 */"),
+ (Type::Spaces, " "),
+ (Type::Comment, "/* comment 2 */"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "."), // a `.` that begins a line is expected as StartCommand, not part of the number
+ (Type::Number, "1"),
+ (Type::Spaces, " "),
+ (Type::Number, "0.1"),
+ (Type::Spaces, " "),
+ (Type::Number, "00.1"),
+ (Type::Spaces, " "),
+ (Type::Number, "00.10"),
+ (Type::Newline, "\n"),
+ (Type::Number, "5e1"),
+ (Type::Spaces, " "),
+ (Type::Number, "6E-1"),
+ (Type::Spaces, " "),
+ (Type::Number, "7e+1"),
+ (Type::Spaces, " "),
+ (Type::Number, "6E+01"),
+ (Type::Spaces, " "),
+ (Type::Number, "6e-03"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "."),
+ (Type::Number, "3E1"),
+ (Type::Spaces, " "),
+ (Type::Number, ".4e-1"),
+ (Type::Spaces, " "),
+ (Type::Number, ".5E+1"),
+ (Type::Spaces, " "),
+ (Type::Number, ".6e+01"),
+ (Type::Spaces, " "),
+ (Type::Number, ".7E-03"),
+ (Type::Newline, "\n"),
+ (Type::Number, "1.23e1"),
+ (Type::Spaces, " "),
+ (Type::Number, "45.6E-1"),
+ (Type::Spaces, " "),
+ (Type::Number, "78.9e+1"),
+ (Type::Spaces, " "),
+ (Type::Number, "99.9E+01"),
+ (Type::Spaces, " "),
+ (Type::Number, "11.2e-03"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "."),
+ (Type::Spaces, " "),
+ (Type::ExpectedExponent, "1e"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "e1"),
+ (Type::Spaces, " "),
+ (Type::ExpectedExponent, "1e+"),
+ (Type::Spaces, " "),
+ (Type::ExpectedExponent, "1e-"),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test] // Signed-number fixture: `-` joins an adjacent number; `-/**/1`, `-.` and `-e1` keep `-` as Punct.
+fn test_negative_numbers() {
+ check_segmentation(
+ r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+ Mode::Auto,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Spaces, " "),
+ (Type::Number, "-0"),
+ (Type::Spaces, " "),
+ (Type::Number, "-1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-01"),
+ (Type::Spaces, " "),
+ (Type::Number, "-001."),
+ (Type::Spaces, " "),
+ (Type::Number, "-1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Number, "-123"),
+ (Type::EndCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Comment, "/* comment 1 */"),
+ (Type::Spaces, " "),
+ (Type::Comment, "/* comment 2 */"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Number, "-.1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-0.1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-00.1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-00.10"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Number, "-5e1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-6E-1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-7e+1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-6E+01"),
+ (Type::Spaces, " "),
+ (Type::Number, "-6e-03"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Number, "-.3E1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-.4e-1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-.5E+1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-.6e+01"),
+ (Type::Spaces, " "),
+ (Type::Number, "-.7E-03"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Number, "-1.23e1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-45.6E-1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-78.9e+1"),
+ (Type::Spaces, " "),
+ (Type::Number, "-99.9E+01"),
+ (Type::Spaces, " "),
+ (Type::Number, "-11.2e-03"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Punct, "-"),
+ (Type::Comment, "/**/"),
+ (Type::Number, "1"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Punct, "-"),
+ (Type::Punct, "."),
+ (Type::Spaces, " "),
+ (Type::ExpectedExponent, "-1e"),
+ (Type::Spaces, " "),
+ (Type::Punct, "-"),
+ (Type::Identifier, "e1"),
+ (Type::Spaces, " "),
+ (Type::ExpectedExponent, "-1e+"),
+ (Type::Spaces, " "),
+ (Type::ExpectedExponent, "-1e-"),
+ (Type::Spaces, " "),
+ (Type::Number, "-1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test] // String fixture: quoted, doubled-quote, unterminated, hex (x/X) and Unicode (u/U) strings, plus `+` lines.
+fn test_strings() {
+ check_segmentation(
+ r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' ""
+'missing end quote
+"missing double quote
+x"4142" X'5152'
+u'fffd' U"041"
++ new command
++ /* comment */ 'string continuation'
++ /* also a punctuator on blank line
+- 'new command'
+"#,
+ Mode::Auto,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::QuotedString, "'x'"),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "\"y\""),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "'abc'"),
+ (Type::Newline, "\n"),
+ (Type::QuotedString, "'Don''t'"),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "\"Can't\""),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "'Won''t'"),
+ (Type::Newline, "\n"),
+ (Type::QuotedString, "\"\"\"quoted\"\"\""),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "'\"quoted\"'"),
+ (Type::Newline, "\n"),
+ (Type::QuotedString, "''"),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "\"\""),
+ (Type::Newline, "\n"),
+ (Type::ExpectedQuote, "'missing end quote"),
+ (Type::Newline, "\n"),
+ (Type::ExpectedQuote, "\"missing double quote"),
+ (Type::Newline, "\n"),
+ (Type::HexString, "x\"4142\""),
+ (Type::Spaces, " "),
+ (Type::HexString, "X'5152'"),
+ (Type::Newline, "\n"),
+ (Type::UnicodeString, "u'fffd'"),
+ (Type::Spaces, " "),
+ (Type::UnicodeString, "U\"041\""),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "+"), // first `+` line: expected as a command start
+ (Type::Spaces, " "),
+ (Type::Identifier, "new"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::Punct, "+"), // following `+` lines: expected as ordinary punctuators
+ (Type::Spaces, " "),
+ (Type::Comment, "/* comment */"),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "'string continuation'"),
+ (Type::Newline, "\n"),
+ (Type::Punct, "+"),
+ (Type::Spaces, " "),
+ (Type::Comment, "/* also a punctuator on blank line"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "-"),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "'new command'"),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ ],
+ );
+}
+
+#[test] // `#!` fixture: expected as one Shbang segment on the first line only; tokenized normally on line 3.
+fn test_shbang() {
+ check_segmentation(
+ r#"#! /usr/bin/pspp
+title my title.
+#! /usr/bin/pspp
+"#,
+ Mode::Interactive,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Shbang, "#! /usr/bin/pspp"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "title"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "my"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "title"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "#"),
+ (Type::MacroId, "!"),
+ (Type::Spaces, " "),
+ (Type::Punct, "/"),
+ (Type::Identifier, "usr"),
+ (Type::Punct, "/"),
+ (Type::Identifier, "bin"),
+ (Type::Punct, "/"),
+ (Type::Identifier, "pspp"),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::First, PromptStyle::First, PromptStyle::Later], // one prompt style per Type::Newline above
+ );
+}
+
+#[test] // Comment-command fixture: `*` lines and COMMENT/COMM yield CommentCommand; `com` is tokenized normally.
+fn test_comment_command() {
+ check_segmentation(
+ r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+ * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+ Mode::Interactive,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::CommentCommand, "* Comment commands \"don't"),
+ (Type::Newline, "\n"),
+ (Type::CommentCommand, "have to contain valid tokens"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""), // zero-width segment expected at each blank input line
+ (Type::Newline, "\n"),
+ (Type::CommentCommand, "** Check ambiguity with ** token"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::CommentCommand, "****************"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::CommentCommand, "comment keyword works too"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::CommentCommand, "COMM also"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "com"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "is"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "ambiguous"),
+ (Type::Spaces, " "),
+ (Type::ReservedWord, "with"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "COMPUTE"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (
+ Type::CommentCommand,
+ "* Comment need not start at left margin",
+ ),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::CommentCommand, "* Comment ends with blank line"),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "next"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::Comment,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Comment,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test] // DOCUMENT fixture: DOCUMENT/DOC/docu lines are expected whole, as Document segments, not as tokens.
+fn test_document_command() {
+ check_segmentation(
+ r#"DOCUMENT one line.
+DOC more
+ than
+ one
+ line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+"#,
+ Mode::Interactive,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::StartDocument, ""), // zero-width marker expected before each document's first line
+ (Type::Document, "DOCUMENT one line."),
+ (Type::EndCommand, ""), // zero-width here: the final `.` stays inside the Document text
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::StartDocument, ""),
+ (Type::Document, "DOC more"),
+ (Type::Newline, "\n"),
+ (Type::Document, " than"),
+ (Type::Newline, "\n"),
+ (Type::Document, " one"),
+ (Type::Newline, "\n"),
+ (Type::Document, " line."),
+ (Type::EndCommand, ""),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::StartDocument, ""),
+ (Type::Document, "docu"),
+ (Type::Newline, "\n"),
+ (Type::Document, "first.paragraph"),
+ (Type::Newline, "\n"),
+ (Type::Document, "isn't parsed as tokens"),
+ (Type::Newline, "\n"),
+ (Type::Document, ""),
+ (Type::Newline, "\n"),
+ (Type::Document, "second paragraph."),
+ (Type::EndCommand, ""),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::First,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::First,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test] // FILE LABEL fixture: unquoted label text is expected as a single UnquotedString segment.
+fn test_file_label_command() {
+ check_segmentation(
+ r#"FIL label isn't quoted.
+FILE
+ lab 'is quoted'.
+FILE /*
+/**/ lab not quoted here either
+
+"#,
+ Mode::Interactive,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Identifier, "FIL"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "label"),
+ (Type::Spaces, " "),
+ (Type::UnquotedString, "isn't quoted"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "FILE"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "lab"),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "'is quoted'"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "FILE"),
+ (Type::Spaces, " "),
+ (Type::Comment, "/*"),
+ (Type::Newline, "\n"),
+ (Type::Comment, "/**/"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "lab"),
+ (Type::Spaces, " "),
+ (Type::UnquotedString, "not quoted here either"),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test] // BEGIN DATA fixture: lines up to `end data` are expected as InlineData; extra arguments disable that.
+fn test_begin_data() {
+ check_segmentation(
+ r#"begin data.
+end data.
+
+begin data. /*
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end data
+end data
+.
+
+begin
+ data.
+data
+end data.
+
+begin data "xxx".
+begin data 123.
+not data
+"#,
+ Mode::Interactive,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Identifier, "begin"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "end"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "begin"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::EndCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Comment, "/*"),
+ (Type::Newline, "\n"),
+ (Type::InlineData, "123"),
+ (Type::Newline, "\n"),
+ (Type::InlineData, "xxx"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "end"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "BEG"),
+ (Type::Spaces, " "),
+ (Type::Comment, "/**/"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "DAT"),
+ (Type::Spaces, " "),
+ (Type::Comment, "/*"),
+ (Type::Newline, "\n"),
+ (Type::InlineData, "5 6 7 /* x"),
+ (Type::Newline, "\n"),
+ (Type::InlineData, ""),
+ (Type::Newline, "\n"),
+ (Type::InlineData, "end data"), // the first of the two `end data` lines is still expected as data
+ (Type::Newline, "\n"),
+ (Type::Identifier, "end"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "begin"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::InlineData, "data"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "end"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "begin"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::Spaces, " "),
+ (Type::QuotedString, "\"xxx\""),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "begin"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::Spaces, " "),
+ (Type::Number, "123"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::ReservedWord, "not"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "data"),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ ],
+ );
+}
+
+#[test] // DO REPEAT fixture (interactive): body lines are expected whole, as DoRepeatCommand segments.
+fn test_do_repeat() {
+ check_segmentation(
+ r#"do repeat x=a b c
+ y=d e f.
+ do repeat a=1 thru 5.
+another command.
+second command
++ third command.
+end /* x */ /* y */ repeat print.
+end
+ repeat.
+do
+ repeat #a=1.
+ inner command.
+end repeat.
+"#,
+ Mode::Interactive,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Identifier, "do"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "x"),
+ (Type::Punct, "="),
+ (Type::Identifier, "a"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "b"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "c"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "y"),
+ (Type::Punct, "="),
+ (Type::Identifier, "d"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "e"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "f"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, " do repeat a=1 thru 5."),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, "another command."),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, "second command"),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, "+ third command."),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print."), // ends the inner `do repeat`, so still body text
+ (Type::Newline, "\n"),
+ (Type::Identifier, "end"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "do"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#a"),
+ (Type::Punct, "="),
+ (Type::Number, "1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, " inner command."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "end"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::Later,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test] // Builds N consecutive `do repeat` lines followed by N `end repeat` lines and checks the overflow segments.
+fn test_do_repeat_overflow() {
+ const N: usize = 257; // number of `do repeat` lines (and of `end repeat` lines) generated
+ let do_repeat: Vec<String> = (0..N)
+ .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5))
+ .collect();
+ let end_repeat: Vec<String> = (0..N)
+ .rev()
+ .map(|i| format!("end repeat. /* {i}\n"))
+ .collect();
+
+ let s: String = do_repeat // full input: every `do repeat` line, then every `end repeat` line
+ .iter()
+ .chain(end_repeat.iter())
+ .map(|s| s.as_str())
+ .collect();
+ let mut expect_output = vec![ // only the first `do repeat` line is expected to be tokenized
+ (Type::Identifier, "do"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "v0"),
+ (Type::Punct, "="),
+ (Type::Number, "0"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "thru"),
+ (Type::Spaces, " "),
+ (Type::Number, "5"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ ];
+ for i in 1..N { // remaining `do repeat` lines are expected whole, as DoRepeatCommand
+ expect_output.push((Type::DoRepeatCommand, do_repeat[i].trim_end())); // trim_end already yields &str
+ if i >= 255 { // lines 255 and 256 additionally expect a zero-width DoRepeatOverflow
+ expect_output.push((Type::DoRepeatOverflow, ""));
+ }
+ expect_output.push((Type::Newline, "\n"));
+ }
+ for i in 0..254 { // the first 254 `end repeat` lines are still expected as body text
+ expect_output.push((Type::DoRepeatCommand, end_repeat[i].trim_end()));
+ expect_output.push((Type::Newline, "\n"));
+ }
+ let comments: Vec<String> = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect();
+ for comment in &comments { // the last N - 254 `end repeat` lines are expected to be tokenized
+ expect_output.extend([
+ (Type::Identifier, "end"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::EndCommand, "."),
+ (Type::Spaces, " "),
+ (Type::Comment, comment),
+ (Type::Newline, "\n"),
+ ]);
+ }
+ expect_output.push((Type::End, ""));
+
+ let expect_prompts: Vec<_> = (0..N * 2 - 3) // 2N prompts in total, matching the 2N Newline entries
+ .map(|_| PromptStyle::DoRepeat)
+ .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First])
+ .collect();
+ check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts);
+}
+
+#[test] // DO REPEAT fixture (batch): no terminating `.`; zero-width StartCommand expected at unindented lines.
+fn test_do_repeat_batch() {
+ check_segmentation(
+ r#"do repeat x=a b c
+ y=d e f
+do repeat a=1 thru 5
+another command
+second command
++ third command
+end /* x */ /* y */ repeat print
+end
+ repeat
+do
+ repeat #a=1
+
+ inner command
+end repeat
+"#,
+ Mode::Batch,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Identifier, "do"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "x"),
+ (Type::Punct, "="),
+ (Type::Identifier, "a"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "b"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "c"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "y"),
+ (Type::Punct, "="),
+ (Type::Identifier, "d"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "e"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "f"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, ""),
+ (Type::DoRepeatCommand, "do repeat a=1 thru 5"),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, "another command"),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, "second command"),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, "+ third command"),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "end"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, ""),
+ (Type::Identifier, "do"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "#a"),
+ (Type::Punct, "="),
+ (Type::Number, "1"),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::DoRepeatCommand, " inner command"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "end"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "repeat"),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // expected prompt styles: one entry per Type::Newline above
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::DoRepeat,
+ PromptStyle::DoRepeat,
+ PromptStyle::Later,
+ ],
+ );
+}
+
+mod define {
+ use crate::{
+ lex::segment::{Mode, Type},
+ prompt::PromptStyle,
+ };
+
+ use super::check_segmentation;
+
+ #[test] // DEFINE fixture: one body line (MacroBody) between `define !macro1()` and `!enddefine.`.
+ fn test_simple() {
+ check_segmentation(
+ r#"define !macro1()
+var1 var2 var3 "!enddefine"
+!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Newline, "\n"),
+ (Type::MacroBody, "var1 var2 var3 \"!enddefine\""), // the quoted "!enddefine" does not end the body
+ (Type::Newline, "\n"),
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First], // one prompt style per Type::Newline above
+ );
+ }
+
+ #[test] // DEFINE fixture: the macro body starts on the same line, right after `()`.
+ fn test_no_newline_after_parentheses() {
+ check_segmentation(
+ r#"define !macro1() var1 var2 var3 /* !enddefine
+!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // expected (segment type, text) pairs, in input order
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::MacroBody, " var1 var2 var3 /* !enddefine"), // `!enddefine` after `/*` stays in the body
+ (Type::Newline, "\n"),
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::Define, PromptStyle::First], // one prompt style per Type::Newline above
+ );
+ }
+
+ #[test]
+ fn test_no_newline_before_enddefine() { // !enddefine follows body text on the same line, with nothing in between.
+ check_segmentation(
+ r#"define !macro1()
+var1 var2 var3!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Newline, "\n"),
+ (Type::MacroBody, "var1 var2 var3"), // The body segment stops where !enddefine starts.
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::Define, PromptStyle::First], // One prompt style per input line.
+ );
+ }
+
+ #[test]
+ fn test_all_on_one_line() { // Header, body and !enddefine all share a single input line.
+ check_segmentation(
+ r#"define !macro1()var1 var2 var3!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; the only Newline is the final one.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::MacroBody, "var1 var2 var3"),
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::First], // Single input line, so a single prompt style.
+ );
+ }
+
+ #[test]
+ fn test_empty() { // DEFINE with no body lines between the header and !enddefine.
+ check_segmentation(
+ r#"define !macro1()
+!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; no MacroBody segment is listed at all.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Newline, "\n"),
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::Define, PromptStyle::First], // One prompt style per input line.
+ );
+ }
+
+ #[test]
+ fn test_blank_lines() { // DEFINE whose body consists of two blank lines.
+ check_segmentation(
+ r#"define !macro1()
+
+
+!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Newline, "\n"),
+ (Type::MacroBody, ""), // Each blank body line yields a zero-width MacroBody followed by its Newline.
+ (Type::Newline, "\n"),
+ (Type::MacroBody, ""),
+ (Type::Newline, "\n"),
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // One prompt style per input line, blank lines included.
+ PromptStyle::Define,
+ PromptStyle::Define,
+ PromptStyle::Define,
+ PromptStyle::First,
+ ],
+ );
+ }
+
+ #[test]
+ fn test_arguments() { // Header with a parenthesized argument list that itself contains nested parentheses.
+ check_segmentation(
+ r#"define !macro1(a(), b(), c())
+!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; the argument list is plain Identifier/Punct/Spaces segments.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Identifier, "a"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Punct, ","),
+ (Type::Spaces, " "),
+ (Type::Identifier, "b"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Punct, ","),
+ (Type::Spaces, " "),
+ (Type::Identifier, "c"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Punct, ")"), // Outermost closing parenthesis; the body (empty here) comes after it.
+ (Type::Newline, "\n"),
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::Define, PromptStyle::First], // One prompt style per input line.
+ );
+ }
+
+ #[test]
+ fn test_multiline_arguments() { // The argument list, including one nested pair of parentheses, is spread over several lines.
+ check_segmentation(
+ r#"define !macro1(
+ a(), b(
+ ),
+ c()
+)
+!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; newlines inside the list are ordinary Newline segments.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "a"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Punct, ","),
+ (Type::Spaces, " "),
+ (Type::Identifier, "b"),
+ (Type::Punct, "("),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Punct, ")"),
+ (Type::Punct, ","),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "c"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Newline, "\n"),
+ (Type::Punct, ")"), // Closes the outermost list opened on the first line.
+ (Type::Newline, "\n"),
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // Per input line: Later while the list is still open, Define for the line that closes it, First after the terminator.
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Define,
+ PromptStyle::First,
+ ],
+ );
+ }
+
+ #[test]
+ fn test_arguments_start_on_second_line() { // The opening parenthesis of the argument list is on the line after the macro name.
+ check_segmentation(
+ r#"define !macro1
+(x,y,z
+)
+content 1
+content 2
+!enddefine.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Newline, "\n"),
+ (Type::Punct, "("),
+ (Type::Identifier, "x"),
+ (Type::Punct, ","),
+ (Type::Identifier, "y"),
+ (Type::Punct, ","),
+ (Type::Identifier, "z"),
+ (Type::Newline, "\n"),
+ (Type::Punct, ")"),
+ (Type::Newline, "\n"),
+ (Type::MacroBody, "content 1"), // Each body line is its own MacroBody segment.
+ (Type::Newline, "\n"),
+ (Type::MacroBody, "content 2"),
+ (Type::Newline, "\n"),
+ (Type::MacroId, "!enddefine"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // Per input line: Later until the list closes, Define from the closing parenthesis through the body, First after the terminator.
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Define,
+ PromptStyle::Define,
+ PromptStyle::Define,
+ PromptStyle::First,
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_1() { // A command terminator directly after the macro name, before any parentheses.
+ check_segmentation(
+ r#"define !macro1.
+data list /x 1.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; no MacroBody or MacroId segment is listed.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "data"), // The following line is segmented as an ordinary command.
+ (Type::Spaces, " "),
+ (Type::Identifier, "list"),
+ (Type::Spaces, " "),
+ (Type::Punct, "/"),
+ (Type::Identifier, "x"),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::First, PromptStyle::First], // One prompt style per input line; both lines end with a terminator.
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_2() { // No parentheses after the macro name; a terminator arrives on the second line.
+ check_segmentation(
+ r#"define !macro1
+x.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; no MacroBody or MacroId segment is listed.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "x"), // Plain identifier, not macro body text.
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "data"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "list"),
+ (Type::Spaces, " "),
+ (Type::Punct, "/"),
+ (Type::Identifier, "x"),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::Later, PromptStyle::First, PromptStyle::First], // One prompt style per input line; Later for the unterminated first line.
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_3() { // A command terminator directly after an unmatched opening parenthesis.
+ check_segmentation(
+ r#"define !macro1(.
+x.
+data list /x 1.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; no MacroBody or MacroId segment is listed.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::EndCommand, "."), // The terminator is recognized even though the parenthesis is never closed.
+ (Type::Newline, "\n"),
+ (Type::Identifier, "x"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "data"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "list"),
+ (Type::Spaces, " "),
+ (Type::Punct, "/"),
+ (Type::Identifier, "x"),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::First, PromptStyle::First, PromptStyle::First], // One prompt style per input line; every line ends with a terminator.
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_4() { // NOTE(review): input and expectations are identical to test_early_end_of_command_1 - confirm the duplication is intended.
+ // Notice the command terminator at the end of the `DEFINE` command,
+ // which should not be there and ends it early.
+ check_segmentation(
+ r#"define !macro1.
+data list /x 1.
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; no MacroBody or MacroId segment is listed.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "data"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "list"),
+ (Type::Spaces, " "),
+ (Type::Punct, "/"),
+ (Type::Identifier, "x"),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::First, PromptStyle::First], // One prompt style per input line.
+ );
+ }
+
+ #[test]
+ fn test_missing_enddefine() { // Input runs out after two body lines without any !enddefine.
+ check_segmentation(
+ r#"define !macro1()
+content line 1
+content line 2
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Newline, "\n"),
+ (Type::MacroBody, "content line 1"),
+ (Type::Newline, "\n"),
+ (Type::MacroBody, "content line 2"),
+ (Type::Newline, "\n"),
+ (Type::End, ""), // End follows the last body line directly; no MacroId or EndCommand is listed.
+ ],
+ &[ // One prompt style per input line; it stays Define through the last line.
+ PromptStyle::Define,
+ PromptStyle::Define,
+ PromptStyle::Define,
+ ],
+ );
+ }
+
+ #[test]
+ fn test_missing_enddefine_2() { // Input consists of the DEFINE header line only: no body and no !enddefine.
+ check_segmentation(
+ r#"define !macro1()
+"#,
+ Mode::Interactive,
+ &[ // (segment type, exact text) pairs in input order; End comes right after the header Newline.
+ (Type::Identifier, "define"),
+ (Type::Spaces, " "),
+ (Type::MacroName, "!macro1"),
+ (Type::Punct, "("),
+ (Type::Punct, ")"),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[PromptStyle::Define], // Single input line, so a single prompt style.
+ );
+ }
+}
+
+#[test]
+fn test_batch_mode() { // Mode::Batch fixture mixing an indented continuation line, a leading plus sign, a blank line and terminated commands.
+ check_segmentation(
+ r#"first command
+ another line of first command
++ second command
+third command
+
+fourth command.
+ fifth command.
+"#,
+ Mode::Batch,
+ &[ // (segment type, exact text) pairs in input order.
+ (Type::Identifier, "first"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "), // Indented line: no StartCommand is listed before it.
+ (Type::Identifier, "another"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "line"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "of"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "first"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "+"), // The plus sign in the first column is itself the StartCommand segment.
+ (Type::Spaces, " "),
+ (Type::Identifier, "second"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, ""), // Zero-width StartCommand before an unindented line that follows an unterminated command.
+ (Type::Identifier, "third"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""), // Zero-width marker for the blank line.
+ (Type::Newline, "\n"),
+ (Type::Identifier, "fourth"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "fifth"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // One prompt style per input line: Later for the four unterminated lines, First from the blank line onward.
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_auto_mode() { // Mode::Auto fixture: only some unindented lines are expected to start a new command.
+ check_segmentation(
+ r#"command
+ another line of command
+2sls
++ another command
+another line of second command
+data list /x 1
+aggregate.
+print eject.
+twostep cluster
+
+
+fourth command.
+ fifth command.
+"#,
+ Mode::Auto,
+ &[ // (segment type, exact text) pairs in input order.
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "another"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "line"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "of"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, ""), // Zero-width StartCommand before 2sls; presumably it matches a known command name - TODO confirm.
+ (Type::Number, "2"),
+ (Type::Identifier, "sls"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, "+"), // The plus sign in the first column is itself the StartCommand segment.
+ (Type::Spaces, " "),
+ (Type::Identifier, "another"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "another"), // Unindented line with no StartCommand listed before it.
+ (Type::Spaces, " "),
+ (Type::Identifier, "line"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "of"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "second"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, ""), // Zero-width StartCommand before data list.
+ (Type::Identifier, "data"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "list"),
+ (Type::Spaces, " "),
+ (Type::Punct, "/"),
+ (Type::Identifier, "x"),
+ (Type::Spaces, " "),
+ (Type::Number, "1"),
+ (Type::Newline, "\n"),
+ (Type::StartCommand, ""), // Zero-width StartCommand before aggregate.
+ (Type::Identifier, "aggregate"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "print"), // No StartCommand is listed for lines that follow an EndCommand.
+ (Type::Spaces, " "),
+ (Type::Identifier, "eject"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "twostep"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "cluster"),
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""), // One zero-width SeparateCommands per blank line.
+ (Type::Newline, "\n"),
+ (Type::SeparateCommands, ""),
+ (Type::Newline, "\n"),
+ (Type::Identifier, "fourth"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "fifth"),
+ (Type::Spaces, " "),
+ (Type::Identifier, "command"),
+ (Type::EndCommand, "."),
+ (Type::Newline, "\n"),
+ (Type::End, ""),
+ ],
+ &[ // One prompt style per input line (13 in total); First after terminated lines and blank lines, Later otherwise.
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ ],
+ );
+}