From ae4ce00f6e1c25a00bf37ada9b5485a9d2ebebbd Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Fri, 16 Aug 2024 21:45:44 -0700
Subject: [PATCH] work on lexer

---
 rust/src/autodecode.rs       | 168 --------------
 rust/src/lex/lexer.rs        | 433 +++++++++++++++++++++++++++++++++--
 rust/src/lex/scan/mod.rs     | 123 +++++++---
 rust/src/lex/segment/mod.rs  |  26 ++-
 rust/src/lex/segment/test.rs |  24 +-
 rust/src/lib.rs              |   1 -
 rust/src/macros.rs           | 105 +++++----
 rust/src/message.rs          |  11 +-
 8 files changed, 587 insertions(+), 304 deletions(-)
 delete mode 100644 rust/src/autodecode.rs

diff --git a/rust/src/autodecode.rs b/rust/src/autodecode.rs
deleted file mode 100644
index 84a8ec35c9..0000000000
--- a/rust/src/autodecode.rs
+++ /dev/null
@@ -1,168 +0,0 @@
-use chardetng::EncodingDetector;
-use encoding_rs::{Decoder, Encoding};
-use std::io::{BufRead, Read, Result};
-
-struct Autodecode<R>
-where
-    R: Read,
-{
-    inner: R,
-    buffer: Box<[u8]>,
-    state: State,
-}
-
-enum State {
-    /// Stream encoding is not yet known.
-    Auto {
-        detector: EncodingDetector,
-        back: usize,
-        front: usize,
-        ascii: usize,
-    },
-
-    /// Stream encoding is known.
-    Decode(Decoder),
-}
-
-fn read_fully<R>(reader: &mut R, mut buffer: &mut [u8]) -> Result<usize>
-where
-    R: Read,
-{
-    let mut len = 0;
-    while len < buffer.len() {
-        let n = reader.read(&mut buffer[len..])?;
-        if n == 0 {
-            break;
-        }
-        len += n;
-    }
-    Ok(len)
-}
-
-impl<R> Autodecode<R>
-where
-    R: Read,
-{
-    fn new(inner: R) -> Result<Self> {
-        Self::with_capacity(8192, inner)
-    }
-    fn with_capacity(capacity: usize, mut inner: R) -> Result<Self> {
-        let mut buffer = Vec::with_capacity(capacity);
-        buffer.resize(capacity, 0);
-        let len = read_fully(&mut inner, buffer.as_mut_slice())?;
-        let mut detector = EncodingDetector::new();
-        let state = if len < buffer.len() {
-            detector.feed(&buffer[..len], true);
-            State::Decode(detector.guess(None, true).new_decoder_with_bom_removal())
-        } else {
-            let ascii = feed(&mut detector, &buffer[..len], false);
-            State::Auto {
-                detector,
-                back: 0,
-                front: len,
-                ascii,
-            }
-        };
-        Ok(Self {
-            inner,
-            buffer: buffer.into_boxed_slice(),
-            state,
-        })
-    }
-}
-
-impl<R> Read for Autodecode<R>
-where
-    R: Read,
-{
-    fn read(&mut self, outbuf: &mut [u8]) -> Result<usize> {
-        let mut buffer = self.fill_buf()?;
-        let n = buffer.read(outbuf)?;
-        self.consume(n);
-        Ok(n)
-    }
-}
-
-impl<R> BufRead for Autodecode<R>
-where
-    R: Read,
-{
-    fn fill_buf(&mut self) -> Result<&[u8]> {
-        match &mut self.state {
-            State::Auto {
-                detector,
-                back,
-                front,
-                ascii,
-            } => {
-                if back < ascii {
-                    // Consume data up to the first non-ASCII byte.
-                    Ok(&self.buffer[*back..*ascii])
-                } else if ascii < front {
-                    // We had a non-ASCII byte and we consumed everything up to
-                    // it. We want to get a full buffer starting at the
-                    // non-ASCII byte before we decide on the encoding.
-                    debug_assert_eq!(ascii, back);
-
-                    // Shift buffered data to the beginning of the buffer to
-                    // make room to get a full buffer.
-                    self.buffer.copy_within(*back..*front, 0);
-                    *front -= *back;
-                    *back = 0;
-                    *ascii = 0;
-
-                    // Fill up the remainder of the buffer.
-                    let old_front = *front;
-                    *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?;
-                    detector.feed(&self.buffer[old_front..*front], *front < self.buffer.len());
-                    self.state = State::Decode(
-                        detector.guess(None, true).new_decoder_with_bom_removal(),
-                    );
-                    self.fill_buf()
-                } else {
-                    // We have not had a non-ASCII byte yet but we consumed the
-                    // whole buffer. Read a new one.
-                    *back = 0;
-                    *front = 0;
-                    *ascii = 0;
-                    *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?;
-                    let eof = *front < self.buffer.len();
-                    *ascii = feed(detector, &self.buffer[..*front], eof);
-                    if eof || *ascii == 0 {
-                        self.state = State::Decode(
-                            detector.guess(None, true).new_decoder_with_bom_removal(),
-                        );
-                        self.fill_buf()
-                    } else {
-                        Ok(&self.buffer[..*ascii])
-                    }
-                }
-            }
-            State::Decode(_) => todo!(),
-        }
-    }
-
-    fn consume(&mut self, n: usize) {
-        todo!()
-    }
-}
-
-fn feed(detector: &mut EncodingDetector, buffer: &[u8], last: bool) -> usize {
-    if detector.feed(buffer, last) {
-        Encoding::ascii_valid_up_to(buffer)
-    } else {
-        buffer.len()
-    }
-}
-/*
-            } else {
-                debug_assert_eq!(ascii, back);
-                debug_assert_eq!(back, front);
-                *back = 0;
-                *front = 0;
-                *ascii = 0;
-                *front += read_fully(&mut self.inner, &mut self.buffer[..])?;
-                *ascii = feed(detector, &self.buffer[..*front], *front < self.buffer.len());
-                Ok(&self.buffer[*back..*ascii])
-            }
-*/
diff --git a/rust/src/lex/lexer.rs b/rust/src/lex/lexer.rs
index d5728d5cac..1eb11e61ca 100644
--- a/rust/src/lex/lexer.rs
+++ b/rust/src/lex/lexer.rs
@@ -1,17 +1,35 @@
-use std::io::Read;
+use std::{
+    borrow::Borrow,
+    collections::{HashMap, VecDeque},
+    io::Result,
+    num::NonZeroU32,
+    ops::RangeInclusive,
+    sync::Arc,
+};
 
-use encoding_rs::Encoding;
+use encoding_rs::{Encoding, UTF_8};
+use unicode_width::UnicodeWidthStr;
 
-use crate::prompt::PromptStyle;
+use crate::{
+    macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser},
+    message::{Location, Point},
+    prompt::PromptStyle,
+};
 
-use super::segment::Mode;
+use super::{
+    scan::{MergeResult, ScanToken},
+    segment::{Mode, Segment, Segmenter},
+    token::Token,
+};
 
 /// Error handling for a [`Reader`].
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
 pub enum ErrorHandling {
     /// Discard input line and continue reading.
     Terminal,
 
     /// Continue to next command, except for cascading failures.
+    #[default]
     Continue,
 
     /// Continue, even for cascading failures.
@@ -21,11 +39,41 @@ pub enum ErrorHandling {
     Stop,
 }
 
-/// Reads a single syntax file as a stream of bytes encoded in UTF-8.
-pub struct Reader {
-    /// Segmentation mode.
-    mode: Mode,
+pub trait LexRead {
+    /// Read some input from the source. If successful, returns the input that
+    /// was read. At end of file, returns `Ok(None)`.
+    ///
+    /// `prompt` provides a hint to interactive readers as to what kind of
+    /// syntax is being read right now.
+    fn read(&mut self, prompt: PromptStyle) -> Result<Option<String>>;
+}
+
+impl LexRead for () {
+    fn read(&mut self, _prompt: PromptStyle) -> Result<Option<String>> {
+        Ok(None)
+    }
+}
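
For illustration, a minimal non-interactive `LexRead` over any `BufRead` could look like the sketch below (not part of the patch; `LineReader` is a hypothetical name, and the `Result<Option<String>>` signature is assumed as above):

    use std::io::{BufRead, Result};

    struct LineReader<R: BufRead>(R);

    impl<R: BufRead> LexRead for LineReader<R> {
        fn read(&mut self, _prompt: PromptStyle) -> Result<Option<String>> {
            // A file-backed reader can ignore the prompt hint.
            let mut line = String::new();
            match self.0.read_line(&mut line)? {
                0 => Ok(None), // end of file
                _ => Ok(Some(line)),
            }
        }
    }
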
+
+/// # Token pipeline
+///
+/// Tokens pass through a pipeline with the following stages. Each token
+/// eventually made available to the parser passes through all of these stages.
+/// The stages are named after the processing that happens in each one.
+///
+/// Initially, tokens come from the segmenter and scanner to `pp`:
+///
+/// - `pp`: Tokens that need to pass through the macro preprocessor to end up
+///   in `merge`.
+///
+/// - `merge`: Tokens that need to pass through
+///   [`super::scan::ScanToken::merge`] to end up in `parse`.
+///
+/// - `parse`: Tokens available to the client for parsing.
+///
+/// `pp` and `merge` store tokens only temporarily until they pass into
+/// `parse`. Tokens then live in `parse` until the command is fully consumed,
+/// at which time they are freed together.
+struct Source {
     /// Error-handling mode.
     error_handling: ErrorHandling,
@@ -33,20 +81,373 @@ pub struct Reader {
     encoding: &'static Encoding,
 
     /// `None` if this reader is not associated with a file.
-    file_name: Option<String>,
+    file_name: Option<Arc<String>>,
 
-    /// Zero if there's no line number.
-    line_number: u32,
+    /// 1-based line number, if any.
+    line_number: Option<NonZeroU32>,
 
     /// True if we've reached EOF already.
     eof: bool,
 
     /// Reads UTF-8 bytes.
-    reader: dyn LexRead,
+    reader: Box<dyn LexRead>,
+
+    /// Source file contents.
+    buffer: String,
+
+    /// 0-based line number of the first line not yet written to the journal.
+    journal_line: usize,
+
+    /// Byte offset of the first character not yet scanned as a token.
+    seg_pos: usize,
+
+    /// Byte offsets into `buffer` of starts of lines. The first element is 0.
+    lines: Vec<usize>,
+
+    /// Tokens that need to pass through the macro preprocessor to end up in
+    /// `merge`.
+    pp: VecDeque<LexToken>,
+
+    /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to
+    /// end up in `parse`.
+    merge: VecDeque<LexToken>,
+
+    /// Tokens available to the client for parsing.
+    parse: Vec<LexToken>,
+
+    /// Offset in `parse` of the current token.
+    parse_ofs: usize,
+
+    segmenter: Segmenter,
+
+    suppress_next_newline: bool,
+}
+
+impl Source {
+    fn empty() -> Self {
+        Self {
+            error_handling: ErrorHandling::default(),
+            encoding: UTF_8,
+            file_name: None,
+            line_number: None,
+            eof: true,
+            reader: Box::new(()),
+            buffer: String::new(),
+            journal_line: 0,
+            seg_pos: 0,
+            lines: vec![0],
+            pp: VecDeque::new(),
+            merge: VecDeque::new(),
+            parse: Vec::new(),
+            parse_ofs: 0,
+            segmenter: Segmenter::new(Mode::default(), false),
+            suppress_next_newline: false,
+        }
+    }
+
+    fn read(&mut self) {
+        todo!()
+    }
+    fn try_get_pp(&mut self) -> bool {
+        let (seg_len, seg_type) = loop {
+            if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) {
+                break result;
+            }
+
+            debug_assert!(!self.eof);
+            self.read();
+        };
+
+        let pos = self.seg_pos..self.seg_pos + seg_len;
+        self.seg_pos += seg_len;
+        if seg_type == Segment::Newline {
+            self.lines.push(self.seg_pos);
+        }
+
+        let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type);
+
+        let n_lines = match (seg_type, self.suppress_next_newline) {
+            (Segment::EndCommand, false) => {
+                self.suppress_next_newline = true;
+                1
+            }
+            (Segment::Newline, true) => {
+                self.suppress_next_newline = false;
+                0
+            }
+            (Segment::Newline, false) => 1,
+            _ => 0,
+        };
+        for line_num in self.journal_line..self.journal_line + n_lines {
+            let start_ofs = self.lines[line_num];
+            let end_ofs = self
+                .lines
+                .get(line_num + 1)
+                .copied()
+                .unwrap_or(self.buffer.len());
+            let line = &self.buffer[start_ofs..end_ofs];
+            let _line = line
+                .strip_suffix("\r\n")
+                .unwrap_or(line.strip_suffix('\n').unwrap_or(line));
+            // XXX submit the line as syntax
+        }
+        self.journal_line += n_lines;
+
+        let pos = pos.start..=pos.end - 1;
+        match scan_token {
+            None => false,
+            Some(ScanToken::Token(Token::End)) => {
+                self.pp.push_back(LexToken {
+                    token: Token::EndCommand,
+                    pos,
+                    macro_rep: None,
+                });
+                self.eof = true;
+                true
+            }
+            Some(ScanToken::Token(token)) => {
+                self.pp.push_back(LexToken {
+                    token,
+                    pos,
+                    macro_rep: None,
+                });
+                true
+            }
+            Some(ScanToken::Error(_error)) => {
+                // XXX report error
+                false
+            }
+        }
+    }
+
+    fn get_pp(&mut self) -> bool {
+        while !self.eof {
+            if self.try_get_pp() {
+                return true;
+            }
+        }
+        false
+    }
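
The retry loop at the top of `try_get_pp` shows how the segmenter's incremental interface is meant to be driven: `push` fails when it needs lookahead beyond the available input, and the caller supplies more bytes and tries again. A sketch of that pattern in isolation (`read_more` is a hypothetical source of additional input):

    let mut segmenter = Segmenter::new(Mode::Interactive, false);
    let mut window = String::from("DATA");
    let (seg_len, seg_type) = loop {
        match segmenter.push(&window, /*eof=*/ false) {
            // Success: (number of bytes consumed, segment type).
            Ok(result) => break result,
            // Incomplete: the segmenter needs lookahead, e.g. because
            // "DATA" might continue as a longer identifier.
            Err(_) => window.push_str(read_more()),
        }
    };
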
+
+    fn try_get_merge(&mut self) -> bool {
+        if self.pp.is_empty() && !self.get_pp() {
+            return false;
+        }
+
+        const MEXPAND: bool = true;
+
+        if !MEXPAND {
+            self.merge.append(&mut self.pp);
+            return true;
+        }
+
+        // Now pass tokens one-by-one to the macro expander.
+        let Some(mut parser) = Parser::new(todo!(), &self.pp[0].token) else {
+            // Common case where there is no macro to expand.
+            self.merge.push_back(self.pp.pop_front().unwrap());
+            return true;
+        };
+        for ofs in 1.. {
+            if self.pp.len() <= ofs && !self.get_pp() {
+                // This should not be reachable because we always get a
+                // `Token::EndCommand` at the end of an input file, which should
+                // always terminate macro expansion.
+                unreachable!();
+            }
+            let token = &self.pp[ofs];
+            if parser.push(todo!(), &self.buffer[token.pos.clone()], &|e| println!("{e:?}"))
+                == ParseStatus::Complete
+            {
+                break;
+            }
+        }
+        let call = parser.finish();
+        if call.len() == 0 {
+            // False alarm: no macro to expand after all.
+            self.merge.push_back(self.pp.pop_front().unwrap());
+            return true;
+        }
+
+        // Expand the tokens.
+        let c0 = &self.pp[0];
+        let c1 = &self.pp[call.len() - 1];
+        let mut expansion = Vec::new();
+        call.expand(
+            self.segmenter.mode(),
+            self.token_location(c0..=c1),
+            &mut expansion,
+            |e| println!("{e:?}"),
+        );
+
+        const MPRINT: bool = false;
+        if MPRINT {
+            // XXX
+        }
+
+        // Append the macro expansion tokens to the lookahead.
+        let macro_rep = Arc::new(macro_tokens_to_syntax(expansion.as_slice()).collect());
+        for token in expansion {
+            let lt = LexToken {
+                token: token.token,
+                pos: todo!(),
+                macro_rep: Some(MacroRepresentation {
+                    expansion: Arc::clone(&macro_rep),
+                    pos: todo!(),
+                }),
+            };
+        }
+        todo!()
+    }
+
+    /// Attempts to obtain at least one new token into `self.merge`.
+    ///
+    /// Returns true if successful, false on failure. In the latter case, this
+    /// source is exhausted and `self.eof` is now true.
+    fn get_merge(&mut self) -> bool {
+        while !self.eof {
+            if self.try_get_merge() {
+                return true;
+            }
+        }
+        false
+    }
+
+    fn get_parse__(&mut self) -> bool {
+        for i in 0.. {
+            if self.merge.len() <= i && !self.get_merge() {
+                // We always get a `Token::EndCommand` at the end of an input
+                // file and the merger should return `Some(...)` for that token.
+                debug_assert_eq!(self.merge.len(), 0);
+                return false;
+            }
+
+            match ScanToken::merge(&self.merge) {
+                None => (),
+                Some(MergeResult::Copy) => {
+                    self.parse.push(self.merge.pop_front().unwrap());
+                    return true;
+                }
+                Some(MergeResult::Expand { n, token }) => {
+                    let first = &self.merge[0];
+                    let last = &self.merge[n - 1];
+                    self.parse.push(LexToken {
+                        token,
+                        pos: *first.pos.start()..=*last.pos.end(),
+                        macro_rep: match (&first.macro_rep, &last.macro_rep) {
+                            (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
+                                Some(MacroRepresentation {
+                                    expansion: a.expansion.clone(),
+                                    pos: *a.pos.start()..=*b.pos.end(),
+                                })
+                            }
+                            _ => None,
+                        },
+                    });
+                    self.merge.drain(..n);
+                    return true;
+                }
+            }
+        }
+        unreachable!();
+    }
+    fn get_parse(&mut self) -> bool {
+        todo!()
+    }
+
+    fn offset_to_point(&self, offset: usize) -> Point {
+        let line = self
+            .lines
+            .partition_point(|&line_start| line_start <= offset);
+        Point {
+            line: line as i32,
+            column: Some(
+                self.buffer
+                    .get(self.lines[line - 1]..offset)
+                    .unwrap_or_default()
+                    .width() as i32,
+            ),
+        }
+    }
+    fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
+        Location {
+            file_name: self.file_name.clone(),
+            span: Some(
+                self.offset_to_point(*range.start().pos.start())
+                    ..=self.offset_to_point(*range.end().pos.end()),
+            ),
+            omit_underlines: false,
+        }
+    }
+}
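
A worked example of the offset arithmetic (a sketch; constructing a `Source` by hand is elided): if `buffer` is `"ab\ncd"`, then `lines` is `[0, 3]`, and offset 4 (the `d`) resolves as follows:

    let point = source.offset_to_point(4);
    // partition_point counts the line starts <= 4; both 0 and 3 qualify,
    // so the offset falls on line 2.
    assert_eq!(point.line, 2);
    // The column is the display width, per the unicode_width crate, of the
    // text between the line start and the offset, here "c".
    assert_eq!(point.column, Some(1));
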
-pub trait LexRead: Read {
-    /// Tells the reader what kind of prompt is appropriate for the next
-    /// read. Non-interactive readers can ignore this.
-    fn set_prompt_style(&mut self, _prompt: PromptStyle) {}
-}
+/// A token in a [`Source`].
+struct LexToken {
+    /// The regular token.
+    token: Token,
+
+    /// For a token obtained through the lexer in an ordinary way, this is the
+    /// location of the token in the [`Source`]'s buffer.
+    ///
+    /// For a token produced through macro expansion, this is the entire macro
+    /// call.
+    pos: RangeInclusive<usize>,
+
+    /// For a token obtained through macro expansion, the part of the macro
+    /// expansion that represents this token.
+    ///
+    /// For a token obtained through the lexer in an ordinary way, this is
+    /// `None`.
+    macro_rep: Option<MacroRepresentation>,
+}
+
+impl Borrow<Token> for LexToken {
+    fn borrow(&self) -> &Token {
+        &self.token
+    }
+}
+
+struct MacroRepresentation {
+    /// An entire macro expansion.
+    expansion: Arc<String>,
+
+    /// The substring of `expansion` that represents a single token.
+    pos: RangeInclusive<usize>,
+}
+
+pub struct Lexer {
+    source: Source,
+    stack: Vec<Source>,
+    macros: MacroSet,
+}
+
+impl Lexer {
+    pub fn new() -> Self {
+        Self {
+            source: Source::empty(),
+            stack: Vec::new(),
+            macros: HashMap::new(),
+        }
+    }
+
+    pub fn get(&mut self) {
+        if self.source.parse_ofs < self.source.parse.len() {
+            if let Token::EndCommand = self.source.parse[self.source.parse_ofs].token {
+                self.source.parse.clear();
+            } else {
+                self.source.parse_ofs += 1;
+            }
+        }
+
+        while self.source.parse_ofs < self.source.parse.len() {
+            if !self.source.get_parse() {
+                match self.stack.pop() {
+                    Some(source) => self.source = source,
+                    None => {
+                        self.source = Source::empty();
+                        return;
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs
index 5f67819bfd..05577a9259 100644
--- a/rust/src/lex/scan/mod.rs
+++ b/rust/src/lex/scan/mod.rs
@@ -16,7 +16,7 @@ use super::{
     segment::{Mode, Segment, Segmenter},
     token::{Punct, Token},
 };
-use std::collections::VecDeque;
+use std::{borrow::Borrow, collections::VecDeque};
 use thiserror::Error as ThisError;
 
 #[derive(ThisError, Clone, Debug, PartialEq, Eq)]
@@ -66,12 +66,29 @@ pub enum ScanError {
     UnexpectedChar(char),
 }
 
+/// The input to and output from token merging.
 #[derive(Clone, Debug, PartialEq)]
 pub enum ScanToken {
     Token(Token),
     Error(ScanError),
 }
 
+/// The result of merging tokens.
+#[derive(Clone, Debug)]
+pub enum MergeResult {
+    /// Copy one token literally from input to output.
+    Copy,
+
+    /// Expand `n` tokens from the input into `token` in the output.
+    Expand {
+        /// Number of tokens to expand.
+        n: usize,
+
+        /// Replacement token.
+        token: Token,
+    },
+}
+
 impl ScanToken {
     pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
         match segment {
@@ -202,8 +219,8 @@ impl ScanToken {
             "%" => Some(Self::Token(Token::Punct(Punct::Percent))),
             "?" => Some(Self::Token(Token::Punct(Punct::Question))),
             "`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
-            "_" =>Some(Self::Token(Token::Punct(Punct::Underscore))),
-            "." =>Some(Self::Token(Token::Punct(Punct::Dot))),
+            "_" => Some(Self::Token(Token::Punct(Punct::Underscore))),
+            "." => Some(Self::Token(Token::Punct(Punct::Dot))),
             "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
             _ => unreachable!("bad punctuator {s:?}"),
         },
@@ -213,7 +230,9 @@ impl ScanToken {
             | Segment::Newline
             | Segment::CommentCommand => None,
             Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
-            Segment::StartDocument => Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap()))),
+            Segment::StartDocument => {
+                Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())))
+            }
             Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
                 Some(Self::Token(Token::EndCommand))
             }
@@ -248,47 +267,62 @@ impl ScanToken {
     /// white space, as a negative number. It's only needed if we want
     /// intervening comments to be allowed or for part of the negative number
     /// token to be produced by macro expansion.
-    pub fn merge(input: &mut VecDeque<ScanToken>) -> Option<ScanToken> {
-        match input.get(0)? {
-            ScanToken::Token(Token::Punct(Punct::Dash)) => match input.get(1)? {
-                ScanToken::Token(Token::Number(number)) if number.is_sign_positive() => {
+    pub fn merge<T>(tokens: &T) -> Option<MergeResult>
+    where
+        T: Tokens,
+    {
+        match tokens.get(0)? {
+            Token::Punct(Punct::Dash) => match tokens.get(1)? {
+                Token::Number(number) if number.is_sign_positive() => {
                     let number = *number;
-                    input.pop_front().unwrap();
-                    input.pop_front().unwrap();
-                    return Some(ScanToken::Token(Token::Number(-number)));
+                    return Some(MergeResult::Expand {
+                        n: 2,
+                        token: Token::Number(-number),
+                    });
                 }
-                _ => Some(input.pop_front().unwrap()),
+                _ => Some(MergeResult::Copy),
             },
-            ScanToken::Token(Token::String(_)) => {
+            Token::String(_) => {
                 let mut i = 0;
-                while matches!(
-                    input.get(i * 2 + 1)?,
-                    ScanToken::Token(Token::Punct(Punct::Plus))
-                ) && matches!(input.get(i * 2 + 2)?, ScanToken::Token(Token::String(_)))
+                while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
+                    && matches!(tokens.get(i * 2 + 2)?, Token::String(_))
                 {
                     i += 1;
                 }
                 if i == 0 {
-                    Some(input.pop_front().unwrap())
+                    Some(MergeResult::Copy)
                 } else {
                     let mut output = String::new();
                     for i in 0..=i {
-                        let ScanToken::Token(Token::String(s)) = &input[i * 2] else {
+                        let Token::String(s) = tokens.get(i * 2).unwrap() else {
                             unreachable!()
                         };
                         output.push_str(&s);
                     }
-                    for _ in 0..i * 2 + 1 {
-                        input.pop_front().unwrap();
-                    }
-                    Some(ScanToken::Token(Token::String(output)))
+                    Some(MergeResult::Expand {
+                        n: i * 2 + 1,
+                        token: Token::String(output),
+                    })
                 }
             }
-            _ => Some(input.pop_front().unwrap()),
+            _ => Some(MergeResult::Copy),
         }
     }
 }
 
+pub trait Tokens {
+    fn get(&self, index: usize) -> Option<&Token>;
+}
+
+impl<T> Tokens for VecDeque<T>
+where
+    T: Borrow<Token>,
+{
+    fn get(&self, index: usize) -> Option<&Token> {
+        self.get(index).map(|token| token.borrow())
+    }
+}
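
A sketch of the merge rules in action (assuming `Token::Number` wraps an `f64`, as the unary minus above implies); a plain `VecDeque<Token>` picks up the `Tokens` impl:

    let tokens = VecDeque::from([Token::Punct(Punct::Dash), Token::Number(1.0)]);
    match ScanToken::merge(&tokens) {
        // `-` followed by a positive number expands to a negative number.
        Some(MergeResult::Expand { n, token: Token::Number(value) }) => {
            assert_eq!(n, 2);
            assert_eq!(value, -1.0);
        }
        _ => unreachable!(),
    }

The caller is responsible for draining `n` tokens from the input, as `StringScanner::merge` does below.
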
+
 pub struct StringSegmenter<'a> {
     input: &'a str,
     segmenter: Segmenter,
 }
@@ -308,14 +342,14 @@ impl<'a> Iterator for StringSegmenter<'a> {
     fn next(&mut self) -> Option<Self::Item> {
         loop {
-            let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
-            if segment == Segment::End {
+            let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+            if seg_type == Segment::End {
                 return None;
             }
-            let s = &self.input[..self.input.len() - rest.len()];
+            let (s, rest) = self.input.split_at(seg_len);
             self.input = rest;
 
-            if let Some(token) = ScanToken::from_segment(s, segment) {
+            if let Some(token) = ScanToken::from_segment(s, seg_type) {
                 return Some((s, token));
             }
         }
     }
 }
@@ -325,7 +359,7 @@
 pub struct StringScanner<'a> {
     input: &'a str,
     segmenter: Segmenter,
-    tokens: VecDeque<ScanToken>,
+    tokens: VecDeque<Token>,
 }
 
 impl<'a> StringScanner<'a> {
@@ -336,28 +370,43 @@ impl<'a> StringScanner<'a> {
             tokens: VecDeque::with_capacity(1),
         }
     }
+
+    fn merge(&mut self) -> Option<ScanToken> {
+        let result = ScanToken::merge(&self.tokens)?;
+        match result {
+            MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())),
+            MergeResult::Expand { n, token } => {
+                self.tokens.drain(..n);
+                Some(ScanToken::Token(token))
+            }
+        }
+    }
 }
 
 impl<'a> Iterator for StringScanner<'a> {
     type Item = ScanToken;
 
     fn next(&mut self) -> Option<Self::Item> {
-        if let Some(token) = ScanToken::merge(&mut self.tokens) {
+        if let Some(token) = self.merge() {
             return Some(token);
         }
         loop {
-            let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
-            if segment == Segment::End && self.tokens.is_empty() {
+            let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+            if seg_type == Segment::End && self.tokens.is_empty() {
                 return None;
             }
-            let s = &self.input[..self.input.len() - rest.len()];
+            let (s, rest) = self.input.split_at(seg_len);
            self.input = rest;
 
-            if let Some(token) = ScanToken::from_segment(s, segment) {
-                self.tokens.push_back(token);
-                if let Some(token) = ScanToken::merge(&mut self.tokens) {
-                    return Some(token);
+            match ScanToken::from_segment(s, seg_type) {
+                Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)),
+                Some(ScanToken::Token(token)) => {
+                    self.tokens.push_back(token);
+                    if let Some(token) = self.merge() {
+                        return Some(token);
+                    }
                 }
+                None => (),
             }
         }
     }
 }
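
End-to-end usage of the scanner (a sketch; `StringScanner::new` is assumed to take the same `(input, mode, bool)` arguments as `StringSegmenter::new`):

    for scan_token in StringScanner::new("TITLE \"foo\" + \"bar\".", Mode::Interactive, true) {
        // The two quoted strings and the `+` between them come out of the
        // merge step as a single string token, e.g. Token(String("foobar")).
        println!("{scan_token:?}");
    }
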
diff --git a/rust/src/lex/segment/mod.rs b/rust/src/lex/segment/mod.rs
index f53b46eb30..de682ac5dc 100644
--- a/rust/src/lex/segment/mod.rs
+++ b/rust/src/lex/segment/mod.rs
@@ -212,7 +212,7 @@ impl Segmenter {
     /// consumed, must not be provided with *different* values on subsequent
     /// calls. This is because the function must often make decisions based on
     /// looking ahead beyond the bytes that it consumes.
-    pub fn push<'a>(
+    fn push_rest<'a>(
         &mut self,
         input: &'a str,
         eof: bool,
@@ -258,6 +258,11 @@ impl Segmenter {
             State::BeginData4 => self.parse_begin_data_4(input, eof),
         }
     }
+
+    pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> {
+        let (rest, seg_type) = self.push_rest(input, eof)?;
+        Ok((input.len() - rest.len(), seg_type))
+    }
 }
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -458,7 +463,7 @@ impl Segmenter {
             State::General,
             Substate::START_OF_COMMAND | Substate::START_OF_LINE,
         );
-        self.push(input, eof)
+        self.push_rest(input, eof)
     }
     fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
         match self.mode {
@@ -641,11 +646,12 @@ impl Segmenter {
     ) -> Result<(&'a str, &'a str), Incomplete> {
         let mut sub = Segmenter::new(self.mode, true);
         loop {
-            let (rest, segment) = sub.push(input, eof)?;
-            match segment {
+            let (seg_len, seg_type) = sub.push(input, eof)?;
+            let (segment, rest) = input.split_at(seg_len);
+            match seg_type {
                 Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (),
 
-                Segment::Identifier => return Ok((&input[..input.len() - rest.len()], rest)),
+                Segment::Identifier => return Ok((segment, rest)),
 
                 Segment::Number
                 | Segment::QuotedString
@@ -916,7 +922,7 @@ impl Segmenter {
             state: (State::General, self.state.1),
             ..*self
         };
-        let (rest, segment) = sub.push(input, eof)?;
+        let (rest, segment) = sub.push_rest(input, eof)?;
         if segment == Segment::Identifier {
             let id = &input[..input.len() - rest.len()];
             debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
@@ -970,7 +976,7 @@ impl Segmenter {
             state: (State::General, self.state.1),
             nest: 0,
         };
-        let result = sub.push(input, eof)?;
+        let result = sub.push_rest(input, eof)?;
         self.state.1 = sub.state.1;
         Ok(result)
     }
@@ -1085,7 +1091,7 @@ impl Segmenter {
                         State::General,
                         Substate::START_OF_COMMAND | Substate::START_OF_LINE,
                     );
-                    return self.push(input, eof);
+                    return self.push_rest(input, eof);
                 }
             }
             return Ok((rest, Segment::DoRepeatCommand));
@@ -1200,7 +1206,7 @@ impl Segmenter {
         let (prefix, rest) = input.split_at(line.len() - end.len());
         if prefix.is_empty() {
             // Line starts with `!ENDDEFINE`.
-            self.push(input, eof)
+            self.push_rest(input, eof)
         } else if prefix.trim_start().is_empty() {
             // Line starts with spaces followed by `!ENDDEFINE`.
             Ok((rest, Segment::Spaces))
@@ -1297,7 +1303,7 @@ impl Segmenter {
                 State::General,
                 Substate::START_OF_COMMAND | Substate::START_OF_LINE,
             );
-            self.push(input, eof)
+            self.push_rest(input, eof)
         } else {
             self.state.0 = State::BeginData4;
             Ok((rest, Segment::InlineData))
diff --git a/rust/src/lex/segment/test.rs b/rust/src/lex/segment/test.rs
index 05f0a23d6e..d8c337dcdf 100644
--- a/rust/src/lex/segment/test.rs
+++ b/rust/src/lex/segment/test.rs
@@ -6,11 +6,11 @@ fn push_segment<'a>(
     segmenter: &mut Segmenter,
     input: &'a str,
     one_byte: bool,
-) -> (&'a str, Segment) {
+) -> (usize, Segment) {
     if one_byte {
         for len in input.char_indices().map(|(pos, _c)| pos) {
-            if let Ok((rest, segment)) = segmenter.push(&input[..len], false) {
-                return (&input[len - rest.len()..], segment);
+            if let Ok(result) = segmenter.push(&input[..len], false) {
+                return result;
             }
         }
     }
@@ -28,11 +28,10 @@ fn _check_segmentation(
     let mut prompts = Vec::new();
     let mut segmenter = Segmenter::new(mode, false);
     loop {
-        let (rest, segment) = push_segment(&mut segmenter, input, one_byte);
-        let len = input.len() - rest.len();
-        let token = &input[..len];
-        segments.push((segment, token));
-        match segment {
+        let (seg_len, seg_type) = push_segment(&mut segmenter, input, one_byte);
+        let (token, rest) = input.split_at(seg_len);
+        segments.push((seg_type, token));
+        match seg_type {
             Segment::End => break,
             Segment::Newline => prompts.push(segmenter.prompt()),
             _ => (),
@@ -118,11 +117,10 @@ fn check_segmentation(
 fn print_segmentation(mut input: &str) {
     let mut segmenter = Segmenter::new(Mode::Interactive, false);
     loop {
-        let (rest, segment) = segmenter.push(input, true).unwrap();
-        let len = input.len() - rest.len();
-        let token = &input[..len];
-        print!("{segment:?} {token:?}");
-        match segment {
+        let (seg_len, seg_type) = segmenter.push(input, true).unwrap();
+        let (token, rest) = input.split_at(seg_len);
+        print!("{seg_type:?} {token:?}");
+        match seg_type {
             Segment::Newline => print!(" ({:?})", segmenter.prompt()),
             Segment::End => break,
             _ => (),
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 32d508d6ac..46fe08622a 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -13,4 +13,3 @@ pub mod lex;
 pub mod prompt;
 pub mod message;
 pub mod macros;
-pub mod autodecode;
diff --git a/rust/src/macros.rs b/rust/src/macros.rs
index 9bfaf28d94..f00262aac6 100644
--- a/rust/src/macros.rs
+++ b/rust/src/macros.rs
@@ -245,13 +245,18 @@ enum ValueType {
 #[derive(Clone)]
 pub struct MacroToken {
     /// The token.
-    token: Token,
+    pub token: Token,
 
     /// The syntax that produces `token`.
-    syntax: String,
+    pub syntax: String,
 }
 
-fn tokenize_string_into(s: &str, mode: Mode, error: &impl Fn(MacroError), output: &mut Vec<MacroToken>) {
+fn tokenize_string_into(
+    s: &str,
+    mode: Mode,
+    error: &impl Fn(MacroError),
+    output: &mut Vec<MacroToken>,
+) {
     for (syntax, token) in StringSegmenter::new(s, mode, true) {
         match token {
             ScanToken::Token(token) => output.push(MacroToken {
@@ -352,21 +357,20 @@ enum TokenClass {
 }
 
 impl TokenClass {
-    fn needs_space(prev: Self, next: Self) -> bool {
+    fn separator(prev: Self, next: Self) -> &'static str {
         match (prev, next) {
-            // Don't need a space before or after the end of a command. (A
-            // new-line is needed afterward as a special case.)
-            (Self::EndCommand, _) | (_, Self::EndCommand) => false,
-
-            // Binary operators always have a space on both sides.
-            (Self::BinaryOperator, _) | (_, Self::BinaryOperator) => true,
-
-            // A comma always has a space afterward.
-            (Self::Comma, _) => true,
-
-            // Otherwise, `prev` is `Self::BinaryOperator` or `Self::Punct`,
-            // which only need a space if there are two or them in a row.
-            _ => prev == next,
+            // Don't need a separator before the end of a command, but we
+            // need a new-line afterward.
+            (_, Self::EndCommand) => "",
+            (Self::EndCommand, _) => "\n",
+
+            // Binary operators always have a space on both sides, and a
+            // comma always has a space afterward.
+            (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ",
+
+            // Otherwise, `prev` is `Self::Punct`, which only needs a space if
+            // there are two of them in a row.
+            (Self::Punct, Self::Punct) => " ",
+            _ => "",
         }
     }
 }
@@ -419,23 +423,16 @@ impl From<&Token> for TokenClass {
     }
 }
 
-fn macro_tokens_to_syntax(input: &[MacroToken], output: &mut String) {
-    for (i, token) in input.iter().enumerate() {
-        if i > 0 {
-            let prev = &input[i].token;
-            let next = &token.token;
-            if let Token::EndCommand = prev {
-                output.push('\n');
-            } else {
-                let prev_class: TokenClass = prev.into();
-                let next_class: TokenClass = next.into();
-                if TokenClass::needs_space(prev_class, next_class) {
-                    output.push(' ')
-                }
-            }
-            output.push_str(&token.syntax);
-        }
-    }
+pub fn macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator<Item = &str> {
+    input
+        .iter()
+        .take(1)
+        .map(|token| token.syntax.as_str())
+        .chain(input.windows(2).flat_map(|w| {
+            let c0 = (&w[0].token).into();
+            let c1 = (&w[1].token).into();
+            [TokenClass::separator(c0, c1), w[1].syntax.as_str()]
+        }))
 }
 
 trait MacroId {
@@ -499,7 +496,7 @@ impl RelOp {
     }
 }
 
-type MacroSet = HashMap<UniCase<String>, Macro>;
+pub type MacroSet = HashMap<UniCase<String>, Macro>;
 
 enum ParserState {
     /// Accumulating tokens toward the end of any type of argument.
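
The rewritten `macro_tokens_to_syntax` yields the first token's syntax, then, for each adjacent pair of tokens, the pair's separator followed by the second token's syntax. Collecting it therefore reassembles the syntax with exactly one separator between neighbors (a sketch; `macro_tokens` is any `Vec<MacroToken>`, e.g. from `tokenize_string_into` above):

    let syntax: String = macro_tokens_to_syntax(&macro_tokens).collect();

`String` implements `FromIterator<&str>`, so the call sites below can collect the iterator directly instead of pushing into an output parameter.
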
@@ -705,14 +702,13 @@ impl<'a> Parser<'a> {
     /// Adds `token`, which has the given `syntax`, to the collection of tokens
     /// in `self` that potentially need to be macro expanded.
     ///
-    /// Returns `false` if the macro expander needs more tokens, for macro
-    /// arguments or to decide whether this is actually a macro invocation. The
-    /// caller should call `push` again with the next token.
-    ///n
-    /// Returns `true` if the macro was complete with `n` tokens. The caller
-    /// should call [`Self::expand`] to obtain the expansion. (If `n == 0`,
-    /// then the tokens did not actually invoke a macro at all and the expansion
-    /// will be empty.)
+    /// Returns [ParseStatus::Incomplete] if the macro expander needs more
+    /// tokens, for macro arguments or to decide whether this is actually a
+    /// macro invocation. The caller should call `push` again with the next
+    /// token.
+    ///
+    /// Returns [ParseStatus::Complete] if the macro invocation is now complete.
+    /// The caller should call [`Self::finish()`] to obtain the expansion.
     pub fn push(
         &mut self,
         token: &Token,
@@ -1000,11 +996,9 @@ impl<'a> Expander<'a> {
         subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output);
         subexpander.stack.pop();
         e.stack = subexpander.stack;
-        let mut output_string = String::new();
-        macro_tokens_to_syntax(&mut output, &mut output_string);
-        Some(output_string)
+        Some(macro_tokens_to_syntax(&output).collect())
     }
-
+
     fn expand_head(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
         let arg = unquote_string(args.remove(0), e.mode);
         let mut output = tokenize_string(&arg, e.mode, e.error);
@@ -1165,13 +1159,11 @@ impl<'a> Expander<'a> {
         match &input.0.get(0)?.token {
             Token::Id(id) if id.0.starts_with('!') => {
                 if let Some(param_idx) = macro_.find_parameter(id) {
-                    let mut s = String::new();
-                    macro_tokens_to_syntax(
-                        self.args.unwrap()[param_idx].as_ref().unwrap(),
-                        &mut s,
-                    );
                     input.advance();
-                    return Some(s);
+                    return Some(
+                        macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap())
+                            .collect(),
+                    );
                 }
                 if let Some(value) = self.vars.borrow().get(id) {
                     return Some(value.clone());
@@ -1190,7 +1182,9 @@ impl<'a> Expander<'a> {
                 if i > 0 {
                     arg.push(' ')
                 }
-                macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap(), &mut arg);
+                arg.extend(macro_tokens_to_syntax(
+                    self.args.unwrap()[i].as_ref().unwrap(),
+                ));
             }
             input.advance();
             return Some(arg);
@@ -1660,6 +1654,9 @@ impl<'a> Call<'a> {
         me.expand(&mut body, output);
     }
 
+    /// Returns the number of tokens consumed from the input for the macro
+    /// invocation. If the result is 0, then there was no macro invocation and
+    /// the expansion will be empty.
     pub fn len(&self) -> usize {
         self.0.n_tokens
     }
diff --git a/rust/src/message.rs b/rust/src/message.rs
index 757ea78625..5238691031 100644
--- a/rust/src/message.rs
+++ b/rust/src/message.rs
@@ -12,14 +12,15 @@ use unicode_width::UnicodeWidthStr;
 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub struct Point {
     /// 1-based line number.
-    line: i32,
+    pub line: i32,
 
     /// 1-based column number.
     ///
-    /// Column numbers are measured according to the width of characters as shown in
-    /// a typical fixed-width font, in which CJK characters have width 2 and
-    /// combining characters have width 0.
-    column: Option<i32>,
+    /// Column numbers are measured according to the width of characters as
+    /// shown in a typical fixed-width font, in which CJK characters have width
+    /// 2 and combining characters have width 0, as measured by the
+    /// `unicode_width` crate.
+    pub column: Option<i32>,
 }
 
 impl Point {
-- 
2.30.2