+++ /dev/null
-use chardetng::EncodingDetector;
-use encoding_rs::{Decoder, Encoding};
-use std::io::{BufRead, Read, Result};
-
-struct Autodecode<R>
-where
- R: Read,
-{
- inner: R,
- buffer: Box<[u8]>,
- state: State,
-}
-
-enum State {
- /// Stream encoding is not yet known.
- Auto {
- detector: EncodingDetector,
- back: usize,
- front: usize,
- ascii: usize,
- },
-
- /// Stream encoding is known.
- Decode(Decoder),
-}
-
-fn read_fully<R>(reader: &mut R, mut buffer: &mut [u8]) -> Result<usize>
-where
- R: Read,
-{
- let mut len = 0;
- while len < buffer.len() {
- let n = reader.read(&mut buffer[len..])?;
- if n == 0 {
- break;
- }
- len += n;
- }
- Ok(len)
-}
-
-impl<R> Autodecode<R>
-where
- R: Read,
-{
- fn new(inner: R) -> Result<Self> {
- Self::with_capacity(8192, inner)
- }
- fn with_capacity(capacity: usize, mut inner: R) -> Result<Self> {
- let mut buffer = Vec::with_capacity(capacity);
- buffer.resize(capacity, 0);
- let len = read_fully(&mut inner, buffer.as_mut_slice())?;
- let mut detector = EncodingDetector::new();
- let state = if len < buffer.len() {
- detector.feed(&buffer[..len], true);
- State::Decode(detector.guess(None, true).new_decoder_with_bom_removal())
- } else {
- let ascii = feed(&mut detector, &buffer[..len], false);
- State::Auto {
- detector,
- back: 0,
- front: len,
- ascii,
- }
- };
- Ok(Self {
- inner,
- buffer: buffer.into_boxed_slice(),
- state,
- })
- }
-}
-
-impl<R> Read for Autodecode<R>
-where
- R: Read,
-{
- fn read(&mut self, outbuf: &mut [u8]) -> Result<usize> {
- let mut buffer = self.fill_buf()?;
- let n = buffer.read(outbuf)?;
- self.consume(n);
- Ok(n)
- }
-}
-
-impl<R> BufRead for Autodecode<R>
-where
- R: Read,
-{
- fn fill_buf(&mut self) -> Result<&[u8]> {
- match &mut self.state {
- State::Auto {
- detector,
- back,
- front,
- ascii,
- } => {
- if back < ascii {
- // Consume data up to the first non-ASCII byte.
- Ok(&self.buffer[*back..*ascii])
- } else if ascii < front {
- // We had a non-ASCII byte and we consumed everything up to
- // it. We want to get a full buffer starting at the
- // non-ASCII byte before we decide on the encoding.
- debug_assert_eq!(ascii, back);
-
- // Shift buffered data to the beginning of the buffer to
- // make room to get a full buffer.
- self.buffer.copy_within(*back..*front, 0);
- *front -= *back;
- *back = 0;
- *ascii = 0;
-
- // Fill up the remainder of the buffer.
- let old_front = *front;
- *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?;
- detector.feed(&self.buffer[old_front..*front], *front < self.buffer.len());
- self.state = State::Decode(
- detector.guess(None, true).new_decoder_with_bom_removal(),
- );
- self.fill_buf()
- } else {
- // We have not had a non-ASCII byte yet but we consumed the
- // whole buffer. Read a new one.
- *back = 0;
- *front = 0;
- *ascii = 0;
- *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?;
- let eof = *front < self.buffer.len();
- *ascii = feed(detector, &self.buffer[..*front], eof);
- if eof || *ascii == 0 {
- self.state = State::Decode(
- detector.guess(None, true).new_decoder_with_bom_removal(),
- );
- self.fill_buf()
- } else {
- Ok(&self.buffer[..*ascii])
- }
- }
- }
- State::Decode(_) => todo!(),
- }
- }
-
- fn consume(&mut self, n: usize) {
- todo!()
- }
-}
-
-fn feed(detector: &mut EncodingDetector, buffer: &[u8], last: bool) -> usize {
- if detector.feed(buffer, last) {
- Encoding::ascii_valid_up_to(buffer)
- } else {
- buffer.len()
- }
-}
-/*
- } else {
- debug_assert_eq!(ascii, back);
- debug_assert_eq!(back, front);
- *back = 0;
- *front = 0;
- *ascii = 0;
- *front += read_fully(&mut self.inner, &mut self.buffer[..])?;
- *ascii = feed(detector, &self.buffer[..*front], *front < self.buffer.len());
- Ok(&self.buffer[*back..*ascii])
- }
-*/
-use std::io::Read;
+use std::{
+ borrow::Borrow,
+ collections::{HashMap, VecDeque},
+ io::Result,
+ num::NonZeroU32,
+ ops::RangeInclusive,
+ sync::Arc,
+};
-use encoding_rs::Encoding;
+use encoding_rs::{Encoding, UTF_8};
+use unicode_width::UnicodeWidthStr;
-use crate::prompt::PromptStyle;
+use crate::{
+ macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser},
+ message::{Location, Point},
+ prompt::PromptStyle,
+};
-use super::segment::Mode;
+use super::{
+ scan::{MergeResult, ScanToken},
+ segment::{Mode, Segment, Segmenter},
+ token::Token,
+};
-/// Error handling for a [`Reader`].
+/// Error handling for a syntax source.
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub enum ErrorHandling {
/// Discard input line and continue reading.
Terminal,
/// Continue to next command, except for cascading failures.
+ #[default]
Continue,
/// Continue, even for cascading failures.
Stop,
}
-/// Reads a single syntax file as a stream of bytes encoded in UTF-8.
-pub struct Reader {
- /// Segmentation mode.
- mode: Mode,
+pub trait LexRead {
+ /// Read some input from the source. If successful, returns the input that
+ /// was read. At end of file, returns `Ok(None)`.
+ ///
+ /// `prompt` provides a hint to interactive readers as to what kind of
+ /// syntax is being read right now.
+ fn read(&mut self, prompt: PromptStyle) -> Result<Option<String>>;
+}
+
+impl LexRead for () {
+ fn read(&mut self, _prompt: PromptStyle) -> Result<Option<String>> {
+ Ok(None)
+ }
+}
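+
+// An illustrative `LexRead` implementation (a sketch, not used elsewhere in
+// this change; the name `VecRead` is hypothetical): replays a fixed list of
+// lines, which can be handy for tests.
+struct VecRead(std::vec::IntoIter<String>);
+
+impl LexRead for VecRead {
+ fn read(&mut self, _prompt: PromptStyle) -> Result<Option<String>> {
+ // Non-interactive readers may ignore `prompt`.
+ Ok(self.0.next())
+ }
+}
+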
+/// # Token pipeline
+///
+/// Tokens pass through a pipeline with the following stages. Each token
+/// eventually made available to the parser passes through all of these stages.
+/// The stages are named after the processing that happens in each one.
+///
+/// Initially, tokens come from the segmenter and scanner to `pp`:
+///
+/// - `pp`: Tokens that need to pass through the macro preprocessor to end up
+/// in `merge`.
+///
+/// - `merge`: Tokens that need to pass through
+/// [`super::scan::ScanToken::merge`] to end up in `parse`.
+///
+/// - `parse`: Tokens available to the client for parsing.
+///
+/// `pp` and `merge` store tokens only temporarily until they pass into `parse`.
+/// Tokens then live in `parse` until the command is fully consumed, at which
+/// time they are freed together.
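+///
+/// As an illustration, for the syntax `-1`, the scanner delivers two tokens,
+/// `-` and `1`, into `pp`; when no macro is involved, they pass unchanged
+/// into `merge`; and [`super::scan::ScanToken::merge`] folds them into the
+/// single number token `-1` in `parse`.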
+struct Source {
/// Error-handling mode.
error_handling: ErrorHandling,
encoding: &'static Encoding,
/// `None` if this reader is not associated with a file.
- file_name: Option<String>,
+ file_name: Option<Arc<String>>,
- /// Zero if there's no line number.
- line_number: u32,
+ /// 1-based line number, if any.
+ line_number: Option<NonZeroU32>,
/// True if we've reached EOF already.
eof: bool,
/// Reads UTF-8 bytes.
- reader: dyn LexRead,
+ reader: Box<dyn LexRead>,
+
+ /// Source file contents.
+ buffer: String,
+
+ /// 0-based line number of the first line not yet written to the journal.
+ journal_line: usize,
+
+ /// Byte offset of first character not yet scanned as token.
+ seg_pos: usize,
+
+ /// Byte offsets into `buffer` of starts of lines. The first element is 0.
+ lines: Vec<usize>,
+
+ /// Tokens that need to pass through the macro preprocessor to end up in
+ /// `merge`.
+ pp: VecDeque<LexToken>,
+
+ /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to
+ /// end up in `parse`.
+ merge: VecDeque<LexToken>,
+
+ /// Tokens available to the client for parsing.
+ parse: Vec<LexToken>,
+
+ /// Offset in `parse` of the current token.
+ parse_ofs: usize,
+
+ segmenter: Segmenter,
+
+ suppress_next_newline: bool,
+}
+
+impl Source {
+ fn empty() -> Self {
+ Self {
+ error_handling: ErrorHandling::default(),
+ encoding: UTF_8,
+ file_name: None,
+ line_number: None,
+ eof: true,
+ reader: Box::new(()),
+ buffer: String::new(),
+ journal_line: 0,
+ seg_pos: 0,
+ lines: vec![0],
+ pp: VecDeque::new(),
+ merge: VecDeque::new(),
+ parse: Vec::new(),
+ parse_ofs: 0,
+ segmenter: Segmenter::new(Mode::default(), false),
+ suppress_next_newline: false,
+ }
+ }
+
+ fn read(&mut self) {
+ todo!()
+ }
+
+ fn try_get_pp(&mut self) -> bool {
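+ // Get the next segment, reading more input whenever the segmenter
+ // reports that it needs more bytes to make progress.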
+ let (seg_len, seg_type) = loop {
+ if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) {
+ break result;
+ }
+
+ debug_assert!(!self.eof);
+ self.read();
+ };
+
+ let pos = self.seg_pos..self.seg_pos + seg_len;
+ self.seg_pos += seg_len;
+ if seg_type == Segment::Newline {
+ self.lines.push(self.seg_pos);
+ }
+
+ let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type);
+
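+ // Count the journal lines completed by this segment. An `EndCommand`
+ // segment implicitly ends a line, so the newline that follows it is
+ // suppressed to avoid counting the same line twice.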
+ let n_lines = match (seg_type, self.suppress_next_newline) {
+ (Segment::EndCommand, false) => {
+ self.suppress_next_newline = true;
+ 1
+ }
+ (Segment::Newline, true) => {
+ self.suppress_next_newline = false;
+ 0
+ }
+ (Segment::Newline, false) => 1,
+ _ => 0,
+ };
+ for line_num in self.journal_line..self.journal_line + n_lines {
+ let start_ofs = self.lines[line_num];
+ let end_ofs = self
+ .lines
+ .get(line_num + 1)
+ .copied()
+ .unwrap_or(self.buffer.len());
+ let line = &self.buffer[start_ofs..end_ofs];
+ let _line = line
+ .strip_suffix("\r\n")
+ .unwrap_or(line.strip_suffix('\n').unwrap_or(line));
+ // XXX submit the line as syntax
+ }
+ self.journal_line += n_lines;
+
+ let pos = pos.start..=pos.end - 1;
+ match scan_token {
+ None => false,
+ Some(ScanToken::Token(Token::End)) => {
+ self.pp.push_back(LexToken {
+ token: Token::EndCommand,
+ pos,
+ macro_rep: None,
+ });
+ self.eof = true;
+ true
+ }
+ Some(ScanToken::Token(token)) => {
+ self.pp.push_back(LexToken {
+ token,
+ pos,
+ macro_rep: None,
+ });
+ true
+ }
+ Some(ScanToken::Error(_error)) => {
+ // XXX report error
+ false
+ }
+ }
+ }
+
+ fn get_pp(&mut self) -> bool {
+ while !self.eof {
+ if self.try_get_pp() {
+ return true;
+ }
+ }
+ false
+ }
+
+ fn try_get_merge(&mut self) -> bool {
+ if self.pp.is_empty() && !self.get_pp() {
+ return false;
+ }
+
+ const MEXPAND: bool = true;
+
+ if !MEXPAND {
+ self.merge.append(&mut self.pp);
+ return true;
+ }
+
+ // Now pass tokens one-by-one to the macro expander.
+ let Some(mut parser) = Parser::new(todo!(), &self.pp[0].token) else {
+ // Common case where there is no macro to expand.
+ self.merge.push_back(self.pp.pop_front().unwrap());
+ return true;
+ };
+ for ofs in 1.. {
+ if self.pp.len() <= ofs && !self.get_pp() {
+ // This should not be reachable because we always get a
+ // `Token::EndCommand` at the end of an input file, which should
+ // always terminate macro expansion.
+ unreachable!();
+ }
+ let token = &self.pp[ofs];
+ if parser.push(todo!(), &self.buffer[token.pos.clone()], &|e| println!("{e:?}"))
+ == ParseStatus::Complete
+ {
+ break;
+ }
+ }
+ let call = parser.finish();
+ if call.len() == 0 {
+ // False alarm: no macro to expand after all.
+ self.merge.push_back(self.pp.pop_front().unwrap());
+ return true;
+ }
+
+ // Expand the tokens.
+ let c0 = &self.pp[0];
+ let c1 = &self.pp[call.len() - 1];
+ let mut expansion = Vec::new();
+ call.expand(
+ self.segmenter.mode(),
+ self.token_location(c0..=c1),
+ &mut expansion,
+ |e| println!("{e:?}"),
+ );
+
+ const MPRINT: bool = false;
+ if MPRINT {
+ // XXX
+ }
+
+ // Append the macro expansion tokens to the lookahead.
+ let macro_rep = Arc::new(macro_tokens_to_syntax(expansion.as_slice()).collect());
+ for token in expansion {
+ let lt = LexToken {
+ token: token.token,
+ pos: todo!(),
+ macro_rep: Some(MacroRepresentation {
+ expansion: Arc::clone(¯o_rep),
+ pos: todo!(),
+ }),
+ };
+ }
+ todo!()
+ }
+
+ /// Attempts to obtain at least one new token into `self.merge`.
+ ///
+ /// Returns true if successful, false on failure. In the latter case,
+ /// this source is exhausted and `self.eof` is now true.
+ fn get_merge(&mut self) -> bool {
+ while !self.eof {
+ if self.try_get_merge() {
+ return true;
+ }
+ }
+ false
+ }
+
+ fn get_parse__(&mut self) -> bool {
+ for i in 0.. {
+ if self.merge.len() <= i && !self.get_merge() {
+ // We always get a `Token::EndCommand` at the end of an input
+ // file and the merger should return `Some(...)` for that token.
+ debug_assert_eq!(self.merge.len(), 0);
+ return false;
+ }
+
+ match ScanToken::merge(&self.merge) {
+ None => (),
+ Some(MergeResult::Copy) => {
+ self.parse.push(self.merge.pop_front().unwrap());
+ return true;
+ }
+ Some(MergeResult::Expand { n, token }) => {
+ let first = &self.merge[0];
+ let last = &self.merge[n - 1];
+ self.parse.push(LexToken {
+ token,
+ pos: *first.pos.start()..=*last.pos.end(),
+ macro_rep: match (&first.macro_rep, &last.macro_rep) {
+ (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
+ Some(MacroRepresentation {
+ expansion: a.expansion.clone(),
+ pos: *a.pos.start()..=*b.pos.end(),
+ })
+ }
+ _ => None,
+ },
+ });
+ self.merge.drain(..n);
+ return true;
+ }
+ }
+ }
+ unreachable!();
+ }
+
+ fn get_parse(&mut self) -> bool {
+ todo!()
+ }
+
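+ /// Converts `offset`, a byte offset into `buffer`, into a 1-based line
+ /// and column [`Point`]. For example, with `buffer` `"AB\nC"` and
+ /// `lines` `[0, 3]`, offset 4 maps to line 2, column 2.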
+ fn offset_to_point(&self, offset: usize) -> Point {
+ let line = self
+ .lines
+ .partition_point(|&line_start| line_start <= offset);
+ Point {
+ line: line as i32,
+ column: Some(
+ self.buffer
+ .get(self.lines[line - 1]..offset)
+ .unwrap_or_default()
+ .width() as i32 + 1,
+ ),
+ }
+ }
+
+ fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
+ Location {
+ file_name: self.file_name.clone(),
+ span: Some(
+ self.offset_to_point(*range.start().pos.start())
+ ..=self.offset_to_point(*range.end().pos.end()),
+ ),
+ omit_underlines: false,
+ }
+ }
}
-pub trait LexRead: Read {
- /// Tells the reader what kind of prompt is appropriate for the next
- /// read. Non-interactive readers can ignore this.
- fn set_prompt_style(&mut self, _prompt: PromptStyle) {}
+/// A token in a [`Source`].
+struct LexToken {
+ /// The regular token.
+ token: Token,
+
+ /// For a token obtained through the lexer in an ordinary way, this is the
+ /// location of the token in the [`Source`]'s buffer.
+ ///
+ /// For a token produced through macro expansion, this is the entire macro
+ /// call.
+ pos: RangeInclusive<usize>,
+
+ /// For a token obtained through macro expansion, the part of the macro
+ /// expansion that represents this token.
+ ///
+ /// For a token obtained through the lexer in an ordinary way, this is
+ /// `None`.
+ macro_rep: Option<MacroRepresentation>,
+}
+
+impl Borrow<Token> for LexToken {
+ fn borrow(&self) -> &Token {
+ &self.token
+ }
+}
+
+struct MacroRepresentation {
+ /// An entire macro expansion.
+ expansion: Arc<String>,
+
+ /// The substring of `expansion` that represents a single token.
+ pos: RangeInclusive<usize>,
+}
+
+pub struct Lexer {
+ source: Source,
+ stack: Vec<Source>,
+ macros: MacroSet,
+}
+
+impl Lexer {
+ pub fn new() -> Self {
+ Self {
+ source: Source::empty(),
+ stack: Vec::new(),
+ macros: HashMap::new(),
+ }
+ }
+
+ pub fn get(&mut self) {
+ if self.source.parse_ofs < self.source.parse.len() {
+ if let Token::EndCommand = self.source.parse[self.source.parse_ofs].token {
+ self.source.parse.clear();
+ self.source.parse_ofs = 0;
+ } else {
+ self.source.parse_ofs += 1;
+ }
+ }
+
+ while self.source.parse_ofs >= self.source.parse.len() {
+ if !self.source.get_parse() {
+ match self.stack.pop() {
+ Some(source) => self.source = source,
+ None => {
+ self.source = Source::empty();
+ return;
+ }
+ }
+ }
+ }
+ }
}
segment::{Mode, Segment, Segmenter},
token::{Punct, Token},
};
-use std::collections::VecDeque;
+use std::{borrow::Borrow, collections::VecDeque};
use thiserror::Error as ThisError;
#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
UnexpectedChar(char),
}
+/// The input or output to token merging.
#[derive(Clone, Debug, PartialEq)]
pub enum ScanToken {
Token(Token),
Error(ScanError),
}
+/// The result of merging tokens.
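+///
+/// For example, [`ScanToken::merge`] merges the two tokens `-` and `123`
+/// into the single number token `-123` by returning
+/// `MergeResult::Expand { n: 2, token: Token::Number(-123.0) }`.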
+#[derive(Clone, Debug)]
+pub enum MergeResult {
+ /// Copy one token literally from input to output.
+ Copy,
+
+ /// Expand `n` tokens from the input into `token` in the output.
+ Expand {
+ /// Number of tokens to expand.
+ n: usize,
+
+ /// Replacement token.
+ token: Token,
+ },
+}
+
impl ScanToken {
pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
match segment {
"%" => Some(Self::Token(Token::Punct(Punct::Percent))),
"?" => Some(Self::Token(Token::Punct(Punct::Question))),
"`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
- "_" =>Some(Self::Token(Token::Punct(Punct::Underscore))),
- "." =>Some(Self::Token(Token::Punct(Punct::Dot))),
+ "_" => Some(Self::Token(Token::Punct(Punct::Underscore))),
+ "." => Some(Self::Token(Token::Punct(Punct::Dot))),
"!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
_ => unreachable!("bad punctuator {s:?}"),
},
| Segment::Newline
| Segment::CommentCommand => None,
Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
- Segment::StartDocument => Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap()))),
+ Segment::StartDocument => {
+ Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())))
+ }
Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
Some(Self::Token(Token::EndCommand))
}
/// white space, as a negative number. It's only needed if we want
/// intervening comments to be allowed or for part of the negative number
/// token to be produced by macro expansion.
- pub fn merge(input: &mut VecDeque<ScanToken>) -> Option<ScanToken> {
- match input.get(0)? {
- ScanToken::Token(Token::Punct(Punct::Dash)) => match input.get(1)? {
- ScanToken::Token(Token::Number(number)) if number.is_sign_positive() => {
+ pub fn merge<T>(tokens: &T) -> Option<MergeResult>
+ where
+ T: Tokens,
+ {
+ match tokens.get(0)? {
+ Token::Punct(Punct::Dash) => match tokens.get(1)? {
+ Token::Number(number) if number.is_sign_positive() => {
let number = *number;
- input.pop_front().unwrap();
- input.pop_front().unwrap();
- return Some(ScanToken::Token(Token::Number(-number)));
+ return Some(MergeResult::Expand {
+ n: 2,
+ token: Token::Number(-number),
+ });
}
- _ => Some(input.pop_front().unwrap()),
+ _ => Some(MergeResult::Copy),
},
- ScanToken::Token(Token::String(_)) => {
+ Token::String(_) => {
let mut i = 0;
- while matches!(
- input.get(i * 2 + 1)?,
- ScanToken::Token(Token::Punct(Punct::Plus))
- ) && matches!(input.get(i * 2 + 2)?, ScanToken::Token(Token::String(_)))
+ while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
+ && matches!(tokens.get(i * 2 + 2)?, Token::String(_))
{
i += 1;
}
if i == 0 {
- Some(input.pop_front().unwrap())
+ Some(MergeResult::Copy)
} else {
let mut output = String::new();
for i in 0..=i {
- let ScanToken::Token(Token::String(s)) = &input[i * 2] else {
+ let Token::String(s) = tokens.get(i * 2).unwrap() else {
unreachable!()
};
output.push_str(&s);
}
- for _ in 0..i * 2 + 1 {
- input.pop_front().unwrap();
- }
- Some(ScanToken::Token(Token::String(output)))
+ Some(MergeResult::Expand {
+ n: i * 2 + 1,
+ token: Token::String(output),
+ })
}
}
- _ => Some(input.pop_front().unwrap()),
+ _ => Some(MergeResult::Copy),
}
}
}
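+/// A lookahead sequence of tokens, as consumed by [`ScanToken::merge`].
+///
+/// The blanket implementation below covers both `VecDeque<Token>`, as used
+/// by [`StringScanner`], and any `VecDeque` of wrapper types that implement
+/// `Borrow<Token>`.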
+pub trait Tokens {
+ fn get(&self, index: usize) -> Option<&Token>;
+}
+
+impl<T> Tokens for VecDeque<T>
+where
+ T: Borrow<Token>,
+{
+ fn get(&self, index: usize) -> Option<&Token> {
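+ // This calls the inherent `VecDeque::get`, which takes precedence
+ // over this trait method, so the call is not recursive.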
+ self.get(index).map(|token| token.borrow())
+ }
+}
+
pub struct StringSegmenter<'a> {
input: &'a str,
segmenter: Segmenter,
fn next(&mut self) -> Option<Self::Item> {
loop {
- let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
- if segment == Segment::End {
+ let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+ if seg_type == Segment::End {
return None;
}
- let s = &self.input[..self.input.len() - rest.len()];
+ let (s, rest) = self.input.split_at(seg_len);
self.input = rest;
- if let Some(token) = ScanToken::from_segment(s, segment) {
+ if let Some(token) = ScanToken::from_segment(s, seg_type) {
return Some((s, token));
}
}
pub struct StringScanner<'a> {
input: &'a str,
segmenter: Segmenter,
- tokens: VecDeque<ScanToken>,
+ tokens: VecDeque<Token>,
}
impl<'a> StringScanner<'a> {
tokens: VecDeque::with_capacity(1),
}
}
+
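+ /// Applies [`ScanToken::merge`] to the pending tokens, draining the
+ /// merged tokens from the queue and returning the result, if any.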
+ fn merge(&mut self) -> Option<ScanToken> {
+ let result = ScanToken::merge(&self.tokens)?;
+ match result {
+ MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())),
+ MergeResult::Expand { n, token } => {
+ self.tokens.drain(..n);
+ Some(ScanToken::Token(token))
+ }
+ }
+ }
}
impl<'a> Iterator for StringScanner<'a> {
type Item = ScanToken;
fn next(&mut self) -> Option<Self::Item> {
- if let Some(token) = ScanToken::merge(&mut self.tokens) {
+ if let Some(token) = self.merge() {
return Some(token);
}
loop {
- let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
- if segment == Segment::End && self.tokens.is_empty() {
+ let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+ if seg_type == Segment::End && self.tokens.is_empty() {
return None;
}
- let s = &self.input[..self.input.len() - rest.len()];
+ let (s, rest) = self.input.split_at(seg_len);
self.input = rest;
- if let Some(token) = ScanToken::from_segment(s, segment) {
- self.tokens.push_back(token);
- if let Some(token) = ScanToken::merge(&mut self.tokens) {
- return Some(token);
+ match ScanToken::from_segment(s, seg_type) {
+ Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)),
+ Some(ScanToken::Token(token)) => {
+ self.tokens.push_back(token);
+ if let Some(token) = self.merge() {
+ return Some(token);
+ }
}
+ None => (),
}
}
}
/// consumed, must not be provided with *different* values on subsequent
/// calls. This is because the function must often make decisions based on
/// looking ahead beyond the bytes that it consumes.
- pub fn push<'a>(
+ fn push_rest<'a>(
&mut self,
input: &'a str,
eof: bool,
State::BeginData4 => self.parse_begin_data_4(input, eof),
}
}
+
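+ /// Like `push_rest`, but returns the number of bytes consumed from the
+ /// start of `input` instead of the unconsumed remainder.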
+ pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> {
+ let (rest, seg_type) = self.push_rest(input, eof)?;
+ Ok((input.len() - rest.len(), seg_type))
+ }
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
State::General,
Substate::START_OF_COMMAND | Substate::START_OF_LINE,
);
- self.push(input, eof)
+ self.push_rest(input, eof)
}
fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
match self.mode {
) -> Result<(&'a str, &'a str), Incomplete> {
let mut sub = Segmenter::new(self.mode, true);
loop {
- let (rest, segment) = sub.push(input, eof)?;
- match segment {
+ let (seg_len, seg_type) = sub.push(input, eof)?;
+ let (segment, rest) = input.split_at(seg_len);
+ match seg_type {
Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (),
- Segment::Identifier => return Ok((&input[..input.len() - rest.len()], rest)),
+ Segment::Identifier => return Ok((segment, rest)),
Segment::Number
| Segment::QuotedString
state: (State::General, self.state.1),
..*self
};
- let (rest, segment) = sub.push(input, eof)?;
+ let (rest, segment) = sub.push_rest(input, eof)?;
if segment == Segment::Identifier {
let id = &input[..input.len() - rest.len()];
debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
state: (State::General, self.state.1),
nest: 0,
};
- let result = sub.push(input, eof)?;
+ let result = sub.push_rest(input, eof)?;
self.state.1 = sub.state.1;
Ok(result)
}
State::General,
Substate::START_OF_COMMAND | Substate::START_OF_LINE,
);
- return self.push(input, eof);
+ return self.push_rest(input, eof);
}
}
return Ok((rest, Segment::DoRepeatCommand));
let (prefix, rest) = input.split_at(line.len() - end.len());
if prefix.is_empty() {
// Line starts with `!ENDDEFINE`.
- self.push(input, eof)
+ self.push_rest(input, eof)
} else if prefix.trim_start().is_empty() {
// Line starts with spaces followed by `!ENDDEFINE`.
Ok((rest, Segment::Spaces))
State::General,
Substate::START_OF_COMMAND | Substate::START_OF_LINE,
);
- self.push(input, eof)
+ self.push_rest(input, eof)
} else {
self.state.0 = State::BeginData4;
Ok((rest, Segment::InlineData))
segmenter: &mut Segmenter,
input: &'a str,
one_byte: bool,
-) -> (&'a str, Segment) {
+) -> (usize, Segment) {
if one_byte {
for len in input.char_indices().map(|(pos, _c)| pos) {
- if let Ok((rest, segment)) = segmenter.push(&input[..len], false) {
- return (&input[len - rest.len()..], segment);
+ if let Ok(result) = segmenter.push(&input[..len], false) {
+ return result;
}
}
}
let mut prompts = Vec::new();
let mut segmenter = Segmenter::new(mode, false);
loop {
- let (rest, segment) = push_segment(&mut segmenter, input, one_byte);
- let len = input.len() - rest.len();
- let token = &input[..len];
- segments.push((segment, token));
- match segment {
+ let (seg_len, seg_type) = push_segment(&mut segmenter, input, one_byte);
+ let (token, rest) = input.split_at(seg_len);
+ segments.push((seg_type, token));
+ match seg_type {
Segment::End => break,
Segment::Newline => prompts.push(segmenter.prompt()),
_ => (),
fn print_segmentation(mut input: &str) {
let mut segmenter = Segmenter::new(Mode::Interactive, false);
loop {
- let (rest, segment) = segmenter.push(input, true).unwrap();
- let len = input.len() - rest.len();
- let token = &input[..len];
- print!("{segment:?} {token:?}");
- match segment {
+ let (seg_len, seg_type) = segmenter.push(input, true).unwrap();
+ let (token, rest) = input.split_at(seg_len);
+ print!("{seg_type:?} {token:?}");
+ match seg_type {
Segment::Newline => print!(" ({:?})", segmenter.prompt()),
Segment::End => break,
_ => (),
pub mod prompt;
pub mod message;
pub mod macros;
-pub mod autodecode;
#[derive(Clone)]
pub struct MacroToken {
/// The token.
- token: Token,
+ pub token: Token,
/// The syntax that produces `token`.
- syntax: String,
+ pub syntax: String,
}
-fn tokenize_string_into(s: &str, mode: Mode, error: &impl Fn(MacroError), output: &mut Vec<MacroToken>) {
+fn tokenize_string_into(
+ s: &str,
+ mode: Mode,
+ error: &impl Fn(MacroError),
+ output: &mut Vec<MacroToken>,
+) {
for (syntax, token) in StringSegmenter::new(s, mode, true) {
match token {
ScanToken::Token(token) => output.push(MacroToken {
}
impl TokenClass {
- fn needs_space(prev: Self, next: Self) -> bool {
+ fn separator(prev: Self, next: Self) -> &'static str {
match (prev, next) {
- // Don't need a space before or after the end of a command. (A
- // new-line is needed afterward as a special case.)
- (Self::EndCommand, _) | (_, Self::EndCommand) => false,
-
- // Binary operators always have a space on both sides.
- (Self::BinaryOperator, _) | (_, Self::BinaryOperator) => true,
-
- // A comma always has a space afterward.
- (Self::Comma, _) => true,
-
- // Otherwise, `prev` is `Self::BinaryOperator` or `Self::Punct`,
- // which only need a space if there are two or them in a row.
- _ => prev == next,
+ // Don't need a separator before the end of a command, but we
+ // need a new-line afterward.
+ (_, Self::EndCommand) => "",
+ (Self::EndCommand, _) => "\n",
+
+ // Binary operators always have a space on both sides, and a comma
+ // always has a space afterward.
+ (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ",
+
+ // Otherwise, `prev` is `Self::Punct`, which only needs a space if
+ // there are two of them in a row.
+ (Self::Punct, Self::Punct) => " ",
+ _ => "",
}
}
}
}
}
-fn macro_tokens_to_syntax(input: &[MacroToken], output: &mut String) {
- for (i, token) in input.iter().enumerate() {
- if i > 0 {
- let prev = &input[i].token;
- let next = &token.token;
- if let Token::EndCommand = prev {
- output.push('\n');
- } else {
- let prev_class: TokenClass = prev.into();
- let next_class: TokenClass = next.into();
- if TokenClass::needs_space(prev_class, next_class) {
- output.push(' ')
- }
- }
- output.push_str(&token.syntax);
- }
- }
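+/// Converts macro tokens back into syntax, yielding the first token's
+/// syntax followed, for each later token, by a separator (possibly empty,
+/// chosen by `TokenClass::separator`) and that token's syntax. Collect the
+/// iterator to build a string:
+/// `let s: String = macro_tokens_to_syntax(&tokens).collect();`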
+pub fn macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator<Item = &str> {
+ input
+ .iter()
+ .take(1)
+ .map(|token| token.syntax.as_str())
+ .chain(input.windows(2).flat_map(|w| {
+ let c0 = (&w[0].token).into();
+ let c1 = (&w[1].token).into();
+ [TokenClass::separator(c0, c1), w[1].syntax.as_str()]
+ }))
}
trait MacroId {
}
}
-type MacroSet = HashMap<UniCase<String>, Macro>;
+pub type MacroSet = HashMap<UniCase<String>, Macro>;
enum ParserState {
/// Accumulating tokens toward the end of any type of argument.
/// Adds `token`, which has the given `syntax`, to the collection of tokens
/// in `self` that potentially need to be macro expanded.
///
- /// Returns `false` if the macro expander needs more tokens, for macro
- /// arguments or to decide whether this is actually a macro invocation. The
- /// caller should call `push` again with the next token.
- ///n
- /// Returns `true` if the macro was complete with `n` tokens. The caller
- /// should call [`Self::expand`] to obtain the expansion. (If `n == 0`,
- /// then the tokens did not actually invoke a macro at all and the expansion
- /// will be empty.)
+ /// Returns [ParseStatus::Incomplete] if the macro expander needs more
+ /// tokens, for macro arguments or to decide whether this is actually a
+ /// macro invocation. The caller should call `push` again with the next
+ /// token.
+ ///
+ /// Returns [ParseStatus::Complete] if the macro invocation is now complete.
+ /// The caller should call [`Self::finish()`] to obtain the expansion.
pub fn push(
&mut self,
token: &Token,
subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output);
subexpander.stack.pop();
e.stack = subexpander.stack;
- let mut output_string = String::new();
- macro_tokens_to_syntax(&mut output, &mut output_string);
- Some(output_string)
+ Some(macro_tokens_to_syntax(&output).collect())
}
-
+
fn expand_head(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
let arg = unquote_string(args.remove(0), e.mode);
let mut output = tokenize_string(&arg, e.mode, e.error);
match &input.0.get(0)?.token {
Token::Id(id) if id.0.starts_with('!') => {
if let Some(param_idx) = macro_.find_parameter(id) {
- let mut s = String::new();
- macro_tokens_to_syntax(
- self.args.unwrap()[param_idx].as_ref().unwrap(),
- &mut s,
- );
input.advance();
- return Some(s);
+ return Some(
+ macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap())
+ .collect(),
+ );
}
if let Some(value) = self.vars.borrow().get(id) {
return Some(value.clone());
if i > 0 {
arg.push(' ')
}
- macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap(), &mut arg);
+ arg.extend(macro_tokens_to_syntax(
+ self.args.unwrap()[i].as_ref().unwrap(),
+ ));
}
input.advance();
return Some(arg);
me.expand(&mut body, output);
}
+ /// Returns the number of tokens consumed from the input for the macro
+ /// invocation. If the result is 0, then there was no macro invocation and
+ /// the expansion will be empty.
pub fn len(&self) -> usize {
self.0.n_tokens
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Point {
/// 1-based line number.
- line: i32,
+ pub line: i32,
/// 1-based column number.
///
- /// Column numbers are measured according to the width of characters as shown in
- /// a typical fixed-width font, in which CJK characters have width 2 and
- /// combining characters have width 0.
- column: Option<i32>,
+ /// Column numbers are measured according to the width of characters as
+ /// shown in a typical fixed-width font, in which CJK characters have width
+ /// 2 and combining characters have width 0, as measured by the
+ /// `unicode_width` crate.
+ pub column: Option<i32>,
}
impl Point {