work on lexer
author Ben Pfaff <blp@cs.stanford.edu>
Sat, 17 Aug 2024 04:45:44 +0000 (21:45 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sat, 17 Aug 2024 04:45:44 +0000 (21:45 -0700)
rust/src/autodecode.rs [deleted file]
rust/src/lex/lexer.rs
rust/src/lex/scan/mod.rs
rust/src/lex/segment/mod.rs
rust/src/lex/segment/test.rs
rust/src/lib.rs
rust/src/macros.rs
rust/src/message.rs

diff --git a/rust/src/autodecode.rs b/rust/src/autodecode.rs
deleted file mode 100644 (file)
index 84a8ec3..0000000
--- a/rust/src/autodecode.rs
+++ /dev/null
@@ -1,168 +0,0 @@
-use chardetng::EncodingDetector;
-use encoding_rs::{Decoder, Encoding};
-use std::io::{BufRead, Read, Result};
-
-struct Autodecode<R>
-where
-    R: Read,
-{
-    inner: R,
-    buffer: Box<[u8]>,
-    state: State,
-}
-
-enum State {
-    /// Stream encoding is not yet known.
-    Auto {
-        detector: EncodingDetector,
-        back: usize,
-        front: usize,
-        ascii: usize,
-    },
-
-    /// Stream encoding is known.
-    Decode(Decoder),
-}
-
-fn read_fully<R>(reader: &mut R, mut buffer: &mut [u8]) -> Result<usize>
-where
-    R: Read,
-{
-    let mut len = 0;
-    while len < buffer.len() {
-        let n = reader.read(&mut buffer[len..])?;
-        if n == 0 {
-            break;
-        }
-        len += n;
-    }
-    Ok(len)
-}
-
-impl<R> Autodecode<R>
-where
-    R: Read,
-{
-    fn new(inner: R) -> Result<Self> {
-        Self::with_capacity(8192, inner)
-    }
-    fn with_capacity(capacity: usize, mut inner: R) -> Result<Self> {
-        let mut buffer = Vec::with_capacity(capacity);
-        buffer.resize(capacity, 0);
-        let len = read_fully(&mut inner, buffer.as_mut_slice())?;
-        let mut detector = EncodingDetector::new();
-        let state = if len < buffer.len() {
-            detector.feed(&buffer[..len], true);
-            State::Decode(detector.guess(None, true).new_decoder_with_bom_removal())
-        } else {
-            let ascii = feed(&mut detector, &buffer[..len], false);
-            State::Auto {
-                detector,
-                back: 0,
-                front: len,
-                ascii,
-            }
-        };
-        Ok(Self {
-            inner,
-            buffer: buffer.into_boxed_slice(),
-            state,
-        })
-    }
-}
-
-impl<R> Read for Autodecode<R>
-where
-    R: Read,
-{
-    fn read(&mut self, outbuf: &mut [u8]) -> Result<usize> {
-        let mut buffer = self.fill_buf()?;
-        let n = buffer.read(outbuf)?;
-        self.consume(n);
-        Ok(n)
-    }
-}
-
-impl<R> BufRead for Autodecode<R>
-where
-    R: Read,
-{
-    fn fill_buf(&mut self) -> Result<&[u8]> {
-        match &mut self.state {
-            State::Auto {
-                detector,
-                back,
-                front,
-                ascii,
-            } => {
-                if back < ascii {
-                    // Consume data up to the first non-ASCII byte.
-                    Ok(&self.buffer[*back..*ascii])
-                } else if ascii < front {
-                    // We had a non-ASCII byte and we consumed everything up to
-                    // it.  We want to get a full buffer starting at the
-                    // non-ASCII byte before we decide on the encoding.
-                    debug_assert_eq!(ascii, back);
-
-                    // Shift buffered data to the beginning of the buffer to
-                    // make room to get a full buffer.
-                    self.buffer.copy_within(*back..*front, 0);
-                    *front -= *back;
-                    *back = 0;
-                    *ascii = 0;
-
-                    // Fill up the remainder of the buffer.
-                    let old_front = *front;
-                    *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?;
-                    detector.feed(&self.buffer[old_front..*front], *front < self.buffer.len());
-                    self.state = State::Decode(
-                        detector.guess(None, true).new_decoder_with_bom_removal(),
-                    );
-                    self.fill_buf()
-                } else {
-                    // We have not had a non-ASCII byte yet but we consumed the
-                    // whole buffer. Read a new one.
-                    *back = 0;
-                    *front = 0;
-                    *ascii = 0;
-                    *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?;
-                    let eof = *front < self.buffer.len();
-                    *ascii = feed(detector, &self.buffer[..*front], eof);
-                    if eof || *ascii == 0 {
-                        self.state = State::Decode(
-                            detector.guess(None, true).new_decoder_with_bom_removal(),
-                        );
-                        self.fill_buf()
-                    } else {
-                        Ok(&self.buffer[..*ascii])
-                    }
-                }
-            }
-            State::Decode(_) => todo!(),
-        }
-    }
-
-    fn consume(&mut self, n: usize) {
-        todo!()
-    }
-}
-
-fn feed(detector: &mut EncodingDetector, buffer: &[u8], last: bool) -> usize {
-    if detector.feed(buffer, last) {
-        Encoding::ascii_valid_up_to(buffer)
-    } else {
-        buffer.len()
-    }
-}
-/*
-                    } else {
-                        debug_assert_eq!(ascii, back);
-                        debug_assert_eq!(back, front);
-                        *back = 0;
-                        *front = 0;
-                        *ascii = 0;
-                        *front += read_fully(&mut self.inner, &mut self.buffer[..])?;
-                        *ascii = feed(detector, &self.buffer[..*front], *front < self.buffer.len());
-                        Ok(&self.buffer[*back..*ascii])
-                    }
-*/
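
For reference, the deleted module combined `chardetng` detection with `encoding_rs` decoding. The same detect-then-decode pattern, reduced to the case where the whole input is already in memory, looks roughly like this (`decode_auto` is an invented name, not code from this repository):

use chardetng::EncodingDetector;

/// Sketch: guess the encoding of `bytes`, then decode it to UTF-8.
fn decode_auto(bytes: &[u8]) -> String {
    let mut detector = EncodingDetector::new();
    detector.feed(bytes, true); // `true`: no more input follows.
    let encoding = detector.guess(None, true); // allow UTF-8 as a guess
    let (text, _encoding_used, _had_errors) = encoding.decode(bytes);
    text.into_owned()
}
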
diff --git a/rust/src/lex/lexer.rs b/rust/src/lex/lexer.rs
index d5728d5cac7fd3e2714f412ba581b25176f00f86..1eb11e61ca6acc61a736d5795af59617e8104816 100644 (file)
@@ -1,17 +1,35 @@
-use std::io::Read;
+use std::{
+    borrow::Borrow,
+    collections::{HashMap, VecDeque},
+    io::Result,
+    num::NonZeroU32,
+    ops::RangeInclusive,
+    sync::Arc,
+};
 
-use encoding_rs::Encoding;
+use encoding_rs::{Encoding, UTF_8};
+use unicode_width::UnicodeWidthStr;
 
-use crate::prompt::PromptStyle;
+use crate::{
+    macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser},
+    message::{Location, Point},
+    prompt::PromptStyle,
+};
 
-use super::segment::Mode;
+use super::{
+    scan::{MergeResult, ScanToken},
+    segment::{Mode, Segment, Segmenter},
+    token::Token,
+};
 
 /// Error handling for a [`Reader`].
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
 pub enum ErrorHandling {
     /// Discard input line and continue reading.
     Terminal,
 
     /// Continue to next command, except for cascading failures.
+    #[default]
     Continue,
 
     /// Continue, even for cascading failures.
@@ -21,11 +39,41 @@ pub enum ErrorHandling {
     Stop,
 }
 
-/// Reads a single syntax file as a stream of bytes encoded in UTF-8.
-pub struct Reader {
-    /// Segmentation mode.
-    mode: Mode,
+pub trait LexRead {
+    /// Read some input from the source. If successful, returns the input that
+    /// was read.  At end of file, returns `Ok(None)`.
+    ///
+    /// `prompt` provides a hint to interactive readers as to what kind of
+    /// syntax is being read right now.
+    fn read(&mut self, prompt: PromptStyle) -> Result<Option<String>>;
+}
+
+impl LexRead for () {
+    fn read(&mut self, _prompt: PromptStyle) -> Result<Option<String>> {
+        Ok(None)
+    }
+}
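
For illustration, a non-interactive reader can ignore the prompt hint entirely; a hypothetical file-backed implementation (not part of this commit) might read one line per call:

struct FileRead(std::io::Lines<std::io::BufReader<std::fs::File>>);

impl LexRead for FileRead {
    fn read(&mut self, _prompt: PromptStyle) -> Result<Option<String>> {
        match self.0.next() {
            // Restore the newline that `lines()` strips.
            Some(line) => Ok(Some(line? + "\n")),
            None => Ok(None),
        }
    }
}
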
 
+///  # Token pipeline
+///
+///  Tokens pass through a pipeline with the following stages.  Each token
+///  eventually made available to the parser passes through all of these stages.
+///  The stages are named after the processing that happens in each one.
+///
+///  Initially, tokens come from the segmenter and scanner to `pp`:
+///
+///  - `pp`: Tokens that need to pass through the macro preprocessor to end up
+///    in `merge`.
+///
+///  - `merge`: Tokens that need to pass through
+///    [`super::scan::ScanToken::merge`] to end up in `parse`.
+///
+///  - `parse`: Tokens available to the client for parsing.
+///
+/// `pp` and `merge` store tokens only temporarily until they pass into `parse`.
+/// Tokens then live in `parse` until the command is fully consumed, at which
+/// time they are freed together.
+struct Source {
     /// Error-handling mode.
     error_handling: ErrorHandling,
 
@@ -33,20 +81,373 @@ pub struct Reader {
     encoding: &'static Encoding,
 
     /// `None` if this reader is not associated with a file.
-    file_name: Option<String>,
+    file_name: Option<Arc<String>>,
 
-    /// Zero if there's no line number.
-    line_number: u32,
+    /// 1-based line number, if any.
+    line_number: Option<NonZeroU32>,
 
     /// True if we've reached EOF already.
     eof: bool,
 
     /// Reads UTF-8 bytes.
-    reader: dyn LexRead,
+    reader: Box<dyn LexRead>,
+
+    /// Source file contents.
+    buffer: String,
+
+    /// 0-based line number of the first line not yet written to the journal.
+    journal_line: usize,
+
+    /// Byte offset of the first character not yet scanned as a token.
+    seg_pos: usize,
+
+    /// Byte offsets into `buffer` of starts of lines.  The first element is 0.
+    lines: Vec<usize>,
+
+    /// Tokens that need to pass through the macro preprocessor to end up in
+    /// `merge`.
+    pp: VecDeque<LexToken>,
+
+    /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to
+    /// end up in `parse`.
+    merge: VecDeque<LexToken>,
+
+    /// Tokens available to the client for parsing.
+    parse: Vec<LexToken>,
+
+    /// Offset in `parse` of the current token.
+    parse_ofs: usize,
+
+    segmenter: Segmenter,
+
+    suppress_next_newline: bool,
+}
+
+impl Source {
+    fn empty() -> Self {
+        Self {
+            error_handling: ErrorHandling::default(),
+            encoding: UTF_8,
+            file_name: None,
+            line_number: None,
+            eof: true,
+            reader: Box::new(()),
+            buffer: String::new(),
+            journal_line: 0,
+            seg_pos: 0,
+            lines: vec![0],
+            pp: VecDeque::new(),
+            merge: VecDeque::new(),
+            parse: Vec::new(),
+            parse_ofs: 0,
+            segmenter: Segmenter::new(Mode::default(), false),
+            suppress_next_newline: false,
+        }
+    }
+
+    fn read(&mut self) {
+        todo!()
+    }
+    fn try_get_pp(&mut self) -> bool {
+        let (seg_len, seg_type) = loop {
+            if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) {
+                break result;
+            }
+
+            debug_assert!(!self.eof);
+            self.read();
+        };
+
+        let pos = self.seg_pos..self.seg_pos + seg_len;
+        self.seg_pos += seg_len;
+        if seg_type == Segment::Newline {
+            self.lines.push(self.seg_pos);
+        }
+
+        let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type);
+
+        let n_lines = match (seg_type, self.suppress_next_newline) {
+            (Segment::EndCommand, false) => {
+                self.suppress_next_newline = true;
+                1
+            }
+            (Segment::Newline, true) => {
+                self.suppress_next_newline = false;
+                0
+            }
+            (Segment::Newline, false) => 1,
+            _ => 0,
+        };
+        for line_num in self.journal_line..self.journal_line + n_lines {
+            let start_ofs = self.lines[line_num];
+            let end_ofs = self
+                .lines
+                .get(line_num + 1)
+                .copied()
+                .unwrap_or(self.buffer.len());
+            let line = &self.buffer[start_ofs..end_ofs];
+            let _line = line
+                .strip_suffix("\r\n")
+                .unwrap_or(line.strip_suffix('\n').unwrap_or(line));
+            // XXX submit the line as syntax
+        }
+        self.journal_line += n_lines;
+
+        let pos = pos.start..=pos.end - 1;
+        match scan_token {
+            None => false,
+            Some(ScanToken::Token(Token::End)) => {
+                self.pp.push_back(LexToken {
+                    token: Token::EndCommand,
+                    pos,
+                    macro_rep: None,
+                });
+                self.eof = true;
+                true
+            }
+            Some(ScanToken::Token(token)) => {
+                self.pp.push_back(LexToken {
+                    token,
+                    pos,
+                    macro_rep: None,
+                });
+                true
+            }
+            Some(ScanToken::Error(_error)) => {
+                // XXX report error
+                false
+            }
+        }
+    }
+
+    fn get_pp(&mut self) -> bool {
+        while !self.eof {
+            if self.try_get_pp() {
+                return true;
+            }
+        }
+        false
+    }
+
+    fn try_get_merge(&mut self) -> bool {
+        if self.pp.is_empty() && !self.get_pp() {
+            return false;
+        }
+
+        const MEXPAND: bool = true;
+
+        if !MEXPAND {
+            self.merge.append(&mut self.pp);
+            return true;
+        }
+
+        // Now pass tokens one-by-one to the macro expander.
+        let Some(mut parser) = Parser::new(todo!(), &self.pp[0].token) else {
+            // Common case where there is no macro to expand.
+            self.merge.push_back(self.pp.pop_front().unwrap());
+            return true;
+        };
+        for ofs in 1.. {
+            if self.pp.len() <= ofs && !self.get_pp() {
+                // This should not be reachable because we always get a
+                // `Token::EndCommand` at the end of an input file, which should
+                // always terminate macro expansion.
+                unreachable!();
+            }
+            let token = &self.pp[ofs];
+            if parser.push(todo!(), &self.buffer[token.pos.clone()], &|e| println!("{e:?}"))
+                == ParseStatus::Complete
+            {
+                break;
+            }
+        }
+        let call = parser.finish();
+        if call.len() == 0 {
+            // False alarm: no macro to expand after all.
+            self.merge.push_back(self.pp.pop_front().unwrap());
+            return true;
+        }
+
+        // Expand the tokens.
+        let c0 = &self.pp[0];
+        let c1 = &self.pp[call.len() - 1];
+        let mut expansion = Vec::new();
+        call.expand(
+            self.segmenter.mode(),
+            self.token_location(c0..=c1),
+            &mut expansion,
+            |e| println!("{e:?}"),
+        );
+
+        const MPRINT: bool = false;
+        if MPRINT {
+            // XXX
+        }
+
+        // Append the macro expansion tokens to the lookahead.
+        let macro_rep = Arc::new(macro_tokens_to_syntax(expansion.as_slice()).collect());
+        for token in expansion {
+            let lt = LexToken {
+                token: token.token,
+                pos: todo!(),
+                macro_rep: Some(MacroRepresentation {
+                    expansion: Arc::clone(&macro_rep),
+                    pos: todo!(),
+                }),
+            };
+        }
+        todo!()
+    }
+
+    /// Attempts to obtain at least one new token into `self.merge`.
+    ///
+    /// Returns true if successful, false on failure.  In the latter case, this
+    /// source is exhausted and `self.eof` is now true.
+    fn get_merge(&mut self) -> bool {
+        while !self.eof {
+            if self.try_get_merge() {
+                return true;
+            }
+        }
+        false
+    }
+
+    fn get_parse__(&mut self) -> bool {
+        for i in 0.. {
+            if self.merge.len() <= i && !self.get_merge() {
+                // We always get a `Token::EndCommand` at the end of an input
+                // file and the merger should return `Some(...)` for that token.
+                debug_assert_eq!(self.merge.len(), 0);
+                return false;
+            }
+
+            match ScanToken::merge(&self.merge) {
+                None => (),
+                Some(MergeResult::Copy) => {
+                    self.parse.push(self.merge.pop_front().unwrap());
+                    return true;
+                }
+                Some(MergeResult::Expand { n, token }) => {
+                    let first = &self.merge[0];
+                    let last = &self.merge[n - 1];
+                    self.parse.push(LexToken {
+                        token,
+                        pos: *first.pos.start()..=*last.pos.end(),
+                        macro_rep: match (&first.macro_rep, &last.macro_rep) {
+                            (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
+                                Some(MacroRepresentation {
+                                    expansion: a.expansion.clone(),
+                                    pos: *a.pos.start()..=*b.pos.end(),
+                                })
+                            }
+                            _ => None,
+                        },
+                    });
+                    self.merge.drain(..n);
+                    return true;
+                }
+            }
+        }
+        unreachable!();
+    }
+    fn get_parse(&mut self) -> bool {
+        todo!()
+    }
+
+    fn offset_to_point(&self, offset: usize) -> Point {
+        let line = self
+            .lines
+            .partition_point(|&line_start| line_start <= offset);
+        Point {
+            line: line as i32,
+            column: Some(
+                self.buffer
+                    .get(self.lines[line - 1]..offset)
+                    .unwrap_or_default()
+                    .width() as i32 + 1,
+            ),
+        }
+    }
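
As a worked example of the arithmetic above, restated as a free function under the same assumptions (since `Source` is private): with buffer "ab\ncd" and line starts [0, 3], offset 4 falls on line 2, and the display width of the preceding text "c" makes the 1-based column 2.

use unicode_width::UnicodeWidthStr;

fn offset_to_line_column(buffer: &str, lines: &[usize], offset: usize) -> (usize, usize) {
    let line = lines.partition_point(|&start| start <= offset);
    let column = buffer[lines[line - 1]..offset].width() + 1;
    (line, column)
}

// offset_to_line_column("ab\ncd", &[0, 3], 4) == (2, 2)
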
+    fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
+        Location {
+            file_name: self.file_name.clone(),
+            span: Some(
+                self.offset_to_point(*range.start().pos.start())
+                    ..=self.offset_to_point(*range.end().pos.end()),
+            ),
+            omit_underlines: false,
+        }
+    }
 }
 
-pub trait LexRead: Read {
-    /// Tells the reader what kind of prompt is appropriate for the next
-    /// read. Non-interactive readers can ignore this.
-    fn set_prompt_style(&mut self, _prompt: PromptStyle) {}
+/// A token in a [`Source`].
+struct LexToken {
+    /// The regular token.
+    token: Token,
+
+    /// For a token obtained through the lexer in an ordinary way, this is the
+    /// location of the token in the [`Source`]'s buffer.
+    ///
+    /// For a token produced through macro expansion, this is the entire macro
+    /// call.
+    pos: RangeInclusive<usize>,
+
+    /// For a token obtained through macro expansion, the part of the macro
+    /// expansion that represents this token.
+    ///
+    /// For a token obtained through the lexer in an ordinary way, this is
+    /// `None`.
+    macro_rep: Option<MacroRepresentation>,
+}
+
+impl Borrow<Token> for LexToken {
+    fn borrow(&self) -> &Token {
+        &self.token
+    }
+}
+
+struct MacroRepresentation {
+    /// An entire macro expansion.
+    expansion: Arc<String>,
+
+    /// The substring of `expansion` that represents a single token.
+    pos: RangeInclusive<usize>,
+}
+
+pub struct Lexer {
+    source: Source,
+    stack: Vec<Source>,
+    macros: MacroSet,
+}
+
+impl Lexer {
+    pub fn new() -> Self {
+        Self {
+            source: Source::empty(),
+            stack: Vec::new(),
+            macros: HashMap::new(),
+        }
+    }
+
+    pub fn get(&mut self) {
+        if self.source.parse_ofs < self.source.parse.len() {
+            if let Token::EndCommand = self.source.parse[self.source.parse_ofs].token {
+                self.source.parse.clear();
+                self.source.parse_ofs = 0;
+            } else {
+                self.source.parse_ofs += 1;
+            }
+        }
+
+        while self.source.parse_ofs == self.source.parse.len() {
+            if !self.source.get_parse() {
+                match self.stack.pop() {
+                    Some(source) => self.source = source,
+                    None => {
+                        self.source = Source::empty();
+                        return;
+                    }
+                }
+            }
+        }
+    }
 }
diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs
index 5f67819bfda1b83d36654e1c9a14e6763142b2d3..05577a92591377cceb59016266fb7959c85c1e45 100644 (file)
@@ -16,7 +16,7 @@ use super::{
     segment::{Mode, Segment, Segmenter},
     token::{Punct, Token},
 };
-use std::collections::VecDeque;
+use std::{borrow::Borrow, collections::VecDeque};
 use thiserror::Error as ThisError;
 
 #[derive(ThisError, Clone, Debug, PartialEq, Eq)]
@@ -66,12 +66,29 @@ pub enum ScanError {
     UnexpectedChar(char),
 }
 
+/// The input or output to token merging.
 #[derive(Clone, Debug, PartialEq)]
 pub enum ScanToken {
     Token(Token),
     Error(ScanError),
 }
 
+/// The result of merging tokens.
+#[derive(Clone, Debug)]
+pub enum MergeResult {
+    /// Copy one token literally from input to output.
+    Copy,
+
+    /// Expand `n` tokens from the input into `token` in the output.
+    Expand {
+        /// Number of tokens to expand.
+        n: usize,
+
+        /// Replacement token.
+        token: Token,
+    },
+}
+
 impl ScanToken {
     pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
         match segment {
@@ -202,8 +219,8 @@ impl ScanToken {
                 "%" => Some(Self::Token(Token::Punct(Punct::Percent))),
                 "?" => Some(Self::Token(Token::Punct(Punct::Question))),
                 "`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
-                "_" =>Some(Self::Token(Token::Punct(Punct::Underscore))),
-                "." =>Some(Self::Token(Token::Punct(Punct::Dot))),
+                "_" => Some(Self::Token(Token::Punct(Punct::Underscore))),
+                "." => Some(Self::Token(Token::Punct(Punct::Dot))),
                 "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
                 _ => unreachable!("bad punctuator {s:?}"),
             },
@@ -213,7 +230,9 @@ impl ScanToken {
             | Segment::Newline
             | Segment::CommentCommand => None,
             Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
-            Segment::StartDocument => Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap()))),
+            Segment::StartDocument => {
+                Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())))
+            }
             Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
                 Some(Self::Token(Token::EndCommand))
             }
@@ -248,47 +267,62 @@ impl ScanToken {
     ///     white space, as a negative number.  It's only needed if we want
     ///     intervening comments to be allowed or for part of the negative number
     ///     token to be produced by macro expansion.
-    pub fn merge(input: &mut VecDeque<ScanToken>) -> Option<ScanToken> {
-        match input.get(0)? {
-            ScanToken::Token(Token::Punct(Punct::Dash)) => match input.get(1)? {
-                ScanToken::Token(Token::Number(number)) if number.is_sign_positive() => {
+    pub fn merge<T>(tokens: &T) -> Option<MergeResult>
+    where
+        T: Tokens,
+    {
+        match tokens.get(0)? {
+            Token::Punct(Punct::Dash) => match tokens.get(1)? {
+                Token::Number(number) if number.is_sign_positive() => {
                     let number = *number;
-                    input.pop_front().unwrap();
-                    input.pop_front().unwrap();
-                    return Some(ScanToken::Token(Token::Number(-number)));
+                    return Some(MergeResult::Expand {
+                        n: 2,
+                        token: Token::Number(-number),
+                    });
                 }
-                _ => Some(input.pop_front().unwrap()),
+                _ => Some(MergeResult::Copy),
             },
-            ScanToken::Token(Token::String(_)) => {
+            Token::String(_) => {
                 let mut i = 0;
-                while matches!(
-                    input.get(i * 2 + 1)?,
-                    ScanToken::Token(Token::Punct(Punct::Plus))
-                ) && matches!(input.get(i * 2 + 2)?, ScanToken::Token(Token::String(_)))
+                while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
+                    && matches!(tokens.get(i * 2 + 2)?, Token::String(_))
                 {
                     i += 1;
                 }
                 if i == 0 {
-                    Some(input.pop_front().unwrap())
+                    Some(MergeResult::Copy)
                 } else {
                     let mut output = String::new();
                     for i in 0..=i {
-                        let ScanToken::Token(Token::String(s)) = &input[i * 2] else {
+                        let Token::String(s) = tokens.get(i * 2).unwrap() else {
                             unreachable!()
                         };
                         output.push_str(&s);
                     }
-                    for _ in 0..i * 2 + 1 {
-                        input.pop_front().unwrap();
-                    }
-                    Some(ScanToken::Token(Token::String(output)))
+                    Some(MergeResult::Expand {
+                        n: i * 2 + 1,
+                        token: Token::String(output),
+                    })
                 }
             }
-            _ => Some(input.pop_front().unwrap()),
+            _ => Some(MergeResult::Copy),
         }
     }
 }
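
For illustration (types from this file; assuming `Token::Number` holds an `f64`, as the match arms above suggest), a dash followed by a positive number merges into a single negative-number token:

use std::collections::VecDeque;

let mut tokens: VecDeque<Token> =
    VecDeque::from([Token::Punct(Punct::Dash), Token::Number(1.0)]);
if let Some(MergeResult::Expand { n, token }) = ScanToken::merge(&tokens) {
    tokens.drain(..n);
    tokens.push_front(token); // `tokens` is now just [Token::Number(-1.0)].
}
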
 
+pub trait Tokens {
+    fn get(&self, index: usize) -> Option<&Token>;
+}
+
+impl<T> Tokens for VecDeque<T>
+where
+    T: Borrow<Token>,
+{
+    fn get(&self, index: usize) -> Option<&Token> {
+        self.get(index).map(|token| token.borrow())
+    }
+}
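
The `Borrow<Token>` bound is what lets `merge` see both bare `Token`s (as in `StringScanner` below) and the lexer's richer token type without copying. A hypothetical second implementation for slices, not in this commit, follows the same pattern:

impl<T> Tokens for [T]
where
    T: Borrow<Token>,
{
    fn get(&self, index: usize) -> Option<&Token> {
        // The inherent `<[T]>::get` is chosen here, so this does not recurse.
        self.get(index).map(|token| token.borrow())
    }
}
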
+
 pub struct StringSegmenter<'a> {
     input: &'a str,
     segmenter: Segmenter,
@@ -308,14 +342,14 @@ impl<'a> Iterator for StringSegmenter<'a> {
 
     fn next(&mut self) -> Option<Self::Item> {
         loop {
-            let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
-            if segment == Segment::End {
+            let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+            if seg_type == Segment::End {
                 return None;
             }
-            let s = &self.input[..self.input.len() - rest.len()];
+            let (s, rest) = self.input.split_at(seg_len);
             self.input = rest;
 
-            if let Some(token) = ScanToken::from_segment(s, segment) {
+            if let Some(token) = ScanToken::from_segment(s, seg_type) {
                 return Some((s, token));
             }
         }
@@ -325,7 +359,7 @@ impl<'a> Iterator for StringSegmenter<'a> {
 pub struct StringScanner<'a> {
     input: &'a str,
     segmenter: Segmenter,
-    tokens: VecDeque<ScanToken>,
+    tokens: VecDeque<Token>,
 }
 
 impl<'a> StringScanner<'a> {
@@ -336,28 +370,43 @@ impl<'a> StringScanner<'a> {
             tokens: VecDeque::with_capacity(1),
         }
     }
+
+    fn merge(&mut self) -> Option<ScanToken> {
+        let result = ScanToken::merge(&self.tokens)?;
+        match result {
+            MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())),
+            MergeResult::Expand { n, token } => {
+                self.tokens.drain(..n);
+                Some(ScanToken::Token(token))
+            }
+        }
+    }
 }
 
 impl<'a> Iterator for StringScanner<'a> {
     type Item = ScanToken;
 
     fn next(&mut self) -> Option<Self::Item> {
-        if let Some(token) = ScanToken::merge(&mut self.tokens) {
+        if let Some(token) = self.merge() {
             return Some(token);
         }
         loop {
-            let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
-            if segment == Segment::End && self.tokens.is_empty() {
+            let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+            if seg_type == Segment::End && self.tokens.is_empty() {
                 return None;
             }
-            let s = &self.input[..self.input.len() - rest.len()];
+            let (s, rest) = self.input.split_at(seg_len);
             self.input = rest;
 
-            if let Some(token) = ScanToken::from_segment(s, segment) {
-                self.tokens.push_back(token);
-                if let Some(token) = ScanToken::merge(&mut self.tokens) {
-                    return Some(token);
+            match ScanToken::from_segment(s, seg_type) {
+                Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)),
+                Some(ScanToken::Token(token)) => {
+                    self.tokens.push_back(token);
+                    if let Some(token) = self.merge() {
+                        return Some(token);
+                    }
                 }
+                None => (),
             }
         }
     }
diff --git a/rust/src/lex/segment/mod.rs b/rust/src/lex/segment/mod.rs
index f53b46eb30aaac5bcc4893a5ac77477034f79448..de682ac5dc4b6e341c0c19259db531142aae8433 100644 (file)
@@ -212,7 +212,7 @@ impl Segmenter {
     /// consumed, must not be provided with *different* values on subsequent
     /// calls.  This is because the function must often make decisions based on
     /// looking ahead beyond the bytes that it consumes.
-    pub fn push<'a>(
+    fn push_rest<'a>(
         &mut self,
         input: &'a str,
         eof: bool,
@@ -258,6 +258,11 @@ impl Segmenter {
             State::BeginData4 => self.parse_begin_data_4(input, eof),
         }
     }
+
+    pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> {
+        let (rest, seg_type) = self.push_rest(input, eof)?;
+        Ok((input.len() - rest.len(), seg_type))
+    }
 }
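
With the new signature, callers slice the input themselves instead of receiving the remainder; a minimal usage sketch:

let mut segmenter = Segmenter::new(Mode::Interactive, false);
let input = "DATA LIST.\n";
let (seg_len, seg_type) = segmenter.push(input, true).unwrap();
let (segment_text, rest) = input.split_at(seg_len);
// `segment_text` is the first segment's text; pass `rest` to the next call.
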
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -458,7 +463,7 @@ impl Segmenter {
             State::General,
             Substate::START_OF_COMMAND | Substate::START_OF_LINE,
         );
-        self.push(input, eof)
+        self.push_rest(input, eof)
     }
     fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
         match self.mode {
@@ -641,11 +646,12 @@ impl Segmenter {
     ) -> Result<(&'a str, &'a str), Incomplete> {
         let mut sub = Segmenter::new(self.mode, true);
         loop {
-            let (rest, segment) = sub.push(input, eof)?;
-            match segment {
+            let (seg_len, seg_type) = sub.push(input, eof)?;
+            let (segment, rest) = input.split_at(seg_len);
+            match seg_type {
                 Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (),
 
-                Segment::Identifier => return Ok((&input[..input.len() - rest.len()], rest)),
+                Segment::Identifier => return Ok((segment, rest)),
 
                 Segment::Number
                 | Segment::QuotedString
@@ -916,7 +922,7 @@ impl Segmenter {
             state: (State::General, self.state.1),
             ..*self
         };
-        let (rest, segment) = sub.push(input, eof)?;
+        let (rest, segment) = sub.push_rest(input, eof)?;
         if segment == Segment::Identifier {
             let id = &input[..input.len() - rest.len()];
             debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
@@ -970,7 +976,7 @@ impl Segmenter {
             state: (State::General, self.state.1),
             nest: 0,
         };
-        let result = sub.push(input, eof)?;
+        let result = sub.push_rest(input, eof)?;
         self.state.1 = sub.state.1;
         Ok(result)
     }
@@ -1085,7 +1091,7 @@ impl Segmenter {
                     State::General,
                     Substate::START_OF_COMMAND | Substate::START_OF_LINE,
                 );
-                return self.push(input, eof);
+                return self.push_rest(input, eof);
             }
         }
         return Ok((rest, Segment::DoRepeatCommand));
@@ -1200,7 +1206,7 @@ impl Segmenter {
             let (prefix, rest) = input.split_at(line.len() - end.len());
             if prefix.is_empty() {
                 // Line starts with `!ENDDEFINE`.
-                self.push(input, eof)
+                self.push_rest(input, eof)
             } else if prefix.trim_start().is_empty() {
                 // Line starts with spaces followed by `!ENDDEFINE`.
                 Ok((rest, Segment::Spaces))
@@ -1297,7 +1303,7 @@ impl Segmenter {
                 State::General,
                 Substate::START_OF_COMMAND | Substate::START_OF_LINE,
             );
-            self.push(input, eof)
+            self.push_rest(input, eof)
         } else {
             self.state.0 = State::BeginData4;
             Ok((rest, Segment::InlineData))
diff --git a/rust/src/lex/segment/test.rs b/rust/src/lex/segment/test.rs
index 05f0a23d6e59fb6848961924de8d8faf45713c63..d8c337dcdfd1536565f9516c0a2fa28a88dab301 100644 (file)
@@ -6,11 +6,11 @@ fn push_segment<'a>(
     segmenter: &mut Segmenter,
     input: &'a str,
     one_byte: bool,
-) -> (&'a str, Segment) {
+) -> (usize, Segment) {
     if one_byte {
         for len in input.char_indices().map(|(pos, _c)| pos) {
-            if let Ok((rest, segment)) = segmenter.push(&input[..len], false) {
-                return (&input[len - rest.len()..], segment);
+            if let Ok(result) = segmenter.push(&input[..len], false) {
+                return result;
             }
         }
     }
@@ -28,11 +28,10 @@ fn _check_segmentation(
     let mut prompts = Vec::new();
     let mut segmenter = Segmenter::new(mode, false);
     loop {
-        let (rest, segment) = push_segment(&mut segmenter, input, one_byte);
-        let len = input.len() - rest.len();
-        let token = &input[..len];
-        segments.push((segment, token));
-        match segment {
+        let (seg_len, seg_type) = push_segment(&mut segmenter, input, one_byte);
+        let (token, rest) = input.split_at(seg_len);
+        segments.push((seg_type, token));
+        match seg_type {
             Segment::End => break,
             Segment::Newline => prompts.push(segmenter.prompt()),
             _ => (),
@@ -118,11 +117,10 @@ fn check_segmentation(
 fn print_segmentation(mut input: &str) {
     let mut segmenter = Segmenter::new(Mode::Interactive, false);
     loop {
-        let (rest, segment) = segmenter.push(input, true).unwrap();
-        let len = input.len() - rest.len();
-        let token = &input[..len];
-        print!("{segment:?} {token:?}");
-        match segment {
+        let (seg_len, seg_type) = segmenter.push(input, true).unwrap();
+        let (token, rest) = input.split_at(seg_len);
+        print!("{seg_type:?} {token:?}");
+        match seg_type {
             Segment::Newline => print!(" ({:?})", segmenter.prompt()),
             Segment::End => break,
             _ => (),
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 32d508d6ac94427d89ab51215fc17b8040398f2a..46fe08622ad6346cf731e7a42a817696f1617292 100644 (file)
@@ -13,4 +13,3 @@ pub mod lex;
 pub mod prompt;
 pub mod message;
 pub mod macros;
-pub mod autodecode;
diff --git a/rust/src/macros.rs b/rust/src/macros.rs
index 9bfaf28d94522d74b77c3dd9f9bb6e4b049806aa..f00262aac63cea9032211ccbdeab76e4c80de61c 100644 (file)
@@ -245,13 +245,18 @@ enum ValueType {
 #[derive(Clone)]
 pub struct MacroToken {
     /// The token.
-    token: Token,
+    pub token: Token,
 
     /// The syntax that produces `token`.
-    syntax: String,
+    pub syntax: String,
 }
 
-fn tokenize_string_into(s: &str, mode: Mode, error: &impl Fn(MacroError),  output: &mut Vec<MacroToken>) {
+fn tokenize_string_into(
+    s: &str,
+    mode: Mode,
+    error: &impl Fn(MacroError),
+    output: &mut Vec<MacroToken>,
+) {
     for (syntax, token) in StringSegmenter::new(s, mode, true) {
         match token {
             ScanToken::Token(token) => output.push(MacroToken {
@@ -352,21 +357,20 @@ enum TokenClass {
 }
 
 impl TokenClass {
-    fn needs_space(prev: Self, next: Self) -> bool {
+    fn separator(prev: Self, next: Self) -> &'static str {
         match (prev, next) {
-            // Don't need a space before or after the end of a command.  (A
-            // new-line is needed afterward as a special case.)
-            (Self::EndCommand, _) | (_, Self::EndCommand) => false,
-
-            // Binary operators always have a space on both sides.
-            (Self::BinaryOperator, _) | (_, Self::BinaryOperator) => true,
-
-            // A comma always has a space afterward.
-            (Self::Comma, _) => true,
-
-            // Otherwise, `prev` is `Self::BinaryOperator` or `Self::Punct`,
-            // which only need a space if there are two or them in a row.
-            _ => prev == next,
+            // Don't need a separator before the end of a command, but we
+            // need a new-line afterward.
+            (_, Self::EndCommand) => "",
+            (Self::EndCommand, _) => "\n",
+
+            // Binary operators always have a space on both sides, and a comma
+            // always has a space afterward.
+            (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ",
+
+            // Otherwise, `prev` is `Self::Punct`, which only needs a space if
+            // there are two of them in a row.
+            (Self::Punct, Self::Punct) => " ",
+            _ => "",
         }
     }
 }
@@ -419,23 +423,16 @@ impl From<&Token> for TokenClass {
     }
 }
 
-fn macro_tokens_to_syntax(input: &[MacroToken], output: &mut String) {
-    for (i, token) in input.iter().enumerate() {
-        if i > 0 {
-            let prev = &input[i].token;
-            let next = &token.token;
-            if let Token::EndCommand = prev {
-                output.push('\n');
-            } else {
-                let prev_class: TokenClass = prev.into();
-                let next_class: TokenClass = next.into();
-                if TokenClass::needs_space(prev_class, next_class) {
-                    output.push(' ')
-                }
-            }
-            output.push_str(&token.syntax);
-        }
-    }
+pub fn macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator<Item = &str> {
+    input
+        .iter()
+        .take(1)
+        .map(|token| token.syntax.as_str())
+        .chain(input.windows(2).flat_map(|w| {
+            let c0 = (&w[0].token).into();
+            let c1 = (&w[1].token).into();
+            [TokenClass::separator(c0, c1), w[1].syntax.as_str()]
+        }))
 }
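
The iterator interleaves each token's syntax with the separator chosen by `TokenClass::separator`, so collecting it rebuilds the source text. A hedged usage sketch, given some `tokens: &[MacroToken]` and assuming `+` classifies as a binary operator:

let syntax: String = macro_tokens_to_syntax(&tokens).collect();
// Tokens spelled "x", "+", "1" would yield "x + 1".
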
 
 trait MacroId {
@@ -499,7 +496,7 @@ impl RelOp {
     }
 }
 
-type MacroSet = HashMap<UniCase<String>, Macro>;
+pub type MacroSet = HashMap<UniCase<String>, Macro>;
 
 enum ParserState {
     /// Accumulating tokens toward the end of any type of argument.
@@ -705,14 +702,13 @@ impl<'a> Parser<'a> {
     /// Adds `token`, which has the given `syntax`, to the collection of tokens
     /// in `self` that potentially need to be macro expanded.
     ///
-    /// Returns `false` if the macro expander needs more tokens, for macro
-    /// arguments or to decide whether this is actually a macro invocation.  The
-    /// caller should call `push` again with the next token.
-    ///n
-    /// Returns `true` if the macro was complete with `n` tokens.  The caller
-    /// should call [`Self::expand`] to obtain the expansion.  (If `n == 0`,
-    /// then the tokens did not actually invoke a macro at all and the expansion
-    /// will be empty.)
+    /// Returns [ParseStatus::Incomplete] if the macro expander needs more
+    /// tokens, for macro arguments or to decide whether this is actually a
+    /// macro invocation.  The caller should call `push` again with the next
+    /// token.
+    ///
+    /// Returns [ParseStatus::Complete] if the macro invocation is now complete.
+    /// The caller should call [`Self::finish()`] to obtain the expansion.
     pub fn push(
         &mut self,
         token: &Token,
@@ -1000,11 +996,9 @@ impl<'a> Expander<'a> {
         subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output);
         subexpander.stack.pop();
         e.stack = subexpander.stack;
-        let mut output_string = String::new();
-        macro_tokens_to_syntax(&mut output, &mut output_string);
-        Some(output_string)
+        Some(macro_tokens_to_syntax(&output).collect())
     }
-    
+
     fn expand_head(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
         let arg = unquote_string(args.remove(0), e.mode);
         let mut output = tokenize_string(&arg, e.mode, e.error);
@@ -1165,13 +1159,11 @@ impl<'a> Expander<'a> {
             match &input.0.get(0)?.token {
                 Token::Id(id) if id.0.starts_with('!') => {
                     if let Some(param_idx) = macro_.find_parameter(id) {
-                        let mut s = String::new();
-                        macro_tokens_to_syntax(
-                            self.args.unwrap()[param_idx].as_ref().unwrap(),
-                            &mut s,
-                        );
                         input.advance();
-                        return Some(s);
+                        return Some(
+                            macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap())
+                                .collect(),
+                        );
                     }
                     if let Some(value) = self.vars.borrow().get(id) {
                         return Some(value.clone());
@@ -1190,7 +1182,9 @@ impl<'a> Expander<'a> {
                         if i > 0 {
                             arg.push(' ')
                         }
-                        macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap(), &mut arg);
+                        arg.extend(macro_tokens_to_syntax(
+                            self.args.unwrap()[i].as_ref().unwrap(),
+                        ));
                     }
                     input.advance();
                     return Some(arg);
@@ -1660,6 +1654,9 @@ impl<'a> Call<'a> {
         me.expand(&mut body, output);
     }
 
+    /// Returns the number of tokens consumed from the input for the macro
+    /// invocation. If the result is 0, then there was no macro invocation and
+    /// the expansion will be empty.
     pub fn len(&self) -> usize {
         self.0.n_tokens
     }
diff --git a/rust/src/message.rs b/rust/src/message.rs
index 757ea7862573e5ed517ef70d75309a6618b2ebbd..52386910318cfec27cc3c8bb39c3e4ac78e5869b 100644 (file)
@@ -12,14 +12,15 @@ use unicode_width::UnicodeWidthStr;
 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub struct Point {
     /// 1-based line number.
-    line: i32,
+    pub line: i32,
 
     /// 1-based column number.
     ///
-    /// Column numbers are measured according to the width of characters as shown in
-    /// a typical fixed-width font, in which CJK characters have width 2 and
-    /// combining characters have width 0.
-    column: Option<i32>,
+    /// Column numbers are measured according to the width of characters as
+    /// shown in a typical fixed-width font, in which CJK characters have width
+    /// 2 and combining characters have width 0, as measured by the
+    /// `unicode_width` crate.
+    pub column: Option<i32>,
 }
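
Concretely, the widths referred to here are those reported by the `unicode_width` crate; a few illustrative values:

use unicode_width::UnicodeWidthStr;

assert_eq!("abc".width(), 3);
assert_eq!("你好".width(), 4); // CJK characters occupy 2 columns each.
assert_eq!("e\u{301}".width(), 1); // a combining accent adds 0.
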
 
 impl Point {