From 4559955df51f9a9caec91aac78ce359e751aca4c Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 17 Aug 2024 22:15:19 -0700 Subject: [PATCH] work --- rust/src/lex/lexer.rs | 377 ++++++++++++++++++++++++++++++++++-------- rust/src/message.rs | 21 ++- 2 files changed, 319 insertions(+), 79 deletions(-) diff --git a/rust/src/lex/lexer.rs b/rust/src/lex/lexer.rs index 00611ffb9f..9c53b40830 100644 --- a/rust/src/lex/lexer.rs +++ b/rust/src/lex/lexer.rs @@ -1,12 +1,17 @@ use std::{ borrow::Borrow, collections::{HashMap, VecDeque}, - num::NonZeroU32, - ops::RangeInclusive, + fs, + io::Result as IoResult, + mem, + ops::{Range, RangeInclusive}, + path::Path, sync::Arc, }; +use chardetng::EncodingDetector; use encoding_rs::{Encoding, UTF_8}; +use thiserror::Error as ThisError; use unicode_width::UnicodeWidthStr; use crate::{ @@ -16,7 +21,7 @@ use crate::{ }; use super::{ - scan::{MergeResult, ScanToken}, + scan::{MergeResult, ScanError, ScanToken}, segment::{Mode, Segment, Segmenter}, token::Token, }; @@ -38,21 +43,6 @@ pub enum ErrorHandling { Stop, } -pub trait LexRead { - /// Read some input from the source. If successful, returns the input that - /// was read. At end of file or on error, returns an empty string. - /// - /// `prompt` provides a hint to interactive readers as to what kind of - /// syntax is being read right now. - fn read(&mut self, prompt: PromptStyle) -> String; -} - -impl LexRead for () { - fn read(&mut self, _prompt: PromptStyle) -> String { - String::from("") - } -} - /// # Token pipeline /// /// Tokens pass through a pipeline with the following stages. Each token @@ -72,24 +62,25 @@ impl LexRead for () { /// `pp` and `merge` store tokens only temporarily until they pass into `parse`. /// Tokens then live in `parse` until the command is fully consumed, at which /// time they are freed together. -struct Source { +pub struct Source { /// Error-handling mode. error_handling: ErrorHandling, - /// Encoding (although the reader must always produce UTF-8). + /// Encoding. encoding: &'static Encoding, /// `None` if this reader is not associated with a file. file_name: Option>, - /// 1-based line number, if any. - line_number: Option, - /// True if we've reached EOF already. eof: bool, - /// Reads UTF-8 bytes. - reader: Box, + /// Read some input from the source. If successful, returns the input that + /// was read. At end of file or on error, returns an empty string. + /// + /// `prompt` provides a hint to interactive readers as to what kind of + /// syntax is being read right now. + read: Box String>, /// Source file contents. buffer: String, @@ -122,15 +113,14 @@ struct Source { suppress_next_newline: bool, } -impl Source { - fn empty() -> Self { +impl Default for Source { + fn default() -> Self { Self { error_handling: ErrorHandling::default(), encoding: UTF_8, file_name: None, - line_number: None, - eof: true, - reader: Box::new(()), + eof: false, + read: Box::new(|_| String::new()), buffer: String::new(), journal_line: 0, seg_pos: 0, @@ -143,11 +133,80 @@ impl Source { suppress_next_newline: false, } } +} + +impl Source { + pub fn for_file

( + path: P, + encoding: Option<&'static Encoding>, + syntax: Mode, + error_handling: ErrorHandling, + ) -> IoResult + where + P: AsRef, + { + let bytes = fs::read(path.as_ref())?; + let encoding = encoding.unwrap_or_else(|| { + let mut encoding_detector = EncodingDetector::new(); + encoding_detector.feed(&bytes, true); + encoding_detector.guess(None, true) + }); + let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes); + Ok(Self::for_file_contents( + contents.to_string(), + Some(path.as_ref().to_string_lossy().to_string()), + encoding, + syntax, + error_handling, + )) + } + + pub fn for_file_contents( + contents: String, + file_name: Option, + encoding: &'static Encoding, + syntax: Mode, + error_handling: ErrorHandling, + ) -> Self { + Self { + buffer: contents, + file_name: file_name.map(Arc::new), + encoding, + error_handling, + segmenter: Segmenter::new(syntax, false), + ..Self::default() + } + } + + pub fn for_string(contents: String, encoding: &'static Encoding) -> Self { + Self { + buffer: contents, + encoding, + ..Self::default() + } + } + + pub fn for_function( + read: Box String>, + file_name: Option, + encoding: &'static Encoding, + syntax: Mode, + error_handling: ErrorHandling, + ) -> Self { + Self { + read, + file_name: file_name.map(Arc::new), + encoding, + segmenter: Segmenter::new(syntax, false), + error_handling, + ..Self::default() + } + } fn read(&mut self) { loop { let prompt = self.segmenter.prompt(); - let s = self.reader.read(prompt); + let s = (self.read)(prompt); if s.is_empty() { self.eof = true; return; @@ -158,7 +217,7 @@ impl Source { } } } - fn try_get_pp(&mut self) -> bool { + fn try_get_pp(&mut self, context: &Context) -> bool { let (seg_len, seg_type) = loop { if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) { break result; @@ -203,7 +262,7 @@ impl Source { } self.journal_line += n_lines; - let pos = pos.start..=pos.end - 1; + let pos = pos.start..pos.end; match scan_token { None => false, Some(ScanToken::Token(Token::End)) => { @@ -223,24 +282,31 @@ impl Source { }); true } - Some(ScanToken::Error(_error)) => { - // XXX report error + Some(ScanToken::Error(error)) => { + (context.error)( + Location { + file_name: self.file_name.clone(), + span: Some(self.offset_to_point(pos.start)..self.offset_to_point(pos.end)), + omit_underlines: false, + }, + error.into(), + ); false } } } - fn get_pp(&mut self) -> bool { + fn get_pp(&mut self, context: &Context) -> bool { while !self.eof { - if self.try_get_pp() { + if self.try_get_pp(context) { return true; } } false } - fn try_get_merge(&mut self) -> bool { - if self.pp.is_empty() && !self.get_pp() { + fn try_get_merge(&mut self, context: &Context) -> bool { + if self.pp.is_empty() && !self.get_pp(context) { return false; } @@ -252,20 +318,20 @@ impl Source { } // Now pass tokens one-by-one to the macro expander. - let Some(mut parser) = Parser::new(todo!(), &self.pp[0].token) else { + let Some(mut parser) = Parser::new(context.macros, &self.pp[0].token) else { // Common case where there is no macro to expand. self.merge.push_back(self.pp.pop_front().unwrap()); return true; }; for ofs in 1.. { - if self.pp.len() <= ofs && !self.get_pp() { + if self.pp.len() <= ofs && !self.get_pp(context) { // This should not be reachable because we always get a // `Token::EndCommand` at the end of an input file, which should // always terminate macro expansion. unreachable!(); } let token = &self.pp[ofs]; - if parser.push(&token.token, &self.buffer[token.pos], &|e| { + if parser.push(&token.token, &self.buffer[token.pos.clone()], &|e| { println!("{e:?}") }) == ParseStatus::Complete { @@ -308,7 +374,7 @@ impl Source { for (index, token) in expansion.into_iter().enumerate() { let lt = LexToken { token: token.token, - pos: *c0.pos.start()..=*c1.pos.end(), + pos: c0.pos.start..c1.pos.end, macro_rep: Some(MacroRepresentation { expansion: Arc::clone(¯o_rep), pos: pos[index].clone(), @@ -324,18 +390,18 @@ impl Source { /// /// Returns true if successful, false on failure. In the latter case, this source /// exhausted and 'self.eof' is now true. - fn get_merge(&mut self) -> bool { + fn get_merge(&mut self, context: &Context) -> bool { while !self.eof { - if self.try_get_merge() { + if self.try_get_merge(context) { return true; } } false } - fn get_parse__(&mut self) -> bool { + fn get_parse__(&mut self, context: &Context) -> bool { for i in 0.. { - if self.merge.len() <= i && !self.get_merge() { + if self.merge.len() <= i && !self.get_merge(context) { // We always get a `Token::EndCommand` at the end of an input // file and the merger should return `Some(...)` for that token. debug_assert_eq!(self.merge.len(), 0); @@ -353,7 +419,7 @@ impl Source { let last = &self.merge[n - 1]; self.parse.push(LexToken { token, - pos: *first.pos.start()..=*last.pos.end(), + pos: first.pos.start..last.pos.end, macro_rep: match (&first.macro_rep, &last.macro_rep) { (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => { Some(MacroRepresentation { @@ -371,9 +437,9 @@ impl Source { } unreachable!(); } - fn get_parse(&mut self) -> bool { - // XXX deal with accumulate messages - self.get_parse__() + fn get_parse(&mut self, context: &Context) -> bool { + // XXX deal with accumulated messages + self.get_parse__(context) } fn offset_to_point(&self, offset: usize) -> Point { @@ -386,20 +452,46 @@ impl Source { self.buffer .get(self.lines[line - 1]..offset) .unwrap_or_default() - .width() as i32, + .width() as i32 + 1, ), } } + fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location { Location { file_name: self.file_name.clone(), span: Some( - self.offset_to_point(*range.start().pos.start()) - ..=self.offset_to_point(*range.end().pos.end()), + self.offset_to_point(range.start().pos.start) + ..self.offset_to_point(range.end().pos.end), ), omit_underlines: false, } } + + fn token(&self) -> &Token { + &self.parse[self.parse_ofs].token + } + + fn next(&mut self, offset: isize, context: &Context) -> &Token { + let Some(index) = offset.checked_add(self.parse_ofs as isize) else { + return &Token::EndCommand; + }; + let Ok(index) = usize::try_from(index) else { + return &Token::EndCommand; + }; + + while index >= self.parse.len() { + if let Some(token) = self.parse.last() { + match token.token { + Token::End => return &Token::End, + Token::EndCommand => return &Token::EndCommand, + _ => (), + } + } + self.get_parse(context); + } + &self.parse[index].token + } } /// A token in a [`Source`]. @@ -412,7 +504,7 @@ struct LexToken { /// /// For a token produced through macro expansion, this is the entire macro /// call. - pos: RangeInclusive, + pos: Range, /// For a token obtained through macro expansion, the part of the macro /// expansion that represents this token. @@ -440,35 +532,184 @@ pub struct Lexer { source: Source, stack: Vec, macros: MacroSet, + error: Box, +} + +struct Context<'a> { + macros: &'a MacroSet, + error: &'a Box, } impl Lexer { - pub fn new() -> Self { + pub fn new(error: Box) -> Self { Self { - source: Source::empty(), + source: Source::default(), stack: Vec::new(), macros: HashMap::new(), + error, } } - pub fn get(&mut self) { + pub fn get(&mut self) -> &Token { if self.source.parse_ofs < self.source.parse.len() { - if let Token::EndCommand = self.source.parse[self.source.parse_ofs].token { + if let Token::EndCommand = self.source.token() { self.source.parse.clear(); + self.source.parse_ofs = 0; } else { self.source.parse_ofs += 1; } } - while self.source.parse_ofs < self.source.parse.len() { - if !self.source.get_parse() { - match self.stack.pop() { - Some(source) => self.source = source, - None => { - self.source = Source::empty(); - return; - } - } + while self.source.parse_ofs == self.source.parse.len() { + let context = Context { + macros: &self.macros, + error: &self.error, + }; + if !self.source.get_parse(&context) { + let Some(new_source) = self.stack.pop() else { + self.source = Source::default(); + self.source.parse.push(LexToken { + token: Token::End, + pos: 0..0, + macro_rep: None, + }); + return &Token::End; + }; + self.source = new_source; + } + } + self.source.token() + } + + /// Inserts `source` so that the next token comes from it. This is only + /// permitted when the lexer is either empty or at `Token::EndCommand`. + pub fn include(&mut self, mut source: Source) { + // XXX what's the right assertion? + let context = Context { + macros: &self.macros, + error: &self.error, + }; + source.get_parse(&context); + let old_source = mem::replace(&mut self.source, source); + self.stack.push(old_source); + } + + /// Inserts `source` so that it will be read after all the other sources. + pub fn append(&mut self, mut source: Source) { + let context = Context { + macros: &self.macros, + error: &self.error, + }; + source.get_parse(&context); + self.stack.insert(0, source); + } + + pub fn token(&self) -> &Token { + self.source.token() + } + + pub fn next(&mut self, offset: isize) -> &Token { + let context = Context { + macros: &self.macros, + error: &self.error, + }; + self.source.next(offset, &context) + } +} + +#[derive(ThisError, Clone, Debug, PartialEq, Eq)] +pub enum Error { + /// Error forming tokens from the input. + #[error("{0}")] + TokenError(#[from] ScanError), +} + +#[cfg(test)] +mod tests { + use encoding_rs::UTF_8; + + use crate::lex::{segment::Mode, token::Token}; + + use super::{ErrorHandling, Lexer, Source}; + + #[test] + fn test() { + let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}"))); + lexer.include(Source::for_string( + String::from( + r#"#! /usr/local/bin/pspp +DATA LIST LIST NOTABLE /a. +BEGIN DATA. +1 +2 +END DATA. +LIST. +"#, + ), + UTF_8, + )); + loop { + lexer.get(); + let token = lexer.token(); + println!("{token:?}"); + if let Token::End = token { + break; + } + } + } + + #[test] + fn test_scan_errors() { + let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}"))); + lexer.include(Source::for_file_contents( + String::from( + r#"x'123' +x'1x' +u'' +u'012345678' +u'd800' +u'110000' +'foo +'very long unterminated string that be ellipsized in its error message +1e .x +^ +� +"#, + ), + Some(String::from("syntax.sps")), + UTF_8, + Mode::default(), + ErrorHandling::default(), + )); + loop { + lexer.get(); + let token = lexer.token(); + println!("{token:?}"); + if let Token::End = token { + break; + } + } + } + + #[test] + fn test_null_byte() { + let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}"))); + lexer.include(Source::for_file_contents( + String::from( + "datA dist list notable file='input.txt'/a b c. +lis|.\0", + ), + Some(String::from("syntax.sps")), + UTF_8, + Mode::default(), + ErrorHandling::default(), + )); + loop { + lexer.get(); + let token = lexer.token(); + println!("{token:?}"); + if let Token::End = token { + break; } } } diff --git a/rust/src/message.rs b/rust/src/message.rs index 5238691031..f8682050c0 100644 --- a/rust/src/message.rs +++ b/rust/src/message.rs @@ -1,8 +1,7 @@ use std::{ cmp::{max, min}, - fmt::Result as FmtResult, - fmt::{Display, Formatter}, - ops::RangeInclusive, + fmt::{Display, Formatter, Result as FmtResult}, + ops::Range, sync::Arc, }; @@ -55,7 +54,7 @@ pub struct Location { pub file_name: Option>, /// Starting and ending point, if any. - pub span: Option>, + pub span: Option>, /// Normally, if `span` contains column information, then displaying the /// message will underline the location. Setting this to true disables @@ -73,13 +72,13 @@ impl Display for Location { if self.file_name.is_some() { write!(f, ":")?; } - let l1 = span.start().line; - let l2 = span.end().line; - if let (Some(c1), Some(c2)) = (span.start().column, span.end().column) { + let l1 = span.start.line; + let l2 = span.end.line; + if let (Some(c1), Some(c2)) = (span.start.column, span.end.column) { if l2 > l1 { - write!(f, "{l1}.{c1}-{l2}.{c2}")?; + write!(f, "{l1}.{c1}-{l2}.{}", c2 - 1)?; } else { - write!(f, "{l1}.{c1}-{c2}")?; + write!(f, "{l1}.{c1}-{}", c2 - 1)?; } } else { if l2 > l1 { @@ -100,7 +99,7 @@ impl Location { span: self .span .as_ref() - .map(|span| span.start().without_column()..=span.end().without_column()), + .map(|span| span.start.without_column()..span.end.without_column()), omit_underlines: self.omit_underlines, } } @@ -115,7 +114,7 @@ impl Location { (None, None) => None, (Some(r), None) | (None, Some(r)) => Some(r.clone()), (Some(ar), Some(br)) => { - Some(min(ar.start(), br.start()).clone()..=max(ar.end(), br.end()).clone()) + Some(min(ar.start, br.start).clone()..max(ar.end, br.end).clone()) } }; Some(Self { -- 2.30.2