use std::{
borrow::Borrow,
collections::{HashMap, VecDeque},
- num::NonZeroU32,
- ops::RangeInclusive,
+ fs,
+ io::Result as IoResult,
+ mem,
+ ops::{Range, RangeInclusive},
+ path::Path,
sync::Arc,
};
+use chardetng::EncodingDetector;
use encoding_rs::{Encoding, UTF_8};
+use thiserror::Error as ThisError;
use unicode_width::UnicodeWidthStr;
use crate::{
};
use super::{
- scan::{MergeResult, ScanToken},
+ scan::{MergeResult, ScanError, ScanToken},
segment::{Mode, Segment, Segmenter},
token::Token,
};
Stop,
}
-pub trait LexRead {
- /// Read some input from the source. If successful, returns the input that
- /// was read. At end of file or on error, returns an empty string.
- ///
- /// `prompt` provides a hint to interactive readers as to what kind of
- /// syntax is being read right now.
- fn read(&mut self, prompt: PromptStyle) -> String;
-}
-
-impl LexRead for () {
- fn read(&mut self, _prompt: PromptStyle) -> String {
- String::from("")
- }
-}
-
/// # Token pipeline
///
/// Tokens pass through a pipeline with the following stages. Each token
/// `pp` and `merge` store tokens only temporarily until they pass into `parse`.
/// Tokens then live in `parse` until the command is fully consumed, at which
/// time they are freed together.
-struct Source {
+pub struct Source {
/// Error-handling mode.
error_handling: ErrorHandling,
- /// Encoding (although the reader must always produce UTF-8).
+    /// Character encoding of the source.
encoding: &'static Encoding,
/// `None` if this reader is not associated with a file.
file_name: Option<Arc<String>>,
- /// 1-based line number, if any.
- line_number: Option<NonZeroU32>,
-
/// True if we've reached EOF already.
eof: bool,
- /// Reads UTF-8 bytes.
- reader: Box<dyn LexRead>,
+ /// Read some input from the source. If successful, returns the input that
+ /// was read. At end of file or on error, returns an empty string.
+ ///
+ /// `prompt` provides a hint to interactive readers as to what kind of
+ /// syntax is being read right now.
+ read: Box<dyn Fn(PromptStyle) -> String>,
/// Source file contents.
buffer: String,
suppress_next_newline: bool,
}
-impl Source {
- fn empty() -> Self {
+impl Default for Source {
+ fn default() -> Self {
Self {
error_handling: ErrorHandling::default(),
encoding: UTF_8,
file_name: None,
- line_number: None,
- eof: true,
- reader: Box::new(()),
+ eof: false,
+ read: Box::new(|_| String::new()),
buffer: String::new(),
journal_line: 0,
seg_pos: 0,
suppress_next_newline: false,
}
}
+}
+
+impl Source {
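+    /// Creates a [`Source`] that reads from the file at `path`. If
+    /// `encoding` is `None`, the encoding is guessed from the file's
+    /// contents; a byte-order mark, if present, is stripped while decoding.
+    ///
+    /// A usage sketch (the file name here is hypothetical):
+    ///
+    /// ```ignore
+    /// let source = Source::for_file(
+    ///     "syntax.sps",
+    ///     None, // guess the encoding from the contents
+    ///     Mode::default(),
+    ///     ErrorHandling::default(),
+    /// )?;
+    /// ```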
+ pub fn for_file<P>(
+ path: P,
+ encoding: Option<&'static Encoding>,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> IoResult<Self>
+ where
+ P: AsRef<Path>,
+ {
+ let bytes = fs::read(path.as_ref())?;
+ let encoding = encoding.unwrap_or_else(|| {
+ let mut encoding_detector = EncodingDetector::new();
+ encoding_detector.feed(&bytes, true);
+ encoding_detector.guess(None, true)
+ });
+ let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes);
+ Ok(Self::for_file_contents(
+ contents.to_string(),
+ Some(path.as_ref().to_string_lossy().to_string()),
+ encoding,
+ syntax,
+ error_handling,
+ ))
+ }
+
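+    /// Creates a [`Source`] for `contents`, which was previously read from
+    /// `file_name` (if any) and decoded from `encoding`.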
+ pub fn for_file_contents(
+ contents: String,
+ file_name: Option<String>,
+ encoding: &'static Encoding,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> Self {
+ Self {
+ buffer: contents,
+ file_name: file_name.map(Arc::new),
+ encoding,
+ error_handling,
+ segmenter: Segmenter::new(syntax, false),
+ ..Self::default()
+ }
+ }
+
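+    /// Creates a [`Source`] that lexes `contents` with the default syntax
+    /// mode and error handling.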
+ pub fn for_string(contents: String, encoding: &'static Encoding) -> Self {
+ Self {
+ buffer: contents,
+ encoding,
+ ..Self::default()
+ }
+ }
+
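+    /// Creates a [`Source`] that obtains input on demand by calling `read`,
+    /// which lets an interactive reader tailor its prompt to the kind of
+    /// syntax currently being read.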
+ pub fn for_function(
+ read: Box<dyn Fn(PromptStyle) -> String>,
+ file_name: Option<String>,
+ encoding: &'static Encoding,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> Self {
+ Self {
+ read,
+ file_name: file_name.map(Arc::new),
+ encoding,
+ segmenter: Segmenter::new(syntax, false),
+ error_handling,
+ ..Self::default()
+ }
+ }
fn read(&mut self) {
loop {
let prompt = self.segmenter.prompt();
- let s = self.reader.read(prompt);
+ let s = (self.read)(prompt);
if s.is_empty() {
self.eof = true;
return;
}
}
}
- fn try_get_pp(&mut self) -> bool {
+ fn try_get_pp(&mut self, context: &Context) -> bool {
let (seg_len, seg_type) = loop {
if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) {
break result;
}
self.journal_line += n_lines;
- let pos = pos.start..=pos.end - 1;
+ let pos = pos.start..pos.end;
match scan_token {
None => false,
Some(ScanToken::Token(Token::End)) => {
});
true
}
- Some(ScanToken::Error(_error)) => {
- // XXX report error
+ Some(ScanToken::Error(error)) => {
+ (context.error)(
+ Location {
+ file_name: self.file_name.clone(),
+ span: Some(self.offset_to_point(pos.start)..self.offset_to_point(pos.end)),
+ omit_underlines: false,
+ },
+ error.into(),
+ );
false
}
}
}
- fn get_pp(&mut self) -> bool {
+ fn get_pp(&mut self, context: &Context) -> bool {
while !self.eof {
- if self.try_get_pp() {
+ if self.try_get_pp(context) {
return true;
}
}
false
}
- fn try_get_merge(&mut self) -> bool {
- if self.pp.is_empty() && !self.get_pp() {
+ fn try_get_merge(&mut self, context: &Context) -> bool {
+ if self.pp.is_empty() && !self.get_pp(context) {
return false;
}
}
// Now pass tokens one-by-one to the macro expander.
- let Some(mut parser) = Parser::new(todo!(), &self.pp[0].token) else {
+ let Some(mut parser) = Parser::new(context.macros, &self.pp[0].token) else {
// Common case where there is no macro to expand.
self.merge.push_back(self.pp.pop_front().unwrap());
return true;
};
for ofs in 1.. {
- if self.pp.len() <= ofs && !self.get_pp() {
+ if self.pp.len() <= ofs && !self.get_pp(context) {
// This should not be reachable because we always get a
// `Token::EndCommand` at the end of an input file, which should
// always terminate macro expansion.
unreachable!();
}
let token = &self.pp[ofs];
- if parser.push(&token.token, &self.buffer[token.pos], &|e| {
+ if parser.push(&token.token, &self.buffer[token.pos.clone()], &|e| {
println!("{e:?}")
}) == ParseStatus::Complete
{
for (index, token) in expansion.into_iter().enumerate() {
let lt = LexToken {
token: token.token,
- pos: *c0.pos.start()..=*c1.pos.end(),
+ pos: c0.pos.start..c1.pos.end,
macro_rep: Some(MacroRepresentation {
expansion: Arc::clone(¯o_rep),
pos: pos[index].clone(),
///
    /// Returns true if successful, false on failure. In the latter case, this
    /// source is exhausted and `self.eof` is now true.
- fn get_merge(&mut self) -> bool {
+ fn get_merge(&mut self, context: &Context) -> bool {
while !self.eof {
- if self.try_get_merge() {
+ if self.try_get_merge(context) {
return true;
}
}
false
}
- fn get_parse__(&mut self) -> bool {
+ fn get_parse__(&mut self, context: &Context) -> bool {
for i in 0.. {
- if self.merge.len() <= i && !self.get_merge() {
+ if self.merge.len() <= i && !self.get_merge(context) {
// We always get a `Token::EndCommand` at the end of an input
// file and the merger should return `Some(...)` for that token.
debug_assert_eq!(self.merge.len(), 0);
let last = &self.merge[n - 1];
self.parse.push(LexToken {
token,
- pos: *first.pos.start()..=*last.pos.end(),
+ pos: first.pos.start..last.pos.end,
macro_rep: match (&first.macro_rep, &last.macro_rep) {
(Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
Some(MacroRepresentation {
}
unreachable!();
}
- fn get_parse(&mut self) -> bool {
- // XXX deal with accumulate messages
- self.get_parse__()
+ fn get_parse(&mut self, context: &Context) -> bool {
+ // XXX deal with accumulated messages
+ self.get_parse__(context)
}
fn offset_to_point(&self, offset: usize) -> Point {
self.buffer
.get(self.lines[line - 1]..offset)
.unwrap_or_default()
- .width() as i32,
+ .width() as i32 + 1,
),
}
}
+
fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
Location {
file_name: self.file_name.clone(),
span: Some(
- self.offset_to_point(*range.start().pos.start())
- ..=self.offset_to_point(*range.end().pos.end()),
+ self.offset_to_point(range.start().pos.start)
+ ..self.offset_to_point(range.end().pos.end),
),
omit_underlines: false,
}
}
+
+ fn token(&self) -> &Token {
+ &self.parse[self.parse_ofs].token
+ }
+
+ fn next(&mut self, offset: isize, context: &Context) -> &Token {
+ let Some(index) = offset.checked_add(self.parse_ofs as isize) else {
+ return &Token::EndCommand;
+ };
+ let Ok(index) = usize::try_from(index) else {
+ return &Token::EndCommand;
+ };
+
+        while index >= self.parse.len() {
+            if let Some(token) = self.parse.last() {
+                match token.token {
+                    Token::End => return &Token::End,
+                    Token::EndCommand => return &Token::EndCommand,
+                    _ => (),
+                }
+            }
+            if !self.get_parse(context) {
+                // Defensive: the source was exhausted without a terminating
+                // token, so report end of input rather than looping forever.
+                return &Token::End;
+            }
+        }
+ &self.parse[index].token
+ }
}
/// A token in a [`Source`].
///
/// For a token produced through macro expansion, this is the entire macro
/// call.
- pos: RangeInclusive<usize>,
+ pos: Range<usize>,
/// For a token obtained through macro expansion, the part of the macro
/// expansion that represents this token.
source: Source,
stack: Vec<Source>,
macros: MacroSet,
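+    /// Callback for reporting errors.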
+ error: Box<dyn Fn(Location, Error)>,
+}
+
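+/// Borrowed pieces of [`Lexer`] state passed down into [`Source`] methods so
+/// that a source can expand macros and report errors.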
+struct Context<'a> {
+ macros: &'a MacroSet,
+ error: &'a Box<dyn Fn(Location, Error)>,
}
impl Lexer {
- pub fn new() -> Self {
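+    /// Creates a new lexer that reports errors through the `error` callback.
+    /// The lexer initially has no input; supply some with [`Self::include`]
+    /// or [`Self::append`].
+    ///
+    /// A minimal usage sketch, mirroring the tests below:
+    ///
+    /// ```ignore
+    /// let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+    /// lexer.include(Source::for_string(String::from("LIST."), UTF_8));
+    /// while !matches!(lexer.get(), Token::End) {
+    ///     println!("{:?}", lexer.token());
+    /// }
+    /// ```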
+ pub fn new(error: Box<dyn Fn(Location, Error)>) -> Self {
Self {
- source: Source::empty(),
+ source: Source::default(),
stack: Vec::new(),
macros: HashMap::new(),
+ error,
}
}
- pub fn get(&mut self) {
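+    /// Advances past the current token and returns the new current token. At
+    /// the end of all input, returns (and keeps returning) [`Token::End`].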
+ pub fn get(&mut self) -> &Token {
if self.source.parse_ofs < self.source.parse.len() {
- if let Token::EndCommand = self.source.parse[self.source.parse_ofs].token {
+ if let Token::EndCommand = self.source.token() {
self.source.parse.clear();
+ self.source.parse_ofs = 0;
} else {
self.source.parse_ofs += 1;
}
}
- while self.source.parse_ofs < self.source.parse.len() {
- if !self.source.get_parse() {
- match self.stack.pop() {
- Some(source) => self.source = source,
- None => {
- self.source = Source::empty();
- return;
- }
- }
+ while self.source.parse_ofs == self.source.parse.len() {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ if !self.source.get_parse(&context) {
+ let Some(new_source) = self.stack.pop() else {
+ self.source = Source::default();
+ self.source.parse.push(LexToken {
+ token: Token::End,
+ pos: 0..0,
+ macro_rep: None,
+ });
+ return &Token::End;
+ };
+ self.source = new_source;
+ }
+ }
+ self.source.token()
+ }
+
+ /// Inserts `source` so that the next token comes from it. This is only
+ /// permitted when the lexer is either empty or at `Token::EndCommand`.
+ pub fn include(&mut self, mut source: Source) {
+ // XXX what's the right assertion?
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ source.get_parse(&context);
+ let old_source = mem::replace(&mut self.source, source);
+ self.stack.push(old_source);
+ }
+
+ /// Inserts `source` so that it will be read after all the other sources.
+ pub fn append(&mut self, mut source: Source) {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ source.get_parse(&context);
+ self.stack.insert(0, source);
+ }
+
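+    /// Returns the current token.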
+ pub fn token(&self) -> &Token {
+ self.source.token()
+ }
+
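+    /// Returns the token `offset` positions relative to the current token
+    /// (e.g. `1` for lookahead, `-1` for the previous token) without moving
+    /// the current position. Offsets past the end of the input yield the
+    /// trailing [`Token::End`] or [`Token::EndCommand`].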
+ pub fn next(&mut self, offset: isize) -> &Token {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ self.source.next(offset, &context)
+ }
+}
+
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
+pub enum Error {
+ /// Error forming tokens from the input.
+ #[error("{0}")]
+ TokenError(#[from] ScanError),
+}
+
+#[cfg(test)]
+mod tests {
+ use encoding_rs::UTF_8;
+
+ use crate::lex::{segment::Mode, token::Token};
+
+ use super::{ErrorHandling, Lexer, Source};
+
+ #[test]
+ fn test() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_string(
+ String::from(
+ r#"#! /usr/local/bin/pspp
+DATA LIST LIST NOTABLE /a.
+BEGIN DATA.
+1
+2
+END DATA.
+LIST.
+"#,
+ ),
+ UTF_8,
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+ }
+ }
+ }
+
+ #[test]
+ fn test_scan_errors() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_file_contents(
+ String::from(
+ r#"x'123'
+x'1x'
+u''
+u'012345678'
+u'd800'
+u'110000'
+'foo
+'very long unterminated string that be ellipsized in its error message
+1e .x
+^
+�
+"#,
+ ),
+ Some(String::from("syntax.sps")),
+ UTF_8,
+ Mode::default(),
+ ErrorHandling::default(),
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+ }
+ }
+ }
+
+ #[test]
+ fn test_null_byte() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_file_contents(
+ String::from(
+ "datA dist list notable file='input.txt'/a b c.
+lis|.\0",
+ ),
+ Some(String::from("syntax.sps")),
+ UTF_8,
+ Mode::default(),
+ ErrorHandling::default(),
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+            }
+        }
+    }
+}