From: Ben Pfaff Date: Tue, 13 Aug 2024 21:45:16 +0000 (-0700) Subject: work on autodecode X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ab157c5e5e471c47ebce90daa7b60d04903d1f34;p=pspp work on autodecode --- diff --git a/rust/Cargo.lock b/rust/Cargo.lock index c736c92eba..2ebc35eb62 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -76,6 +76,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "chrono" version = "0.4.26" @@ -350,6 +361,12 @@ version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + [[package]] name = "miniz_oxide" version = "0.7.1" @@ -506,6 +523,7 @@ version = "1.0.0" dependencies = [ "anyhow", "bitflags 2.5.0", + "chardetng", "chrono", "clap", "diff", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index f980072290..4b4aba6b3f 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -25,6 +25,7 @@ indexmap = "2.1.0" utf8-decode = "1.0.1" bitflags = "2.5.0" unicode-width = "0.1.13" +chardetng = "0.1.17" [target.'cfg(windows)'.dependencies] windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } diff --git a/rust/src/autodecode.rs b/rust/src/autodecode.rs new file mode 100644 index 0000000000..84a8ec35c9 --- /dev/null +++ b/rust/src/autodecode.rs @@ -0,0 +1,168 @@ +use chardetng::EncodingDetector; +use encoding_rs::{Decoder, 
Encoding}; +use std::io::{BufRead, Read, Result}; + +struct Autodecode +where + R: Read, +{ + inner: R, + buffer: Box<[u8]>, + state: State, +} + +enum State { + /// Stream encoding is not yet known. + Auto { + detector: EncodingDetector, + back: usize, + front: usize, + ascii: usize, + }, + + /// Stream encoding is known. + Decode(Decoder), +} + +fn read_fully(reader: &mut R, mut buffer: &mut [u8]) -> Result +where + R: Read, +{ + let mut len = 0; + while len < buffer.len() { + let n = reader.read(&mut buffer[len..])?; + if n == 0 { + break; + } + len += n; + } + Ok(len) +} + +impl Autodecode +where + R: Read, +{ + fn new(inner: R) -> Result { + Self::with_capacity(8192, inner) + } + fn with_capacity(capacity: usize, mut inner: R) -> Result { + let mut buffer = Vec::with_capacity(capacity); + buffer.resize(capacity, 0); + let len = read_fully(&mut inner, buffer.as_mut_slice())?; + let mut detector = EncodingDetector::new(); + let state = if len < buffer.len() { + detector.feed(&buffer[..len], true); + State::Decode(detector.guess(None, true).new_decoder_with_bom_removal()) + } else { + let ascii = feed(&mut detector, &buffer[..len], false); + State::Auto { + detector, + back: 0, + front: len, + ascii, + } + }; + Ok(Self { + inner, + buffer: buffer.into_boxed_slice(), + state, + }) + } +} + +impl Read for Autodecode +where + R: Read, +{ + fn read(&mut self, outbuf: &mut [u8]) -> Result { + let mut buffer = self.fill_buf()?; + let n = buffer.read(outbuf)?; + self.consume(n); + Ok(n) + } +} + +impl BufRead for Autodecode +where + R: Read, +{ + fn fill_buf(&mut self) -> Result<&[u8]> { + match &mut self.state { + State::Auto { + detector, + back, + front, + ascii, + } => { + if back < ascii { + // Consume data up to the first non-ASCII byte. + Ok(&self.buffer[*back..*ascii]) + } else if ascii < front { + // We had a non-ASCII byte and we consumed everything up to + // it. 
We want to get a full buffer starting at the + // non-ASCII byte before we decide on the encoding. + debug_assert_eq!(ascii, back); + + // Shift buffered data to the beginning of the buffer to + // make room to get a full buffer. + self.buffer.copy_within(*back..*front, 0); + *front -= *back; + *back = 0; + *ascii = 0; + + // Fill up the remainder of the buffer. + let old_front = *front; + *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?; + detector.feed(&self.buffer[old_front..*front], *front < self.buffer.len()); + self.state = State::Decode( + detector.guess(None, true).new_decoder_with_bom_removal(), + ); + self.fill_buf() + } else { + // We have not had a non-ASCII byte yet but we consumed the + // whole buffer. Read a new one. + *back = 0; + *front = 0; + *ascii = 0; + *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?; + let eof = *front < self.buffer.len(); + *ascii = feed(detector, &self.buffer[..*front], eof); + if eof || *ascii == 0 { + self.state = State::Decode( + detector.guess(None, true).new_decoder_with_bom_removal(), + ); + self.fill_buf() + } else { + Ok(&self.buffer[..*ascii]) + } + } + } + State::Decode(_) => todo!(), + } + } + + fn consume(&mut self, n: usize) { + todo!() + } +} + +fn feed(detector: &mut EncodingDetector, buffer: &[u8], last: bool) -> usize { + if detector.feed(buffer, last) { + Encoding::ascii_valid_up_to(buffer) + } else { + buffer.len() + } +} +/* + } else { + debug_assert_eq!(ascii, back); + debug_assert_eq!(back, front); + *back = 0; + *front = 0; + *ascii = 0; + *front += read_fully(&mut self.inner, &mut self.buffer[..])?; + *ascii = feed(detector, &self.buffer[..*front], *front < self.buffer.len()); + Ok(&self.buffer[*back..*ascii]) + } +*/ diff --git a/rust/src/lex/lexer.rs b/rust/src/lex/lexer.rs new file mode 100644 index 0000000000..d5728d5cac --- /dev/null +++ b/rust/src/lex/lexer.rs @@ -0,0 +1,52 @@ +use std::io::Read; + +use encoding_rs::Encoding; + +use 
crate::prompt::PromptStyle; + +use super::segment::Mode; + +/// Error handling for a [`Reader`]. +pub enum ErrorHandling { + /// Discard input line and continue reading. + Terminal, + + /// Continue to next command, except for cascading failures. + Continue, + + /// Continue, even for cascading failures. + Ignore, + + /// Stop processing. + Stop, +} + +/// Reads a single syntax file as a stream of bytes encoded in UTF-8. +pub struct Reader { + /// Segmentation mode. + mode: Mode, + + /// Error-handling mode. + error_handling: ErrorHandling, + + /// Encoding (although the reader must always produce UTF-8). + encoding: &'static Encoding, + + /// `None` if this reader is not associated with a file. + file_name: Option, + + /// Zero if there's no line number. + line_number: u32, + + /// True if we've reached EOF already. + eof: bool, + + /// Reads UTF-8 bytes. + reader: dyn LexRead, +} + +pub trait LexRead: Read { + /// Tells the reader what kind of prompt is appropriate for the next + /// read. Non-interactive readers can ignore this. + fn set_prompt_style(&mut self, _prompt: PromptStyle) {} +} diff --git a/rust/src/lex/mod.rs b/rust/src/lex/mod.rs index 732cf3a09c..e87b088cf4 100644 --- a/rust/src/lex/mod.rs +++ b/rust/src/lex/mod.rs @@ -14,3 +14,4 @@ pub mod segment; pub mod scan; pub mod command_name; pub mod token; +pub mod lexer; diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 46fe08622a..32d508d6ac 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -13,3 +13,4 @@ pub mod lex; pub mod prompt; pub mod message; pub mod macros; +pub mod autodecode;