work on autodecode
author Ben Pfaff <blp@cs.stanford.edu>
Tue, 13 Aug 2024 21:45:16 +0000 (14:45 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Tue, 13 Aug 2024 21:45:16 +0000 (14:45 -0700)
rust/Cargo.lock
rust/Cargo.toml
rust/src/autodecode.rs [new file with mode: 0644]
rust/src/lex/lexer.rs [new file with mode: 0644]
rust/src/lex/mod.rs
rust/src/lib.rs

index c736c92ebace53018cbb21f17f05ebd0ea82492c..2ebc35eb62c7953a0fb3ebdb5b25054c1c9ac903 100644 (file)
@@ -76,6 +76,17 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.26"
@@ -350,6 +361,12 @@ version = "0.4.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
 
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
 [[package]]
 name = "miniz_oxide"
 version = "0.7.1"
@@ -506,6 +523,7 @@ version = "1.0.0"
 dependencies = [
  "anyhow",
  "bitflags 2.5.0",
+ "chardetng",
  "chrono",
  "clap",
  "diff",
index f9800722903f6e2582839c39a13a67e056993412..4b4aba6b3ffaf9d9ef2b4dd660a890e7ecab50d3 100644 (file)
@@ -25,6 +25,7 @@ indexmap = "2.1.0"
 utf8-decode = "1.0.1"
 bitflags = "2.5.0"
 unicode-width = "0.1.13"
+chardetng = "0.1.17"
 
 [target.'cfg(windows)'.dependencies]
 windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
diff --git a/rust/src/autodecode.rs b/rust/src/autodecode.rs
new file mode 100644 (file)
index 0000000..84a8ec3
--- /dev/null
@@ -0,0 +1,168 @@
+use chardetng::EncodingDetector;
+use encoding_rs::{Decoder, Encoding};
+use std::io::{BufRead, Read, Result};
+
+/// A reader wrapper that buffers input from `inner` and tries to detect its
+/// character encoding (see [`State`]).
+///
+/// Work in progress: the decoding path of `fill_buf` and all of `consume` are
+/// still `todo!()`.
+struct Autodecode<R>
+where
+    R: Read,
+{
+    /// The underlying reader that supplies the raw bytes.
+    inner: R,
+    /// Fixed-size buffer of raw bytes read from `inner`; its length is the
+    /// capacity passed to `with_capacity`.
+    buffer: Box<[u8]>,
+    /// Whether the encoding is still being detected or is already known.
+    state: State,
+}
+
+/// Encoding-detection state of an [`Autodecode`].
+enum State {
+    /// Stream encoding is not yet known.
+    Auto {
+        /// Detector that is fed the bytes read from the stream so far.
+        detector: EncodingDetector,
+        /// Index in the buffer of the first byte that `fill_buf` hands out.
+        back: usize,
+        /// Index just past the last byte of input held in the buffer.
+        front: usize,
+        /// End of the ASCII-only data in the buffer, as computed by `feed`:
+        /// `fill_buf` hands out `buffer[back..ascii]` while in this state.
+        ascii: usize,
+    },
+
+    /// Stream encoding is known.
+    Decode(Decoder),
+}
+
+fn read_fully<R>(reader: &mut R, mut buffer: &mut [u8]) -> Result<usize>
+where
+    R: Read,
+{
+    let mut len = 0;
+    while len < buffer.len() {
+        let n = reader.read(&mut buffer[len..])?;
+        if n == 0 {
+            break;
+        }
+        len += n;
+    }
+    Ok(len)
+}
+
+impl<R> Autodecode<R>
+where
+    R: Read,
+{
+    fn new(inner: R) -> Result<Self> {
+        Self::with_capacity(8192, inner)
+    }
+    fn with_capacity(capacity: usize, mut inner: R) -> Result<Self> {
+        let mut buffer = Vec::with_capacity(capacity);
+        buffer.resize(capacity, 0);
+        let len = read_fully(&mut inner, buffer.as_mut_slice())?;
+        let mut detector = EncodingDetector::new();
+        let state = if len < buffer.len() {
+            detector.feed(&buffer[..len], true);
+            State::Decode(detector.guess(None, true).new_decoder_with_bom_removal())
+        } else {
+            let ascii = feed(&mut detector, &buffer[..len], false);
+            State::Auto {
+                detector,
+                back: 0,
+                front: len,
+                ascii,
+            }
+        };
+        Ok(Self {
+            inner,
+            buffer: buffer.into_boxed_slice(),
+            state,
+        })
+    }
+}
+
+impl<R> Read for Autodecode<R>
+where
+    R: Read,
+{
+    fn read(&mut self, outbuf: &mut [u8]) -> Result<usize> {
+        let mut buffer = self.fill_buf()?;
+        let n = buffer.read(outbuf)?;
+        self.consume(n);
+        Ok(n)
+    }
+}
+
+impl<R> BufRead for Autodecode<R>
+where
+    R: Read,
+{
+    /// Returns the bytes that are currently available to the caller.
+    ///
+    /// While the encoding is unknown ([`State::Auto`]), this hands out only
+    /// the part of the buffer that `feed` reported as ASCII.  Once that part
+    /// is used up, it reads more input, asks the detector for a guess, and
+    /// switches to [`State::Decode`].
+    ///
+    /// NOTE(review): the [`State::Decode`] arm is still `todo!()`, so every
+    /// path that switches state and then calls `self.fill_buf()` again
+    /// currently panics.  `State::Decode` also carries no buffer positions,
+    /// so the extent of the data left in `self.buffer` is dropped at the
+    /// switch -- presumably to be added; confirm.
+    ///
+    /// # Errors
+    ///
+    /// Returns any error reported by `read_fully` while refilling the buffer.
+    fn fill_buf(&mut self) -> Result<&[u8]> {
+        match &mut self.state {
+            State::Auto {
+                detector,
+                back,
+                front,
+                ascii,
+            } => {
+                if back < ascii {
+                    // Consume data up to the first non-ASCII byte.
+                    Ok(&self.buffer[*back..*ascii])
+                } else if ascii < front {
+                    // We had a non-ASCII byte and we consumed everything up to
+                    // it.  We want to get a full buffer starting at the
+                    // non-ASCII byte before we decide on the encoding.
+                    debug_assert_eq!(ascii, back);
+
+                    // Shift buffered data to the beginning of the buffer to
+                    // make room to get a full buffer.
+                    self.buffer.copy_within(*back..*front, 0);
+                    *front -= *back;
+                    *back = 0;
+                    *ascii = 0;
+
+                    // Fill up the remainder of the buffer.
+                    let old_front = *front;
+                    *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?;
+                    // Only the newly read bytes are fed to the detector here.
+                    // A short read means end of input, so it is flagged as
+                    // the last chunk.
+                    detector.feed(&self.buffer[old_front..*front], *front < self.buffer.len());
+                    self.state = State::Decode(
+                        detector.guess(None, true).new_decoder_with_bom_removal(),
+                    );
+                    self.fill_buf()
+                } else {
+                    // We have not had a non-ASCII byte yet but we consumed the
+                    // whole buffer. Read a new one.
+                    *back = 0;
+                    *front = 0;
+                    *ascii = 0;
+                    *front += read_fully(&mut self.inner, &mut self.buffer[*front..])?;
+                    // A short read means that `inner` reached end of input.
+                    let eof = *front < self.buffer.len();
+                    *ascii = feed(detector, &self.buffer[..*front], eof);
+                    // Decide on the encoding now if there is no more input to
+                    // wait for, or if the fresh buffer has no ASCII data at
+                    // its start to hand out first.
+                    if eof || *ascii == 0 {
+                        self.state = State::Decode(
+                            detector.guess(None, true).new_decoder_with_bom_removal(),
+                        );
+                        self.fill_buf()
+                    } else {
+                        Ok(&self.buffer[..*ascii])
+                    }
+                }
+            }
+            // Not implemented yet: reading once the encoding is known.
+            State::Decode(_) => todo!(),
+        }
+    }
+
+    /// Marks `n` bytes of the slice returned by `fill_buf` as used.
+    ///
+    /// Not implemented yet: calling this always panics.
+    fn consume(&mut self, n: usize) {
+        todo!()
+    }
+}
+
+fn feed(detector: &mut EncodingDetector, buffer: &[u8], last: bool) -> usize {
+    if detector.feed(buffer, last) {
+        Encoding::ascii_valid_up_to(buffer)
+    } else {
+        buffer.len()
+    }
+}
+/*
+                    } else {
+                        debug_assert_eq!(ascii, back);
+                        debug_assert_eq!(back, front);
+                        *back = 0;
+                        *front = 0;
+                        *ascii = 0;
+                        *front += read_fully(&mut self.inner, &mut self.buffer[..])?;
+                        *ascii = feed(detector, &self.buffer[..*front], *front < self.buffer.len());
+                        Ok(&self.buffer[*back..*ascii])
+                    }
+*/
diff --git a/rust/src/lex/lexer.rs b/rust/src/lex/lexer.rs
new file mode 100644 (file)
index 0000000..d5728d5
--- /dev/null
@@ -0,0 +1,52 @@
+use std::io::Read;
+
+use encoding_rs::Encoding;
+
+use crate::prompt::PromptStyle;
+
+use super::segment::Mode;
+
+/// Error handling for a [`Reader`].
+pub enum ErrorHandling {
+    /// Discard input line and continue reading.
+    Terminal,
+
+    /// Continue to next command, except for cascading failures.
+    Continue,
+
+    /// Continue, even for cascading failures.
+    Ignore,
+
+    /// Stop processing.
+    Stop,
+}
+
+/// Reads a single syntax file as a stream of bytes encoded in UTF-8.
+pub struct Reader {
+    /// Segmentation mode.
+    mode: Mode,
+
+    /// Error-handling mode.
+    error_handling: ErrorHandling,
+
+    /// Encoding (although the reader must always produce UTF-8).
+    encoding: &'static Encoding,
+
+    /// `None` if this reader is not associated with a file.
+    file_name: Option<String>,
+
+    /// Zero if there's no line number.
+    line_number: u32,
+
+    /// True if we've reached EOF already.
+    eof: bool,
+
+    /// Reads UTF-8 bytes.
+    ///
+    /// NOTE(review): a bare `dyn LexRead` field makes `Reader` a dynamically
+    /// sized type, which cannot be constructed or held by value in the usual
+    /// way.  Presumably `Box<dyn LexRead>` is intended -- confirm.
+    reader: dyn LexRead,
+}
+
+/// A byte source for a [`Reader`]: any [`Read`] implementation that can also
+/// be told which kind of prompt to show before its next read.
+pub trait LexRead: Read {
+    /// Tells the reader what kind of prompt is appropriate for the next
+    /// read. Non-interactive readers can ignore this.
+    ///
+    /// The default implementation does nothing.
+    fn set_prompt_style(&mut self, _prompt: PromptStyle) {}
+}
index 732cf3a09c6d2fa571b8c247545edf8a50e64339..e87b088cf443b74b0d0cde3e93e2424fab363902 100644 (file)
@@ -14,3 +14,4 @@ pub mod segment;
 pub mod scan;
 pub mod command_name;
 pub mod token;
+pub mod lexer;
index 46fe08622ad6346cf731e7a42a817696f1617292..32d508d6ac94427d89ab51215fc17b8040398f2a 100644 (file)
@@ -13,3 +13,4 @@ pub mod lex;
 pub mod prompt;
 pub mod message;
 pub mod macros;
+pub mod autodecode;