From 5f214768d311a2a44f103099f42bba719a88ad12 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 20 Dec 2024 08:58:34 -0800 Subject: [PATCH] work on sysfiles --- rust/pspp-derive/src/lib.rs | 4 +- rust/pspp/src/cooked.rs | 17 ++- rust/pspp/src/dictionary.rs | 240 ++++++++++++++++++++++++++++++++++-- rust/pspp/src/encoding.rs | 31 +++++ 4 files changed, 277 insertions(+), 15 deletions(-) diff --git a/rust/pspp-derive/src/lib.rs b/rust/pspp-derive/src/lib.rs index d13b46b708..28438ad9bf 100644 --- a/rust/pspp-derive/src/lib.rs +++ b/rust/pspp-derive/src/lib.rs @@ -75,7 +75,7 @@ fn derive_enum(ast: &DeriveInput, e: &DataEnum) -> Result { } } }; - println!("{output}"); + //println!("{output}"); Ok(output) } @@ -145,7 +145,7 @@ fn derive_struct(ast: &DeriveInput, s: &DataStruct) -> Result Identifier { loop { self.n_generated_names += 1; - let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding) - .unwrap(); + let name = Identifier::from_encoding( + &format!("VAR{:03}", self.n_generated_names), + self.encoding, + ) + .unwrap(); if !dictionary.variables.contains(&name.0) { return name; } @@ -550,7 +553,13 @@ pub fn decode( for dict_index in dict_indexes { let mut variable = &dictionary.variables[dict_index]; for ValueLabel { value, label } in record.labels.iter().cloned() { - + let value = match value { + raw::Value::Number(number) => Value::Number(number.map(|n| n.into())), + raw::Value::String(string) => Value::String(EncodedString::from_raw( + &string.0[..variable.width.as_string_width().unwrap()], + encoding, + )), + }; } } } diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index c26009921b..8ca82db21a 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -1,11 +1,14 @@ +use core::str; use std::{ + borrow::Cow, cmp::Ordering, collections::{HashMap, HashSet}, fmt::Debug, + hash::{Hash, Hasher}, ops::{Bound, RangeBounds}, }; -use encoding_rs::Encoding; +use encoding_rs::{Encoding, 
UTF_8}; use indexmap::IndexSet; use num::integer::div_ceil; use ordered_float::OrderedFloat; @@ -14,7 +17,7 @@ use unicase::UniCase; use crate::{ format::Format, identifier::{ByIdentifier, HasIdentifier, Identifier}, - raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType}, + raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType}, }; pub type DictIndex = usize; @@ -96,6 +99,13 @@ impl VarWidth { false } } + + pub fn as_string_width(&self) -> Option { + match self { + VarWidth::Numeric => None, + VarWidth::String(width) => Some(*width as usize), + } + } } impl From for VarType { @@ -107,18 +117,230 @@ impl From for VarType { } } -#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Debug)] pub enum Value { - Number(Option>), - String(String), + Number(Option), + String(ValueString), +} + +impl PartialEq for Value { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Number(Some(l0)), Self::Number(Some(r0))) => { + OrderedFloat(*l0) == OrderedFloat(*r0) + } + (Self::Number(None), Self::Number(None)) => true, + (Self::Number(_), Self::Number(_)) => false, + (Self::String(l0), Self::String(r0)) => l0 == r0, + } + } +} + +impl Eq for Value {} + +impl PartialOrd for Value { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Value { + fn cmp(&self, other: &Self) -> Ordering { + match (self, other) { + (Value::Number(a), Value::Number(b)) => match (a, b) { + (None, None) => Ordering::Equal, + (None, Some(_)) => Ordering::Less, + (Some(_), None) => Ordering::Greater, + (Some(a), Some(b)) => a.total_cmp(b), + }, + (Value::Number(_), Value::String(_)) => Ordering::Less, + (Value::String(_), Value::Number(_)) => Ordering::Greater, + (Value::String(a), Value::String(b)) => a.cmp(b), + } + } +} + +impl Hash for Value { + fn hash(&self, state: &mut H) + where + H: Hasher, + { + match self { + Value::Number(Some(a)) => OrderedFloat(*a).hash(state), + 
Value::Number(None) => (), + Value::String(string) => string.hash(state), + } + } +} + +impl Clone for Value { + fn clone(&self) -> Self { + match self { + Self::Number(number) => Self::Number(*number), + Self::String(string) => Self::String(string.clone_boxed()), + } + } } impl Value { - pub fn decode(raw: &raw::Value>, decoder: &Decoder) -> Self { - match raw { - raw::Value::Number(x) => Value::Number(x.map(|x| x.into())), - raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), + fn sysmis() -> Self { + Self::Number(None) + } + + fn for_string(s: S) -> Self + where + S: AsRef, + { + Self::String(ValueString::new(s)) + } +} + +impl From for Value { + fn from(value: f64) -> Self { + Self::Number(Some(value.into())) + } +} + +#[derive(Debug)] +pub struct ValueString { + nonutf8: Option>, + utf8: Box +} + +impl ValueString { + fn clone_boxed(&self) -> Box { + Box::new(ValueString { + nonutf8: self.nonutf8.map(|s| s.clone_boxed()), + utf8: self.utf8, + }) + } + + fn new(s: S) -> Box + where + S: AsRef, + { + Box::new(Self { + nonutf8: None, + utf8: s, + }) + } + + fn new_encoded(s: &[u8], encoding: &'static Encoding) -> Box { + if encoding == &UTF_8 { + if let Some(utf8) = str::from_utf8(s) { + return Self::new(utf8); + } } + todo!() + } +} + +impl PartialEq for ValueString { + fn eq(&self, other: &Self) -> bool { + self.utf8 == other.utf8 + } +} + +impl Eq for ValueString {} + +impl PartialOrd for ValueString { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ValueString { + fn cmp(&self, other: &Self) -> Ordering { + self.utf8.cmp(&other.utf8) + } +} + +impl Hash for ValueString { + fn hash(&self, state: &mut H) + where + H: Hasher, + { + self.utf8.hash(state); + } +} + +#[derive(Debug, Hash)] +pub struct EncodedString { + encoding: &'static Encoding, + s: Box<[u8]>, +} + +impl PartialEq for EncodedString { + fn eq(&self, other: &Self) -> bool { + self.as_str().eq(&other.as_str()) + } +} + 
+impl Eq for EncodedString {} + +impl PartialOrd for EncodedString { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for EncodedString { + fn cmp(&self, other: &Self) -> Ordering { + self.as_str().cmp(&other.as_str()) + } +} + +impl EncodedString { + fn clone_boxed(&self) -> Box { + todo!() + } + fn as_str(&self) -> EncodedStr { + EncodedStr { + s: &*self.s, + encoding: self.encoding, + } + } +} + +#[derive(Clone, Debug, Hash)] +pub struct EncodedStr<'a> { + s: &'a [u8], + encoding: &'static Encoding, +} + +impl<'a> PartialOrd for EncodedStr<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> Ord for EncodedStr<'a> { + fn cmp(&self, other: &Self) -> Ordering { + if self.encoding == other.encoding { + self.s.cmp(&other.s) + } else { + // Get an arbitrary but stable ordering for strings with different + // encodings. It would be nice to do something like + // `self.as_utf8().partial_cmp(other.as_utf8())` but it's likely that + // this would violate transitivity. 
+ let this = self.encoding as *const Encoding; + let other = other.encoding as *const Encoding; + this.cmp(&other) + } + } +} + +impl<'a> Eq for EncodedStr<'a> {} + +impl<'a> EncodedStr<'a> { + fn as_utf8(&self) -> Cow<'a, str> { + self.encoding.decode_without_bom_handling(self.s).0 + } +} + +impl<'a> PartialEq for EncodedStr<'a> { + fn eq(&self, other: &Self) -> bool { + self.encoding == other.encoding && self.s == other.s } } diff --git a/rust/pspp/src/encoding.rs b/rust/pspp/src/encoding.rs index aaed5fd4ca..c408bf56fa 100644 --- a/rust/pspp/src/encoding.rs +++ b/rust/pspp/src/encoding.rs @@ -62,3 +62,34 @@ pub fn get_encoding( Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into())) } + +/* +#[cfg(test)] +mod tests { + use std::thread::spawn; + + use encoding_rs::{EUC_JP, UTF_8, WINDOWS_1252}; + + #[test] + fn round_trip() { + let mut threads = Vec::new(); + for thread in 0..128 { + let start: u32 = thread << 25; + let end = start + ((1 << 25) - 1); + threads.push(spawn(move || { + for i in start..=end { + let s = i.to_le_bytes(); + let (utf8, replacement) = EUC_JP.decode_without_bom_handling(&s); + if !replacement { + let s2 = UTF_8.encode(&utf8).0; + assert_eq!(s.as_slice(), &*s2); + } + } + })); + } + for thread in threads { + thread.join().unwrap(); + } + } +} +*/ -- 2.30.2