From: Ben Pfaff Date: Sat, 21 Dec 2024 01:17:32 +0000 (-0800) Subject: dictionary decoding! untested though X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0432c6e76ef1fd2f5c79e0450cfe077001c8a1d4;p=pspp dictionary decoding! untested though --- diff --git a/rust/pspp/src/cooked.rs b/rust/pspp/src/cooked.rs index 8b3c3d07cd..7bfba78ecb 100644 --- a/rust/pspp/src/cooked.rs +++ b/rust/pspp/src/cooked.rs @@ -1,7 +1,7 @@ use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; use crate::{ - dictionary::{Dictionary, EncodedString, Value, VarWidth, Variable}, + dictionary::{Dictionary, Value, VarWidth, Variable}, encoding::Error as EncodingError, endian::Endian, format::{Error as FormatError, Format, UncheckedFormat}, @@ -339,6 +339,7 @@ impl Headers { } } +#[derive(Debug)] pub struct Metadata { creation: NaiveDateTime, endian: Endian, @@ -555,10 +556,9 @@ pub fn decode( for ValueLabel { value, label } in record.labels.iter().cloned() { let value = match value { raw::Value::Number(number) => Value::Number(number.map(|n| n.into())), - raw::Value::String(string) => Value::String(EncodedString::from_raw( - &string.0[..variable.width.as_string_width().unwrap()], - encoding, - )), + raw::Value::String(string) => { + string.0[..variable.width.as_string_width().unwrap()].into() + } }; } } diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 8ca82db21a..d4fdd6c458 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -1,14 +1,13 @@ use core::str; use std::{ - borrow::Cow, cmp::Ordering, collections::{HashMap, HashSet}, fmt::Debug, - hash::{Hash, Hasher}, + hash::Hash, ops::{Bound, RangeBounds}, }; -use encoding_rs::{Encoding, UTF_8}; +use encoding_rs::Encoding; use indexmap::IndexSet; use num::integer::div_ceil; use ordered_float::OrderedFloat; @@ -117,10 +116,10 @@ impl From for VarType { } } -#[derive(Debug)] +#[derive(Clone, Debug)] pub enum Value { Number(Option), - String(ValueString), + String(Box<[u8]>), } impl PartialEq for Value { @@ -130,8 +129,8 @@ impl PartialEq for Value { OrderedFloat(*l0) == OrderedFloat(*r0) } (Self::Number(None), Self::Number(None)) => true, - (Self::Number(_), Self::Number(_)) => false, (Self::String(l0), Self::String(r0)) => l0 == r0, + _ => false, } } } @@ -160,187 +159,33 @@ impl Ord for Value { } } -impl Hash for Value { - fn hash(&self, state: &mut H) - where - H: Hasher, - { - match self { - Value::Number(Some(a)) => OrderedFloat(*a).hash(state), - Value::Number(None) => (), - Value::String(string) => string.hash(state), - } - } -} - -impl Clone for Value { - fn clone(&self) -> Self { - match self { - Self::Number(number) => Self::Number(*number), - Self::String(string) => Self::String(string.clone_boxed()), - } - } -} - impl Value { fn sysmis() -> Self { Self::Number(None) } - - fn for_string(s: S) -> Self - where - S: AsRef, - { - Self::String(ValueString::new(s)) - } } impl From for Value { fn from(value: f64) -> Self { - Self::Number(Some(value.into())) - } -} - -#[derive(Debug)] -pub struct ValueString { - nonutf8: Option>, - utf8: Box -} - -impl ValueString { - fn clone_boxed(&self) -> Box { - Box::new(ValueString { - nonutf8: self.nonutf8.map(|s| s.clone_boxed()), - utf8: self.utf8, - }) - } - - fn new(s: S) -> Box - where - S: AsRef, - { - Box::new(Self { - nonutf8: None, - utf8: s, - }) - } - - fn new_encoded(s: &[u8], encoding: &'static Encoding) -> Box { - if encoding == &UTF_8 { - if let Some(utf8) = str::from_utf8(s) { - return Self::new(utf8); - } - } - todo!() - } -} - -impl PartialEq for ValueString { - fn eq(&self, other: &Self) -> bool { - self.utf8 == other.utf8 - } -} - -impl Eq for ValueString {} - -impl PartialOrd for ValueString { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) + Some(value).into() } } -impl Ord for ValueString { - fn cmp(&self, other: &Self) -> Ordering { - self.utf8.cmp(&other.utf8) - } -} - -impl Hash for ValueString { - fn hash(&self, state: &mut H) - where - H: Hasher, - { - self.utf8.hash(state); - } -} - -#[derive(Debug, Hash)] -pub struct EncodedString { - encoding: &'static Encoding, - s: Box<[u8]>, -} - -impl PartialEq for EncodedString { - fn eq(&self, other: &Self) -> bool { - self.as_str().eq(&other.as_str()) - } -} - -impl Eq for EncodedString {} - -impl PartialOrd for EncodedString { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for EncodedString { - fn cmp(&self, other: &Self) -> Ordering { - self.as_str().cmp(&other.as_str()) +impl From> for Value { + fn from(value: Option) -> Self { + Self::Number(value) } } -impl EncodedString { - fn clone_boxed(&self) -> Box { - todo!() - } - fn as_str(&self) -> EncodedStr { - EncodedStr { - s: &*self.s, - encoding: self.encoding, - } +impl From<&str> for Value { + fn from(value: &str) -> Self { + value.as_bytes().into() } } -#[derive(Clone, Debug, Hash)] -pub struct EncodedStr<'a> { - s: &'a [u8], - encoding: &'static Encoding, -} - -impl<'a> PartialOrd for EncodedStr<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl<'a> Ord for EncodedStr<'a> { - fn cmp(&self, other: &Self) -> Ordering { - if self.encoding == other.encoding { - self.s.cmp(&other.s) - } else { - // Get an arbitary but stable ordering for strings with different - // encodings. It would be nice to do something like - // `self.as_utf8().partial_cmp(other.as_utf8())` but it's likely that - // this would violate transitivity. - let this = self.encoding as *const Encoding; - let other = other.encoding as *const Encoding; - this.cmp(&other) - } - } -} - -impl<'a> Eq for EncodedStr<'a> {} - -impl<'a> EncodedStr<'a> { - fn as_utf8(&self) -> Cow<'a, str> { - self.encoding.decode_without_bom_handling(self.s).0 - } -} - -impl<'a> PartialEq for EncodedStr<'a> { - fn eq(&self, other: &Self) -> bool { - self.encoding == other.encoding && self.s == other.s +impl From<&[u8]> for Value { + fn from(value: &[u8]) -> Self { + Self::String(value.into()) } } diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index a3b3145bed..5fb57135f7 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -17,6 +17,7 @@ use anyhow::Result; use clap::{Parser, ValueEnum}; use encoding_rs::Encoding; +use pspp::cooked::{decode, Headers}; use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record}; use std::fs::File; use std::io::BufReader; @@ -140,14 +141,20 @@ fn dissect( } } Mode::Cooked => { - /* - let headers: Vec = reader.collect::, _>>()?; - let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?; - let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?; - for header in headers { - println!("{header:?}"); + let headers: Vec = reader.collect::, _>>()?; + let encoding = match encoding { + Some(encoding) => encoding, + None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?, + }; + let decoder = Decoder::new(encoding, |e| eprintln!("{e}")); + let mut decoded_records = Vec::new(); + for header in headers { + decoded_records.push(header.decode(&decoder)?); } - */ + let headers = Headers::new(decoded_records, &|e| eprintln!("{e}"))?; + let (dictionary, metadata) = decode(headers, encoding, |e| eprintln!("{e}"))?; + println!("{dictionary:?}"); + println!("{metadata:?}"); } }