dictionary decoding! untested though
author Ben Pfaff <blp@cs.stanford.edu>
Sat, 21 Dec 2024 01:17:32 +0000 (17:17 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sat, 21 Dec 2024 01:17:32 +0000 (17:17 -0800)
rust/pspp/src/cooked.rs
rust/pspp/src/dictionary.rs
rust/pspp/src/main.rs

index 8b3c3d07cd6be43479019ed01a0be2eaf35ef3b8..7bfba78ecb951a009943c6c502ddc6f006f4c7b2 100644 (file)
@@ -1,7 +1,7 @@
 use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
 
 use crate::{
-    dictionary::{Dictionary, EncodedString, Value, VarWidth, Variable},
+    dictionary::{Dictionary, Value, VarWidth, Variable},
     encoding::Error as EncodingError,
     endian::Endian,
     format::{Error as FormatError, Format, UncheckedFormat},
@@ -339,6 +339,7 @@ impl Headers {
     }
 }
 
+#[derive(Debug)]
 pub struct Metadata {
     creation: NaiveDateTime,
     endian: Endian,
@@ -555,10 +556,9 @@ pub fn decode(
             for ValueLabel { value, label } in record.labels.iter().cloned() {
                 let value = match value {
                     raw::Value::Number(number) => Value::Number(number.map(|n| n.into())),
-                    raw::Value::String(string) => Value::String(EncodedString::from_raw(
-                        &string.0[..variable.width.as_string_width().unwrap()],
-                        encoding,
-                    )),
+                    raw::Value::String(string) => {
+                        string.0[..variable.width.as_string_width().unwrap()].into()
+                    }
                 };
             }
         }
index 8ca82db21aa370ebb4829572449d27955b59aeff..d4fdd6c458e4d30c7e1bd6d848f30268ec928444 100644 (file)
@@ -1,14 +1,13 @@
 use core::str;
 use std::{
-    borrow::Cow,
     cmp::Ordering,
     collections::{HashMap, HashSet},
     fmt::Debug,
-    hash::{Hash, Hasher},
+    hash::Hash,
     ops::{Bound, RangeBounds},
 };
 
-use encoding_rs::{Encoding, UTF_8};
+use encoding_rs::Encoding;
 use indexmap::IndexSet;
 use num::integer::div_ceil;
 use ordered_float::OrderedFloat;
@@ -117,10 +116,10 @@ impl From<VarWidth> for VarType {
     }
 }
 
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub enum Value {
     Number(Option<f64>),
-    String(ValueString),
+    String(Box<[u8]>),
 }
 
 impl PartialEq for Value {
@@ -130,8 +129,8 @@ impl PartialEq for Value {
                 OrderedFloat(*l0) == OrderedFloat(*r0)
             }
             (Self::Number(None), Self::Number(None)) => true,
-            (Self::Number(_), Self::Number(_)) => false,
             (Self::String(l0), Self::String(r0)) => l0 == r0,
+            _ => false,
         }
     }
 }
@@ -160,187 +159,33 @@ impl Ord for Value {
     }
 }
 
-impl Hash for Value {
-    fn hash<H>(&self, state: &mut H)
-    where
-        H: Hasher,
-    {
-        match self {
-            Value::Number(Some(a)) => OrderedFloat(*a).hash(state),
-            Value::Number(None) => (),
-            Value::String(string) => string.hash(state),
-        }
-    }
-}
-
-impl Clone for Value {
-    fn clone(&self) -> Self {
-        match self {
-            Self::Number(number) => Self::Number(*number),
-            Self::String(string) => Self::String(string.clone_boxed()),
-        }
-    }
-}
-
 impl Value {
     fn sysmis() -> Self {
         Self::Number(None)
     }
-
-    fn for_string<S>(s: S) -> Self
-    where
-        S: AsRef<str>,
-    {
-        Self::String(ValueString::new(s))
-    }
 }
 
 impl From<f64> for Value {
     fn from(value: f64) -> Self {
-        Self::Number(Some(value.into()))
-    }
-}
-
-#[derive(Debug)]
-pub struct ValueString {
-    nonutf8: Option<Box<EncodedString>>,
-    utf8: Box<str>
-}
-
-impl ValueString {
-    fn clone_boxed(&self) -> Box<Self> {
-        Box::new(ValueString {
-            nonutf8: self.nonutf8.map(|s| s.clone_boxed()),
-            utf8: self.utf8,
-        })
-    }
-
-    fn new<S>(s: S) -> Box<Self>
-    where
-        S: AsRef<str>,
-    {
-        Box::new(Self {
-            nonutf8: None,
-            utf8: s,
-        })
-    }
-
-    fn new_encoded(s: &[u8], encoding: &'static Encoding) -> Box<Self> {
-        if encoding == &UTF_8 {
-            if let Some(utf8) = str::from_utf8(s) {
-                return Self::new(utf8);
-            }
-        }
-        todo!()
-    }
-}
-
-impl PartialEq for ValueString {
-    fn eq(&self, other: &Self) -> bool {
-        self.utf8 == other.utf8
-    }
-}
-
-impl Eq for ValueString {}
-
-impl PartialOrd for ValueString {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
+        Some(value).into()
     }
 }
 
-impl Ord for ValueString {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.utf8.cmp(&other.utf8)
-    }
-}
-
-impl Hash for ValueString {
-    fn hash<H>(&self, state: &mut H)
-    where
-        H: Hasher,
-    {
-        self.utf8.hash(state);
-    }
-}
-
-#[derive(Debug, Hash)]
-pub struct EncodedString {
-    encoding: &'static Encoding,
-    s: Box<[u8]>,
-}
-
-impl PartialEq for EncodedString {
-    fn eq(&self, other: &Self) -> bool {
-        self.as_str().eq(&other.as_str())
-    }
-}
-
-impl Eq for EncodedString {}
-
-impl PartialOrd for EncodedString {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl Ord for EncodedString {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.as_str().cmp(&other.as_str())
+impl From<Option<f64>> for Value {
+    fn from(value: Option<f64>) -> Self {
+        Self::Number(value)
     }
 }
 
-impl EncodedString {
-    fn clone_boxed(&self) -> Box<Self> {
-        todo!()
-    }
-    fn as_str(&self) -> EncodedStr {
-        EncodedStr {
-            s: &*self.s,
-            encoding: self.encoding,
-        }
+impl From<&str> for Value {
+    fn from(value: &str) -> Self {
+        value.as_bytes().into()
     }
 }
 
-#[derive(Clone, Debug, Hash)]
-pub struct EncodedStr<'a> {
-    s: &'a [u8],
-    encoding: &'static Encoding,
-}
-
-impl<'a> PartialOrd for EncodedStr<'a> {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl<'a> Ord for EncodedStr<'a> {
-    fn cmp(&self, other: &Self) -> Ordering {
-        if self.encoding == other.encoding {
-            self.s.cmp(&other.s)
-        } else {
-            // Get an arbitary but stable ordering for strings with different
-            // encodings.  It would be nice to do something like
-            // `self.as_utf8().partial_cmp(other.as_utf8())` but it's likely that
-            // this would violate transitivity.
-            let this = self.encoding as *const Encoding;
-            let other = other.encoding as *const Encoding;
-            this.cmp(&other)
-        }
-    }
-}
-
-impl<'a> Eq for EncodedStr<'a> {}
-
-impl<'a> EncodedStr<'a> {
-    fn as_utf8(&self) -> Cow<'a, str> {
-        self.encoding.decode_without_bom_handling(self.s).0
-    }
-}
-
-impl<'a> PartialEq for EncodedStr<'a> {
-    fn eq(&self, other: &Self) -> bool {
-        self.encoding == other.encoding && self.s == other.s
+impl From<&[u8]> for Value {
+    fn from(value: &[u8]) -> Self {
+        Self::String(value.into())
     }
 }
 
index a3b3145bedff7c4edbf34bb2fc3440da5c787f43..5fb57135f74232838a3659b9cc165b619ee9222c 100644 (file)
@@ -17,6 +17,7 @@
 use anyhow::Result;
 use clap::{Parser, ValueEnum};
 use encoding_rs::Encoding;
+use pspp::cooked::{decode, Headers};
 use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record};
 use std::fs::File;
 use std::io::BufReader;
@@ -140,14 +141,20 @@ fn dissect(
             }
         }
         Mode::Cooked => {
-            /*
-                let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
-                let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?;
-                let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?;
-                for header in headers {
-                    println!("{header:?}");
+            let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
+            let encoding = match encoding {
+                Some(encoding) => encoding,
+                None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?,
+            };
+            let decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
+            let mut decoded_records = Vec::new();
+            for header in headers {
+                decoded_records.push(header.decode(&decoder)?);
             }
-                */
+            let headers = Headers::new(decoded_records, &|e| eprintln!("{e}"))?;
+            let (dictionary, metadata) = decode(headers, encoding, |e| eprintln!("{e}"))?;
+            println!("{dictionary:?}");
+            println!("{metadata:?}");
         }
     }