use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
use crate::{
- dictionary::{Dictionary, VarWidth, Variable},
+ dictionary::{Dictionary, EncodedString, Value, VarWidth, Variable},
encoding::Error as EncodingError,
endian::Endian,
format::{Error as FormatError, Format, UncheckedFormat},
fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
loop {
self.n_generated_names += 1;
- let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding)
- .unwrap();
+ let name = Identifier::from_encoding(
+ &format!("VAR{:03}", self.n_generated_names),
+ self.encoding,
+ )
+ .unwrap();
if !dictionary.variables.contains(&name.0) {
return name;
}
for dict_index in dict_indexes {
let mut variable = &dictionary.variables[dict_index];
for ValueLabel { value, label } in record.labels.iter().cloned() {
-
+ let value = match value {
+ raw::Value::Number(number) => Value::Number(number.map(|n| n.into())),
+ raw::Value::String(string) => Value::String(EncodedString::from_raw(
+ &string.0[..variable.width.as_string_width().unwrap()],
+ encoding,
+ )),
+ };
}
}
}
+use core::str;
use std::{
+ borrow::Cow,
cmp::Ordering,
collections::{HashMap, HashSet},
fmt::Debug,
+ hash::{Hash, Hasher},
ops::{Bound, RangeBounds},
};
-use encoding_rs::Encoding;
+use encoding_rs::{Encoding, UTF_8};
use indexmap::IndexSet;
use num::integer::div_ceil;
use ordered_float::OrderedFloat;
use crate::{
format::Format,
identifier::{ByIdentifier, HasIdentifier, Identifier},
- raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
+ raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType},
};
pub type DictIndex = usize;
false
}
}
+
+    /// Returns the width of a string variable as a `usize`, or `None` when
+    /// `self` is `VarWidth::Numeric`.
+    pub fn as_string_width(&self) -> Option<usize> {
+        match *self {
+            VarWidth::String(w) => Some(w as usize),
+            VarWidth::Numeric => None,
+        }
+    }
}
impl From<VarWidth> for VarType {
}
}
-#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+/// A single data value: either a number or a string.
+///
+/// Only `Debug` is derived; the comparison, hashing and cloning traits are
+/// implemented by hand for this type.
+#[derive(Debug)]
pub enum Value {
- Number(Option<OrderedFloat<f64>>),
- String(String),
+ /// A numeric value. `None` is the value that `Value::sysmis` produces.
+ Number(Option<f64>),
+ /// A string value.
+ String(ValueString),
+}
+
+/// Equality for [`Value`].
+///
+/// Numbers are compared through [`OrderedFloat`], which gives `f64` the
+/// reflexive equality that the `Eq` marker below promises.  A number is never
+/// equal to a string.
+impl PartialEq for Value {
+    fn eq(&self, other: &Self) -> bool {
+        match (self, other) {
+            (Self::Number(Some(l0)), Self::Number(Some(r0))) => {
+                OrderedFloat(*l0) == OrderedFloat(*r0)
+            }
+            (Self::Number(None), Self::Number(None)) => true,
+            (Self::Number(_), Self::Number(_)) => false,
+            (Self::String(l0), Self::String(r0)) => l0 == r0,
+            // Values of different kinds are never equal.  These arms are also
+            // required for the match to be exhaustive.
+            (Self::Number(_), Self::String(_)) | (Self::String(_), Self::Number(_)) => false,
+        }
+    }
+}
+
+impl Eq for Value {}
+
+impl PartialOrd for Value {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// Total order for [`Value`].
+///
+/// Every number sorts before every string, and a `None` number sorts before
+/// every `Some` number.
+impl Ord for Value {
+    fn cmp(&self, other: &Self) -> Ordering {
+        match (self, other) {
+            (Value::Number(a), Value::Number(b)) => match (a, b) {
+                (None, None) => Ordering::Equal,
+                (None, Some(_)) => Ordering::Less,
+                (Some(_), None) => Ordering::Greater,
+                // Compare through `OrderedFloat`, exactly as `PartialEq` and
+                // `Hash` do.  `f64::total_cmp` would tell `-0.0` from `+0.0`
+                // and one NaN from another, so `cmp` could return something
+                // other than `Equal` for values that `eq` reports as equal.
+                (Some(a), Some(b)) => OrderedFloat(*a).cmp(&OrderedFloat(*b)),
+            },
+            (Value::Number(_), Value::String(_)) => Ordering::Less,
+            (Value::String(_), Value::Number(_)) => Ordering::Greater,
+            (Value::String(a), Value::String(b)) => a.cmp(b),
+        }
+    }
+}
+
+/// Hashing for [`Value`].
+///
+/// Numbers are hashed through [`OrderedFloat`]; strings defer to the hash of
+/// the contained string.
+impl Hash for Value {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        match self {
+            Value::String(string) => string.hash(state),
+            Value::Number(number) => {
+                // A `None` number feeds nothing into the hasher.
+                if let Some(x) = number {
+                    OrderedFloat(*x).hash(state);
+                }
+            }
+        }
+    }
+}
+
+/// Cloning for [`Value`]: numbers are copied, strings are deep-copied.
+impl Clone for Value {
+    fn clone(&self) -> Self {
+        match self {
+            Self::Number(number) => Self::Number(*number),
+            // `clone_boxed` returns a `Box<ValueString>`, but the variant
+            // stores the `ValueString` by value, so move it out of the box.
+            Self::String(string) => Self::String(*string.clone_boxed()),
+        }
+    }
}
impl Value {
- pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
- match raw {
- raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
- raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
+    /// Returns the numeric value that carries no number, `Number(None)`.
+    fn sysmis() -> Self {
+        Self::Number(None)
+    }
+
+    /// Builds a string value from the text `s`.
+    fn for_string<S>(s: S) -> Self
+    where
+        S: AsRef<str>,
+    {
+        // `ValueString::new` returns a `Box<ValueString>`, but the variant
+        // stores the `ValueString` by value, so move it out of the box.
+        Self::String(*ValueString::new(s))
+    }
+}
+
+/// Wraps a plain `f64` as a present (`Some`) numeric value.
+impl From<f64> for Value {
+    fn from(value: f64) -> Self {
+        // The variant holds a bare `f64`, so no conversion is needed.
+        Self::Number(Some(value))
+    }
+}
+
+/// A string value held as UTF-8 text, optionally alongside an
+/// `EncodedString`.
+///
+/// The hand-written `PartialEq`, `Ord` and `Hash` impls for this type look
+/// only at `utf8`.
+#[derive(Debug)]
+pub struct ValueString {
+ /// `None` when the value was built by `ValueString::new`.
+ // NOTE(review): presumably the original bytes when the source encoding is
+ // not UTF-8; confirm once `new_encoded` is finished.
+ nonutf8: Option<Box<EncodedString>>,
+ /// The UTF-8 text; the only field used for comparison and hashing.
+ utf8: Box<str>
+}
+
+impl ValueString {
+    /// Returns a heap-allocated deep copy of `self`.
+    fn clone_boxed(&self) -> Box<Self> {
+        Box::new(ValueString {
+            // Borrow the option first: `Option::map` consumes its receiver,
+            // and the field cannot be moved out from behind `&self`.
+            nonutf8: self.nonutf8.as_ref().map(|s| s.clone_boxed()),
+            utf8: self.utf8.clone(),
+        })
+    }
+
+    /// Creates a string value from the text `s`, with `nonutf8` left unset.
+    fn new<S>(s: S) -> Box<Self>
+    where
+        S: AsRef<str>,
+    {
+        Box::new(Self {
+            nonutf8: None,
+            // Copy the borrowed text into an owned `Box<str>`.
+            utf8: s.as_ref().into(),
+        })
+    }
+
+    /// Creates a string value from the bytes `s`, interpreted in `encoding`.
+    ///
+    /// Only input that is labelled UTF-8 and is valid UTF-8 is handled so far.
+    ///
+    /// # Panics
+    ///
+    /// Panics via `todo!()` for every other input.
+    fn new_encoded(s: &[u8], encoding: &'static Encoding) -> Box<Self> {
+        // `UTF_8` is itself a `&'static Encoding`, so compare it directly.
+        if encoding == UTF_8 {
+            // `str::from_utf8` returns a `Result`, so match on `Ok`.
+            if let Ok(utf8) = str::from_utf8(s) {
+                return Self::new(utf8);
+            }
}
+        todo!()
+    }
+}
+
+/// Two [`ValueString`]s are equal when their UTF-8 text is equal; `nonutf8`
+/// is not consulted.
+impl PartialEq for ValueString {
+    fn eq(&self, other: &Self) -> bool {
+        let lhs: &str = &self.utf8;
+        let rhs: &str = &other.utf8;
+        lhs == rhs
+    }
+}
+
+impl Eq for ValueString {}
+
+impl PartialOrd for ValueString {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        let ordering = Ord::cmp(self, other);
+        Some(ordering)
+    }
+}
+
+/// [`ValueString`]s are ordered by their UTF-8 text alone.
+impl Ord for ValueString {
+    fn cmp(&self, other: &Self) -> Ordering {
+        let lhs: &str = &self.utf8;
+        let rhs: &str = &other.utf8;
+        lhs.cmp(rhs)
+    }
+}
+
+/// Hashes only the UTF-8 text, matching the equality impl for this type.
+impl Hash for ValueString {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let text: &str = &self.utf8;
+        text.hash(state);
+    }
+}
+
+/// An owned byte string stored together with an encoding.
+///
+/// Comparison is implemented by hand and goes through `as_str`.
+#[derive(Debug, Hash)]
+pub struct EncodedString {
+ /// The encoding paired with `s`.
+ encoding: &'static Encoding,
+ /// The string's bytes.
+ s: Box<[u8]>,
+}
+
+/// Equality is delegated to the borrowed [`EncodedStr`] view of each side.
+impl PartialEq for EncodedString {
+    fn eq(&self, other: &Self) -> bool {
+        let (lhs, rhs) = (self.as_str(), other.as_str());
+        lhs == rhs
+    }
+}
+
+impl Eq for EncodedString {}
+
+impl PartialOrd for EncodedString {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        let ordering = Ord::cmp(self, other);
+        Some(ordering)
+    }
+}
+
+/// Ordering is delegated to the borrowed [`EncodedStr`] view of each side.
+impl Ord for EncodedString {
+    fn cmp(&self, other: &Self) -> Ordering {
+        let (lhs, rhs) = (self.as_str(), other.as_str());
+        lhs.cmp(&rhs)
+    }
+}
+
+impl EncodedString {
+    /// Returns a heap-allocated copy of `self` with the same encoding and a
+    /// fresh copy of the bytes.
+    fn clone_boxed(&self) -> Box<Self> {
+        Box::new(Self {
+            encoding: self.encoding,
+            s: self.s.clone(),
+        })
+    }
+
+    /// Borrows `self` as an [`EncodedStr`] over the same bytes and encoding.
+    fn as_str(&self) -> EncodedStr<'_> {
+        EncodedStr {
+            s: &self.s,
+            encoding: self.encoding,
+        }
+    }
+}
+
+/// A borrowed byte string stored together with an encoding.
+#[derive(Clone, Debug, Hash)]
+pub struct EncodedStr<'a> {
+ /// The string's bytes.
+ s: &'a [u8],
+ /// The encoding that `as_utf8` uses to decode `s`.
+ encoding: &'static Encoding,
+}
+
+impl<'a> PartialOrd for EncodedStr<'a> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        let ordering = Ord::cmp(self, other);
+        Some(ordering)
+    }
+}
+
+/// Strings in the same encoding are ordered by their bytes; strings in
+/// different encodings are ordered by the address of their encoding.
+impl<'a> Ord for EncodedStr<'a> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        if self.encoding != other.encoding {
+            // Get an arbitrary but stable ordering for strings with different
+            // encodings.  Comparing the decoded text instead, along the lines
+            // of `self.as_utf8().cmp(&other.as_utf8())`, would be nicer, but
+            // it is likely that this would violate transitivity.
+            let lhs_encoding = self.encoding as *const Encoding;
+            let rhs_encoding = other.encoding as *const Encoding;
+            return lhs_encoding.cmp(&rhs_encoding);
+        }
+        // Same encoding: compare the raw bytes.
+        Ord::cmp(self.s, other.s)
+    }
+}
+
+impl<'a> Eq for EncodedStr<'a> {}
+
+impl<'a> EncodedStr<'a> {
+    /// Decodes the bytes with `self.encoding` and returns the resulting text.
+    ///
+    /// The second element of the decoder's return value is discarded.
+    fn as_utf8(&self) -> Cow<'a, str> {
+        let (text, _) = self.encoding.decode_without_bom_handling(self.s);
+        text
+    }
+}
+
+/// Two [`EncodedStr`]s are equal only when both the encoding and the bytes
+/// match.
+impl<'a> PartialEq for EncodedStr<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        if self.encoding != other.encoding {
+            return false;
+        }
+        self.s == other.s
}
}