From 168475a9c04b570944e03eb7eda05f7dfd946213 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 19 Aug 2023 13:30:22 -0700 Subject: [PATCH] work --- rust/src/cooked.rs | 318 ++++++++++++++++++++++++++++++++----- rust/src/raw.rs | 388 +++++++-------------------------------------- 2 files changed, 334 insertions(+), 372 deletions(-) diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 4b12e8f314..e0a7ef1fea 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -6,7 +6,8 @@ use encoding_rs::Encoding; use crate::{ format::{Spec, UncheckedSpec, Width}, identifier::{Error as IdError, Identifier}, - {endian::Endian, CategoryLabels, Compression}, raw, + raw::{self, MissingValues}, + {endian::Endian, CategoryLabels, Compression}, }; use thiserror::Error as ThisError; @@ -113,7 +114,7 @@ pub struct Variable { pub name: Identifier, pub print_format: Spec, pub write_format: Spec, - //pub missing_values: MissingValues, + pub missing_values: MissingValues, pub label: Option, } @@ -127,40 +128,52 @@ fn decode_format(raw: raw::Spec, name: &str, width: Width) -> Spec { }) } -fn decode_var( - decoder: &mut Decoder, - input: &crate::raw::Variable, - warn: impl Fn(Error), -) -> Result, Error> { - match input.width { - 0..=255 => (), - -1 => return Ok(None), - _ => { - return Err(Error::BadVariableWidth { - offset: input.offset, - width: input.width, - }) - } - }; - let width = input.width as Width; - let name = decoder.decode_string(&input.name.0, &warn); - let name = match Identifier::new(&name, decoder.encoding) { - Ok(name) => { - if !decoder.take_name(&name) { +impl Variable { + pub fn decode( + decoder: &mut Decoder, + input: &crate::raw::Variable, + warn: impl Fn(Error), + ) -> Result, Error> { + match input.width { + 0..=255 => (), + -1 => return Ok(None), + _ => { + return Err(Error::BadVariableWidth { + offset: input.offset, + width: input.width, + }) + } + }; + let width = input.width as Width; + let name = decoder.decode_string(&input.name.0, &warn); + let name = match Identifier::new(&name, decoder.encoding) { + Ok(name) => { + if !decoder.take_name(&name) { + decoder.generate_name() + } else { + name + } + } + Err(error) => { + warn(error.into()); decoder.generate_name() - } else { - name } - } - Err(error) => { - warn(error.into()); - decoder.generate_name() - } - }; - let print_format = decode_format(input.print_format, &name.0, width); - let write_format = decode_format(input.write_format, &name.0, width); - let label = input.label.as_ref().map(|label| decoder.decode_string(&label.0, &warn).into()); - Ok(Some(Variable { width, name, print_format, write_format, label })) + }; + let print_format = decode_format(input.print_format, &name.0, width); + let write_format = decode_format(input.write_format, &name.0, width); + let label = input + .label + .as_ref() + .map(|label| decoder.decode_string(&label.0, &warn).into()); + Ok(Some(Variable { + width, + name, + print_format, + write_format, + missing_values: input.missing_values.clone(), + label, + })) + } } #[derive(Clone)] @@ -183,6 +196,240 @@ impl Decode for Document { pub use crate::raw::FloatInfo; pub use crate::raw::IntegerInfo; +trait TextRecord +where + Self: Sized, +{ + const NAME: &'static str; + fn parse(input: &str, warn: impl Fn(Error)) -> Result; +} + +pub struct VariableSet { + pub name: String, + pub vars: Vec, +} + +impl VariableSet { + fn parse(input: &str) -> Result { + let (name, input) = input.split_once('=').ok_or(Error::TBD)?; + let vars = input.split_ascii_whitespace().map(String::from).collect(); + Ok(VariableSet { + name: name.into(), + vars, + }) + } +} + +pub struct VariableSetRecord(Vec); + +impl TextRecord for VariableSetRecord { + const NAME: &'static str = "variable set"; + fn parse(input: &str, warn: impl Fn(Error)) -> Result { + let mut sets = Vec::new(); + for line in input.lines() { + match VariableSet::parse(line) { + Ok(set) => sets.push(set), + Err(error) => warn(error), + } + } + Ok(VariableSetRecord(sets)) + } +} + +pub struct ProductInfo(pub String); + +impl TextRecord for ProductInfo { + const NAME: &'static str = "extra product info"; + fn parse(input: &str, _warn: impl Fn(Error)) -> Result { + Ok(ProductInfo(input.into())) + } +} + +pub struct LongVariableName { + pub short_name: String, + pub long_name: String, +} + +pub struct LongVariableNameRecord(Vec); + +impl TextRecord for LongVariableNameRecord { + const NAME: &'static str = "long variable names"; + fn parse(input: &str, warn: impl Fn(Error)) -> Result { + let mut names = Vec::new(); + for pair in input.split('\t').filter(|s| !s.is_empty()) { + if let Some((short_name, long_name)) = pair.split_once('=') { + let name = LongVariableName { + short_name: short_name.into(), + long_name: long_name.into(), + }; + names.push(name); + } else { + warn(Error::TBD) + } + } + Ok(LongVariableNameRecord(names)) + } +} + +pub struct VeryLongString { + pub short_name: String, + pub length: usize, +} + +impl VeryLongString { + fn parse(input: &str) -> Result { + let Some((short_name, length)) = input.split_once('=') else { + return Err(Error::TBD); + }; + let length: usize = length.parse().map_err(|_| Error::TBD)?; + Ok(VeryLongString { + short_name: short_name.into(), + length, + }) + } +} + +pub struct VeryLongStringRecord(Vec); + +impl TextRecord for VeryLongStringRecord { + const NAME: &'static str = "very long strings"; + fn parse(input: &str, warn: impl Fn(Error)) -> Result { + let mut very_long_strings = Vec::new(); + for tuple in input + .split('\0') + .map(|s| s.trim_end_matches('\t')) + .filter(|s| !s.is_empty()) + { + match VeryLongString::parse(tuple) { + Ok(vls) => very_long_strings.push(vls), + Err(error) => warn(error), + } + } + Ok(VeryLongStringRecord(very_long_strings)) + } +} + +pub struct Attribute { + pub name: String, + pub values: Vec, +} + +impl Attribute { + fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> { + let Some((name, mut input)) = input.split_once('(') else { + return Err(Error::TBD); + }; + let mut values = Vec::new(); + loop { + let Some((value, rest)) = input.split_once('\n') else { + return Err(Error::TBD); + }; + if let Some(stripped) = value + .strip_prefix('\'') + .and_then(|value| value.strip_suffix('\'')) + { + values.push(stripped.into()); + } else { + warn(Error::TBD); + values.push(value.into()); + } + if let Some(rest) = rest.strip_prefix(')') { + return Ok(( + Attribute { + name: name.into(), + values, + }, + rest, + )); + } + input = rest; + } + } +} + +pub struct AttributeSet(pub Vec); + +impl AttributeSet { + fn parse<'a>( + mut input: &'a str, + sentinel: Option, + warn: &impl Fn(Error), + ) -> Result<(AttributeSet, &'a str), Error> { + let mut attributes = Vec::new(); + let rest = loop { + match input.chars().next() { + None => break input, + c if c == sentinel => break &input[1..], + _ => { + let (attribute, rest) = Attribute::parse(input, &warn)?; + attributes.push(attribute); + input = rest; + } + } + }; + Ok((AttributeSet(attributes), rest)) + } +} + +pub struct FileAttributeRecord(AttributeSet); + +impl TextRecord for FileAttributeRecord { + const NAME: &'static str = "data file attributes"; + fn parse(input: &str, warn: impl Fn(Error)) -> Result { + let (set, rest) = AttributeSet::parse(input, None, &warn)?; + if !rest.is_empty() { + warn(Error::TBD); + } + Ok(FileAttributeRecord(set)) + } +} + +pub struct VarAttributeSet { + pub long_var_name: String, + pub attributes: AttributeSet, +} + +impl VarAttributeSet { + fn parse<'a>( + input: &'a str, + warn: &impl Fn(Error), + ) -> Result<(VarAttributeSet, &'a str), Error> { + let Some((long_var_name, rest)) = input.split_once(':') else { + return Err(Error::TBD); + }; + let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?; + Ok(( + VarAttributeSet { + long_var_name: long_var_name.into(), + attributes, + }, + rest, + )) + } +} + +pub struct VariableAttributeRecord(Vec); + +impl TextRecord for VariableAttributeRecord { + const NAME: &'static str = "variable attributes"; + fn parse(mut input: &str, warn: impl Fn(Error)) -> Result { + let mut var_attribute_sets = Vec::new(); + while !input.is_empty() { + match VarAttributeSet::parse(input, &warn) { + Ok((var_attribute, rest)) => { + var_attribute_sets.push(var_attribute); + input = rest; + } + Err(error) => { + warn(error); + break; + } + } + } + Ok(VariableAttributeRecord(var_attribute_sets)) + } +} + #[derive(Clone, Debug)] pub enum MultipleResponseType { MultipleDichotomy { @@ -202,9 +449,6 @@ pub struct MultipleResponseSet { #[derive(Clone, Debug)] pub struct MultipleResponseRecord(Vec); -#[derive(Clone, Debug)] -pub struct ProductInfo(String); - pub enum Measure { Nominal, Ordinal, diff --git a/rust/src/raw.rs b/rust/src/raw.rs index eda5b5724f..ab3ad844d9 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,6 +1,7 @@ use crate::endian::{Endian, Parse, ToBytes}; use crate::{CategoryLabels, Compression}; +use encoding_rs::mem::decode_latin1; use flate2::read::ZlibDecoder; use num::Integer; use std::borrow::Cow; @@ -157,33 +158,10 @@ impl Record { } } -pub struct FallbackEncoding<'a>(&'a [u8]); - -fn fallback_encode<'a>(s: &'a [u8]) -> Cow<'a, str> { - if let Ok(s) = from_utf8(s) { - s.into() - } else { - let s: String = s.iter().map(|c| char::from(*c)).collect(); - s.into() - } -} - -impl<'a> Debug for FallbackEncoding<'a> { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - if let Ok(s) = from_utf8(self.0) { - let s = s.trim_end(); - write!(f, "\"{s}\"") - } else { - let s: String = self - .0 - .iter() - .map(|c| char::from(*c).escape_default()) - .flatten() - .collect(); - let s = s.trim_end(); - write!(f, "\"{s}\"") - } - } +// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it +// decoded as Latin-1 (actually bytes interpreted as Unicode code points). +fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> { + from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) } #[derive(Clone)] @@ -946,12 +924,12 @@ impl Debug for UntypedValue { }; write!(f, "{number}")?; - let string = fallback_encode(&self.0); + let string = default_decode(&self.0); let string = string .split(|c: char| c == '\0' || c.is_control()) .next() .unwrap(); - write!(f, "/\"{string}\"")?; + write!(f, "{string:?}")?; Ok(()) } } @@ -973,7 +951,7 @@ impl From<&[u8]> for UnencodedString { impl Debug for UnencodedString { fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", FallbackEncoding(self.0.as_slice())) + write!(f, "{:?}", default_decode(self.0.as_slice())) } } @@ -988,7 +966,7 @@ impl From<[u8; N]> for UnencodedStr { impl Debug for UnencodedStr { fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", FallbackEncoding(&self.0)) + write!(f, "{:?}", default_decode(&self.0)) } } @@ -1126,14 +1104,6 @@ impl Document { } } -trait TextRecord -where - Self: Sized, -{ - const NAME: &'static str; - fn parse(input: &str, warn: impl Fn(Error)) -> Result; -} - trait ExtensionRecord where Self: Sized, @@ -1340,15 +1310,6 @@ fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> Ok((string.into(), rest)) } -pub struct ProductInfo(String); - -impl TextRecord for ProductInfo { - const NAME: &'static str = "extra product info"; - fn parse(input: &str, _warn: impl Fn(Error)) -> Result { - Ok(ProductInfo(input.into())) - } -} - #[derive(Clone, Debug)] pub struct VarDisplayRecord(pub Vec); @@ -1369,145 +1330,6 @@ impl ExtensionRecord for VarDisplayRecord { } } -pub struct VariableSet { - pub name: String, - pub vars: Vec, -} - -impl VariableSet { - fn parse(input: &str) -> Result { - let (name, input) = input.split_once('=').ok_or(Error::TBD)?; - let vars = input.split_ascii_whitespace().map(String::from).collect(); - Ok(VariableSet { - name: name.into(), - vars, - }) - } -} - -pub struct VariableSetRecord(Vec); - -impl TextRecord for VariableSetRecord { - const NAME: &'static str = "variable set"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let mut sets = Vec::new(); - for line in input.lines() { - match VariableSet::parse(line) { - Ok(set) => sets.push(set), - Err(error) => warn(error), - } - } - Ok(VariableSetRecord(sets)) - } -} - -pub struct LongVariableName { - pub short_name: String, - pub long_name: String, -} - -pub struct LongVariableNameRecord(Vec); - -impl TextRecord for LongVariableNameRecord { - const NAME: &'static str = "long variable names"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let mut names = Vec::new(); - for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some((short_name, long_name)) = pair.split_once('=') { - let name = LongVariableName { - short_name: short_name.into(), - long_name: long_name.into(), - }; - names.push(name); - } else { - warn(Error::TBD) - } - } - Ok(LongVariableNameRecord(names)) - } -} - -pub struct VeryLongString { - pub short_name: String, - pub length: usize, -} - -impl VeryLongString { - fn parse(input: &str) -> Result { - let Some((short_name, length)) = input.split_once('=') else { - return Err(Error::TBD); - }; - let length: usize = length.parse().map_err(|_| Error::TBD)?; - Ok(VeryLongString { - short_name: short_name.into(), - length, - }) - } -} - -pub struct VeryLongStringRecord(Vec); - -impl TextRecord for VeryLongStringRecord { - const NAME: &'static str = "very long strings"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let mut very_long_strings = Vec::new(); - for tuple in input - .split('\0') - .map(|s| s.trim_end_matches('\t')) - .filter(|s| !s.is_empty()) - { - match VeryLongString::parse(tuple) { - Ok(vls) => very_long_strings.push(vls), - Err(error) => warn(error), - } - } - Ok(VeryLongStringRecord(very_long_strings)) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabels { - pub var_name: UnencodedString, - pub width: u32, - - /// `(value, label)` pairs, where each value is `width` bytes. - pub labels: Vec<(UnencodedString, UnencodedString)>, -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(Vec); - -impl ExtensionRecord for LongStringValueLabelRecord { - const SUBTYPE: u32 = 21; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "long string value labels record"; - - fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let mut label_set = Vec::new(); - while !input.is_empty() { - let var_name = read_string(&mut input, endian)?; - let width: u32 = endian.parse(read_bytes(&mut input)?); - let n_labels: u32 = endian.parse(read_bytes(&mut input)?); - let mut labels = Vec::new(); - for _ in 0..n_labels { - let value = read_string(&mut input, endian)?; - let label = read_string(&mut input, endian)?; - labels.push((value, label)); - } - label_set.push(LongStringValueLabels { - var_name, - width, - labels, - }) - } - Ok(LongStringValueLabelRecord(label_set)) - } -} - pub struct LongStringMissingValues { /// Variable name. pub var_name: UnencodedString, @@ -1587,126 +1409,6 @@ impl ExtensionRecord for EncodingRecord { } } -pub struct Attribute { - pub name: String, - pub values: Vec, -} - -impl Attribute { - fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> { - let Some((name, mut input)) = input.split_once('(') else { - return Err(Error::TBD); - }; - let mut values = Vec::new(); - loop { - let Some((value, rest)) = input.split_once('\n') else { - return Err(Error::TBD); - }; - if let Some(stripped) = value - .strip_prefix('\'') - .and_then(|value| value.strip_suffix('\'')) - { - values.push(stripped.into()); - } else { - warn(Error::TBD); - values.push(value.into()); - } - if let Some(rest) = rest.strip_prefix(')') { - return Ok(( - Attribute { - name: name.into(), - values, - }, - rest, - )); - } - input = rest; - } - } -} - -pub struct AttributeSet(pub Vec); - -impl AttributeSet { - fn parse<'a>( - mut input: &'a str, - sentinel: Option, - warn: &impl Fn(Error), - ) -> Result<(AttributeSet, &'a str), Error> { - let mut attributes = Vec::new(); - let rest = loop { - match input.chars().next() { - None => break input, - c if c == sentinel => break &input[1..], - _ => { - let (attribute, rest) = Attribute::parse(input, &warn)?; - attributes.push(attribute); - input = rest; - } - } - }; - Ok((AttributeSet(attributes), rest)) - } -} - -pub struct FileAttributeRecord(AttributeSet); - -impl TextRecord for FileAttributeRecord { - const NAME: &'static str = "data file attributes"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let (set, rest) = AttributeSet::parse(input, None, &warn)?; - if !rest.is_empty() { - warn(Error::TBD); - } - Ok(FileAttributeRecord(set)) - } -} - -pub struct VarAttributeSet { - pub long_var_name: String, - pub attributes: AttributeSet, -} - -impl VarAttributeSet { - fn parse<'a>( - input: &'a str, - warn: &impl Fn(Error), - ) -> Result<(VarAttributeSet, &'a str), Error> { - let Some((long_var_name, rest)) = input.split_once(':') else { - return Err(Error::TBD); - }; - let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?; - Ok(( - VarAttributeSet { - long_var_name: long_var_name.into(), - attributes, - }, - rest, - )) - } -} - -pub struct VariableAttributeRecord(Vec); - -impl TextRecord for VariableAttributeRecord { - const NAME: &'static str = "variable attributes"; - fn parse(mut input: &str, warn: impl Fn(Error)) -> Result { - let mut var_attribute_sets = Vec::new(); - while !input.is_empty() { - match VarAttributeSet::parse(input, &warn) { - Ok((var_attribute, rest)) => { - var_attribute_sets.push(var_attribute); - input = rest; - } - Err(error) => { - warn(error); - break; - } - } - } - Ok(VariableAttributeRecord(var_attribute_sets)) - } -} #[derive(Clone, Debug)] pub struct NumberOfCasesRecord { @@ -1768,34 +1470,6 @@ pub struct Extension { pub data: Vec, } -/* -fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { - match extension { - /* Implemented record types. */ - ExtensionType::Integer => (4, 8), - ExtensionType::Float => (8, 3), - ExtensionType::VarSets => (1, 0), - ExtensionType::Mrsets => (1, 0), - ExtensionType::ProductInfo => (1, 0), - ExtensionType::Display => (4, 0), - ExtensionType::LongNames => (1, 0), - ExtensionType::LongStrings => (1, 0), - ExtensionType::Ncases => (8, 2), - ExtensionType::FileAttrs => (1, 0), - ExtensionType::VarAttrs => (1, 0), - ExtensionType::Mrsets2 => (1, 0), - ExtensionType::Encoding => (1, 0), - ExtensionType::LongLabels => (1, 0), - ExtensionType::LongMissing => (1, 0), - - /* Ignored record types. */ - ExtensionType::Date => (0, 0), - ExtensionType::DataEntry => (0, 0), - ExtensionType::Dataview => (0, 0), - } -} - */ - impl Extension { fn check_size(&self) -> Result<(), Error> { if let Some(expected_size) = E::SIZE { @@ -2043,3 +1717,47 @@ fn read_string(r: &mut R, endian: Endian) -> Result, +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(Vec); + +impl ExtensionRecord for LongStringValueLabelRecord { + const SUBTYPE: u32 = 21; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string value labels record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut label_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let width: u32 = endian.parse(read_bytes(&mut input)?); + let n_labels: u32 = endian.parse(read_bytes(&mut input)?); + let mut labels = Vec::new(); + for _ in 0..n_labels { + let value = read_string(&mut input, endian)?; + let label = read_string(&mut input, endian)?; + labels.push((value, label)); + } + label_set.push(LongStringValueLabels { + var_name, + width, + labels, + }) + } + Ok(LongStringValueLabelRecord(label_set)) + } +} + -- 2.30.2