From: Ben Pfaff Date: Fri, 22 Dec 2023 20:14:13 +0000 (-0800) Subject: hey it compiles again X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=757ca4b8a7b88c1423a346161ef3e537a2db1b4c;p=pspp hey it compiles again --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 2cea330740..b3c0ff74bf 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -8,7 +8,7 @@ use crate::{ endian::Endian, format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, - raw::{self, UnencodedStr, VarType}, + raw::{self, RawStr, RawString, VarType}, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::{DecoderResult, Encoding}; @@ -212,9 +212,9 @@ pub struct Decoder { #[derive(Default)] struct Headers<'a> { - header: Option<&'a raw::HeaderRecord>, - variables: Vec<&'a raw::VariableRecord>, - value_labels: Vec<&'a raw::ValueLabelRecord>, + header: Option<&'a raw::HeaderRecord>, + variables: Vec<&'a raw::VariableRecord>>, + value_labels: Vec<&'a raw::ValueLabelRecord, RawString>>, document: Option<&'a raw::DocumentRecord>, integer_info: Option<&'a raw::IntegerInfoRecord>, float_info: Option<&'a raw::FloatInfoRecord>, @@ -519,8 +519,8 @@ pub trait Decode: Sized { fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self; } -impl Decode> for String { - fn decode(decoder: &Decoder, input: &UnencodedStr, warn: impl Fn(Error)) -> Self { +impl Decode> for String { + fn decode(decoder: &Decoder, input: &RawStr, warn: impl Fn(Error)) -> Self { decoder.decode_string(&input.0, &warn) } } @@ -540,7 +540,7 @@ fn trim_end_spaces(mut s: String) -> String { } impl TryDecode for HeaderRecord { - type Input = crate::raw::HeaderRecord; + type Input = crate::raw::HeaderRecord; fn try_decode( decoder: &mut Decoder, @@ -658,8 +658,12 @@ pub struct MissingValues { pub range: Option<(Value, Value)>, } -impl Decode for MissingValues { - fn decode(decoder: &Decoder, input: &raw::MissingValues, _warn: impl Fn(Error)) -> Self { +impl Decode>> for MissingValues { + fn decode( + decoder: &Decoder, + input: &raw::MissingValues>, + _warn: impl Fn(Error), + ) -> Self { MissingValues { values: input .values @@ -686,11 +690,11 @@ fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatErro } impl TryDecode for VariableRecord { - type Input = raw::VariableRecord; + type Input = raw::VariableRecord>; fn try_decode( decoder: &mut Decoder, - input: &crate::raw::VariableRecord, + input: &Self::Input, warn: impl Fn(Error), ) -> Result, Error> { let width = match input.width { @@ -836,11 +840,11 @@ impl WarnOnError for Result { #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Value { Number(Option>), - String(Box), + String(String), } impl Value { - pub fn decode(raw: &raw::Value, decoder: &Decoder) -> Self { + pub fn decode(raw: &raw::Value>, decoder: &Decoder) -> Self { match raw { raw::Value::Number(x) => Value::Number(x.map(|x| x.into())), raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), @@ -862,7 +866,7 @@ pub struct ValueLabelRecord { } impl TryDecode for ValueLabelRecord { - type Input = crate::raw::ValueLabelRecord; + type Input = crate::raw::ValueLabelRecord, RawString>; fn try_decode( decoder: &mut Decoder, input: &Self::Input, @@ -908,10 +912,10 @@ impl TryDecode for ValueLabelRecord { let labels = input .labels .iter() - .map(|(value, label)| { + .map(|raw::ValueLabel { value, label }| { let label = decoder.decode_string(&label.0, &warn); let value = Value::decode( - &raw::Value::from_raw(value, var_type, decoder.endian), + value, decoder, ); ValueLabel { value, label } diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 2eb96b2fec..af29cdf61e 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,18 +1,20 @@ use crate::endian::{Endian, Parse, ToBytes}; -use encoding_rs::mem::decode_latin1; +use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding}; use flate2::read::ZlibDecoder; use num::Integer; use std::{ borrow::Cow, + cell::RefCell, cmp::Ordering, collections::VecDeque, - fmt::{Debug, Formatter, Result as FmtResult}, + fmt::{Debug, Display, Formatter, Result as FmtResult}, io::{Error as IoError, Read, Seek, SeekFrom}, + iter::repeat, mem::take, ops::Range, rc::Rc, - str::from_utf8, cell::RefCell, + str::from_utf8, }; use thiserror::Error as ThisError; @@ -63,8 +65,25 @@ pub enum Error { #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")] ExpectedVarIndexRecord { offset: u64, rec_type: u32 }, - #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] - BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, + #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")] + TooManyVarIndexes { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")] + NoVarIndexes { offset: u64 }, + + #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())] + MixedVarTypes { + offset: u64, + var_type: VarType, + wrong_types: Vec, + }, + + #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")] + InvalidVarIndexes { + offset: u64, + max: usize, + invalid: Vec, + }, #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] ExtensionRecordTooLarge { @@ -129,15 +148,19 @@ pub enum Error { #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] BadEncodingName { offset: u64 }, + // XXX This is risky because `text` might be arbitarily long. + #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] + MalformedString { encoding: String, text: String }, + #[error("Details TBD")] TBD, } #[derive(Clone, Debug)] pub enum Record { - Header(HeaderRecord), - Variable(VariableRecord), - ValueLabel(ValueLabelRecord), + Header(HeaderRecord), + Variable(VariableRecord>), + ValueLabel(ValueLabelRecord, RawString>), Document(DocumentRecord), IntegerInfo(IntegerInfoRecord), FloatInfo(FloatInfoRecord), @@ -164,6 +187,7 @@ impl Record { fn read( reader: &mut R, endian: Endian, + var_types: &[VarType], warn: &Box, ) -> Result, Error> where @@ -172,7 +196,7 @@ impl Record { let rec_type: u32 = endian.parse(read_bytes(reader)?); match rec_type { 2 => Ok(Some(VariableRecord::read(reader, endian)?)), - 3 => Ok(Some(ValueLabelRecord::read(reader, endian)?)), + 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?), 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), 7 => Extension::read(reader, endian, warn), 999 => Ok(Some(Record::EndOfHeaders( @@ -203,7 +227,10 @@ trait Header { } #[derive(Clone)] -pub struct HeaderRecord { +pub struct HeaderRecord +where + S: Debug, +{ /// Offset in file. pub offsets: Range, @@ -212,7 +239,7 @@ pub struct HeaderRecord { /// Eye-catcher string, product name, in the file's encoding. Padded /// on the right with spaces. - pub eye_catcher: UnencodedStr<60>, + pub eye_catcher: S, /// Layout code, normally either 2 or 3. pub layout_code: u32, @@ -235,50 +262,59 @@ pub struct HeaderRecord { pub bias: f64, /// `dd mmm yy` in the file's encoding. - pub creation_date: UnencodedStr<9>, + pub creation_date: S, /// `HH:MM:SS` in the file's encoding. - pub creation_time: UnencodedStr<8>, + pub creation_time: S, /// File label, in the file's encoding. Padded on the right with spaces. - pub file_label: UnencodedStr<64>, + pub file_label: S, /// Endianness of the data in the file header. pub endian: Endian, } -impl HeaderRecord { - fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult { +impl HeaderRecord +where + S: Debug, +{ + fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult + where + T: Debug, + { writeln!(f, "{name:>17}: {:?}", value) } } -impl Debug for HeaderRecord { +impl Debug for HeaderRecord +where + S: Debug, +{ fn fmt(&self, f: &mut Formatter) -> FmtResult { writeln!(f, "File header record:")?; self.debug_field(f, "Magic", self.magic)?; - self.debug_field(f, "Product name", self.eye_catcher)?; + self.debug_field(f, "Product name", &self.eye_catcher)?; self.debug_field(f, "Layout code", self.layout_code)?; self.debug_field(f, "Nominal case size", self.nominal_case_size)?; self.debug_field(f, "Compression", self.compression)?; self.debug_field(f, "Weight index", self.weight_index)?; self.debug_field(f, "Number of cases", self.n_cases)?; self.debug_field(f, "Compression bias", self.bias)?; - self.debug_field(f, "Creation date", self.creation_date)?; - self.debug_field(f, "Creation time", self.creation_time)?; - self.debug_field(f, "File label", self.file_label)?; + self.debug_field(f, "Creation date", &self.creation_date)?; + self.debug_field(f, "Creation time", &self.creation_time)?; + self.debug_field(f, "File label", &self.file_label)?; self.debug_field(f, "Endianness", self.endian) } } -impl HeaderRecord { - fn read(r: &mut R) -> Result { +impl HeaderRecord { + fn read(r: &mut R) -> Result { let start = r.stream_position()?; let magic: [u8; 4] = read_bytes(r)?; let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; - let eye_catcher = UnencodedStr::<60>(read_bytes(r)?); + let eye_catcher = RawString(read_vec(r, 60)?); let layout_code: [u8; 4] = read_bytes(r)?; let endian = Endian::identify_u32(2, layout_code) .or_else(|| Endian::identify_u32(2, layout_code)) @@ -306,9 +342,9 @@ impl HeaderRecord { let bias: f64 = endian.parse(read_bytes(r)?); - let creation_date = UnencodedStr::<9>(read_bytes(r)?); - let creation_time = UnencodedStr::<8>(read_bytes(r)?); - let file_label = UnencodedStr::<64>(read_bytes(r)?); + let creation_date = RawString(read_vec(r, 9)?); + let creation_time = RawString(read_vec(r, 8)?); + let file_label = RawString(read_vec(r, 64)?); let _: [u8; 3] = read_bytes(r)?; Ok(HeaderRecord { @@ -327,9 +363,86 @@ impl HeaderRecord { endian, }) } + + fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord> { + let eye_catcher = decoder.decode(&self.eye_catcher); + let file_label = decoder.decode(&self.file_label); + let creation_date = decoder.decode(&self.creation_date); + let creation_time = decoder.decode(&self.creation_time); + HeaderRecord { + eye_catcher, + weight_index: self.weight_index, + n_cases: self.n_cases, + file_label, + offsets: self.offsets.clone(), + magic: self.magic, + layout_code: self.layout_code, + nominal_case_size: self.nominal_case_size, + compression: self.compression, + bias: self.bias, + creation_date, + creation_time, + endian: self.endian, + } + } +} + +struct Decoder { + encoding: &'static Encoding, + warn: Box, +} + +impl Decoder { + fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> { + let (output, malformed) = self.encoding.decode_without_bom_handling(&input.0); + if malformed { + (self.warn)(Error::MalformedString { + encoding: self.encoding.name().into(), + text: output.clone().into(), + }); + } + output + } + + /// Returns `input` decoded from `self.encoding` into UTF-8 such that + /// re-encoding the result back into `self.encoding` will have exactly the + /// same length in bytes. + /// + /// XXX warn about errors? + fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + if let (s, false) = self.encoding.decode_without_bom_handling(input) { + // This is the common case. Usually there will be no errors. + s + } else { + // Unusual case. Don't bother to optimize it much. + let mut decoder = self.encoding.new_decoder_without_bom_handling(); + let mut output = String::with_capacity( + decoder + .max_utf8_buffer_length_without_replacement(input.len()) + .unwrap(), + ); + let mut rest = input; + while !rest.is_empty() { + match decoder.decode_to_string_without_replacement(rest, &mut output, true) { + (DecoderResult::InputEmpty, _) => break, + (DecoderResult::OutputFull, _) => unreachable!(), + (DecoderResult::Malformed(a, b), consumed) => { + let skipped = a as usize + b as usize; + output.extend(repeat('?').take(skipped)); + rest = &rest[consumed..]; + } + } + } + assert_eq!(self.encoding.encode(&output).0.len(), input.len()); + output.into() + } + } } -impl Header for HeaderRecord { +impl Header for HeaderRecord +where + S: Debug, +{ fn offsets(&self) -> Range { self.offsets.clone() } @@ -396,26 +509,50 @@ impl VarType { _ => VarType::String, } } + + fn opposite(self) -> VarType { + match self { + Self::Numeric => Self::String, + Self::String => Self::Numeric, + } + } +} + +impl Display for VarType { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match self { + VarType::Numeric => write!(f, "numeric"), + VarType::String => write!(f, "string"), + } + } } #[derive(Copy, Clone)] -pub enum Value { +pub enum Value +where + S: Debug, +{ Number(Option), - String(UnencodedStr<8>), + String(S), } -impl Debug for Value { +type RawValue = Value>; + +impl Debug for Value +where + S: Debug, +{ fn fmt(&self, f: &mut Formatter) -> FmtResult { match self { Value::Number(Some(number)) => write!(f, "{number:?}"), Value::Number(None) => write!(f, "SYSMIS"), - Value::String(bytes) => write!(f, "{:?}", bytes), + Value::String(s) => write!(f, "{:?}", s), } } } -impl Value { - fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result { +impl RawValue { + fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result { Ok(Self::from_raw( &UntypedValue(read_bytes(r)?), var_type, @@ -423,9 +560,9 @@ impl Value { )) } - pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Value { + pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self { match var_type { - VarType::String => Value::String(UnencodedStr(raw.0)), + VarType::String => Value::String(RawStr(raw.0)), VarType::Numeric => { let number: f64 = endian.parse(raw.0); Value::Number((number != -f64::MAX).then_some(number)) @@ -437,7 +574,7 @@ impl Value { reader: &mut R, var_types: &[VarType], endian: Endian, - ) -> Result>, Error> { + ) -> Result>, Error> { let case_start = reader.stream_position()?; let mut values = Vec::with_capacity(var_types.len()); for (i, &var_type) in var_types.iter().enumerate() { @@ -464,7 +601,7 @@ impl Value { codes: &mut VecDeque, endian: Endian, bias: f64, - ) -> Result>, Error> { + ) -> Result>, Error> { let case_start = reader.stream_position()?; let mut values = Vec::with_capacity(var_types.len()); for (i, &var_type) in var_types.iter().enumerate() { @@ -487,9 +624,9 @@ impl Value { match code { 0 => (), 1..=251 => match var_type { - VarType::Numeric => break Value::Number(Some(code as f64 - bias)), + VarType::Numeric => break Self::Number(Some(code as f64 - bias)), VarType::String => { - break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias))) + break Self::String(RawStr(endian.to_bytes(code as f64 - bias))) } }, 252 => { @@ -504,10 +641,10 @@ impl Value { } } 253 => { - break Value::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian) + break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian) } 254 => match var_type { - VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC + VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC VarType::Numeric => { return Err(Error::CompressedStringExpected { offset: case_start, @@ -516,7 +653,7 @@ impl Value { } }, 255 => match var_type { - VarType::Numeric => break Value::Number(None), + VarType::Numeric => break Self::Number(None), VarType::String => { return Err(Error::CompressedNumberExpected { offset: case_start, @@ -530,6 +667,13 @@ impl Value { } Ok(Some(values)) } + + fn decode(&self, decoder: &Decoder) -> Value { + match self { + Self::Number(x) => Value::Number(*x), + Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), + } + } } struct ZlibDecodeMultiple @@ -595,7 +739,7 @@ where reader: Option, warn: Box, - header: HeaderRecord, + header: HeaderRecord, var_types: Vec, state: ReaderState, @@ -645,6 +789,7 @@ where match Record::read( self.reader.as_mut().unwrap(), self.header.endian, + self.var_types.as_slice(), &self.warn, ) { Ok(Some(record)) => break record, @@ -710,7 +855,7 @@ pub struct Cases { bias: f64, endian: Endian, codes: VecDeque, - eof: bool + eof: bool, } impl Debug for Cases { @@ -720,7 +865,7 @@ impl Debug for Cases { } impl Cases { - fn new(reader: R, var_types: Vec, header: &HeaderRecord) -> Self + fn new(reader: R, var_types: Vec, header: &HeaderRecord) -> Self where R: Read + Seek + 'static, { @@ -741,7 +886,7 @@ impl Cases { } impl Iterator for Cases { - type Item = Result, Error>; + type Item = Result, Error>; fn next(&mut self) -> Option { if self.eof { @@ -822,15 +967,21 @@ fn format_name(type_: u32) -> Cow<'static, str> { } #[derive(Clone)] -pub struct MissingValues { +pub struct MissingValues +where + S: Debug, +{ /// Individual missing values, up to 3 of them. - pub values: Vec, + pub values: Vec>, /// Optional range of missing values. - pub range: Option<(Value, Value)>, + pub range: Option<(Value, Value)>, } -impl Debug for MissingValues { +impl Debug for MissingValues +where + S: Debug, +{ fn fmt(&self, f: &mut Formatter) -> FmtResult { for (i, value) in self.values.iter().enumerate() { if i > 0 { @@ -839,7 +990,7 @@ impl Debug for MissingValues { write!(f, "{value:?}")?; } - if let Some((low, high)) = self.range { + if let Some((low, high)) = &self.range { if !self.values.is_empty() { write!(f, ", ")?; } @@ -854,18 +1005,23 @@ impl Debug for MissingValues { } } -impl MissingValues { +impl MissingValues +where + S: Debug, +{ fn is_empty(&self) -> bool { self.values.is_empty() && self.range.is_none() } +} +impl MissingValues> { fn read( r: &mut R, offset: u64, width: i32, code: i32, endian: Endian, - ) -> Result { + ) -> Result { let (n_values, has_range) = match (width, code) { (_, 0..=3) => (code, false), (0, -2) => (0, true), @@ -878,21 +1034,38 @@ impl MissingValues { let mut values = Vec::new(); for _ in 0..n_values { - values.push(Value::read(r, var_type, endian)?); + values.push(RawValue::read(r, var_type, endian)?); } let range = if has_range { - let low = Value::read(r, var_type, endian)?; - let high = Value::read(r, var_type, endian)?; + let low = RawValue::read(r, var_type, endian)?; + let high = RawValue::read(r, var_type, endian)?; Some((low, high)) } else { None }; - Ok(MissingValues { values, range }) + Ok(Self { values, range }) + } + fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues { + MissingValues { + values: self + .values + .iter() + .map(|value| value.decode(decoder)) + .collect(), + range: self + .range + .as_ref() + .map(|(low, high)| (low.decode(decoder), high.decode(decoder))), + } } } #[derive(Clone)] -pub struct VariableRecord { +pub struct VariableRecord +where + S: Debug, + V: Debug, +{ /// Range of offsets in file. pub offsets: Range, @@ -900,7 +1073,7 @@ pub struct VariableRecord { pub width: i32, /// Variable name, padded on the right with spaces. - pub name: UnencodedStr<8>, + pub name: S, /// Print format. pub print_format: Spec, @@ -909,13 +1082,17 @@ pub struct VariableRecord { pub write_format: Spec, /// Missing values. - pub missing_values: MissingValues, + pub missing_values: MissingValues, /// Optional variable label. - pub label: Option, + pub label: Option, } -impl Debug for VariableRecord { +impl Debug for VariableRecord +where + S: Debug, + V: Debug, +{ fn fmt(&self, f: &mut Formatter) -> FmtResult { writeln!( f, @@ -935,7 +1112,7 @@ impl Debug for VariableRecord { } } -impl VariableRecord { +impl VariableRecord> { fn read(r: &mut R, endian: Endian) -> Result { let start_offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); @@ -944,14 +1121,14 @@ impl VariableRecord { let missing_value_code: i32 = endian.parse(read_bytes(r)?); let print_format = Spec(endian.parse(read_bytes(r)?)); let write_format = Spec(endian.parse(read_bytes(r)?)); - let name = UnencodedStr::<8>(read_bytes(r)?); + let name = RawString(read_vec(r, 8)?); let label = match has_variable_label { 0 => None, 1 => { let len: u32 = endian.parse(read_bytes(r)?); let read_len = len.min(65535) as usize; - let label = UnencodedString(read_vec(r, read_len)?); + let label = RawString(read_vec(r, read_len)?); let padding_bytes = Integer::next_multiple_of(&len, &4) - len; let _ = read_vec(r, padding_bytes as usize)?; @@ -982,6 +1159,18 @@ impl VariableRecord { label, })) } + + fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord, String> { + VariableRecord { + offsets: self.offsets.clone(), + width: self.width, + name: decoder.decode(&self.name), + print_format: self.print_format, + write_format: self.write_format, + missing_values: self.missing_values.decode(decoder), + label: self.label.as_ref().map(|label| decoder.decode(label)), + } + } } #[derive(Copy, Clone)] @@ -1011,60 +1200,81 @@ impl Debug for UntypedValue { } #[derive(Clone)] -pub struct UnencodedString(pub Vec); +pub struct RawString(pub Vec); -impl From> for UnencodedString { +impl From> for RawString { fn from(source: Vec) -> Self { Self(source) } } -impl From<&[u8]> for UnencodedString { +impl From<&[u8]> for RawString { fn from(source: &[u8]) -> Self { Self(source.into()) } } -impl Debug for UnencodedString { +impl Debug for RawString { fn fmt(&self, f: &mut Formatter) -> FmtResult { write!(f, "{:?}", default_decode(self.0.as_slice())) } } #[derive(Copy, Clone)] -pub struct UnencodedStr(pub [u8; N]); +pub struct RawStr(pub [u8; N]); -impl From<[u8; N]> for UnencodedStr { +impl From<[u8; N]> for RawStr { fn from(source: [u8; N]) -> Self { Self(source) } } -impl Debug for UnencodedStr { +impl Debug for RawStr { fn fmt(&self, f: &mut Formatter) -> FmtResult { write!(f, "{:?}", default_decode(&self.0)) } } +#[derive(Clone, Debug)] +pub struct ValueLabel +where + V: Debug, + S: Debug, +{ + pub value: Value, + pub label: S, +} + #[derive(Clone)] -pub struct ValueLabelRecord { +pub struct ValueLabelRecord +where + V: Debug, + S: Debug, +{ /// Range of offsets in file. pub offsets: Range, /// The labels. - pub labels: Vec<(UntypedValue, UnencodedString)>, + pub labels: Vec>, /// The 1-based indexes of the variable indexes. pub dict_indexes: Vec, + + /// The types of the variables. + pub var_type: VarType, } -impl Debug for ValueLabelRecord { +impl Debug for ValueLabelRecord +where + V: Debug, + S: Debug, +{ fn fmt(&self, f: &mut Formatter) -> FmtResult { writeln!(f, "labels: ")?; - for (value, label) in self.labels.iter() { - writeln!(f, "{value:?}: {label:?}")?; + for label in self.labels.iter() { + writeln!(f, "{label:?}")?; } - write!(f, "apply to variables")?; + write!(f, "apply to {} variables", self.var_type)?; for dict_index in self.dict_indexes.iter() { write!(f, " #{dict_index}")?; } @@ -1072,20 +1282,35 @@ impl Debug for ValueLabelRecord { } } -impl Header for ValueLabelRecord { +impl Header for ValueLabelRecord +where + V: Debug, + S: Debug, +{ fn offsets(&self) -> Range { self.offsets.clone() } } -impl ValueLabelRecord { +impl ValueLabelRecord +where + V: Debug, + S: Debug, +{ /// Maximum number of value labels in a record. pub const MAX_LABELS: u32 = u32::MAX / 8; /// Maximum number of variable indexes in a record. pub const MAX_INDEXES: u32 = u32::MAX / 8; +} - fn read(r: &mut R, endian: Endian) -> Result { +impl ValueLabelRecord, RawString> { + fn read( + r: &mut R, + endian: Endian, + var_types: &[VarType], + warn: &Box, + ) -> Result, Error> { let label_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); if n > Self::MAX_LABELS { @@ -1105,7 +1330,7 @@ impl ValueLabelRecord { let mut label = read_vec(r, padded_len - 1)?; label.truncate(label_len); - labels.push((value, UnencodedString(label))); + labels.push((value, RawString(label))); } let index_offset = r.stream_position()?; @@ -1119,23 +1344,71 @@ impl ValueLabelRecord { let n: u32 = endian.parse(read_bytes(r)?); if n > Self::MAX_INDEXES { - return Err(Error::BadNumberOfVarIndexes { + return Err(Error::TooManyVarIndexes { offset: index_offset, n, max: Self::MAX_INDEXES, }); } + + let index_offset = r.stream_position()?; let mut dict_indexes = Vec::with_capacity(n as usize); + let mut invalid_indexes = Vec::new(); for _ in 0..n { - dict_indexes.push(endian.parse(read_bytes(r)?)); + let index: u32 = endian.parse(read_bytes(r)?); + if index == 0 || index as usize > var_types.len() { + dict_indexes.push(index); + } else { + invalid_indexes.push(index); + } + } + if !invalid_indexes.is_empty() { + warn(Error::InvalidVarIndexes { + offset: index_offset, + max: var_types.len(), + invalid: invalid_indexes, + }); + } + + let Some(&first_index) = dict_indexes.first() else { + warn(Error::NoVarIndexes { + offset: index_offset, + }); + return Ok(None); + }; + let var_type = var_types[first_index as usize - 1]; + let mut wrong_type_indexes = Vec::new(); + dict_indexes.retain(|&index| { + if var_types[index as usize - 1] != var_type { + wrong_type_indexes.push(index); + false + } else { + true + } + }); + if !wrong_type_indexes.is_empty() { + warn(Error::MixedVarTypes { + offset: index_offset, + var_type, + wrong_types: wrong_type_indexes, + }); } + let labels = labels + .into_iter() + .map(|(value, label)| ValueLabel { + value: Value::from_raw(&value, var_type, endian), + label, + }) + .collect(); + let end_offset = r.stream_position()?; - Ok(Record::ValueLabel(ValueLabelRecord { + Ok(Some(Record::ValueLabel(ValueLabelRecord { offsets: label_offset..end_offset, labels, dict_indexes, - })) + var_type, + }))) } } @@ -1147,7 +1420,7 @@ pub struct DocumentRecord { pub lines: Vec, } -pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>; +pub type DocumentLine = RawStr<{ DocumentRecord::LINE_LEN }>; impl DocumentRecord { /// Length of a line in a document. Document lines are fixed-length and @@ -1171,7 +1444,7 @@ impl DocumentRecord { } else { let mut lines = Vec::with_capacity(n); for _ in 0..n { - lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?)); + lines.push(RawStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?)); } let end_offset = r.stream_position()?; Ok(Record::Document(DocumentRecord { @@ -1269,7 +1542,7 @@ pub enum CategoryLabels { #[derive(Clone, Debug)] pub enum MultipleResponseType { MultipleDichotomy { - value: UnencodedString, + value: RawString, labels: CategoryLabels, }, MultipleCategory, @@ -1311,10 +1584,10 @@ impl MultipleResponseType { #[derive(Clone, Debug)] pub struct MultipleResponseSet { - pub name: UnencodedString, - pub label: UnencodedString, + pub name: RawString, + pub label: RawString, pub mr_type: MultipleResponseType, - pub short_names: Vec, + pub short_names: Vec, } impl MultipleResponseSet { @@ -1382,7 +1655,7 @@ impl ExtensionRecord for MultipleResponseRecord { } } -fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> { +fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> { let Some(space) = input.iter().position(|&b| b == b' ') else { return Err(Error::TBD); }; @@ -1425,10 +1698,10 @@ impl ExtensionRecord for VarDisplayRecord { #[derive(Clone, Debug)] pub struct LongStringMissingValues { /// Variable name. - pub var_name: UnencodedString, + pub var_name: RawString, /// Missing values. - pub missing_values: MissingValues, + pub missing_values: MissingValues>, } #[derive(Clone, Debug)] @@ -1469,7 +1742,7 @@ impl ExtensionRecord for LongStringMissingValueRecord { } else { value }; - values.push(Value::String(UnencodedStr(value))); + values.push(Value::String(RawStr(value))); } let missing_values = MissingValues { values, @@ -1480,9 +1753,9 @@ impl ExtensionRecord for LongStringMissingValueRecord { missing_values, }); } - Ok(Record::LongStringMissingValues(LongStringMissingValueRecord( - missing_value_set, - ))) + Ok(Record::LongStringMissingValues( + LongStringMissingValueRecord(missing_value_set), + )) } } @@ -1537,7 +1810,7 @@ pub struct TextRecord { pub offsets: Range, /// The text content of the record. - pub text: UnencodedString, + pub text: RawString, } impl From for TextRecord { @@ -1789,18 +2062,18 @@ fn read_vec(r: &mut R, n: usize) -> Result, IoError> { Ok(vec) } -fn read_string(r: &mut R, endian: Endian) -> Result { +fn read_string(r: &mut R, endian: Endian) -> Result { let length: u32 = endian.parse(read_bytes(r)?); Ok(read_vec(r, length as usize)?.into()) } #[derive(Clone, Debug)] pub struct LongStringValueLabels { - pub var_name: UnencodedString, + pub var_name: RawString, pub width: u32, /// `(value, label)` pairs, where each value is `width` bytes. - pub labels: Vec<(UnencodedString, UnencodedString)>, + pub labels: Vec<(RawString, RawString)>, } #[derive(Clone, Debug)]