From 0705cf7eb7ded6388ef4ab33ec38cbe0c7d4c52c Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 3 Aug 2025 15:51:57 -0700 Subject: [PATCH] test writing value labels --- rust/pspp/src/dictionary.rs | 7 +- rust/pspp/src/sys/write.rs | 125 ++++++++++++++++++++++++++---------- 2 files changed, 95 insertions(+), 37 deletions(-) diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 1cd3cbfa27..a07db8dd39 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -199,9 +199,10 @@ impl VarWidth { } /// Returns the number of 8-byte chunks used for writing case data for a - /// variable with this width. This concept does not apply to very long - /// string variables, which are divided into [multiple - /// segments](Self::n_segments) (which in turn are divided into chunks). + /// variable with this width. Very long string variables (wider than 255 + /// bytes) cannot directly be divided into chunks: they must first be + /// divided into multiple [segments](Self::segments), which can then be + /// divided into chunks. pub fn n_chunks(&self) -> Option { match *self { VarWidth::Numeric => Some(1), diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index 5444b374d8..cb05364e3c 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -410,14 +410,22 @@ where // Label record. (3u32, value_labels.0.len() as u32).write_le(self.writer)?; for (datum, label) in &value_labels.0 { + let datum_padding = datum.width().as_string_width().map_or(0, |width| 8 - width); let label = &*self.dictionary.encoding().encode(&label).0; let label = if label.len() > 255 { &label[..255] } else { label }; - let padding = (1 + label.len()).next_multiple_of(8) - (1 + label.len()); - (datum, label.len() as u8, label, Zeros(padding)).write_le(self.writer)?; + let label_padding = (1 + label.len()).next_multiple_of(8) - (1 + label.len()); + ( + datum, + Zeros(datum_padding), + label.len() as u8, + label, + Zeros(label_padding), + ) + .write_le(self.writer)?; } // Variable record. @@ -610,7 +618,7 @@ where let mut cursor = Cursor::new(&mut body); for variable in &self.dictionary.variables { if variable.value_labels.is_empty() || !variable.width.is_long_string() { - break; + continue; } let name = self.dictionary.encoding().encode(&variable.name).0; ( @@ -1198,7 +1206,6 @@ mod tests { use binrw::{BinRead, Endian}; use encoding_rs::UTF_8; - use hexplay::HexView; use itertools::Itertools; use crate::{ @@ -1206,9 +1213,9 @@ mod tests { dictionary::{Dictionary, MissingValueRange, VarWidth, Variable}, identifier::Identifier, sys::{ - raw::records::{RawHeader, RawVariableRecord, ValueLabelRecord, VariableRecord}, + raw::records::{RawHeader, RawVariableRecord, VariableRecord}, write::DictionaryWriter, - WriteOptions, + ReadOptions, WriteOptions, }, }; @@ -1463,37 +1470,87 @@ mod tests { /// Checks that value labels are written correctly. #[test] fn variables_value_labels() { - let test_cases = [( - VarWidth::Numeric, - 1, - vec![(Datum::Number(Some(1.0)), "One")], - )]; + let variables = [ + (VarWidth::Numeric, vec![(Datum::Number(Some(1.0)), "One")]), + ( + VarWidth::Numeric, + vec![ + (Datum::Number(Some(1.0)), "One"), + (Datum::Number(Some(2.0)), "Two"), + ], + ), + ( + VarWidth::Numeric, + vec![ + (Datum::Number(Some(1.0)), "One"), + (Datum::Number(Some(2.0)), "Two"), + (Datum::Number(Some(3.0)), "Three"), + ], + ), + ( + VarWidth::String(4), + vec![(Datum::String(ByteString::from("abcd")), "One")], + ), + ( + VarWidth::String(8), + vec![( + Datum::String(ByteString::from("abcdefgh")), + "Longer variable label", + )], + ), + ( + VarWidth::String(9), + vec![( + Datum::String(ByteString::from("abcdefghi")), + "Variable label for 9-byte value", + )], + ), + ( + VarWidth::String(300), + vec![( + Datum::String(ByteString::from(vec![b'x'; 300])), + "Variable label for 300-byte value", + )], + ), + ]; - for (width, n_chunks, value_labels) in test_cases { + for test_case in variables.iter().combinations_with_replacement(3) { let mut dictionary = Dictionary::new(UTF_8); - let mut variable = Variable::new(Identifier::new("var").unwrap(), width, UTF_8); - for (value, label) in &value_labels { - variable - .value_labels - .insert(value.clone(), (*label).into()) - .unwrap(); + for (index, (width, value_labels)) in test_case.iter().enumerate() { + let mut variable = Variable::new( + Identifier::new(format!("var{index}")).unwrap(), + *width, + UTF_8, + ); + for (value, label) in value_labels { + assert_eq!( + variable.value_labels.insert(value.clone(), (*label).into()), + None + ); + } + dictionary.add_var(variable).unwrap(); + } + dbg!(&dictionary); + + let raw = WriteOptions::new() + .write_writer(&dictionary, Cursor::new(Vec::new())) + .unwrap() + .finish() + .unwrap() + .unwrap() + .into_inner(); + let dictionary2 = ReadOptions::new(|_| panic!()) + .open_reader(Cursor::new(raw)) + .unwrap() + .dictionary; + + for (expected, actual) in dictionary + .variables + .iter() + .zip_eq(dictionary2.variables.iter()) + { + assert_eq!(&expected.value_labels, &actual.value_labels); } - dictionary.add_var(variable).unwrap(); - - let mut raw = Vec::new(); - DictionaryWriter::new( - &WriteOptions::reproducible(None), - &mut Cursor::new(&mut raw), - &dictionary, - ) - .write_value_labels() - .unwrap(); - println!("{}", HexView::new(&raw)); - - let mut cursor = Cursor::new(&raw[4..]); - //let record = - //ValueLabelRecord::read(&mut cursor, Endian::Little, &mut |_| panic!()).unwrap(); - //dbg!(&record); } } } -- 2.30.2