From addf213e7839ff8ab20e4336c598d1e3cfbfc890 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 5 Jul 2025 12:30:07 -0700 Subject: [PATCH] work --- rust/pspp/src/main.rs | 2 +- rust/pspp/src/sys/cooked.rs | 4 +-- rust/pspp/src/sys/raw.rs | 70 +++++++++++++++++++++++++++++++++---- rust/pspp/src/sys/test.rs | 3 +- 4 files changed, 69 insertions(+), 10 deletions(-) diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 4f557d9366..16138eef89 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -152,7 +152,7 @@ fn dissect( decoded_records.push(header.decode(&mut decoder)?); } let headers = Headers::new(decoded_records, &mut |e| eprintln!("{e}"))?; - let (dictionary, metadata) = decode(headers, encoding, |e| eprintln!("{e}"))?; + let (dictionary, metadata, _cases) = decode(headers, encoding, |e| eprintln!("{e}"))?; println!("{dictionary:#?}"); println!("{metadata:#?}"); } diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 7c4299b26e..86ad39edde 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -586,7 +586,7 @@ pub fn decode( mut headers: Headers, encoding: &'static Encoding, mut warn: impl FnMut(Error), -) -> Result<(Dictionary, Metadata), Error> { +) -> Result<(Dictionary, Metadata, Rc>), Error> { let mut dictionary = Dictionary::new(encoding); let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' ')); @@ -1093,7 +1093,7 @@ pub fn decode( } let metadata = Metadata::decode(&headers, warn); - Ok((dictionary, metadata)) + Ok((dictionary, metadata, headers.cases.take().unwrap())) } impl MultipleResponseSet { diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 50149caaea..d2681c6e2b 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -10,15 +10,17 @@ use encoding_rs::{mem::decode_latin1, Encoding}; use flate2::read::ZlibDecoder; use itertools::Itertools; use num::Integer; +use smallvec::SmallVec; use std::{ borrow::{Borrow, Cow}, cell::RefCell, collections::{BTreeMap, VecDeque}, fmt::{Debug, Display, Formatter, Result as FmtResult}, io::{Error as IoError, Read, Seek, SeekFrom}, + iter::repeat_n, mem::take, num::NonZeroU8, - ops::{Deref, Range}, + ops::{Deref, Not, Range}, rc::Rc, str::from_utf8, }; @@ -126,7 +128,7 @@ pub enum Warning { #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")] NoVarIndexes { offset: u64 }, - #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())] + #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", !var_type)] MixedVarTypes { offset: u64, var_type: VarType, @@ -703,8 +705,10 @@ pub enum VarType { String, } -impl VarType { - pub fn opposite(self) -> VarType { +impl Not for VarType { + type Output = Self; + + fn not(self) -> Self::Output { match self { Self::Numeric => Self::String, Self::String => Self::Numeric, @@ -712,6 +716,14 @@ impl VarType { } } +impl Not for &VarType { + type Output = VarType; + + fn not(self) -> Self::Output { + !*self + } +} + impl Display for VarType { fn fmt(&self, f: &mut Formatter) -> FmtResult { match self { @@ -1041,9 +1053,55 @@ where trait ReadSeek: Read + Seek {} impl ReadSeek for T where T: Read + Seek {} +pub struct Case(pub Vec); + +struct StringSegment { + data_bytes: usize, + padding_bytes: usize, +} + +fn segment_widths(width: usize) -> impl Iterator { + let n_segments = width.div_ceil(252); + repeat_n(255, n_segments - 1).chain(if n_segments > 1 { + std::iter::once(width - (n_segments - 1) * 252) + } else { + std::iter::once(width) + }) +} + +enum CaseVar { + Numeric, + String { + width: usize, + encoding: SmallVec<[StringSegment; 1]>, + }, +} + +impl CaseVar { + fn new(width: VarWidth) -> Self { + match width { + VarWidth::Numeric => Self::Numeric, + VarWidth::String(width) => { + let width = width as usize; + let mut segments = SmallVec::<[StringSegment; 1]>::new(); + let mut remaining = width; + for segment in segment_widths(width) { + let data_bytes = remaining.min(255); + let padding_bytes = data_bytes.next_multiple_of(8) - data_bytes; + segments.push(StringSegment { + data_bytes, + padding_bytes, + }); + remaining -= data_bytes; + } + } + } + } +} + pub struct Cases { reader: Box, - var_types: VarTypes, + vars: Vec, compression: Option, bias: f64, endian: Endian, @@ -1079,7 +1137,7 @@ impl Cases { } impl Iterator for Cases { - type Item = Result, Error>; + type Item = Result, Error>; fn next(&mut self) -> Option { if self.eof { diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index f0846a160f..8004947c68 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -544,7 +544,8 @@ fn test_sysfile(name: &str) { let mut errors = Vec::new(); let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap(); - let (dictionary, metadata) = decode(headers, encoding, |e| errors.push(e)).unwrap(); + let (dictionary, metadata, cases) = + decode(headers, encoding, |e| errors.push(e)).unwrap(); let (group, data) = metadata.to_pivot_rows(); let metadata_table = PivotTable::new([(Axis3::Y, Dimension::new(group))]) .with_data( -- 2.30.2