From e3c885c421380a059bd1b0b5b0333fb04623a6d6 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Fri, 27 Dec 2024 11:26:36 -0800
Subject: [PATCH] very long strings

---
 rust/pspp/src/cooked.rs     | 36 ++++++++++++++++++++++++++++++++
 rust/pspp/src/dictionary.rs | 41 +++++++++++++++++++++++++++++++++++++
 rust/pspp/src/raw.rs        |  2 +-
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/rust/pspp/src/cooked.rs b/rust/pspp/src/cooked.rs
index 5954e2c88c..3dba3301f6 100644
--- a/rust/pspp/src/cooked.rs
+++ b/rust/pspp/src/cooked.rs
@@ -604,6 +604,42 @@ pub fn decode(
         }
     }
 
+    'outer: for record in headers // Collapse each very long string's segment variables into one variable.
+        .very_long_strings
+        .drain(..)
+        .flat_map(|record| record.0.into_iter()) // one item per entry, across every very-long-strings record
+    {
+        let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else {
+            warn(Error::TBD); // no variable is registered under this short name
+            continue;
+        };
+        let width = VarWidth::String(record.length);
+        let n_segments = width.n_segments();
+        if n_segments == 1 {
+            warn(Error::TBD); // the declared length needs only one segment, so there is nothing to merge
+            continue;
+        }
+        if index + n_segments > dictionary.variables.len() {
+            warn(Error::TBD); // fewer variables follow `index` than the declared length requires
+            continue;
+        }
+        let mut short_names = Vec::with_capacity(n_segments);
+        for i in 0..n_segments {
+            let alloc_width = width.segment_alloc_width(i);
+            let segment = &dictionary.variables[index + i];
+            short_names.push(segment.short_names[0].clone()); // keep the first short name of every segment
+            let segment_width = segment.width.as_string_width().unwrap_or(0); // 0 when no string width is reported
+            if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) { // compare after rounding both up to a multiple of 8
+                warn(Error::TBD); // this segment's width does not match what the declared length implies
+                continue 'outer; // abandon the whole entry; the dictionary has not been modified yet
+            }
+        }
+        dictionary.delete_vars(index + 1..index + n_segments); // range excludes `index`, which is updated below
+        let variable = dictionary.variables.get_index_mut2(index).unwrap(); // `index` was bounds-checked against len() above
+        variable.short_names = short_names;
+        variable.width = width; // the first segment now carries the full logical width
+    }
+
     if headers.long_names.is_empty() {
         // There are no long variable names. Use the short variable names,
         // converted to lowercase, as the long variable names.
diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs
index 1f4837972a..9edf063ec7 100644
--- a/rust/pspp/src/dictionary.rs
+++ b/rust/pspp/src/dictionary.rs
@@ -104,6 +104,47 @@ impl VarWidth {
     pub fn is_string(&self) -> bool {
         !self.is_numeric()
     }
+
+    pub fn is_very_long(&self) -> bool { // true only for string widths of 256 or more
+        match *self {
+            VarWidth::Numeric => false,
+            VarWidth::String(width) => width >= 256, // such widths span several segments (see `n_segments`)
+        }
+    }
+
+    /// Number of bytes per segment by which the amount of space for very long
+    /// string variables is allocated.
+    const EFFECTIVE_VLS_CHUNK: usize = 252;
+
+    /// Returns the number of "segments" used for writing case data for a
+    /// variable with this width. A segment is a physical variable in the
+    /// system file that represents some piece of a logical variable as seen by
+    /// a PSPP user. Only very long string variables have more than one
+    /// segment.
+    pub fn n_segments(&self) -> usize {
+        if self.is_very_long() {
+            self.as_string_width()
+                .unwrap() // is_very_long() is true only for `String`, so a string width is expected here
+                .div_ceil(Self::EFFECTIVE_VLS_CHUNK)
+        } else {
+            1
+        }
+    }
+
+    /// Returns the width to allocate to the segment with the given
+    /// `segment_idx` within this variable (see [`Self::n_segments`] for what a
+    /// segment is): 255 for every segment but the last, which gets the rest.
+    /// Debug builds assert that the width is very long and the index in range.
+    pub fn segment_alloc_width(&self, segment_idx: usize) -> usize {
+        debug_assert!(segment_idx < self.n_segments());
+        debug_assert!(self.is_very_long());
+
+        if segment_idx < self.n_segments() - 1 {
+            255 // every segment before the last
+        } else {
+            self.as_string_width().unwrap() - segment_idx * Self::EFFECTIVE_VLS_CHUNK // what is left after 252 bytes per earlier segment
+        }
+    }
 }
 
 impl From<VarWidth> for VarType {
diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs
index c9f80d0b62..9da9ea5a77 100644
--- a/rust/pspp/src/raw.rs
+++ b/rust/pspp/src/raw.rs
@@ -2260,7 +2260,7 @@ impl VeryLongString {
 }
 
 #[derive(Clone, Debug)]
-pub struct VeryLongStringsRecord(Vec<VeryLongString>);
+pub struct VeryLongStringsRecord(pub Vec<VeryLongString>);
 
 impl VeryLongStringsRecord {
     fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
-- 
2.30.2