From: Ben Pfaff Date: Sun, 3 Aug 2025 17:46:37 +0000 (-0700) Subject: segments X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=756614b486692b78021d3a55a96f26bb17ebef62;p=pspp segments --- diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 509ef7f819..1cd3cbfa27 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -188,17 +188,14 @@ impl VarWidth { /// string variables is allocated. pub const SEGMENT_SIZE: usize = 252; - /// Returns the number of "segments" used for writing case data for a + /// Returns an iterator over the "segments" used for writing case data for a /// variable with this width. A segment is a physical variable in the /// system file that represents some piece of a logical variable as seen by - /// a PSPP user. Only very long string variables have more than one - /// segment. - pub fn n_segments(&self) -> usize { - if self.is_very_long_string() { - self.as_string_width().unwrap().div_ceil(Self::SEGMENT_SIZE) - } else { - 1 - } + /// a PSPP user. Most variables have one segment whose width is their own + /// width, but very long string variables, with width greater than 255, have + /// multiple segments each with width 255 or less. + pub fn segments(&self) -> Segments { + Segments::new(*self) } /// Returns the number of 8-byte chunks used for writing case data for a @@ -218,10 +215,10 @@ impl VarWidth { /// the system file that represents some piece of a logical variable as seen /// by a PSPP user. pub fn segment_alloc_width(&self, segment_idx: usize) -> usize { - debug_assert!(segment_idx < self.n_segments()); + debug_assert!(segment_idx < self.segments().len()); debug_assert!(self.is_very_long_string()); - if segment_idx < self.n_segments() - 1 { + if segment_idx < self.segments().len() - 1 { 255 } else { self.as_string_width().unwrap() - segment_idx * Self::SEGMENT_SIZE @@ -233,6 +230,55 @@ impl VarWidth { } } +pub struct Segments { + width: VarWidth, + i: usize, + n: usize, +} +impl Segments { + pub fn new(width: VarWidth) -> Self { + Self { + width, + i: 0, + n: if width.is_very_long_string() { + width + .as_string_width() + .unwrap() + .div_ceil(VarWidth::SEGMENT_SIZE) + } else { + 1 + }, + } + } +} + +impl Iterator for Segments { + type Item = VarWidth; + + fn next(&mut self) -> Option { + let i = self.i; + if i >= self.n { + None + } else { + self.i += 1; + match self.width { + VarWidth::Numeric => Some(VarWidth::Numeric), + VarWidth::String(_) if i < self.n - 1 => Some(VarWidth::String(255)), + VarWidth::String(width) => Some(VarWidth::String( + width - (self.n as u16 - 1) * VarWidth::SEGMENT_SIZE as u16, + )), + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let n = self.n - self.i; + (n, Some(n)) + } +} + +impl ExactSizeIterator for Segments {} + impl From for VarType { fn from(source: VarWidth) -> Self { match source { @@ -736,7 +782,7 @@ impl Dictionary { .variables .iter() .map(|variable| { - let n = variable.width.n_segments(); + let n = variable.width.segments().len(); let mut names = SmallVec::with_capacity(n); if self.encoding.encode(variable.name.as_str()).0.len() <= 8 { used_names.insert(variable.name.clone()); @@ -1313,8 +1359,7 @@ pub struct Variable { /// Output format used on the `WRITE` command. pub write_format: Format, - /// Value labels, to associate a number (or a string) with a more meaningful - /// description, e.g. 1 -> Apple, 2 -> Banana, ... + /// Value labels. pub value_labels: ValueLabels, /// Variable label, an optional meaningful description for the variable @@ -1927,6 +1972,10 @@ impl DictIndexVariableSet { } } +/// Associates values of a variable with meaningful labels. +/// +/// For example, 1 => strongly disagree, 2 => disagree, 3 => neither agree nor +/// disagree, ... #[derive(Clone, Default, PartialEq, Eq, Serialize)] pub struct ValueLabels(pub HashMap, String>); @@ -1939,12 +1988,12 @@ impl ValueLabels { self.0.is_empty() } - pub fn get(&self, datum: &Datum) -> Option<&str> { - self.0.get(datum).map(|s| s.as_str()) + pub fn get(&self, value: &Datum) -> Option<&str> { + self.0.get(value).map(|s| s.as_str()) } - pub fn insert(&mut self, datum: Datum, label: String) -> Option { - self.0.insert(datum, label) + pub fn insert(&mut self, value: Datum, label: String) -> Option { + self.0.insert(value, label) } pub fn is_resizable(&self, width: VarWidth) -> bool { diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 8609590e6c..f541ad6fbc 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -1094,7 +1094,7 @@ impl Records { continue; }; let width = VarWidth::String(record.length); - let n_segments = width.n_segments(); + let n_segments = width.segments().len(); if n_segments == 1 { warn(Error::ShortVeryLongString { short_name: record.short_name.clone(), diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 7dce91626c..3aa3c0d706 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -616,9 +616,15 @@ impl Record { { let rec_type: u32 = endian.parse(read_bytes(reader)?); match rec_type { - 2 => Ok(Some(Record::Variable(VariableRecord::read(reader, endian, warn)?))), - 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?), - 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), + 2 => Ok(Some(Record::Variable(VariableRecord::read( + reader, endian, warn, + )?))), + 3 => Ok( + ValueLabelRecord::read(reader, endian, var_types, warn)?.map(Record::ValueLabel) + ), + 6 => Ok(Some(Record::Document(DocumentRecord::read( + reader, endian, + )?))), 7 => Extension::read(reader, endian, var_types, warn), 999 => Ok(Some(Record::EndOfHeaders( endian.parse(read_bytes(reader)?), @@ -1235,7 +1241,11 @@ where } }; match record { - Record::Variable(VariableRecord { width, .. }) => self.0.var_types.push(width), + Record::Variable(VariableRecord { width, .. }) => { + if let Ok(width) = width.try_into() { + self.0.var_types.push(width) + } + } Record::EndOfHeaders(_) => { self.0.state = if let Some(Compression::ZLib) = self.0.header.compression { ReaderState::ZlibHeader @@ -1664,7 +1674,7 @@ fn read_string(r: &mut R, endian: Endian) -> Result>, } @@ -1673,12 +1683,10 @@ impl VarTypes { Self::default() } - pub fn push(&mut self, width: RawWidth) { - if let Ok(var_width) = VarWidth::try_from(width) { - self.types.push(Some(var_width)); - for _ in 1..width.n_values().unwrap() { - self.types.push(None); - } + pub fn push(&mut self, width: VarWidth) { + self.types.push(Some(width)); + for _ in 1..width.n_chunks().unwrap() { + self.types.push(None); } } diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index 2a76362e67..5411e03eb5 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -696,12 +696,12 @@ where } impl ValueLabelRecord { - pub(super) fn read( + pub fn read( r: &mut R, endian: Endian, var_types: &VarTypes, warn: &mut dyn FnMut(Warning), - ) -> Result, Error> { + ) -> Result, Error> { let label_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); if n > Self::MAX_LABELS { @@ -807,12 +807,12 @@ impl ValueLabelRecord { .collect(); let end_offset = r.stream_position()?; - Ok(Some(Record::ValueLabel(ValueLabelRecord { + Ok(Some(ValueLabelRecord { offsets: label_offset..end_offset, labels, dict_indexes, var_type, - }))) + })) } /// Decodes a value label record using `decoder`. @@ -866,7 +866,7 @@ impl DocumentRecord { pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; /// Reads a document record from `r`. - pub fn read(r: &mut R, endian: Endian) -> Result + pub fn read(r: &mut R, endian: Endian) -> Result where R: Read + Seek, { @@ -889,7 +889,7 @@ impl DocumentRecord { read_bytes(r).map_err(|e| Error::new(Some(offsets.clone()), e.into()))?, )); } - Ok(Record::Document(DocumentRecord { offsets, lines })) + Ok(DocumentRecord { offsets, lines }) } } diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index bd88012442..5444b374d8 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -288,9 +288,9 @@ where .iter() .zip(self.short_names.iter()) { - let mut segment_widths = SegmentWidths::new(variable.width); + let mut segments = variable.width.segments(); let mut short_names = short_names.iter(); - let seg0_width = segment_widths.next().unwrap(); + let seg0_width = segments.next().unwrap(); let name0 = short_names.next().unwrap(); let record = RawVariableRecord { width: seg0_width.as_string_width().unwrap_or(0) as i32, @@ -338,7 +338,7 @@ where write_variable_continuation_records(&mut self.writer, seg0_width)?; // Write additional segments for very long string variables. - for (width, name) in segment_widths.zip(short_names) { + for (width, name) in segments.zip(short_names) { let format: RawFormat = Format::default_for_width(width).try_into().unwrap(); ( 2u32, @@ -399,7 +399,9 @@ where .or_default() .push(index as u32); } - index += SegmentWidths::new(variable.width) + index += variable + .width + .segments() .map(|w| w.n_chunks().unwrap()) .sum::(); } @@ -566,7 +568,7 @@ where Alignment::Right => 1, Alignment::Center => 2, }; - for (index, segment) in SegmentWidths::new(variable.width).enumerate() { + for (index, segment) in variable.width.segments().enumerate() { let display_width = match index { 0 => variable.display_width, _ => segment.default_display_width(), @@ -777,41 +779,6 @@ struct StringSegment { padding_bytes: usize, } -struct SegmentWidths { - width: VarWidth, - i: usize, - n: usize, -} -impl SegmentWidths { - pub fn new(width: VarWidth) -> Self { - Self { - width, - i: 0, - n: width.n_segments(), - } - } -} - -impl Iterator for SegmentWidths { - type Item = VarWidth; - - fn next(&mut self) -> Option { - let i = self.i; - if i >= self.n { - None - } else { - self.i += 1; - match self.width { - VarWidth::Numeric => Some(VarWidth::Numeric), - VarWidth::String(_) if i < self.n - 1 => Some(VarWidth::String(255)), - VarWidth::String(width) => Some(VarWidth::String( - width - (self.n as u16 - 1) * VarWidth::SEGMENT_SIZE as u16, - )), - } - } - } -} - enum CaseVar { Numeric, String(SmallVec<[StringSegment; 1]>), @@ -824,7 +791,7 @@ impl CaseVar { VarWidth::String(w) => { let mut encoding = SmallVec::<[StringSegment; 1]>::new(); let mut remaining = w as usize; - for segment in SegmentWidths::new(width) { + for segment in width.segments() { let segment = segment.as_string_width().unwrap().next_multiple_of(8); let data_bytes = remaining.min(segment).min(255); let padding_bytes = segment - data_bytes; @@ -1239,7 +1206,7 @@ mod tests { dictionary::{Dictionary, MissingValueRange, VarWidth, Variable}, identifier::Identifier, sys::{ - raw::records::{RawHeader, RawVariableRecord, VariableRecord}, + raw::records::{RawHeader, RawVariableRecord, ValueLabelRecord, VariableRecord}, write::DictionaryWriter, WriteOptions, }, @@ -1480,12 +1447,10 @@ mod tests { ) .write_variables() .unwrap(); - println!("{}", HexView::new(&raw)); let mut cursor = Cursor::new(&raw[4..]); let record = VariableRecord::read(&mut cursor, Endian::Little, &mut |_| panic!()).unwrap(); - dbg!(&record); if !width.is_long_string() { assert_eq!(&record.missing_values.values, &values); } else { @@ -1494,4 +1459,41 @@ mod tests { assert_eq!(&record.missing_values.range, &range); } } + + /// Checks that value labels are written correctly. + #[test] + fn variables_value_labels() { + let test_cases = [( + VarWidth::Numeric, + 1, + vec![(Datum::Number(Some(1.0)), "One")], + )]; + + for (width, n_chunks, value_labels) in test_cases { + let mut dictionary = Dictionary::new(UTF_8); + let mut variable = Variable::new(Identifier::new("var").unwrap(), width, UTF_8); + for (value, label) in &value_labels { + variable + .value_labels + .insert(value.clone(), (*label).into()) + .unwrap(); + } + dictionary.add_var(variable).unwrap(); + + let mut raw = Vec::new(); + DictionaryWriter::new( + &WriteOptions::reproducible(None), + &mut Cursor::new(&mut raw), + &dictionary, + ) + .write_value_labels() + .unwrap(); + println!("{}", HexView::new(&raw)); + + let mut cursor = Cursor::new(&raw[4..]); + //let record = + //ValueLabelRecord::read(&mut cursor, Endian::Little, &mut |_| panic!()).unwrap(); + //dbg!(&record); + } + } }