From: Ben Pfaff Date: Thu, 1 May 2025 23:29:35 +0000 (-0700) Subject: start work on testing sysfile reader X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d453cd422a600709f1e5a954c6a0c0850a67185d;p=pspp start work on testing sysfile reader --- diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index 2d17270b84..e0eedf9d3e 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -1,5 +1,5 @@ use std::{ - fmt::{Display, Formatter, Result as FmtResult}, + fmt::{Debug, Display, Formatter, Result as FmtResult}, ops::{Not, RangeInclusive}, str::{Chars, FromStr}, sync::LazyLock, @@ -464,7 +464,7 @@ impl TryFrom for UncheckedFormat { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Copy, Clone, PartialEq, Eq, Hash)] pub struct Format { type_: Type, w: Width, @@ -604,6 +604,12 @@ impl Format { } } +impl Debug for Format { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{self}") + } +} + impl Display for Format { fn fmt(&self, f: &mut Formatter) -> FmtResult { write!(f, "{}{}", self.type_, self.w)?; diff --git a/rust/pspp/src/sys/mod.rs b/rust/pspp/src/sys/mod.rs index 57a1d00e56..5b6ba905ae 100644 --- a/rust/pspp/src/sys/mod.rs +++ b/rust/pspp/src/sys/mod.rs @@ -2,3 +2,6 @@ pub mod cooked; pub mod encoding; pub mod raw; pub mod sack; + +#[cfg(test)] +mod test; diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 7a0af59e1d..3d6f548e21 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -1080,7 +1080,17 @@ where pub values: Vec>, /// Optional range of missing values. 
- pub range: Option<(Value, Value)>, + pub range: Option>, +} + +#[derive(Clone)] +pub enum MissingValueRange> +where + S: Debug, +{ + In { low: Value, high: Value }, + From { low: Value }, + To { high: Value }, } impl Debug for MissingValues @@ -1095,11 +1105,15 @@ where write!(f, "{value:?}")?; } - if let Some((low, high)) = &self.range { + if let Some(range) = &self.range { if !self.values.is_empty() { write!(f, ", ")?; } - write!(f, "{low:?} THRU {high:?}")?; + match range { + MissingValueRange::In { low, high } => write!(f, "{low:?} THRU {high:?}")?, + MissingValueRange::From { low } => write!(f, "{low:?} THRU HI")?, + MissingValueRange::To { high } => write!(f, "LOW THRU {high:?}")?, + } } if self.is_empty() { @@ -1148,9 +1162,6 @@ impl MissingValues { }; let mut values = Vec::with_capacity(individual_values); - for _ in 0..individual_values { - values.push(read_bytes::<8, _>(r)?); - } let range = if has_range { let low = read_bytes::<8, _>(r)?; let high = read_bytes::<8, _>(r)?; @@ -1158,6 +1169,9 @@ impl MissingValues { } else { None }; + for _ in 0..individual_values { + values.push(read_bytes::<8, _>(r)?); + } match VarWidth::try_from(width) { Ok(VarWidth::Numeric) => { @@ -1165,12 +1179,23 @@ impl MissingValues { .into_iter() .map(|v| Value::Number(endian.parse(v))) .collect(); - let range = range.map(|(low, high)| { - ( - Value::Number(endian.parse(low)), - Value::Number(endian.parse(high)), - ) - }); + + const LOWEST: f64 = f64::MIN.next_up(); + let range = + range.map( + |(low, high)| match (endian.parse(low), endian.parse(high)) { + (f64::MIN | LOWEST, high) => MissingValueRange::To { + high: Value::Number(Some(high)), + }, + (low, f64::MAX) => MissingValueRange::From { + low: Value::Number(Some(low)), + }, + (low, high) => MissingValueRange::In { + low: Value::Number(Some(low)), + high: Value::Number(Some(high)), + }, + }, + ); return Ok(Self { values, range }); } Ok(VarWidth::String(width)) if width <= 8 && range.is_none() => { diff --git 
a/rust/pspp/src/sys/sack.rs b/rust/pspp/src/sys/sack.rs index 103a9be847..b2ac013beb 100644 --- a/rust/pspp/src/sys/sack.rs +++ b/rust/pspp/src/sys/sack.rs @@ -57,9 +57,6 @@ pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Resul let mut symbol_table = HashMap::new(); let output = _sack(input, input_file_name, endian, &mut symbol_table)?; let output = if !symbol_table.is_empty() { - for (k, v) in symbol_table.iter() { - println!("{k} => {v:?}"); - } for (k, v) in symbol_table.iter() { if v.is_none() { Err(Error::new( @@ -171,7 +168,6 @@ fn parse_data_item( lexer.get()?; } Token::Label(name) => { - println!("define {name}"); let value = output.len() as u32; match symbol_table.entry(name.clone()) { Entry::Vacant(v) => { @@ -294,13 +290,11 @@ where T: Bounded + Display + TryFrom + Copy, Endian: ToBytes, { - println!("put_integers {:?}", lexer.token); let mut n = 0; while let Some(integer) = lexer.take_if(|t| match t { Token::Integer(integer) => Some(*integer), _ => None, })? { - println!("got integer {integer}"); let Ok(integer) = integer.try_into() else { Err(lexer.error(format!( "{integer} is not in the valid range [{},{}]", @@ -311,7 +305,6 @@ where output.extend_from_slice(&lexer.endian.to_bytes(integer)); n += 1; } - println!("put_integers {:?} {n}", lexer.token); if n == 0 { Err(lexer.error(format!("integer expected after '{name}'")))? 
} @@ -515,7 +508,6 @@ impl<'a> Lexer<'a> { }; self.input = rest; let repr = &start[..start.len() - rest.len()]; - println!("{token:?} {repr}"); Ok(Some((token, repr))) } } diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs new file mode 100644 index 0000000000..c5737b398c --- /dev/null +++ b/rust/pspp/src/sys/test.rs @@ -0,0 +1,147 @@ +use std::io::Cursor; + +use crate::{ + endian::Endian, + sys::{ + cooked::{decode, Headers}, + raw::{encoding_from_headers, Decoder, Reader, Record}, + sack::sack, + }, +}; + +#[test] +fn variable_labels_and_missing_values() { + let input = r#" +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; # Layout code +28; # Nominal case size +0; # Not compressed +0; # Not weighted +1; # 1 case. +100.0; # Bias. +"01 Jan 11"; "20:53:52"; +"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; +i8 0 *3; + +# Numeric variable, no label or missing values. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; + +# Numeric variable, variable label. +2; 0; 1; 0; 0x050800 *2; s8 "NUM2"; +32; "Numeric variable 2's label ("; i8 249; i8 250; i8 251; ")"; + +# Numeric variable, one missing value. +2; 0; 0; 1; 0x050800 *2; s8 "NUM3"; +1.0; + +# Numeric variable, variable label and missing value. +2; 0; 1; 1; 0x050800 *2; s8 "NUM4"; +30; "Another numeric variable label"; i8 0 * 2; +1.0; + +# Numeric variable, two missing values. +2; 0; 0; 2; 0x050800 *2; s8 "NUM5"; 1.0; 2.0; + +# Numeric variable, three missing values. +2; 0; 0; 3; 0x050800 *2; s8 "NUM6"; 1.0; 2.0; 3.0; + +# Numeric variable, range of missing values. +2; 0; 0; -2; 0x050800 *2; s8 "NUM7"; 1.0; 3.0; + +# Numeric variables, range of missing values plus discrete value. +2; 0; 0; -3; 0x050800 *2; s8 "NUM8"; 1.0; 3.0; 5.0; +2; 0; 0; -3; 0x050800 *2; s8 "NUM9"; 1.0; HIGHEST; -5.0; +2; 0; 0; -3; 0x050800 *2; "NUM"; i8 192; i8 200; i8 204; i8 209; i8 210; +LOWEST; 1.0; 5.0; + +# String variable, no label or missing values. 
+2; 4; 0; 0; 0x010400 *2; s8 "STR1"; + +# String variable, variable label. +2; 4; 1; 0; 0x010400 *2; s8 "STR2"; +25; "String variable 2's label"; i8 0 * 3; + +# String variable, one missing value. +2; 4; 0; 1; 0x010400 *2; s8 "STR3"; s8 "MISS"; + +# String variable, variable label and missing value. +2; 4; 1; 1; 0x010400 *2; s8 "STR4"; +29; "Another string variable label"; i8 0 * 3; +s8 "OTHR"; + +# String variable, two missing values. +2; 4; 0; 2; 0x010400 *2; s8 "STR5"; s8 "MISS"; s8 "OTHR"; + +# String variable, three missing values. +2; 4; 0; 3; 0x010400 *2; s8 "STR6"; s8 "MISS"; s8 "OTHR"; s8 "MORE"; + +# Long string variable, one missing value. +# (This is not how SPSS represents missing values for long strings--it +# uses a separate record as shown later below--but old versions of PSPP +# did use this representation so we continue supporting it for backward +# compatibility.) +2; 11; 0; 1; 0x010b00 *2; s8 "STR7"; "first8by"; +2; -1; 0; 0; 0; 0; s8 ""; + +# Long string variables that will have missing values added with a +# later record. +2; 9; 0; 0; 0x010900 *2; s8 "STR8"; +2; -1; 0; 0; 0; 0; s8 ""; +2; 10; 0; 0; 0x010a00 *2; s8 "STR9"; +2; -1; 0; 0; 0; 0; s8 ""; +2; 11; 0; 0; 0x010b00 *2; s8 "STR10"; +2; -1; 0; 0; 0; 0; s8 ""; + +# Long string variable, variable label. +2; 25; 1; 0; 0x011900 *2; s8 "STR11"; 14; "25-byte string"; i8 0 * 2; +( 2; -1; 0; 0; 0; 0; s8 ""; ) * 2; +# Variable label fields on continuation records have been spotted in system +# files created by "SPSS Power Macintosh Release 6.1". +2; -1; 1; 0; 0; 0; s8 ""; 20; "dummy variable label"; + +# Machine integer info record. +7; 3; 4; 8; 1; 2; 3; -1; 1; 1; ENDIAN; 1252; + +# Machine floating-point info record. +7; 4; 8; 3; SYSMIS; HIGHEST; LOWEST; + +# Long string variable missing values record. +7; 22; 1; COUNT ( +# One missing value for STR8. +COUNT("STR8"); i8 1; 8; "abcdefgh"; + +# Two missing values for STR9. 
+COUNT("STR9"); i8 2; 8; "abcdefgh"; "01234567"; + +# Three missing values for STR10. +COUNT("STR10"); i8 3; 8; "abcdefgh"; "01234567"; "0 "; +); + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# Dictionary termination record. +999; 0; + +# Data. +1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0; 8.0; 9.0; 10.0; +s8 "abcd"; s8 "efgh"; s8 "ijkl"; s8 "mnop"; s8 "qrst"; s8 "uvwx"; +s16 "yzABCDEFGHI"; s16 "JKLMNOPQR"; s16 "STUVWXYZ01"; +s16 "23456789abc"; s32 "defghijklmnopqstuvwxyzABC"; +"#; + let sysfile = sack(input, None, Endian::Big).unwrap(); + let cursor = Cursor::new(sysfile); + let reader = Reader::new(cursor, |warning| println!("{warning}")).unwrap(); + let headers: Vec = reader.collect::, _>>().unwrap(); + let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}")).unwrap(); + let decoder = Decoder::new(encoding, |e| eprintln!("{e}")); + let mut decoded_records = Vec::new(); + for header in headers { + decoded_records.push(header.decode(&decoder).unwrap()); + } + let headers = Headers::new(decoded_records, &|e| eprintln!("{e}")).unwrap(); + let (dictionary, metadata) = decode(headers, encoding, |e| eprintln!("{e}")).unwrap(); + println!("{dictionary:#?}"); + println!("{metadata:#?}"); +}