fixed test
authorBen Pfaff <blp@cs.stanford.edu>
Mon, 16 Jun 2025 15:20:06 +0000 (08:20 -0700)
committerBen Pfaff <blp@cs.stanford.edu>
Mon, 16 Jun 2025 15:20:06 +0000 (08:20 -0700)
rust/pspp/src/dictionary.rs
rust/pspp/src/sys/cooked.rs
rust/pspp/src/sys/raw.rs
rust/pspp/src/sys/testdata/invalid_long_string_missing_values.expected
rust/pspp/src/sys/testdata/invalid_long_string_missing_values.sack
rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected

index e6154fb4e1bf8c8ac6725314251e1069c71dd6ba..876284d6b061d843530b46be62cadbeed38f3d60 100644 (file)
@@ -5,7 +5,7 @@ use std::{
     borrow::Cow,
     cmp::Ordering,
     collections::{BTreeMap, BTreeSet, HashMap, HashSet},
-    fmt::{Debug, Formatter, Result as FmtResult},
+    fmt::{Debug, Display, Formatter, Result as FmtResult},
     hash::Hash,
     ops::{Bound, RangeBounds, RangeInclusive},
     str::FromStr,
@@ -155,6 +155,10 @@ impl VarWidth {
             self.as_string_width().unwrap() - segment_idx * Self::EFFECTIVE_VLS_CHUNK
         }
     }
+
+    pub fn display_adjective(&self) -> VarWidthAdjective {
+        VarWidthAdjective(*self)
+    }
 }
 
 impl From<VarWidth> for VarType {
@@ -166,6 +170,17 @@ impl From<VarWidth> for VarType {
     }
 }
 
+pub struct VarWidthAdjective(VarWidth);
+
+impl Display for VarWidthAdjective {
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        match self.0 {
+            VarWidth::Numeric => write!(f, "numeric"),
+            VarWidth::String(width) => write!(f, "{width}-byte string"),
+        }
+    }
+}
+
 #[derive(Clone)]
 pub enum Datum<S = RawString> {
     Number(Option<f64>),
index d1ee09ebff3f10714072b3bc46d9927630998568..0121133528e30cae6a124c46459d82f10783b489 100644 (file)
@@ -192,6 +192,15 @@ pub enum Error {
     #[error("File header claims {expected} variable positions but {actual} were read from file.")]
     WrongVariablePositions { actual: usize, expected: usize },
 
+    #[error("Unknown variable name \"{name}\" in long string missing value record.")]
+    LongStringMissingValueUnknownVariable { name: Identifier },
+
+    #[error("Invalid long string missing value for {} variable {name}.", width.display_adjective())]
+    LongStringMissingValueBadWdith { name: Identifier, width: VarWidth },
+
+    #[error("Long string missing values record says variable {name} has {count} missing values, but only 1 to 3 missing values are allowed.")]
+    LongStringMissingValueInvalidCount { name: Identifier, count: usize },
+
     #[error("Details TBD (cooked)")]
     TBD,
 }
@@ -833,15 +842,31 @@ pub fn decode(
         }
     }
 
-    for record in headers
+    for mut record in headers
         .long_string_missing_values
         .drain(..)
         .flat_map(|record| record.0.into_iter())
     {
         let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
-            warn(dbg!(Error::TBD));
+            warn(Error::LongStringMissingValueUnknownVariable {
+                name: record.var_name.clone(),
+            });
             continue;
         };
+        if !variable.width.is_long_string() {
+            warn(Error::LongStringMissingValueBadWdith {
+                name: record.var_name.clone(),
+                width: variable.width,
+            });
+            continue;
+        }
+        if record.missing_values.len() > 3 {
+            warn(Error::LongStringMissingValueInvalidCount {
+                name: record.var_name.clone(),
+                count: record.missing_values.len(),
+            });
+            record.missing_values.truncate(3);
+        }
         let values = record
             .missing_values
             .into_iter()
@@ -851,12 +876,12 @@ pub fn decode(
                 Datum::String(value)
             })
             .collect::<Vec<_>>();
-        dbg!(&values);
         match MissingValues::new(values, None) {
             Ok(missing_values) => variable.missing_values = missing_values,
-            Err(MissingValuesError::TooMany) => warn(Error::TBD),
             Err(MissingValuesError::TooWide) => warn(Error::TBD),
-            Err(MissingValuesError::MixedTypes) => unreachable!(),
+            Err(MissingValuesError::TooMany) | Err(MissingValuesError::MixedTypes) => {
+                unreachable!()
+            }
         }
     }
 
index 7b8970a1af57b26e6c7bfa3496105e3296e15a80..2d10d651aee004892058dcd013187e5ce26693d3 100644 (file)
@@ -2511,35 +2511,47 @@ static LONG_STRING_MISSING_VALUE_RECORD: ExtensionRecord = ExtensionRecord {
 };
 
 impl LongStringMissingValueRecord<RawString> {
-    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+    fn parse(
+        ext: &Extension,
+        endian: Endian,
+        warn: &mut dyn FnMut(Warning),
+    ) -> Result<Record, Warning> {
         ext.check_size(&LONG_STRING_MISSING_VALUE_RECORD)?;
 
         let mut input = &ext.data[..];
         let mut missing_value_set = Vec::new();
         while !input.is_empty() {
             let var_name = read_string(&mut input, endian)?;
+            dbg!(&var_name);
             let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
             let value_len: u32 = endian.parse(read_bytes(&mut input)?);
             if value_len != 8 {
                 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
-                return Err(Warning::BadLongMissingValueLength {
+                warn(Warning::BadLongMissingValueLength {
                     record_offset: ext.offsets.start,
                     offset,
                     value_len,
                 });
+                read_vec(
+                    &mut input,
+                    dbg!(value_len as usize * n_missing_values as usize),
+                )?;
+                continue;
             }
             let mut missing_values = Vec::new();
             for i in 0..n_missing_values {
-                let value: [u8; 8] = read_bytes(&mut input)?;
-                let numeric_value: u64 = endian.parse(value);
-                let value = if i > 0 && numeric_value == 8 {
+                if i > 0 {
                     // Tolerate files written by old, buggy versions of PSPP
                     // where we believed that the value_length was repeated
                     // before each missing value.
-                    read_bytes(&mut input)?
-                } else {
-                    value
-                };
+                    let mut peek = input;
+                    let number: u32 = endian.parse(read_bytes(&mut peek)?);
+                    if number == 8 {
+                        input = peek;
+                    }
+                }
+
+                let value: [u8; 8] = read_bytes(&mut input)?;
                 missing_values.push(RawStrArray(value));
             }
             missing_value_set.push(LongStringMissingValues {
@@ -3059,7 +3071,7 @@ impl Extension {
             11 => VarDisplayRecord::parse(&extension, var_types, endian, warn),
             7 | 19 => MultipleResponseRecord::parse(&extension, endian),
             21 => LongStringValueLabelRecord::parse(&extension, endian),
-            22 => LongStringMissingValueRecord::parse(&extension, endian),
+            22 => LongStringMissingValueRecord::parse(&extension, endian, warn),
             20 => EncodingRecord::parse(&extension, endian),
             16 => NumberOfCasesRecord::parse(&extension, endian),
             5 => RawVariableSetRecord::parse(extension),
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..95e4106d27813911553fe4f6d75289d0ff27e89b 100644 (file)
@@ -0,0 +1,36 @@
+In long string missing values record starting at offset 0x238, value length at offset 0x2a8 is 12 instead of the expected 8.
+
+File header claims 8 variable positions but 9 were read from file.
+
+Long string missing values record says variable STR2 has 4 missing values, but only 1 to 3 missing values are allowed.
+
+Unknown variable name "Nonexistent" in long string missing value record.
+
+Invalid long string missing value for numeric variable NUM1.
+
+Invalid long string missing value for 7-byte string variable STR4.
+
+╭──────────────────────┬────────────────────────╮
+│       Created        │    01-JAN-2011 20:53:52│
+├──────────────────────┼────────────────────────┤
+│Writer Product        │PSPP synthetic test file│
+│       Version        │1.2.3                   │
+├──────────────────────┼────────────────────────┤
+│       Compression    │None                    │
+│       Number of Cases│                       1│
+╰──────────────────────┴────────────────────────╯
+
+╭─────────┬──────────────────────────────╮
+│Label    │PSPP synthetic test file: ôõöø│
+│Variables│                             5│
+╰─────────┴──────────────────────────────╯
+
+╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────────────────────────╮
+│    │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│          Missing Values          │
+├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────────────────────────┤
+│num1│       1│     │                 │Input│    8│Right    │F8.0        │F8.0        │                                  │
+│str1│       2│     │Nominal          │Input│    9│Left     │A9          │A9          │                                  │
+│str2│       3│     │Nominal          │Input│   10│Left     │A10         │A10         │"abcdefgh"; "ijklmnop"; "qrstuvwx"│
+│str3│       4│     │Nominal          │Input│   11│Left     │A11         │A11         │"ABCDEFGH"; "IJKLMNOP"            │
+│str4│       5│     │Nominal          │Input│    7│Left     │A7          │A7          │                                  │
+╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────────────────────────╯
index d2db3d807071777198f94eb1a125a40525f8312c..3b4ef337d5156eeea1a73b0dccc562c6c6c87d76 100644 (file)
@@ -1,7 +1,7 @@
 # File header.
 "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
 2; # Layout code
-7; # Nominal case size
+8; # Nominal case size
 0; # Not compressed
 0; # Not weighted
 1; # 1 case.
@@ -21,6 +21,8 @@ i8 0 *3;
 2; -1; 0; 0; 0; 0; s8 "";
 2; 11; 0; 0; 0x010b00 *2; s8 "STR3";
 2; -1; 0; 0; 0; 0; s8 "";
+2; 7; 0; 0; 0x010700 *2; s8 "STR4";
+2; -1; 0; 0; 0; 0; s8 "";
 
 # Machine integer info record.
 7; 3; 4; 8; 1; 2; 3; -1; 1; 1; ENDIAN; 1252;
@@ -49,6 +51,9 @@ COUNT("STR3"); i8 1; >>COUNT("abcdefghijkl")<<;
 # Buggy way that this was written in old PSPP, with a length
 # before each missing value instead of just once.
 COUNT("STR3"); i8 2; 8; "ABCDEFGH"; >>8<<; "IJKLMNOP";
+
+# Invalid 8-byte missing value for 7-byte string.
+COUNT("STR4"); i8 1; 8; "ABCDEFGH";
 );
 
 # Character encoding record.
index f7e5a27ad29a0126eddeafb97cd01e5942d8e833..a448646811114847db7108aea7f6c3a74805cd66 100644 (file)
 │Variables│                            21│
 ╰─────────┴──────────────────────────────╯
 
\95­â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¬â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\95®
-│                                │Position│              Label             │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│               Missing Values              │
\94\9câ\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¼â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94¤
-│num1                            │       1│                                │                 │Input│    8│Right    │F8.0        │F8.0        │                                           
-│Numeric variable 2's label (ùúû)│       2│Numeric variable 2's label (ùúû)│                 │Input│    8│Right    │F8.0        │F8.0        │                                           
-│num3                            │       3│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1                                          
-│Another numeric variable label  │       4│Another numeric variable label  │                 │Input│    8│Right    │F8.0        │F8.0        │1                                          
-│num5                            │       5│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1; 2                                       
-│num6                            │       6│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1; 2; 3                                    
-│num7                            │       7│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1 THRU 3                                   
-│num8                            │       8│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1 THRU 3; 5                                
-│num9                            │       9│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1 THRU HIGH; -5                            
-│numàèìñò                        │      10│                                │                 │Input│    8│Right    │F8.0        │F8.0        │LOW THRU 1; 5                              
-│str1                            │      11│                                │Nominal          │Input│    4│Left     │A4          │A4          │                                           
-│String variable 2's label       │      12│String variable 2's label       │Nominal          │Input│    4│Left     │A4          │A4          │                                           
-│str3                            │      13│                                │Nominal          │Input│    4│Left     │A4          │A4          │"MISS"                                     
-│Another string variable label   │      14│Another string variable label   │Nominal          │Input│    4│Left     │A4          │A4          │"OTHR"                                     
-│str5                            │      15│                                │Nominal          │Input│    4│Left     │A4          │A4          │"MISS"; "OTHR"                             
-│str6                            │      16│                                │Nominal          │Input│    4│Left     │A4          │A4          │"MISS"; "OTHR"; "MORE"                     
-│str7                            │      17│                                │Nominal          │Input│   11│Left     │A11         │A11         │"first8by"                                 
-│str8                            │      18│                                │Nominal          │Input│    9│Left     │A9          │A9          │"abcdefgh "                                │
-│str9                            │      19│                                │Nominal          │Input│   10│Left     │A10         │A10         │"abcdefgh  "; "01234567  "                 │
-│str10                           │      20│                                │Nominal          │Input│   11│Left     │A11         │A11         │"abcdefgh   "; "01234567   "; "0          "│
-│25-byte string                  │      21│25-byte string                  │Nominal          │Input│   25│Left     │A25         │A25         │                                           
\95°â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94´â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\94\80â\95¯
+╭────────────────────────────────┬────────┬────────────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────╮
+│                                │Position│              Label             │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│       Missing Values      │
+├────────────────────────────────┼────────┼────────────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────┤
+│num1                            │       1│                                │                 │Input│    8│Right    │F8.0        │F8.0        │                           │
+│Numeric variable 2's label (ùúû)│       2│Numeric variable 2's label (ùúû)│                 │Input│    8│Right    │F8.0        │F8.0        │                           │
+│num3                            │       3│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1                          │
+│Another numeric variable label  │       4│Another numeric variable label  │                 │Input│    8│Right    │F8.0        │F8.0        │1                          │
+│num5                            │       5│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1; 2                       │
+│num6                            │       6│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1; 2; 3                    │
+│num7                            │       7│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1 THRU 3                   │
+│num8                            │       8│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1 THRU 3; 5                │
+│num9                            │       9│                                │                 │Input│    8│Right    │F8.0        │F8.0        │1 THRU HIGH; -5            │
+│numàèìñò                        │      10│                                │                 │Input│    8│Right    │F8.0        │F8.0        │LOW THRU 1; 5              │
+│str1                            │      11│                                │Nominal          │Input│    4│Left     │A4          │A4          │                           │
+│String variable 2's label       │      12│String variable 2's label       │Nominal          │Input│    4│Left     │A4          │A4          │                           │
+│str3                            │      13│                                │Nominal          │Input│    4│Left     │A4          │A4          │"MISS"                     │
+│Another string variable label   │      14│Another string variable label   │Nominal          │Input│    4│Left     │A4          │A4          │"OTHR"                     │
+│str5                            │      15│                                │Nominal          │Input│    4│Left     │A4          │A4          │"MISS"; "OTHR"             │
+│str6                            │      16│                                │Nominal          │Input│    4│Left     │A4          │A4          │"MISS"; "OTHR"; "MORE"     │
+│str7                            │      17│                                │Nominal          │Input│   11│Left     │A11         │A11         │"first8by"                 │
+│str8                            │      18│                                │Nominal          │Input│    9│Left     │A9          │A9          │"abcdefgh"                 │
+│str9                            │      19│                                │Nominal          │Input│   10│Left     │A10         │A10         │"abcdefgh"; "01234567"     │
+│str10                           │      20│                                │Nominal          │Input│   11│Left     │A11         │A11         │"abcdefgh"; "01234567"; "0"│
+│25-byte string                  │      21│25-byte string                  │Nominal          │Input│   25│Left     │A25         │A25         │                           │
+╰────────────────────────────────┴────────┴────────────────────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴───────────────────────────╯