From c2f8e68aeff78226cffbfc8aa10fdad28a3b6087 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sat, 5 Aug 2023 22:13:22 -0700
Subject: [PATCH] work

---
 doc/dev/system-file-format.texi |   5 +-
 rust/src/lib.rs                 |   3 +
 rust/src/raw.rs                 | 285 ++++++++++++++++++++++++++++++++
 3 files changed, 289 insertions(+), 4 deletions(-)

diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi
index a12c353d96..9d408f8680 100644
--- a/doc/dev/system-file-format.texi
+++ b/doc/dev/system-file-format.texi
@@ -936,12 +936,9 @@ counted value for multiple dichotomy sets.  A string of length 0 means
 that the set does not have a label.  A string of length 0 is also
 written if LABELSOURCE=VARLABEL was specified.
 
-@item
-A space.
-
 @item
 The short names of the variables in the set, converted to lowercase,
-each separated from the previous by a single space.
+each preceded by a single space.
 
 Even though a multiple response set must have at least two variables,
 some system files contain multiple response sets with no variables or
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 8a4c7d0704..5253402699 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -101,4 +101,7 @@ pub enum Error {
 
     #[error("This file has corrupted metadata written by a buggy version of PSPP. To fix it, save a new copy of the file.")]
     BadLongMissingValueFormat,
+
+    #[error("Details TBD")]
+    TBD,
 }
diff --git a/rust/src/raw.rs b/rust/src/raw.rs
index 48ff3ee5fa..fccc33b41b 100644
--- a/rust/src/raw.rs
+++ b/rust/src/raw.rs
@@ -3,6 +3,7 @@
 use crate::Error;
 use flate2::read::ZlibDecoder;
 use num::Integer;
+use std::str::from_utf8;
 use std::{
     collections::VecDeque,
     io::{Error as IoError, Read, Seek, SeekFrom},
@@ -759,6 +760,18 @@ enum ExtensionType {
 }
  */
 
+/// A record that is parsed from text rather than from raw bytes.
+trait TextRecord
+where
+    Self: Sized,
+{
+    /// Name of the record type.
+    const NAME: &'static str;
+
+    /// Parses `input`, passing recoverable problems to `warn`.
+    fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
+}
+
 trait ExtensionRecord
 where
     Self: Sized,
@@ -827,6 +840,168 @@ impl ExtensionRecord for FloatInfo {
     }
 }
 
+/// Source of the category labels for a multiple dichotomy set.
+pub enum CategoryLabels {
+    VarLabels,
+    CountedValues,
+}
+
+/// Kind of a multiple response set.
+pub enum MultipleResponseType {
+    MultipleDichotomy {
+        value: Vec<u8>,
+        labels: CategoryLabels,
+    },
+    MultipleCategory,
+}
+
+/// One multiple response set parsed from a multiple response set record.
+pub struct MultipleResponseSet {
+    pub name: Vec<u8>,
+    pub label: Vec<u8>,
+    pub mr_type: MultipleResponseType,
+    pub vars: Vec<Vec<u8>>,
+}
+
+impl MultipleResponseSet {
+    /// Parses one set from the start of `input`, returning the set and the
+    /// unparsed remainder of `input`.
+    ///
+    /// The expected layout is `NAME=TYPE LABEL VARS...\n`, where `TYPE` is `C`,
+    /// `D<counted value>`, or `E 1 <counted value>` / `E 11 <counted value>`,
+    /// `LABEL` is a counted string, and each variable name is preceded by a
+    /// single space.
+    fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
+        let Some(equals) = input.iter().position(|&b| b == b'=') else {
+            return Err(Error::TBD);
+        };
+        let (name, input) = input.split_at(equals);
+        // `split_at` leaves the `=` at the start of `input`, so step over it
+        // before examining the type code.
+        let input = &input[1..];
+        let (mr_type, input) = match input.first() {
+            Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
+            Some(b'D') => {
+                let (value, input) = parse_counted_string(&input[1..])?;
+                (
+                    MultipleResponseType::MultipleDichotomy {
+                        value: value.into(),
+                        labels: CategoryLabels::VarLabels,
+                    },
+                    input,
+                )
+            }
+            Some(b'E') => {
+                // Match the separating spaces as part of the flag so that the
+                // leading space is not consumed twice.
+                let input = &input[1..];
+                let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
+                    (CategoryLabels::CountedValues, rest)
+                } else if let Some(rest) = input.strip_prefix(b" 11 ") {
+                    (CategoryLabels::VarLabels, rest)
+                } else {
+                    return Err(Error::TBD);
+                };
+                let (value, input) = parse_counted_string(input)?;
+                (
+                    MultipleResponseType::MultipleDichotomy {
+                        value: value.into(),
+                        labels,
+                    },
+                    input,
+                )
+            }
+            _ => return Err(Error::TBD),
+        };
+        let Some(b' ') = input.first() else {
+            return Err(Error::TBD);
+        };
+        let (label, mut input) = parse_counted_string(&input[1..])?;
+        let mut vars = Vec::new();
+        while input.first() == Some(&b' ') {
+            input = &input[1..];
+            let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
+                return Err(Error::TBD);
+            };
+            // Consecutive spaces produce empty names, which are skipped.
+            if length > 0 {
+                vars.push(input[..length].into());
+            }
+            input = &input[length..];
+        }
+        if input.first() != Some(&b'\n') {
+            return Err(Error::TBD);
+        }
+        // Skip the terminating new-line along with any blank lines after it.
+        while input.first() == Some(&b'\n') {
+            input = &input[1..];
+        }
+        Ok((
+            MultipleResponseSet {
+                name: name.into(),
+                label: label.into(),
+                mr_type,
+                vars,
+            },
+            input,
+        ))
+    }
+}
+
+/// All of the multiple response sets in a multiple response set record.
+pub struct MultipleResponseSets(Vec<MultipleResponseSet>);
+
+impl ExtensionRecord for MultipleResponseSets {
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "multiple response set record";
+
+    fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        let mut sets = Vec::new();
+        while !input.is_empty() {
+            let (set, rest) = MultipleResponseSet::parse(input)?;
+            sets.push(set);
+            input = rest;
+        }
+        Ok(MultipleResponseSets(sets))
+    }
+}
+
+/// Parses a counted string, `<decimal length><space><length bytes>`, from the
+/// start of `input`, returning the string and the rest of `input`.
+fn parse_counted_string(input: &[u8]) -> Result<(&[u8], &[u8]), Error> {
+    let Some(space) = input.iter().position(|&b| b == b' ') else {
+        return Err(Error::TBD);
+    };
+    let Ok(length) = from_utf8(&input[..space]) else {
+        return Err(Error::TBD);
+    };
+    let Ok(length): Result<usize, _> = length.parse() else {
+        return Err(Error::TBD);
+    };
+
+    let input = &input[space + 1..];
+    if input.len() < length {
+        return Err(Error::TBD);
+    }
+
+    let (string, rest) = input.split_at(length);
+    Ok((string, rest))
+}
+
+/// Text of an extra product info record, kept verbatim.
+pub struct ExtraProductInfo(String);
+
+impl TextRecord for ExtraProductInfo {
+    const NAME: &'static str = "extra product info";
+    fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        Ok(ExtraProductInfo(input.into()))
+    }
+}
+
 pub struct VarDisplayRecord(Vec<u32>);
 
 impl ExtensionRecord for VarDisplayRecord {
@@ -845,6 +1020,116 @@ impl ExtensionRecord for VarDisplayRecord {
     }
 }
 
+/// A named set of variables.
+pub struct VariableSet {
+    pub name: String,
+    pub vars: Vec<String>,
+}
+
+impl VariableSet {
+    /// Parses one `NAME=VAR VAR...` line, where the variable names are
+    /// separated by ASCII white space.
+    fn parse(input: &str) -> Result<Self, Error> {
+        let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
+        let vars = input.split_ascii_whitespace().map(String::from).collect();
+        Ok(VariableSet {
+            name: name.into(),
+            vars,
+        })
+    }
+}
+
+/// The variable sets in a variable set record, one per line of text.
+pub struct VariableSetRecord(Vec<VariableSet>);
+
+impl TextRecord for VariableSetRecord {
+    const NAME: &'static str = "variable set";
+    fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+        let mut sets = Vec::new();
+        for line in input.lines() {
+            // A malformed line costs only that line, not the whole record.
+            match VariableSet::parse(line) {
+                Ok(set) => sets.push(set),
+                Err(error) => warn(error),
+            }
+        }
+        Ok(VariableSetRecord(sets))
+    }
+}
+
+/// Pairing of a variable's short name with its long name.
+pub struct LongVariableName {
+    pub short_name: String,
+    pub long_name: String,
+}
+
+/// The `SHORT=LONG` pairs in a long variable names record.
+pub struct LongVariableNameRecord(Vec<LongVariableName>);
+
+impl TextRecord for LongVariableNameRecord {
+    const NAME: &'static str = "long variable names";
+    fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+        let mut names = Vec::new();
+        // Pairs are separated by tabs; empty fields are ignored.
+        for pair in input.split('\t').filter(|s| !s.is_empty()) {
+            if let Some((short_name, long_name)) = pair.split_once('=') {
+                let name = LongVariableName {
+                    short_name: short_name.into(),
+                    long_name: long_name.into(),
+                };
+                names.push(name);
+            } else {
+                warn(Error::TBD)
+            }
+        }
+        Ok(LongVariableNameRecord(names))
+    }
+}
+
+/// Short name and length from one tuple of a very long strings record.
+pub struct VeryLongString {
+    pub short_name: String,
+    pub length: usize,
+}
+
+impl VeryLongString {
+    /// Parses one `NAME=LENGTH` tuple.
+    fn parse(input: &str) -> Result<Self, Error> {
+        let Some((short_name, length)) = input.split_once('=') else {
+            return Err(Error::TBD);
+        };
+        let length: usize = length.parse().map_err(|_| Error::TBD)?;
+        Ok(VeryLongString {
+            short_name: short_name.into(),
+            length,
+        })
+    }
+}
+
+/// The tuples in a very long strings record.
+pub struct VeryLongStringRecord(Vec<VeryLongString>);
+
+impl TextRecord for VeryLongStringRecord {
+    const NAME: &'static str = "very long strings";
+    fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+        let mut very_long_strings = Vec::new();
+        // Tuples are delimited by NUL followed by a tab, so after splitting
+        // on NUL the tab sits at the *start* of the following piece.  Trim
+        // tabs from both ends so that it never leaks into a short name.
+        for tuple in input
+            .split('\0')
+            .map(|s| s.trim_matches('\t'))
+            .filter(|s| !s.is_empty())
+        {
+            match VeryLongString::parse(tuple) {
+                Ok(vls) => very_long_strings.push(vls),
+                Err(error) => warn(error),
+            }
+        }
+        Ok(VeryLongStringRecord(very_long_strings))
+    }
+}
+
 pub struct LongStringValueLabels {
     pub var_name: Vec<u8>,
     pub width: u32,
-- 
2.30.2