From: Ben Pfaff Date: Sat, 19 Jul 2025 20:30:22 +0000 (-0700) Subject: convert to sav works! X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=529a934ce408508dfe49ab1a8792f62d843e9a70;p=pspp convert to sav works! --- diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index b6fa22bfa5..d3fa05d4d7 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -116,7 +116,7 @@ impl From<&[u8]> for RawString { impl Debug for RawString { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "{:?}", *self) + ::fmt(&*self, f) } } @@ -193,14 +193,14 @@ impl RawStr { pub struct DisplayRawString<'a>(Cow<'a, str>); impl<'a> Display for DisplayRawString<'a> { - // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 - // (actually bytes interpreted as Unicode code points). fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", &self.0) } } impl Debug for RawStr { + // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 + // (actually bytes interpreted as Unicode code points). fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { let s = from_utf8(&self.0).map_or_else(|_| decode_latin1(&self.0), Cow::from); write!(f, "{s:?}") diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index ee5b15f0e7..977f166e5d 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -1087,7 +1087,7 @@ impl From for i32 { } } -#[derive(Clone, Debug, Default, PartialEq, Eq)] +#[derive(Clone, Default, PartialEq, Eq)] pub struct Attributes(pub BTreeMap>); impl Attributes { @@ -1126,6 +1126,12 @@ impl Attributes { } } +impl Debug for Attributes { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + self.0.fmt(f) + } +} + #[derive(Clone, Debug, ThisError, PartialEq, Eq)] pub enum InvalidRole { #[error("Unknown role {0:?}.")] @@ -1414,7 +1420,7 @@ impl VariableSet { } } -#[derive(Clone, Debug, Default, PartialEq, Eq)] +#[derive(Clone, Default, PartialEq, Eq)] pub struct ValueLabels(pub HashMap); impl ValueLabels { @@ -1450,6 +1456,12 @@ impl ValueLabels { } } +impl Debug for ValueLabels { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + self.0.fmt(f) + } +} + impl Hash for ValueLabels { fn hash(&self, state: &mut H) { let mut hash = 0; diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index f3f885d085..458dd01839 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -14,13 +14,14 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, bail, Error as AnyError, Result}; use clap::{Args, Parser, Subcommand, ValueEnum}; use encoding_rs::Encoding; use pspp::{ crypto::EncryptedFile, sys::{ - raw::{infer_encoding, Decoder, Magic, Reader, Record}, + self, + raw::{infer_encoding, records::Compression, Decoder, Magic, Reader, Record}, ReaderOptions, Records, }, }; @@ -47,6 +48,27 @@ enum OutputFormat { /// Comma-separated values using each variable's print format (variable /// names are written as the first line) Csv, + + /// System file + Sys, +} + +impl TryFrom<&Path> for OutputFormat { + type Error = AnyError; + + fn try_from(value: &Path) -> std::result::Result { + let extension = value.extension().unwrap_or_default(); + if extension.eq_ignore_ascii_case("csv") || extension.eq_ignore_ascii_case("txt") { + Ok(OutputFormat::Csv) + } else if extension.eq_ignore_ascii_case("sav") || extension.eq_ignore_ascii_case("sys") { + Ok(OutputFormat::Sys) + } else { + Err(anyhow!( + "Unknown output file extension '{}'", + extension.display() + )) + } + } } /// Convert SPSS data files into other formats. @@ -75,10 +97,13 @@ struct Convert { /// Maximum number of cases to print. #[arg(short = 'c', long = "cases")] - max_cases: Option, + max_cases: Option, #[command(flatten, next_help_heading = "Options for CSV output")] csv_options: CsvOptions, + + #[command(flatten, next_help_heading = "Options for system file output")] + sys_options: SysOptions, } #[derive(Args, Clone, Debug)] @@ -88,6 +113,13 @@ struct CsvOptions { no_var_names: bool, } +#[derive(Args, Clone, Debug)] +struct SysOptions { + /// How to compress data in the system file. + #[arg(long)] + compression: Option, +} + impl Convert { fn run(self) -> Result<()> { fn warn(warning: anyhow::Error) { @@ -99,23 +131,55 @@ impl Convert { .with_password(self.password.clone()) .open_file(&self.input, warn)? .into_parts(); - let writer = match self.output { - Some(path) => Box::new(File::create(path)?) as Box, - None => Box::new(stdout()), + + // Take only the first `self.max_cases` cases. + let cases = cases.take(self.max_cases.unwrap_or(usize::MAX)); + + let output_format = match self.output_format { + Some(format) => format, + None => { + let Some(output) = &self.output else { + bail!("either --output-format or an output file name must be specified"); + }; + output.as_path().try_into()? + } }; - let mut output = csv::WriterBuilder::new().from_writer(writer); - if !self.csv_options.no_var_names { - output.write_record(dictionary.variables.iter().map(|var| var.name.as_str()))?; - } - for (_case_number, case) in (0..self.max_cases.unwrap_or(u64::MAX)).zip(cases) { - output.write_record(case?.0.into_iter().zip(dictionary.variables.iter()).map( - |(datum, variable)| { - datum - .display(variable.print_format, variable.encoding) - .to_string() - }, - ))?; + match output_format { + OutputFormat::Csv => { + let writer = match self.output { + Some(path) => Box::new(File::create(path)?) as Box, + None => Box::new(stdout()), + }; + let mut output = csv::WriterBuilder::new().from_writer(writer); + if !self.csv_options.no_var_names { + output + .write_record(dictionary.variables.iter().map(|var| var.name.as_str()))?; + } + + for case in cases { + output.write_record( + case?.0.into_iter().zip(dictionary.variables.iter()).map( + |(datum, variable)| { + datum + .display(variable.print_format, variable.encoding) + .to_string() + }, + ), + )?; + } + } + OutputFormat::Sys => { + let Some(output) = &self.output else { + bail!("output file name must be specified for output to a system file") + }; + let mut output = sys::WriteOptions::new() + .with_compression(self.sys_options.compression) + .write_file(&dictionary, output)?; + for case in cases { + output.write_case(&case?)?; + } + } } Ok(()) } @@ -220,7 +284,7 @@ enum Mode { Raw, Decoded, #[default] - Cooked, + Parsed, } fn main() -> Result<()> { @@ -247,12 +311,13 @@ fn dissect( return Ok(()); } Mode::Raw => { + println!("{:#?}", reader.header()); for record in reader.records() { let header = record?; - println!("{:?}", header); + println!("{:#?}", header); } for (_index, case) in (0..max_cases).zip(reader.cases()) { - println!("{:?}", case?); + println!("{:#?}", case?); } } Mode::Decoded => { @@ -264,21 +329,10 @@ fn dissect( let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); for header in records { let header = header.decode(&mut decoder); - println!("{:?}", header); - /* - if let Record::Cases(cases) = header { - let mut cases = cases.borrow_mut(); - for _ in 0..max_cases { - let Some(Ok(record)) = cases.next() else { - break; - }; - println!("{:?}", record); - } - } - */ + println!("{:#?}", header); } } - Mode::Cooked => { + Mode::Parsed => { let records: Vec = reader.records().collect::, _>>()?; let encoding = match encoding { Some(encoding) => encoding, diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 05dc637fb3..802cd87d3d 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -16,6 +16,7 @@ use std::{ collections::BTreeMap, + fmt::{Debug, Display}, fs::File, io::{Read, Seek}, ops::Range, @@ -49,7 +50,7 @@ use crate::{ }, }; use anyhow::{anyhow, Error as AnyError}; -use binrw::io::BufReader; +use binrw::{io::BufReader, BinRead, BinWrite}; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::Encoding; use indexmap::set::MutableValues; @@ -1298,6 +1299,72 @@ impl Records { } } +/// Product version number in a system file. +/// +/// # Example +/// +/// `ProductVersion(1,2,3)` is version 1.2.3. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, BinRead, BinWrite)] +pub struct ProductVersion( + /// Major version. + pub i32, + /// Minor version + pub i32, + /// Revision. + pub i32, +); + +impl ProductVersion { + /// This version of PSPP. + pub const VERSION: Self = { + const fn parse_integer(mut s: &[u8]) -> (i32, &[u8]) { + let mut value = 0; + let mut n = 0; + while let Some((c, rest)) = s.split_first() + && *c >= b'0' + && *c <= b'9' + { + value = value * 10 + (*c - b'0') as i32; + n += 1; + s = rest; + } + assert!(n > 0); + (value, s) + } + + const fn skip_dot(s: &[u8]) -> &[u8] { + let Some((c, rest)) = s.split_first() else { + unreachable!() + }; + assert!(*c == b'.'); + rest + } + + // Parse `CARGO_PKG_VERSION`. This could be easier if `const` contexts + // were less restricted. + let s = env!("CARGO_PKG_VERSION").as_bytes(); + let (first, s) = parse_integer(s); + let s = skip_dot(s); + let (second, s) = parse_integer(s); + let s = skip_dot(s); + let (third, s) = parse_integer(s); + assert!(matches!(s.first(), None | Some(b'-' | b'+'))); + Self(first, second, third) + }; +} + +impl Display for ProductVersion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}.{}.{}", self.0, self.1, self.2) + } +} + +impl Debug for ProductVersion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} + /// System file metadata that is not part of [Dictionary]. /// /// [Dictionary]: crate::dictionary::Dictionary @@ -1327,8 +1394,7 @@ pub struct Metadata { /// Version number of the product that wrote the file. /// - /// For example, `(1,2,3)` is version 1.2.3. - pub version: Option<(i32, i32, i32)>, + pub version: Option, } impl Metadata { @@ -1403,7 +1469,7 @@ impl Metadata { let product = header .eye_catcher .trim_start_matches("@(#) SPSS DATA FILE") - .trim_end() + .trim() .to_string(); Self { diff --git a/rust/pspp/src/sys/mod.rs b/rust/pspp/src/sys/mod.rs index b05eba3ec6..c3549ed358 100644 --- a/rust/pspp/src/sys/mod.rs +++ b/rust/pspp/src/sys/mod.rs @@ -36,6 +36,7 @@ pub mod raw; pub mod sack; mod write; +pub use write::{Version, WriteOptions, Writer}; #[cfg(test)] mod test; diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index a246f137c3..feddf3fa70 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -20,18 +20,22 @@ use crate::{ endian::{Endian, Parse}, format::{Format, Type}, identifier::{Error as IdError, Identifier}, - sys::raw::{ - read_bytes, read_string, read_vec, Decoder, Error, ErrorDetails, Magic, RawDatum, - RawStrArray, RawWidth, Record, UntypedDatum, VarTypes, Warning, WarningDetails, + sys::{ + raw::{ + read_bytes, read_string, read_vec, Decoder, Error, ErrorDetails, Magic, RawDatum, + RawStrArray, RawWidth, Record, UntypedDatum, VarTypes, Warning, WarningDetails, + }, + ProductVersion, }, }; use binrw::{BinRead, BinWrite}; +use clap::ValueEnum; use itertools::Itertools; use thiserror::Error as ThisError; /// Type of compression in a system file. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] pub enum Compression { /// Simple bytecode-based compression. Simple, @@ -50,7 +54,7 @@ pub enum HeaderWarning { } /// A file header record in a system file. -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct FileHeader where S: Debug, @@ -95,39 +99,6 @@ where pub endian: Endian, } -impl FileHeader -where - S: Debug, -{ - fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> std::fmt::Result - where - T: Debug, - { - writeln!(f, "{name:>17}: {:?}", value) - } -} - -impl Debug for FileHeader -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - writeln!(f, "File header record:")?; - self.debug_field(f, "Magic", self.magic)?; - self.debug_field(f, "Product name", &self.eye_catcher)?; - self.debug_field(f, "Layout code", self.layout_code)?; - self.debug_field(f, "Nominal case size", self.nominal_case_size)?; - self.debug_field(f, "Compression", self.compression)?; - self.debug_field(f, "Weight index", self.weight_index)?; - self.debug_field(f, "Number of cases", self.n_cases)?; - self.debug_field(f, "Compression bias", self.bias)?; - self.debug_field(f, "Creation date", &self.creation_date)?; - self.debug_field(f, "Creation time", &self.creation_time)?; - self.debug_field(f, "File label", &self.file_label)?; - self.debug_field(f, "Endianness", self.endian) - } -} - #[allow(missing_docs)] #[derive(BinRead, BinWrite)] pub struct RawHeader { @@ -450,7 +421,7 @@ pub enum VariableWarning { } /// A variable record in a system file. -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct VariableRecord where S: Debug, @@ -477,20 +448,6 @@ where pub label: Option, } -impl Debug for VariableRecord -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - writeln!(f, "Width: {}", self.width,)?; - writeln!(f, "Print format: {:?}", self.print_format)?; - writeln!(f, "Write format: {:?}", self.write_format)?; - writeln!(f, "Name: {:?}", &self.name)?; - writeln!(f, "Variable label: {:?}", self.label)?; - writeln!(f, "Missing values: {:?}", self.missing_values) - } -} - #[allow(missing_docs)] #[derive(BinRead, BinWrite)] pub struct RawVariableRecord { @@ -630,7 +587,7 @@ where /// /// This represents both the type-3 and type-4 records together, since they are /// always paired anyway. -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct ValueLabelRecord where D: Debug, @@ -649,24 +606,6 @@ where pub var_type: VarType, } -impl Debug for ValueLabelRecord -where - D: Debug, - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - writeln!(f, "labels: ")?; - for label in self.labels.iter() { - writeln!(f, "{label:?}")?; - } - write!(f, "apply to {} variables", self.var_type)?; - for dict_index in self.dict_indexes.iter() { - write!(f, " #{dict_index}")?; - } - Ok(()) - } -} - impl ValueLabelRecord where D: Debug, @@ -918,9 +857,7 @@ pub struct IntegerInfoRecord { #[derive(Clone, Debug, BinRead, BinWrite)] pub struct RawIntegerInfoRecord { /// Version number. - /// - /// e.g. `(1,2,3)` for version 1.2.3. - pub version: (i32, i32, i32), + pub version: ProductVersion, /// Identifies the type of machine. /// diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index a0869ead14..53391f670b 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -4,16 +4,19 @@ use std::{ borrow::Cow, collections::HashMap, fmt::Write as _, + fs::File, io::{Cursor, Seek, Write}, + path::Path, }; use binrw::{BinWrite, Endian, Error as BinError}; use chrono::Local; use encoding_rs::Encoding; +use itertools::zip_eq; use smallvec::SmallVec; use crate::{ - data::Datum, + data::{Case, Datum}, dictionary::{ Alignment, Attributes, CategoryLabels, Dictionary, Measure, MultipleResponseType, ValueLabels, VarWidth, @@ -30,6 +33,7 @@ use crate::{ }, Magic, }, + ProductVersion, }, }; @@ -62,6 +66,40 @@ impl Default for WriteOptions { } } +impl WriteOptions { + pub fn new() -> Self { + Self::default() + } + pub fn with_compression(self, compression: Option) -> Self { + Self { + compression, + ..self + } + } + pub fn with_version(self, version: Version) -> Self { + Self { version, ..self } + } + pub fn write_file( + self, + dictionary: &Dictionary, + path: impl AsRef, + ) -> Result, BinError> { + self.write_writer(dictionary, File::create(path)?) + } + pub fn write_writer( + self, + dictionary: &Dictionary, + mut writer: W, + ) -> Result, BinError> + where + W: Write + Seek, + { + let mut dict_writer = DictionaryWriter::new(&self, &mut writer, dictionary); + dict_writer.write()?; + Ok(Writer::new(self, dict_writer.case_vars, writer)) + } +} + struct DictionaryWriter<'a, W> { compression: Option, version: Version, @@ -159,7 +197,7 @@ where }, n_cases: u32::MAX, bias: 100.0, - creation_date: as_byte_array(now.format("%d %b %Y").to_string()), + creation_date: as_byte_array(now.format("%d %b %y").to_string()), creation_time: as_byte_array(now.format("%H:%M:%S").to_string()), file_label: as_byte_array(self.dictionary.file_label.clone().unwrap_or_default()), }; @@ -292,8 +330,13 @@ where (3u32, value_labels.0.len() as u32).write_le(self.writer)?; for (datum, label) in &value_labels.0 { let label = &*self.dictionary.encoding.encode(&label).0; - let padding = label.len().next_multiple_of(8) - label.len(); - (datum, label.len() as u32, label, Zeros(padding)).write_le(self.writer)?; + let label = if label.len() > 255 { + &label[..255] + } else { + label + }; + let padding = (1 + label.len()).next_multiple_of(8) - (1 + label.len()); + (datum, label.len() as u8, label, Zeros(padding)).write_le(self.writer)?; } // Variable record. @@ -313,40 +356,6 @@ where Ok(()) } - const fn version() -> (i32, i32, i32) { - const fn parse_integer(mut s: &[u8]) -> (i32, &[u8]) { - let mut value = 0; - let mut n = 0; - while let Some((c, rest)) = s.split_first() - && *c >= b'0' - && *c <= b'9' - { - value = value * 10 + (*c - b'0') as i32; - n += 1; - s = rest; - } - assert!(n > 0); - (value, s) - } - - const fn skip_dot(s: &[u8]) -> &[u8] { - let Some((c, rest)) = s.split_first() else { - unreachable!() - }; - assert!(*c == b'.'); - rest - } - - let s = env!("CARGO_PKG_VERSION").as_bytes(); - let (first, s) = parse_integer(s); - let s = skip_dot(s); - let (second, s) = parse_integer(s); - let s = skip_dot(s); - let (third, s) = parse_integer(s); - assert!(matches!(s.first(), None | Some(b'-' | b'+'))); - (first, second, third) - } - fn write_integer_record(&mut self) -> Result<(), BinError> { ( 7u32, @@ -354,7 +363,7 @@ where 4u32, 8u32, RawIntegerInfoRecord { - version: Self::version(), + version: ProductVersion::VERSION, machine_code: -1, floating_point_rep: 1, compression_code: 1, @@ -662,23 +671,6 @@ impl BinWrite for Pad { } } -impl WriteOptions { - pub fn new() -> Self { - Self::default() - } - pub fn write_writer( - self, - dictionary: &Dictionary, - mut writer: W, - ) -> Result, BinError> - where - W: Write + Seek, - { - DictionaryWriter::new(&self, &mut writer, dictionary).write()?; - todo!() - } -} - impl BinWrite for Datum { type Args<'a> = (); @@ -790,32 +782,52 @@ impl CaseVar { } } } -/* -/// A variable in a system file. -struct WriteVar { - width: VarWidth, - segment_width: u8, - case_index: usize, - /// Offset within string variable in case. - offset: usize, - - /// Number of padding bytes following data. - padding: usize, +/// System file writer. +pub struct Writer { + compression: Option, + case_vars: Vec, + inner: W, } -impl WriteVar { - fn new_vars(dictionary: &Dictionary) -> Vec { - let mut vars = Vec::new(); - for dv in &dictionary.variables { - +impl Writer { + fn new(options: WriteOptions, case_vars: Vec, inner: W) -> Self { + Self { + compression: options.compression, + case_vars, + inner, } } -}*/ - -/// System file writer. -pub struct Writer { - inner: W, } -impl Writer where W: Write + Seek {} +impl Writer +where + W: Write + Seek, +{ + pub fn write_case(&mut self, case: &Case) -> Result<(), BinError> { + match self.compression { + Some(_) => todo!(), + None => self.write_case_uncompressed(case), + } + } + fn write_case_uncompressed(&mut self, case: &Case) -> Result<(), BinError> { + for (var, datum) in zip_eq(&self.case_vars, &case.0) { + match var { + CaseVar::Numeric => datum + .as_number() + .unwrap() + .unwrap_or(f64::MIN) + .write_le(&mut self.inner)?, + CaseVar::String { width: _, encoding } => { + let mut s = datum.as_string().unwrap().as_bytes(); + for segment in encoding { + let data; + (data, s) = s.split_at(segment.data_bytes); + (data, Pad::new(segment.padding_bytes, 0)).write_le(&mut self.inner)?; + } + } + } + } + Ok(()) + } +}