From 835ba65119975c4babafd7af3ca75b894762666c Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sun, 6 Jul 2025 16:50:55 -0700
Subject: [PATCH] convert works!

---
 rust/Cargo.lock             |  22 ++++++
 rust/pspp/Cargo.toml        |   3 +-
 rust/pspp/src/main.rs       | 143 +++++++++++++++++++++++++++++++-----
 rust/pspp/src/sys/cooked.rs |   1 -
 rust/pspp/src/sys/raw.rs    |   3 +-
 5 files changed, 152 insertions(+), 20 deletions(-)

diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 254f167ca3..d5622a9845 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -440,6 +440,27 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "csv"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf"
+dependencies = [
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "dashmap"
 version = "5.5.3"
@@ -1568,6 +1589,7 @@ dependencies = [
  "chrono",
  "clap",
  "color",
+ "csv",
  "derive_more",
  "diff",
  "either",
diff --git a/rust/pspp/Cargo.toml b/rust/pspp/Cargo.toml
index f0a454af7d..a7080d8b1c 100644
--- a/rust/pspp/Cargo.toml
+++ b/rust/pspp/Cargo.toml
@@ -42,6 +42,7 @@ pango = "0.20.9"
 pangocairo = "0.20.7"
 zip = "4.0.0"
 xmlwriter = "0.1.0"
+csv = "1.3.1"
 
 [target.'cfg(windows)'.dependencies]
 windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
@@ -51,7 +52,7 @@ anyhow = "1.0.69"
 flate2 = "1.0.26"
 
 [[bin]]
-name = "pspp-dump-sav"
+name = "pspp"
 path = "src/main.rs"
 
 [lib]
diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs
index 8677e3e01b..57ca1bb260 100644
--- a/rust/pspp/src/main.rs
+++ b/rust/pspp/src/main.rs
@@ -15,20 +15,113 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 use anyhow::Result;
-use clap::{Parser, ValueEnum};
+use clap::{Args, Parser, Subcommand, ValueEnum};
 use encoding_rs::Encoding;
-use pspp::sys::cooked::{decode, Headers};
-use pspp::sys::raw::{encoding_from_headers, Decoder, Magic, Reader, Record};
+use pspp::sys::cooked::{decode, Error, Headers};
+use pspp::sys::raw::{encoding_from_headers, Decoder, Magic, Reader, Record, Warning};
 use std::fs::File;
-use std::io::BufReader;
+use std::io::{stdout, BufReader, Write};
 use std::path::{Path, PathBuf};
 use std::str;
 use thiserror::Error as ThisError;
 
-/// A utility to dissect SPSS system files.
+/// PSPP, a program for statistical analysis of sampled data.
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
-struct Args {
+struct Cli {
+    #[command(subcommand)]
+    command: Command,
+}
+
+/// Output file format.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)]
+enum OutputFormat {
+    /// Comma-separated values using each variable's print format (variable
+    /// names are written as the first line)
+    Csv,
+
+    /// SPSS system file.
+    Sav,
+}
+
+/// Convert SPSS data files into other formats.
+#[derive(Args, Clone, Debug)]
+struct Convert {
+    /// Input file name.
+    input: PathBuf,
+
+    /// Output file name (if omitted, output is written to stdout).
+    output: Option<PathBuf>,
+
+    // NOTE(review): `run` does not consult this field yet; it always writes CSV.
+    /// Format for output file (if omitted, the intended format is inferred
+    /// based on file extension).
+    output_format: Option<OutputFormat>,
+
+    /// The encoding to use.
+    #[arg(long, value_parser = parse_encoding)]
+    encoding: Option<&'static Encoding>,
+}
+
+impl Convert {
+    /// Reports a non-fatal `warning` on stderr.
+    fn warn(warning: Warning) {
+        eprintln!("warning: {warning}");
+    }
+
+    /// Reports `error` on stderr.
+    fn err(error: Error) {
+        eprintln!("error: {error}");
+    }
+
+    /// Reads the system file named by `input` and writes it as CSV, to `output`
+    /// if one was given or to stdout otherwise.
+    ///
+    /// The first record holds the variable names; each following record holds
+    /// one case, with every value formatted using its variable's print format.
+    fn run(self) -> Result<()> {
+        let input = BufReader::new(File::open(&self.input)?);
+        let mut reader = Reader::new(input, Self::warn)?;
+        let headers = reader.headers().collect::<Result<Vec<_>, _>>()?;
+        let cases = reader.cases();
+        let encoding = encoding_from_headers(&headers, &mut |w| Self::warn(w))?;
+        let mut decoder = Decoder::new(encoding, |w| Self::warn(w));
+        let mut decoded_records = Vec::new();
+        for header in headers {
+            decoded_records.push(header.decode(&mut decoder)?);
+        }
+        drop(decoder);
+
+        let headers = Headers::new(decoded_records, &mut |e| Self::err(e))?;
+        let (dictionary, _metadata, cases) = decode(headers, cases, encoding, |e| Self::err(e))?;
+        let writer = match self.output {
+            Some(path) => Box::new(File::create(path)?) as Box<dyn Write>,
+            None => Box::new(stdout()),
+        };
+        let mut output = csv::WriterBuilder::new().from_writer(writer);
+        output.write_record(dictionary.variables.iter().map(|var| var.name.as_str()))?;
+
+        if let Some(cases) = cases {
+            for case in cases {
+                output.write_record(case?.into_iter().zip(dictionary.variables.iter()).map(
+                    |(datum, variable)| {
+                        datum
+                            .display(variable.print_format, variable.encoding)
+                            .to_string()
+                    },
+                ))?;
+            }
+        }
+        // `csv::Writer` flushes on drop but discards any error from doing so,
+        // so flush explicitly to report write failures.
+        output.flush()?;
+        Ok(())
+    }
+}
+
+/// Dissects SPSS system files.
+#[derive(Args, Clone, Debug)]
+struct Dissect {
     /// Maximum number of cases to print.
     #[arg(long = "data", default_value_t = 0)]
     max_cases: u64,
@@ -46,6 +139,32 @@ struct Args {
     encoding: Option<&'static Encoding>,
 }
 
+impl Dissect {
+    /// Dissects each of the files in turn, stopping at the first failure.
+    fn run(self) -> Result<()> {
+        for file in self.files {
+            dissect(&file, self.max_cases, self.mode, self.encoding)?;
+        }
+        Ok(())
+    }
+}
+
+#[derive(Subcommand, Clone, Debug)]
+enum Command {
+    Convert(Convert),
+    Dissect(Dissect),
+}
+
+impl Command {
+    /// Runs the selected subcommand.
+    fn run(self) -> Result<()> {
+        match self {
+            Command::Convert(convert) => convert.run(),
+            Command::Dissect(dissect) => dissect.run(),
+        }
+    }
+}
+
 #[derive(ThisError, Debug)]
 #[error("{0}: unknown encoding")]
 struct UnknownEncodingError(String);
@@ -67,17 +186,7 @@ enum Mode {
 }
 
 fn main() -> Result<()> {
-    let Args {
-        max_cases,
-        files,
-        mode,
-        encoding,
-    } = Args::parse();
-
-    for file in files {
-        dissect(&file, max_cases, mode, encoding)?;
-    }
-    Ok(())
+    Cli::parse().command.run()
 }
 
 fn dissect(
diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs
index bf4169fc28..c513c20934 100644
--- a/rust/pspp/src/sys/cooked.rs
+++ b/rust/pspp/src/sys/cooked.rs
@@ -950,7 +950,6 @@ pub fn decode(
         .iter()
         .flat_map(|record| record.0.iter().cloned())
     {
-        dbg!(&renaming);
         let LongName {
             short_name,
             long_name,
diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs
index 44c40c4570..b1e0528459 100644
--- a/rust/pspp/src/sys/raw.rs
+++ b/rust/pspp/src/sys/raw.rs
@@ -945,7 +945,7 @@ impl Datum {
                     values.push(Datum::Number(endian.parse(raw)));
                 }
                 CaseVar::String { width, encoding } => {
-                    let mut datum = vec![0; *width];
+                    let mut datum = Vec::with_capacity(*width);
                     for segment in encoding {
                         let mut data_bytes = segment.data_bytes;
                         let mut padding_bytes = segment.padding_bytes;
@@ -1177,6 +1177,7 @@ impl<T> ReadSeek for T where T: Read + Seek {}
 
 pub struct Case(pub Vec<Datum>);
 
+#[derive(Debug)]
 struct StringSegment {
     data_bytes: usize,
     padding_bytes: usize,
-- 
2.30.2