From: Ben Pfaff Date: Mon, 26 Feb 2024 17:11:17 +0000 (-0800) Subject: variables parsed X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7e2346251b2a07f03e2b5e77f2f9b938a9b00ab7;p=pspp variables parsed --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 5d32c91ced..ee66890027 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -1,10 +1,10 @@ -use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; +use std::{cell::RefCell, ops::Range, rc::Rc, collections::HashMap}; use crate::{ - dictionary::{Dictionary, VarWidth}, + dictionary::{Dictionary, VarWidth, Variable}, encoding::Error as EncodingError, endian::Endian, - format::{Error as FormatError, Spec}, + format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, raw::{ self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, @@ -17,6 +17,7 @@ use crate::{ }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::Encoding; +use num::Integer; use thiserror::Error as ThisError; pub use crate::raw::{CategoryLabels, Compression}; @@ -162,23 +163,6 @@ pub enum Error { type DictIndex = usize; -pub struct Variable { - pub dict_index: DictIndex, - pub short_name: Identifier, - pub long_name: Option, - pub width: VarWidth, -} - -pub struct Decoder { - pub raw: raw::Decoder, - pub encoding: &'static Encoding, - pub variables: HashMap, - pub var_names: HashMap, - pub dictionary: Dictionary, - n_dict_indexes: usize, - n_generated_names: usize, -} - #[derive(Clone, Debug)] pub struct Headers { pub header: HeaderRecord, @@ -396,6 +380,30 @@ impl Metadata { } } +struct Decoder { + //pub raw: raw::Decoder, + pub encoding: &'static Encoding, + //pub variables: HashMap, + //pub var_names: HashMap, + //pub dictionary: Dictionary, + //n_dict_indexes: usize, + n_generated_names: usize, +} + +impl Decoder { + fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier { + loop { + self.n_generated_names += 1; + let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding) + .unwrap(); + if !dictionary.variables.contains(&name) { + return name; + } + assert!(self.n_generated_names < usize::MAX); + } + } +} + pub fn decode( mut headers: Headers, encoding: &'static Encoding, @@ -409,7 +417,7 @@ pub fn decode( } for attributes in headers.file_attributes.drain(..) { - dictionary.attributes.extend(attributes.0.0.into_iter()) + dictionary.attributes.extend(attributes.0 .0.into_iter()) } // Concatenate all the document records (really there should only be one) @@ -424,16 +432,86 @@ pub fn decode( // XXX warn for weird integer format // XXX warn for weird floating-point format, etc. - /* - let mut decoder = Decoder { - raw: decoder, - variables: HashMap::new(), - var_names: HashMap::new(), - dictionary, - n_dict_indexes: 0, - n_generated_names: 0, + let mut decoder = Decoder { + encoding, + n_generated_names: 0, + }; + + let mut header_vars = headers.variable.iter().enumerate(); + let mut var_index_map = HashMap::new(); + while let Some((value_index, input)) = header_vars.next() { + let name = trim_end_spaces(input.name.to_string()); + let name = match Identifier::new(&name, encoding) { + Ok(name) => { + if !dictionary.variables.contains(&name) { + name + } else { + let new_name = decoder.generate_name(&dictionary); + warn(Error::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name + } + } + Err(id_error) => { + let new_name = decoder.generate_name(&dictionary); + warn(Error::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name + } }; - */ + let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap()); + + // Set the short name the same as the long name (even if we renamed it). + variable.short_names = vec![name]; + + variable.label = input.label.clone(); + + variable.missing_values = input.missing_values.clone(); + + variable.print_format = decode_format( + input.print_format, + variable.width, + |new_spec, format_error| { + warn(Error::InvalidPrintFormat { + new_spec, + variable: variable.name.clone(), + format_error, + }) + }, + ); + variable.write_format = decode_format( + input.write_format, + variable.width, + |new_spec, format_error| { + warn(Error::InvalidWriteFormat { + new_spec, + variable: variable.name.clone(), + format_error, + }) + }, + ); + + // Skip long string continuation records. + if input.width > 0 { + #[allow(unstable_name_collisions)] + for _ in 1..input.width.div_ceil(&8) { + if let Some((_, continuation)) = header_vars.next() { + if continuation.width == -1 { + continue; + } + } + return Err(Error::TBD); + } + } + + let dict_index = dictionary.add_var(variable).unwrap(); + assert_eq!(var_index_map.insert(value_index, dict_index), None); + } + let metadata = Metadata::decode(&headers, warn); Ok((dictionary, metadata)) } @@ -464,6 +542,17 @@ fn fix_line_ends(s: &str) -> String { out } +fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec { + UncheckedSpec::try_from(raw) + .and_then(Spec::try_from) + .and_then(|x| x.check_width_compatibility(width)) + .unwrap_or_else(|error| { + let new_format = Spec::default_for_width(width); + warn(new_format, error); + new_format + }) +} + /* impl Decoder { fn generate_name(&mut self) -> Identifier { @@ -677,16 +766,6 @@ pub struct VariableRecord { pub label: Option, } -fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec { - UncheckedSpec::try_from(raw) - .and_then(Spec::try_from) - .and_then(|x| x.check_width_compatibility(width)) - .unwrap_or_else(|error| { - let new_format = Spec::default_for_width(width); - warn(new_format, error); - new_format - }) -} fn parse_variable_record( decoder: &mut Decoder, diff --git a/rust/src/dictionary.rs b/rust/src/dictionary.rs index 59e1e3a853..042a294452 100644 --- a/rust/src/dictionary.rs +++ b/rust/src/dictionary.rs @@ -78,6 +78,15 @@ impl VarWidth { VarWidth::String(width) => *width.min(&32) as u32, } } + + pub fn from_raw(raw: impl Into) -> Result { + let raw: i32 = raw.into(); + match raw { + 0 => Ok(Self::Numeric), + 1..=255 => Ok(Self::String(raw as u16)), + _ => Err(()), + } + } } impl From for VarType { @@ -120,6 +129,7 @@ pub struct Dictionary { pub encoding: &'static Encoding, } +#[derive(Debug)] pub struct DuplicateVariableName; impl Dictionary { @@ -140,9 +150,10 @@ impl Dictionary { } } - pub fn add_var(&mut self, variable: Variable) -> Result<(), DuplicateVariableName> { - if self.variables.insert(ByIdentifier::new(variable)) { - Ok(()) + pub fn add_var(&mut self, variable: Variable) -> Result { + let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable)); + if inserted { + Ok(index) } else { Err(DuplicateVariableName) } diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 0620d4eea6..e8a279f5e8 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -46,6 +46,12 @@ pub enum Error { #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] BadRecordType { offset: u64, rec_type: u32 }, + #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")] + BadVariableWidth { + start_offset: u64, + width: i32, + }, + #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")] BadVariableLabelCode { start_offset: u64, @@ -1298,6 +1304,9 @@ impl VariableRecord> { fn read(r: &mut R, endian: Endian) -> Result { let start_offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); + if !(-1..=255).contains(&width) { + return Err(Error::BadVariableWidth { start_offset, width }); + } let code_offset = r.stream_position()?; let has_variable_label: u32 = endian.parse(read_bytes(r)?); let missing_value_code: i32 = endian.parse(read_bytes(r)?);