From: Ben Pfaff Date: Sun, 10 Dec 2023 18:19:41 +0000 (-0800) Subject: work X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a9e83f5f9daa98e4f8c9fd611dadbaa0f2fe4332;p=pspp work --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 966e4b3e45..f1753846c2 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -1,4 +1,7 @@ -use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range}; +use std::{ + borrow::Cow, cell::RefCell, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range, + rc::Rc, +}; use crate::{ encoding::{default_encoding, get_encoding, Error as EncodingError}, @@ -207,37 +210,140 @@ pub struct Decoder { n_generated_names: usize, } +fn decode_sort_order(record: &raw::Record) -> i32 { + match record { + // File header record. + raw::Record::Header(_) => 0, + + // Then the records used to decide character encoding. + raw::Record::Encoding(_) => 1, + raw::Record::IntegerInfo(_) => 2, + + // Then the other records that don't use variables at all. + raw::Record::Document(_) => 3, + raw::Record::FloatInfo(_) => 4, + raw::Record::ProductInfo(_) => 5, + raw::Record::FileAttributes(_) => 6, + + // Variable records. + raw::Record::Variable(_) => 7, + + // These records use variable indexes that would be invalidated by very + // long string variables. + raw::Record::ValueLabel(_) => 8, + raw::Record::VarDisplay(_) => 9, + + // These records use short names. + raw::Record::MultipleResponse(_) => 10, + raw::Record::VeryLongStrings(_) => 11, + + // Rename short names to long names. + raw::Record::LongNames(_) => 12, + + // These records use long names. + raw::Record::VariableAttributes(_) => 13, + raw::Record::LongStringValueLabels(_) => 14, + raw::Record::LongStringMissingValues(_) => 15, + raw::Record::VariableSets(_) => 16, + + // Cases come last. + raw::Record::Cases(_) => 17, + + // We don't use these records at all. 
+ raw::Record::NumberOfCases(_) => i32::MAX, + raw::Record::OtherExtension(_) => i32::MAX, + raw::Record::EndOfHeaders(_) => i32::MAX, + raw::Record::ZHeader(_) => i32::MAX, + raw::Record::ZTrailer(_) => i32::MAX, + } +} + +#[derive(Default)] +struct Headers<'a> { + header: Option<&'a raw::HeaderRecord>, + variables: Vec<&'a raw::VariableRecord>, + value_labels: Vec<&'a raw::ValueLabelRecord>, + document: Option<&'a raw::DocumentRecord>, + integer_info: Option<&'a raw::IntegerInfoRecord>, + float_info: Option<&'a raw::FloatInfoRecord>, + variable_sets: Vec<&'a raw::TextRecord>, + var_display: Option<&'a raw::VarDisplayRecord>, + multiple_response: Vec<&'a raw::MultipleResponseRecord>, + long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord>, + long_string_missing_values: Vec<&'a raw::LongStringMissingValueRecord>, + encoding: Option<&'a raw::EncodingRecord>, + number_of_cases: Option<&'a raw::NumberOfCasesRecord>, + product_info: Option<&'a raw::TextRecord>, + long_names: Vec<&'a raw::TextRecord>, + very_long_strings: Vec<&'a raw::TextRecord>, + file_attributes: Vec<&'a raw::TextRecord>, + variable_attributes: Vec<&'a raw::TextRecord>, + other_extensions: Vec<&'a raw::Extension>, + cases: Option<&'a Rc>>, +} + +fn set_or_warn(option: &mut Option, value: T, warn: &impl Fn(Error)) { + if option.is_none() { + let _ = option.insert(value); + } else { + warn(Error::TBD); + } +} + +impl<'a> Headers<'a> { + fn new(headers: &'a Vec, warn: &impl Fn(Error)) -> Headers<'a> { + let mut h = Headers::default(); + for header in headers { + match header { + raw::Record::Header(record) => set_or_warn(&mut h.header, record, warn), + raw::Record::Variable(record) => h.variables.push(record), + raw::Record::ValueLabel(record) => h.value_labels.push(record), + raw::Record::Document(record) => set_or_warn(&mut h.document, record, warn), + raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn), + raw::Record::FloatInfo(record) => set_or_warn(&mut 
h.float_info, record, warn), + raw::Record::VariableSets(record) => h.variable_sets.push(record), + raw::Record::VarDisplay(record) => set_or_warn(&mut h.var_display, record, warn), + raw::Record::MultipleResponse(record) => h.multiple_response.push(record), + raw::Record::LongStringValueLabels(record) => { + h.long_string_value_labels.push(record) + } + raw::Record::LongStringMissingValues(record) => { + h.long_string_missing_values.push(record) + } + raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn), + raw::Record::NumberOfCases(record) => { + set_or_warn(&mut h.number_of_cases, record, warn) + } + raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn), + raw::Record::LongNames(record) => h.long_names.push(record), + raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record), + raw::Record::FileAttributes(record) => h.file_attributes.push(record), + raw::Record::VariableAttributes(record) => h.variable_attributes.push(record), + raw::Record::OtherExtension(record) => h.other_extensions.push(record), + raw::Record::EndOfHeaders(_) => todo!(), + raw::Record::ZHeader(_) => todo!(), + raw::Record::ZTrailer(_) => todo!(), + raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn), + } + } + h + } +} + pub fn decode( headers: Vec, encoding: Option<&'static Encoding>, warn: &impl Fn(Error), ) -> Result, Error> { - let Some(header_record) = headers.iter().find_map(|rec| { - if let raw::Record::Header(header) = rec { - Some(header) - } else { - None - } - }) else { + let h = Headers::new(&headers, warn); + let Some(header) = h.header else { return Err(Error::MissingHeaderRecord); }; let encoding = match encoding { Some(encoding) => encoding, None => { - let encoding = headers.iter().find_map(|rec| { - if let raw::Record::Encoding(ref e) = rec { - Some(e.0.as_str()) - } else { - None - } - }); - let character_code = headers.iter().find_map(|rec| { - if let raw::Record::IntegerInfo(ref r) = rec 
{ - Some(r.character_code) - } else { - None - } - }); + let encoding = h.encoding.map(|record| record.0.as_str()); + let character_code = h.integer_info.map(|record| record.character_code); match get_encoding(encoding, character_code) { Ok(encoding) => encoding, Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)), @@ -251,8 +357,8 @@ pub fn decode( }; let mut decoder = Decoder { - compression: header_record.compression, - endian: header_record.endian, + compression: header.compression, + endian: header.endian, encoding, variables: HashMap::new(), var_names: HashMap::new(), @@ -261,99 +367,105 @@ pub fn decode( }; let mut output = Vec::with_capacity(headers.len()); - for header in &headers { - match header { - raw::Record::Header(ref input) => { - if let Some(header) = HeaderRecord::try_decode(&mut decoder, input, warn)? { - output.push(Record::Header(header)) - } - } - raw::Record::Variable(ref input) => { - if let Some(variable) = VariableRecord::try_decode(&mut decoder, input, warn)? { - output.push(Record::Variable(variable)); - } - } - raw::Record::ValueLabel(ref input) => { - if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, input, warn)? - { - output.push(Record::ValueLabel(value_label)); - } - } - raw::Record::Document(ref input) => { - if let Some(document) = DocumentRecord::try_decode(&mut decoder, input, warn)? { - output.push(Record::Document(document)) - } - } - raw::Record::IntegerInfo(ref input) => output.push(Record::IntegerInfo(input.clone())), - raw::Record::FloatInfo(ref input) => output.push(Record::FloatInfo(input.clone())), - raw::Record::VariableSets(ref input) => { - let s = decoder.decode_string_cow(&input.text.0, warn); - output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?)); - } - raw::Record::VarDisplay(ref input) => { - if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, input, warn)? 
{ - output.push(Record::VarDisplay(vdr)) - } - } - raw::Record::MultipleResponse(ref input) => { - if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, input, warn)? { - output.push(Record::MultipleResponse(mrr)) - } - } - raw::Record::LongStringMissingValues(ref input) => { - if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, input, warn)? { - output.push(Record::LongStringMissingValues(mrr)) - } - } - raw::Record::LongStringValueLabels(ref input) => { - if let Some(mrr) = - LongStringValueLabelRecord::try_decode(&mut decoder, input, warn)? - { - output.push(Record::LongStringValueLabels(mrr)) - } - } - raw::Record::Encoding(ref input) => output.push(Record::Encoding(input.clone())), - raw::Record::NumberOfCases(ref input) => { - output.push(Record::NumberOfCases(input.clone())) - } - raw::Record::ProductInfo(ref input) => { - let s = decoder.decode_string_cow(&input.text.0, warn); - output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?)); - } - raw::Record::LongNames(ref input) => { - let s = decoder.decode_string_cow(&input.text.0, warn); - output.push(Record::LongNames(LongNameRecord::parse( - &mut decoder, - &s, - warn, - )?)); - } - raw::Record::VeryLongStrings(ref input) => { - let s = decoder.decode_string_cow(&input.text.0, warn); - output.push(Record::VeryLongStrings(VeryLongStringRecord::parse( - &decoder, &s, warn, - )?)); - } - raw::Record::FileAttributes(ref input) => { - let s = decoder.decode_string_cow(&input.text.0, warn); - output.push(Record::FileAttributes(FileAttributeRecord::parse( - &decoder, &s, warn, - )?)); - } - raw::Record::VariableAttributes(ref input) => { - let s = decoder.decode_string_cow(&input.text.0, warn); - output.push(Record::VariableAttributes(VariableAttributeRecord::parse( - &decoder, &s, warn, - )?)); - } - raw::Record::OtherExtension(ref input) => { - output.push(Record::OtherExtension(input.clone())) - } - raw::Record::EndOfHeaders(_) => (), - raw::Record::ZHeader(_) 
=> (), - raw::Record::ZTrailer(_) => (), - raw::Record::Cases(_) => (), - }; + + // Decode the records that don't use variables at all. + if let Some(header) = HeaderRecord::try_decode(&mut decoder, header, warn)? { + output.push(Record::Header(header)) + } + if let Some(raw) = h.document { + if let Some(document) = DocumentRecord::try_decode(&mut decoder, raw, warn)? { + output.push(Record::Document(document)) + } + } + if let Some(raw) = h.integer_info { + output.push(Record::IntegerInfo(raw.clone())); + } + if let Some(raw) = h.float_info { + output.push(Record::FloatInfo(raw.clone())); + } + if let Some(raw) = h.product_info { + let s = decoder.decode_string_cow(&raw.text.0, warn); + output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?)); + } + if let Some(raw) = h.number_of_cases { + output.push(Record::NumberOfCases(raw.clone())) + } + for &raw in &h.file_attributes { + let s = decoder.decode_string_cow(&raw.text.0, warn); + output.push(Record::FileAttributes(FileAttributeRecord::parse( + &decoder, &s, warn, + )?)); + } + for &raw in &h.other_extensions { + output.push(Record::OtherExtension(raw.clone())); + } + + // Decode the variable records, which are the basis of almost everything + // else. + for &raw in &h.variables { + if let Some(variable) = VariableRecord::try_decode(&mut decoder, raw, warn)? { + output.push(Record::Variable(variable)); + } + } + + // Decode value labels and weight variable. These use indexes into the + // variable records, so we need to parse them before those indexes become + // invalidated by very long string variables. + for &raw in &h.value_labels { + if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, raw, warn)? { + output.push(Record::ValueLabel(value_label)); + } + } + // XXX weight + if let Some(raw) = h.var_display { + if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, raw, warn)? { + output.push(Record::VarDisplay(vdr)) + } + } + + // Decode records that use short names. 
+ for &raw in &h.multiple_response { + if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, raw, warn)? { + output.push(Record::MultipleResponse(mrr)) + } + } + for &raw in &h.very_long_strings { + let s = decoder.decode_string_cow(&raw.text.0, warn); + output.push(Record::VeryLongStrings(VeryLongStringRecord::parse( + &decoder, &s, warn, + )?)); + } + + // Rename variables to their long names. + for &raw in &h.long_names { + let s = decoder.decode_string_cow(&raw.text.0, warn); + output.push(Record::LongNames(LongNameRecord::parse( + &mut decoder, + &s, + warn, + )?)); + } + + // Decode records that use long names. + for &raw in &h.variable_attributes { + let s = decoder.decode_string_cow(&raw.text.0, warn); + output.push(Record::VariableAttributes(VariableAttributeRecord::parse( + &decoder, &s, warn, + )?)); + } + for &raw in &h.long_string_value_labels { + if let Some(mrr) = LongStringValueLabelRecord::try_decode(&mut decoder, raw, warn)? { + output.push(Record::LongStringValueLabels(mrr)) + } + } + for &raw in &h.long_string_missing_values { + if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, raw, warn)? 
{ + output.push(Record::LongStringMissingValues(mrr)) + } + } + for &raw in &h.variable_sets { + let s = decoder.decode_string_cow(&raw.text.0, warn); + output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?)); } Ok(output) } @@ -1356,7 +1468,7 @@ impl LongStringMissingValues { Ok(LongStringMissingValues { var_name, - missing_values + missing_values, }) } } @@ -1365,7 +1477,7 @@ impl LongStringMissingValues { pub struct LongStringMissingValuesRecord(Vec); impl TryDecode for LongStringMissingValuesRecord { - type Input = raw::LongStringMissingValueSet; + type Input = raw::LongStringMissingValueRecord; fn try_decode( decoder: &mut Decoder, diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 8b69f760d6..2eb96b2fec 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -145,7 +145,7 @@ pub enum Record { VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), LongStringValueLabels(LongStringValueLabelRecord), - LongStringMissingValues(LongStringMissingValueSet), + LongStringMissingValues(LongStringMissingValueRecord), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), ProductInfo(TextRecord), @@ -1432,9 +1432,9 @@ pub struct LongStringMissingValues { } #[derive(Clone, Debug)] -pub struct LongStringMissingValueSet(pub Vec); +pub struct LongStringMissingValueRecord(pub Vec); -impl ExtensionRecord for LongStringMissingValueSet { +impl ExtensionRecord for LongStringMissingValueRecord { const SUBTYPE: u32 = 22; const SIZE: Option = Some(1); const COUNT: Option = None; @@ -1480,7 +1480,7 @@ impl ExtensionRecord for LongStringMissingValueSet { missing_values, }); } - Ok(Record::LongStringMissingValues(LongStringMissingValueSet( + Ok(Record::LongStringMissingValues(LongStringMissingValueRecord( missing_value_set, ))) } @@ -1506,7 +1506,7 @@ impl ExtensionRecord for EncodingRecord { } } -#[derive(Clone, Debug)] +#[derive(Copy, Clone, Debug)] pub struct NumberOfCasesRecord { /// Always observed as 1. pub one: u64,