work
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 10 Dec 2023 18:19:41 +0000 (10:19 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 10 Dec 2023 18:19:41 +0000 (10:19 -0800)
rust/src/cooked.rs
rust/src/raw.rs

index 966e4b3e45637016d7bd0e022cef1e34bc70537d..f1753846c21f1c135750a533a845df87d7583e69 100644 (file)
@@ -1,4 +1,7 @@
-use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range};
+use std::{
+    borrow::Cow, cell::RefCell, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range,
+    rc::Rc,
+};
 
 use crate::{
     encoding::{default_encoding, get_encoding, Error as EncodingError},
@@ -207,37 +210,140 @@ pub struct Decoder {
     n_generated_names: usize,
 }
 
+fn decode_sort_order(record: &raw::Record) -> i32 {
+    match record {
+        // Returns a rank for `record`'s kind; the file header ranks lowest (0).
+        raw::Record::Header(_) => 0,
+
+        // Then the records used to decide character encoding.
+        raw::Record::Encoding(_) => 1,
+        raw::Record::IntegerInfo(_) => 2,
+
+        // Then the other records that don't use variables at all.
+        raw::Record::Document(_) => 3,
+        raw::Record::FloatInfo(_) => 4,
+        raw::Record::ProductInfo(_) => 5,
+        raw::Record::FileAttributes(_) => 6,
+
+        // Variable records (ranked ahead of every record that refers to variables).
+        raw::Record::Variable(_) => 7,
+
+        // These records use variable indexes that would be invalidated by very
+        // long string variables, so they rank ahead of `VeryLongStrings`.
+        raw::Record::ValueLabel(_) => 8,
+        raw::Record::VarDisplay(_) => 9,
+
+        // These records use short names, so they rank ahead of `LongNames`.
+        raw::Record::MultipleResponse(_) => 10,
+        raw::Record::VeryLongStrings(_) => 11,
+
+        // Rename short names to long names.
+        raw::Record::LongNames(_) => 12,
+
+        // These records use long names, so they rank after `LongNames`.
+        raw::Record::VariableAttributes(_) => 13,
+        raw::Record::LongStringValueLabels(_) => 14,
+        raw::Record::LongStringMissingValues(_) => 15,
+        raw::Record::VariableSets(_) => 16,
+
+        // Cases get the highest rank among the records that are used.
+        raw::Record::Cases(_) => 17,
+
+        // We don't use these records at all; they all share the maximum rank.
+        raw::Record::NumberOfCases(_) => i32::MAX,
+        raw::Record::OtherExtension(_) => i32::MAX,
+        raw::Record::EndOfHeaders(_) => i32::MAX,
+        raw::Record::ZHeader(_) => i32::MAX,
+        raw::Record::ZTrailer(_) => i32::MAX,
+    }
+}
+
+#[derive(Default)] // every slot starts empty: `None` or an empty `Vec`
+struct Headers<'a> {
+    header: Option<&'a raw::HeaderRecord>, // `Option` slots hold at most one borrowed record
+    variables: Vec<&'a raw::VariableRecord>, // `Vec` slots hold any number of borrowed records
+    value_labels: Vec<&'a raw::ValueLabelRecord>,
+    document: Option<&'a raw::DocumentRecord>,
+    integer_info: Option<&'a raw::IntegerInfoRecord>,
+    float_info: Option<&'a raw::FloatInfoRecord>,
+    variable_sets: Vec<&'a raw::TextRecord>,
+    var_display: Option<&'a raw::VarDisplayRecord>,
+    multiple_response: Vec<&'a raw::MultipleResponseRecord>,
+    long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord>,
+    long_string_missing_values: Vec<&'a raw::LongStringMissingValueRecord>,
+    encoding: Option<&'a raw::EncodingRecord>,
+    number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
+    product_info: Option<&'a raw::TextRecord>,
+    long_names: Vec<&'a raw::TextRecord>,
+    very_long_strings: Vec<&'a raw::TextRecord>,
+    file_attributes: Vec<&'a raw::TextRecord>,
+    variable_attributes: Vec<&'a raw::TextRecord>,
+    other_extensions: Vec<&'a raw::Extension>,
+    cases: Option<&'a Rc<RefCell<raw::Cases>>>, // borrows the shared handle; no `Rc` clone
+}
+
+fn set_or_warn<T>(option: &mut Option<T>, value: T, warn: &impl Fn(Error)) {
+    match option {
+        // Slot already filled: keep the earlier value and report the repeat.
+        Some(_) => warn(Error::TBD),
+        None => *option = Some(value),
+    }
+}
+
+impl<'a> Headers<'a> {
+    fn new(headers: &'a [raw::Record], warn: &impl Fn(Error)) -> Headers<'a> {
+        let mut h = Headers::default(); // start empty, then file each record under its kind
+        for header in headers {
+            match header {
+                raw::Record::Header(record) => set_or_warn(&mut h.header, record, warn),
+                raw::Record::Variable(record) => h.variables.push(record),
+                raw::Record::ValueLabel(record) => h.value_labels.push(record),
+                raw::Record::Document(record) => set_or_warn(&mut h.document, record, warn),
+                raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn),
+                raw::Record::FloatInfo(record) => set_or_warn(&mut h.float_info, record, warn),
+                raw::Record::VariableSets(record) => h.variable_sets.push(record),
+                raw::Record::VarDisplay(record) => set_or_warn(&mut h.var_display, record, warn),
+                raw::Record::MultipleResponse(record) => h.multiple_response.push(record),
+                raw::Record::LongStringValueLabels(record) => {
+                    h.long_string_value_labels.push(record)
+                }
+                raw::Record::LongStringMissingValues(record) => {
+                    h.long_string_missing_values.push(record)
+                }
+                raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn),
+                raw::Record::NumberOfCases(record) => {
+                    set_or_warn(&mut h.number_of_cases, record, warn)
+                }
+                raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn),
+                raw::Record::LongNames(record) => h.long_names.push(record),
+                raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record),
+                raw::Record::FileAttributes(record) => h.file_attributes.push(record),
+                raw::Record::VariableAttributes(record) => h.variable_attributes.push(record),
+                raw::Record::OtherExtension(record) => h.other_extensions.push(record),
+                raw::Record::EndOfHeaders(_) => (), // not used by decoding: skip, don't panic
+                raw::Record::ZHeader(_) => (), // likewise skipped
+                raw::Record::ZTrailer(_) => (), // likewise skipped
+                raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn),
+            }
+        }
+        h
+    }
+}
+
 pub fn decode(
     headers: Vec<raw::Record>,
     encoding: Option<&'static Encoding>,
     warn: &impl Fn(Error),
 ) -> Result<Vec<Record>, Error> {
-    let Some(header_record) = headers.iter().find_map(|rec| {
-        if let raw::Record::Header(header) = rec {
-            Some(header)
-        } else {
-            None
-        }
-    }) else {
+    let h = Headers::new(&headers, warn);
+    let Some(header) = h.header else {
         return Err(Error::MissingHeaderRecord);
     };
     let encoding = match encoding {
         Some(encoding) => encoding,
         None => {
-            let encoding = headers.iter().find_map(|rec| {
-                if let raw::Record::Encoding(ref e) = rec {
-                    Some(e.0.as_str())
-                } else {
-                    None
-                }
-            });
-            let character_code = headers.iter().find_map(|rec| {
-                if let raw::Record::IntegerInfo(ref r) = rec {
-                    Some(r.character_code)
-                } else {
-                    None
-                }
-            });
+            let encoding = h.encoding.map(|record| record.0.as_str());
+            let character_code = h.integer_info.map(|record| record.character_code);
             match get_encoding(encoding, character_code) {
                 Ok(encoding) => encoding,
                 Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
@@ -251,8 +357,8 @@ pub fn decode(
     };
 
     let mut decoder = Decoder {
-        compression: header_record.compression,
-        endian: header_record.endian,
+        compression: header.compression,
+        endian: header.endian,
         encoding,
         variables: HashMap::new(),
         var_names: HashMap::new(),
@@ -261,99 +367,105 @@ pub fn decode(
     };
 
     let mut output = Vec::with_capacity(headers.len());
-    for header in &headers {
-        match header {
-            raw::Record::Header(ref input) => {
-                if let Some(header) = HeaderRecord::try_decode(&mut decoder, input, warn)? {
-                    output.push(Record::Header(header))
-                }
-            }
-            raw::Record::Variable(ref input) => {
-                if let Some(variable) = VariableRecord::try_decode(&mut decoder, input, warn)? {
-                    output.push(Record::Variable(variable));
-                }
-            }
-            raw::Record::ValueLabel(ref input) => {
-                if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, input, warn)?
-                {
-                    output.push(Record::ValueLabel(value_label));
-                }
-            }
-            raw::Record::Document(ref input) => {
-                if let Some(document) = DocumentRecord::try_decode(&mut decoder, input, warn)? {
-                    output.push(Record::Document(document))
-                }
-            }
-            raw::Record::IntegerInfo(ref input) => output.push(Record::IntegerInfo(input.clone())),
-            raw::Record::FloatInfo(ref input) => output.push(Record::FloatInfo(input.clone())),
-            raw::Record::VariableSets(ref input) => {
-                let s = decoder.decode_string_cow(&input.text.0, warn);
-                output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
-            }
-            raw::Record::VarDisplay(ref input) => {
-                if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, input, warn)? {
-                    output.push(Record::VarDisplay(vdr))
-                }
-            }
-            raw::Record::MultipleResponse(ref input) => {
-                if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, input, warn)? {
-                    output.push(Record::MultipleResponse(mrr))
-                }
-            }
-            raw::Record::LongStringMissingValues(ref input) => {
-                if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, input, warn)? {
-                    output.push(Record::LongStringMissingValues(mrr))
-                }
-            }
-            raw::Record::LongStringValueLabels(ref input) => {
-                if let Some(mrr) =
-                    LongStringValueLabelRecord::try_decode(&mut decoder, input, warn)?
-                {
-                    output.push(Record::LongStringValueLabels(mrr))
-                }
-            }
-            raw::Record::Encoding(ref input) => output.push(Record::Encoding(input.clone())),
-            raw::Record::NumberOfCases(ref input) => {
-                output.push(Record::NumberOfCases(input.clone()))
-            }
-            raw::Record::ProductInfo(ref input) => {
-                let s = decoder.decode_string_cow(&input.text.0, warn);
-                output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?));
-            }
-            raw::Record::LongNames(ref input) => {
-                let s = decoder.decode_string_cow(&input.text.0, warn);
-                output.push(Record::LongNames(LongNameRecord::parse(
-                    &mut decoder,
-                    &s,
-                    warn,
-                )?));
-            }
-            raw::Record::VeryLongStrings(ref input) => {
-                let s = decoder.decode_string_cow(&input.text.0, warn);
-                output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
-                    &decoder, &s, warn,
-                )?));
-            }
-            raw::Record::FileAttributes(ref input) => {
-                let s = decoder.decode_string_cow(&input.text.0, warn);
-                output.push(Record::FileAttributes(FileAttributeRecord::parse(
-                    &decoder, &s, warn,
-                )?));
-            }
-            raw::Record::VariableAttributes(ref input) => {
-                let s = decoder.decode_string_cow(&input.text.0, warn);
-                output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
-                    &decoder, &s, warn,
-                )?));
-            }
-            raw::Record::OtherExtension(ref input) => {
-                output.push(Record::OtherExtension(input.clone()))
-            }
-            raw::Record::EndOfHeaders(_) => (),
-            raw::Record::ZHeader(_) => (),
-            raw::Record::ZTrailer(_) => (),
-            raw::Record::Cases(_) => (),
-        };
+
+    // Decode the records that don't use variables at all.
+    if let Some(header) = HeaderRecord::try_decode(&mut decoder, header, warn)? {
+        output.push(Record::Header(header))
+    }
+    if let Some(raw) = h.document {
+        if let Some(document) = DocumentRecord::try_decode(&mut decoder, raw, warn)? {
+            output.push(Record::Document(document))
+        }
+    }
+    if let Some(raw) = h.integer_info {
+        output.push(Record::IntegerInfo(raw.clone()));
+    }
+    if let Some(raw) = h.float_info {
+        output.push(Record::FloatInfo(raw.clone()));
+    }
+    if let Some(raw) = h.product_info {
+        let s = decoder.decode_string_cow(&raw.text.0, warn);
+        output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?));
+    }
+    if let Some(raw) = h.number_of_cases {
+        output.push(Record::NumberOfCases(raw.clone()))
+    }
+    for &raw in &h.file_attributes {
+        let s = decoder.decode_string_cow(&raw.text.0, warn);
+        output.push(Record::FileAttributes(FileAttributeRecord::parse(
+            &decoder, &s, warn,
+        )?));
+    }
+    for &raw in &h.other_extensions {
+        output.push(Record::OtherExtension(raw.clone()));
+    }
+
+    // Decode the variable records, which are the basis of almost everything
+    // else.
+    for &raw in &h.variables {
+        if let Some(variable) = VariableRecord::try_decode(&mut decoder, raw, warn)? {
+            output.push(Record::Variable(variable));
+        }
+    }
+
+    // Decode value labels and weight variable.  These use indexes into the
+    // variable records, so we need to parse them before those indexes become
+    // invalidated by very long string variables.
+    for &raw in &h.value_labels {
+        if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
+            output.push(Record::ValueLabel(value_label));
+        }
+    }
+    // XXX weight
+    if let Some(raw) = h.var_display {
+        if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, raw, warn)? {
+            output.push(Record::VarDisplay(vdr))
+        }
+    }
+
+    // Decode records that use short names.
+    for &raw in &h.multiple_response {
+        if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, raw, warn)? {
+            output.push(Record::MultipleResponse(mrr))
+        }
+    }
+    for &raw in &h.very_long_strings {
+        let s = decoder.decode_string_cow(&raw.text.0, warn);
+        output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
+            &decoder, &s, warn,
+        )?));
+    }
+
+    // Rename variables to their long names.
+    for &raw in &h.long_names {
+        let s = decoder.decode_string_cow(&raw.text.0, warn);
+        output.push(Record::LongNames(LongNameRecord::parse(
+            &mut decoder,
+            &s,
+            warn,
+        )?));
+    }
+
+    // Decode records that use long names.
+    for &raw in &h.variable_attributes {
+        let s = decoder.decode_string_cow(&raw.text.0, warn);
+        output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
+            &decoder, &s, warn,
+        )?));
+    }
+    for &raw in &h.long_string_value_labels {
+        if let Some(mrr) = LongStringValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
+            output.push(Record::LongStringValueLabels(mrr))
+        }
+    }
+    for &raw in &h.long_string_missing_values {
+        if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, raw, warn)? {
+            output.push(Record::LongStringMissingValues(mrr))
+        }
+    }
+    for &raw in &h.variable_sets {
+        let s = decoder.decode_string_cow(&raw.text.0, warn);
+        output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
     }
     Ok(output)
 }
@@ -1356,7 +1468,7 @@ impl LongStringMissingValues {
 
         Ok(LongStringMissingValues {
             var_name,
-            missing_values
+            missing_values,
         })
     }
 }
@@ -1365,7 +1477,7 @@ impl LongStringMissingValues {
 pub struct LongStringMissingValuesRecord(Vec<LongStringMissingValues>);
 
 impl TryDecode for LongStringMissingValuesRecord {
-    type Input = raw::LongStringMissingValueSet;
+    type Input = raw::LongStringMissingValueRecord;
 
     fn try_decode(
         decoder: &mut Decoder,
index 8b69f760d6364299cc19271f23efb9570bfd8975..2eb96b2fec1a8d5d2f2349b7e618ed23811745b0 100644 (file)
@@ -145,7 +145,7 @@ pub enum Record {
     VarDisplay(VarDisplayRecord),
     MultipleResponse(MultipleResponseRecord),
     LongStringValueLabels(LongStringValueLabelRecord),
-    LongStringMissingValues(LongStringMissingValueSet),
+    LongStringMissingValues(LongStringMissingValueRecord),
     Encoding(EncodingRecord),
     NumberOfCases(NumberOfCasesRecord),
     ProductInfo(TextRecord),
@@ -1432,9 +1432,9 @@ pub struct LongStringMissingValues {
 }
 
 #[derive(Clone, Debug)]
-pub struct LongStringMissingValueSet(pub Vec<LongStringMissingValues>);
+pub struct LongStringMissingValueRecord(pub Vec<LongStringMissingValues>);
 
-impl ExtensionRecord for LongStringMissingValueSet {
+impl ExtensionRecord for LongStringMissingValueRecord {
     const SUBTYPE: u32 = 22;
     const SIZE: Option<u32> = Some(1);
     const COUNT: Option<u32> = None;
@@ -1480,7 +1480,7 @@ impl ExtensionRecord for LongStringMissingValueSet {
                 missing_values,
             });
         }
-        Ok(Record::LongStringMissingValues(LongStringMissingValueSet(
+        Ok(Record::LongStringMissingValues(LongStringMissingValueRecord(
             missing_value_set,
         )))
     }
@@ -1506,7 +1506,7 @@ impl ExtensionRecord for EncodingRecord {
     }
 }
 
-#[derive(Clone, Debug)]
+#[derive(Copy, Clone, Debug)]
 pub struct NumberOfCasesRecord {
     /// Always observed as 1.
     pub one: u64,