work
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 25 Dec 2023 17:54:03 +0000 (09:54 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 25 Dec 2023 17:54:03 +0000 (09:54 -0800)
rust/src/cooked.rs
rust/src/raw.rs

diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs
index c6eabecdbf17bf6942f67874b8f5ef99fb637103..4c0135b949ff365b0a4ca4f2c26c5b600b3165ab 100644
--- a/rust/src/cooked.rs
+++ b/rust/src/cooked.rs
@@ -215,8 +215,8 @@ struct Headers<'a> {
     variable_sets: Vec<&'a raw::TextRecord>,
     var_display: Option<&'a raw::VarDisplayRecord>,
     multiple_response: Vec<&'a raw::MultipleResponseRecord<RawString>>,
-    long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord>,
-    long_string_missing_values: Vec<&'a raw::LongStringMissingValueRecord>,
+    long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord<RawString>>,
+    long_string_missing_values: Vec<&'a raw::LongStringMissingValueRecord<RawString, RawStr<8>>>,
     encoding: Option<&'a raw::EncodingRecord>,
     number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
     product_info: Option<&'a raw::TextRecord>,
@@ -239,6 +239,7 @@ fn set_or_warn<T>(option: &mut Option<T>, value: T, warn: &impl Fn(Error)) {
 impl<'a> Headers<'a> {
     fn new(headers: &'a Vec<raw::Record>, warn: &impl Fn(Error)) -> Headers<'a> {
         let mut h = Headers::default();
+/*
         for header in headers {
             match header {
                 raw::Record::Header(record) => set_or_warn(&mut h.header, record, warn),
@@ -272,6 +273,7 @@ impl<'a> Headers<'a> {
                 raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn),
             }
         }
+*/
         h
     }
 }
@@ -1316,7 +1318,7 @@ pub struct LongStringMissingValues {
 impl LongStringMissingValues {
     fn decode(
         decoder: &Decoder,
-        input: &raw::LongStringMissingValues,
+        input: &raw::LongStringMissingValues<RawString, RawStr<8>>,
         warn: &impl Fn(Error),
     ) -> Result<Self, Error> {
         let var_name = decoder.decode_string(&input.var_name.0, warn);
@@ -1336,7 +1338,7 @@ impl LongStringMissingValues {
 pub struct LongStringMissingValuesRecord(Vec<LongStringMissingValues>);
 
 impl TryDecode for LongStringMissingValuesRecord {
-    type Input<'a> = raw::LongStringMissingValueRecord;
+    type Input<'a> = raw::LongStringMissingValueRecord<RawString, RawStr<8>>;
 
     fn try_decode(
         decoder: &mut Decoder,
@@ -1364,7 +1366,7 @@ pub struct LongStringValueLabels {
 impl LongStringValueLabels {
     fn decode(
         decoder: &Decoder,
-        input: &raw::LongStringValueLabels,
+        input: &raw::LongStringValueLabels<RawString>,
         warn: &impl Fn(Error),
     ) -> Result<Self, Error> {
         let var_name = decoder.decode_string(&input.var_name.0, warn);
@@ -1402,7 +1404,7 @@ impl LongStringValueLabels {
 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
 
 impl TryDecode for LongStringValueLabelRecord {
-    type Input<'a> = raw::LongStringValueLabelRecord;
+    type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
 
     fn try_decode(
         decoder: &mut Decoder,
diff --git a/rust/src/raw.rs b/rust/src/raw.rs
index 85312ef4c9e1f2260817fdef5ec1e06e0a8d7f22..4ab11c3043dcf4205601df0a0c10f767ada7b07e 100644
--- a/rust/src/raw.rs
+++ b/rust/src/raw.rs
@@ -170,18 +170,19 @@ pub enum Record {
     Document(DocumentRecord<RawDocumentLine>),
     IntegerInfo(IntegerInfoRecord),
     FloatInfo(FloatInfoRecord),
-    VariableSets(TextRecord),
+    VariableSets(VariableSetRecord),
     VarDisplay(VarDisplayRecord),
     MultipleResponse(MultipleResponseRecord<RawString>),
-    LongStringValueLabels(LongStringValueLabelRecord),
-    LongStringMissingValues(LongStringMissingValueRecord),
+    LongStringValueLabels(LongStringValueLabelRecord<RawString>),
+    LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
     Encoding(EncodingRecord),
     NumberOfCases(NumberOfCasesRecord),
-    ProductInfo(TextRecord),
-    LongNames(TextRecord),
-    VeryLongStrings(TextRecord),
-    FileAttributes(TextRecord),
-    VariableAttributes(TextRecord),
+    ProductInfo(ProductInfoRecord),
+    LongNames(LongNamesRecord),
+    VeryLongStrings(VeryLongStringsRecord),
+    FileAttributes(FileAttributeRecord),
+    VariableAttributes(VariableAttributeRecord),
+    Text(TextRecord),
     OtherExtension(Extension),
     EndOfHeaders(u32),
     ZHeader(ZHeader),
@@ -399,10 +400,13 @@ struct Decoder {
 }
 
 impl Decoder {
+    fn warn(&self, error: Error) {
+        (self.warn)(error)
+    }
     fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
         let (output, malformed) = self.encoding.decode_without_bom_handling(input);
         if malformed {
-            (self.warn)(Error::MalformedString {
+            self.warn(Error::MalformedString {
                 encoding: self.encoding.name().into(),
                 text: output.clone().into(),
             });
@@ -1817,18 +1821,34 @@ impl VarDisplayRecord {
 }
 
 #[derive(Clone, Debug)]
-pub struct LongStringMissingValues {
+pub struct LongStringMissingValues<N, V>
+where
+    N: Debug,
+    V: Debug,
+{
     /// Variable name.
-    pub var_name: RawString,
+    pub var_name: N,
 
     /// Missing values.
-    pub missing_values: MissingValues<RawStr<8>>,
+    pub missing_values: MissingValues<V>,
+}
+
+impl LongStringMissingValues<RawString, RawStr<8>> {
+    fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValues<String, String> {
+        LongStringMissingValues {
+            var_name: decoder.decode(&self.var_name).to_string(),
+            missing_values: self.missing_values.decode(decoder),
+        }
+    }
 }
 
 #[derive(Clone, Debug)]
-pub struct LongStringMissingValueRecord(pub Vec<LongStringMissingValues>);
+pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
+where
+    N: Debug,
+    V: Debug;
 
-impl ExtensionRecord for LongStringMissingValueRecord {
+impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
     const SUBTYPE: u32 = 22;
     const SIZE: Option<u32> = Some(1);
     const COUNT: Option<u32> = None;
@@ -1880,6 +1900,12 @@ impl ExtensionRecord for LongStringMissingValueRecord {
     }
 }
 
+impl LongStringMissingValueRecord<RawString, RawStr<8>> {
+    fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValueRecord<String, String> {
+        LongStringMissingValueRecord(self.0.iter().map(|mv| mv.decode(decoder)).collect())
+    }
+}
+
 #[derive(Clone, Debug)]
 pub struct EncodingRecord(pub String);
 
@@ -1930,19 +1956,252 @@ impl ExtensionRecord for NumberOfCasesRecord {
 pub struct TextRecord {
     pub offsets: Range<u64>,
 
+    /// Type of record.
+    pub rec_type: TextRecordType,
+
     /// The text content of the record.
     pub text: RawString,
 }
 
-impl From<Extension> for TextRecord {
-    fn from(source: Extension) -> Self {
-        TextRecord {
-            offsets: source.offsets,
-            text: source.data.into(),
+#[derive(Clone, Copy, Debug)]
+pub enum TextRecordType {
+    VariableSets,
+    ProductInfo,
+    LongNames,
+    VeryLongStrings,
+    FileAttributes,
+    VariableAttributes,
+}
+
+impl TextRecord {
+    fn new(extension: Extension, rec_type: TextRecordType) -> Self {
+        Self {
+            offsets: extension.offsets,
+            rec_type,
+            text: extension.data.into(),
+        }
+    }
+    fn decode<'a>(&self, decoder: &Decoder) -> Result<Option<Record>, Error> {
+        match self.rec_type {
+            TextRecordType::VariableSets => Ok(Some(Record::VariableSets(
+                VariableSetRecord::decode(self, decoder),
+            ))),
+            TextRecordType::ProductInfo => Ok(Some(Record::ProductInfo(
+                ProductInfoRecord::decode(self, decoder),
+            ))),
+            TextRecordType::LongNames => Ok(Some(Record::LongNames(LongNamesRecord::decode(
+                self, decoder,
+            )))),
+            TextRecordType::VeryLongStrings => Ok(Some(Record::VeryLongStrings(
+                VeryLongStringsRecord::decode(self, decoder),
+            ))),
+            TextRecordType::FileAttributes => {
+                Ok(FileAttributeRecord::decode(self, decoder).map(|fa| Record::FileAttributes(fa)))
+            }
+            TextRecordType::VariableAttributes => {
+                Ok(Some(Record::VariableAttributes(
+VariableAttributeRecord::decode(self, decoder))))
+            }
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+    pub short_name: String,
+    pub length: u16,
+}
+
+impl VeryLongString {
+    fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
+        let Some((short_name, length)) = input.split_once('=') else {
+            return Err(Error::TBD);
+        };
+        let length = length.parse().map_err(|_| Error::TBD)?;
+        Ok(VeryLongString {
+            short_name: short_name.into(),
+            length,
+        })
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    pub name: String,
+    pub values: Vec<String>,
+}
+
+impl Attribute {
+    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Error> {
+        let Some((name, mut input)) = input.split_once('(') else {
+            return Err(Error::TBD);
+        };
+        let mut values = Vec::new();
+        loop {
+            let Some((value, rest)) = input.split_once('\n') else {
+                return Err(Error::TBD);
+            };
+            if let Some(stripped) = value
+                .strip_prefix('\'')
+                .and_then(|value| value.strip_suffix('\''))
+            {
+                values.push(stripped.into());
+            } else {
+                decoder.warn(Error::TBD);
+                values.push(value.into());
+            }
+            if let Some(rest) = rest.strip_prefix(')') {
+                let attribute = Attribute {
+                    name: name.into(),
+                    values,
+                };
+                return Ok((attribute, rest));
+            };
+            input = rest;
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct AttributeSet(pub Vec<Attribute>);
+
+impl AttributeSet {
+    fn parse<'a>(
+        decoder: &Decoder,
+        mut input: &'a str,
+        sentinel: Option<char>,
+    ) -> Result<(AttributeSet, &'a str), Error> {
+        let mut attributes = Vec::new();
+        let rest = loop {
+            match input.chars().next() {
+                None => break input,
+                c if c == sentinel => break &input[1..],
+                _ => {
+                    let (attribute, rest) = Attribute::parse(decoder, input)?;
+                    attributes.push(attribute);
+                    input = rest;
+                }
+            }
+        };
+        Ok((AttributeSet(attributes), rest))
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct FileAttributeRecord(AttributeSet);
+
+impl FileAttributeRecord {
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Option<Self> {
+        let input = decoder.decode(&source.text);
+        match AttributeSet::parse(decoder, &input, None).warn_on_error(&decoder.warn) {
+            Some((set, rest)) => {
+                if !rest.is_empty() {
+                    decoder.warn(Error::TBD);
+                }
+                Some(FileAttributeRecord(set))
+            }
+            None => None,
         }
     }
 }
 
+#[derive(Clone, Debug)]
+pub struct VarAttributeSet {
+    pub long_var_name: String,
+    pub attributes: AttributeSet,
+}
+
+impl VarAttributeSet {
+    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Error> {
+        let Some((long_var_name, rest)) = input.split_once(':') else {
+            return Err(Error::TBD);
+        };
+        let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
+        let var_attribute = VarAttributeSet {
+            long_var_name: long_var_name.into(),
+            attributes,
+        };
+        Ok((var_attribute, rest))
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
+
+impl VariableAttributeRecord {
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let decoded = decoder.decode(&source.text);
+        let mut input = decoded.as_ref();
+        let mut var_attribute_sets = Vec::new();
+        while !input.is_empty() {
+            let Some((var_attribute, rest)) =
+                VarAttributeSet::parse(decoder, &input).warn_on_error(&decoder.warn)
+            else {
+                break;
+            };
+            var_attribute_sets.push(var_attribute);
+            input = rest.into();
+        }
+        VariableAttributeRecord(var_attribute_sets)
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongStringsRecord(Vec<VeryLongString>);
+
+impl VeryLongStringsRecord {
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let input = decoder.decode(&source.text);
+        let mut very_long_strings = Vec::new();
+        for tuple in input
+            .split('\0')
+            .map(|s| s.trim_end_matches('\t'))
+            .filter(|s| !s.is_empty())
+        {
+            if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) {
+                very_long_strings.push(vls)
+            }
+        }
+        VeryLongStringsRecord(very_long_strings)
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongName {
+    pub short_name: String,
+    pub long_name: String,
+}
+
+#[derive(Clone, Debug)]
+pub struct LongNamesRecord(Vec<LongName>);
+
+impl LongNamesRecord {
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let input = decoder.decode(&source.text);
+        let mut names = Vec::new();
+        for pair in input.split('\t').filter(|s| !s.is_empty()) {
+            if let Some((short_name, long_name)) = pair.split_once('=') {
+                names.push(LongName {
+                    short_name: short_name.into(),
+                    long_name: long_name.into(),
+                });
+            } else {
+                decoder.warn(Error::TBD)
+            }
+        }
+        LongNamesRecord(names)
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct ProductInfoRecord(pub String);
+
+impl ProductInfoRecord {
+    const NAME: &'static str = "extra product info";
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        Self(decoder.decode(&source.text).into())
+    }
+}
 #[derive(Clone, Debug)]
 pub struct VariableSet {
     pub name: String,
@@ -1967,7 +2226,7 @@ pub struct VariableSetRecord {
 }
 
 impl VariableSetRecord {
-    fn decode<'a>(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
+    fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
         let mut sets = Vec::new();
         let input = decoder.decode(&source.text);
         for line in input.lines() {
@@ -2079,12 +2338,30 @@ impl Extension {
             }
             EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
             NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
-            5 => Ok(Record::VariableSets(extension.into())),
-            10 => Ok(Record::ProductInfo(extension.into())),
-            13 => Ok(Record::LongNames(extension.into())),
-            14 => Ok(Record::VeryLongStrings(extension.into())),
-            17 => Ok(Record::FileAttributes(extension.into())),
-            18 => Ok(Record::VariableAttributes(extension.into())),
+            5 => Ok(Record::Text(TextRecord::new(
+                extension,
+                TextRecordType::VariableSets,
+            ))),
+            10 => Ok(Record::Text(TextRecord::new(
+                extension,
+                TextRecordType::ProductInfo,
+            ))),
+            13 => Ok(Record::Text(TextRecord::new(
+                extension,
+                TextRecordType::LongNames,
+            ))),
+            14 => Ok(Record::Text(TextRecord::new(
+                extension,
+                TextRecordType::VeryLongStrings,
+            ))),
+            17 => Ok(Record::Text(TextRecord::new(
+                extension,
+                TextRecordType::FileAttributes,
+            ))),
+            18 => Ok(Record::Text(TextRecord::new(
+                extension,
+                TextRecordType::VariableAttributes,
+            ))),
             _ => Ok(Record::OtherExtension(extension)),
         };
         match result {
@@ -2244,18 +2521,23 @@ fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError>
 }
 
 #[derive(Clone, Debug)]
-pub struct LongStringValueLabels {
-    pub var_name: RawString,
+pub struct LongStringValueLabels<S>
+where
+    S: Debug,
+{
+    pub var_name: S,
     pub width: u32,
 
     /// `(value, label)` pairs, where each value is `width` bytes.
-    pub labels: Vec<(RawString, RawString)>,
+    pub labels: Vec<(S, S)>,
 }
 
 #[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
+pub struct LongStringValueLabelRecord<S>(pub Vec<LongStringValueLabels<S>>)
+where
+    S: Debug;
 
-impl ExtensionRecord for LongStringValueLabelRecord {
+impl ExtensionRecord for LongStringValueLabelRecord<RawString> {
     const SUBTYPE: u32 = 21;
     const SIZE: Option<u32> = Some(1);
     const COUNT: Option<u32> = None;