From: Ben Pfaff Date: Tue, 15 Aug 2023 18:27:41 +0000 (-0700) Subject: work X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4b071a32e0fa25c0b5257a15a176df1dd93990ea;p=pspp work --- diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 6a8c89267b..afe9323c59 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -179,6 +179,12 @@ dependencies = [ "libc", ] +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + [[package]] name = "flate2" version = "1.0.26" @@ -468,6 +474,7 @@ dependencies = [ "chrono", "clap", "encoding_rs", + "finl_unicode", "flate2", "float_next_after", "hexplay", @@ -477,6 +484,7 @@ dependencies = [ "num-traits", "ordered-float", "thiserror", + "unicase", ] [[package]] @@ -603,6 +611,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-ident" version = "1.0.6" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 665f90ea79..0059ae98aa 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -18,6 +18,8 @@ num-traits = "0.2.16" ordered-float = "3.7.0" thiserror = "1.0" chrono = "0.4.26" +finl_unicode = "1.2.0" +unicase = "2.6.0" [build-dependencies] anyhow = "1.0.69" diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index ae87b5abc2..0012a51dcc 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -1,21 +1,58 @@ -use std::borrow::Cow; +use std::{borrow::Cow, collections::HashSet}; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::Encoding; use crate::{ - Error, - {endian::Endian, CategoryLabels, Compression}, format::Spec, + identifier::{Identifier, Error as IdError}, + {endian::Endian, CategoryLabels, Compression}, }; +use thiserror::Error as ThisError; + +#[derive(ThisError, Debug)] +pub enum Error { + #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] + BadVariableWidth { offset: u64, width: i32 }, + + #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] + BadLongMissingValueFormat, + + #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")] + InvalidCreationDate { creation_date: String }, + + #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] + InvalidCreationTime { creation_time: String }, + + #[error("Invalid variable name: {0}")] + BadIdentifier(#[from] IdError), + + #[error("Details TBD")] + TBD, +} pub struct Decoder { pub compression: Option, pub endian: Endian, pub encoding: &'static Encoding, + pub var_names: HashSet, + n_generated_names: usize, } impl Decoder { + fn take_name(&mut self, id: Identifier) -> bool { + self.var_names.insert(id) + } + fn generate_name(&mut self) -> Identifier { + loop { + self.n_generated_names += 1; + let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding).unwrap(); + if self.take_name(name.clone()) { + return name; + } + assert!(self.n_generated_names < usize::MAX); + } + } fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> { let (output, malformed) = self.encoding.decode_without_bom_handling(input); if malformed { @@ -27,7 +64,7 @@ impl Decoder { pub trait Decode: Sized { type Input; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self; + fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result; } #[derive(Clone)] @@ -42,7 +79,7 @@ pub struct Header { impl Decode for Header { type Input = crate::raw::Header; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self { + fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result { let eye_catcher = decoder.decode_string(&input.eye_catcher, &warn); let file_label = decoder.decode_string(&input.file_label, &warn); let creation_date = decoder.decode_string(&input.creation_date, &warn); @@ -60,37 +97,68 @@ impl Decode for Header { }); Default::default() }); - Header { + Ok(Header { eye_catcher: eye_catcher.into(), weight_index: input.weight_index.map(|n| n as usize), n_cases: input.n_cases.map(|n| n as u64), creation: NaiveDateTime::new(creation_date, creation_time), file_label: file_label.into(), - } + }) } } pub struct Variable { pub width: i32, - pub name: String, + pub name: Identifier, pub print_format: Spec, pub write_format: Spec, } +fn decode_var( + decoder: &mut Decoder, + input: &crate::raw::Variable, + warn: impl Fn(Error), +) -> Result, Error> { + match input.width { + 0..=255 => (), + -1 => return Ok(None), + _ => { + return Err(Error::BadVariableWidth { + offset: input.offset, + width: input.width, + }) + } + }; + let name = decoder.decode_string(&input.name, &warn); + let name = match Identifier::new(&name, decoder.encoding) { + Ok(name) => { + if !decoder.take_name(name) { + decoder.generate_name() + } else { + name + } + } + Err(error) => { + warn(error.into()); + decoder.generate_name() + } + }; +} + #[derive(Clone)] pub struct Document(Vec); impl Decode for Document { type Input = crate::raw::Document; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self { - Document( + fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result { + Ok(Document( input .lines .iter() .map(|s| decoder.decode_string(s, &warn).into()) .collect(), - ) + )) } } diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs new file mode 100644 index 0000000000..0553b41b11 --- /dev/null +++ b/rust/src/identifier.rs @@ -0,0 +1,120 @@ +use encoding_rs::{EncoderResult, Encoding}; +use finl_unicode::categories::{CharacterCategories, MajorCategory}; +use thiserror::Error as ThisError; +use unicase::UniCase; + +pub trait IdentifierChar { + /// Returns true if `self` may be the first character in an identifier. + fn may_start_id(self) -> bool; + + /// Returns true if `self` may be a second or subsequent character in an + /// identifier. + fn may_continue_id(self) -> bool; +} + +impl IdentifierChar for char { + fn may_start_id(self) -> bool { + use MajorCategory::*; + + ([L, M, S].contains(&self.get_major_category()) || "@#$".contains(self)) + && self != char::REPLACEMENT_CHARACTER + } + + fn may_continue_id(self) -> bool { + use MajorCategory::*; + + ([L, M, S, N].contains(&self.get_major_category()) || "@#$._".contains(self)) + && self != char::REPLACEMENT_CHARACTER + } +} + +#[derive(Clone, PartialEq, Eq, Debug, Hash)] +pub struct Identifier(pub UniCase); + +#[derive(Clone, Debug, ThisError)] +pub enum Error { + #[error("Identifier cannot be empty string.")] + Empty, + + #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")] + Reserved(String), + + #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")] + BadFirstCharacter(String, char), + + #[error("\"{0}\" may not be used as an identifier because it contains disallowed character \"{1}\".")] + BadLaterCharacter(String, char), + + #[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")] + TooLong { + id: String, + length: usize, + encoding: &'static str, + max: usize, + }, + + #[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")] + NotEncodable { + id: String, + encoding: &'static str, + c: char, + }, +} + +fn is_reserved_word(s: &str) -> bool { + for word in [ + "and", "or", "not", "eq", "ge", "gt", "le", "ne", "all", "by", "to", "with", + ] { + if s.eq_ignore_ascii_case(word) { + return true; + } + } + false +} + +impl Identifier { + /// Maximum length of an identifier, in bytes. The limit applies in the + /// encoding used by the dictionary, not in UTF-8. + pub const MAX_LEN: usize = 64; + + pub fn new(s: &str, encoding: &'static Encoding) -> Result { + Self::is_plausible(s)?; + let (encoded, _, unencodable) = encoding.encode(s); + if unencodable { + let mut encoder = encoding.new_encoder(); + let mut buf = + Vec::with_capacity(encoder.max_buffer_length_from_utf8_without_replacement(s.len()).unwrap()); + let EncoderResult::Unmappable(c) = encoder + .encode_from_utf8_to_vec_without_replacement(s, &mut buf, true) + .0 + else { + unreachable!(); + }; + return Err(Error::NotEncodable { id: s.into(), encoding: encoding.name(), c }); + } + if encoded.len() > Self::MAX_LEN { + return Err(Error::TooLong { id: s.into(), length: encoded.len(), encoding: encoding.name(), max: Self::MAX_LEN }); + } + Ok(Identifier(s.into())) + } + pub fn is_plausible(s: &str) -> Result<(), Error> { + if s.is_empty() { + return Err(Error::Empty); + } + if is_reserved_word(s) { + return Err(Error::Reserved(s.into())); + } + + let mut i = s.chars(); + let first = i.next().unwrap(); + if !first.may_start_id() { + return Err(Error::BadFirstCharacter(s.into(), first)); + } + for c in i { + if !c.may_continue_id() { + return Err(Error::BadLaterCharacter(s.into(), c)); + } + } + Ok(()) + } +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 482bd08197..c793f44cbb 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,117 +1,10 @@ -use std::io::Error as IoError; -use thiserror::Error as ThisError; - pub mod endian; pub mod raw; pub mod cooked; pub mod sack; pub mod encoding; pub mod format; - -#[derive(ThisError, Debug)] -pub enum Error { - #[error("Not an SPSS system file")] - NotASystemFile, - - #[error("Invalid magic number {0:?}")] - BadMagic([u8; 4]), - - #[error("I/O error ({0})")] - Io(#[from] IoError), - - #[error("Invalid SAV compression code {0}")] - InvalidSavCompression(u32), - - #[error("Invalid ZSAV compression code {0}")] - InvalidZsavCompression(u32), - - #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] - BadVariableWidth { offset: u64, width: i32 }, - - #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] - BadDocumentLength { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] - BadRecordType { offset: u64, rec_type: u32 }, - - #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] - BadVariableLabelCode { offset: u64, code: u32 }, - - #[error( - "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." - )] - BadNumericMissingValueCode { offset: u64, code: i32 }, - - #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] - BadStringMissingValueCode { offset: u64, code: i32 }, - - #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] - BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] - BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] - ExtensionRecordTooLarge { - offset: u64, - subtype: u32, - size: u32, - count: u32, - }, - - #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] - EofInCase { - offset: u64, - case_ofs: u64, - case_len: usize, - }, - - #[error( - "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." - )] - EofInCompressedCase { offset: u64, case_ofs: u64 }, - - #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] - PartialCompressedCase { offset: u64, case_ofs: u64 }, - - #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] - CompressedNumberExpected { offset: u64, case_ofs: u64 }, - - #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] - CompressedStringExpected { offset: u64, case_ofs: u64 }, - - #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] - BadZlibTrailerNBlocks { - offset: u64, - n_blocks: u32, - expected_n_blocks: u64, - ztrailer_len: u64, - }, - - #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] - BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 }, - - #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] - BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 }, - - #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] - BadEncodingName { offset: u64 }, - - #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] - BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 }, - - #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] - BadLongMissingValueFormat, - - #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")] - InvalidCreationDate { creation_date: String }, - - #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] - InvalidCreationTime { creation_time: String }, - - #[error("Details TBD")] - TBD, -} +pub mod identifier; #[derive(Copy, Clone, Debug)] pub enum Compression { diff --git a/rust/src/raw.rs b/rust/src/raw.rs index ccd1e6662b..0ae4f53b8c 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,5 +1,5 @@ use crate::endian::{Endian, Parse, ToBytes}; -use crate::{CategoryLabels, Compression, Error}; +use crate::{CategoryLabels, Compression}; use flate2::read::ZlibDecoder; use num::Integer; @@ -11,9 +11,106 @@ use std::{ io::{Error as IoError, Read, Seek, SeekFrom}, iter::FusedIterator, }; +use thiserror::Error as ThisError; use self::state::State; +#[derive(ThisError, Debug)] +pub enum Error { + #[error("Not an SPSS system file")] + NotASystemFile, + + #[error("Invalid magic number {0:?}")] + BadMagic([u8; 4]), + + #[error("I/O error ({0})")] + Io(#[from] IoError), + + #[error("Invalid SAV compression code {0}")] + InvalidSavCompression(u32), + + #[error("Invalid ZSAV compression code {0}")] + InvalidZsavCompression(u32), + + #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] + BadVariableWidth { offset: u64, width: i32 }, + + #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] + BadDocumentLength { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] + BadRecordType { offset: u64, rec_type: u32 }, + + #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] + BadVariableLabelCode { offset: u64, code: u32 }, + + #[error( + "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." + )] + BadNumericMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] + BadStringMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] + BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] + BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] + ExtensionRecordTooLarge { + offset: u64, + subtype: u32, + size: u32, + count: u32, + }, + + #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] + EofInCase { + offset: u64, + case_ofs: u64, + case_len: usize, + }, + + #[error( + "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." + )] + EofInCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] + PartialCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] + CompressedNumberExpected { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] + CompressedStringExpected { offset: u64, case_ofs: u64 }, + + #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] + BadZlibTrailerNBlocks { + offset: u64, + n_blocks: u32, + expected_n_blocks: u64, + ztrailer_len: u64, + }, + + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] + BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 }, + + #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] + BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 }, + + #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] + BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 }, + + #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] + BadEncodingName { offset: u64 }, + + #[error("Details TBD")] + TBD, +} + #[derive(Clone, Debug)] pub enum Record { Header(Header),