From 92bba7a2dd4dff9030989f4459902f47d504752a Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 1 Aug 2023 18:20:25 -0700 Subject: [PATCH] work --- doc/dev/system-file-format.texi | 174 +++++++++++++++++++++++++++++--- rust/Cargo.lock | 7 ++ rust/Cargo.toml | 6 +- rust/build.rs | 171 +++++++++++++++++++++++++++++++ rust/src/cooked.rs | 0 rust/src/encoding.rs | 1 + rust/src/lib.rs | 10 +- rust/src/raw.rs | 98 ++++++++++++++++-- src/data/sys-file-encoding.py | 2 +- utilities/pspp-dump-sav.c | 12 +-- 10 files changed, 452 insertions(+), 29 deletions(-) create mode 100644 rust/build.rs create mode 100644 rust/src/cooked.rs create mode 100644 rust/src/encoding.rs diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index ba390c1c02..e7bda48d56 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -717,17 +717,75 @@ been actually observed in system files: EBCDIC. @item 2 -7-bit ASCII. +7-bit ASCII. Old versions of SPSS for Unix and Windows always wrote +value 2 in this field, regardless of the encoding in use, so it is not +reliable and should be ignored. + +@item 3 +8-bit ``ASCII''. + +@item 819 +ISO 8859-1 (IBM AIX code page number). + +@item 874 +@itemx 9066 +The @code{windows-874} code page for Thai. + +@item 932 +The @code{windows-932} code page for Japanese. + +@item 936 +The @code{windows-936} code page for simplified Chinese + +@item 949 +Probably @code{ks_c_5601-1987}, Unified Hangul Code. + +@item 950 +The @code{big5} code page for traditional Chinese. @item 1250 The @code{windows-1250} code page for Central European and Eastern European languages. +@item 1251 +The @code{windows-1251} code page for Cyrillic languages. + @item 1252 The @code{windows-1252} code page for Western European languages. +@item 1253 +The @code{windows-1253} code page for modern Greek. + +@item 1254 +The @code{windows-1254} code page for Turkish. + +@item 1255 +The @code{windows-1255} code page for Hebrew. 
+ +@item 1256 +The @code{windows-1256} code page for Arabic script. + +@item 1257 +The @code{windows-1257} code page for Estonian, Latvian, and +Lithuanian. + +@item 1258 +The @code{windows-1258} code page for Vietnamese. + +@item 20127 +US-ASCII. + @item 28591 -ISO 8859-1. +ISO 8859-1 (Latin-1). + +@item 28592 +ISO 8859-2 (Central European). + +@item 28605 +ISO 8859-15 (Latin-9). + +@item 51949 +The @code{euc-kr} code page for Korean. @item 65001 UTF-8. @@ -743,12 +801,13 @@ The following additional values are known to be defined: DEC Kanji. @end table +The most common values observed, from most to least common, are 1252, +65001, 2, and 28591. + Other Windows code page numbers are known to be generally valid. -Old versions of SPSS for Unix and Windows always wrote value 2 in this -field, regardless of the encoding in use. Newer versions also write -the character encoding as a string (see @ref{Character Encoding -Record}). +Newer versions also write the character encoding as a string (see +@ref{Character Encoding Record}). @end table @node Machine Floating-Point Info Record @@ -1251,20 +1310,107 @@ The total number of bytes in @code{encoding}. @item char encoding[]; The name of the character encoding. Normally this will be an official -IANA character set name or alias. -See @url{http://www.iana.org/assignments/character-sets}. -Character set names are not case-sensitive, but SPSS appears to write -them in all-uppercase. +IANA character set name or alias. See +@url{http://www.iana.org/assignments/character-sets}. Character set +names are not case-sensitive, and SPSS is not consistent, e.g.@: both +@code{windows-1251} and @code{WINDOWS-1252} have been observed, +as have @code{Big5} and @code{BIG5}. @end table This record is not present in files generated by older software. See also the @code{character_code} field in the machine integer info record (@pxref{character-code}). 
-When the character encoding record and the machine integer info record -are both present, all system files observed in practice indicate the -same character encoding, e.g.@: 1252 as @code{character_code} and -@code{windows-1252} as @code{encoding}, 65001 and @code{UTF-8}, etc. +The following character encoding names have been observed. The names +are shown in lowercase, even though they were not always in lowercase +in the file. Alternative names for the same encoding are, when known, +listed together. For each encoding, the @code{character_code} values +that they were observed paired with are also listed. First, the +following are strictly single-byte, ASCII-compatible encodings: + +@table @code +@item @r{(encoding record missing)} +0, 2, 3, 874, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 20127, 28591, +28592, 28605 + +@item ansi_x3.4-1968 +@itemx ascii +1252 + +@item cp28605 +2 + +@item cp874 +9066 + +@item iso-8859-1 +819 + +@item windows-874 +874 + +@item windows-1250 +2, 1250, 1252 + +@item windows-1251 +2, 1251 + +@item cp1252 +@itemx windows-1252 +2, 1250, 1252, 1253 + +@item cp1253 +@itemx windows-1253 +1253 + +@item windows-1254 +2, 1254 + +@item windows-1255 +2, 1255 + +@item windows-1256 +2, 1252, 1256 + +@item windows-1257 +2, 1257 + +@item windows-1258 +1258 +@end table + +The following are multibyte encodings, in which some code points +occupy a single byte and others multiple bytes. All of the following +encode ASCII characters as their native values, but some of them +(marked as ``not ASCII compatible'') also use ASCII values as second +or later bytes in multibyte sequences: + +@table @code +@item @r{(encoding record missing)} +65001, 949 (ASCII compatible) and 932, 936, 950 (not ASCII compatible). 
+ +@item big5 +@itemx cp950 +2, 950 (not ASCII compatible) + +@item euc-kr +2, 51949 (ASCII compatible) + +@item gbk +936 (not ASCII compatible) + +@item utf-8 +0, 2, 1250, 1251, 1252, 1256, 65001 (ASCII compatible) + +@item cp932 +@itemx windows-31j +932 (not ASCII compatible) +@end table + +As the tables above show, when the character encoding record and the +machine integer info record are both present, they can contradict each +other. Observations show that, in this case, the character encoding +record should be honored. If, for testing purposes, a file is crafted with different @code{character_code} and @code{encoding}, it seems that diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 339237c316..bb71860a72 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -197,6 +197,12 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" version = "0.2.139" @@ -374,6 +380,7 @@ dependencies = [ "flate2", "float_next_after", "hexplay", + "lazy_static", "num", "num-derive", "num-traits", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index d365ab63a8..7ee93bd9df 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -10,12 +10,16 @@ clap = { version = "4.1.7", features = ["derive", "wrap_help"] } flate2 = "1.0.26" float_next_after = "1.0.0" hexplay = "0.2.1" +lazy_static = "1.4.0" num = "0.4.0" num-derive = "0.4.0" num-traits = "0.2.16" ordered-float = "3.7.0" thiserror = "1.0" +[build-dependencies] +anyhow = "1.0.69" + [[bin]] name = "pspp-dump-sav" path = "src/main.rs" @@ -26,4 +30,4 @@ path = "src/lib.rs" [[test]] name = "sack" path = "tests/sack.rs" -harness = false \ No newline at end of file +harness = false diff --git a/rust/build.rs b/rust/build.rs new file mode 100644 index 0000000000..5b6c2dfea5 --- /dev/null +++ b/rust/build.rs @@ -0,0 +1,171 @@ 
+use anyhow::{anyhow, Result as AnyResult}; +use std::{ + collections::{BTreeMap, HashSet, VecDeque}, + env::var_os, + fs::{read_to_string, File}, + io::{Error as IoError, Write}, + path::{Path, PathBuf}, +}; + +#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] +enum Source { + CP, + IBM, + Windows, +} + +// Code page number. +type CPNumber = usize; + +fn process_converter<'a>( + fields: &Vec<&'a str>, + codepages: &mut BTreeMap>>, +) { + if fields.is_empty() || fields[0] == "{" { + return; + } + + let mut cps: BTreeMap = BTreeMap::new(); + let mut iana = VecDeque::new(); + let mut other = VecDeque::new(); + + let mut iter = fields.iter().peekable(); + while let Some(&name) = iter.next() { + if iter.next_if(|&&s| s == "{").is_some() { + let mut standards = HashSet::new(); + loop { + let &standard = iter.next().expect("missing `}` in list of standards"); + if standard == "}" { + break; + } + standards.insert(standard); + } + + if standards.contains("IANA*") { + iana.push_front(name); + } else if standards.contains("IANA") { + iana.push_back(name); + } else if standards.iter().any(|&s| s.ends_with('*')) { + other.push_front(name); + } else { + other.push_back(name); + } + } else { + // Untagged names are completely nonstandard. + continue; + } + + if let Some(number) = name.strip_prefix("cp") { + if let Ok(number) = number.parse::() { + cps.insert(Source::CP, number); + } + } + + if let Some(number) = name.strip_prefix("windows-") { + if let Ok(number) = number.parse::() { + cps.insert(Source::Windows, number); + } + } + + if let Some(number) = name.strip_prefix("ibm-") { + if let Ok(number) = number.parse::() { + cps.insert(Source::IBM, number); + } + } + } + + // If there are no tagged names then this is completely nonstandard. 
+ if iana.is_empty() && other.is_empty() { + return; + } + + let all: Vec<&str> = iana.into_iter().chain(other.into_iter()).collect(); + for (source, number) in cps { + codepages + .entry(number) + .or_insert_with(BTreeMap::new) + .insert(source, all.clone()); + } +} + +fn write_output( + codepages: &BTreeMap>>, + file_name: &PathBuf, +) -> Result<(), IoError> { + let mut file = File::create(file_name)?; + + write!(file, "{}", "\ +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! { + static ref CODEPAGE_NUMBER_TO_NAME: HashMap = { + let mut map = HashMap::new(); +")?; + + for (&cpnumber, value) in codepages.iter() { + let source = value.keys().max().unwrap(); + let name = value[source][0]; + writeln!(file, " map.insert({cpnumber}, \"{name}\");")?; + } + write!(file, "{}", "\ + map + }; +} +")?; + + let mut names: BTreeMap<&str, BTreeMap>> = BTreeMap::new(); + for (&cpnumber, value) in codepages.iter() { + for (&source, value2) in value.iter() { + for &name in value2.iter() { + names + .entry(name) + .or_insert_with(BTreeMap::new) + .entry(source) + .or_insert_with(Vec::new) + .push(cpnumber); + } + } + } + + for (&name, value) in names.iter() { + for (_source, numbers) in value.iter().rev() { + println!(" {{ {}, \"{name}\" }},", numbers[0]); + break; + } + } + + Ok(()) +} + +fn main() -> AnyResult<()> { + println!("cargo:rerun-if-changed=build.rs"); + + let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("../src/data/convrtrs.txt"); + println!("cargo:rerun-if-changed={}", input_file.to_string_lossy()); + let input = read_to_string(&input_file) + .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?; + + let mut codepages: BTreeMap>> = BTreeMap::new(); + let mut converter: Vec<&str> = Vec::new(); + for line in input.lines() { + let line = line + .find('#') + .map(|position| &line[..position]) + .unwrap_or(line) + .trim_end(); + if !line.starts_with(&[' ', '\t']) { + process_converter(&converter, &mut 
codepages); + converter.clear(); + } + converter.extend(line.split_whitespace()); + } + process_converter(&converter, &mut codepages); + + let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs"); + + write_output(&codepages, &output_file_name) + .map_err(|e| anyhow!("{}: write failed ({e})", output_file_name.to_string_lossy()))?; + + Ok(()) +} diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/rust/src/encoding.rs b/rust/src/encoding.rs new file mode 100644 index 0000000000..a0e28af771 --- /dev/null +++ b/rust/src/encoding.rs @@ -0,0 +1 @@ +include!(concat!(env!("OUT_DIR"), "/encodings.rs")); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 6dc13b586a..680d3a3a51 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -3,7 +3,9 @@ use thiserror::Error as ThisError; pub mod endian; pub mod raw; +pub mod cooked; pub mod sack; +pub mod encoding; #[derive(ThisError, Debug)] pub enum Error { @@ -28,7 +30,7 @@ pub enum Error { #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] BadDocumentLength { offset: u64, n: u32, max: u32 }, - #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")] + #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] BadRecordType { offset: u64, rec_type: u32 }, #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] @@ -84,4 +86,10 @@ pub enum Error { expected_n_blocks: u64, ztrailer_len: u64, }, + + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] + BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 }, + + #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] + BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 }, } diff --git a/rust/src/raw.rs b/rust/src/raw.rs 
index b09f94e73a..3f7309c7ce 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -759,6 +759,71 @@ enum ExtensionType { } */ +trait ExtensionRecord where Self: Sized { + const SIZE: Option; + const COUNT: Option; + const NAME: &'static str; + fn parse(ext: &Extension, endian: Endian) -> Result; +} + +pub struct IntegerInfo { + version: (i32, i32, i32), + machine_code: i32, + floating_point_rep: i32, + compression_code: i32, + endianness: i32, + character_code: i32, +} + +impl ExtensionRecord for IntegerInfo { + const SIZE: Option = Some(4); + const COUNT: Option = Some(8); + const NAME: &'static str = "integer record"; + + fn parse(ext: &Extension, endian: Endian) -> Result{ + ext.check_size::()?; + + let mut input = &ext.data[..]; + let data: Vec = (0..8) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(IntegerInfo { + version: (data[0], data[1], data[2]), + machine_code: data[3], + floating_point_rep: data[4], + compression_code: data[5], + endianness: data[6], + character_code: data[7] + }) + } +} + +pub struct FloatInfo { + sysmis: f64, + highest: f64, + lowest: f64, +} + +impl ExtensionRecord for FloatInfo { + const SIZE: Option = Some(8); + const COUNT: Option = Some(3); + const NAME: &'static str = "floating point record"; + + fn parse(ext: &Extension, endian: Endian) -> Result{ + ext.check_size::()?; + + let mut input = &ext.data[..]; + let data: Vec = (0..3) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(FloatInfo { + sysmis: data[0], + highest: data[1], + lowest: data[2], + }) + } +} + pub struct Extension { /// Offset from the start of the file to the start of the record. 
pub offset: u64, @@ -805,6 +870,30 @@ fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { */ impl Extension { + fn check_size(&self) -> Result<(), Error> { + if let Some(expected_size) = E::SIZE { + if self.size != expected_size { + return Err(Error::BadRecordSize { + offset: self.offset, + record: E::NAME.into(), + size: self.size, + expected_size, + }); + } + } + if let Some(expected_count) = E::COUNT { + if self.count != expected_count { + return Err(Error::BadRecordCount { + offset: self.offset, + record: E::NAME.into(), + count: self.count, + expected_count, + }); + } + } + Ok(()) + } + fn read(r: &mut R, endian: Endian) -> Result { let subtype = endian.parse(read_bytes(r)?); let offset = r.stream_position()?; @@ -895,10 +984,7 @@ pub struct ZBlock { } impl ZBlock { - fn read( - r: &mut R, - endian: Endian, - ) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { Ok(ZBlock { uncompressed_ofs: endian.parse(read_bytes(r)?), compressed_ofs: endian.parse(read_bytes(r)?), @@ -933,8 +1019,8 @@ impl ZTrailer { }); } let blocks = (0..n_blocks) - .map(|_| ZBlock::read(reader, endian)) - .collect::, _>>()?; + .map(|_| ZBlock::read(reader, endian)) + .collect::, _>>()?; reader.seek(SeekFrom::Start(start_offset))?; Ok(Some(ZTrailer { offset: ztrailer_ofs, diff --git a/src/data/sys-file-encoding.py b/src/data/sys-file-encoding.py index 7d14545490..0f88a04fff 100644 --- a/src/data/sys-file-encoding.py +++ b/src/data/sys-file-encoding.py @@ -139,7 +139,7 @@ struct sys_encoding sys_codepage_number_to_name[] = {""") for cpnumber, value in sorted(codepages.items()): source = max(value.keys()) - name = value[source][0] + name = value[source] print(' { %s, "%s" },' % (cpnumber, name)) print(""" { 0, NULL } }; diff --git a/utilities/pspp-dump-sav.c b/utilities/pspp-dump-sav.c index d08999e01b..3268575459 100644 --- a/utilities/pspp-dump-sav.c +++ b/utilities/pspp-dump-sav.c @@ -65,6 +65,9 @@ struct sfm_reader double bias; }; + static int 
character_code; +static char *encoding; + static void read_header (struct sfm_reader *); static void read_variable_record (struct sfm_reader *); static void read_value_label_record (struct sfm_reader *); @@ -229,6 +232,7 @@ main (int argc, char *argv[]) (long long int) ftello (r.file), (long long int) ftello (r.file) + 4); + printf ("Character Encoding: %s (%d)\n", encoding ? encoding : "none", character_code); if (r.compression == COMP_SIMPLE) { if (max_cases > 0) @@ -695,7 +699,7 @@ read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count) int float_representation = read_int (r); int compression_code = read_int (r); int integer_representation = read_int (r); - int character_code = read_int (r); + character_code = read_int (r); printf ("%08llx: machine integer info\n", offset); if (size != 4 || count != 8) @@ -1065,13 +1069,9 @@ read_datafile_attributes (struct sfm_reader *r, size_t size, size_t count) static void read_character_encoding (struct sfm_reader *r, size_t size, size_t count) { - long long int posn = ftello (r->file); - char *encoding = xcalloc (size, count + 1); + encoding = xcalloc (size, count + 1); read_string (r, encoding, count + 1); - printf ("%08llx: Character Encoding: %s\n", posn, encoding); - - free (encoding); } static void -- 2.30.2