From 6225b13e18d6ce5b42aa7112fa44a67b13b3bb41 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 3 Aug 2023 08:44:54 -0700 Subject: [PATCH] work --- doc/dev/system-file-format.texi | 6 +++--- rust/src/encoding.rs | 27 +++++++++++++++++++++++++++ rust/src/raw.rs | 14 ++++++++++++++ 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index e7bda48d56..db8af7e374 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -714,7 +714,7 @@ been actually observed in system files: @table @asis @item 1 -EBCDIC. +EBCDIC. Only one example has been observed. @item 2 7-bit ASCII. Old versions of SPSS for Unix and Windows always wrote @@ -732,10 +732,10 @@ ISO 8859-1 (IBM AIX code page number). The @code{windows-874} code page for Thai. @item 932 -The @code{windows-932} code page for Japanese. +The @code{windows-932} code page for Japanese (aka @code{Shift_JIS}). @item 936 -The @code{windows-936} code page for simplified Chinese +The @code{windows-936} code page for simplified Chinese (aka @code{GBK}). @item 949 Probably @code{ks_c_5601-1987}, Unified Hangul Code. diff --git a/rust/src/encoding.rs b/rust/src/encoding.rs index a0e28af771..296e4e65a8 100644 --- a/rust/src/encoding.rs +++ b/rust/src/encoding.rs @@ -1 +1,28 @@ include!(concat!(env!("OUT_DIR"), "/encodings.rs")); + +pub fn codepage_from_encoding(encoding: &str) -> Option { + CODEPAGE_NAME_TO_NUMBER + .get(encoding.to_ascii_lowercase().as_str()) + .copied() +} + +pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option) -> Option<&str> { + if encoding.is_some() { + encoding + } else if let Some(codepage) = codepage { + match codepage { + 1 => Some("EBCDIC-US"), + 2 | 3 => { + // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] + // respectively. However, many files have character code 2 but + // data which are clearly not ASCII. Therefore, ignore these + // values. + None + }, + 4 => Some("MS_KANJI"), + _ => CODEPAGE_NUMBER_TO_NAME.get(&codepage).copied() + } + } else { + None + } +} diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 3f7309c7ce..e017f74ac4 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -824,6 +824,20 @@ impl ExtensionRecord for FloatInfo { } } +pub struct Encoding(pub String); + +impl ExtensionRecord for Encoding { + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "encoding record"; + + fn parse(ext: &Extension, endian: Endian) -> Result{ + ext.check_size::()?; + + Ok(Encoding(String::from_utf8(ext.data)?)) + } +} + pub struct Extension { /// Offset from the start of the file to the start of the record. pub offset: u64, -- 2.30.2