X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Fencoding.rs;h=8fd13f3ea3fb4e9b3f5243cd998037c15ac609d0;hb=a8331d2f67af24ce1f9f5da99641b8d1cdc21300;hp=296e4e65a8e4f32a61e4e415318610e3180513f7;hpb=6225b13e18d6ce5b42aa7112fa44a67b13b3bb41;p=pspp diff --git a/rust/src/encoding.rs b/rust/src/encoding.rs index 296e4e65a8..8fd13f3ea3 100644 --- a/rust/src/encoding.rs +++ b/rust/src/encoding.rs @@ -1,3 +1,6 @@ +use crate::locale_charset::locale_charset; +use encoding_rs::{Encoding, UTF_8}; + include!(concat!(env!("OUT_DIR"), "/encodings.rs")); pub fn codepage_from_encoding(encoding: &str) -> Option { @@ -6,23 +9,56 @@ pub fn codepage_from_encoding(encoding: &str) -> Option { .copied() } -pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option) -> Option<&str> { - if encoding.is_some() { +use thiserror::Error as ThisError; + +#[derive(ThisError, Debug)] +pub enum Error { + #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] + NoEncoding, + + #[error("This system file encodes text strings with unknown code page {0}.")] + UnknownCodepage(i32), + + #[error("This system file encodes text strings with unknown encoding {0}.")] + UnknownEncoding(String), + + #[error("This system file is encoded in EBCDIC, which is not supported.")] + Ebcdic, +} + +pub fn default_encoding() -> &'static Encoding { + lazy_static! { + static ref DEFAULT_ENCODING: &'static Encoding = + Encoding::for_label(locale_charset().as_bytes()).unwrap_or(&UTF_8); + } + &DEFAULT_ENCODING +} + +pub fn get_encoding( + encoding: Option<&str>, + character_code: Option, +) -> Result<&'static Encoding, Error> { + let label = if let Some(encoding) = encoding { encoding - } else if let Some(codepage) = codepage { + } else if let Some(codepage) = character_code { match codepage { - 1 => Some("EBCDIC-US"), + 1 => return Err(Error::Ebcdic), 2 | 3 => { // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] // respectively. However, many files have character code 2 but // data which are clearly not ASCII. Therefore, ignore these // values. - None - }, - 4 => Some("MS_KANJI"), - _ => CODEPAGE_NUMBER_TO_NAME.get(&codepage).copied() + return Err(Error::NoEncoding); + } + 4 => "MS_KANJI", + _ => CODEPAGE_NUMBER_TO_NAME + .get(&codepage) + .copied() + .ok_or(Error::UnknownCodepage(codepage))?, } } else { - None - } + return Err(Error::NoEncoding); + }; + + Ok(Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))?) }