From: Ben Pfaff Date: Thu, 2 Nov 2023 02:27:34 +0000 (-0700) Subject: work on character encoding X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=062f5cd94f8682d7b2311372d12c8d7e022e60a0;p=pspp work on character encoding --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 9f6f0101b0..3fac184063 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -4,7 +4,7 @@ use crate::{ endian::Endian, format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, - raw::{self, MissingValues, UnencodedStr, VarType}, + raw::{self, MissingValues, UnencodedStr, VarType}, encoding::get_encoding, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::{DecoderResult, Encoding}; @@ -187,7 +187,7 @@ pub struct Decoder { n_generated_names: usize, } -pub fn decode(headers: Vec) -> Vec { +pub fn decode(headers: Vec, warn: &impl Fn(Error)) -> Vec { let encoding = headers.iter().find_map(|rec| { if let raw::Record::Encoding(ref e) = rec { Some(e.0.as_str()) @@ -202,7 +202,10 @@ pub fn decode(headers: Vec) -> Vec { None } }); - + let encoding = get_encoding(encoding, character_code) + + let decoder = Decoder { + }; Vec::new() } diff --git a/rust/src/encoding.rs b/rust/src/encoding.rs index d22f692747..3d585a6b63 100644 --- a/rust/src/encoding.rs +++ b/rust/src/encoding.rs @@ -9,9 +9,14 @@ pub fn codepage_from_encoding(encoding: &str) -> Option { use thiserror::Error as ThisError; #[derive(ThisError, Debug)] pub enum Error { - #[error("This system file does not indicate its own character encoding. Using default encoding {0}. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] - NoEncoding(String), - + #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. 
Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] + NoEncoding, + + #[error("This system file encodes text strings with unknown code page {0}.")] + UnknownCodepage(u32), + + #[error("This system file is encoded in EBCDIC, which is not supported.")] + Ebcdic, } /// Returns the character set used by the locale configured in the operating @@ -21,25 +26,26 @@ pub fn locale_charset() -> &'static str { "UTF-8" } -/* -pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option) -> Result<&str, ()> { - let label = if encoding.is_some() { - encoding - } else if let Some(codepage) = codepage { +pub fn get_encoding(encoding: Option<&str>, character_code: Option<u32>) -> Result<&str, Error> { + if let Some(encoding) = encoding { + Ok(encoding) + } else if let Some(codepage) = character_code { match codepage { - 1 => Some("EBCDIC-US"), + 1 => Err(Error::Ebcdic), 2 | 3 => { // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] // respectively. However, many files have character code 2 but // data which are clearly not ASCII. Therefore, ignore these // values. - None - }, - 4 => Some("MS_KANJI"), - _ => CODEPAGE_NUMBER_TO_NAME.get(&codepage).copied() + Err(Error::NoEncoding) + } + 4 => Ok("MS_KANJI"), + _ => CODEPAGE_NUMBER_TO_NAME + .get(&codepage) + .copied() + .ok_or(Error::UnknownCodepage(codepage)), } } else { - None - }; + Err(Error::NoEncoding) + } } -*/ diff --git a/rust/src/locale_charset.rs b/rust/src/locale_charset.rs index e967c1d46d..8b3de74daf 100644 --- a/rust/src/locale_charset.rs +++ b/rust/src/locale_charset.rs @@ -295,6 +295,8 @@ mod inner { } } +/// Returns the character set used by the locale configured in the operating +/// system. pub fn locale_charset() -> &'static str { lazy_static! { static ref LOCALE_CHARSET: &'static str =