From 20e2af4ec687ccbdfe47fe30275dd0121ec3ec16 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 5 Sep 2023 08:31:49 -0700 Subject: [PATCH] work --- rust/Cargo.lock | 6 +- rust/Cargo.toml | 4 + rust/src/cooked.rs | 22 ++- rust/src/encoding.rs | 23 ++- rust/src/lib.rs | 1 + rust/src/locale_charset.rs | 289 +++++++++++++++++++++++++++++++++++++ 6 files changed, 339 insertions(+), 6 deletions(-) create mode 100644 rust/src/locale_charset.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index afe9323c59..d9d4fc6e37 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -294,9 +294,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] name = "linux-raw-sys" @@ -479,12 +479,14 @@ dependencies = [ "float_next_after", "hexplay", "lazy_static", + "libc", "num", "num-derive", "num-traits", "ordered-float", "thiserror", "unicase", + "windows-sys 0.48.0", ] [[package]] diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 0059ae98aa..0e75239764 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -20,6 +20,10 @@ thiserror = "1.0" chrono = "0.4.26" finl_unicode = "1.2.0" unicase = "2.6.0" +libc = "0.2.147" + +[target.'cfg(windows)'.dependencies] +windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } [build-dependencies] anyhow = "1.0.69" diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 97ea4906d6..9f6f0101b0 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -163,10 +163,10 @@ pub enum Record { } pub use crate::raw::EncodingRecord; +pub use crate::raw::Extension; pub use crate::raw::FloatInfoRecord; pub use crate::raw::IntegerInfoRecord; pub use crate::raw::NumberOfCasesRecord; -pub use crate::raw::Extension; type 
DictIndex = usize; @@ -187,6 +187,26 @@ pub struct Decoder { n_generated_names: usize, } +pub fn decode(headers: Vec) -> Vec { + let encoding = headers.iter().find_map(|rec| { + if let raw::Record::Encoding(ref e) = rec { + Some(e.0.as_str()) + } else { + None + } + }); + let character_code = headers.iter().find_map(|rec| { + if let raw::Record::IntegerInfo(ref r) = rec { + Some(r.character_code) + } else { + None + } + }); + + + Vec::new() +} + impl Decoder { fn generate_name(&mut self) -> Identifier { loop { diff --git a/rust/src/encoding.rs b/rust/src/encoding.rs index 296e4e65a8..d22f692747 100644 --- a/rust/src/encoding.rs +++ b/rust/src/encoding.rs @@ -6,8 +6,24 @@ pub fn codepage_from_encoding(encoding: &str) -> Option { .copied() } -pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option) -> Option<&str> { - if encoding.is_some() { +use thiserror::Error as ThisError; +#[derive(ThisError, Debug)] +pub enum Error { + #[error("This system file does not indicate its own character encoding. Using default encoding {0}. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] + NoEncoding(String), + +} + +/// Returns the character set used by the locale configured in the operating +/// system. This should implement roughly the same behavior as the function +/// with the same name in Gnulib. Until then, we'll just use a default. 
+pub fn locale_charset() -> &'static str { + "UTF-8" +} + +/* +pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option) -> Result<&str, ()> { + let label = if encoding.is_some() { encoding } else if let Some(codepage) = codepage { match codepage { @@ -24,5 +40,6 @@ pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option) -> Opt } } else { None - } + }; } +*/ diff --git a/rust/src/lib.rs b/rust/src/lib.rs index ebb4033b25..3eb4bbae4e 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -5,3 +5,4 @@ pub mod format; pub mod identifier; pub mod raw; pub mod sack; +pub mod locale_charset; diff --git a/rust/src/locale_charset.rs b/rust/src/locale_charset.rs new file mode 100644 index 0000000000..fdcb685f18 --- /dev/null +++ b/rust/src/locale_charset.rs @@ -0,0 +1,289 @@ +// Determine a canonical name for the current locale's character encoding. +// +// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc. +// +// This file is free software: you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 2.1 of the License, or (at your option) +// any later version. +// +// This file is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program. If not, see . +// +// Written by Bruno Haible . Translated to Rust by Ben Pfaff +// . 
+
+use lazy_static::lazy_static;
+
+/// Maps the operating system's name for a character encoding to its canonical
+/// name.
+///
+/// `s` is a codeset name as reported by the platform (on Windows, a name of
+/// the form `CP<number>`).  Only the alias table for the compilation target
+/// is compiled in; on targets without a table, or for names that are not in
+/// the table, the name is returned unchanged.
+///
+/// The result has `'static` lifetime.  A name that is not found in the table
+/// is copied to the heap and intentionally leaked to obtain that lifetime, so
+/// this should be called a bounded number of times ([`locale_charset`] calls
+/// it at most once per process).
+fn map_aliases(s: &str) -> &'static str {
+    #[cfg(target_os = "freebsd")]
+    match s {
+        "ARMSCII-8" => return "ARMSCII-8",
+        "Big5" => return "BIG5",
+        "C" => return "ASCII",
+        "CP1131" => return "CP1131",
+        "CP1251" => return "CP1251",
+        "CP866" => return "CP866",
+        "GB18030" => return "GB18030",
+        "GB2312" => return "GB2312",
+        "GBK" => return "GBK",
+        "ISCII-DEV" => return "?",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-13" => return "ISO-8859-13",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-7" => return "ISO-8859-7",
+        "ISO8859-9" => return "ISO-8859-9",
+        "KOI8-R" => return "KOI8-R",
+        "KOI8-U" => return "KOI8-U",
+        "SJIS" => return "SHIFT_JIS",
+        "US-ASCII" => return "ASCII",
+        "eucCN" => return "GB2312",
+        "eucJP" => return "EUC-JP",
+        "eucKR" => return "EUC-KR",
+        _ => (),
+    };
+
+    #[cfg(target_os = "netbsd")]
+    match s {
+        "646" => return "ASCII",
+        "ARMSCII-8" => return "ARMSCII-8",
+        "BIG5" => return "BIG5",
+        "Big5-HKSCS" => return "BIG5-HKSCS",
+        "CP1251" => return "CP1251",
+        "CP866" => return "CP866",
+        "GB18030" => return "GB18030",
+        "GB2312" => return "GB2312",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-13" => return "ISO-8859-13",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-4" => return "ISO-8859-4",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-7" => return "ISO-8859-7",
+        "KOI8-R" => return "KOI8-R",
+        "KOI8-U" => return "KOI8-U",
+        "PT154" => return "PT154",
+        "SJIS" => return "SHIFT_JIS",
+        "eucCN" => return "GB2312",
+        "eucJP" => return "EUC-JP",
+        "eucKR" => return "EUC-KR",
+        "eucTW" => return "EUC-TW",
+        _ => (),
+    };
+
+    #[cfg(target_os = "openbsd")]
+    match s {
+        "646" => return "ASCII",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-13" => return "ISO-8859-13",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-4" => return "ISO-8859-4",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-7" => return "ISO-8859-7",
+        "US-ASCII" => return "ASCII",
+        _ => (),
+    };
+
+    /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
+       useless:
+       - It returns the empty string when LANG is set to a locale of the
+         form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
+         LC_CTYPE file.
+       - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
+         the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
+       - The documentation says:
+           "... all code that calls BSD system routines should ensure
+            that the const *char parameters of these routines are in UTF-8
+            encoding. All BSD system functions expect their string
+            parameters to be in UTF-8 encoding and nothing else."
+         It also says
+           "An additional caveat is that string parameters for files,
+            paths, and other file-system entities must be in canonical
+            UTF-8. In a canonical UTF-8 Unicode string, all decomposable
+            characters are decomposed ..."
+         but this is not true: You can pass non-decomposed UTF-8 strings
+         to file system functions, and it is the OS which will convert
+         them to decomposed UTF-8 before accessing the file system.
+       - The Apple Terminal application displays UTF-8 by default.
+       - However, other applications are free to use different encodings:
+         - xterm uses ISO-8859-1 by default.
+         - TextEdit uses MacRoman by default.
+       We prefer UTF-8 over decomposed UTF-8-MAC because one should
+       minimize the use of decomposed Unicode. Unfortunately, through the
+       Darwin file system, decomposed UTF-8 strings are leaked into user
+       space nevertheless.
+       Then there are also the locales with encodings other than US-ASCII
+       and UTF-8. These locales can be occasionally useful to users (e.g.
+       when grepping through ISO-8859-1 encoded text files), when all their
+       file names are in US-ASCII.
+    */
+
+    #[cfg(target_os = "macos")]
+    match s {
+        "ARMSCII-8" => return "ARMSCII-8",
+        "Big5" => return "BIG5",
+        "Big5HKSCS" => return "BIG5-HKSCS",
+        "CP1131" => return "CP1131",
+        "CP1251" => return "CP1251",
+        "CP866" => return "CP866",
+        "CP949" => return "CP949",
+        "GB18030" => return "GB18030",
+        "GB2312" => return "GB2312",
+        "GBK" => return "GBK",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-13" => return "ISO-8859-13",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-4" => return "ISO-8859-4",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-7" => return "ISO-8859-7",
+        "ISO8859-9" => return "ISO-8859-9",
+        "KOI8-R" => return "KOI8-R",
+        "KOI8-U" => return "KOI8-U",
+        "PT154" => return "PT154",
+        "SJIS" => return "SHIFT_JIS",
+        "eucCN" => return "GB2312",
+        "eucJP" => return "EUC-JP",
+        "eucKR" => return "EUC-KR",
+        _ => (),
+    };
+
+    #[cfg(target_os = "aix")]
+    match s {
+        "GBK" => return "GBK",
+        "IBM-1046" => return "CP1046",
+        "IBM-1124" => return "CP1124",
+        "IBM-1129" => return "CP1129",
+        "IBM-1252" => return "CP1252",
+        "IBM-850" => return "CP850",
+        "IBM-856" => return "CP856",
+        "IBM-921" => return "ISO-8859-13",
+        "IBM-922" => return "CP922",
+        "IBM-932" => return "CP932",
+        "IBM-943" => return "CP943",
+        "IBM-eucCN" => return "GB2312",
+        "IBM-eucJP" => return "EUC-JP",
+        "IBM-eucKR" => return "EUC-KR",
+        "IBM-eucTW" => return "EUC-TW",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-6" => return "ISO-8859-6",
+        "ISO8859-7" => return "ISO-8859-7",
+        "ISO8859-8" => return "ISO-8859-8",
+        "ISO8859-9" => return "ISO-8859-9",
+        "TIS-620" => return "TIS-620",
+        "UTF-8" => return "UTF-8",
+        "big5" => return "BIG5",
+        _ => (),
+    };
+
+    #[cfg(windows)]
+    match s {
+        "CP1361" => return "JOHAB",
+        "CP20127" => return "ASCII",
+        "CP20866" => return "KOI8-R",
+        "CP20936" => return "GB2312",
+        "CP21866" => return "KOI8-RU",
+        "CP28591" => return "ISO-8859-1",
+        "CP28592" => return "ISO-8859-2",
+        "CP28593" => return "ISO-8859-3",
+        "CP28594" => return "ISO-8859-4",
+        "CP28595" => return "ISO-8859-5",
+        "CP28596" => return "ISO-8859-6",
+        "CP28597" => return "ISO-8859-7",
+        "CP28598" => return "ISO-8859-8",
+        "CP28599" => return "ISO-8859-9",
+        "CP28605" => return "ISO-8859-15",
+        "CP38598" => return "ISO-8859-8",
+        "CP51932" => return "EUC-JP",
+        "CP51936" => return "GB2312",
+        "CP51949" => return "EUC-KR",
+        "CP51950" => return "EUC-TW",
+        "CP54936" => return "GB18030",
+        "CP65001" => return "UTF-8",
+        "CP936" => return "GBK",
+        _ => (),
+    };
+
+    // Not an alias known for this target: hand the name back unchanged.  The
+    // copy is leaked on purpose so that it can be returned as `&'static str`.
+    String::from(s).leak()
+}
+
+/// Unix implementation: asks the C library for the locale's codeset.
+#[cfg(unix)]
+mod inner {
+    use std::ffi::CStr;
+
+    use libc::{nl_langinfo, CODESET};
+
+    /// Returns the codeset name reported by `nl_langinfo(CODESET)`, or `None`
+    /// if the C library returns a null pointer or an empty name.
+    fn codeset() -> Option<String> {
+        // SAFETY: `nl_langinfo` takes no pointers and has no preconditions.
+        let codeset = unsafe { nl_langinfo(CODESET) };
+        if codeset.is_null() {
+            return None;
+        }
+        // SAFETY: a non-null result of `nl_langinfo` points to a
+        // NUL-terminated string; it is copied into an owned `String` right
+        // away, before any other libc call is made from here.
+        let name = unsafe { CStr::from_ptr(codeset) }
+            .to_string_lossy()
+            .into_owned();
+        // Some systems report an empty codeset (see the Darwin notes in
+        // `map_aliases`).  An empty name is useless to callers, so treat it
+        // as "unknown" and let the caller apply its default.
+        (!name.is_empty()).then_some(name)
+    }
+
+    /// Returns the platform's name for the locale's character encoding, or
+    /// `None` if it cannot be determined.
+    pub fn locale_charset() -> Option<String> {
+        codeset()
+    }
+}
+
+/// Windows implementation: derives a `CP<number>` name from the C runtime's
+/// locale name, falling back to the ANSI code page.
+#[cfg(windows)]
+mod inner {
+    use libc::{setlocale, LC_CTYPE};
+    use std::ffi::{CStr, CString};
+    use windows_sys::Win32::Globalization::GetACP;
+
+    /// Returns the locale name that `setlocale(LC_CTYPE, "")` reports, or
+    /// `None` if the call fails.
+    ///
+    /// NOTE(review): passing `""` asks the C runtime to *select* the
+    /// environment's default locale for `LC_CTYPE`, not merely to query the
+    /// current one -- confirm that this side effect is intended.
+    fn current_locale() -> Option<String> {
+        let empty = CString::default();
+        // SAFETY: `empty` is a valid NUL-terminated C string that outlives
+        // the call.
+        let locale = unsafe { setlocale(LC_CTYPE, empty.as_ptr()) };
+        if locale.is_null() {
+            return None;
+        }
+        // SAFETY: a non-null result of `setlocale` points to a NUL-terminated
+        // string; it is copied into an owned `String` immediately.
+        Some(
+            unsafe { CStr::from_ptr(locale) }
+                .to_string_lossy()
+                .into_owned(),
+        )
+    }
+
+    /// Returns `"UTF-8"` or a name of the form `CP<codepage>` for the
+    /// locale's character encoding, or `None` if the locale name cannot be
+    /// obtained.
+    pub fn locale_charset() -> Option<String> {
+        let current_locale = current_locale()?;
+        // The code page is whatever follows the last '.' in the locale name.
+        // If the name carries no code page, use the system ANSI code page.
+        let codepage = match current_locale.rsplit_once('.') {
+            Some((_, suffix)) => suffix.to_owned(),
+            // SAFETY: `GetACP` takes no arguments and has no preconditions.
+            None => unsafe { GetACP() }.to_string(),
+        };
+        // UTF-8 may be spelled as code page 65001 or by name, in either case
+        // and with or without the hyphen.
+        let is_utf8 = codepage == "65001"
+            || codepage.eq_ignore_ascii_case("utf8")
+            || codepage.eq_ignore_ascii_case("utf-8");
+        Some(if is_utf8 {
+            String::from("UTF-8")
+        } else {
+            format!("CP{codepage}")
+        })
+    }
+}
+
+/// Fallback for targets that are neither Unix nor Windows.
+#[cfg(not(any(unix, windows)))]
+mod inner {
+    /// There is no way to query the locale here, so report "unknown" and let
+    /// the caller apply its default.
+    pub fn locale_charset() -> Option<String> {
+        None
+    }
+}
+
+/// Returns the canonical name of the character encoding used by the locale
+/// configured in the operating system.
+///
+/// The value is computed on the first call and cached for the rest of the
+/// process.  If the platform cannot report an encoding, `"UTF-8"` is
+/// returned.
+pub fn locale_charset() -> &'static str {
+    lazy_static! {
+        static ref LOCALE_CHARSET: &'static str = match inner::locale_charset() {
+            Some(name) => map_aliases(&name),
+            None => "UTF-8",
+        };
+    }
+    *LOCALE_CHARSET
+}
-- 
2.30.2