work
author Ben Pfaff <blp@cs.stanford.edu>
Tue, 5 Sep 2023 15:31:49 +0000 (08:31 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Tue, 5 Sep 2023 15:31:49 +0000 (08:31 -0700)
rust/Cargo.lock
rust/Cargo.toml
rust/src/cooked.rs
rust/src/encoding.rs
rust/src/lib.rs
rust/src/locale_charset.rs [new file with mode: 0644]

index afe9323c59df40b663c370a3c87230406bdc8662..d9d4fc6e37f9d2206802ec5733f61a84a700703d 100644 (file)
@@ -294,9 +294,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
 [[package]]
 name = "libc"
-version = "0.2.139"
+version = "0.2.147"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
 
 [[package]]
 name = "linux-raw-sys"
@@ -479,12 +479,14 @@ dependencies = [
  "float_next_after",
  "hexplay",
  "lazy_static",
+ "libc",
  "num",
  "num-derive",
  "num-traits",
  "ordered-float",
  "thiserror",
  "unicase",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
index 0059ae98aa319d54bf54e29ffdfd9a7495682c54..0e7523976483d7bd727aac2c861bea536d934102 100644 (file)
@@ -20,6 +20,10 @@ thiserror = "1.0"
 chrono = "0.4.26"
 finl_unicode = "1.2.0"
 unicase = "2.6.0"
+libc = "0.2.147"
+
+[target.'cfg(windows)'.dependencies]
+windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
 
 [build-dependencies]
 anyhow = "1.0.69"
index 97ea4906d6d9d774cc9bd2881ddc705ee23c0c74..9f6f0101b0ce51535221779b362008edcd311a94 100644 (file)
@@ -163,10 +163,10 @@ pub enum Record {
 }
 
 pub use crate::raw::EncodingRecord;
+pub use crate::raw::Extension;
 pub use crate::raw::FloatInfoRecord;
 pub use crate::raw::IntegerInfoRecord;
 pub use crate::raw::NumberOfCasesRecord;
-pub use crate::raw::Extension;
 
 type DictIndex = usize;
 
@@ -187,6 +187,26 @@ pub struct Decoder {
     n_generated_names: usize,
 }
 
+/// Work-in-progress entry point that takes the raw header records of a system
+/// file and returns cooked [`Record`]s.
+///
+/// NOTE(review): this is still a stub.  It locates the first encoding record
+/// and the character code of the first integer-info record, but neither value
+/// is used yet and the function always returns an empty vector.
+///
+/// NOTE(review): the type parameter `T` appears in neither the arguments nor
+/// the return type, so it cannot be inferred and callers have to spell it out
+/// explicitly.
+pub fn decode<T>(headers: Vec<raw::Record>) -> Vec<Record> {
+    // Name of the encoding from the first encoding record, if there is one.
+    let encoding = headers.iter().find_map(|rec| {
+        if let raw::Record::Encoding(ref e) = rec {
+            Some(e.0.as_str())
+        } else {
+            None
+        }
+    });
+    // Character code from the first integer-info record, if there is one.
+    let character_code = headers.iter().find_map(|rec| {
+        if let raw::Record::IntegerInfo(ref r) = rec {
+            Some(r.character_code)
+        } else {
+            None
+        }
+    });
+    
+
+    Vec::new()
+}
+
 impl Decoder {
     fn generate_name(&mut self) -> Identifier {
         loop {
index 296e4e65a8e4f32a61e4e415318610e3180513f7..d22f6927477218f59f564dd6a93a8bfd79d77d3c 100644 (file)
@@ -6,8 +6,24 @@ pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
         .copied()
 }
 
-pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option<u32>) -> Option<&str> {
-    if encoding.is_some() {
+use thiserror::Error as ThisError;
+/// Diagnostics related to choosing the character encoding of a system file.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    /// The system file does not say which encoding it uses.  The payload is
+    /// interpolated into the message as the name of the default encoding.
+    #[error("This system file does not indicate its own character encoding.  Using default encoding {0}.  For best results, specify an encoding explicitly.  Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
+    NoEncoding(String),
+    
+}
+
+/// Returns the character set used by the locale configured in the operating
+/// system.
+///
+/// TODO: this should implement roughly the same behavior as the function with
+/// the same name in Gnulib.  For now it is a placeholder that always returns
+/// `"UTF-8"`.
+pub fn locale_charset() -> &'static str {
+    "UTF-8"
+}
+
+/*
+pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option<u32>) -> Result<&str, ()> {
+    let label = if encoding.is_some() {
         encoding
     } else if let Some(codepage) = codepage {
         match codepage {
@@ -24,5 +40,6 @@ pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option<u32>) -> Opt
         }
     } else {
         None
-    }
+    };
 }
+*/
index ebb4033b25038d2a3284c46be9ec18905115bae8..3eb4bbae4eba2032a46a954c05c98d0d99959525 100644 (file)
@@ -5,3 +5,4 @@ pub mod format;
 pub mod identifier;
 pub mod raw;
 pub mod sack;
+pub mod locale_charset;
diff --git a/rust/src/locale_charset.rs b/rust/src/locale_charset.rs
new file mode 100644 (file)
index 0000000..fdcb685
--- /dev/null
@@ -0,0 +1,289 @@
+// Determine a canonical name for the current locale's character encoding.
+//
+// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc.
+//
+// This file is free software: you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 2.1 of the License, or (at your option)
+// any later version.
+//
+// This file is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+// A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+//
+// Written by Bruno Haible <bruno@clisp.org>.  Translated to Rust by Ben Pfaff
+// <blp@cs.stanford.edu>.
+
+use lazy_static::lazy_static;
+
+/// Maps the platform-specific codeset name `s` to a canonical encoding name.
+///
+/// Each alias table below is guarded by a `cfg` attribute, so only the table
+/// for the target platform is compiled in.  A name with no entry in that table
+/// (or any name at all on a platform that has no table) is returned unchanged,
+/// as a leaked heap copy so that the result can have the `'static` lifetime.
+fn map_aliases(s: &str) -> &'static str {
+    #[cfg(target_os = "freebsd")]
+    match s {
+        "ARMSCII-8" => return "ARMSCII-8",
+        "Big5" => return "BIG5",
+        "C" => return "ASCII",
+        "CP1131" => return "CP1131",
+        "CP1251" => return "CP1251",
+        "CP866" => return "CP866",
+        "GB18030" => return "GB18030",
+        "GB2312" => return "GB2312",
+        "GBK" => return "GBK",
+        "ISCII-DEV" => return "?",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-13" => return "ISO-8859-13",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-7" => return "ISO-8859-7",
+        "ISO8859-9" => return "ISO-8859-9",
+        "KOI8-R" => return "KOI8-R",
+        "KOI8-U" => return "KOI8-U",
+        "SJIS" => return "SHIFT_JIS",
+        "US-ASCII" => return "ASCII",
+        "eucCN" => return "GB2312",
+        "eucJP" => return "EUC-JP",
+        "eucKR" => return "EUC-KR",
+        _ => (),
+    };
+
+    #[cfg(target_os = "netbsd")]
+    match s {
+        "646" => return "ASCII",
+        "ARMSCII-8" => return "ARMSCII-8",
+        "BIG5" => return "BIG5",
+        "Big5-HKSCS" => return "BIG5-HKSCS",
+        "CP1251" => return "CP1251",
+        "CP866" => return "CP866",
+        "GB18030" => return "GB18030",
+        "GB2312" => return "GB2312",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-13" => return "ISO-8859-13",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-4" => return "ISO-8859-4",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-7" => return "ISO-8859-7",
+        "KOI8-R" => return "KOI8-R",
+        "KOI8-U" => return "KOI8-U",
+        "PT154" => return "PT154",
+        "SJIS" => return "SHIFT_JIS",
+        "eucCN" => return "GB2312",
+        "eucJP" => return "EUC-JP",
+        "eucKR" => return "EUC-KR",
+        "eucTW" => return "EUC-TW",
+        _ => (),
+    };
+
+    #[cfg(target_os = "openbsd")]
+    match s {
+        "646" => return "ASCII",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-13" => return "ISO-8859-13",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-4" => return "ISO-8859-4",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-7" => return "ISO-8859-7",
+        "US-ASCII" => return "ASCII",
+        _ => (),
+    };
+
+    /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
+      useless:
+      - It returns the empty string when LANG is set to a locale of the
+        form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
+        LC_CTYPE file.
+      - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
+        the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
+      - The documentation says:
+          "... all code that calls BSD system routines should ensure
+           that the const *char parameters of these routines are in UTF-8
+           encoding. All BSD system functions expect their string
+           parameters to be in UTF-8 encoding and nothing else."
+        It also says
+          "An additional caveat is that string parameters for files,
+           paths, and other file-system entities must be in canonical
+           UTF-8. In a canonical UTF-8 Unicode string, all decomposable
+           characters are decomposed ..."
+        but this is not true: You can pass non-decomposed UTF-8 strings
+        to file system functions, and it is the OS which will convert
+        them to decomposed UTF-8 before accessing the file system.
+      - The Apple Terminal application displays UTF-8 by default.
+      - However, other applications are free to use different encodings:
+        - xterm uses ISO-8859-1 by default.
+        - TextEdit uses MacRoman by default.
+      We prefer UTF-8 over decomposed UTF-8-MAC because one should
+      minimize the use of decomposed Unicode. Unfortunately, through the
+      Darwin file system, decomposed UTF-8 strings are leaked into user
+      space nevertheless.
+      Then there are also the locales with encodings other than US-ASCII
+      and UTF-8. These locales can be occasionally useful to users (e.g.
+      when grepping through ISO-8859-1 encoded text files), when all their
+      file names are in US-ASCII.
+    */
+
+    #[cfg(target_os = "macos")]
+    match s {
+        "ARMSCII-8" => return "ARMSCII-8",
+        "Big5" => return "BIG5",
+        "Big5HKSCS" => return "BIG5-HKSCS",
+        "CP1131" => return "CP1131",
+        "CP1251" => return "CP1251",
+        "CP866" => return "CP866",
+        "CP949" => return "CP949",
+        "GB18030" => return "GB18030",
+        "GB2312" => return "GB2312",
+        "GBK" => return "GBK",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-13" => return "ISO-8859-13",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-4" => return "ISO-8859-4",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-7" => return "ISO-8859-7",
+        "ISO8859-9" => return "ISO-8859-9",
+        "KOI8-R" => return "KOI8-R",
+        "KOI8-U" => return "KOI8-U",
+        "PT154" => return "PT154",
+        "SJIS" => return "SHIFT_JIS",
+        "eucCN" => return "GB2312",
+        "eucJP" => return "EUC-JP",
+        "eucKR" => return "EUC-KR",
+        _ => (),
+    };
+
+    #[cfg(target_os = "aix")]
+    match s {
+        "GBK" => return "GBK",
+        "IBM-1046" => return "CP1046",
+        "IBM-1124" => return "CP1124",
+        "IBM-1129" => return "CP1129",
+        "IBM-1252" => return "CP1252",
+        "IBM-850" => return "CP850",
+        "IBM-856" => return "CP856",
+        "IBM-921" => return "ISO-8859-13",
+        "IBM-922" => return "CP922",
+        "IBM-932" => return "CP932",
+        "IBM-943" => return "CP943",
+        "IBM-eucCN" => return "GB2312",
+        "IBM-eucJP" => return "EUC-JP",
+        "IBM-eucKR" => return "EUC-KR",
+        "IBM-eucTW" => return "EUC-TW",
+        "ISO8859-1" => return "ISO-8859-1",
+        "ISO8859-15" => return "ISO-8859-15",
+        "ISO8859-2" => return "ISO-8859-2",
+        "ISO8859-5" => return "ISO-8859-5",
+        "ISO8859-6" => return "ISO-8859-6",
+        "ISO8859-7" => return "ISO-8859-7",
+        "ISO8859-8" => return "ISO-8859-8",
+        "ISO8859-9" => return "ISO-8859-9",
+        "TIS-620" => return "TIS-620",
+        "UTF-8" => return "UTF-8",
+        "big5" => return "BIG5",
+        _ => (),
+    };
+
+    #[cfg(windows)]
+    match s {
+        "CP1361" => return "JOHAB",
+        "CP20127" => return "ASCII",
+        "CP20866" => return "KOI8-R",
+        "CP20936" => return "GB2312",
+        "CP21866" => return "KOI8-RU",
+        "CP28591" => return "ISO-8859-1",
+        "CP28592" => return "ISO-8859-2",
+        "CP28593" => return "ISO-8859-3",
+        "CP28594" => return "ISO-8859-4",
+        "CP28595" => return "ISO-8859-5",
+        "CP28596" => return "ISO-8859-6",
+        "CP28597" => return "ISO-8859-7",
+        "CP28598" => return "ISO-8859-8",
+        "CP28599" => return "ISO-8859-9",
+        "CP28605" => return "ISO-8859-15",
+        "CP38598" => return "ISO-8859-8",
+        "CP51932" => return "EUC-JP",
+        "CP51936" => return "GB2312",
+        "CP51949" => return "EUC-KR",
+        "CP51950" => return "EUC-TW",
+        "CP54936" => return "GB18030",
+        "CP65001" => return "UTF-8",
+        "CP936" => return "GBK",
+        _ => (),
+    };
+
+    // No alias matched: return the name itself.  The copy is leaked so that
+    // it satisfies the `'static` return type.
+    String::from(s).leak()
+}
+
+#[cfg(unix)]
+mod inner {
+    use std::ffi::CStr;
+
+    use libc::{self, nl_langinfo, CODESET};
+
+    /// Asks the C library for the codeset name via `nl_langinfo(CODESET)`.
+    ///
+    /// Returns `None` if the C library hands back a null pointer.  Bytes in
+    /// the name that are not valid UTF-8 are replaced lossily.
+    ///
+    /// NOTE(review): nothing in this module calls `setlocale`, so this may
+    /// report the codeset of the default "C" locale rather than the user's
+    /// environment -- confirm that the locale is initialized elsewhere.
+    fn codeset() -> Option<String> {
+        // SAFETY: `nl_langinfo` takes only an item constant; there are no
+        // pointer arguments to validate.
+        let name_ptr = unsafe { nl_langinfo(CODESET) };
+        if name_ptr.is_null() {
+            return None;
+        }
+        // SAFETY: `name_ptr` was just checked to be non-null, and POSIX
+        // specifies that `nl_langinfo` returns a pointer to a NUL-terminated
+        // string.  The bytes are copied out immediately below.
+        let name = unsafe { CStr::from_ptr(name_ptr) };
+        Some(name.to_string_lossy().into_owned())
+    }
+
+    /// Returns the codeset name reported by the C library, if any.
+    pub fn locale_charset() -> Option<String> {
+        codeset()
+    }
+}
+
+#[cfg(windows)]
+mod inner {
+    use libc::{setlocale, LC_CTYPE};
+    use std::ffi::{CStr, CString};
+    use windows_sys::Win32::Globalization::GetACP;
+
+    /// Calls `setlocale(LC_CTYPE, "")` and returns the locale name it reports,
+    /// or `None` if the call returns a null pointer.
+    fn current_locale() -> Option<String> {
+        let empty = CString::new("").unwrap();
+        // SAFETY: `empty` is a valid NUL-terminated string that outlives the
+        // call.
+        let name_ptr = unsafe { setlocale(LC_CTYPE, empty.as_ptr()) };
+        if name_ptr.is_null() {
+            return None;
+        }
+        // SAFETY: `name_ptr` was just checked to be non-null; a successful
+        // `setlocale` returns a pointer to a NUL-terminated string, which is
+        // copied out immediately below.
+        let name = unsafe { CStr::from_ptr(name_ptr) };
+        Some(name.to_string_lossy().into_owned())
+    }
+
+    /// Derives a `CPnnn`-style codeset name for the current locale.
+    ///
+    /// The text after the last `.` in the locale name is used as the code
+    /// page; if the name contains no `.`, the code page from `GetACP()` is
+    /// used instead.  `CP65001` and `CPutf8` are both reported as `UTF-8`.
+    pub fn locale_charset() -> Option<String> {
+        let locale = current_locale()?;
+        let codepage = match locale.rsplit_once('.') {
+            Some((_, suffix)) => format!("CP{suffix}"),
+            // SAFETY: `GetACP` takes no arguments.
+            None => format!("CP{}", unsafe { GetACP() }),
+        };
+        if matches!(codepage.as_str(), "CP65001" | "CPutf8") {
+            Some(String::from("UTF-8"))
+        } else {
+            Some(codepage)
+        }
+    }
+}
+
+#[cfg(not(any(unix, windows)))]
+mod inner {
+    /// Fallback for targets that are neither Unix nor Windows: no locale API
+    /// is queried here, so UTF-8 is always reported.
+    ///
+    /// The name and the `Option<String>` return type match the Unix and
+    /// Windows variants of this module, so the public `locale_charset()`
+    /// wrapper compiles on every target.
+    pub fn locale_charset() -> Option<String> {
+        Some(String::from("UTF-8"))
+    }
+}
+
+/// Returns a canonical name for the character encoding of the current locale.
+///
+/// The name is computed on first use and cached for the rest of the process.
+/// If the platform layer reports no codeset, or reports an empty one, the
+/// result falls back to `"UTF-8"`.
+pub fn locale_charset() -> &'static str {
+    lazy_static! {
+        static ref LOCALE_CHARSET: &'static str = {
+            // An empty codeset name is not a usable encoding name (the Darwin
+            // notes in `map_aliases` describe `nl_langinfo` returning one),
+            // so treat it the same as "unknown".
+            let name = inner::locale_charset()
+                .filter(|name| !name.is_empty())
+                .unwrap_or_else(|| String::from("UTF-8"));
+            map_aliases(&name)
+        };
+    }
+    &LOCALE_CHARSET
+}