Works for at least one test file now
[pspp] / rust / src / encoding.rs
index a0e28af77100de023d5d142acd0bba1782a784ee..8fd13f3ea3fb4e9b3f5243cd998037c15ac609d0 100644 (file)
@@ -1 +1,64 @@
+use crate::locale_charset::locale_charset;
+use encoding_rs::{Encoding, UTF_8};
+
 include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
+
+pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
+    CODEPAGE_NAME_TO_NUMBER
+        .get(encoding.to_ascii_lowercase().as_str())
+        .copied()
+}
+
+use thiserror::Error as ThisError;
+
+/// Reasons why `get_encoding` cannot determine the character encoding of a
+/// system file.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    /// Neither an encoding name nor a usable character code was available.
+    /// Character codes 2 and 3 are deliberately treated as unusable.
+    #[error("This system file does not indicate its own character encoding.  For best results, specify an encoding explicitly.  Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
+    NoEncoding,
+
+    /// The character code (carried in the variant) has no entry in
+    /// `CODEPAGE_NUMBER_TO_NAME`.
+    #[error("This system file encodes text strings with unknown code page {0}.")]
+    UnknownCodepage(i32),
+
+    /// The encoding label (carried in the variant) was not recognized by
+    /// `Encoding::for_label`.
+    #[error("This system file encodes text strings with unknown encoding {0}.")]
+    UnknownEncoding(String),
+
+    /// The character code was 1, which denotes EBCDIC.
+    #[error("This system file is encoded in EBCDIC, which is not supported.")]
+    Ebcdic,
+}
+
+pub fn default_encoding() -> &'static Encoding {
+    lazy_static! {
+        static ref DEFAULT_ENCODING: &'static Encoding =
+            Encoding::for_label(locale_charset().as_bytes()).unwrap_or(&UTF_8);
+    }
+    &DEFAULT_ENCODING
+}
+
+pub fn get_encoding(
+    encoding: Option<&str>,
+    character_code: Option<i32>,
+) -> Result<&'static Encoding, Error> {
+    let label = if let Some(encoding) = encoding {
+        encoding
+    } else if let Some(codepage) = character_code {
+        match codepage {
+            1 => return Err(Error::Ebcdic),
+            2 | 3 => {
+                // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+                // respectively.  However, many files have character code 2 but
+                // data which are clearly not ASCII.  Therefore, ignore these
+                // values.
+                return Err(Error::NoEncoding);
+            }
+            4 => "MS_KANJI",
+            _ => CODEPAGE_NUMBER_TO_NAME
+                .get(&codepage)
+                .copied()
+                .ok_or(Error::UnknownCodepage(codepage))?,
+        }
+    } else {
+        return Err(Error::NoEncoding);
+    };
+
+    Ok(Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))?)
+}