work

author Ben Pfaff <blp@cs.stanford.edu>

Thu, 3 Aug 2023 15:44:54 +0000 (08:44 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Thu, 3 Aug 2023 15:44:54 +0000 (08:44 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Thu, 3 Aug 2023 15:44:54 +0000 (08:44 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Thu, 3 Aug 2023 15:44:54 +0000 (08:44 -0700)
diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi

index e7bda48d5693f42ec2781b475d751646de362e69..db8af7e37464aa1f415458a68b4df15a38a467ce 100644 (file)
--- a/doc/dev/system-file-format.texi
+++ b/doc/dev/system-file-format.texi
@@ -714,7 +714,7 @@ been actually observed in system files:
  
  @table @asis
  @item 1
-EBCDIC.
+EBCDIC.  Only one example has been observed.
  
  @item 2
  7-bit ASCII.  Old versions of SPSS for Unix and Windows always wrote
@@ -732,10 +732,10 @@ ISO 8859-1 (IBM AIX code page number).
  The @code{windows-874} code page for Thai.
  
  @item 932
-The @code{windows-932} code page for Japanese.
+The @code{windows-932} code page for Japanese (aka @code{Shift_JIS}).
  
  @item 936
-The @code{windows-936} code page for simplified Chinese
+The @code{windows-936} code page for simplified Chinese (aka @code{GBK}).
  
  @item 949
  Probably @code{ks_c_5601-1987}, Unified Hangul Code.
diff --git a/rust/src/encoding.rs b/rust/src/encoding.rs

index a0e28af77100de023d5d142acd0bba1782a784ee..296e4e65a8e4f32a61e4e415318610e3180513f7 100644 (file)
--- a/rust/src/encoding.rs
+++ b/rust/src/encoding.rs
@@ -1 +1,28 @@
  include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
+
+/// Returns the code page number listed for `encoding`, lowercased first.
+pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
+    let key = encoding.to_ascii_lowercase();
+    CODEPAGE_NAME_TO_NUMBER.get(key.as_str()).copied()
+}
+
+/// Chooses the name of the character encoding to use for a file.
+///
+/// An explicit `encoding` name always wins.  Otherwise `codepage`, if
+/// present, is translated to a name; `None` means no usable hint.
+pub fn encoding_from_hints(encoding: Option<&str>, codepage: Option<u32>) -> Option<&str> {
+    if let Some(name) = encoding {
+        return Some(name);
+    }
+    match codepage? {
+        // Code 1 is documented as EBCDIC.
+        1 => Some("EBCDIC-US"),
+        // Codes 2 and 3 ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic],
+        // but many files have character code 2 with data that are clearly
+        // not ASCII, so both values are ignored.
+        2 | 3 => None,
+        4 => Some("MS_KANJI"),
+        // Every other number is looked up in `CODEPAGE_NUMBER_TO_NAME`.
+        number => CODEPAGE_NUMBER_TO_NAME.get(&number).copied(),
+    }
+}
diff --git a/rust/src/raw.rs b/rust/src/raw.rs

index 3f7309c7ce67cb0584a83c2efb1f415ec1100674..e017f74ac494d4cce3afcc516112167d0c148b0c 100644 (file)
--- a/rust/src/raw.rs
+++ b/rust/src/raw.rs
@@ -824,6 +824,20 @@ impl ExtensionRecord for FloatInfo {
      }
  }
  
+/// Text decoded from the payload of an "encoding record" extension.
+pub struct Encoding(pub String);
+impl ExtensionRecord for Encoding {
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "encoding record";
+    /// Runs `check_size` for this record type, then decodes `ext.data` as UTF-8.
+    fn parse(ext: &Extension, _endian: Endian) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+        // `ext` is only borrowed, so clone the bytes that `from_utf8` consumes.
+        Ok(Encoding(String::from_utf8(ext.data.clone())?))
+    }
+}
+
  pub struct Extension {
      /// Offset from the start of the file to the start of the record.
      pub offset: u64,
author	Ben Pfaff <blp@cs.stanford.edu>
	Thu, 3 Aug 2023 15:44:54 +0000 (08:44 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Thu, 3 Aug 2023 15:44:54 +0000 (08:44 -0700)
doc/dev/system-file-format.texi		patch \| blob \| history
rust/src/encoding.rs		patch \| blob \| history
rust/src/raw.rs		patch \| blob \| history