From 92bba7a2dd4dff9030989f4459902f47d504752a Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 1 Aug 2023 18:20:25 -0700 Subject: [PATCH] work --- doc/dev/system-file-format.texi | 174 +++++++++++++++++++++++++++++--- rust/Cargo.lock | 7 ++ rust/Cargo.toml | 6 +- rust/build.rs | 171 +++++++++++++++++++++++++++++++ rust/src/cooked.rs | 0 rust/src/encoding.rs | 1 + rust/src/lib.rs | 10 +- rust/src/raw.rs | 98 ++++++++++++++++-- src/data/sys-file-encoding.py | 2 +- utilities/pspp-dump-sav.c | 12 +-- 10 files changed, 452 insertions(+), 29 deletions(-) create mode 100644 rust/build.rs create mode 100644 rust/src/cooked.rs create mode 100644 rust/src/encoding.rs diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index ba390c1c02..e7bda48d56 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -717,17 +717,75 @@ been actually observed in system files: EBCDIC. @item 2 -7-bit ASCII. +7-bit ASCII. Old versions of SPSS for Unix and Windows always wrote +value 2 in this field, regardless of the encoding in use, so it is not +reliable and should be ignored. + +@item 3 +8-bit ``ASCII''. + +@item 819 +ISO 8859-1 (IBM AIX code page number). + +@item 874 +@itemx 9066 +The @code{windows-874} code page for Thai. + +@item 932 +The @code{windows-932} code page for Japanese. + +@item 936 +The @code{windows-936} code page for simplified Chinese + +@item 949 +Probably @code{ks_c_5601-1987}, Unified Hangul Code. + +@item 950 +The @code{big5} code page for traditional Chinese. @item 1250 The @code{windows-1250} code page for Central European and Eastern European languages. +@item 1251 +The @code{windows-1251} code page for Cyrillic languages. + @item 1252 The @code{windows-1252} code page for Western European languages. +@item 1253 +The @code{windows-1253} code page for modern Greek. + +@item 1254 +The @code{windows-1254} code page for Turkish. + +@item 1255 +The @code{windows-1255} code page for Hebrew. 
+ +@item 1256 +The @code{windows-1256} code page for Arabic script. + +@item 1257 +The @code{windows-1257} code page for Estonian, Latvian, and +Lithuanian. + +@item 1258 +The @code{windows-1258} code page for Vietnamese. + +@item 20127 +US-ASCII. + @item 28591 -ISO 8859-1. +ISO 8859-1 (Latin-1). + +@item 28592 +ISO 8859-2 (Central European). + +@item 28605 +ISO 8859-15 (Latin-9). + +@item 51949 +The @code{euc-kr} code page for Korean. @item 65001 UTF-8. @@ -743,12 +801,13 @@ The following additional values are known to be defined: DEC Kanji. @end table +The most common values observed, from most to least common, are 1252, +65001, 2, and 28591. + Other Windows code page numbers are known to be generally valid. -Old versions of SPSS for Unix and Windows always wrote value 2 in this -field, regardless of the encoding in use. Newer versions also write -the character encoding as a string (see @ref{Character Encoding -Record}). +Newer versions also write the character encoding as a string (see +@ref{Character Encoding Record}). @end table @node Machine Floating-Point Info Record @@ -1251,20 +1310,107 @@ The total number of bytes in @code{encoding}. @item char encoding[]; The name of the character encoding. Normally this will be an official -IANA character set name or alias. -See @url{http://www.iana.org/assignments/character-sets}. -Character set names are not case-sensitive, but SPSS appears to write -them in all-uppercase. +IANA character set name or alias. See +@url{http://www.iana.org/assignments/character-sets}. Character set +names are not case-sensitive, and SPSS is not consistent, e.g.@: both +@code{windows-1251} and @code{WINDOWS-1252} have been observed, +as have @code{Big5} and @code{BIG5}. @end table This record is not present in files generated by older software. See also the @code{character_code} field in the machine integer info record (@pxref{character-code}). 
-When the character encoding record and the machine integer info record -are both present, all system files observed in practice indicate the -same character encoding, e.g.@: 1252 as @code{character_code} and -@code{windows-1252} as @code{encoding}, 65001 and @code{UTF-8}, etc. +The following character encoding names have been observed. The names +are shown in lowercase, even though they were not always in lowercase +in the file. Alternative names for the same encoding are, when known, +listed together. For each encoding, the @code{character_code} values +that they were observed paired with are also listed. First, the +following are strictly single-byte, ASCII-compatible encodings: + +@table @code +@item @r{(encoding record missing)} +0, 2, 3, 874, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 20127, 28591, +28592, 28605 + +@item ansi_x3.4-1968 +@itemx ascii +1252 + +@item cp28605 +2 + +@item cp874 +9066 + +@item iso-8859-1 +819 + +@item windows-874 +874 + +@item windows-1250 +2, 1250, 1252 + +@item windows-1251 +2, 1251 + +@item cp1252 +@itemx windows-1252 +2, 1250, 1252, 1253 + +@item cp1253 +@itemx windows-1253 +1253 + +@item windows-1254 +2, 1254 + +@item windows-1255 +2, 1255 + +@item windows-1256 +2, 1252, 1256 + +@item windows-1257 +2, 1257 + +@item windows-1258 +1258 +@end table + +The following are multibyte encodings, in which some code points +occupy a single byte and others multiple bytes. All of the following +encode ASCII characters as their native values, but some of them +(marked as ``not ASCII compatible'') also use ASCII values as second +or later bytes in multibyte sequences: + +@table @code +@item @r{(encoding record missing)} +65001, 949 (ASCII compatible) and 932, 936, 950 (not ASCII compatible). 
+ +@item big5 +@itemx cp950 +2, 950 (not ASCII compatible) + +@item euc-kr +2, 51949 (ASCII compatible) + +@item gbk +936 (not ASCII compatible) + +@item utf-8 +0, 2, 1250, 1251, 1252, 1256, 65001 (ASCII compatible) + +@item cp932 +@itemx windows-31j +932 (not ASCII compatible) +@end table + +As the tables above show, when the character encoding record and the +machine integer info record are both present, they can contradict each +other. Observations show that, in this case, the character encoding +record should be honored. If, for testing purposes, a file is crafted with different @code{character_code} and @code{encoding}, it seems that diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 339237c316..bb71860a72 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -197,6 +197,12 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" version = "0.2.139" @@ -374,6 +380,7 @@ dependencies = [ "flate2", "float_next_after", "hexplay", + "lazy_static", "num", "num-derive", "num-traits", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index d365ab63a8..7ee93bd9df 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -10,12 +10,16 @@ clap = { version = "4.1.7", features = ["derive", "wrap_help"] } flate2 = "1.0.26" float_next_after = "1.0.0" hexplay = "0.2.1" +lazy_static = "1.4.0" num = "0.4.0" num-derive = "0.4.0" num-traits = "0.2.16" ordered-float = "3.7.0" thiserror = "1.0" +[build-dependencies] +anyhow = "1.0.69" + [[bin]] name = "pspp-dump-sav" path = "src/main.rs" @@ -26,4 +30,4 @@ path = "src/lib.rs" [[test]] name = "sack" path = "tests/sack.rs" -harness = false \ No newline at end of file +harness = false diff --git a/rust/build.rs b/rust/build.rs new file mode 100644 index 0000000000..5b6c2dfea5 --- /dev/null +++ b/rust/build.rs @@ -0,0 +1,171 @@ 
+use anyhow::{anyhow, Result as AnyResult}; +use std::{ + collections::{BTreeMap, HashSet, VecDeque}, + env::var_os, + fs::{read_to_string, File}, + io::{Error as IoError, Write}, + path::{Path, PathBuf}, +}; + +#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] +enum Source { + CP, + IBM, + Windows, +} + +// Code page number. +type CPNumber = usize; + +fn process_converter<'a>( + fields: &Vec<&'a str>, + codepages: &mut BTreeMap>>, +) { + if fields.is_empty() || fields[0] == "{" { + return; + } + + let mut cps: BTreeMap = BTreeMap::new(); + let mut iana = VecDeque::new(); + let mut other = VecDeque::new(); + + let mut iter = fields.iter().peekable(); + while let Some(&name) = iter.next() { + if iter.next_if(|&&s| s == "{").is_some() { + let mut standards = HashSet::new(); + loop { + let &standard = iter.next().expect("missing `}` in list of standards"); + if standard == "}" { + break; + } + standards.insert(standard); + } + + if standards.contains("IANA*") { + iana.push_front(name); + } else if standards.contains("IANA") { + iana.push_back(name); + } else if standards.iter().any(|&s| s.ends_with('*')) { + other.push_front(name); + } else { + other.push_back(name); + } + } else { + // Untagged names are completely nonstandard. + continue; + } + + if let Some(number) = name.strip_prefix("cp") { + if let Ok(number) = number.parse::() { + cps.insert(Source::CP, number); + } + } + + if let Some(number) = name.strip_prefix("windows-") { + if let Ok(number) = number.parse::() { + cps.insert(Source::Windows, number); + } + } + + if let Some(number) = name.strip_prefix("ibm-") { + if let Ok(number) = number.parse::() { + cps.insert(Source::IBM, number); + } + } + } + + // If there are no tagged names then this is completely nonstandard. 
+ if iana.is_empty() && other.is_empty() { + return; + } + + let all: Vec<&str> = iana.into_iter().chain(other.into_iter()).collect(); + for (source, number) in cps { + codepages + .entry(number) + .or_insert_with(BTreeMap::new) + .insert(source, all.clone()); + } +} + +fn write_output( + codepages: &BTreeMap>>, + file_name: &PathBuf, +) -> Result<(), IoError> { + let mut file = File::create(file_name)?; + + write!(file, "{}", "\ +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! { + static ref CODEPAGE_NUMBER_TO_NAME: HashMap = { + let mut map = HashMap::new(); +")?; + + for (&cpnumber, value) in codepages.iter() { + let source = value.keys().max().unwrap(); + let name = value[source][0]; + writeln!(file, " map.insert({cpnumber}, \"{name}\");")?; + } + write!(file, "{}", "\ + map + }; +} +")?; + + let mut names: BTreeMap<&str, BTreeMap>> = BTreeMap::new(); + for (&cpnumber, value) in codepages.iter() { + for (&source, value2) in value.iter() { + for &name in value2.iter() { + names + .entry(name) + .or_insert_with(BTreeMap::new) + .entry(source) + .or_insert_with(Vec::new) + .push(cpnumber); + } + } + } + + for (&name, value) in names.iter() { + for (_source, numbers) in value.iter().rev() { + println!(" {{ {}, \"{name}\" }},", numbers[0]); + break; + } + } + + Ok(()) +} + +fn main() -> AnyResult<()> { + println!("cargo:rerun-if-changed=build.rs"); + + let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("../src/data/convrtrs.txt"); + println!("cargo:rerun-if-changed={}", input_file.to_string_lossy()); + let input = read_to_string(&input_file) + .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?; + + let mut codepages: BTreeMap>> = BTreeMap::new(); + let mut converter: Vec<&str> = Vec::new(); + for line in input.lines() { + let line = line + .find('#') + .map(|position| &line[..position]) + .unwrap_or(line) + .trim_end(); + if !line.starts_with(&[' ', '\t']) { + process_converter(&converter, &mut 
codepages); + converter.clear(); + } + converter.extend(line.split_whitespace()); + } + process_converter(&converter, &mut codepages); + + let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs"); + + write_output(&codepages, &output_file_name) + .map_err(|e| anyhow!("{}: write failed ({e})", output_file_name.to_string_lossy()))?; + + Ok(()) +} diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/rust/src/encoding.rs b/rust/src/encoding.rs new file mode 100644 index 0000000000..a0e28af771 --- /dev/null +++ b/rust/src/encoding.rs @@ -0,0 +1 @@ +include!(concat!(env!("OUT_DIR"), "/encodings.rs")); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 6dc13b586a..680d3a3a51 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -3,7 +3,9 @@ use thiserror::Error as ThisError; pub mod endian; pub mod raw; +pub mod cooked; pub mod sack; +pub mod encoding; #[derive(ThisError, Debug)] pub enum Error { @@ -28,7 +30,7 @@ pub enum Error { #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] BadDocumentLength { offset: u64, n: u32, max: u32 }, - #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")] + #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] BadRecordType { offset: u64, rec_type: u32 }, #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] @@ -84,4 +86,10 @@ pub enum Error { expected_n_blocks: u64, ztrailer_len: u64, }, + + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] + BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 }, + + #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] + BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 }, } diff --git a/rust/src/raw.rs b/rust/src/raw.rs 
index b09f94e73a..3f7309c7ce 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -759,6 +759,71 @@ enum ExtensionType { } */ +trait ExtensionRecord where Self: Sized { + const SIZE: Option; + const COUNT: Option; + const NAME: &'static str; + fn parse(ext: &Extension, endian: Endian) -> Result; +} + +pub struct IntegerInfo { + version: (i32, i32, i32), + machine_code: i32, + floating_point_rep: i32, + compression_code: i32, + endianness: i32, + character_code: i32, +} + +impl ExtensionRecord for IntegerInfo { + const SIZE: Option = Some(4); + const COUNT: Option = Some(8); + const NAME: &'static str = "integer record"; + + fn parse(ext: &Extension, endian: Endian) -> Result{ + ext.check_size::()?; + + let mut input = &ext.data[..]; + let data: Vec = (0..8) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(IntegerInfo { + version: (data[0], data[1], data[2]), + machine_code: data[3], + floating_point_rep: data[4], + compression_code: data[5], + endianness: data[6], + character_code: data[7] + }) + } +} + +pub struct FloatInfo { + sysmis: f64, + highest: f64, + lowest: f64, +} + +impl ExtensionRecord for FloatInfo { + const SIZE: Option = Some(8); + const COUNT: Option = Some(3); + const NAME: &'static str = "floating point record"; + + fn parse(ext: &Extension, endian: Endian) -> Result{ + ext.check_size::()?; + + let mut input = &ext.data[..]; + let data: Vec = (0..3) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(FloatInfo { + sysmis: data[0], + highest: data[1], + lowest: data[2], + }) + } +} + pub struct Extension { /// Offset from the start of the file to the start of the record. 
pub offset: u64, @@ -805,6 +870,30 @@ fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { */ impl Extension { + fn check_size(&self) -> Result<(), Error> { + if let Some(expected_size) = E::SIZE { + if self.size != expected_size { + return Err(Error::BadRecordSize { + offset: self.offset, + record: E::NAME.into(), + size: self.size, + expected_size, + }); + } + } + if let Some(expected_count) = E::COUNT { + if self.count != expected_count { + return Err(Error::BadRecordCount { + offset: self.offset, + record: E::NAME.into(), + count: self.count, + expected_count, + }); + } + } + Ok(()) + } + fn read(r: &mut R, endian: Endian) -> Result { let subtype = endian.parse(read_bytes(r)?); let offset = r.stream_position()?; @@ -895,10 +984,7 @@ pub struct ZBlock { } impl ZBlock { - fn read( - r: &mut R, - endian: Endian, - ) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { Ok(ZBlock { uncompressed_ofs: endian.parse(read_bytes(r)?), compressed_ofs: endian.parse(read_bytes(r)?), @@ -933,8 +1019,8 @@ impl ZTrailer { }); } let blocks = (0..n_blocks) - .map(|_| ZBlock::read(reader, endian)) - .collect::, _>>()?; + .map(|_| ZBlock::read(reader, endian)) + .collect::, _>>()?; reader.seek(SeekFrom::Start(start_offset))?; Ok(Some(ZTrailer { offset: ztrailer_ofs, diff --git a/src/data/sys-file-encoding.py b/src/data/sys-file-encoding.py index 7d14545490..0f88a04fff 100644 --- a/src/data/sys-file-encoding.py +++ b/src/data/sys-file-encoding.py @@ -139,7 +139,7 @@ struct sys_encoding sys_codepage_number_to_name[] = {""") for cpnumber, value in sorted(codepages.items()): source = max(value.keys()) - name = value[source][0] + name = value[source] print(' { %s, "%s" },' % (cpnumber, name)) print(""" { 0, NULL } }; diff --git a/utilities/pspp-dump-sav.c b/utilities/pspp-dump-sav.c index d08999e01b..3268575459 100644 --- a/utilities/pspp-dump-sav.c +++ b/utilities/pspp-dump-sav.c @@ -65,6 +65,9 @@ struct sfm_reader double bias; }; + static int 
character_code; +static char *encoding; + static void read_header (struct sfm_reader *); static void read_variable_record (struct sfm_reader *); static void read_value_label_record (struct sfm_reader *); @@ -229,6 +232,7 @@ main (int argc, char *argv[]) (long long int) ftello (r.file), (long long int) ftello (r.file) + 4); + printf ("Character Encoding: %s (%d)\n", encoding ? encoding : "none", character_code); if (r.compression == COMP_SIMPLE) { if (max_cases > 0) @@ -695,7 +699,7 @@ read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count) int float_representation = read_int (r); int compression_code = read_int (r); int integer_representation = read_int (r); - int character_code = read_int (r); + character_code = read_int (r); printf ("%08llx: machine integer info\n", offset); if (size != 4 || count != 8) @@ -1065,13 +1069,9 @@ read_datafile_attributes (struct sfm_reader *r, size_t size, size_t count) static void read_character_encoding (struct sfm_reader *r, size_t size, size_t count) { - long long int posn = ftello (r->file); - char *encoding = xcalloc (size, count + 1); + encoding = xcalloc (size, count + 1); read_string (r, encoding, count + 1); - printf ("%08llx: Character Encoding: %s\n", posn, encoding); - - free (encoding); } static void -- 2.30.2