EBCDIC.
@item 2
-7-bit ASCII.
+7-bit ASCII. Old versions of SPSS for Unix and Windows always wrote
+value 2 in this field, regardless of the encoding in use, so it is not
+reliable and should be ignored.
+
+@item 3
+8-bit ``ASCII''.
+
+@item 819
+ISO 8859-1 (IBM AIX code page number).
+
+@item 874
+@itemx 9066
+The @code{windows-874} code page for Thai.
+
+@item 932
+The @code{windows-932} code page for Japanese.
+
+@item 936
+The @code{windows-936} code page for simplified Chinese.
+
+@item 949
+Probably @code{ks_c_5601-1987}, Unified Hangul Code.
+
+@item 950
+The @code{big5} code page for traditional Chinese.
@item 1250
The @code{windows-1250} code page for Central European and Eastern
European languages.
+@item 1251
+The @code{windows-1251} code page for Cyrillic languages.
+
@item 1252
The @code{windows-1252} code page for Western European languages.
+@item 1253
+The @code{windows-1253} code page for modern Greek.
+
+@item 1254
+The @code{windows-1254} code page for Turkish.
+
+@item 1255
+The @code{windows-1255} code page for Hebrew.
+
+@item 1256
+The @code{windows-1256} code page for Arabic script.
+
+@item 1257
+The @code{windows-1257} code page for Estonian, Latvian, and
+Lithuanian.
+
+@item 1258
+The @code{windows-1258} code page for Vietnamese.
+
+@item 20127
+US-ASCII.
+
@item 28591
-ISO 8859-1.
+ISO 8859-1 (Latin-1).
+
+@item 28592
+ISO 8859-2 (Central European).
+
+@item 28605
+ISO 8859-15 (Latin-9).
+
+@item 51949
+The @code{euc-kr} code page for Korean.
@item 65001
UTF-8.
DEC Kanji.
@end table
+The most common values observed, from most to least common, are 1252,
+65001, 2, and 28591.
+
Other Windows code page numbers are known to be generally valid.
-Old versions of SPSS for Unix and Windows always wrote value 2 in this
-field, regardless of the encoding in use. Newer versions also write
-the character encoding as a string (see @ref{Character Encoding
-Record}).
+Newer versions also write the character encoding as a string (see
+@ref{Character Encoding Record}).
@end table
@node Machine Floating-Point Info Record
@item char encoding[];
The name of the character encoding. Normally this will be an official
-IANA character set name or alias.
-See @url{http://www.iana.org/assignments/character-sets}.
-Character set names are not case-sensitive, but SPSS appears to write
-them in all-uppercase.
+IANA character set name or alias. See
+@url{http://www.iana.org/assignments/character-sets}. Character set
+names are not case-sensitive, and SPSS is not consistent, e.g.@: both
+@code{windows-1252} and @code{WINDOWS-1252} have been observed,
+as have @code{Big5} and @code{BIG5}.
@end table
This record is not present in files generated by older software. See
also the @code{character_code} field in the machine integer info
record (@pxref{character-code}).
-When the character encoding record and the machine integer info record
-are both present, all system files observed in practice indicate the
-same character encoding, e.g.@: 1252 as @code{character_code} and
-@code{windows-1252} as @code{encoding}, 65001 and @code{UTF-8}, etc.
+The following character encoding names have been observed. The names
+are shown in lowercase, even though they were not always in lowercase
+in the file. Alternative names for the same encoding are, when known,
+listed together. For each encoding, the @code{character_code} values
+that they were observed paired with are also listed. First, the
+following are strictly single-byte, ASCII-compatible encodings:
+
+@table @code
+@item @r{(encoding record missing)}
+0, 2, 3, 874, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 20127, 28591,
+28592, 28605
+
+@item ansi_x3.4-1968
+@itemx ascii
+1252
+
+@item cp28605
+2
+
+@item cp874
+9066
+
+@item iso-8859-1
+819
+
+@item windows-874
+874
+
+@item windows-1250
+2, 1250, 1252
+
+@item windows-1251
+2, 1251
+
+@item cp1252
+@itemx windows-1252
+2, 1250, 1252, 1253
+
+@item cp1253
+@itemx windows-1253
+1253
+
+@item windows-1254
+2, 1254
+
+@item windows-1255
+2, 1255
+
+@item windows-1256
+2, 1252, 1256
+
+@item windows-1257
+2, 1257
+
+@item windows-1258
+1258
+@end table
+
+The following are multibyte encodings, in which some code points
+occupy a single byte and others multiple bytes. All of the following
+encode ASCII characters as their native values, but some of them
+(marked as ``not ASCII compatible'') also use ASCII values as second
+or later bytes in multibyte sequences:
+
+@table @code
+@item @r{(encoding record missing)}
+65001, 949 (ASCII compatible) and 932, 936, 950 (not ASCII compatible).
+
+@item big5
+@itemx cp950
+2, 950 (not ASCII compatible)
+
+@item euc-kr
+2, 51949 (ASCII compatible)
+
+@item gbk
+936 (not ASCII compatible)
+
+@item utf-8
+0, 2, 1250, 1251, 1252, 1256, 65001 (ASCII compatible)
+
+@item cp932
+@itemx windows-31j
+932 (not ASCII compatible)
+@end table
+
+As the tables above show, when the character encoding record and the
+machine integer info record are both present, they can contradict each
+other. Observations show that, in this case, the character encoding
+record should be honored.
If, for testing purposes, a file is crafted with different
@code{character_code} and @code{encoding}, it seems that
"windows-sys 0.45.0",
]
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
[[package]]
name = "libc"
version = "0.2.139"
"flate2",
"float_next_after",
"hexplay",
+ "lazy_static",
"num",
"num-derive",
"num-traits",
flate2 = "1.0.26"
float_next_after = "1.0.0"
hexplay = "0.2.1"
+lazy_static = "1.4.0"
num = "0.4.0"
num-derive = "0.4.0"
num-traits = "0.2.16"
ordered-float = "3.7.0"
thiserror = "1.0"
+[build-dependencies]
+anyhow = "1.0.69"
+
[[bin]]
name = "pspp-dump-sav"
path = "src/main.rs"
[[test]]
name = "sack"
path = "tests/sack.rs"
-harness = false
\ No newline at end of file
+harness = false
--- /dev/null
+use anyhow::{anyhow, Result as AnyResult};
+use std::{
+ collections::{BTreeMap, HashSet, VecDeque},
+ env::var_os,
+ fs::{read_to_string, File},
+ io::{Error as IoError, Write},
+ path::{Path, PathBuf},
+};
+
+/// Origin of a code page number parsed from a converter alias name:
+/// a `cp`-prefixed, `ibm-`-prefixed, or `windows-`-prefixed alias.
+/// The derived `Ord` makes `Windows` the greatest variant, so code
+/// that picks `value.keys().max()` prefers Windows-derived numbers.
+#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)]
+enum Source {
+    CP,
+    IBM,
+    Windows,
+}
+
+// Code page number.
+type CPNumber = usize;
+
+/// Parses one converter entry (already tokenized into whitespace-split
+/// `fields`) and records any code page numbers it names into
+/// `codepages`.
+///
+/// Each field is an alias name, optionally followed by a
+/// `{`-delimited list of the standards that define that name.  Names
+/// tagged `IANA*` sort first, then other IANA-tagged names, then
+/// names preferred (`*`-suffixed tag) by some other standard, then
+/// the rest; untagged names are dropped as nonstandard.  Aliases of
+/// the form `cpNNN`, `windows-NNN`, or `ibm-NNN` yield a code page
+/// number for the corresponding `Source`.
+fn process_converter<'a>(
+    fields: &Vec<&'a str>,
+    codepages: &mut BTreeMap<CPNumber, BTreeMap<Source, Vec<&'a str>>>,
+) {
+    // Skip empty entries; presumably `fields[0] == "{"` guards against
+    // a stray standards list -- TODO confirm against the input format.
+    if fields.is_empty() || fields[0] == "{" {
+        return;
+    }
+
+    // Code page number discovered for each alias-prefix source.
+    let mut cps: BTreeMap<Source, CPNumber> = BTreeMap::new();
+    // IANA-tagged names, preferred name first.
+    let mut iana = VecDeque::new();
+    // Names tagged only by other standards, preferred names first.
+    let mut other = VecDeque::new();
+
+    let mut iter = fields.iter().peekable();
+    while let Some(&name) = iter.next() {
+        if iter.next_if(|&&s| s == "{").is_some() {
+            // Collect the standard tags listed between `{` and `}`.
+            let mut standards = HashSet::new();
+            loop {
+                let &standard = iter.next().expect("missing `}` in list of standards");
+                if standard == "}" {
+                    break;
+                }
+                standards.insert(standard);
+            }
+
+            // A trailing `*` marks a standard's preferred name for
+            // this converter, so such names go to the front.
+            if standards.contains("IANA*") {
+                iana.push_front(name);
+            } else if standards.contains("IANA") {
+                iana.push_back(name);
+            } else if standards.iter().any(|&s| s.ends_with('*')) {
+                other.push_front(name);
+            } else {
+                other.push_back(name);
+            }
+        } else {
+            // Untagged names are completely nonstandard.
+            continue;
+        }
+
+        // Derive code page numbers from conventional alias prefixes.
+        if let Some(number) = name.strip_prefix("cp") {
+            if let Ok(number) = number.parse::<CPNumber>() {
+                cps.insert(Source::CP, number);
+            }
+        }
+
+        if let Some(number) = name.strip_prefix("windows-") {
+            if let Ok(number) = number.parse::<CPNumber>() {
+                cps.insert(Source::Windows, number);
+            }
+        }
+
+        if let Some(number) = name.strip_prefix("ibm-") {
+            if let Ok(number) = number.parse::<CPNumber>() {
+                cps.insert(Source::IBM, number);
+            }
+        }
+    }
+
+    // If there are no tagged names then this is completely nonstandard.
+    if iana.is_empty() && other.is_empty() {
+        return;
+    }
+
+    // Record the full name list (IANA names first) under each number.
+    let all: Vec<&str> = iana.into_iter().chain(other.into_iter()).collect();
+    for (source, number) in cps {
+        codepages
+            .entry(number)
+            .or_insert_with(BTreeMap::new)
+            .insert(source, all.clone());
+    }
+}
+
+/// Writes the generated `encodings.rs` source to `file_name`: a
+/// `lazy_static` map from code page number to the best-known encoding
+/// name, choosing the name list from the greatest `Source`
+/// (i.e. preferring `windows-` aliases, per `Source`'s `Ord`).
+///
+/// NOTE(review): `file_name: &PathBuf` could be `&Path` per Rust API
+/// convention -- callers would be unaffected by deref coercion.
+fn write_output(
+    codepages: &BTreeMap<CPNumber, BTreeMap<Source, Vec<&str>>>,
+    file_name: &PathBuf,
+) -> Result<(), IoError> {
+    let mut file = File::create(file_name)?;
+
+    // Header of the generated file; the map is populated below.
+    write!(file, "{}", "\
+use lazy_static::lazy_static;
+use std::collections::HashMap;
+
+lazy_static! {
+    static ref CODEPAGE_NUMBER_TO_NAME: HashMap<u32, &'static str> = {
+        let mut map = HashMap::new();
+")?;
+
+    for (&cpnumber, value) in codepages.iter() {
+        // Prefer the greatest Source (Windows > IBM > CP) and take its
+        // best-ranked name.
+        let source = value.keys().max().unwrap();
+        let name = value[source][0];
+        writeln!(file, "        map.insert({cpnumber}, \"{name}\");")?;
+    }
+    write!(file, "{}", "\
+        map
+    };
+}
+")?;
+
+    // Invert the mapping: name -> source -> code page numbers.
+    let mut names: BTreeMap<&str, BTreeMap<Source, Vec<CPNumber>>> = BTreeMap::new();
+    for (&cpnumber, value) in codepages.iter() {
+        for (&source, value2) in value.iter() {
+            for &name in value2.iter() {
+                names
+                    .entry(name)
+                    .or_insert_with(BTreeMap::new)
+                    .entry(source)
+                    .or_insert_with(Vec::new)
+                    .push(cpnumber);
+            }
+        }
+    }
+
+    // NOTE(review): this loop prints C-style initializer lines to
+    // *stdout*, not to `file`, and `break` exits after the first
+    // (greatest-Source) entry.  It looks like leftover debug output
+    // from a prototype generator -- confirm whether it can be removed
+    // or should be writing into the generated file instead.
+    for (&name, value) in names.iter() {
+        for (_source, numbers) in value.iter().rev() {
+            println!("    {{ {}, \"{name}\" }},", numbers[0]);
+            break;
+        }
+    }
+
+    Ok(())
+}
+
+/// Build-script entry point: parses the converter alias table at
+/// `../src/data/convrtrs.txt` and generates `$OUT_DIR/encodings.rs`.
+fn main() -> AnyResult<()> {
+    // Re-run the build script when it or its input file changes.
+    println!("cargo:rerun-if-changed=build.rs");
+
+    let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("../src/data/convrtrs.txt");
+    println!("cargo:rerun-if-changed={}", input_file.to_string_lossy());
+    let input = read_to_string(&input_file)
+        .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?;
+
+    let mut codepages: BTreeMap<CPNumber, BTreeMap<Source, Vec<&str>>> = BTreeMap::new();
+    let mut converter: Vec<&str> = Vec::new();
+    for line in input.lines() {
+        // Strip `#` comments and trailing whitespace.
+        let line = line
+            .find('#')
+            .map(|position| &line[..position])
+            .unwrap_or(line)
+            .trim_end();
+        // An unindented line starts a new converter entry; indented
+        // lines continue the current one, so flush before starting.
+        if !line.starts_with(&[' ', '\t']) {
+            process_converter(&converter, &mut codepages);
+            converter.clear();
+        }
+        converter.extend(line.split_whitespace());
+    }
+    // Flush the final entry (the loop only flushes on a new header).
+    process_converter(&converter, &mut codepages);
+
+    let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs");
+
+    write_output(&codepages, &output_file_name)
+        .map_err(|e| anyhow!("{}: write failed ({e})", output_file_name.to_string_lossy()))?;
+
+    Ok(())
+}
--- /dev/null
+include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
pub mod endian;
pub mod raw;
+pub mod cooked;
pub mod sack;
+pub mod encoding;
#[derive(ThisError, Debug)]
pub enum Error {
#[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
BadDocumentLength { offset: u64, n: u32, max: u32 },
- #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")]
+ #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
BadRecordType { offset: u64, rec_type: u32 },
#[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
expected_n_blocks: u64,
ztrailer_len: u64,
},
+
+ #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
+ BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 },
+
+ #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
+ BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 },
}
}
*/
+/// Common interface for parsing a typed system-file extension record
+/// out of a raw `Extension`.
+trait ExtensionRecord where Self: Sized {
+    /// Expected per-element size in bytes, or `None` to skip the check.
+    const SIZE: Option<u32>;
+    /// Expected element count, or `None` to skip the check.
+    const COUNT: Option<u32>;
+    /// Human-readable record name used in error messages.
+    const NAME: &'static str;
+    /// Parses the record from `ext`'s raw data using byte order `endian`.
+    fn parse(ext: &Extension, endian: Endian) -> Result<Self, Error>;
+}
+
+/// Machine integer info record: eight 32-bit integers describing the
+/// machine that wrote the file (version triple, machine code,
+/// floating-point representation, compression code, endianness, and
+/// character code).
+pub struct IntegerInfo {
+    version: (i32, i32, i32),
+    machine_code: i32,
+    floating_point_rep: i32,
+    compression_code: i32,
+    endianness: i32,
+    character_code: i32,
+}
+
+impl ExtensionRecord for IntegerInfo {
+    // Eight 4-byte elements.
+    const SIZE: Option<u32> = Some(4);
+    const COUNT: Option<u32> = Some(8);
+    const NAME: &'static str = "integer record";
+
+    fn parse(ext: &Extension, endian: Endian) -> Result<Self, Error>{
+        // Reject records whose declared size/count don't match.
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        // NOTE(review): the `unwrap` assumes `ext.data` holds at least
+        // 8 * 4 bytes -- presumably guaranteed by how `Extension` is
+        // read, since check_size only validates the header fields;
+        // confirm.
+        let data: Vec<i32> = (0..8)
+            .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
+            .collect();
+        Ok(IntegerInfo {
+            version: (data[0], data[1], data[2]),
+            machine_code: data[3],
+            floating_point_rep: data[4],
+            compression_code: data[5],
+            endianness: data[6],
+            character_code: data[7]
+        })
+    }
+}
+
+/// Machine floating-point info record: the three special floating
+/// point values used by the writing machine (system-missing value,
+/// highest value, lowest value).
+pub struct FloatInfo {
+    sysmis: f64,
+    highest: f64,
+    lowest: f64,
+}
+
+impl ExtensionRecord for FloatInfo {
+    // Three 8-byte elements.
+    const SIZE: Option<u32> = Some(8);
+    const COUNT: Option<u32> = Some(3);
+    const NAME: &'static str = "floating point record";
+
+    fn parse(ext: &Extension, endian: Endian) -> Result<Self, Error>{
+        // Reject records whose declared size/count don't match.
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        // NOTE(review): the `unwrap` assumes `ext.data` holds at least
+        // 3 * 8 bytes; check_size only validates the header fields --
+        // confirm.
+        let data: Vec<f64> = (0..3)
+            .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
+            .collect();
+        Ok(FloatInfo {
+            sysmis: data[0],
+            highest: data[1],
+            lowest: data[2],
+        })
+    }
+}
+
pub struct Extension {
/// Offset from the start of the file to the start of the record.
pub offset: u64,
*/
impl Extension {
+    /// Checks this extension record's declared element `size` and
+    /// `count` against the expectations of record type `E`, returning
+    /// `BadRecordSize` or `BadRecordCount` on mismatch.  A `None`
+    /// expectation skips the corresponding check.
+    fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
+        if let Some(expected_size) = E::SIZE {
+            if self.size != expected_size {
+                return Err(Error::BadRecordSize {
+                    offset: self.offset,
+                    record: E::NAME.into(),
+                    size: self.size,
+                    expected_size,
+                });
+            }
+        }
+        if let Some(expected_count) = E::COUNT {
+            if self.count != expected_count {
+                return Err(Error::BadRecordCount {
+                    offset: self.offset,
+                    record: E::NAME.into(),
+                    count: self.count,
+                    expected_count,
+                });
+            }
+        }
+        Ok(())
+    }
+
fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
let subtype = endian.parse(read_bytes(r)?);
let offset = r.stream_position()?;
}
impl ZBlock {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- ) -> Result<ZBlock, Error> {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
Ok(ZBlock {
uncompressed_ofs: endian.parse(read_bytes(r)?),
compressed_ofs: endian.parse(read_bytes(r)?),
});
}
let blocks = (0..n_blocks)
- .map(|_| ZBlock::read(reader, endian))
- .collect::<Result<Vec<_>, _>>()?;
+ .map(|_| ZBlock::read(reader, endian))
+ .collect::<Result<Vec<_>, _>>()?;
reader.seek(SeekFrom::Start(start_offset))?;
Ok(Some(ZTrailer {
offset: ztrailer_ofs,
for cpnumber, value in sorted(codepages.items()):
source = max(value.keys())
- name = value[source][0]
+ name = value[source]
print(' { %s, "%s" },' % (cpnumber, name))
print(""" { 0, NULL }
};
double bias;
};
+ static int character_code;
+static char *encoding;
+
static void read_header (struct sfm_reader *);
static void read_variable_record (struct sfm_reader *);
static void read_value_label_record (struct sfm_reader *);
(long long int) ftello (r.file),
(long long int) ftello (r.file) + 4);
+ printf ("Character Encoding: %s (%d)\n", encoding ? encoding : "none", character_code);
if (r.compression == COMP_SIMPLE)
{
if (max_cases > 0)
int float_representation = read_int (r);
int compression_code = read_int (r);
int integer_representation = read_int (r);
- int character_code = read_int (r);
+ character_code = read_int (r);
printf ("%08llx: machine integer info\n", offset);
if (size != 4 || count != 8)
static void
read_character_encoding (struct sfm_reader *r, size_t size, size_t count)
{
- long long int posn = ftello (r->file);
- char *encoding = xcalloc (size, count + 1);
+ encoding = xcalloc (size, count + 1);
read_string (r, encoding, count + 1);
- printf ("%08llx: Character Encoding: %s\n", posn, encoding);
-
- free (encoding);
}
static void