use crate::locale_charset::locale_charset;
use encoding_rs::{Encoding, UTF_8};
+use thiserror::Error as ThisError;
include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
/// Returns the code page number corresponding to `encoding`, or `None` if
/// unknown.
-pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
+pub fn codepage_from_encoding_name(encoding: &str) -> Option<u32> {
CODEPAGE_NAME_TO_NUMBER
.get(encoding.to_ascii_lowercase().as_str())
.copied()
}
-use thiserror::Error as ThisError;
+/// Looks up the code page number assigned to `encoding`.
+///
+/// # Panics
+///
+/// Panics if the name of `encoding` has no entry in the code page table.
+pub fn codepage_from_encoding(encoding: &'static Encoding) -> u32 {
+    let name = encoding.name();
+    let codepage = codepage_from_encoding_name(name);
+    // A #[test] runs this `unwrap()` against all the actual [Encoding]s, so
+    // it is not expected to fail.
+    codepage.unwrap()
+}
/// An error or warning related to encodings.
#[derive(Clone, ThisError, Debug, PartialEq, Eq)]
Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
}
+
+#[cfg(test)]
+mod tests {
+    use crate::sys::encoding::codepage_from_encoding;
+
+    /// Checks that [codepage_from_encoding] finds a code page (that is, does
+    /// not panic) for each of the `encoding_rs` encodings listed below.
+    #[test]
+    fn codepages() {
+        let encodings = [
+            encoding_rs::BIG5,
+            encoding_rs::EUC_JP,
+            encoding_rs::EUC_KR,
+            encoding_rs::GB18030,
+            encoding_rs::GBK,
+            encoding_rs::IBM866,
+            encoding_rs::ISO_2022_JP,
+            encoding_rs::ISO_8859_2,
+            encoding_rs::ISO_8859_3,
+            encoding_rs::ISO_8859_4,
+            encoding_rs::ISO_8859_5,
+            encoding_rs::ISO_8859_6,
+            encoding_rs::ISO_8859_7,
+            encoding_rs::ISO_8859_8,
+            encoding_rs::ISO_8859_8_I,
+            encoding_rs::ISO_8859_10,
+            encoding_rs::ISO_8859_13,
+            encoding_rs::ISO_8859_14,
+            encoding_rs::ISO_8859_15,
+            encoding_rs::ISO_8859_16,
+            encoding_rs::KOI8_R,
+            encoding_rs::KOI8_U,
+            encoding_rs::MACINTOSH,
+            encoding_rs::REPLACEMENT,
+            encoding_rs::SHIFT_JIS,
+            encoding_rs::UTF_8,
+            encoding_rs::UTF_16BE,
+            encoding_rs::UTF_16LE,
+            encoding_rs::WINDOWS_874,
+            encoding_rs::WINDOWS_1250,
+            encoding_rs::WINDOWS_1251,
+            encoding_rs::WINDOWS_1252,
+            encoding_rs::WINDOWS_1253,
+            encoding_rs::WINDOWS_1254,
+            encoding_rs::WINDOWS_1255,
+            encoding_rs::WINDOWS_1256,
+            encoding_rs::WINDOWS_1257,
+            encoding_rs::WINDOWS_1258,
+            encoding_rs::X_MAC_CYRILLIC,
+            encoding_rs::X_USER_DEFINED,
+        ];
+        for encoding in encodings {
+            codepage_from_encoding(encoding);
+        }
+    }
+}
}
}
-/// [Format](crate::format::Format) as represented in a system file.
+/// [Format] as represented in a system file.
#[derive(Copy, Clone, PartialEq, Eq, Hash, BinRead, BinWrite)]
pub struct RawFormat(
/// The most-significant 16 bits are the type, the next 8 bytes are the
/// File offsets occupied by the record.
pub offsets: Range<u64>,
+ /// Details.
+ pub inner: RawIntegerInfoRecord,
+}
+
+/// Machine integer info record in [binrw] format.
+#[derive(Clone, Debug, BinRead, BinWrite)]
+pub struct RawIntegerInfoRecord {
/// Version number.
///
/// e.g. `(1,2,3)` for version 1.2.3.
pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
ext.check_size(Some(4), Some(8), "integer record")?;
- let mut input = &ext.data[..];
- let data: Vec<i32> = (0..8)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
+ let inner =
+ RawIntegerInfoRecord::read_options(&mut Cursor::new(ext.data.as_slice()), endian, ())
+ .unwrap();
Ok(Record::IntegerInfo(IntegerInfoRecord {
offsets: ext.offsets.clone(),
- version: (data[0], data[1], data[2]),
- machine_code: data[3],
- floating_point_rep: data[4],
- compression_code: data[5],
- endianness: data[6],
- character_code: data[7],
+ inner,
}))
}
}
pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
ext.check_size(Some(8), Some(3), "floating point record")?;
- let mut input = &ext.data[..];
- let data: Vec<f64> = (0..3)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::FloatInfo(FloatInfoRecord {
- sysmis: data[0],
- highest: data[1],
- lowest: data[2],
- }))
+ let data = FloatInfoRecord::read_options(&mut Cursor::new(ext.data.as_slice()), endian, ())
+ .unwrap();
+ Ok(Record::FloatInfo(data))
}
}
/// A floating-point info record.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, BinRead, BinWrite)]
pub struct FloatInfoRecord {
/// Value used for system-missing values.
pub sysmis: f64,
)
}
Some((b'E', input)) => {
- let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
- (CategoryLabels::CountedValues, rest)
+ let (use_var_label_as_mrset_label, input) = if let Some(rest) =
+ input.strip_prefix(b" 1 ")
+ {
+ (false, rest)
} else if let Some(rest) = input.strip_prefix(b" 11 ") {
- (CategoryLabels::VarLabels, rest)
+ (true, rest)
} else {
return Err(MultipleResponseWarning::InvalidMultipleDichotomyLabelType.into());
};
let (value, input) = parse_counted_string(input)?;
(
- MultipleResponseType::MultipleDichotomy { value, labels },
+ MultipleResponseType::MultipleDichotomy {
+ value,
+ labels: CategoryLabels::CountedValues {
+ use_var_label_as_mrset_label,
+ },
+ },
input,
)
}
#![allow(dead_code, missing_docs)]
use core::f64;
use std::{
+ borrow::Cow,
collections::HashMap,
- io::{Seek, Write},
+ fmt::Write as _,
+ io::{Cursor, Seek, Write},
};
use binrw::{BinWrite, Endian, Error as BinError};
use crate::{
data::Datum,
- dictionary::{Dictionary, ValueLabels, VarWidth},
- format::Format,
+ dictionary::{
+ Alignment, Attributes, CategoryLabels, Dictionary, Measure, MultipleResponseType,
+ ValueLabels, VarWidth,
+ },
+ format::{DisplayPlainF64, Format},
identifier::Identifier,
output::spv::Zeros,
- sys::raw::{
- records::{Compression, RawFormat, RawHeader, RawVariableRecord},
- Magic,
+ sys::{
+ encoding::codepage_from_encoding,
+ raw::{
+ records::{
+ Compression, FloatInfoRecord, RawFormat, RawHeader, RawIntegerInfoRecord,
+ RawVariableRecord,
+ },
+ Magic,
+ },
},
};
dictionary: &'a Dictionary,
}
+/// Totals the number of segments over every [CaseVar] in `case_vars`.
+///
+/// The sum is accumulated as a `usize` and then narrowed to `u32` with `as`.
+fn count_segments(case_vars: &[CaseVar]) -> u32 {
+    let total: usize = case_vars
+        .iter()
+        .map(|case_var| case_var.n_segments())
+        .sum();
+    total as u32
+}
+
+/// Appends a textual rendering of `attributes` to `s`.
+///
+/// Each attribute is written as its name, then `(`, then each of its values
+/// wrapped in single quotes and followed by a newline, then `)`.
+fn put_attributes(attributes: &Attributes, s: &mut String) {
+    for (name, values) in attributes.iter(true) {
+        write!(s, "{name}").unwrap();
+        s.push('(');
+        for value in values {
+            s.push('\'');
+            write!(s, "{value}").unwrap();
+            s.push_str("'\n");
+        }
+        s.push(')');
+    }
+}
+
impl<'a, W> DictionaryWriter<'a, W>
where
W: Write + Seek,
pub fn write(&mut self) -> Result<(), BinError> {
self.write_header()?;
self.write_variables()?;
- self.write_value_labels()
+ self.write_value_labels()?;
+ self.write_integer_record()?;
+ self.write_float_record()?;
+ self.write_var_sets()?;
+ self.write_mrsets(true)?;
+ self.write_variable_display_parameters()?;
+ self.write_long_variable_names()?;
+ self.write_very_long_strings()?;
+ self.write_long_string_value_labels()?;
+ self.write_long_string_missing_values()?;
+ self.write_data_file_attributes()?;
+ self.write_variable_attributes()?;
+ self.write_mrsets(false)?;
+ self.write_encoding()?;
+ (999u32, 0u32).write_le(self.writer)
}
fn write_header(&mut self) -> Result<(), BinError> {
bytes.try_into().unwrap()
}
- fn count_segments(case_vars: &[CaseVar]) -> u32 {
- case_vars.iter().map(CaseVar::n_segments).sum::<usize>() as u32
- }
-
let now = Local::now();
let header = RawHeader {
magic: if self.compression == Some(Compression::ZLib) {
if !variable.width.is_long_string() {
if let Some(range) = variable.missing_values.range() {
(
- range.low().unwrap_or(-f64::MAX),
+ range.low().unwrap_or(f64::MIN),
range.high().unwrap_or(f64::MAX),
)
.write_le(self.writer)?;
// Variable record.
(4u32, variables.len() as u32, variables).write_le(self.writer)?;
}
- todo!()
+ Ok(())
}
pub fn write_documents(&mut self) -> Result<(), BinError> {
}
Ok(())
}
+
+    /// Returns the crate version, taken from `CARGO_PKG_VERSION` at compile
+    /// time, as a `(first, second, third)` triple of integers.
+    ///
+    /// The version string must consist of three decimal integers separated by
+    /// dots, optionally followed by something that starts with `-` or `+`;
+    /// any other shape trips one of the assertions below.
+    const fn version() -> (i32, i32, i32) {
+        /// Parses the run of ASCII digits at the start of `s` as a decimal
+        /// number, returning it with the unparsed remainder.  At least one
+        /// digit must be present.
+        const fn parse_integer(mut s: &[u8]) -> (i32, &[u8]) {
+            let mut value = 0;
+            let mut n_digits = 0;
+            loop {
+                let Some((c, rest)) = s.split_first() else {
+                    break;
+                };
+                if !c.is_ascii_digit() {
+                    break;
+                }
+                value = value * 10 + (*c - b'0') as i32;
+                n_digits += 1;
+                s = rest;
+            }
+            assert!(n_digits > 0);
+            (value, s)
+        }
+
+        /// Drops the `.` that must start `s` and returns what follows it.
+        const fn skip_dot(s: &[u8]) -> &[u8] {
+            match s.split_first() {
+                Some((c, rest)) => {
+                    assert!(*c == b'.');
+                    rest
+                }
+                None => unreachable!(),
+            }
+        }
+
+        let bytes = env!("CARGO_PKG_VERSION").as_bytes();
+        let (first, bytes) = parse_integer(bytes);
+        let (second, bytes) = parse_integer(skip_dot(bytes));
+        let (third, bytes) = parse_integer(skip_dot(bytes));
+        // Only the end of the string, or a suffix introduced by `-` or `+`,
+        // may follow the third number.
+        assert!(matches!(bytes.first(), None | Some(b'-' | b'+')));
+        (first, second, third)
+    }
+
+    /// Writes the integer info record: the four `u32` header values 7, 3, 4
+    /// and 8, followed by a [RawIntegerInfoRecord], all in little-endian
+    /// order.
+    ///
+    /// The record carries this crate's version and the code page number of
+    /// the dictionary's encoding; the remaining fields are fixed constants.
+    fn write_integer_record(&mut self) -> Result<(), BinError> {
+        // We always write files in little-endian.
+        let endianness = 2;
+        let character_code = codepage_from_encoding(self.dictionary.encoding) as i32;
+        let record = RawIntegerInfoRecord {
+            version: Self::version(),
+            machine_code: -1,
+            floating_point_rep: 1,
+            compression_code: 1,
+            endianness,
+            character_code,
+        };
+        (7u32, 3u32, 4u32, 8u32, record).write_le(self.writer)
+    }
+
+    /// Writes the floating-point info record: the four `u32` header values 7,
+    /// 4, 8 and 3, followed by a [FloatInfoRecord], all in little-endian
+    /// order.
+    ///
+    /// The system-missing value is `f64::MIN`, the highest value is
+    /// `f64::MAX`, and the lowest value is the next representable number
+    /// above `f64::MIN`.
+    fn write_float_record(&mut self) -> Result<(), BinError> {
+        let record = FloatInfoRecord {
+            sysmis: f64::MIN,
+            highest: f64::MAX,
+            lowest: f64::MIN.next_up(),
+        };
+        (7u32, 4u32, 8u32, 3u32, record).write_le(self.writer)
+    }
+
+    /// Writes the dictionary's variable sets as a string record with subtype
+    /// 5.
+    ///
+    /// Each set becomes one line: the set's name, `= `, and then the names of
+    /// its member variables separated by single spaces.
+    fn write_var_sets(&mut self) -> Result<(), BinError> {
+        let mut text = String::new();
+        for set in &self.dictionary.variable_sets {
+            let members = set
+                .variables
+                .iter()
+                .map(|variable| self.dictionary.variables[*variable].name.to_string())
+                .collect::<Vec<_>>()
+                .join(" ");
+            writeln!(&mut text, "{}= {}", set.name, members).unwrap();
+        }
+        self.write_string_record(5, &text)
+    }
+
+    /// Writes the dictionary's multiple response sets as a single extension
+    /// record, one set per line.
+    ///
+    /// If `pre_v14` is true, writes only sets supported by SPSS before release
+    /// 14 (as record subtype 7), otherwise writes sets supported only by later
+    /// versions (as record subtype 19).  No record is written if no set
+    /// qualifies.
+    ///
+    /// Each line has the form `NAME=TYPE LABEL VAR...`.  `TYPE` is `C` for a
+    /// multiple category set, `D` plus a counted value, or `E 1 `/`E 11 `
+    /// plus a counted value.  A counted string (the value and `LABEL`) is its
+    /// length in bytes, a space, and then the bytes themselves.
+    fn write_mrsets(&mut self, pre_v14: bool) -> Result<(), BinError> {
+        let mut output = Vec::new();
+        for set in self
+            .dictionary
+            .mrsets
+            .iter()
+            .filter(|set| set.mr_type.supported_before_v14() == pre_v14)
+        {
+            output.extend_from_slice(&self.dictionary.encoding.encode(&set.name).0[..]);
+            output.push(b'=');
+            match &set.mr_type {
+                MultipleResponseType::MultipleDichotomy { datum, labels } => {
+                    // The `E` leaders end with a space: the label-type flag
+                    // (`1` or `11`) must be separated from the length of the
+                    // counted value that follows, otherwise the two numbers
+                    // run together (`E 1` + `3 abc` would read `E 13 abc`).
+                    // `D` is followed directly by the counted value.
+                    let leader = match labels {
+                        CategoryLabels::VarLabels => b"D".as_slice(),
+                        CategoryLabels::CountedValues {
+                            use_var_label_as_mrset_label: true,
+                        } => b"E 11 ".as_slice(),
+                        CategoryLabels::CountedValues {
+                            use_var_label_as_mrset_label: false,
+                        } => b"E 1 ".as_slice(),
+                    };
+                    output.extend_from_slice(leader);
+
+                    let mut value = match datum {
+                        Datum::Number(Some(number)) => {
+                            DisplayPlainF64(*number).to_string().into_bytes()
+                        }
+                        Datum::Number(None) => vec![b'.'],
+                        Datum::String(raw_string) => raw_string.0.clone(),
+                    };
+                    write!(&mut output, "{} ", value.len()).unwrap();
+                    output.append(&mut value);
+                }
+                MultipleResponseType::MultipleCategory => output.push(b'C'),
+            }
+
+            // A single space separates the set type (and its counted value,
+            // if any) from the counted label.
+            output.push(b' ');
+
+            let label = if set.mr_type.label_from_var_label() {
+                Cow::from(&[])
+            } else {
+                self.dictionary.encoding.encode(&set.label).0
+            };
+            write!(&mut output, "{} ", label.len()).unwrap();
+            output.extend_from_slice(&label[..]);
+
+            for variable in set.variables.iter().copied() {
+                // Only lowercase ASCII characters because other characters
+                // might expand upon lowercasing.
+                let short_name = self.short_names[variable][0].as_str().to_ascii_lowercase();
+                output.push(b' ');
+                output.extend_from_slice(&self.dictionary.encoding.encode(&short_name).0);
+            }
+            output.push(b'\n');
+        }
+        self.write_bytes_record(if pre_v14 { 7 } else { 19 }, &output)
+    }
+
+    /// Writes the variable display parameters record (extension subtype 11).
+    ///
+    /// After the header, one `(measure, display width, alignment)` triple is
+    /// written for every segment of every variable in the dictionary.  Only
+    /// a variable's first segment carries the variable's own display width;
+    /// later segments use the default display width for the segment.
+    fn write_variable_display_parameters(&mut self) -> Result<(), BinError> {
+        // Three values are written per segment.
+        let n_values = count_segments(&self.case_vars) * 3;
+        (7u32, 11u32, 4u32, n_values).write_le(self.writer)?;
+
+        for variable in &self.dictionary.variables {
+            let measure = match variable.measure {
+                Some(Measure::Nominal) => 1,
+                Some(Measure::Ordinal) => 2,
+                Some(Measure::Scale) => 3,
+                None => 0,
+            };
+            let alignment = match variable.alignment {
+                Alignment::Left => 0,
+                Alignment::Right => 1,
+                Alignment::Center => 2,
+            };
+            for (index, segment) in SegmentWidths::new(variable.width).enumerate() {
+                let display_width = if index == 0 {
+                    variable.display_width
+                } else {
+                    segment.default_display_width()
+                };
+                (measure, display_width, alignment).write_le(self.writer)?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Writes the long variable names record (string record subtype 13),
+    /// unless the file is being written as [Version::V2], in which case
+    /// nothing is written.
+    ///
+    /// The record holds one `SHORT=LONG` pair per variable, with the pairs
+    /// separated by tabs, where `SHORT` is the variable's first short name.
+    fn write_long_variable_names(&mut self) -> Result<(), BinError> {
+        if self.version == Version::V2 {
+            return Ok(());
+        }
+
+        let pairs = self
+            .dictionary
+            .variables
+            .iter()
+            .enumerate()
+            .map(|(index, variable)| {
+                format!("{}={}", &self.short_names[index][0], variable.name)
+            })
+            .collect::<Vec<_>>()
+            .join("\t");
+        self.write_string_record(13, &pairs)
+    }
+
+    /// Writes the very long string record (string record subtype 14).
+    ///
+    /// For each variable whose width is "very long", the record holds the
+    /// variable's first short name, `=`, the string width zero-padded to at
+    /// least five digits, a NUL byte, and a tab.
+    fn write_very_long_strings(&mut self) -> Result<(), BinError> {
+        let mut text = String::new();
+        let very_long = self
+            .dictionary
+            .variables
+            .iter()
+            .enumerate()
+            .filter(|(_, variable)| variable.width.is_very_long());
+        for (index, variable) in very_long {
+            let short_name = &self.short_names[index][0];
+            let width = variable.width.as_string_width().unwrap();
+            write!(&mut text, "{}={width:05}\0\t", short_name).unwrap();
+        }
+        self.write_string_record(14, &text)
+    }
+
+    /// Writes the value labels of long string variables as a bytes record
+    /// with subtype 21.
+    ///
+    /// For each long string variable that has value labels, the record holds
+    /// the length of the encoded variable name, the name, the variable's
+    /// string width and the number of labels, followed by a (value length,
+    /// value, label length, label) group per label.  All integers are
+    /// little-endian `u32`s.  Variables that are not long strings, or that
+    /// have no value labels, are skipped.
+    fn write_long_string_value_labels(&mut self) -> Result<(), BinError> {
+        let mut body = Vec::new();
+        let mut cursor = Cursor::new(&mut body);
+        for variable in &self.dictionary.variables {
+            // Skip just this variable.  Stopping the loop here would also
+            // drop the labels of every qualifying variable that comes later
+            // in the dictionary.
+            if variable.value_labels.is_empty() || !variable.width.is_long_string() {
+                continue;
+            }
+            let name = self.dictionary.encoding.encode(&variable.name).0;
+            (
+                name.len() as u32,
+                &name[..],
+                variable.width.as_string_width().unwrap() as u32,
+                variable.value_labels.0.len() as u32,
+            )
+                .write_le(&mut cursor)?;
+
+            for (value, label) in &variable.value_labels.0 {
+                let value = value.as_string().unwrap();
+                let label = self.dictionary.encoding.encode(label).0;
+                (
+                    value.len() as u32,
+                    value.as_bytes(),
+                    label.len() as u32,
+                    &label[..],
+                )
+                    .write_le(&mut cursor)?;
+            }
+        }
+        self.write_bytes_record(21, &body)
+    }
+
+    /// Writes the missing values of long string variables as a bytes record
+    /// with subtype 22.
+    ///
+    /// For each long string variable that has missing values, the record
+    /// holds the length of the encoded variable name (`u32`), the name, the
+    /// number of missing values (a single byte), the length of each value
+    /// (`u32`, always 8), and then the first 8 bytes of each missing value.
+    /// Variables that are not long strings, or that have no missing values,
+    /// are skipped.
+    fn write_long_string_missing_values(&mut self) -> Result<(), BinError> {
+        let mut body = Vec::new();
+        let mut cursor = Cursor::new(&mut body);
+        for variable in &self.dictionary.variables {
+            // Skip just this variable.  Stopping the loop here would also
+            // drop the missing values of every qualifying variable that
+            // comes later in the dictionary.
+            if variable.missing_values.is_empty() || !variable.width.is_long_string() {
+                continue;
+            }
+            let name = self.dictionary.encoding.encode(&variable.name).0;
+            (
+                name.len() as u32,
+                &name[..],
+                // Unlike the other counts in this record, the system file
+                // format stores the number of missing values in one byte.
+                variable.missing_values.values().len() as u8,
+                8u32,
+            )
+                .write_le(&mut cursor)?;
+
+            for value in variable.missing_values.values() {
+                let value = value.as_string().unwrap();
+                value.0[..8].write_le(&mut cursor)?;
+            }
+        }
+        self.write_bytes_record(22, &body)
+    }
+
+    /// Writes the dictionary's own attributes as a string record with
+    /// subtype 17.
+    ///
+    /// Nothing is written for [Version::V2] files, which matches how
+    /// [Self::write_long_variable_names] omits its record for that version.
+    fn write_data_file_attributes(&mut self) -> Result<(), BinError> {
+        // The guard used to be `!= Version::V2`, which emitted this record
+        // only for version 2 files, the reverse of the long variable names
+        // record.
+        if self.version == Version::V2 {
+            return Ok(());
+        }
+        let mut s = String::new();
+        put_attributes(&self.dictionary.attributes, &mut s);
+        self.write_string_record(17, &s)
+    }
+
+    /// Writes every variable's attributes as a string record with subtype
+    /// 18.
+    ///
+    /// The record holds one `NAME:ATTRIBUTES` entry per variable, with the
+    /// entries separated by `/`.  Each variable's role is included as an
+    /// extra `$@Role` attribute.  Nothing is written for [Version::V2] files,
+    /// which matches how [Self::write_long_variable_names] omits its record
+    /// for that version.
+    fn write_variable_attributes(&mut self) -> Result<(), BinError> {
+        // The guard used to be `!= Version::V2`, which emitted this record
+        // only for version 2 files, the reverse of the long variable names
+        // record.
+        if self.version == Version::V2 {
+            return Ok(());
+        }
+        let mut s = String::new();
+        for (index, variable) in self.dictionary.variables.iter().enumerate() {
+            let mut attributes = variable.attributes.clone();
+            attributes.0.insert(
+                Identifier::new("$@Role").unwrap(),
+                vec![i32::from(variable.role).to_string()],
+            );
+
+            if index > 0 {
+                s.push('/');
+            }
+            // Name the variable that the attributes belong to; without this
+            // prefix a reader cannot tell which variable each set is for.
+            write!(&mut s, "{}:", variable.name).unwrap();
+            put_attributes(&attributes, &mut s);
+        }
+        self.write_string_record(18, &s)
+    }
+
+    /// Writes the name of the dictionary's encoding as a string record with
+    /// subtype 20.
+    fn write_encoding(&mut self) -> Result<(), BinError> {
+        let encoding_name = self.dictionary.encoding.name();
+        self.write_string_record(20, encoding_name)
+    }
+
+    /// Writes `bytes` as an extension record with the given `subtype`: the
+    /// `u32` values 7, `subtype`, 1 and the length of `bytes`, followed by
+    /// `bytes` itself, in little-endian order.
+    ///
+    /// Writes nothing at all if `bytes` is empty.
+    fn write_bytes_record(&mut self, subtype: u32, bytes: &[u8]) -> Result<(), BinError> {
+        if bytes.is_empty() {
+            return Ok(());
+        }
+        let length = bytes.len() as u32;
+        (7u32, subtype, 1u32, length, bytes).write_le(self.writer)
+    }
+
+    /// Encodes `s` in the dictionary's encoding and writes the result with
+    /// [Self::write_bytes_record] under the given `subtype`.
+    fn write_string_record(&mut self, subtype: u32, s: &str) -> Result<(), BinError> {
+        let encoded = self.dictionary.encoding.encode(s).0;
+        self.write_bytes_record(subtype, &encoded)
+    }
}
#[derive(BinWrite)]
_: (),
) -> binrw::BinResult<()> {
match self {
- Datum::Number(number) => number
- .unwrap_or(-f64::MAX)
- .write_options(writer, endian, ()),
+ Datum::Number(number) => number.unwrap_or(f64::MIN).write_options(writer, endian, ()),
Datum::String(raw_string) => raw_string.0.write_options(writer, endian, ()),
}
}