}
}
- pub fn push(&mut self, child: impl Into<Category>) {
+ /// Appends `child` to this group's children and returns `self.len - 1` as
+ /// computed after the child's length has been added.
+ ///
+ /// If `child` converts into a [Category::Group], its `show_label` flag is
+ /// forced to `true` before it is stored.
+ ///
+ /// NOTE(review): the return value is presumably the index of the last leaf
+ /// contributed by `child`; if `self.len` is still 0 after the addition
+ /// (nothing pushed so far has any length), `self.len - 1` would underflow
+ /// -- confirm that callers never hit this.
+ pub fn push(&mut self, child: impl Into<Category>) -> usize {
let mut child = child.into();
if let Category::Group(group) = &mut child {
group.show_label = true;
}
self.len += child.len();
self.children.push(child);
+ self.len - 1
}
pub fn with(mut self, child: impl Into<Category>) -> Self {
}
}
+impl From<String> for Category {
+    /// Builds a leaf category whose value is `name` passed through
+    /// [Value::new_text].
+    fn from(name: String) -> Self {
+        let label = Value::new_text(name);
+        Category::Leaf(Leaf::new(label))
+    }
+}
+
+impl From<&String> for Category {
+    /// Builds a leaf category whose value is the borrowed `name` passed
+    /// through [Value::new_text].
+    fn from(name: &String) -> Self {
+        let label = Value::new_text(name);
+        Category::Leaf(Leaf::new(label))
+    }
+}
+
/// Styling for a pivot table.
///
/// The division between this and the style information in [PivotTable] seems
self
}
- pub fn with_caption(mut self, caption: Value) -> Self {
- self.caption = Some(Box::new(caption));
+ /// Returns `self` with its caption replaced by `caption` (converted into
+ /// a [Value]) and with `show_caption` set to `true`.
+ pub fn with_caption(mut self, caption: impl Into<Value>) -> Self {
+ self.caption = Some(Box::new(caption.into()));
self.show_caption = true;
self
}
- pub fn with_corner_text(mut self, corner_text: Value) -> Self {
- self.corner_text = Some(Box::new(corner_text));
+ /// Returns `self` with its corner text replaced by `corner_text`
+ /// (converted into a [Value]).
+ pub fn with_corner_text(mut self, corner_text: impl Into<Value>) -> Self {
+ self.corner_text = Some(Box::new(corner_text.into()));
self
}
- pub fn with_subtype(self, subtype: Value) -> Self {
+ /// Returns `self` with its subtype replaced by `subtype` (converted into
+ /// a [Value]); every other field is carried over unchanged.
+ pub fn with_subtype(self, subtype: impl Into<Value>) -> Self {
Self {
- subtype: Some(Box::new(subtype)),
+ subtype: Some(Box::new(subtype.into())),
..self
}
}
}
impl PivotTable {
- pub fn new(dimensions_and_axes: impl IntoIterator<Item = (Axis3, Dimension)>) -> Self {
+ pub fn new(axes_and_dimensions: impl IntoIterator<Item = (Axis3, Dimension)>) -> Self {
let mut dimensions = Vec::new();
let mut axes = EnumMap::<Axis3, Axis>::default();
- for (axis, dimension) in dimensions_and_axes {
+ for (axis, dimension) in axes_and_dimensions {
axes[axis].dimensions.push(dimensions.len());
dimensions.push(dimension);
}
//! raw details. Most readers will want to use higher-level interfaces.
use crate::{
- data::{ByteStr, ByteString, Datum, RawCase},
- variable::{VarType, VarWidth},
+ data::{ByteStr, ByteString, Datum, RawCase, RawString},
endian::{FromBytes, ToBytes},
identifier::{Error as IdError, Identifier},
+ output::pivot::{Axis3, Dimension, Group, PivotTable, Value},
sys::{
encoding::{default_encoding, get_encoding, Error as EncodingError},
raw::records::{
ZlibTrailerWarning,
},
},
+ variable::{VarType, VarWidth},
};
use binrw::Endian;
-use encoding_rs::Encoding;
+use encoding_rs::{
+ Encoding, BIG5, EUC_JP, EUC_KR, GB18030, IBM866, ISO_2022_JP, ISO_8859_10, ISO_8859_13,
+ ISO_8859_14, ISO_8859_16, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, ISO_8859_6,
+ ISO_8859_7, ISO_8859_8, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, WINDOWS_1250,
+ WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1255, WINDOWS_1256,
+ WINDOWS_1257, WINDOWS_1258, WINDOWS_874,
+};
use flate2::bufread::ZlibDecoder;
+use indexmap::IndexMap;
+use itertools::{EitherOrBoth, Itertools};
use serde::Serialize;
use smallvec::SmallVec;
use std::{
_ => None,
}
}
+
+    /// Returns the text strings carried by this record, each tagged with a
+    /// title describing its purpose and a flag saying whether it is an
+    /// identifier.
+    ///
+    /// Record types listed in the final match arm contribute no strings, so
+    /// the result is empty for them.
+    pub fn get_strings(&self) -> Vec<RecordString> {
+        let mut found = Vec::new();
+        match self {
+            Record::Variable(var) => {
+                found.push(RecordString::new("Variable Name", &var.name, true));
+                if let Some(label) = &var.label {
+                    found.push(RecordString::new("Variable Label", label, false));
+                }
+                for missing in &var.missing_values.values {
+                    // Only missing values that have a string form are reported.
+                    if let Some(text) = missing.as_string() {
+                        found.push(RecordString::new("Missing Value", text, false));
+                    }
+                }
+            }
+            Record::ValueLabel(value_labels) => {
+                for entry in &value_labels.labels {
+                    found.push(RecordString::new("Value Label", &entry.label, false));
+                }
+            }
+            Record::Document(document) => {
+                // Document lines are numbered starting from 1 in their titles.
+                found.extend(document.lines.iter().zip(1..).map(|(line, number)| {
+                    RecordString::new(format!("Document Line {number}"), line, false)
+                }));
+            }
+            Record::MultipleResponse(multiple_response) => {
+                for set in &multiple_response.sets {
+                    found.push(RecordString::new(
+                        "Multiple Response Set Name",
+                        &set.name,
+                        true,
+                    ));
+                    // Empty labels are omitted.
+                    if !set.label.is_empty() {
+                        found.push(RecordString::new(
+                            "Multiple Response Set Label",
+                            &set.label,
+                            false,
+                        ));
+                    }
+                    // Only multiple-dichotomy sets have a counted value.
+                    if let records::MultipleResponseType::MultipleDichotomy { value, .. } =
+                        &set.mr_type
+                    {
+                        found.push(RecordString::new(
+                            "Multiple Response Set Counted Value",
+                            value,
+                            false,
+                        ));
+                    }
+                }
+            }
+            Record::LongStringValueLabels(long_labels) => {
+                for per_variable in &long_labels.labels {
+                    for (_value, label) in &per_variable.labels {
+                        found.push(RecordString::new("Value Label", label, false));
+                    }
+                }
+            }
+            Record::ProductInfo(product_info) => {
+                found.push(RecordString::new(
+                    "Extra Product Info",
+                    &product_info.0.text,
+                    false,
+                ));
+            }
+            // These record types contribute no strings here.
+            Record::IntegerInfo(_)
+            | Record::FloatInfo(_)
+            | Record::VarDisplay(_)
+            | Record::LongStringMissingValues(_)
+            | Record::Encoding(_)
+            | Record::NumberOfCases(_)
+            | Record::VariableSets(_)
+            | Record::LongNames(_)
+            | Record::VeryLongStrings(_)
+            | Record::FileAttributes(_)
+            | Record::VariableAttributes(_)
+            | Record::OtherExtension(_)
+            | Record::EndOfHeaders(_)
+            | Record::ZHeader(_)
+            | Record::ZTrailer(_) => (),
+        }
+        found
+    }
}
/// A [Record] that has been decoded to a more usable form.
self.types.iter().flatten().count()
}
}
+
+/// A raw (not yet decoded) text string taken from a record, together with a
+/// description of what it is used for.
+pub struct RecordString {
+ /// Description of the string's purpose, e.g. `"Variable Name"`.
+ pub title: String,
+ /// The string's bytes, in the file's (possibly unknown) encoding.
+ pub string: ByteString,
+ /// Whether the string is an identifier.  [EncodingReport::new] only
+ /// accepts an encoding if the decoded form of every such string passes
+ /// [Identifier::check_plausible].
+ pub is_identifier: bool,
+}
+
+impl RecordString {
+    /// Creates a `RecordString` from anything convertible into a title and
+    /// a byte string, plus the identifier flag.
+    pub fn new(
+        title: impl Into<String>,
+        string: impl Into<ByteString>,
+        is_identifier: bool,
+    ) -> Self {
+        let title = title.into();
+        let string = string.into();
+        RecordString {
+            title,
+            string,
+            is_identifier,
+        }
+    }
+}
+
+/// Candidate encodings that [EncodingReport::new] tries, in this order, on
+/// the strings it is given.
+///
+/// The declared array length must match the number of entries below.
+static ENCODINGS: [&Encoding; 32] = [
+ UTF_8,
+ WINDOWS_1252,
+ ISO_8859_2,
+ ISO_8859_3,
+ ISO_8859_4,
+ ISO_8859_5,
+ ISO_8859_6,
+ ISO_8859_7,
+ ISO_8859_8,
+ ISO_8859_10,
+ ISO_8859_13,
+ ISO_8859_14,
+ ISO_8859_16,
+ MACINTOSH,
+ WINDOWS_874,
+ WINDOWS_1250,
+ WINDOWS_1251,
+ WINDOWS_1253,
+ WINDOWS_1254,
+ WINDOWS_1255,
+ WINDOWS_1256,
+ WINDOWS_1257,
+ WINDOWS_1258,
+ KOI8_R,
+ KOI8_U,
+ IBM866,
+ GB18030,
+ BIG5,
+ EUC_JP,
+ ISO_2022_JP,
+ SHIFT_JIS,
+ EUC_KR,
+];
+
+/// Result of trying every candidate encoding on a set of [RecordString]s.
+pub struct EncodingReport {
+ /// Table with one row per distinct interpretation of the strings, listing
+ /// the names of the encodings that produce that interpretation.
+ pub valid_encodings: PivotTable,
+ /// Table of the strings that the valid encodings interpret differently,
+ /// or `None` if there are no such strings.
+ pub interpretations: Option<PivotTable>,
+}
+
+impl EncodingReport {
+    /// Reports which of the candidate [ENCODINGS] can decode every string in
+    /// `record_strings`, and how their interpretations differ.
+    ///
+    /// An encoding is valid if it decodes every string without error and the
+    /// decoded form of every string flagged as an identifier passes
+    /// [Identifier::check_plausible].  Encodings that produce identical text
+    /// for all of the strings are grouped into a single interpretation.
+    ///
+    /// Returns `None` if no candidate encoding is valid.  Otherwise,
+    /// `valid_encodings` has one row per distinct interpretation and
+    /// `interpretations` is present only if at least one string decodes
+    /// differently under different interpretations; it shows just the part
+    /// of each such string that differs, with `...` marking an elided common
+    /// prefix or suffix.
+    pub fn new(record_strings: &[RecordString]) -> Option<Self> {
+        /// Decodes all of `record_strings` with `encoding`, dropping trailing
+        /// whitespace from each result.  Returns `None` if any string fails
+        /// to decode or any identifier is implausible.
+        fn recode_as(
+            record_strings: &[RecordString],
+            encoding: &'static Encoding,
+        ) -> Option<Vec<String>> {
+            let mut output = Vec::with_capacity(record_strings.len());
+            for rs in record_strings {
+                let mut s = encoding
+                    .decode_without_bom_handling_and_without_replacement(&rs.string.0)?
+                    .into_owned();
+                s.truncate(s.trim_end().len());
+                if rs.is_identifier {
+                    Identifier::check_plausible(&s).ok()?;
+                }
+                output.push(s);
+            }
+            Some(output)
+        }
+
+        /// Returns an iterator for the decoded strings for the given
+        /// `index`.
+        fn decoded_index<'a>(
+            encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
+            index: usize,
+        ) -> impl Iterator<Item = &'a str> {
+            encodings.keys().map(move |strings| strings[index].as_str())
+        }
+
+        // Maps each distinct interpretation of all the strings to the
+        // encodings that produce it, in the order the encodings were tried.
+        let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
+        for encoding in ENCODINGS {
+            if let Some(strings) = recode_as(record_strings, encoding) {
+                encodings.entry(strings).or_default().push(encoding);
+            }
+        }
+        if encodings.is_empty() {
+            return None;
+        }
+
+        let numbers = Group::new("#").with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
+        let valid_encodings = PivotTable::new([(Axis3::Y, Dimension::new(numbers))]).with_data(
+            encodings
+                .values()
+                .map(|encodings| {
+                    Value::new_user_text(encodings.iter().map(|e| e.name()).join(", "))
+                })
+                .enumerate()
+                .map(|(index, datum)| ([index], datum)),
+        );
+
+        let mut purposes = Group::new("Purpose").with_label_shown();
+        let mut data = Vec::new();
+        for (index, rs) in record_strings.iter().enumerate() {
+            // Skip strings that decode the same way from every encoding.
+            if encodings.keys().map(|strings| &strings[index]).all_equal() {
+                continue;
+            }
+
+            // Length in bytes of the prefix shared by every interpretation,
+            // cut back to just after its last space so that only whole words
+            // are elided.
+            let prefix_len = decoded_index(&encodings, index)
+                .reduce(common_prefix)
+                .expect("at least one encoding is valid")
+                .trim_end_matches(|c| c != ' ')
+                .len();
+            // Likewise for the shared suffix, cut forward to its first space.
+            let suffix_len = decoded_index(&encodings, index)
+                .reduce(common_suffix)
+                .expect("at least one encoding is valid")
+                .trim_start_matches(|c| c != ' ')
+                .len();
+
+            // The prefix and suffix are computed independently, so they can
+            // overlap in the shortest interpretation (e.g. "a a" versus
+            // "a a a" gives a 2-byte prefix and a 2-byte suffix of a 3-byte
+            // string).  Clamp the suffix so that the slice below never starts
+            // past its end, which would panic.  The clamped boundary still
+            // falls on a character boundary in every interpretation because
+            // it lies inside the byte-identical common suffix and coincides
+            // with the end of the prefix in the shortest string.
+            let shortest = decoded_index(&encodings, index)
+                .map(str::len)
+                .min()
+                .expect("at least one encoding is valid");
+            let suffix_len = suffix_len.min(shortest - prefix_len);
+
+            let purpose = purposes.push(&rs.title);
+
+            for (j, s) in decoded_index(&encodings, index).enumerate() {
+                let s = &s[prefix_len..s.len() - suffix_len];
+                // Room for the text plus up to two "..." markers.
+                let mut entry = String::with_capacity(s.len() + 6);
+                if prefix_len > 0 {
+                    entry.push_str("...");
+                }
+                entry.push_str(s);
+                if suffix_len > 0 {
+                    entry.push_str("...");
+                }
+                data.push(([0, j, purpose], Value::new_user_text(entry)));
+            }
+        }
+        let number = Group::new("Text")
+            .with_label_shown()
+            .with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
+        let interpretations = if !data.is_empty() {
+            Some(
+                PivotTable::new([
+                    (Axis3::X, Dimension::new(Group::new("Text").with("Text"))),
+                    (Axis3::Y, Dimension::new(number)),
+                    (Axis3::Y, Dimension::new(purposes)),
+                ])
+                .with_title("Alternate Encoded Text Strings")
+                .with_caption("Text strings in the file dictionary that the previously listed encodings interpret differently, along with the interpretations.")
+                .with_data(data),
+            )
+        } else {
+            None
+        };
+        Some(Self {
+            valid_encodings,
+            interpretations,
+        })
+    }
+}
+
+/// Returns the longest prefix of `a` that is also a prefix of `b`, compared
+/// character by character so the result always ends on a character boundary.
+fn common_prefix<'a>(a: &'a str, b: &'a str) -> &'a str {
+    // Byte offset of the first position where the strings stop agreeing,
+    // either because the characters differ or because one string ran out.
+    let end = a
+        .char_indices()
+        .zip_longest(b.char_indices())
+        .find_map(|pair| match pair {
+            EitherOrBoth::Both((_, a_char), (_, b_char)) if a_char == b_char => None,
+            EitherOrBoth::Both((offset, _), _)
+            | EitherOrBoth::Left((offset, _))
+            | EitherOrBoth::Right((offset, _)) => Some(offset),
+        })
+        .unwrap_or(a.len());
+    &a[..end]
+}
+
+/// Returns the longest suffix shared by `a` and `b`, compared character by
+/// character from the end so the result always starts on a character
+/// boundary.
+///
+/// The returned slice borrows from `b` when `a` is entirely a suffix of a
+/// longer `b`, and from `a` otherwise.
+fn common_suffix<'a>(a: &'a str, b: &'a str) -> &'a str {
+    for pair in a.char_indices().rev().zip_longest(b.char_indices().rev()) {
+        // Identify the last character that is NOT part of the common suffix,
+        // and the string it came from; the suffix starts right after it.
+        let (text, offset, last_excluded) = match pair {
+            EitherOrBoth::Both((_, a_char), (_, b_char)) if a_char == b_char => continue,
+            EitherOrBoth::Both((offset, c), _) | EitherOrBoth::Left((offset, c)) => {
+                (a, offset, c)
+            }
+            EitherOrBoth::Right((offset, c)) => (b, offset, c),
+        };
+        return &text[offset + last_excluded.len_utf8()..];
+    }
+    a
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::sys::raw::{common_prefix, common_suffix};
+
+    /// Checks `common_prefix` on containment, divergence, empty, and
+    /// multi-byte inputs.
+    #[test]
+    fn test_common_prefix() {
+        // One string is a prefix of the other, in either order.
+        assert_eq!(common_prefix("abc", "abcxyzzy"), "abc");
+        assert_eq!(common_prefix("abcxyzzy", "abc"), "abc");
+        assert_eq!(common_prefix("abc", "abc"), "abc");
+        assert_eq!(common_prefix("", ""), "");
+
+        // Strings that diverge partway through, or share nothing.
+        assert_eq!(common_prefix("abcd", "abxd"), "ab");
+        assert_eq!(common_prefix("abc", "xyz"), "");
+        assert_eq!(common_prefix("abc", ""), "");
+        assert_eq!(common_prefix("", "abc"), "");
+
+        // Multi-byte characters: the result must end on a character
+        // boundary even when the differing characters share a lead byte.
+        assert_eq!(common_prefix("héllo", "hélas"), "hél");
+        assert_eq!(common_prefix("é", "è"), "");
+    }
+
+    /// Checks `common_suffix` on containment, divergence, empty, and
+    /// multi-byte inputs.
+    #[test]
+    fn test_common_suffix() {
+        // One string is a suffix of the other, in either order.
+        assert_eq!(common_suffix("xyzzyabc", "abc"), "abc");
+        assert_eq!(common_suffix("abc", "xyzzyabc"), "abc");
+        assert_eq!(common_suffix("abc", "abc"), "abc");
+        assert_eq!(common_suffix("", ""), "");
+
+        // Strings that diverge partway through, or share nothing.
+        assert_eq!(common_suffix("xbc", "abc"), "bc");
+        assert_eq!(common_suffix("abc", "xyz"), "");
+        assert_eq!(common_suffix("abc", ""), "");
+        assert_eq!(common_suffix("", "abc"), "");
+
+        // Multi-byte characters: the result must start on a character
+        // boundary.
+        assert_eq!(common_suffix("naïve", "waïve"), "aïve");
+        assert_eq!(common_suffix("é", "è"), "");
+    }
+}