work on writing system files

author Ben Pfaff <blp@cs.stanford.edu>

Fri, 18 Jul 2025 16:10:56 +0000 (09:10 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Fri, 18 Jul 2025 16:10:56 +0000 (09:10 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Fri, 18 Jul 2025 16:10:56 +0000 (09:10 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Fri, 18 Jul 2025 16:10:56 +0000 (09:10 -0700)
diff --git a/rust/Cargo.lock b/rust/Cargo.lock

index 52e85441602feeb03ca58b72132e3486acca2924..fb413463d9b10fa39919b71afb79106df68daa72 100644 (file)
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -663,12 +663,6 @@ dependencies = [
   "windows-sys 0.59.0",
  ]
  
-[[package]]
-name = "finl_unicode"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94c970b525906eb37d3940083aa65b95e481fc1857d467d13374e1d925cfc163"
-
  [[package]]
  name = "flagset"
  version = "0.4.7"
@@ -1618,7 +1612,6 @@ dependencies = [
   "encoding_rs",
   "enum-iterator",
   "enum-map",
- "finl_unicode",
   "flagset",
   "flate2",
   "hexplay",
@@ -1641,6 +1634,8 @@ dependencies = [
   "thiserror",
   "unicase",
   "unicode-linebreak",
+ "unicode-properties",
+ "unicode-segmentation",
   "unicode-width",
   "windows-sys 0.48.0",
   "xmlwriter",
@@ -2261,6 +2256,18 @@ version = "0.1.5"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f"
  
+[[package]]
+name = "unicode-properties"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0"
+
+[[package]]
+name = "unicode-segmentation"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
+
  [[package]]
  name = "unicode-width"
  version = "0.2.0"
diff --git a/rust/pspp/Cargo.toml b/rust/pspp/Cargo.toml

index 3ad04a1296bfbabec78df53883b70b5d5c9d547e..a4e1e8b97565566e0de869988719de3bb7e6d046 100644 (file)
--- a/rust/pspp/Cargo.toml
+++ b/rust/pspp/Cargo.toml
@@ -49,6 +49,7 @@ aes = "0.8.4"
  readpass = "1.0.3"
  zeroize = "1.8.1"
  unicode-properties = "0.1.3"
+unicode-segmentation = "1.12.0"
  
  [target.'cfg(windows)'.dependencies]
  windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs

index 151e2189876e28c6a496c5c1f7f2090a91f77e79..7e1bb237053f7b4f168637ce592c1dc32f7f1c58 100644 (file)
--- a/rust/pspp/src/dictionary.rs
+++ b/rust/pspp/src/dictionary.rs
@@ -31,6 +31,7 @@ use encoding_rs::Encoding;
  use enum_map::{Enum, EnumMap};
  use indexmap::IndexSet;
  use num::integer::div_ceil;
+use smallvec::SmallVec;
  use thiserror::Error as ThisError;
  use unicase::UniCase;
  
@@ -38,7 +39,9 @@ use crate::{
      data::Datum,
      format::{DisplayPlain, Format},
      identifier::{ByIdentifier, HasIdentifier, Identifier},
-    output::pivot::{Axis3, Dimension, Footnote, Footnotes, Group, PivotTable, Value},
+    output::pivot::{
+        Axis3, Dimension, Display26Adic, Footnote, Footnotes, Group, PivotTable, Value,
+    },
      settings::Show,
  };
  
@@ -168,10 +171,12 @@ impl VarWidth {
          !self.is_numeric()
      }
  
+    /// Returns true if this is a very long string width, meaning wider than 255
+    /// bytes, which was the limit for old versions of SPSS.
      pub fn is_very_long(&self) -> bool {
          match *self {
              VarWidth::Numeric => false,
-            VarWidth::String(width) => width >= 256,
+            VarWidth::String(width) => width > 255,
          }
      }
  
@@ -549,6 +554,110 @@ impl Dictionary {
  
          (group, values)
      }
+
+    pub fn short_names(&self) -> Vec<SmallVec<[Identifier; 1]>> {
+        fn pick_short_name(
+            variable_name: &Identifier,
+            used_names: &mut HashSet<Identifier>,
+            encoding: &'static Encoding,
+        ) -> Identifier {
+            for index in 0.. {
+                let name = if index == 0 {
+                    variable_name.shortened(encoding)
+                } else {
+                    variable_name
+                        .with_suffix(
+                            &format!("_{}", Display26Adic::new_uppercase(index)),
+                            encoding,
+                            8,
+                        )
+                        .or_else(|_| {
+                            Identifier::new(format!("V{}", Display26Adic::new_uppercase(index)))
+                        })
+                        .unwrap()
+                };
+                if !used_names.contains(&name) {
+                    used_names.insert(name.clone());
+                    return name;
+                }
+            }
+            unreachable!()
+        }
+
+        let mut used_names = HashSet::new();
+
+        // Each variable whose name is short has the best claim to its short
+        // name.
+        let mut short_names: Vec<SmallVec<[Option<Identifier>; 1]>> = self
+            .variables
+            .iter()
+            .map(|variable| {
+                let n = variable.width.n_segments();
+                let mut names = SmallVec::with_capacity(n);
+                if self.encoding.encode(variable.name.as_str()).0.len() <= 8 {
+                    used_names.insert(variable.name.clone());
+                    names.push(Some(variable.name.clone()))
+                }
+                while names.len() < n {
+                    names.push(None);
+                }
+                names
+            })
+            .collect();
+
+        // Each variable with an assigned short name for its first segment now
+        // gets it unless there is a conflict.  In case of conflict, the
+        // claimant earlier in dictionary order wins.  Then similarly for
+        // additional segments of very long strings.
+        for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) {
+            if short_names[0].is_none()
+                && let Some(short_name) = variable.short_names.first()
+                && !used_names.contains(&short_name)
+            {
+                used_names.insert(short_name.clone());
+                short_names[0] = Some(short_name.clone());
+            }
+        }
+        for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) {
+            for (index, assigned_short_name) in short_names.iter_mut().enumerate().skip(1) {
+                if assigned_short_name.is_none()
+                    && let Some(short_name) = variable.short_names.get(index)
+                    && !used_names.contains(&short_name)
+                {
+                    used_names.insert(short_name.clone());
+                    *assigned_short_name = Some(short_name.clone());
+                }
+            }
+        }
+
+        // Assign short names to first segment of remaining variables,
+        // then similarly for additional segments.
+        for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) {
+            if short_names[0].is_none() {
+                short_names[0] = Some(pick_short_name(
+                    &variable.name,
+                    &mut used_names,
+                    self.encoding,
+                ));
+            }
+        }
+        for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) {
+            for assigned_short_name in short_names.iter_mut().skip(1) {
+                if assigned_short_name.is_none() {
+                    *assigned_short_name = Some(pick_short_name(
+                        &variable.name,
+                        &mut used_names,
+                        self.encoding,
+                    ));
+                }
+            }
+        }
+
+        short_names
+            .into_iter()
+            .map(|names| names.into_iter().flatten().collect())
+            .collect()
+    }
  }
  
  pub struct OutputVariables<'a> {
@@ -1553,19 +1662,24 @@ pub enum CategoryLabels {
  mod test {
      use std::collections::HashSet;
  
+    use encoding_rs::{UTF_8, WINDOWS_1252};
+    use smallvec::SmallVec;
      use unicase::UniCase;
  
-    use crate::identifier::Identifier;
+    use crate::{
+        dictionary::{Dictionary, VarWidth, Variable},
+        identifier::Identifier,
+    };
  
      use super::{ByIdentifier, HasIdentifier};
  
      #[derive(PartialEq, Eq, Debug, Clone)]
-    struct Variable {
+    struct SimpleVar {
          name: Identifier,
          value: i32,
      }
  
-    impl HasIdentifier for Variable {
+    impl HasIdentifier for SimpleVar {
          fn identifier(&self) -> &UniCase<String> {
              &self.name.0
          }
@@ -1575,11 +1689,11 @@ mod test {
      fn test() {
          // Variables should not be the same if their values differ.
          let abcd = Identifier::new("abcd").unwrap();
-        let abcd1 = Variable {
+        let abcd1 = SimpleVar {
              name: abcd.clone(),
              value: 1,
          };
-        let abcd2 = Variable {
+        let abcd2 = SimpleVar {
              name: abcd,
              value: 2,
          };
@@ -1591,7 +1705,7 @@ mod test {
          assert_eq!(abcd1_by_name, abcd2_by_name);
  
          // And a `HashSet` of `ByName` should also treat them the same.
-        let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
+        let mut vars: HashSet<ByIdentifier<SimpleVar>> = HashSet::new();
          assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
          assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
          assert_eq!(
@@ -1602,4 +1716,93 @@ mod test {
              1
          );
      }
+
+    #[test]
+    fn short_names() {
+        for (variables, expected, encoding) in [
+            (
+                [("VariableName1", 1), ("VARIABLE", 1), ("VariableName2", 1)],
+                vec![vec!["Variab_A"], vec!["VARIABLE"], vec!["Variab_B"]],
+                UTF_8,
+            ),
+            (
+                [
+                    ("LongVarNameA", 1),
+                    ("LongVarNameB", 1),
+                    ("LongVarNameC", 1),
+                ],
+                vec![vec!["LongVarN"], vec!["LongVa_A"], vec!["LongVa_B"]],
+                UTF_8,
+            ),
+            (
+                [
+                    ("LongVarNameA", 300),
+                    ("LongVarNameB", 1),
+                    ("LongVarNameC", 1),
+                ],
+                vec![
+                    vec!["LongVarN", "LongVa_C"],
+                    vec!["LongVa_A"],
+                    vec!["LongVa_B"],
+                ],
+                UTF_8,
+            ),
+            (
+                [
+                    // The accented letters are 2 bytes and the katakana is 3
+                    // bytes in UTF-8.
+                    ("éèäスîVarNameA", 300),
+                    ("éèäスVarNameB", 1),
+                    ("éèäîVarNameC", 1),
+                ],
+                vec![vec!["éèä", "éèä_B"], vec!["éèä_A"], vec!["éèäî"]],
+                UTF_8,
+            ),
+            (
+                [
+                    // This version uses `e` followed by a combining acute accent
+                    // in the first name.
+                    ("e\u{301}èäスîVarNameA", 300),
+                    ("éèäスVarNameB", 1),
+                    ("éèäîVarNameC", 1),
+                ],
+                vec![vec!["e\u{301}èä", "e\u{301}è_A"], vec!["éèä"], vec!["éèäî"]],
+                UTF_8,
+            ),
+            (
+                [
+                    // The accented letters are only 1 byte in windows-1252.
+                    ("éèäîVarNameA", 300),
+                    ("éèäîVarNameB", 1),
+                    ("éèäîVarNameC", 1),
+                ],
+                vec![
+                    vec!["éèäîVarN", "éèäîVa_C"],
+                    vec!["éèäîVa_A"],
+                    vec!["éèäîVa_B"],
+                ],
+                WINDOWS_1252,
+            ),
+        ] {
+            let mut dict = Dictionary::new(encoding);
+            for (name, width) in variables {
+                dict.add_var(Variable::new(
+                    Identifier::new(name).unwrap(),
+                    VarWidth::String(width),
+                    encoding,
+                ))
+                .unwrap();
+            }
+            let expected = expected
+                .into_iter()
+                .map(|names| {
+                    names
+                        .into_iter()
+                        .map(|name| Identifier::new(name).unwrap())
+                        .collect::<SmallVec<[_; 1]>>()
+                })
+                .collect::<Vec<_>>();
+            assert_eq!(expected, dict.short_names());
+        }
+    }
  }
diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs

index d46db2d8ee125caa0c940ef2ec9ff6e2f3eed7c9..7ddbbbaee7643a8287e5cd4ca4a21ed400d46dd6 100644 (file)
--- a/rust/pspp/src/format/mod.rs
+++ b/rust/pspp/src/format/mod.rs
@@ -28,8 +28,8 @@ use thiserror::Error as ThisError;
  use unicode_width::UnicodeWidthStr;
  
  use crate::{
-    data::RawString,
      data::Datum,
+    data::RawString,
      dictionary::{VarType, VarWidth},
      sys::raw,
  };
diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs

index 823ba6e9718d0696eb40d91779e262f0f3808b91..b2565d6a543be64f32ceb06577572f938917b603 100644 (file)
--- a/rust/pspp/src/identifier.rs
+++ b/rust/pspp/src/identifier.rs
@@ -22,10 +22,11 @@ use std::{
      ops::{Deref, DerefMut},
  };
  
-use encoding_rs::{EncoderResult, Encoding, UTF_8};
+use encoding_rs::{CoderResult, Encoder, EncoderResult, Encoding, UTF_8};
  use thiserror::Error as ThisError;
  use unicase::UniCase;
  use unicode_properties::UnicodeGeneralCategory;
+use unicode_segmentation::UnicodeSegmentation;
  
  #[derive(Clone, Copy, Debug, Eq, PartialEq)]
  pub enum Class {
@@ -209,6 +210,12 @@ impl Identifier {
      /// encoding used by the dictionary, not in UTF-8.
      pub const MAX_LEN: usize = 64;
  
+    fn new_unchecked(s: impl Into<UniCase<String>>) -> Self {
+        let s: UniCase<String> = s.into();
+        debug_assert!(Self::check_plausible(&s).is_ok());
+        Identifier(s)
+    }
+
      pub fn new(s: impl Into<UniCase<String>>) -> Result<Self, Error> {
          Self::from_encoding(s, UTF_8)
      }
@@ -218,7 +225,7 @@ impl Identifier {
          encoding: &'static Encoding,
      ) -> Result<Identifier, Error> {
          let s: UniCase<String> = s.into();
-        Self::is_plausible(&s)?;
+        Self::check_plausible(&s)?;
          let identifier = Identifier(s);
          identifier.check_encoding(encoding)?;
          Ok(identifier)
@@ -260,7 +267,7 @@ impl Identifier {
          }*/
          Ok(())
      }
-    pub fn is_plausible(s: &str) -> Result<(), Error> {
+    pub fn check_plausible(s: &str) -> Result<(), Error> {
          if s.is_empty() {
              return Err(Error::Empty);
          }
@@ -330,6 +337,111 @@ impl Identifier {
      pub fn as_str(&self) -> &str {
          self.0.as_ref()
      }
+
+    /// Returns this identifier truncated to at most 8 bytes in `encoding`.
+    pub fn shortened(&self, encoding: &'static Encoding) -> Self {
+        let new_len = shortened_len(self, "", encoding, 8);
+        Self::new_unchecked(self.0[..new_len].to_string())
+    }
+
+    /// Returns a prefix of this identifier concatenated with all of `suffix`,
+    /// including as many grapheme clusters from the beginning of this
+    /// identifier as would fit within `max_len` bytes if the resulting string
+    /// were to be re-encoded in `encoding`.
+    ///
+    /// `max_len` would ordinarily be 64, since that's the maximum length of an
+    /// identifier, but a value of 8 is appropriate for short variable names.
+    ///
+    /// This function fails if adding or using `suffix` produces an invalid
+    /// [Identifier], for example if `max_len` is short enough that none of the
+    /// identifier can be included and `suffix` begins with `'_'` or another
+    /// character that may not appear at the beginning of an identifier.
+    ///
+    /// # Examples
+    ///
+    /// Simple examples for UTF-8 `encoding` with `max_len` of 6:
+    ///
+    /// ```text
+    /// identifier="abc",  suffix="xyz"     => "abcxyz"
+    /// identifier="abcd", suffix="xyz"     => "abcxyz"
+    /// identifier="abc",  suffix="uvwxyz"  => "uvwxyz"
+    /// identifier="abc",  suffix="tuvwxyz" => "tuvwxyz"
+    /// ```
+    ///
+    /// Examples for windows-1252 `encoding` with `max_len` of 6:
+    ///
+    /// ```text
+    /// identifier="éèä",  suffix="xyz"    => "éèäxyz"
+    /// ```
+    ///
+    /// (each letter in the identifier is only 1 byte in windows-1252 even
+    /// though they each take 2 bytes in UTF-8)
+    pub fn with_suffix(
+        &self,
+        suffix: &str,
+        encoding: &'static Encoding,
+        max_len: usize,
+    ) -> Result<Self, Error> {
+        let prefix_len = shortened_len(self, suffix, encoding, max_len);
+        if prefix_len == 0 {
+            Self::new(suffix)
+        } else {
+            Self::new(format!("{}{suffix}", &self[..prefix_len]))
+        }
+    }
+}
+
+fn encode_fully(encoder: &mut Encoder, mut src: &str, dst: &mut Vec<u8>, last: bool) {
+    while let (CoderResult::OutputFull, read, _) = encoder.encode_from_utf8_to_vec(src, dst, last) {
+        src = &src[read..];
+        dst.reserve((dst.capacity() * 2) - dst.len());
+    }
+}
+
+fn shortened_len(prefix: &str, suffix: &str, encoding: &'static Encoding, max_len: usize) -> usize {
+    assert!(max_len <= 64);
+    if encoding == UTF_8 {
+        if prefix.len() + suffix.len() <= max_len {
+            prefix.len()
+        } else if suffix.len() >= max_len {
+            0
+        } else {
+            let mut copy_len = 0;
+            for (cluster_start, cluster) in prefix.grapheme_indices(true) {
+                let cluster_end = cluster_start + cluster.len();
+                if cluster_end > max_len - suffix.len() {
+                    break;
+                }
+                copy_len = cluster_end;
+            }
+            copy_len
+        }
+    } else {
+        let mut copy_len = 0;
+        let mut tmp = Vec::with_capacity(max_len);
+        for (cluster_start, cluster) in prefix.grapheme_indices(true) {
+            let cluster_end = cluster_start + cluster.len();
+            let mut encoder = encoding.new_encoder();
+            tmp.clear();
+            encode_fully(&mut encoder, &prefix[..cluster_end], &mut tmp, false);
+            if tmp.len() <= max_len {
+                encode_fully(&mut encoder, suffix, &mut tmp, true);
+            }
+            if tmp.len() > max_len {
+                break;
+            }
+            copy_len = cluster_end;
+        }
+        copy_len
+    }
+}
+
+impl Deref for Identifier {
+    type Target = UniCase<String>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
  }
  
  impl PartialEq<str> for Identifier {
@@ -490,3 +602,40 @@ where
          &mut self.0
      }
  }
+
+#[cfg(test)]
+mod tests {
+    use encoding_rs::{UTF_8, WINDOWS_1252};
+
+    use crate::identifier::Identifier;
+
+    #[test]
+    fn with_suffix() {
+        for (head, suffix, encoding, max_len, expected) in [
+            ("abc", "xyz", UTF_8, 6, "abcxyz"),
+            ("abcd", "xyz", UTF_8, 6, "abcxyz"),
+            ("abcd", "uvwxyz", UTF_8, 6, "uvwxyz"),
+            ("abc", "tuvwxyz", UTF_8, 6, "tuvwxyz"),
+            ("éèä", "xyz", UTF_8, 6, "éxyz"),
+            ("éèä", "xyz", WINDOWS_1252, 6, "éèäxyz"),
+        ] {
+            let head = Identifier::new(head).unwrap();
+            let suffix = Identifier::new(suffix).unwrap();
+            let actual = head.with_suffix(&suffix, encoding, max_len).unwrap();
+            assert_eq!(&actual, expected);
+        }
+    }
+
+    #[test]
+    fn shortened() {
+        for (long, expected_short, encoding) in [
+            ("abc", "abc", UTF_8),
+            ("éèäîVarNameA", "éèäî", UTF_8),
+            ("éèäîVarNameA", "éèäîVarN", WINDOWS_1252),
+        ] {
+            let long = Identifier::new(long).unwrap();
+            let short = long.shortened(encoding);
+            assert_eq!(&short, expected_short);
+        }
+    }
+}
diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs

index 5d94390b30f1d41d8a26bb096e25e0ee07a5d471..2f670a572cf5fc187643effedcca536379b6161b 100644 (file)
--- a/rust/pspp/src/output/pivot/mod.rs
+++ b/rust/pspp/src/output/pivot/mod.rs
@@ -1725,23 +1725,42 @@ impl Display for DisplayMarker<'_> {
          } else {
              let i = self.footnote.index + 1;
              match self.options.footnote_marker_type {
-                FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic(i)),
+                FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic::new_lowercase(i)),
                  FootnoteMarkerType::Numeric => write!(f, "{i}"),
              }
          }
      }
  }
  
-pub struct Display26Adic(pub usize);
+/// Displays a number in 26adic notation.
+///
+/// Zero is displayed as the empty string, 1 through 26 as `a` through `z`, 27
+/// through 52 as `aa` through `az`, and so on.
+pub struct Display26Adic {
+    value: usize,
+    base: u8,
+}
+
+impl Display26Adic {
+    /// Constructs a `Display26Adic` for `value`, with letters in lowercase.
+    pub fn new_lowercase(value: usize) -> Self {
+        Self { value, base: b'a' }
+    }
+
+    /// Constructs a `Display26Adic` for `value`, with letters in uppercase.
+    pub fn new_uppercase(value: usize) -> Self {
+        Self { value, base: b'A' }
+    }
+}
  
  impl Display for Display26Adic {
      fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
          let mut output = SmallVec::<[u8; 16]>::new();
-        let mut number = self.0;
+        let mut number = self.value;
          while number > 0 {
              number -= 1;
              let digit = (number % 26) as u8;
-            output.push(digit + b'a');
+            output.push(digit + self.base);
              number /= 26;
          }
          output.reverse();
@@ -1749,6 +1768,29 @@ impl Display for Display26Adic {
      }
  }
  
+#[cfg(test)]
+mod tests {
+    use super::Display26Adic;
+    #[test]
+    fn display_26adic() {
+        for (number, lowercase, uppercase) in [
+            (0, "", ""),
+            (1, "a", "A"),
+            (2, "b", "B"),
+            (26, "z", "Z"),
+            (27, "aa", "AA"),
+            (28, "ab", "AB"),
+            (29, "ac", "AC"),
+            (18278, "zzz", "ZZZ"),
+            (18279, "aaaa", "AAAA"),
+            (19010, "abcd", "ABCD"),
+        ] {
+            assert_eq!(Display26Adic::new_lowercase(number).to_string(), lowercase);
+            assert_eq!(Display26Adic::new_uppercase(number).to_string(), uppercase);
+        }
+    }
+}
+
  /// The content of a single pivot table cell.
  ///
  /// A [Value] is also a pivot table's title, caption, footnote marker and
diff --git a/rust/pspp/src/sys/mod.rs b/rust/pspp/src/sys/mod.rs

index 94e063fcf1eec4545663a5cb7221741cb8b43161..b05eba3ec69f1f44ca0387d8db1efc8187d09fea 100644 (file)
--- a/rust/pspp/src/sys/mod.rs
+++ b/rust/pspp/src/sys/mod.rs
@@ -35,5 +35,7 @@ pub mod raw;
  #[cfg(test)]
  pub mod sack;
  
+mod write;
+
  #[cfg(test)]
  mod test;
diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs

index 41ab01f42feeea91d44a9f59c222a032ee094de3..dca752f034eef228802bf9ae7675db9e2dd41282 100644 (file)
--- a/rust/pspp/src/sys/raw.rs
+++ b/rust/pspp/src/sys/raw.rs
@@ -939,6 +939,16 @@ impl Debug for Magic {
      }
  }
  
+impl From<Magic> for [u8; 4] {
+    fn from(value: Magic) -> Self {
+        match value {
+            Magic::Sav => Magic::SAV,
+            Magic::Zsav => Magic::ZSAV,
+            Magic::Ebcdic => Magic::EBCDIC,
+        }
+    }
+}
+
  impl TryFrom<[u8; 4]> for Magic {
      type Error = ErrorDetails;
  
diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs

index d49437697bb21e9a3471969c08300844aaff626e..7dbe107959d7b9b5e95a404da33cf8dde0505163 100644 (file)
--- a/rust/pspp/src/sys/raw/records.rs
+++ b/rust/pspp/src/sys/raw/records.rs
@@ -25,7 +25,7 @@ use crate::{
      },
  };
  
-use binrw::BinRead;
+use binrw::{BinRead, BinWrite};
  use itertools::Itertools;
  use thiserror::Error as ThisError;
  
@@ -127,6 +127,22 @@ where
      }
  }
  
+#[derive(BinRead, BinWrite)]
+pub struct RawHeader {
+    pub magic: [u8; 4],
+    pub eye_catcher: [u8; 60],
+    pub layout_code: u32,
+    pub nominal_case_size: u32,
+    pub compression_code: u32,
+    pub weight_index: u32,
+    pub n_cases: u32,
+    pub bias: f64,
+    pub creation_date: [u8; 9],
+    pub creation_time: [u8; 8],
+    #[brw(pad_after = 3)]
+    pub file_label: [u8; 64],
+}
+
  impl FileHeader<RawString> {
      /// Reads a header record from `r`, reporting any warnings via `warn`.
      pub fn read<R>(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result<Self, Error>
@@ -150,22 +166,6 @@ impl FileHeader<RawString> {
          header_bytes: &[u8],
          warn: &mut dyn FnMut(Warning),
      ) -> Result<Self, ErrorDetails> {
-        #[derive(BinRead)]
-        struct RawHeader {
-            magic: [u8; 4],
-            eye_catcher: [u8; 60],
-            layout_code: u32,
-            nominal_case_size: u32,
-            compression_code: u32,
-            weight_index: u32,
-            n_cases: u32,
-            bias: f64,
-            creation_date: [u8; 9],
-            creation_time: [u8; 8],
-            file_label: [u8; 64],
-            _padding: [u8; 3],
-        }
-
          if &header_bytes[8..20] == b"ENCRYPTEDSAV" {
              return Err(ErrorDetails::Encrypted);
          }
diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs

new file mode 100644 (file)

index 0000000..e65edfd
--- /dev/null
+++ b/rust/pspp/src/sys/write.rs
@@ -0,0 +1,208 @@
+use std::{
+    io::{Seek, Write},
+    iter::repeat_n,
+};
+
+use binrw::{BinWrite, Error as BinError};
+use chrono::Local;
+use smallvec::SmallVec;
+
+use crate::{
+    dictionary::{Dictionary, VarWidth},
+    sys::raw::{
+        records::{Compression, RawHeader},
+        Magic,
+    },
+};
+
+/// System file format version.
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
+pub enum Version {
+    /// Obsolete version.
+    V2,
+
+    /// Current version.
+    #[default]
+    V3,
+}
+
+#[derive(Copy, Clone, Debug)]
+pub struct WriteOptions {
+    /// How to compress (if at all) data in the system file.
+    pub compression: Option<Compression>,
+
+    /// System file version to write.
+    pub version: Version,
+}
+
+impl Default for WriteOptions {
+    fn default() -> Self {
+        Self {
+            compression: Some(Compression::Simple),
+            version: Default::default(),
+        }
+    }
+}
+
+impl WriteOptions {
+    pub fn new() -> Self {
+        Self::default()
+    }
+    pub fn write_writer<W>(
+        self,
+        dictionary: &Dictionary,
+        mut writer: W,
+    ) -> Result<Writer<W>, BinError>
+    where
+        W: Write + Seek,
+    {
+        fn as_byte_array<const N: usize>(s: String) -> [u8; N] {
+            let mut bytes = s.into_bytes();
+            bytes.resize(N, b' ');
+            bytes.try_into().unwrap()
+        }
+
+        fn count_segments(case_vars: &[CaseVar]) -> u32 {
+            case_vars.iter().map(CaseVar::n_segments).sum::<usize>() as u32
+        }
+
+        let case_vars = dictionary
+            .variables
+            .iter()
+            .map(|variable| CaseVar::new(variable.width))
+            .collect::<Vec<_>>();
+
+        let now = Local::now();
+        let header = RawHeader {
+            magic: if self.compression == Some(Compression::ZLib) {
+                Magic::Zsav
+            } else {
+                Magic::Sav
+            }
+            .into(),
+            eye_catcher: {
+                as_byte_array(format!(
+                    "@(#) SPSS DATA FILE GNU pspp (Rust) {}",
+                    env!("CARGO_PKG_VERSION")
+                ))
+            },
+            layout_code: 2,
+            nominal_case_size: count_segments(&case_vars),
+            compression_code: match self.compression {
+                Some(Compression::Simple) => 1,
+                Some(Compression::ZLib) => 2,
+                None => 0,
+            },
+            weight_index: if let Some(weight_index) = dictionary.weight {
+                count_segments(&case_vars[..weight_index]) + 1
+            } else {
+                0
+            },
+            n_cases: u32::MAX,
+            bias: 100.0,
+            creation_date: as_byte_array(now.format("%d %b %Y").to_string()),
+            creation_time: as_byte_array(now.format("%H:%M:%S").to_string()),
+            file_label: as_byte_array(dictionary.file_label.clone().unwrap_or_default()),
+        };
+        header.write_le(&mut writer)?;
+        todo!()
+    }
+}
+
+#[derive(Debug)]
+struct StringSegment {
+    data_bytes: usize,
+    padding_bytes: usize,
+}
+
+fn segment_widths(width: usize) -> impl Iterator<Item = usize> {
+    let n_segments = width.div_ceil(252);
+    repeat_n(255, n_segments - 1)
+        .chain(if n_segments > 1 {
+            std::iter::once(width - (n_segments - 1) * 252)
+        } else {
+            std::iter::once(width)
+        })
+        .map(|w| w.next_multiple_of(8))
+}
+
+enum CaseVar {
+    Numeric,
+    String {
+        width: usize,
+        encoding: SmallVec<[StringSegment; 1]>,
+    },
+}
+
+impl CaseVar {
+    fn new(width: VarWidth) -> Self {
+        match width {
+            VarWidth::Numeric => Self::Numeric,
+            VarWidth::String(width) => {
+                let width = width as usize;
+                let mut encoding = SmallVec::<[StringSegment; 1]>::new();
+                let mut remaining = width;
+                for segment in segment_widths(width) {
+                    let data_bytes = remaining.min(segment).min(255);
+                    let padding_bytes = segment - data_bytes;
+                    if data_bytes > 0 {
+                        encoding.push(StringSegment {
+                            data_bytes,
+                            padding_bytes,
+                        });
+                        remaining -= data_bytes;
+                    } else {
+                        encoding.last_mut().unwrap().padding_bytes += padding_bytes;
+                    }
+                }
+                CaseVar::String { width, encoding }
+            }
+        }
+    }
+
+    fn bytes(&self) -> usize {
+        match self {
+            CaseVar::Numeric => 8,
+            CaseVar::String { width: _, encoding } => encoding
+                .iter()
+                .map(|segment| segment.data_bytes + segment.padding_bytes)
+                .sum(),
+        }
+    }
+
+    fn n_segments(&self) -> usize {
+        match self {
+            CaseVar::Numeric => 1,
+            CaseVar::String { encoding, .. } => encoding.len(),
+        }
+    }
+}
+/*
+/// A variable in a system file.
+struct WriteVar {
+    width: VarWidth,
+    segment_width: u8,
+    case_index: usize,
+
+    /// Offset within string variable in case.
+    offset: usize,
+
+    /// Number of padding bytes following data.
+    padding: usize,
+}
+
+impl WriteVar {
+    fn new_vars(dictionary: &Dictionary) -> Vec<Self> {
+        let mut vars = Vec::new();
+        for dv in &dictionary.variables {
+
+        }
+    }
+}*/
+
+/// System file writer.
+pub struct Writer<W> {
+    inner: W,
+}
+
+impl<W> Writer<W> where W: Write + Seek {}
author	Ben Pfaff <blp@cs.stanford.edu>
	Fri, 18 Jul 2025 16:10:56 +0000 (09:10 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Fri, 18 Jul 2025 16:10:56 +0000 (09:10 -0700)
rust/Cargo.lock		patch \| blob \| history
rust/pspp/Cargo.toml		patch \| blob \| history
rust/pspp/src/dictionary.rs		patch \| blob \| history
rust/pspp/src/format/mod.rs		patch \| blob \| history
rust/pspp/src/identifier.rs		patch \| blob \| history
rust/pspp/src/output/pivot/mod.rs		patch \| blob \| history
rust/pspp/src/sys/mod.rs		patch \| blob \| history
rust/pspp/src/sys/raw.rs		patch \| blob \| history
rust/pspp/src/sys/raw/records.rs		patch \| blob \| history
rust/pspp/src/sys/write.rs	[new file with mode: 0644]	patch \| blob