From: Ben Pfaff Date: Wed, 3 Sep 2025 18:24:28 +0000 (-0700) Subject: rust: Rename `mod.rs` files to reflect module names. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9b569ae35f22c5cfee071a6a0a8f47d25d761fdb;p=pspp rust: Rename `mod.rs` files to reflect module names. This is the more modern Rust style of file naming, which avoids having many files named `mod.rs`, reducing confusion in editors. --- diff --git a/rust/pspp/src/command.rs b/rust/pspp/src/command.rs new file mode 100644 index 0000000000..5f0d1ec55b --- /dev/null +++ b/rust/pspp/src/command.rs @@ -0,0 +1,959 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +#![allow(dead_code)] +use std::{ + fmt::{Debug, Write}, + ops::RangeFrom, + sync::OnceLock, +}; + +use crosstabs::crosstabs_command; +use ctables::ctables_command; +use data_list::data_list_command; +use descriptives::descriptives_command; +use either::Either; +use flagset::{flags, FlagSet}; +use pspp_derive::FromTokens; + +use crate::{ + format::AbstractFormat, + identifier::Identifier, + integer::ToInteger, + lex::{ + command_name::CommandMatcher, + lexer::{LexToken, TokenSlice}, + Punct, Token, + }, + message::{Diagnostic, Diagnostics}, +}; + +pub mod crosstabs; +pub mod ctables; +pub mod data_list; +pub mod descriptives; + +flags! 
{ + enum State: u8 { + /// No active dataset yet defined. + Initial, + + /// Active dataset has been defined. + Data, + + /// Inside `INPUT PROGRAM`. + InputProgram, + + /// Inside `FILE TYPE`. + FileType, + + /// State nested inside `LOOP` or `DO IF`, inside [State::Data]. + NestedData, + + /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram]. + NestedInputProgram, + } +} + +struct Command { + allowed_states: FlagSet, + enhanced_only: bool, + testing_only: bool, + no_abbrev: bool, + name: &'static str, + run: Box, //-> Box + Send + Sync>, +} + +#[derive(Debug)] +enum ParseError { + Error(Diagnostics), + Mismatch(Diagnostics), +} + +#[derive(Debug)] +struct Parsed { + value: T, + rest: TokenSlice, + diagnostics: Diagnostics, +} + +impl Parsed { + pub fn new(value: T, rest: TokenSlice, warnings: Diagnostics) -> Self { + Self { + value, + rest, + diagnostics: warnings, + } + } + pub fn ok(value: T, rest: TokenSlice) -> Self { + Self { + value, + rest, + diagnostics: Diagnostics::default(), + } + } + pub fn into_tuple(self) -> (T, TokenSlice, Diagnostics) { + (self.value, self.rest, self.diagnostics) + } + pub fn take_diagnostics(self, d: &mut Diagnostics) -> (T, TokenSlice) { + let (value, rest, mut diagnostics) = self.into_tuple(); + d.0.append(&mut diagnostics.0); + (value, rest) + } + pub fn map(self, f: F) -> Parsed + where + F: FnOnce(T) -> R, + { + Parsed { + value: f(self.value), + rest: self.rest, + diagnostics: self.diagnostics, + } + } + pub fn warn(self, mut warnings: Diagnostics) -> Self { + Self { + value: self.value, + rest: self.rest, + diagnostics: { + let mut vec = self.diagnostics.0; + vec.append(&mut warnings.0); + Diagnostics(vec) + }, + } + } +} + +type ParseResult = Result, ParseError>; + +trait MismatchToError { + fn mismatch_to_error(self) -> Self; +} + +impl MismatchToError for ParseResult { + fn mismatch_to_error(self) -> Self { + match self { + Err(ParseError::Mismatch(diagnostic)) => Err(ParseError::Error(diagnostic)), + 
rest => rest, + } + } +} + +trait FromTokens { + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized; +} + +impl FromTokens for Option +where + T: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + match T::from_tokens(input) { + Ok(p) => Ok(p.map(Some)), + Err(ParseError::Mismatch(_)) => Ok(Parsed::ok(None, input.clone())), + Err(ParseError::Error(error)) => Err(ParseError::Error(error)), + } + } +} + +impl FromTokens for Either +where + L: FromTokens, + R: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + match L::from_tokens(input) { + Ok(p) => Ok(p.map(Either::Left)), + Err(ParseError::Mismatch(_)) => Ok(R::from_tokens(input)?.map(Either::Right)), + Err(ParseError::Error(error)) => Err(ParseError::Error(error)), + } + } +} + +impl FromTokens for (A, B) +where + A: FromTokens, + B: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let (a, input, mut diagnostics) = A::from_tokens(input)?.into_tuple(); + let (b, rest, mut diagnostics2) = B::from_tokens(&input)?.into_tuple(); + diagnostics.0.append(&mut diagnostics2.0); + Ok(Parsed::new((a, b), rest, diagnostics)) + } +} + +impl FromTokens for (A, B, C) +where + A: FromTokens, + B: FromTokens, + C: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let (a, input, mut diagnostics) = A::from_tokens(input)?.into_tuple(); + let (b, input, mut diagnostics2) = B::from_tokens(&input)?.into_tuple(); + let (c, rest, mut diagnostics3) = C::from_tokens(&input)?.into_tuple(); + diagnostics.0.append(&mut diagnostics2.0); + diagnostics.0.append(&mut diagnostics3.0); + Ok(Parsed::new((a, b, c), rest, diagnostics)) + } +} + +#[derive(Debug, pspp_derive::FromTokens)] +#[pspp(syntax = "/")] +pub struct Slash; + +#[derive(Debug)] +pub struct Comma; + +impl FromTokens for Comma { + fn from_tokens(input: &TokenSlice) -> ParseResult + 
where + Self: Sized, + { + _parse_token(input, &Token::Punct(Punct::Comma)).map(|p| p.map(|_| Comma)) + } +} + +#[derive(Debug, pspp_derive::FromTokens)] +#[pspp(syntax = "=")] +pub struct Equals; + +#[derive(Debug, pspp_derive::FromTokens)] +#[pspp(syntax = "&")] +pub struct And; + +#[derive(Debug, pspp_derive::FromTokens)] +#[pspp(syntax = ">")] +pub struct Gt; + +#[derive(Debug, pspp_derive::FromTokens)] +#[pspp(syntax = "+")] +pub struct Plus; + +#[derive(Debug, pspp_derive::FromTokens)] +#[pspp(syntax = "-")] +pub struct Dash; + +#[derive(Debug, pspp_derive::FromTokens)] +#[pspp(syntax = "*")] +pub struct Asterisk; + +#[derive(Debug, pspp_derive::FromTokens)] +#[pspp(syntax = "**")] +pub struct Exp; + +#[derive(Debug, pspp_derive::FromTokens)] +struct By; + +pub struct Punctuated> { + head: Vec<(T, P)>, + tail: Option, +} + +impl Debug for Punctuated +where + T: Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for (index, item) in self + .head + .iter() + .map(|(t, _p)| t) + .chain(self.tail.iter()) + .enumerate() + { + if index > 0 { + write!(f, ", ")?; + } + write!(f, "{item:?}")?; + } + write!(f, "]") + } +} + +impl FromTokens for Punctuated +where + T: FromTokens, + P: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let mut head = Vec::new(); + let mut warnings_vec = Vec::new(); + let mut input = input.clone(); + let tail = loop { + let t = match T::from_tokens(&input) { + Ok(Parsed { + value, + rest, + diagnostics: mut warnings, + }) => { + warnings_vec.append(&mut warnings.0); + input = rest; + value + } + Err(ParseError::Mismatch(_)) => break None, + Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), + }; + let p = match P::from_tokens(&input) { + Ok(Parsed { + value, + rest, + diagnostics: mut warnings, + }) => { + warnings_vec.append(&mut warnings.0); + input = rest; + value + } + Err(ParseError::Mismatch(_)) => break Some(t), + 
Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), + }; + head.push((t, p)); + }; + Ok(Parsed { + value: Punctuated { head, tail }, + rest: input, + diagnostics: Diagnostics(warnings_vec), + }) + } +} + +impl FromTokens for Box +where + T: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + T::from_tokens(input).map(|p| p.map(|value| Box::new(value))) + } +} + +pub struct Subcommands(Vec); + +impl Debug for Subcommands +where + T: Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Subcommands[")?; + for (index, item) in self.0.iter().enumerate() { + if index > 0 { + writeln!(f, ",")?; + } + write!(f, "{item:?}")?; + } + write!(f, "]") + } +} + +impl FromTokens for Subcommands +where + T: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let mut items = Vec::new(); + let mut diagnostics = Vec::new(); + let mut input = input.clone(); + loop { + let start = input.skip_until(|token| token != &Token::Punct(Punct::Slash)); + if start.is_empty() { + break; + } + let end = start.skip_to(&Token::Punct(Punct::Slash)); + let subcommand = start.subslice(0..start.len() - end.len()); + match T::from_tokens(&subcommand) { + Ok(p) => { + let (value, rest, mut d) = p.into_tuple(); + items.push(value); + diagnostics.append(&mut d.0); + if !rest.is_empty() { + diagnostics.push(rest.warning("Syntax error expecting end of subcommand.")); + } + } + Err(ParseError::Error(mut d) | ParseError::Mismatch(mut d)) => { + diagnostics.append(&mut d.0); + } + } + input = end; + } + println!("{diagnostics:?}"); + Ok(Parsed { + value: Subcommands(items), + rest: input, + diagnostics: Diagnostics(diagnostics), + }) + } +} + +#[derive(Debug)] +pub struct Seq0(Vec); + +impl FromTokens for Seq0 +where + T: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let mut values_vec = Vec::new(); + let mut warnings_vec = 
Vec::new(); + let mut input = input.clone(); + while !input.is_empty() { + match T::from_tokens(&input) { + Ok(Parsed { + value, + rest, + diagnostics: mut warnings, + }) => { + warnings_vec.append(&mut warnings.0); + if input.len() == rest.len() { + break; + } + values_vec.push(value); + input = rest; + } + Err(ParseError::Mismatch(_)) => break, + Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), + } + } + Ok(Parsed { + value: Seq0(values_vec), + rest: input, + diagnostics: Diagnostics(warnings_vec), + }) + } +} + +#[derive(Debug)] +pub struct Seq1(Vec); + +impl FromTokens for Seq1 +where + T: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let mut values_vec = Vec::new(); + let mut warnings_vec = Vec::new(); + let mut input = input.clone(); + while !input.is_empty() { + match T::from_tokens(&input) { + Ok(Parsed { + value, + rest, + diagnostics: mut warnings, + }) => { + warnings_vec.append(&mut warnings.0); + if input.len() == rest.len() { + break; + } + values_vec.push(value); + input = rest; + } + Err(ParseError::Mismatch(_)) => break, + Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), + } + } + if values_vec.is_empty() { + return Err(ParseError::Mismatch(input.error("Syntax error.").into())); + } + Ok(Parsed { + value: Seq1(values_vec), + rest: input, + diagnostics: Diagnostics(warnings_vec), + }) + } +} + +/* +impl FromTokens for Vec +where + T: FromTokens, +{ + fn from_tokens(mut input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let mut values_vec = Vec::new(); + let mut warnings_vec = Vec::new(); + while !input.is_empty() { + match T::from_tokens(input) { + Ok(Parsed { + value, + rest, + diagnostics: mut warnings, + }) => { + values_vec.push(value); + warnings_vec.append(&mut warnings.0); + input = rest; + } + Err(ParseError::Mismatch(_)) => break, + Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), + } + } + Ok(Parsed { + value: values_vec, + rest: input, 
+ diagnostics: Diagnostics(warnings_vec), + }) + } +}*/ + +impl FromTokens for TokenSlice { + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + Ok(Parsed::ok(input.clone(), input.end())) + } +} + +#[derive(Debug)] +struct Subcommand(pub T); + +impl FromTokens for Subcommand +where + T: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let start = input.skip_until(|token| token != &Token::Punct(Punct::Slash)); + if start.is_empty() { + return Err(ParseError::Error( + input.error("Syntax error at end of input.").into(), + )); + } + let end = start.skip_to(&Token::Punct(Punct::Slash)); + let subcommand = start.subslice(0..start.len() - end.len()); + let (value, rest, mut warnings) = T::from_tokens(&subcommand)?.into_tuple(); + if !rest.is_empty() { + warnings + .0 + .push(rest.warning("Syntax error expecting end of subcommand.")); + } + Ok(Parsed::new(Self(value), end, warnings)) + } +} + +#[derive(Debug)] +struct InParens(pub T); + +impl FromTokens for InParens +where + T: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let ((), rest, _) = parse_token(input, &Token::Punct(Punct::LParen))?.into_tuple(); + let (value, rest, warnings) = T::from_tokens(&rest)?.into_tuple(); + let ((), rest, _) = parse_token(&rest, &Token::Punct(Punct::RParen))?.into_tuple(); + Ok(Parsed { + value: Self(value), + rest, + diagnostics: warnings, + }) + } +} + +#[derive(Debug)] +struct InSquares(pub T); + +impl FromTokens for InSquares +where + T: FromTokens, +{ + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + let ((), rest, _) = parse_token(input, &Token::Punct(Punct::LSquare))?.into_tuple(); + let (value, rest, warnings) = T::from_tokens(&rest)?.into_tuple(); + let ((), rest, _) = parse_token(&rest, &Token::Punct(Punct::RSquare))?.into_tuple(); + Ok(Parsed { + value: Self(value), + rest, + diagnostics: warnings, + }) + } +} + +fn 
parse_token_if(input: &TokenSlice, parse: F) -> ParseResult +where + F: Fn(&Token) -> Option, +{ + if let Some(token) = input.get_token(0) { + if let Some(result) = parse(token) { + return Ok(Parsed::ok(result, input.subslice(1..input.len()))); + } + } + Err(ParseError::Mismatch(Diagnostics::default())) +} + +fn _parse_token(input: &TokenSlice, token: &Token) -> ParseResult { + if let Some(rest) = input.skip(token) { + Ok(Parsed::ok(input.first().token.clone(), rest)) + } else { + Err(ParseError::Mismatch( + input.error(format!("expecting {token}")).into(), + )) + } +} + +fn parse_token(input: &TokenSlice, token: &Token) -> ParseResult<()> { + if let Some(rest) = input.skip(token) { + Ok(Parsed::ok((), rest)) + } else { + Err(ParseError::Mismatch( + input.error(format!("expecting {token}")).into(), + )) + } +} + +fn parse_syntax(input: &TokenSlice, syntax: &str) -> ParseResult<()> { + if let Some(rest) = input.skip_syntax(syntax) { + Ok(Parsed::ok((), rest)) + } else { + Err(ParseError::Mismatch( + input.error(format!("expecting {syntax}")).into(), + )) + } +} + +pub type VarList = Punctuated; + +pub struct Number(f64); + +impl Debug for Number { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0) + } +} + +impl FromTokens for Number { + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + parse_token_if(input, |token| token.as_number().map(Number)) + .map_err(|_| ParseError::Mismatch(input.error(String::from("expecting number")).into())) + } +} + +#[derive(Debug)] +pub struct Integer(i64); + +impl FromTokens for Integer { + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + parse_token_if(input, |token| token.as_integer().map(Integer)).map_err(|_| { + ParseError::Mismatch(input.error(String::from("expecting integer")).into()) + }) + } +} + +pub enum VarRange { + Single(Identifier), + Range(Identifier, Identifier), + All, +} + +impl Debug for VarRange { + fn 
fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Single(var) => write!(f, "{var:?}"), + Self::Range(from, to) => write!(f, "{from:?} TO {to:?}"), + Self::All => write!(f, "ALL"), + } + } +} + +impl FromTokens for VarRange { + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + if let Ok(Parsed { rest, .. }) = parse_token(input, &Token::Punct(Punct::All)) { + Ok(Parsed::ok(Self::All, rest)) + } else { + let (from, rest, _) = parse_id(input)?.into_tuple(); + if let Ok(Parsed { rest, .. }) = parse_token(&rest, &Token::Punct(Punct::To)) { + if let Ok(p) = parse_id(&rest) { + return Ok(p.map(|to| Self::Range(from, to))); + } + } + Ok(Parsed::ok(Self::Single(from), rest)) + } + } +} + +fn parse_id(input: &TokenSlice) -> ParseResult { + let mut iter = input.iter(); + if let Some(LexToken { + token: Token::Id(id), + .. + }) = iter.next() + { + Ok(Parsed::ok(id.clone(), iter.remainder())) + } else { + Err(ParseError::Mismatch( + input.error("Syntax error expecting identifier.").into(), + )) + } +} + +fn parse_format(input: &TokenSlice) -> ParseResult { + let mut iter = input.iter(); + if let Some(LexToken { + token: Token::Id(id), + .. + }) = iter.next() + { + if let Ok(format) = id.0.as_ref().parse() { + return Ok(Parsed::ok(format, iter.remainder())); + } + } + Err(ParseError::Mismatch( + input.error("Syntax error expecting identifier.").into(), + )) +} + +fn parse_string(input: &TokenSlice) -> ParseResult { + let mut iter = input.iter(); + if let Some(LexToken { + token: Token::String(s), + .. 
+ }) = iter.next() + { + Ok(Parsed::ok(s.clone(), iter.remainder())) + } else { + Err(ParseError::Mismatch( + input.error("Syntax error expecting identifier.").into(), + )) + } +} + +impl FromTokens for Identifier { + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + parse_id(input) + } +} + +impl FromTokens for String { + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + parse_string(input) + } +} + +impl FromTokens for AbstractFormat { + fn from_tokens(input: &TokenSlice) -> ParseResult + where + Self: Sized, + { + parse_format(input) + } +} + +fn collect_subcommands(src: TokenSlice) -> Vec { + src.split(|token| token.token == Token::Punct(Punct::Slash)) + .filter(|slice| !slice.is_empty()) + .collect() +} + +fn commands() -> &'static [Command] { + fn new_commands() -> Vec { + vec![ + descriptives_command(), + crosstabs_command(), + ctables_command(), + data_list_command(), + Command { + allowed_states: FlagSet::full(), + enhanced_only: false, + testing_only: false, + no_abbrev: false, + name: "ECHO", + run: Box::new(|_context| todo!()), + }, + ] + } + + static COMMANDS: OnceLock> = OnceLock::new(); + COMMANDS.get_or_init(new_commands).as_slice() +} + +fn parse_command_word(lexer: &mut TokenSlice, s: &mut String, n: usize) -> bool { + let separator = match s.chars().next_back() { + Some(c) if c != '-' => " ", + _ => "", + }; + + match lexer.get_token(n) { + Some(Token::Punct(Punct::Dash)) => { + s.push('-'); + true + } + Some(Token::Id(id)) => { + write!(s, "{separator}{id}").unwrap(); + true + } + Some(Token::Number(number)) if number.is_sign_positive() => { + if let Some(integer) = number.to_exact_usize() { + write!(s, "{separator}{integer}").unwrap(); + true + } else { + false + } + } + _ => false, + } +} + +fn find_best_match(s: &str) -> (Option<&'static Command>, isize) { + let mut cm = CommandMatcher::new(s); + for command in commands() { + cm.add(command.name, command); + } + cm.get_match() +} + +fn 
parse_command_name( + lexer: &mut TokenSlice, + error: &dyn Fn(Diagnostic), +) -> Result<(&'static Command, usize), ()> { + let mut s = String::new(); + let mut word = 0; + let mut missing_words = 0; + let mut command = None; + while parse_command_word(lexer, &mut s, word) { + (command, missing_words) = find_best_match(&s); + if missing_words <= 0 { + break; + } + word += 1; + } + if command.is_none() && missing_words > 0 { + s.push_str(" ."); + (command, missing_words) = find_best_match(&s); + s.truncate(s.len() - 2); + } + + match command { + Some(command) => Ok((command, ((word as isize + 1) + missing_words) as usize)), + None => { + if word == 0 { + error( + lexer + .subslice(0..1) + .error("Syntax error expecting command name"), + ) + } else { + error(lexer.subslice(0..word + 1).error("Unknown command `{s}`.")) + }; + Err(()) + } + } +} + +pub enum Success { + Success, + Eof, + Finish, +} + +pub fn end_of_command(context: &Context, range: RangeFrom) -> Result { + match context.lexer.get_token(range.start) { + None | Some(Token::End) => Ok(Success::Success), + _ => { + context.error( + context + .lexer + .subslice(range.start..context.lexer.len()) + .error("Syntax error expecting end of command."), + ); + Err(()) + } + } +} + +fn parse_in_state(mut lexer: TokenSlice, error: &dyn Fn(Diagnostic), _state: State) { + match lexer.get_token(0) { + None | Some(Token::End) => (), + _ => match parse_command_name(&mut lexer, error) { + Ok((command, n_tokens)) => { + let mut context = Context { + error, + lexer: lexer.subslice(n_tokens..lexer.len()), + command_name: Some(command.name), + }; + (command.run)(&mut context); + } + Err(error) => println!("{error:?}"), + }, + } +} + +pub fn parse_command(lexer: TokenSlice, error: &dyn Fn(Diagnostic)) { + parse_in_state(lexer, error, State::Initial) +} + +pub struct Context<'a> { + error: &'a dyn Fn(Diagnostic), + lexer: TokenSlice, + command_name: Option<&'static str>, +} + +impl Context<'_> { + pub fn error(&self, diagnostic: 
Diagnostic) { + (self.error)(diagnostic); + } +} diff --git a/rust/pspp/src/command/mod.rs b/rust/pspp/src/command/mod.rs deleted file mode 100644 index 5f0d1ec55b..0000000000 --- a/rust/pspp/src/command/mod.rs +++ /dev/null @@ -1,959 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . - -#![allow(dead_code)] -use std::{ - fmt::{Debug, Write}, - ops::RangeFrom, - sync::OnceLock, -}; - -use crosstabs::crosstabs_command; -use ctables::ctables_command; -use data_list::data_list_command; -use descriptives::descriptives_command; -use either::Either; -use flagset::{flags, FlagSet}; -use pspp_derive::FromTokens; - -use crate::{ - format::AbstractFormat, - identifier::Identifier, - integer::ToInteger, - lex::{ - command_name::CommandMatcher, - lexer::{LexToken, TokenSlice}, - Punct, Token, - }, - message::{Diagnostic, Diagnostics}, -}; - -pub mod crosstabs; -pub mod ctables; -pub mod data_list; -pub mod descriptives; - -flags! { - enum State: u8 { - /// No active dataset yet defined. - Initial, - - /// Active dataset has been defined. - Data, - - /// Inside `INPUT PROGRAM`. - InputProgram, - - /// Inside `FILE TYPE`. - FileType, - - /// State nested inside `LOOP` or `DO IF`, inside [State::Data]. - NestedData, - - /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram]. 
- NestedInputProgram, - } -} - -struct Command { - allowed_states: FlagSet, - enhanced_only: bool, - testing_only: bool, - no_abbrev: bool, - name: &'static str, - run: Box, //-> Box + Send + Sync>, -} - -#[derive(Debug)] -enum ParseError { - Error(Diagnostics), - Mismatch(Diagnostics), -} - -#[derive(Debug)] -struct Parsed { - value: T, - rest: TokenSlice, - diagnostics: Diagnostics, -} - -impl Parsed { - pub fn new(value: T, rest: TokenSlice, warnings: Diagnostics) -> Self { - Self { - value, - rest, - diagnostics: warnings, - } - } - pub fn ok(value: T, rest: TokenSlice) -> Self { - Self { - value, - rest, - diagnostics: Diagnostics::default(), - } - } - pub fn into_tuple(self) -> (T, TokenSlice, Diagnostics) { - (self.value, self.rest, self.diagnostics) - } - pub fn take_diagnostics(self, d: &mut Diagnostics) -> (T, TokenSlice) { - let (value, rest, mut diagnostics) = self.into_tuple(); - d.0.append(&mut diagnostics.0); - (value, rest) - } - pub fn map(self, f: F) -> Parsed - where - F: FnOnce(T) -> R, - { - Parsed { - value: f(self.value), - rest: self.rest, - diagnostics: self.diagnostics, - } - } - pub fn warn(self, mut warnings: Diagnostics) -> Self { - Self { - value: self.value, - rest: self.rest, - diagnostics: { - let mut vec = self.diagnostics.0; - vec.append(&mut warnings.0); - Diagnostics(vec) - }, - } - } -} - -type ParseResult = Result, ParseError>; - -trait MismatchToError { - fn mismatch_to_error(self) -> Self; -} - -impl MismatchToError for ParseResult { - fn mismatch_to_error(self) -> Self { - match self { - Err(ParseError::Mismatch(diagnostic)) => Err(ParseError::Error(diagnostic)), - rest => rest, - } - } -} - -trait FromTokens { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized; -} - -impl FromTokens for Option -where - T: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - match T::from_tokens(input) { - Ok(p) => Ok(p.map(Some)), - Err(ParseError::Mismatch(_)) => 
Ok(Parsed::ok(None, input.clone())), - Err(ParseError::Error(error)) => Err(ParseError::Error(error)), - } - } -} - -impl FromTokens for Either -where - L: FromTokens, - R: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - match L::from_tokens(input) { - Ok(p) => Ok(p.map(Either::Left)), - Err(ParseError::Mismatch(_)) => Ok(R::from_tokens(input)?.map(Either::Right)), - Err(ParseError::Error(error)) => Err(ParseError::Error(error)), - } - } -} - -impl FromTokens for (A, B) -where - A: FromTokens, - B: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let (a, input, mut diagnostics) = A::from_tokens(input)?.into_tuple(); - let (b, rest, mut diagnostics2) = B::from_tokens(&input)?.into_tuple(); - diagnostics.0.append(&mut diagnostics2.0); - Ok(Parsed::new((a, b), rest, diagnostics)) - } -} - -impl FromTokens for (A, B, C) -where - A: FromTokens, - B: FromTokens, - C: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let (a, input, mut diagnostics) = A::from_tokens(input)?.into_tuple(); - let (b, input, mut diagnostics2) = B::from_tokens(&input)?.into_tuple(); - let (c, rest, mut diagnostics3) = C::from_tokens(&input)?.into_tuple(); - diagnostics.0.append(&mut diagnostics2.0); - diagnostics.0.append(&mut diagnostics3.0); - Ok(Parsed::new((a, b, c), rest, diagnostics)) - } -} - -#[derive(Debug, pspp_derive::FromTokens)] -#[pspp(syntax = "/")] -pub struct Slash; - -#[derive(Debug)] -pub struct Comma; - -impl FromTokens for Comma { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - _parse_token(input, &Token::Punct(Punct::Comma)).map(|p| p.map(|_| Comma)) - } -} - -#[derive(Debug, pspp_derive::FromTokens)] -#[pspp(syntax = "=")] -pub struct Equals; - -#[derive(Debug, pspp_derive::FromTokens)] -#[pspp(syntax = "&")] -pub struct And; - -#[derive(Debug, pspp_derive::FromTokens)] -#[pspp(syntax = ">")] -pub 
struct Gt; - -#[derive(Debug, pspp_derive::FromTokens)] -#[pspp(syntax = "+")] -pub struct Plus; - -#[derive(Debug, pspp_derive::FromTokens)] -#[pspp(syntax = "-")] -pub struct Dash; - -#[derive(Debug, pspp_derive::FromTokens)] -#[pspp(syntax = "*")] -pub struct Asterisk; - -#[derive(Debug, pspp_derive::FromTokens)] -#[pspp(syntax = "**")] -pub struct Exp; - -#[derive(Debug, pspp_derive::FromTokens)] -struct By; - -pub struct Punctuated> { - head: Vec<(T, P)>, - tail: Option, -} - -impl Debug for Punctuated -where - T: Debug, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "[")?; - for (index, item) in self - .head - .iter() - .map(|(t, _p)| t) - .chain(self.tail.iter()) - .enumerate() - { - if index > 0 { - write!(f, ", ")?; - } - write!(f, "{item:?}")?; - } - write!(f, "]") - } -} - -impl FromTokens for Punctuated -where - T: FromTokens, - P: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let mut head = Vec::new(); - let mut warnings_vec = Vec::new(); - let mut input = input.clone(); - let tail = loop { - let t = match T::from_tokens(&input) { - Ok(Parsed { - value, - rest, - diagnostics: mut warnings, - }) => { - warnings_vec.append(&mut warnings.0); - input = rest; - value - } - Err(ParseError::Mismatch(_)) => break None, - Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), - }; - let p = match P::from_tokens(&input) { - Ok(Parsed { - value, - rest, - diagnostics: mut warnings, - }) => { - warnings_vec.append(&mut warnings.0); - input = rest; - value - } - Err(ParseError::Mismatch(_)) => break Some(t), - Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), - }; - head.push((t, p)); - }; - Ok(Parsed { - value: Punctuated { head, tail }, - rest: input, - diagnostics: Diagnostics(warnings_vec), - }) - } -} - -impl FromTokens for Box -where - T: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - 
T::from_tokens(input).map(|p| p.map(|value| Box::new(value))) - } -} - -pub struct Subcommands(Vec); - -impl Debug for Subcommands -where - T: Debug, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Subcommands[")?; - for (index, item) in self.0.iter().enumerate() { - if index > 0 { - writeln!(f, ",")?; - } - write!(f, "{item:?}")?; - } - write!(f, "]") - } -} - -impl FromTokens for Subcommands -where - T: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let mut items = Vec::new(); - let mut diagnostics = Vec::new(); - let mut input = input.clone(); - loop { - let start = input.skip_until(|token| token != &Token::Punct(Punct::Slash)); - if start.is_empty() { - break; - } - let end = start.skip_to(&Token::Punct(Punct::Slash)); - let subcommand = start.subslice(0..start.len() - end.len()); - match T::from_tokens(&subcommand) { - Ok(p) => { - let (value, rest, mut d) = p.into_tuple(); - items.push(value); - diagnostics.append(&mut d.0); - if !rest.is_empty() { - diagnostics.push(rest.warning("Syntax error expecting end of subcommand.")); - } - } - Err(ParseError::Error(mut d) | ParseError::Mismatch(mut d)) => { - diagnostics.append(&mut d.0); - } - } - input = end; - } - println!("{diagnostics:?}"); - Ok(Parsed { - value: Subcommands(items), - rest: input, - diagnostics: Diagnostics(diagnostics), - }) - } -} - -#[derive(Debug)] -pub struct Seq0(Vec); - -impl FromTokens for Seq0 -where - T: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let mut values_vec = Vec::new(); - let mut warnings_vec = Vec::new(); - let mut input = input.clone(); - while !input.is_empty() { - match T::from_tokens(&input) { - Ok(Parsed { - value, - rest, - diagnostics: mut warnings, - }) => { - warnings_vec.append(&mut warnings.0); - if input.len() == rest.len() { - break; - } - values_vec.push(value); - input = rest; - } - Err(ParseError::Mismatch(_)) => break, - 
Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), - } - } - Ok(Parsed { - value: Seq0(values_vec), - rest: input, - diagnostics: Diagnostics(warnings_vec), - }) - } -} - -#[derive(Debug)] -pub struct Seq1(Vec); - -impl FromTokens for Seq1 -where - T: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let mut values_vec = Vec::new(); - let mut warnings_vec = Vec::new(); - let mut input = input.clone(); - while !input.is_empty() { - match T::from_tokens(&input) { - Ok(Parsed { - value, - rest, - diagnostics: mut warnings, - }) => { - warnings_vec.append(&mut warnings.0); - if input.len() == rest.len() { - break; - } - values_vec.push(value); - input = rest; - } - Err(ParseError::Mismatch(_)) => break, - Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), - } - } - if values_vec.is_empty() { - return Err(ParseError::Mismatch(input.error("Syntax error.").into())); - } - Ok(Parsed { - value: Seq1(values_vec), - rest: input, - diagnostics: Diagnostics(warnings_vec), - }) - } -} - -/* -impl FromTokens for Vec -where - T: FromTokens, -{ - fn from_tokens(mut input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let mut values_vec = Vec::new(); - let mut warnings_vec = Vec::new(); - while !input.is_empty() { - match T::from_tokens(input) { - Ok(Parsed { - value, - rest, - diagnostics: mut warnings, - }) => { - values_vec.push(value); - warnings_vec.append(&mut warnings.0); - input = rest; - } - Err(ParseError::Mismatch(_)) => break, - Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), - } - } - Ok(Parsed { - value: values_vec, - rest: input, - diagnostics: Diagnostics(warnings_vec), - }) - } -}*/ - -impl FromTokens for TokenSlice { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - Ok(Parsed::ok(input.clone(), input.end())) - } -} - -#[derive(Debug)] -struct Subcommand(pub T); - -impl FromTokens for Subcommand -where - T: FromTokens, -{ - fn 
from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let start = input.skip_until(|token| token != &Token::Punct(Punct::Slash)); - if start.is_empty() { - return Err(ParseError::Error( - input.error("Syntax error at end of input.").into(), - )); - } - let end = start.skip_to(&Token::Punct(Punct::Slash)); - let subcommand = start.subslice(0..start.len() - end.len()); - let (value, rest, mut warnings) = T::from_tokens(&subcommand)?.into_tuple(); - if !rest.is_empty() { - warnings - .0 - .push(rest.warning("Syntax error expecting end of subcommand.")); - } - Ok(Parsed::new(Self(value), end, warnings)) - } -} - -#[derive(Debug)] -struct InParens(pub T); - -impl FromTokens for InParens -where - T: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let ((), rest, _) = parse_token(input, &Token::Punct(Punct::LParen))?.into_tuple(); - let (value, rest, warnings) = T::from_tokens(&rest)?.into_tuple(); - let ((), rest, _) = parse_token(&rest, &Token::Punct(Punct::RParen))?.into_tuple(); - Ok(Parsed { - value: Self(value), - rest, - diagnostics: warnings, - }) - } -} - -#[derive(Debug)] -struct InSquares(pub T); - -impl FromTokens for InSquares -where - T: FromTokens, -{ - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - let ((), rest, _) = parse_token(input, &Token::Punct(Punct::LSquare))?.into_tuple(); - let (value, rest, warnings) = T::from_tokens(&rest)?.into_tuple(); - let ((), rest, _) = parse_token(&rest, &Token::Punct(Punct::RSquare))?.into_tuple(); - Ok(Parsed { - value: Self(value), - rest, - diagnostics: warnings, - }) - } -} - -fn parse_token_if(input: &TokenSlice, parse: F) -> ParseResult -where - F: Fn(&Token) -> Option, -{ - if let Some(token) = input.get_token(0) { - if let Some(result) = parse(token) { - return Ok(Parsed::ok(result, input.subslice(1..input.len()))); - } - } - Err(ParseError::Mismatch(Diagnostics::default())) -} - -fn _parse_token(input: 
&TokenSlice, token: &Token) -> ParseResult { - if let Some(rest) = input.skip(token) { - Ok(Parsed::ok(input.first().token.clone(), rest)) - } else { - Err(ParseError::Mismatch( - input.error(format!("expecting {token}")).into(), - )) - } -} - -fn parse_token(input: &TokenSlice, token: &Token) -> ParseResult<()> { - if let Some(rest) = input.skip(token) { - Ok(Parsed::ok((), rest)) - } else { - Err(ParseError::Mismatch( - input.error(format!("expecting {token}")).into(), - )) - } -} - -fn parse_syntax(input: &TokenSlice, syntax: &str) -> ParseResult<()> { - if let Some(rest) = input.skip_syntax(syntax) { - Ok(Parsed::ok((), rest)) - } else { - Err(ParseError::Mismatch( - input.error(format!("expecting {syntax}")).into(), - )) - } -} - -pub type VarList = Punctuated; - -pub struct Number(f64); - -impl Debug for Number { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0) - } -} - -impl FromTokens for Number { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - parse_token_if(input, |token| token.as_number().map(Number)) - .map_err(|_| ParseError::Mismatch(input.error(String::from("expecting number")).into())) - } -} - -#[derive(Debug)] -pub struct Integer(i64); - -impl FromTokens for Integer { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - parse_token_if(input, |token| token.as_integer().map(Integer)).map_err(|_| { - ParseError::Mismatch(input.error(String::from("expecting integer")).into()) - }) - } -} - -pub enum VarRange { - Single(Identifier), - Range(Identifier, Identifier), - All, -} - -impl Debug for VarRange { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Single(var) => write!(f, "{var:?}"), - Self::Range(from, to) => write!(f, "{from:?} TO {to:?}"), - Self::All => write!(f, "ALL"), - } - } -} - -impl FromTokens for VarRange { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - 
{ - if let Ok(Parsed { rest, .. }) = parse_token(input, &Token::Punct(Punct::All)) { - Ok(Parsed::ok(Self::All, rest)) - } else { - let (from, rest, _) = parse_id(input)?.into_tuple(); - if let Ok(Parsed { rest, .. }) = parse_token(&rest, &Token::Punct(Punct::To)) { - if let Ok(p) = parse_id(&rest) { - return Ok(p.map(|to| Self::Range(from, to))); - } - } - Ok(Parsed::ok(Self::Single(from), rest)) - } - } -} - -fn parse_id(input: &TokenSlice) -> ParseResult { - let mut iter = input.iter(); - if let Some(LexToken { - token: Token::Id(id), - .. - }) = iter.next() - { - Ok(Parsed::ok(id.clone(), iter.remainder())) - } else { - Err(ParseError::Mismatch( - input.error("Syntax error expecting identifier.").into(), - )) - } -} - -fn parse_format(input: &TokenSlice) -> ParseResult { - let mut iter = input.iter(); - if let Some(LexToken { - token: Token::Id(id), - .. - }) = iter.next() - { - if let Ok(format) = id.0.as_ref().parse() { - return Ok(Parsed::ok(format, iter.remainder())); - } - } - Err(ParseError::Mismatch( - input.error("Syntax error expecting identifier.").into(), - )) -} - -fn parse_string(input: &TokenSlice) -> ParseResult { - let mut iter = input.iter(); - if let Some(LexToken { - token: Token::String(s), - .. 
- }) = iter.next() - { - Ok(Parsed::ok(s.clone(), iter.remainder())) - } else { - Err(ParseError::Mismatch( - input.error("Syntax error expecting identifier.").into(), - )) - } -} - -impl FromTokens for Identifier { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - parse_id(input) - } -} - -impl FromTokens for String { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - parse_string(input) - } -} - -impl FromTokens for AbstractFormat { - fn from_tokens(input: &TokenSlice) -> ParseResult - where - Self: Sized, - { - parse_format(input) - } -} - -fn collect_subcommands(src: TokenSlice) -> Vec { - src.split(|token| token.token == Token::Punct(Punct::Slash)) - .filter(|slice| !slice.is_empty()) - .collect() -} - -fn commands() -> &'static [Command] { - fn new_commands() -> Vec { - vec![ - descriptives_command(), - crosstabs_command(), - ctables_command(), - data_list_command(), - Command { - allowed_states: FlagSet::full(), - enhanced_only: false, - testing_only: false, - no_abbrev: false, - name: "ECHO", - run: Box::new(|_context| todo!()), - }, - ] - } - - static COMMANDS: OnceLock> = OnceLock::new(); - COMMANDS.get_or_init(new_commands).as_slice() -} - -fn parse_command_word(lexer: &mut TokenSlice, s: &mut String, n: usize) -> bool { - let separator = match s.chars().next_back() { - Some(c) if c != '-' => " ", - _ => "", - }; - - match lexer.get_token(n) { - Some(Token::Punct(Punct::Dash)) => { - s.push('-'); - true - } - Some(Token::Id(id)) => { - write!(s, "{separator}{id}").unwrap(); - true - } - Some(Token::Number(number)) if number.is_sign_positive() => { - if let Some(integer) = number.to_exact_usize() { - write!(s, "{separator}{integer}").unwrap(); - true - } else { - false - } - } - _ => false, - } -} - -fn find_best_match(s: &str) -> (Option<&'static Command>, isize) { - let mut cm = CommandMatcher::new(s); - for command in commands() { - cm.add(command.name, command); - } - cm.get_match() -} - -fn 
parse_command_name( - lexer: &mut TokenSlice, - error: &dyn Fn(Diagnostic), -) -> Result<(&'static Command, usize), ()> { - let mut s = String::new(); - let mut word = 0; - let mut missing_words = 0; - let mut command = None; - while parse_command_word(lexer, &mut s, word) { - (command, missing_words) = find_best_match(&s); - if missing_words <= 0 { - break; - } - word += 1; - } - if command.is_none() && missing_words > 0 { - s.push_str(" ."); - (command, missing_words) = find_best_match(&s); - s.truncate(s.len() - 2); - } - - match command { - Some(command) => Ok((command, ((word as isize + 1) + missing_words) as usize)), - None => { - if word == 0 { - error( - lexer - .subslice(0..1) - .error("Syntax error expecting command name"), - ) - } else { - error(lexer.subslice(0..word + 1).error("Unknown command `{s}`.")) - }; - Err(()) - } - } -} - -pub enum Success { - Success, - Eof, - Finish, -} - -pub fn end_of_command(context: &Context, range: RangeFrom) -> Result { - match context.lexer.get_token(range.start) { - None | Some(Token::End) => Ok(Success::Success), - _ => { - context.error( - context - .lexer - .subslice(range.start..context.lexer.len()) - .error("Syntax error expecting end of command."), - ); - Err(()) - } - } -} - -fn parse_in_state(mut lexer: TokenSlice, error: &dyn Fn(Diagnostic), _state: State) { - match lexer.get_token(0) { - None | Some(Token::End) => (), - _ => match parse_command_name(&mut lexer, error) { - Ok((command, n_tokens)) => { - let mut context = Context { - error, - lexer: lexer.subslice(n_tokens..lexer.len()), - command_name: Some(command.name), - }; - (command.run)(&mut context); - } - Err(error) => println!("{error:?}"), - }, - } -} - -pub fn parse_command(lexer: TokenSlice, error: &dyn Fn(Diagnostic)) { - parse_in_state(lexer, error, State::Initial) -} - -pub struct Context<'a> { - error: &'a dyn Fn(Diagnostic), - lexer: TokenSlice, - command_name: Option<&'static str>, -} - -impl Context<'_> { - pub fn error(&self, diagnostic: 
Diagnostic) { - (self.error)(diagnostic); - } -} diff --git a/rust/pspp/src/crypto.rs b/rust/pspp/src/crypto.rs new file mode 100644 index 0000000000..c2e86cdea1 --- /dev/null +++ b/rust/pspp/src/crypto.rs @@ -0,0 +1,668 @@ +//! # Decryption for SPSS encrypted files +//! +//! SPSS supports encryption using a password for data, viewer, and syntax +//! files. The encryption mechanism is poorly designed, so this module provides +//! support for decrypting, but not encrypting, the SPSS format. +//! Use [EncryptedFile] as the starting point for reading an encrypted file. +//! +//! SPSS also supports what calls "encrypted passwords". Use [EncodedPassword] +//! to encode and decode these passwords. + +// Warn about missing docs, but not for items declared with `#[cfg(test)]`. +#![cfg_attr(not(test), warn(missing_docs))] + +use aes::{ + cipher::{generic_array::GenericArray, BlockDecrypt, KeyInit}, + Aes256, Aes256Dec, +}; +use cmac::{Cmac, Mac}; +use smallvec::SmallVec; +use std::{ + fmt::Debug, + io::{BufRead, Error as IoError, ErrorKind, Read, Seek, SeekFrom}, +}; +use thiserror::Error as ThisError; + +use binrw::{io::NoSeek, BinRead}; + +/// Error reading an encrypted file. +#[derive(Clone, Debug, ThisError)] +pub enum Error { + /// I/O error. + #[error("I/O error reading encrypted file wrapper ({0})")] + IoError(ErrorKind), + + /// Invalid padding in final encrypted data block. + #[error("Invalid padding in final encrypted data block")] + InvalidPadding, + + /// Not an encrypted file. + #[error("Not an encrypted file")] + NotEncrypted, + + /// Encrypted file has invalid length. + #[error("Encrypted file has invalid length {0} (expected 4 more than a multiple of 16).")] + InvalidLength(u64), + + /// Unknown file type. 
+ #[error("Unknown file type {0:?}.")] + UnknownFileType(String), +} + +impl From for Error { + fn from(value: std::io::Error) -> Self { + Self::IoError(value.kind()) + } +} + +#[derive(BinRead)] +struct EncryptedHeader { + /// Fixed as `1c 00 00 00 00 00 00 00` in practice. + _ignore: [u8; 8], + + /// File type. + #[br(magic = b"ENCRYPTED")] + file_type: [u8; 3], + + /// Fixed as `15 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00` in practice. + _ignore2: [u8; 16], +} + +/// An encrypted file. +pub struct EncryptedFile { + reader: R, + file_type: FileType, + + /// Length of the ciphertext (excluding the 36-byte header). + length: u64, + + /// First block of ciphertext, for verifying that any password the user + /// tries is correct. + first_block: [u8; 16], + + /// Last block of ciphertext, for checking padding and determining the + /// plaintext length. + last_block: [u8; 16], +} + +/// Type of encrypted file. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum FileType { + /// A `.sps` syntax file. + Syntax, + + /// A `.spv` viewer file. + Viewer, + + /// A `.sav` data file. + Data, +} + +impl EncryptedFile +where + R: Read + Seek, +{ + /// Opens `reader` as an encrypted file. + /// + /// This reads enough of the file to verify that it is in the expected + /// format and returns an error if it cannot be read or is not the expected + /// format. + /// + /// `reader` doesn't need to be [BufRead], and probably should not be. The + /// [EncryptedReader] returned by [unlock] or [unlock_literal] will be + /// [BufRead]. + /// + /// [unlock]: Self::unlock + /// [unlock_literal]: Self::unlock_literal + pub fn new(mut reader: R) -> Result { + let header = + EncryptedHeader::read_le(&mut NoSeek::new(&mut reader)).map_err( + |error| match error { + binrw::Error::BadMagic { .. 
} => Error::NotEncrypted, + binrw::Error::Io(error) => Error::IoError(error.kind()), + _ => unreachable!(), + }, + )?; + let file_type = match &header.file_type { + b"SAV" => FileType::Data, + b"SPV" => FileType::Viewer, + b"SPS" => FileType::Syntax, + _ => { + return Err(Error::UnknownFileType( + header.file_type.iter().map(|b| *b as char).collect(), + )) + } + }; + let mut first_block = [0; 16]; + reader.read_exact(&mut first_block)?; + let length = reader.seek(SeekFrom::End(-16))? + 16; + if length < 36 + 16 || (length - 36) % 16 != 0 { + return Err(Error::InvalidLength(length + 36)); + } + let mut last_block = [0; 16]; + reader.read_exact(&mut last_block)?; + reader.seek(SeekFrom::Start(36))?; + Ok(Self { + reader, + file_type, + length, + first_block, + last_block, + }) + } + + /// Tries to unlock the encrypted file using both `password` and with + /// `password` decoded with [EncodedPassword::decode]. If successful, + /// returns an [EncryptedReader] for the file; on failure, returns the + /// [EncryptedFile] again for another try. + pub fn unlock(self, password: &[u8]) -> Result, Self> { + self.unlock_literal(password).or_else(|this| { + match EncodedPassword::from_encoded(password) { + Some(encoded) => this.unlock_literal(&encoded.decode()), + None => Err(this), + } + }) + } + + /// Tries to unlock the encrypted file using just `password`. If + /// successful, returns an [EncryptedReader] for the file; on failure, + /// returns the [EncryptedFile] again for another try. + /// + /// If the password itself might be encoded ("encrypted"), instead use + /// [Self::unlock] to try it both ways. + pub fn unlock_literal(self, password: &[u8]) -> Result, Self> { + // NIST SP 800-108 fixed data. 
+ #[rustfmt::skip] + static FIXED: &[u8] = &[ + // i + 0x00, 0x00, 0x00, 0x01, + + // label + 0x35, 0x27, 0x13, 0xcc, 0x53, 0xa7, 0x78, 0x89, + 0x87, 0x53, 0x22, 0x11, 0xd6, 0x5b, 0x31, 0x58, + 0xdc, 0xfe, 0x2e, 0x7e, 0x94, 0xda, 0x2f, 0x00, + 0xcc, 0x15, 0x71, 0x80, 0x0a, 0x6c, 0x63, 0x53, + + // delimiter + 0x00, + + // context + 0x38, 0xc3, 0x38, 0xac, 0x22, 0xf3, 0x63, 0x62, + 0x0e, 0xce, 0x85, 0x3f, 0xb8, 0x07, 0x4c, 0x4e, + 0x2b, 0x77, 0xc7, 0x21, 0xf5, 0x1a, 0x80, 0x1d, + 0x67, 0xfb, 0xe1, 0xe1, 0x83, 0x07, 0xd8, 0x0d, + + // L + 0x00, 0x00, 0x01, 0x00, + ]; + + // Truncate password to at most 10 bytes. + let password = password.get(..10).unwrap_or(password); + let n = password.len(); + + // padded_password = password padded with zeros to 32 bytes. + let mut padded_password = [0; 32]; + padded_password[..n].copy_from_slice(password); + + // cmac = CMAC(padded_password, fixed). + let mut cmac = as Mac>::new_from_slice(&padded_password).unwrap(); + cmac.update(FIXED); + let cmac = cmac.finalize().into_bytes(); + + // The key is the cmac repeated twice. + let mut key = [0; 32]; + key[..16].copy_from_slice(cmac.as_slice()); + key[16..].copy_from_slice(cmac.as_slice()); + + // Use key to initialize AES. + let aes = ::new_from_slice(&key).unwrap(); + + // Decrypt first block to verify password. + let mut out = [0; 16]; + aes.decrypt_block_b2b( + GenericArray::from_slice(&self.first_block), + GenericArray::from_mut_slice(&mut out), + ); + static MAGIC: &[&[u8]] = &[ + b"$FL2@(#)", + b"$FL3@(#)", + b"* Encoding", + b"PK\x03\x04\x14\0\x08", + ]; + if !MAGIC.iter().any(|magic| out.starts_with(magic)) { + return Err(self); + } + + // Decrypt last block to check padding and get final length. 
+ aes.decrypt_block_b2b( + GenericArray::from_slice(&self.last_block), + GenericArray::from_mut_slice(&mut out), + ); + let Some(padding_length) = parse_padding(&out) else { + return Err(self); + }; + + Ok(EncryptedReader::new( + self.reader, + aes, + self.file_type, + self.length - 36 - padding_length as u64, + )) + } + + /// Returns the type of encrypted file. + pub fn file_type(&self) -> FileType { + self.file_type + } +} + +fn parse_padding(block: &[u8; 16]) -> Option { + let pad = block[15] as usize; + if (1..=16).contains(&pad) && block[16 - pad..].iter().all(|b| *b == pad as u8) { + Some(pad) + } else { + None + } +} + +impl Debug for EncryptedFile +where + R: Read, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "EncryptedFile({:?})", &self.file_type) + } +} + +/// Encrypted file reader. +/// +/// This implements [Read] and [Seek] for SPSS encrypted files. To construct an +/// [EncryptedReader], call [EncryptedFile::new], then [EncryptedFile::unlock]. +pub struct EncryptedReader { + /// Underlying reader. + reader: R, + + /// AES-256 decryption key. + aes: Aes256Dec, + + /// Type of file. + file_type: FileType, + + /// Plaintext file length (not including the file header or padding). + length: u64, + + /// Plaintext data buffer. + buffer: Box<[u8; 4096]>, + + /// Plaintext offset of the byte in `buffer[0]`. A multiple of 16 less than + /// or equal to `length`. + start: u64, + + /// Number of bytes in buffer (`0 <= head <= 4096`). + head: usize, + + /// Offset in buffer of the next byte to read (`head <= tail`). 
+ tail: usize, +} + +impl EncryptedReader { + fn new(reader: R, aes: Aes256Dec, file_type: FileType, length: u64) -> Self { + Self { + reader, + aes, + file_type, + length, + buffer: Box::new([0; 4096]), + start: 0, + head: 0, + tail: 0, + } + } + + fn read_buffer(&mut self, buf: &mut [u8]) -> Result { + let n = buf.len().min(self.head - self.tail); + buf[..n].copy_from_slice(&self.buffer[self.tail..n + self.tail]); + self.tail += n; + Ok(n) + } + + /// Returns the type of encrypted file. + pub fn file_type(&self) -> FileType { + self.file_type + } +} + +impl EncryptedReader +where + R: Read, +{ + fn fill_buffer(&mut self, offset: u64) -> Result<(), IoError> { + self.start = offset / 16 * 16; + self.head = 0; + self.tail = (offset % 16) as usize; + let n = self.buffer.len().min((self.length - self.start) as usize); + self.reader + .read_exact(&mut self.buffer[..n.next_multiple_of(16)])?; + for offset in (0..n).step_by(16) { + self.aes.decrypt_block(GenericArray::from_mut_slice( + &mut self.buffer[offset..offset + 16], + )); + } + self.head = n; + Ok(()) + } +} + +impl Read for EncryptedReader +where + R: Read, +{ + fn read(&mut self, buf: &mut [u8]) -> Result { + if self.tail < self.head { + self.read_buffer(buf) + } else { + let offset = self.start + self.head as u64; + if offset < self.length { + self.fill_buffer(offset)?; + self.read_buffer(buf) + } else { + Ok(0) + } + } + } +} + +impl Seek for EncryptedReader +where + R: Read + Seek, +{ + fn seek(&mut self, pos: SeekFrom) -> Result { + let offset = match pos { + SeekFrom::Start(offset) => Some(offset), + SeekFrom::End(relative) => self.length.checked_add_signed(relative), + SeekFrom::Current(relative) => { + (self.start + self.tail as u64).checked_add_signed(relative) + } + } + .filter(|offset| *offset < u64::MAX - 36) + .ok_or(IoError::from(ErrorKind::InvalidInput))?; + if offset != self.start + self.tail as u64 { + self.reader.seek(SeekFrom::Start(offset / 16 * 16 + 36))?; + self.fill_buffer(offset)?; + } + 
Ok(offset) + } +} + +impl BufRead for EncryptedReader +where + R: Read + Seek, +{ + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + if self.tail >= self.head { + let offset = self.start + self.head as u64; + if offset < self.length { + self.fill_buffer(offset)?; + } + } + Ok(&self.buffer[self.tail..self.head]) + } + + fn consume(&mut self, amount: usize) { + self.tail += amount; + debug_assert!(self.tail <= self.head); + } +} + +const fn b(x: i32) -> u16 { + 1 << x +} + +static AH: [[u16; 2]; 4] = [ + [b(2), b(2) | b(3) | b(6) | b(7)], + [b(3), b(0) | b(1) | b(4) | b(5)], + [b(4) | b(7), b(8) | b(9) | b(12) | b(13)], + [b(5) | b(6), b(10) | b(11) | b(14) | b(15)], +]; + +static AL: [[u16; 2]; 4] = [ + [b(0) | b(3) | b(12) | b(15), b(0) | b(1) | b(4) | b(5)], + [b(1) | b(2) | b(13) | b(14), b(2) | b(3) | b(6) | b(7)], + [b(4) | b(7) | b(8) | b(11), b(8) | b(9) | b(12) | b(13)], + [b(5) | b(6) | b(9) | b(10), b(10) | b(11) | b(14) | b(15)], +]; + +static BH: [[u16; 2]; 4] = [ + [b(2), b(1) | b(3) | b(9) | b(11)], + [b(3), b(0) | b(2) | b(8) | b(10)], + [b(4) | b(7), b(4) | b(6) | b(12) | b(14)], + [b(5) | b(6), b(5) | b(7) | b(13) | b(15)], +]; + +static BL: [[u16; 2]; 4] = [ + [b(0) | b(3) | b(12) | b(15), b(0) | b(2) | b(8) | b(10)], + [b(1) | b(2) | b(13) | b(14), b(1) | b(3) | b(9) | b(11)], + [b(4) | b(7) | b(8) | b(11), b(4) | b(6) | b(12) | b(14)], + [b(5) | b(6) | b(9) | b(10), b(5) | b(7) | b(13) | b(15)], +]; + +fn decode_nibble(table: &[[u16; 2]; 4], nibble: u8) -> u16 { + for section in table.iter() { + if section[0] & (1 << nibble) != 0 { + return section[1]; + } + } + 0 +} + +fn find_1bit(x: u16) -> Option { + x.is_power_of_two().then(|| x.trailing_zeros() as u8) +} + +fn decode_pair(a: u8, b: u8) -> Option { + let x = find_1bit(decode_nibble(&AH, a >> 4) & decode_nibble(&BH, b >> 4))?; + let y = find_1bit(decode_nibble(&AL, a & 15) & decode_nibble(&BL, b & 15))?; + Some((x << 4) | y) +} + +fn encode_nibble(table: &[[u16; 2]; 4], nibble: u8) -> Vec 
{ + for section in table.iter() { + if section[1] & (1 << nibble) != 0 { + let mut outputs = Vec::with_capacity(4); + let mut bits = section[0]; + while bits != 0 { + outputs.push(bits.trailing_zeros() as u8); + bits &= bits - 1; + } + return outputs; + } + } + unreachable!() +} + +fn encode_byte(hi_table: &[[u16; 2]; 4], lo_table: &[[u16; 2]; 4], byte: u8) -> Vec { + let hi_variants = encode_nibble(hi_table, byte >> 4); + let lo_variants = encode_nibble(lo_table, byte & 15); + let mut variants = Vec::with_capacity(hi_variants.len() * lo_variants.len()); + for hi in hi_variants.iter().copied() { + for lo in lo_variants.iter().copied() { + let byte = (hi << 4) | lo; + if byte != 127 { + variants.push(byte as char); + } + } + } + variants +} + +/// An encoded password. +/// +/// SPSS calls these "encrypted passwords", but they are not encrypted. They +/// are encoded with a simple scheme, analogous to base64 encoding but +/// one-to-many: any plaintext password maps to many possible encoded passwords. +/// +/// The encoding scheme maps each plaintext password byte to 2 ASCII characters, +/// using only at most the first 10 bytes of the plaintext password. Thus, an +/// encoded password is always a multiple of 2 characters long, and never longer +/// than 20 characters. The characters in an encoded password are always in the +/// graphic ASCII range 33 through 126. Each successive pair of characters in +/// the password encodes a single byte in the plaintext password. +/// +/// This struct supports both encoding and decoding passwords. +#[derive(Clone, Debug)] +pub struct EncodedPassword(Vec>); + +impl EncodedPassword { + /// Creates an [EncodedPassword] from an already-encoded password `encoded`. + /// Returns `None` if `encoded` is not a valid encoded password. 
+ pub fn from_encoded(encoded: &[u8]) -> Option { + if encoded.len() > 20 + || encoded.len() % 2 != 0 + || !encoded.iter().all(|byte| (32..=127).contains(byte)) + { + return None; + } + + Some(EncodedPassword( + encoded.iter().map(|byte| vec![*byte as char]).collect(), + )) + } + + /// Returns an [EncodedPassword] as an encoded version of the given + /// `plaintext` password. Only the first 10 bytes, at most, of the + /// plaintext password is used. + pub fn from_plaintext(plaintext: &[u8]) -> EncodedPassword { + let input = plaintext.get(..10).unwrap_or(plaintext); + EncodedPassword( + input + .iter() + .copied() + .flat_map(|byte| [encode_byte(&AH, &AL, byte), encode_byte(&BH, &BL, byte)]) + .collect(), + ) + } + + /// Returns the number of variations of this encoded password. + /// + /// An [EncodedPassword] created by [EncodedPassword::from_plaintext] has + /// many variations: between `16**n` and `32**n` for an `n`-byte plaintext + /// password, so up to `32**10` (about 1e15) for the 10-byte longest + /// plaintext passwords. + /// + /// An [EncodedPassword] created by [EncodedPassword::from_encoded] has only + /// a single variation, the one passed in by that function. + pub fn n_variants(&self) -> u64 { + self.0 + .iter() + .map(|variants| variants.len() as u64) + .product() + } + + /// Returns one variation of this encoded password, numbered `index`. All + /// variations decode the same way. + pub fn variant(&self, mut index: u64) -> String { + let mut output = String::with_capacity(20); + for variants in &self.0 { + let n = variants.len() as u64; + output.push(variants[(index % n) as usize]); + index /= n; + } + output + } + + /// Returns the decoded version of this encoded password. 
+ pub fn decode(&self) -> SmallVec<[u8; 10]> { + let mut output = SmallVec::new(); + for [a, b] in self.0.as_chunks::<2>().0 { + output.push(decode_pair(a[0] as u8, b[0] as u8).unwrap()); + } + output + } +} + +#[cfg(test)] +mod test { + use std::{io::Cursor, path::Path}; + + use crate::crypto::{EncodedPassword, EncryptedFile, FileType}; + + fn test_decrypt(input_name: &Path, expected_name: &Path, password: &str, file_type: FileType) { + let input_filename = Path::new("src/crypto/testdata").join(input_name); + let input = std::fs::read(&input_filename).unwrap(); + let mut cursor = Cursor::new(&input); + let file = EncryptedFile::new(&mut cursor).unwrap(); + assert_eq!(file.file_type(), file_type); + let mut reader = file.unlock_literal(password.as_bytes()).unwrap(); + assert_eq!(reader.file_type(), file_type); + let mut actual = Vec::new(); + std::io::copy(&mut reader, &mut actual).unwrap(); + + let expected_filename = Path::new("src/crypto/testdata").join(expected_name); + let expected = std::fs::read(&expected_filename).unwrap(); + if actual != expected { + panic!(); + } + } + + #[test] + fn sys_file() { + test_decrypt( + Path::new("test-encrypted.sav"), + Path::new("test.sav"), + "pspp", + FileType::Data, + ); + } + + #[test] + fn syntax_file() { + test_decrypt( + Path::new("test-encrypted.sps"), + Path::new("test.sps"), + "password", + FileType::Syntax, + ); + } + + #[test] + fn spv_file() { + test_decrypt( + Path::new("test-encrypted.spv"), + Path::new("test.spv"), + "Password1", + FileType::Viewer, + ); + } + + #[test] + fn password_encoding() { + // Decode a few specific passwords. + assert_eq!( + EncodedPassword::from_encoded(b"-|") + .unwrap() + .decode() + .as_slice(), + b"b" + ); + assert_eq!( + EncodedPassword::from_encoded(b" A") + .unwrap() + .decode() + .as_slice(), + b"a" + ); + + // Check that the encoding and decoding algorithms are inverses + // for individual characters at least. 
+ for plaintext in 0..=255 { + let encoded = EncodedPassword::from_plaintext(&[plaintext]); + for variant in 0..encoded.n_variants() { + let encoded_variant = encoded.variant(variant); + let decoded = EncodedPassword::from_encoded(encoded_variant.as_bytes()) + .unwrap() + .decode(); + assert_eq!(&[plaintext], decoded.as_slice()); + } + } + } +} diff --git a/rust/pspp/src/crypto/mod.rs b/rust/pspp/src/crypto/mod.rs deleted file mode 100644 index c2e86cdea1..0000000000 --- a/rust/pspp/src/crypto/mod.rs +++ /dev/null @@ -1,668 +0,0 @@ -//! # Decryption for SPSS encrypted files -//! -//! SPSS supports encryption using a password for data, viewer, and syntax -//! files. The encryption mechanism is poorly designed, so this module provides -//! support for decrypting, but not encrypting, the SPSS format. -//! Use [EncryptedFile] as the starting point for reading an encrypted file. -//! -//! SPSS also supports what calls "encrypted passwords". Use [EncodedPassword] -//! to encode and decode these passwords. - -// Warn about missing docs, but not for items declared with `#[cfg(test)]`. -#![cfg_attr(not(test), warn(missing_docs))] - -use aes::{ - cipher::{generic_array::GenericArray, BlockDecrypt, KeyInit}, - Aes256, Aes256Dec, -}; -use cmac::{Cmac, Mac}; -use smallvec::SmallVec; -use std::{ - fmt::Debug, - io::{BufRead, Error as IoError, ErrorKind, Read, Seek, SeekFrom}, -}; -use thiserror::Error as ThisError; - -use binrw::{io::NoSeek, BinRead}; - -/// Error reading an encrypted file. -#[derive(Clone, Debug, ThisError)] -pub enum Error { - /// I/O error. - #[error("I/O error reading encrypted file wrapper ({0})")] - IoError(ErrorKind), - - /// Invalid padding in final encrypted data block. - #[error("Invalid padding in final encrypted data block")] - InvalidPadding, - - /// Not an encrypted file. - #[error("Not an encrypted file")] - NotEncrypted, - - /// Encrypted file has invalid length. 
- #[error("Encrypted file has invalid length {0} (expected 4 more than a multiple of 16).")] - InvalidLength(u64), - - /// Unknown file type. - #[error("Unknown file type {0:?}.")] - UnknownFileType(String), -} - -impl From for Error { - fn from(value: std::io::Error) -> Self { - Self::IoError(value.kind()) - } -} - -#[derive(BinRead)] -struct EncryptedHeader { - /// Fixed as `1c 00 00 00 00 00 00 00` in practice. - _ignore: [u8; 8], - - /// File type. - #[br(magic = b"ENCRYPTED")] - file_type: [u8; 3], - - /// Fixed as `15 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00` in practice. - _ignore2: [u8; 16], -} - -/// An encrypted file. -pub struct EncryptedFile { - reader: R, - file_type: FileType, - - /// Length of the ciphertext (excluding the 36-byte header). - length: u64, - - /// First block of ciphertext, for verifying that any password the user - /// tries is correct. - first_block: [u8; 16], - - /// Last block of ciphertext, for checking padding and determining the - /// plaintext length. - last_block: [u8; 16], -} - -/// Type of encrypted file. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum FileType { - /// A `.sps` syntax file. - Syntax, - - /// A `.spv` viewer file. - Viewer, - - /// A `.sav` data file. - Data, -} - -impl EncryptedFile -where - R: Read + Seek, -{ - /// Opens `reader` as an encrypted file. - /// - /// This reads enough of the file to verify that it is in the expected - /// format and returns an error if it cannot be read or is not the expected - /// format. - /// - /// `reader` doesn't need to be [BufRead], and probably should not be. The - /// [EncryptedReader] returned by [unlock] or [unlock_literal] will be - /// [BufRead]. - /// - /// [unlock]: Self::unlock - /// [unlock_literal]: Self::unlock_literal - pub fn new(mut reader: R) -> Result { - let header = - EncryptedHeader::read_le(&mut NoSeek::new(&mut reader)).map_err( - |error| match error { - binrw::Error::BadMagic { .. 
} => Error::NotEncrypted, - binrw::Error::Io(error) => Error::IoError(error.kind()), - _ => unreachable!(), - }, - )?; - let file_type = match &header.file_type { - b"SAV" => FileType::Data, - b"SPV" => FileType::Viewer, - b"SPS" => FileType::Syntax, - _ => { - return Err(Error::UnknownFileType( - header.file_type.iter().map(|b| *b as char).collect(), - )) - } - }; - let mut first_block = [0; 16]; - reader.read_exact(&mut first_block)?; - let length = reader.seek(SeekFrom::End(-16))? + 16; - if length < 36 + 16 || (length - 36) % 16 != 0 { - return Err(Error::InvalidLength(length + 36)); - } - let mut last_block = [0; 16]; - reader.read_exact(&mut last_block)?; - reader.seek(SeekFrom::Start(36))?; - Ok(Self { - reader, - file_type, - length, - first_block, - last_block, - }) - } - - /// Tries to unlock the encrypted file using both `password` and with - /// `password` decoded with [EncodedPassword::decode]. If successful, - /// returns an [EncryptedReader] for the file; on failure, returns the - /// [EncryptedFile] again for another try. - pub fn unlock(self, password: &[u8]) -> Result, Self> { - self.unlock_literal(password).or_else(|this| { - match EncodedPassword::from_encoded(password) { - Some(encoded) => this.unlock_literal(&encoded.decode()), - None => Err(this), - } - }) - } - - /// Tries to unlock the encrypted file using just `password`. If - /// successful, returns an [EncryptedReader] for the file; on failure, - /// returns the [EncryptedFile] again for another try. - /// - /// If the password itself might be encoded ("encrypted"), instead use - /// [Self::unlock] to try it both ways. - pub fn unlock_literal(self, password: &[u8]) -> Result, Self> { - // NIST SP 800-108 fixed data. 
- #[rustfmt::skip] - static FIXED: &[u8] = &[ - // i - 0x00, 0x00, 0x00, 0x01, - - // label - 0x35, 0x27, 0x13, 0xcc, 0x53, 0xa7, 0x78, 0x89, - 0x87, 0x53, 0x22, 0x11, 0xd6, 0x5b, 0x31, 0x58, - 0xdc, 0xfe, 0x2e, 0x7e, 0x94, 0xda, 0x2f, 0x00, - 0xcc, 0x15, 0x71, 0x80, 0x0a, 0x6c, 0x63, 0x53, - - // delimiter - 0x00, - - // context - 0x38, 0xc3, 0x38, 0xac, 0x22, 0xf3, 0x63, 0x62, - 0x0e, 0xce, 0x85, 0x3f, 0xb8, 0x07, 0x4c, 0x4e, - 0x2b, 0x77, 0xc7, 0x21, 0xf5, 0x1a, 0x80, 0x1d, - 0x67, 0xfb, 0xe1, 0xe1, 0x83, 0x07, 0xd8, 0x0d, - - // L - 0x00, 0x00, 0x01, 0x00, - ]; - - // Truncate password to at most 10 bytes. - let password = password.get(..10).unwrap_or(password); - let n = password.len(); - - // padded_password = password padded with zeros to 32 bytes. - let mut padded_password = [0; 32]; - padded_password[..n].copy_from_slice(password); - - // cmac = CMAC(padded_password, fixed). - let mut cmac = as Mac>::new_from_slice(&padded_password).unwrap(); - cmac.update(FIXED); - let cmac = cmac.finalize().into_bytes(); - - // The key is the cmac repeated twice. - let mut key = [0; 32]; - key[..16].copy_from_slice(cmac.as_slice()); - key[16..].copy_from_slice(cmac.as_slice()); - - // Use key to initialize AES. - let aes = ::new_from_slice(&key).unwrap(); - - // Decrypt first block to verify password. - let mut out = [0; 16]; - aes.decrypt_block_b2b( - GenericArray::from_slice(&self.first_block), - GenericArray::from_mut_slice(&mut out), - ); - static MAGIC: &[&[u8]] = &[ - b"$FL2@(#)", - b"$FL3@(#)", - b"* Encoding", - b"PK\x03\x04\x14\0\x08", - ]; - if !MAGIC.iter().any(|magic| out.starts_with(magic)) { - return Err(self); - } - - // Decrypt last block to check padding and get final length. 
- aes.decrypt_block_b2b( - GenericArray::from_slice(&self.last_block), - GenericArray::from_mut_slice(&mut out), - ); - let Some(padding_length) = parse_padding(&out) else { - return Err(self); - }; - - Ok(EncryptedReader::new( - self.reader, - aes, - self.file_type, - self.length - 36 - padding_length as u64, - )) - } - - /// Returns the type of encrypted file. - pub fn file_type(&self) -> FileType { - self.file_type - } -} - -fn parse_padding(block: &[u8; 16]) -> Option { - let pad = block[15] as usize; - if (1..=16).contains(&pad) && block[16 - pad..].iter().all(|b| *b == pad as u8) { - Some(pad) - } else { - None - } -} - -impl Debug for EncryptedFile -where - R: Read, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "EncryptedFile({:?})", &self.file_type) - } -} - -/// Encrypted file reader. -/// -/// This implements [Read] and [Seek] for SPSS encrypted files. To construct an -/// [EncryptedReader], call [EncryptedFile::new], then [EncryptedFile::unlock]. -pub struct EncryptedReader { - /// Underlying reader. - reader: R, - - /// AES-256 decryption key. - aes: Aes256Dec, - - /// Type of file. - file_type: FileType, - - /// Plaintext file length (not including the file header or padding). - length: u64, - - /// Plaintext data buffer. - buffer: Box<[u8; 4096]>, - - /// Plaintext offset of the byte in `buffer[0]`. A multiple of 16 less than - /// or equal to `length`. - start: u64, - - /// Number of bytes in buffer (`0 <= head <= 4096`). - head: usize, - - /// Offset in buffer of the next byte to read (`head <= tail`). 
- tail: usize, -} - -impl EncryptedReader { - fn new(reader: R, aes: Aes256Dec, file_type: FileType, length: u64) -> Self { - Self { - reader, - aes, - file_type, - length, - buffer: Box::new([0; 4096]), - start: 0, - head: 0, - tail: 0, - } - } - - fn read_buffer(&mut self, buf: &mut [u8]) -> Result { - let n = buf.len().min(self.head - self.tail); - buf[..n].copy_from_slice(&self.buffer[self.tail..n + self.tail]); - self.tail += n; - Ok(n) - } - - /// Returns the type of encrypted file. - pub fn file_type(&self) -> FileType { - self.file_type - } -} - -impl EncryptedReader -where - R: Read, -{ - fn fill_buffer(&mut self, offset: u64) -> Result<(), IoError> { - self.start = offset / 16 * 16; - self.head = 0; - self.tail = (offset % 16) as usize; - let n = self.buffer.len().min((self.length - self.start) as usize); - self.reader - .read_exact(&mut self.buffer[..n.next_multiple_of(16)])?; - for offset in (0..n).step_by(16) { - self.aes.decrypt_block(GenericArray::from_mut_slice( - &mut self.buffer[offset..offset + 16], - )); - } - self.head = n; - Ok(()) - } -} - -impl Read for EncryptedReader -where - R: Read, -{ - fn read(&mut self, buf: &mut [u8]) -> Result { - if self.tail < self.head { - self.read_buffer(buf) - } else { - let offset = self.start + self.head as u64; - if offset < self.length { - self.fill_buffer(offset)?; - self.read_buffer(buf) - } else { - Ok(0) - } - } - } -} - -impl Seek for EncryptedReader -where - R: Read + Seek, -{ - fn seek(&mut self, pos: SeekFrom) -> Result { - let offset = match pos { - SeekFrom::Start(offset) => Some(offset), - SeekFrom::End(relative) => self.length.checked_add_signed(relative), - SeekFrom::Current(relative) => { - (self.start + self.tail as u64).checked_add_signed(relative) - } - } - .filter(|offset| *offset < u64::MAX - 36) - .ok_or(IoError::from(ErrorKind::InvalidInput))?; - if offset != self.start + self.tail as u64 { - self.reader.seek(SeekFrom::Start(offset / 16 * 16 + 36))?; - self.fill_buffer(offset)?; - } - 
Ok(offset) - } -} - -impl BufRead for EncryptedReader -where - R: Read + Seek, -{ - fn fill_buf(&mut self) -> std::io::Result<&[u8]> { - if self.tail >= self.head { - let offset = self.start + self.head as u64; - if offset < self.length { - self.fill_buffer(offset)?; - } - } - Ok(&self.buffer[self.tail..self.head]) - } - - fn consume(&mut self, amount: usize) { - self.tail += amount; - debug_assert!(self.tail <= self.head); - } -} - -const fn b(x: i32) -> u16 { - 1 << x -} - -static AH: [[u16; 2]; 4] = [ - [b(2), b(2) | b(3) | b(6) | b(7)], - [b(3), b(0) | b(1) | b(4) | b(5)], - [b(4) | b(7), b(8) | b(9) | b(12) | b(13)], - [b(5) | b(6), b(10) | b(11) | b(14) | b(15)], -]; - -static AL: [[u16; 2]; 4] = [ - [b(0) | b(3) | b(12) | b(15), b(0) | b(1) | b(4) | b(5)], - [b(1) | b(2) | b(13) | b(14), b(2) | b(3) | b(6) | b(7)], - [b(4) | b(7) | b(8) | b(11), b(8) | b(9) | b(12) | b(13)], - [b(5) | b(6) | b(9) | b(10), b(10) | b(11) | b(14) | b(15)], -]; - -static BH: [[u16; 2]; 4] = [ - [b(2), b(1) | b(3) | b(9) | b(11)], - [b(3), b(0) | b(2) | b(8) | b(10)], - [b(4) | b(7), b(4) | b(6) | b(12) | b(14)], - [b(5) | b(6), b(5) | b(7) | b(13) | b(15)], -]; - -static BL: [[u16; 2]; 4] = [ - [b(0) | b(3) | b(12) | b(15), b(0) | b(2) | b(8) | b(10)], - [b(1) | b(2) | b(13) | b(14), b(1) | b(3) | b(9) | b(11)], - [b(4) | b(7) | b(8) | b(11), b(4) | b(6) | b(12) | b(14)], - [b(5) | b(6) | b(9) | b(10), b(5) | b(7) | b(13) | b(15)], -]; - -fn decode_nibble(table: &[[u16; 2]; 4], nibble: u8) -> u16 { - for section in table.iter() { - if section[0] & (1 << nibble) != 0 { - return section[1]; - } - } - 0 -} - -fn find_1bit(x: u16) -> Option { - x.is_power_of_two().then(|| x.trailing_zeros() as u8) -} - -fn decode_pair(a: u8, b: u8) -> Option { - let x = find_1bit(decode_nibble(&AH, a >> 4) & decode_nibble(&BH, b >> 4))?; - let y = find_1bit(decode_nibble(&AL, a & 15) & decode_nibble(&BL, b & 15))?; - Some((x << 4) | y) -} - -fn encode_nibble(table: &[[u16; 2]; 4], nibble: u8) -> Vec 
{ - for section in table.iter() { - if section[1] & (1 << nibble) != 0 { - let mut outputs = Vec::with_capacity(4); - let mut bits = section[0]; - while bits != 0 { - outputs.push(bits.trailing_zeros() as u8); - bits &= bits - 1; - } - return outputs; - } - } - unreachable!() -} - -fn encode_byte(hi_table: &[[u16; 2]; 4], lo_table: &[[u16; 2]; 4], byte: u8) -> Vec { - let hi_variants = encode_nibble(hi_table, byte >> 4); - let lo_variants = encode_nibble(lo_table, byte & 15); - let mut variants = Vec::with_capacity(hi_variants.len() * lo_variants.len()); - for hi in hi_variants.iter().copied() { - for lo in lo_variants.iter().copied() { - let byte = (hi << 4) | lo; - if byte != 127 { - variants.push(byte as char); - } - } - } - variants -} - -/// An encoded password. -/// -/// SPSS calls these "encrypted passwords", but they are not encrypted. They -/// are encoded with a simple scheme, analogous to base64 encoding but -/// one-to-many: any plaintext password maps to many possible encoded passwords. -/// -/// The encoding scheme maps each plaintext password byte to 2 ASCII characters, -/// using only at most the first 10 bytes of the plaintext password. Thus, an -/// encoded password is always a multiple of 2 characters long, and never longer -/// than 20 characters. The characters in an encoded password are always in the -/// graphic ASCII range 33 through 126. Each successive pair of characters in -/// the password encodes a single byte in the plaintext password. -/// -/// This struct supports both encoding and decoding passwords. -#[derive(Clone, Debug)] -pub struct EncodedPassword(Vec>); - -impl EncodedPassword { - /// Creates an [EncodedPassword] from an already-encoded password `encoded`. - /// Returns `None` if `encoded` is not a valid encoded password. 
- pub fn from_encoded(encoded: &[u8]) -> Option { - if encoded.len() > 20 - || encoded.len() % 2 != 0 - || !encoded.iter().all(|byte| (32..=127).contains(byte)) - { - return None; - } - - Some(EncodedPassword( - encoded.iter().map(|byte| vec![*byte as char]).collect(), - )) - } - - /// Returns an [EncodedPassword] as an encoded version of the given - /// `plaintext` password. Only the first 10 bytes, at most, of the - /// plaintext password is used. - pub fn from_plaintext(plaintext: &[u8]) -> EncodedPassword { - let input = plaintext.get(..10).unwrap_or(plaintext); - EncodedPassword( - input - .iter() - .copied() - .flat_map(|byte| [encode_byte(&AH, &AL, byte), encode_byte(&BH, &BL, byte)]) - .collect(), - ) - } - - /// Returns the number of variations of this encoded password. - /// - /// An [EncodedPassword] created by [EncodedPassword::from_plaintext] has - /// many variations: between `16**n` and `32**n` for an `n`-byte plaintext - /// password, so up to `32**10` (about 1e15) for the 10-byte longest - /// plaintext passwords. - /// - /// An [EncodedPassword] created by [EncodedPassword::from_encoded] has only - /// a single variation, the one passed in by that function. - pub fn n_variants(&self) -> u64 { - self.0 - .iter() - .map(|variants| variants.len() as u64) - .product() - } - - /// Returns one variation of this encoded password, numbered `index`. All - /// variations decode the same way. - pub fn variant(&self, mut index: u64) -> String { - let mut output = String::with_capacity(20); - for variants in &self.0 { - let n = variants.len() as u64; - output.push(variants[(index % n) as usize]); - index /= n; - } - output - } - - /// Returns the decoded version of this encoded password. 
- pub fn decode(&self) -> SmallVec<[u8; 10]> { - let mut output = SmallVec::new(); - for [a, b] in self.0.as_chunks::<2>().0 { - output.push(decode_pair(a[0] as u8, b[0] as u8).unwrap()); - } - output - } -} - -#[cfg(test)] -mod test { - use std::{io::Cursor, path::Path}; - - use crate::crypto::{EncodedPassword, EncryptedFile, FileType}; - - fn test_decrypt(input_name: &Path, expected_name: &Path, password: &str, file_type: FileType) { - let input_filename = Path::new("src/crypto/testdata").join(input_name); - let input = std::fs::read(&input_filename).unwrap(); - let mut cursor = Cursor::new(&input); - let file = EncryptedFile::new(&mut cursor).unwrap(); - assert_eq!(file.file_type(), file_type); - let mut reader = file.unlock_literal(password.as_bytes()).unwrap(); - assert_eq!(reader.file_type(), file_type); - let mut actual = Vec::new(); - std::io::copy(&mut reader, &mut actual).unwrap(); - - let expected_filename = Path::new("src/crypto/testdata").join(expected_name); - let expected = std::fs::read(&expected_filename).unwrap(); - if actual != expected { - panic!(); - } - } - - #[test] - fn sys_file() { - test_decrypt( - Path::new("test-encrypted.sav"), - Path::new("test.sav"), - "pspp", - FileType::Data, - ); - } - - #[test] - fn syntax_file() { - test_decrypt( - Path::new("test-encrypted.sps"), - Path::new("test.sps"), - "password", - FileType::Syntax, - ); - } - - #[test] - fn spv_file() { - test_decrypt( - Path::new("test-encrypted.spv"), - Path::new("test.spv"), - "Password1", - FileType::Viewer, - ); - } - - #[test] - fn password_encoding() { - // Decode a few specific passwords. - assert_eq!( - EncodedPassword::from_encoded(b"-|") - .unwrap() - .decode() - .as_slice(), - b"b" - ); - assert_eq!( - EncodedPassword::from_encoded(b" A") - .unwrap() - .decode() - .as_slice(), - b"a" - ); - - // Check that the encoding and decoding algorithms are inverses - // for individual characters at least. 
- for plaintext in 0..=255 { - let encoded = EncodedPassword::from_plaintext(&[plaintext]); - for variant in 0..encoded.n_variants() { - let encoded_variant = encoded.variant(variant); - let decoded = EncodedPassword::from_encoded(encoded_variant.as_bytes()) - .unwrap() - .decode(); - assert_eq!(&[plaintext], decoded.as_slice()); - } - } - } -} diff --git a/rust/pspp/src/format.rs b/rust/pspp/src/format.rs new file mode 100644 index 0000000000..43ba5198b8 --- /dev/null +++ b/rust/pspp/src/format.rs @@ -0,0 +1,1390 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . 
+ +use std::{ + fmt::{Debug, Display, Formatter, Result as FmtResult, Write}, + ops::{Not, RangeInclusive}, + str::{Chars, FromStr}, + sync::LazyLock, +}; + +use chrono::{Datelike, Local}; +use enum_iterator::{all, Sequence}; +use enum_map::{Enum, EnumMap}; +use serde::{Deserialize, Serialize}; +use thiserror::Error as ThisError; +use unicode_width::UnicodeWidthStr; + +use crate::{ + data::{ByteString, Datum}, + sys::raw, + util::ToSmallString, + variable::{VarType, VarWidth}, +}; + +mod display; +mod parse; +pub use display::{DisplayDatum, DisplayPlain, DisplayPlainF64}; + +#[derive(Clone, ThisError, Debug, PartialEq, Eq)] +pub enum Error { + #[error("Unknown format type {value}.")] + UnknownFormat { value: u16 }, + + #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)] + OddWidthNotAllowed(UncheckedFormat), + + #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())] + BadWidth(UncheckedFormat), + + #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)] + DecimalsNotAllowedForFormat(UncheckedFormat), + + #[error("Output format {0} specifies {} decimal places, but with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)] + DecimalsNotAllowedForWidth(UncheckedFormat), + + #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)] + TooManyDecimalsForWidth { + spec: UncheckedFormat, + max_d: Decimals, + }, + + #[error("String variable is not compatible with numeric format {0}.")] + UnnamedVariableNotCompatibleWithNumericFormat(Type), + + #[error("Numeric variable is not compatible with string format {0}.")] + UnnamedVariableNotCompatibleWithStringFormat(Type), + + #[error("String variable {variable} with width {width} is not compatible with format 
{bad_spec}. Use format {good_spec} instead.")] + NamedStringVariableBadSpecWidth { + variable: String, + width: Width, + bad_spec: Format, + good_spec: Format, + }, + + #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")] + UnnamedStringVariableBadSpecWidth { + width: Width, + bad_spec: Format, + good_spec: Format, + }, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum Category { + // Numeric formats. + Basic, + Custom, + Legacy, + Binary, + Hex, + Date, + Time, + DateComponent, + + // String formats. + String, +} + +impl From for Category { + fn from(source: Type) -> Self { + match source { + Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic, + Type::CC(_) => Self::Custom, + Type::N | Type::Z => Self::Legacy, + Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary, + Type::PIBHex | Type::RBHex => Self::Hex, + Type::Date + | Type::ADate + | Type::EDate + | Type::JDate + | Type::SDate + | Type::QYr + | Type::MoYr + | Type::WkYr + | Type::DateTime + | Type::YmdHms => Self::Date, + Type::MTime | Type::Time | Type::DTime => Self::Time, + Type::WkDay | Type::Month => Self::DateComponent, + Type::A | Type::AHex => Self::String, + } + } +} + +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash, Sequence, Serialize)] +pub enum CC { + A, + B, + C, + D, + E, +} + +impl CC { + pub fn as_string(&self) -> &'static str { + match self { + CC::A => "A", + CC::B => "B", + CC::C => "C", + CC::D => "D", + CC::E => "E", + } + } +} + +impl Display for CC { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}", self.as_string()) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Sequence, Serialize)] +pub enum Type { + // Basic numeric formats. + F, + Comma, + Dot, + Dollar, + Pct, + E, + + // Custom currency formats. + CC(CC), + + // Legacy numeric formats. + N, + Z, + + // Binary and hexadecimal formats. 
+ P, + PK, + IB, + PIB, + PIBHex, + RB, + RBHex, + + // Time and date formats. + Date, + ADate, + EDate, + JDate, + SDate, + QYr, + MoYr, + WkYr, + DateTime, + YmdHms, + MTime, + Time, + DTime, + + // Date component formats. + WkDay, + Month, + + // String formats. + A, + AHex, +} + +pub type Width = u16; +pub type SignedWidth = i16; + +pub type Decimals = u8; + +impl Type { + pub fn max_width(self) -> Width { + match self { + Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16, + Self::IB | Self::PIB | Self::RB => 8, + Self::A => 32767, + Self::AHex => 32767 * 2, + _ => 40, + } + } + + pub fn min_width(self) -> Width { + match self { + // Basic numeric formats. + Self::F => 1, + Self::Comma => 1, + Self::Dot => 1, + Self::Dollar => 2, + Self::Pct => 2, + Self::E => 6, + + // Custom currency formats. + Self::CC(_) => 2, + + // Legacy numeric formats. + Self::N => 1, + Self::Z => 1, + + // Binary and hexadecimal formats. + Self::P => 1, + Self::PK => 1, + Self::IB => 1, + Self::PIB => 1, + Self::PIBHex => 2, + Self::RB => 2, + Self::RBHex => 4, + + // Time and date formats. + Self::Date => 9, + Self::ADate => 8, + Self::EDate => 8, + Self::JDate => 5, + Self::SDate => 8, + Self::QYr => 6, + Self::MoYr => 6, + Self::WkYr => 8, + Self::DateTime => 17, + Self::YmdHms => 16, + Self::MTime => 5, + Self::Time => 5, + Self::DTime => 8, + + // Date component formats. + Self::WkDay => 2, + Self::Month => 3, + + // String formats. 
+ Self::A => 1, + Self::AHex => 2, + } + } + + pub fn width_range(self) -> RangeInclusive { + self.min_width()..=self.max_width() + } + + pub fn max_decimals(self, width: Width) -> Decimals { + let width = width.clamp(1, 40) as SignedWidth; + let max = match self { + Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1, + Self::Dollar | Self::Pct => width - 2, + Self::E => width - 7, + Self::N | Self::Z => width, + Self::P => width * 2 - 1, + Self::PK => width * 2, + Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth, + Self::PIBHex => 0, + Self::RB | Self::RBHex => 16, + Self::Date + | Self::ADate + | Self::EDate + | Self::JDate + | Self::SDate + | Self::QYr + | Self::MoYr + | Self::WkYr => 0, + Self::DateTime => width - 21, + Self::YmdHms => width - 20, + Self::MTime => width - 6, + Self::Time => width - 9, + Self::DTime => width - 12, + Self::WkDay | Self::Month | Self::A | Self::AHex => 0, + }; + max.clamp(0, 16) as Decimals + } + + pub fn takes_decimals(self) -> bool { + self.max_decimals(Width::MAX) > 0 + } + + pub fn category(self) -> Category { + self.into() + } + + pub fn width_step(self) -> Width { + if self.category() == Category::Hex || self == Self::AHex { + 2 + } else { + 1 + } + } + + pub fn clamp_width(self, width: Width) -> Width { + let (min, max) = self.width_range().into_inner(); + let width = width.clamp(min, max); + if self.width_step() == 2 { + width / 2 * 2 + } else { + width + } + } + + pub fn var_type(self) -> VarType { + match self { + Self::A | Self::AHex => VarType::String, + _ => VarType::Numeric, + } + } + + /// Checks whether this format is valid for a variable with the given + /// `var_type`. 
+ pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> { + let my_type = self.var_type(); + match (my_type, var_type) { + (VarType::Numeric, VarType::String) => { + Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self)) + } + (VarType::String, VarType::Numeric) => { + Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self)) + } + _ => Ok(()), + } + } + + pub fn as_str(&self) -> &'static str { + match self { + Self::F => "F", + Self::Comma => "COMMA", + Self::Dot => "DOT", + Self::Dollar => "DOLLAR", + Self::Pct => "PCT", + Self::E => "E", + Self::CC(CC::A) => "CCA", + Self::CC(CC::B) => "CCB", + Self::CC(CC::C) => "CCC", + Self::CC(CC::D) => "CCD", + Self::CC(CC::E) => "CCE", + Self::N => "N", + Self::Z => "Z", + Self::P => "P", + Self::PK => "PK", + Self::IB => "IB", + Self::PIB => "PIB", + Self::PIBHex => "PIBHEX", + Self::RB => "RB", + Self::RBHex => "RBHEX", + Self::Date => "DATE", + Self::ADate => "ADATE", + Self::EDate => "EDATE", + Self::JDate => "JDATE", + Self::SDate => "SDATE", + Self::QYr => "QYR", + Self::MoYr => "MOYR", + Self::WkYr => "WKYR", + Self::DateTime => "DATETIME", + Self::YmdHms => "YMDHMS", + Self::MTime => "MTIME", + Self::Time => "TIME", + Self::DTime => "DTIME", + Self::WkDay => "WKDAY", + Self::Month => "MONTH", + Self::A => "A", + Self::AHex => "AHEX", + } + } + + pub fn default_value(&self) -> Datum { + match self.var_type() { + VarType::Numeric => Datum::sysmis(), + VarType::String => Datum::String(ByteString::default()), + } + } +} + +impl Display for Type { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}", self.as_str()) + } +} + +impl FromStr for Type { + type Err = (); + + fn from_str(s: &str) -> Result { + for type_ in all::() { + if type_.as_str().eq_ignore_ascii_case(s) { + return Ok(type_); + } + } + Err(()) + } +} + +fn max_digits_for_bytes(bytes: usize) -> usize { + *[0, 3, 5, 8, 10, 13, 15, 17].get(bytes).unwrap_or(&20) +} + +#[derive(Debug, PartialEq, Eq, Hash)] 
+pub struct AbstractFormat { + pub name: String, + w: Width, + d: Decimals, +} + +fn split(s: &str, predicate: F) -> (&str, &str) +where + F: Fn(&char) -> bool, +{ + let rest = s.trim_start_matches(|c| predicate(&c)); + let start = &s[..s.len() - rest.len()]; + (start, rest) +} + +impl FromStr for AbstractFormat { + type Err = (); + + fn from_str(s: &str) -> Result { + let (name, s) = split(s, char::is_ascii_alphabetic); + if name.is_empty() { + return Err(()); + } + + let (w, s) = split(s, char::is_ascii_digit); + let Ok(w) = w.parse() else { + return Err(()); + }; + + let (d, rest) = if let Some(s) = s.strip_prefix('.') { + let (d, rest) = split(s, char::is_ascii_digit); + let Ok(d) = d.parse() else { + return Err(()); + }; + (d, rest) + } else { + (0, s) + }; + + if !rest.is_empty() { + return Err(()); + } + Ok(Self { + name: name.into(), + w, + d, + }) + } +} + +impl TryFrom for UncheckedFormat { + type Error = (); + + fn try_from(value: AbstractFormat) -> Result { + Ok(UncheckedFormat::new(value.name.parse()?, value.w, value.d)) + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Format { + type_: Type, + w: Width, + d: Decimals, +} + +impl Serialize for Format { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.to_small_string::<16>().serialize(serializer) + } +} + +impl Format { + pub const F40: Format = Format { + type_: Type::F, + w: 40, + d: 0, + }; + + pub const F40_1: Format = Format { + type_: Type::F, + w: 40, + d: 1, + }; + + pub const F40_2: Format = Format { + type_: Type::F, + w: 40, + d: 2, + }; + + pub const F40_3: Format = Format { + type_: Type::F, + w: 40, + d: 3, + }; + + pub const PCT40_1: Format = Format { + type_: Type::Pct, + w: 40, + d: 1, + }; + + pub const F8_2: Format = Format { + type_: Type::F, + w: 8, + d: 2, + }; + + pub const DATETIME40_0: Format = Format { + type_: Type::DateTime, + w: 40, + d: 0, + }; + + pub fn type_(self) -> Type { + self.type_ + } + pub fn w(self) -> 
usize { + self.w as usize + } + pub fn d(self) -> usize { + self.d as usize + } + + pub fn new(type_: Type, w: Width, d: Decimals) -> Option { + UncheckedFormat { type_, w, d }.try_into().ok() + } + + pub fn default_for_width(var_width: VarWidth) -> Self { + match var_width { + VarWidth::Numeric => Format { + type_: Type::F, + w: 8, + d: 2, + }, + VarWidth::String(w) => Format { + type_: Type::A, + w, + d: 0, + }, + } + } + + pub fn fixed_from(source: &UncheckedFormat) -> Self { + let UncheckedFormat { + type_: format, + w, + d, + } = *source; + let (min, max) = format.width_range().into_inner(); + let mut w = w.clamp(min, max); + if d <= format.max_decimals(Width::MAX) { + while d > format.max_decimals(w) { + w += 1; + assert!(w <= 40); + } + } + let d = d.clamp(0, format.max_decimals(w)); + Self { + type_: format, + w, + d, + } + } + + pub fn var_width(self) -> VarWidth { + match self.type_ { + Type::A => VarWidth::String(self.w), + Type::AHex => VarWidth::String(self.w / 2), + _ => VarWidth::Numeric, + } + } + + pub fn var_type(self) -> VarType { + self.type_.var_type() + } + + /// Checks whether this format specification is valid for a variable with + /// width `var_width`. + pub fn check_width_compatibility(self, var_width: VarWidth) -> Result { + // Verify that the format is right for the variable's type. 
+ self.type_.check_type_compatibility(var_width.into())?; + + if let VarWidth::String(w) = var_width { + if var_width != self.var_width() { + let bad_spec = self; + let good_spec = if self.type_ == Type::A { + Format { w, ..self } + } else { + Format { w: w * 2, ..self } + }; + return Err(Error::UnnamedStringVariableBadSpecWidth { + width: w, + bad_spec, + good_spec, + }); + } + } + + Ok(self) + } + + pub fn default_value(&self) -> Datum { + match self.var_width() { + VarWidth::Numeric => Datum::sysmis(), + VarWidth::String(width) => Datum::String(ByteString::spaces(width as usize)), + } + } + + pub fn resize(&mut self, width: VarWidth) { + match (self.var_width(), width) { + (VarWidth::Numeric, VarWidth::Numeric) => {} + (VarWidth::String(_), VarWidth::String(new_width)) => { + self.w = if self.type_ == Type::AHex { + new_width * 2 + } else { + new_width + }; + } + _ => *self = Self::default_for_width(width), + } + } + + pub fn codepage_to_unicode(&mut self) { + let mut width = self.var_width(); + width.codepage_to_unicode(); + if let Some(width) = width.as_string_width() { + if self.type_ == Type::AHex { + self.w = width as u16 * 2; + } else { + self.w = width as u16; + } + } + } +} + +impl Debug for Format { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{self}") + } +} + +impl Display for Format { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}{}", self.type_, self.w)?; + if self.type_.takes_decimals() || self.d > 0 { + write!(f, ".{}", self.d)?; + } + Ok(()) + } +} + +impl TryFrom for Format { + type Error = Error; + + fn try_from(source: UncheckedFormat) -> Result { + let UncheckedFormat { + type_: format, + w, + d, + } = source; + let max_d = format.max_decimals(w); + if w % format.width_step() != 0 { + Err(Error::OddWidthNotAllowed(source)) + } else if !format.width_range().contains(&w) { + Err(Error::BadWidth(source)) + } else if d > max_d { + if format.takes_decimals() { + 
Err(Error::DecimalsNotAllowedForFormat(source)) + } else if max_d > 0 { + Err(Error::TooManyDecimalsForWidth { + spec: source, + max_d, + }) + } else { + Err(Error::DecimalsNotAllowedForWidth(source)) + } + } else { + Ok(Format { + type_: format, + w, + d, + }) + } + } +} + +impl From for u16 { + fn from(source: Type) -> Self { + match source { + Type::A => 1, + Type::AHex => 2, + Type::Comma => 3, + Type::Dollar => 4, + Type::F => 5, + Type::IB => 6, + Type::PIBHex => 7, + Type::P => 8, + Type::PIB => 9, + Type::PK => 10, + Type::RB => 11, + Type::RBHex => 12, + Type::Z => 15, + Type::N => 16, + Type::E => 17, + Type::Date => 20, + Type::Time => 21, + Type::DateTime => 22, + Type::ADate => 23, + Type::JDate => 24, + Type::DTime => 25, + Type::WkDay => 26, + Type::Month => 27, + Type::MoYr => 28, + Type::QYr => 29, + Type::WkYr => 30, + Type::Pct => 31, + Type::Dot => 32, + Type::CC(CC::A) => 33, + Type::CC(CC::B) => 34, + Type::CC(CC::C) => 35, + Type::CC(CC::D) => 36, + Type::CC(CC::E) => 37, + Type::EDate => 38, + Type::SDate => 39, + Type::MTime => 40, + Type::YmdHms => 41, + } + } +} + +impl TryFrom for Type { + type Error = Error; + + fn try_from(source: u16) -> Result { + match source { + 1 => Ok(Self::A), + 2 => Ok(Self::AHex), + 3 => Ok(Self::Comma), + 4 => Ok(Self::Dollar), + 5 => Ok(Self::F), + 6 => Ok(Self::IB), + 7 => Ok(Self::PIBHex), + 8 => Ok(Self::P), + 9 => Ok(Self::PIB), + 10 => Ok(Self::PK), + 11 => Ok(Self::RB), + 12 => Ok(Self::RBHex), + 15 => Ok(Self::Z), + 16 => Ok(Self::N), + 17 => Ok(Self::E), + 20 => Ok(Self::Date), + 21 => Ok(Self::Time), + 22 => Ok(Self::DateTime), + 23 => Ok(Self::ADate), + 24 => Ok(Self::JDate), + 25 => Ok(Self::DTime), + 26 => Ok(Self::WkDay), + 27 => Ok(Self::Month), + 28 => Ok(Self::MoYr), + 29 => Ok(Self::QYr), + 30 => Ok(Self::WkYr), + 31 => Ok(Self::Pct), + 32 => Ok(Self::Dot), + 33 => Ok(Self::CC(CC::A)), + 34 => Ok(Self::CC(CC::B)), + 35 => Ok(Self::CC(CC::C)), + 36 => Ok(Self::CC(CC::D)), + 37 => 
Ok(Self::CC(CC::E)), + 38 => Ok(Self::EDate), + 39 => Ok(Self::SDate), + 40 => Ok(Self::MTime), + 41 => Ok(Self::YmdHms), + _ => Err(Error::UnknownFormat { value: source }), + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct UncheckedFormat { + pub type_: Type, + + pub w: Width, + + pub d: Decimals, +} + +impl UncheckedFormat { + pub fn new(type_: Type, w: Width, d: Decimals) -> Self { + Self { type_, w, d } + } + pub fn fix(&self) -> Format { + Format::fixed_from(self) + } +} + +impl TryFrom for UncheckedFormat { + type Error = Error; + + fn try_from(raw: raw::records::RawFormat) -> Result { + let raw = raw.0; + let raw_format = (raw >> 16) as u16; + let format = raw_format.try_into()?; + let w = ((raw >> 8) & 0xff) as Width; + let d = (raw & 0xff) as Decimals; + Ok(Self { + type_: format, + w, + d, + }) + } +} + +impl Display for UncheckedFormat { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}{}", self.type_, self.w)?; + if self.type_.takes_decimals() || self.d > 0 { + write!(f, ".{}", self.d)?; + } + Ok(()) + } +} + +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Enum, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Decimal { + #[default] + Dot, + Comma, +} + +impl Decimal { + pub fn as_str(&self) -> &'static str { + match self { + Decimal::Dot => ".", + Decimal::Comma => ",", + } + } +} + +impl From for char { + fn from(value: Decimal) -> Self { + u8::from(value).into() + } +} + +impl From for u8 { + fn from(value: Decimal) -> Self { + match value { + Decimal::Dot => b'.', + Decimal::Comma => b',', + } + } +} + +impl TryFrom for Decimal { + type Error = (); + + fn try_from(c: char) -> Result { + match c { + '.' 
=> Ok(Self::Dot), + ',' => Ok(Self::Comma), + _ => Err(()), + } + } +} + +impl Not for Decimal { + type Output = Self; + + fn not(self) -> Self::Output { + match self { + Self::Dot => Self::Comma, + Self::Comma => Self::Dot, + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] +pub struct Epoch(pub i32); + +impl Epoch { + /// Applies the epoch to `year`: + /// + /// - If `year` is 2 digits (between 0 and 99, inclusive), returns it + /// converted to the correct year considering the epoch. + /// + /// - Otherwise, returns `year` unchanged. + pub fn apply(&self, year: i32) -> i32 { + match year { + 0..=99 => { + let century = self.0 / 100 * 100; + let offset = self.0 - century; + if year >= offset { + year + century + } else { + year + century + 100 + } + } + other => other, + } + } +} + +impl Default for Epoch { + fn default() -> Self { + static DEFAULT: LazyLock = LazyLock::new(|| Epoch(Local::now().year() - 69)); + *DEFAULT + } +} + +impl Display for Epoch { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{}", self.0) + } +} + +#[derive(Clone, Debug, Default, Serialize)] +pub struct Settings { + pub epoch: Epoch, + + /// Either `'.'` or `','`. + pub decimal: Decimal, + + /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5` + /// instead of `.5`)? + pub leading_zero: bool, + + /// Custom currency styles. 
+ pub ccs: EnumMap>>, +} + +#[derive(Copy, Clone, Enum)] +struct StyleParams { + decimal: Decimal, + leading_zero: bool, +} +impl From<&Settings> for StyleParams { + fn from(value: &Settings) -> Self { + Self { + decimal: value.decimal, + leading_zero: value.leading_zero, + } + } +} + +struct StyleSet(EnumMap); + +impl StyleSet { + fn new(f: impl Fn(StyleParams) -> NumberStyle) -> Self { + Self(EnumMap::from_fn(f)) + } + fn get(&self, settings: &Settings) -> &NumberStyle { + &self.0[settings.into()] + } +} + +impl Settings { + pub fn with_cc(mut self, cc: CC, style: NumberStyle) -> Self { + self.ccs[cc] = Some(Box::new(style)); + self + } + pub fn with_leading_zero(self, leading_zero: bool) -> Self { + Self { + leading_zero, + ..self + } + } + pub fn with_epoch(self, epoch: Epoch) -> Self { + Self { epoch, ..self } + } + pub fn number_style(&self, type_: Type) -> &NumberStyle { + static DEFAULT: LazyLock = + LazyLock::new(|| NumberStyle::new("", "", Decimal::Dot, None, false)); + + match type_ { + Type::F | Type::E => { + static F: LazyLock = LazyLock::new(|| { + StyleSet::new(|p| NumberStyle::new("", "", p.decimal, None, p.leading_zero)) + }); + F.get(self) + } + Type::Comma => { + static COMMA: LazyLock = LazyLock::new(|| { + StyleSet::new(|p| { + NumberStyle::new("", "", p.decimal, Some(!p.decimal), p.leading_zero) + }) + }); + COMMA.get(self) + } + Type::Dot => { + static DOT: LazyLock = LazyLock::new(|| { + StyleSet::new(|p| { + NumberStyle::new("", "", !p.decimal, Some(p.decimal), p.leading_zero) + }) + }); + DOT.get(self) + } + Type::Dollar => { + static DOLLAR: LazyLock = LazyLock::new(|| { + StyleSet::new(|p| NumberStyle::new("$", "", p.decimal, Some(!p.decimal), false)) + }); + DOLLAR.get(self) + } + Type::Pct => { + static PCT: LazyLock = LazyLock::new(|| { + StyleSet::new(|p| NumberStyle::new("", "%", p.decimal, None, false)) + }); + PCT.get(self) + } + Type::CC(cc) => self.ccs[cc].as_deref().unwrap_or(&DEFAULT), + Type::N + | Type::Z + | Type::P + | 
Type::PK + | Type::IB + | Type::PIB + | Type::PIBHex + | Type::RB + | Type::RBHex + | Type::Date + | Type::ADate + | Type::EDate + | Type::JDate + | Type::SDate + | Type::QYr + | Type::MoYr + | Type::WkYr + | Type::DateTime + | Type::YmdHms + | Type::MTime + | Type::Time + | Type::DTime + | Type::WkDay + | Type::Month + | Type::A + | Type::AHex => &DEFAULT, + } + } +} + +/// A numeric output style. This can express numeric formats in +/// [Category::Basic] and [Category::Custom]. +#[derive(Clone, Debug, Serialize)] +pub struct NumberStyle { + pub neg_prefix: Affix, + pub prefix: Affix, + pub suffix: Affix, + pub neg_suffix: Affix, + + /// Decimal point. + pub decimal: Decimal, + + /// Grouping character. + pub grouping: Option, + + /// Format as `.5` or `0.5`? + pub leading_zero: bool, + + /// An `Affix` may require more bytes than its display width; for example, + /// U+00A5 (¥) is 2 bytes in UTF-8 but occupies only one display column. + /// This member is the sum of the number of bytes required by all of the + /// `Affix` members in this struct, minus their display widths. Thus, it + /// can be used to size memory allocations: for example, the formatted + /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in + /// UTF-8. + #[serde(skip)] + pub extra_bytes: usize, +} + +impl Display for NumberStyle { + /// Display this number style in the format used for custom currency. + /// + /// This format can only accurately represent number styles that include a + /// grouping character. If this number style doesn't, it will pretend that + /// the grouping character is the opposite of the decimal point character. 
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + let grouping = char::from(!self.decimal); + write!( + f, + "{}{}{}{}{}{}{}", + self.neg_prefix.display(grouping), + grouping, + self.prefix.display(grouping), + grouping, + self.suffix.display(grouping), + grouping, + self.neg_suffix.display(grouping), + ) + } +} + +impl NumberStyle { + fn new( + prefix: &str, + suffix: &str, + decimal: Decimal, + grouping: Option, + leading_zero: bool, + ) -> Self { + // These assertions ensure that zero is correct for `extra_bytes`. + debug_assert!(prefix.is_ascii()); + debug_assert!(suffix.is_ascii()); + + Self { + neg_prefix: Affix::new("-"), + prefix: Affix::new(prefix), + suffix: Affix::new(suffix), + neg_suffix: Affix::new(""), + decimal, + grouping, + leading_zero, + extra_bytes: 0, + } + } + + fn affix_width(&self) -> usize { + self.prefix.width + self.suffix.width + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct Affix { + /// String contents of affix. + pub s: String, + + #[serde(skip)] + /// Display width in columns (see [unicode_width]) + pub width: usize, +} + +impl Affix { + fn new(s: impl Into) -> Self { + let s = s.into(); + Self { + width: s.width(), + s, + } + } + + fn extra_bytes(&self) -> usize { + self.s.len().checked_sub(self.width).unwrap() + } + + fn display(&self, escape: char) -> DisplayAffix<'_> { + DisplayAffix { + affix: self.s.as_str(), + escape, + } + } +} + +pub struct DisplayAffix<'a> { + affix: &'a str, + escape: char, +} + +impl Display for DisplayAffix<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + for c in self.affix.chars() { + if c == self.escape { + f.write_char('\'')?; + } + f.write_char(c)?; + } + Ok(()) + } +} + +impl FromStr for NumberStyle { + type Err = (); + + fn from_str(s: &str) -> Result { + fn find_separator(s: &str) -> Option { + // Count commas and periods. There must be exactly three of one or + // the other, except that an apostrophe escapes a following comma or + // period. 
+ let mut n_commas = 0; + let mut n_periods = 0; + let s = s.as_bytes(); + for i in 0..s.len() { + if i > 0 && s[i - 1] == b'\'' { + } else if s[i] == b',' { + n_commas += 1; + } else if s[i] == b'.' { + n_periods += 1; + } + } + + if n_commas == 3 && n_periods != 3 { + Some(',') + } else if n_periods == 3 && n_commas != 3 { + Some('.') + } else { + None + } + } + + fn take_cc_token(iter: &mut Chars<'_>, grouping: char) -> Affix { + let mut s = String::new(); + let mut quote = false; + for c in iter { + if c == '\'' && !quote { + quote = true; + } else if c == grouping && !quote { + break; + } else { + s.push(c); + quote = false; + } + } + Affix::new(s) + } + + let Some(grouping) = find_separator(s) else { + return Err(()); + }; + let mut iter = s.chars(); + let neg_prefix = take_cc_token(&mut iter, grouping); + let prefix = take_cc_token(&mut iter, grouping); + let suffix = take_cc_token(&mut iter, grouping); + let neg_suffix = take_cc_token(&mut iter, grouping); + let grouping: Decimal = grouping.try_into().unwrap(); + let decimal = !grouping; + let extra_bytes = neg_prefix.extra_bytes() + + prefix.extra_bytes() + + suffix.extra_bytes() + + neg_suffix.extra_bytes(); + Ok(Self { + neg_prefix, + prefix, + suffix, + neg_suffix, + decimal, + grouping: Some(grouping), + leading_zero: false, + extra_bytes, + }) + } +} + +/// An item within a [DateTemplate]. +pub struct TemplateItem { + /// Character in the template. + pub c: char, + + /// Number of repetitions of the character. + pub n: usize, +} + +/// A template for date and time formats. +#[derive(Clone)] +pub struct DateTemplate(&'static str); + +impl DateTemplate { + /// Returns a [DateTemplate] used for date and time input and output in a + /// field of the given `type_` and `width`. 
+ /// + /// `width` only affects whether a 2-digit year or a 4-digit year is used, + /// that is, whether the returned string contains `yy` or `yyyy`, and + /// whether seconds are included, that is, whether the returned string + /// contains `:SS`. A caller that doesn't care whether the returned string + /// contains `yy` or `yyyy` or `:SS` can just specify 0 to omit them. + pub fn new(type_: Type, width: usize) -> Option { + let (short, long) = match type_ { + Type::F + | Type::Comma + | Type::Dot + | Type::Dollar + | Type::Pct + | Type::E + | Type::CC(_) + | Type::N + | Type::Z + | Type::P + | Type::PK + | Type::IB + | Type::PIB + | Type::PIBHex + | Type::RB + | Type::RBHex + | Type::WkDay + | Type::Month + | Type::A + | Type::AHex => return None, + Type::Date => ("dd-mmm-yy", "dd-mmm-yyyy"), + Type::ADate => ("mm/dd/yy", "mm/dd/yyyy"), + Type::EDate => ("dd.mm.yy", "dd.mm.yyyy"), + Type::JDate => ("yyddd", "yyyyddd"), + Type::SDate => ("yy/mm/dd", "yyyy/mm/dd"), + Type::QYr => ("q Q yy", "q Q yyyy"), + Type::MoYr => ("mmm yy", "mmm yyyy"), + Type::WkYr => ("ww WK yy", "ww WK yyyy"), + Type::DateTime => ("dd-mmm-yyyy HH:MM", "dd-mmm-yyyy HH:MM:SS"), + Type::YmdHms => ("yyyy-mm-dd HH:MM", "yyyy-mm-dd HH:MM:SS"), + Type::MTime => ("MM", "MM:SS"), + Type::Time => ("HH:MM", "HH:MM:SS"), + Type::DTime => ("D HH:MM", "D HH:MM:SS"), + }; + if width >= long.len() { + Some(DateTemplate(long)) + } else { + Some(DateTemplate(short)) + } + } + + pub fn for_format(format: Format) -> Option { + Self::new(format.type_(), format.w()) + } + + #[allow(clippy::len_without_is_empty)] + pub fn len(&self) -> usize { + self.0.len() + } +} + +impl Iterator for DateTemplate { + type Item = TemplateItem; + + fn next(&mut self) -> Option { + let mut iter = self.0.chars(); + let c = iter.next()?; + self.0 = iter.as_str(); + let mut n = 1; + while iter.next() == Some(c) { + self.0 = iter.as_str(); + n += 1; + } + Some(TemplateItem { c, n }) + } +} + +#[cfg(test)] +mod tests { + use 
crate::format::{Format, Type, Width}; + + #[test] + fn codepage_to_unicode() { + fn check_format(input: Format, expected_width: Width) { + let mut output = input; + output.codepage_to_unicode(); + let expected = Format::new(input.type_, expected_width, input.d).unwrap(); + assert_eq!(output, expected); + } + check_format(Format::new(Type::A, 1, 0).unwrap(), 3); + check_format(Format::new(Type::A, 2, 0).unwrap(), 6); + check_format(Format::new(Type::A, 3, 0).unwrap(), 9); + check_format(Format::new(Type::A, 1000, 0).unwrap(), 3000); + check_format(Format::new(Type::A, 20000, 0).unwrap(), 32767); + + check_format(Format::new(Type::AHex, 2, 0).unwrap(), 6); + check_format(Format::new(Type::AHex, 4, 0).unwrap(), 12); + check_format(Format::new(Type::AHex, 6, 0).unwrap(), 18); + check_format(Format::new(Type::AHex, 2000, 0).unwrap(), 6000); + check_format(Format::new(Type::AHex, 20000, 0).unwrap(), 60000); + check_format(Format::new(Type::AHex, 30000, 0).unwrap(), 65534); + + check_format(Format::new(Type::F, 40, 0).unwrap(), 40); + } +} diff --git a/rust/pspp/src/format/display.rs b/rust/pspp/src/format/display.rs new file mode 100644 index 0000000000..5b3bbe2ccc --- /dev/null +++ b/rust/pspp/src/format/display.rs @@ -0,0 +1,1197 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see <https://www.gnu.org/licenses/>. 
+ +use std::{ + cmp::min, + fmt::{Display, Error as FmtError, Formatter, Result as FmtResult, Write as _}, + io::{Error as IoError, Write as IoWrite}, + str::from_utf8_unchecked, +}; + +use binrw::Endian; +use chrono::{Datelike, NaiveDate}; +use encoding_rs::{Encoding, UTF_8}; +use libm::frexp; +use smallstr::SmallString; +use smallvec::{Array, SmallVec}; + +use crate::{ + calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, + data::{ByteStr, Datum, EncodedString, QuotedDatum, WithEncoding}, + endian::ToBytes, + format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, + settings::{EndianSettings, Settings as PsppSettings}, + util::ToSmallString, +}; + +pub struct DisplayDatum<'b, B> { + format: Format, + settings: &'b Settings, + endian: EndianSettings, + datum: Datum, + + /// If true, the output will remove leading and trailing spaces from numeric + /// values, and trailing spaces from string values. (This might make the + /// output narrower than the requested width.) + trim_spaces: bool, + + /// If true, the output will include a double quote before and after string + /// values. 
+ quote_strings: bool, +} + +#[cfg(test)] +mod test; + +pub trait DisplayPlain { + fn display_plain(&self) -> DisplayPlainF64; +} + +impl DisplayPlain for f64 { + fn display_plain(&self) -> DisplayPlainF64 { + DisplayPlainF64 { + value: *self, + decimal: '.', + } + } +} + +pub struct DisplayPlainF64 { + pub value: f64, + pub decimal: char, +} + +impl DisplayPlainF64 { + pub fn with_decimal(self, decimal: char) -> Self { + Self { decimal, ..self } + } +} + +impl Display for DisplayPlainF64 { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + struct Inner(f64); + + impl Display for Inner { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + let value = self.0; + if (value.abs() < 0.0005 && value != 0.0) || value.abs() > 1e15 { + // Print numbers that would otherwise have lots of leading or + // trailing zeros in scientific notation with full precision. + write!(f, "{value:.e}") + } else if value == value.trunc() { + // Print integers without decimal places. + write!(f, "{value:.0}") + } else { + // Print other numbers with full precision. + write!(f, "{value:.}") + } + } + } + + match self.decimal { + '.' => write!(f, "{}", Inner(self.value)), + _ => { + let tmp = Inner(self.value).to_small_string::<64>(); + if let Some(position) = tmp.find('.') { + f.write_str(&tmp[..position])?; + f.write_char(self.decimal)?; + f.write_str(&tmp[position + 1..]) + } else { + f.write_str(&tmp) + } + } + } + } +} + +impl<'a, D> Datum +where + D: EncodedString, +{ + /// Returns an object that implements [Display] for printing this [Datum] as + /// `format`. 
+ /// + /// [Display]: std::fmt::Display + pub fn display(&'a self, format: Format) -> DisplayDatum<'a, WithEncoding<&'a ByteStr>> { + DisplayDatum::new(format, self.as_borrowed()) + } + + pub fn display_plain(&self) -> QuotedDatum<'_, D> { + self.quoted() + } +} + +impl<'b, B> Display for DisplayDatum<'b, B> +where + B: EncodedString, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + let number = match &self.datum { + Datum::Number(number) => *number, + Datum::String(string) => { + if self.format.type_() == Type::AHex { + for byte in string.raw_string_bytes() { + write!(f, "{byte:02x}")?; + } + } else { + let quote = if self.quote_strings { "\"" } else { "" }; + let s = string.as_str(); + let s = if self.trim_spaces { + s.trim_end_matches(' ') + } else { + &s + }; + write!(f, "{quote}{s}{quote}")?; + } + return Ok(()); + } + }; + + let Some(number) = number else { + return self.missing(f); + }; + + match self.format.type_() { + Type::F + | Type::Comma + | Type::Dot + | Type::Dollar + | Type::Pct + | Type::E + | Type::CC(_) => self.number(f, number), + Type::N => self.n(f, number), + Type::Z => self.z(f, number), + + Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => self.fmt_binary(f), + + Type::PIBHex => self.pibhex(f, number), + Type::RBHex => self.rbhex(f, number), + Type::Date + | Type::ADate + | Type::EDate + | Type::JDate + | Type::SDate + | Type::QYr + | Type::MoYr + | Type::WkYr + | Type::DateTime + | Type::YmdHms + | Type::MTime + | Type::Time + | Type::DTime + | Type::WkDay => self.date(f, number), + Type::Month => self.month(f, number), + Type::A | Type::AHex => unreachable!(), + } + } +} + +impl<'b, B> DisplayDatum<'b, B> +where + B: EncodedString, +{ + pub fn new(format: Format, datum: Datum) -> Self { + let settings = PsppSettings::global(); + Self { + format, + datum, + settings: &settings.formats, + endian: settings.endian, + trim_spaces: false, + quote_strings: false, + } + } + pub fn with_settings(self, settings: &'b Settings) -> 
Self { + Self { settings, ..self } + } + pub fn with_endian(self, endian: EndianSettings) -> Self { + Self { endian, ..self } + } + pub fn with_trimming(self) -> Self { + Self { + trim_spaces: true, + ..self + } + } + pub fn with_quoted_string(self) -> Self { + Self { + quote_strings: true, + ..self + } + } + fn fmt_binary(&self, f: &mut Formatter) -> FmtResult { + let output = self.to_binary().unwrap(); + for b in output { + f.write_char(b as char)?; + } + Ok(()) + } + fn number(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { + if number.is_finite() { + let style = self.settings.number_style(self.format.type_); + if self.format.type_ != Type::E && number.abs() < 1.5 * power10(self.format.w()) { + let rounder = Rounder::new(style, number, self.format.d); + if self.decimal(f, &rounder, style, true)? + || self.scientific(f, number, style, true)? + || self.decimal(f, &rounder, style, false)? + { + return Ok(()); + } + } + + if !self.scientific(f, number, style, false)? { + self.overflow(f)?; + } + Ok(()) + } else { + self.infinite(f, number) + } + } + + fn infinite(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { + if self.format.w >= 3 { + let s = if number.is_nan() { + "NaN" + } else if number.is_infinite() { + if number.is_sign_positive() { + "+Infinity" + } else { + "-Infinity" + } + } else { + "Unknown" + }; + let w = if self.trim_spaces { 0 } else { self.format.w() }; + write!(f, "{s:>w$.w$}") + } else { + self.overflow(f) + } + } + + fn missing(&self, f: &mut Formatter<'_>) -> FmtResult { + match self.format.type_ { + Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => return self.fmt_binary(f), + Type::RBHex => return self.rbhex(f, -f64::MAX), + _ => (), + } + + if self.trim_spaces { + return write!(f, "."); + } + + let w = self.format.w() as isize; + let d = self.format.d() as isize; + let dot_position = match self.format.type_ { + Type::N => w - 1, + Type::Pct => w - d - 2, + Type::E => w - d - 5, + _ => w - d - 1, + }; + let 
dot_position = dot_position.max(0) as u16; + + for i in 0..self.format.w { + if i == dot_position { + write!(f, ".")?; + } else { + write!(f, " ")?; + } + } + Ok(()) + } + + fn overflow(&self, f: &mut Formatter<'_>) -> FmtResult { + if self.trim_spaces { + write!(f, "*")?; + } else { + for _ in 0..self.format.w { + write!(f, "*")?; + } + } + Ok(()) + } + + fn decimal( + &self, + f: &mut Formatter<'_>, + rounder: &Rounder, + style: &NumberStyle, + require_affixes: bool, + ) -> Result { + for decimals in (0..=self.format.d).rev() { + // Make sure there's room for the number's magnitude, plus the + // negative suffix, plus (if negative) the negative prefix. + let RounderWidth { + mut width, + integer_digits, + negative, + } = rounder.width(decimals as usize); + width += style.neg_suffix.width; + if negative { + width += style.neg_prefix.width; + } + if width > self.format.w() { + continue; + } + + // If there's room for the prefix and suffix, allocate + // space. If the affixes are required, but there's no + // space, give up. + let add_affixes = allocate_space(style.affix_width(), self.format.w(), &mut width); + if !add_affixes && require_affixes { + continue; + } + + // Check whether we should include grouping characters. We need + // room for a complete set or we don't insert any at all. We don't + // include grouping characters if decimal places were requested but + // they were all dropped. + let grouping = style.grouping.filter(|_| { + integer_digits > 3 + && (self.format.d == 0 || decimals > 0) + && allocate_space((integer_digits - 1) / 3, self.format.w(), &mut width) + }); + + // Assemble number. 
+ let magnitude = rounder.format(decimals as usize); + let mut output = SmallString::<[u8; 40]>::new(); + if !self.trim_spaces { + for _ in width..self.format.w() { + output.push(' '); + } + } + if negative { + output.push_str(&style.neg_prefix.s); + } + if add_affixes { + output.push_str(&style.prefix.s); + } + if let Some(grouping) = grouping { + for (i, digit) in magnitude[..integer_digits].bytes().enumerate() { + if i > 0 && (integer_digits - i) % 3 == 0 { + output.push(grouping.into()); + } + output.push(digit as char); + } + } else { + output.push_str(&magnitude[..integer_digits]); + } + if decimals > 0 { + output.push(style.decimal.into()); + let s = &magnitude[integer_digits + 1..]; + output.push_str(&s[..decimals as usize]); + } + if add_affixes { + output.push_str(&style.suffix.s); + } + if negative { + output.push_str(&style.neg_suffix.s); + } else { + for _ in 0..style.neg_suffix.width { + output.push(' '); + } + } + + debug_assert!(self.trim_spaces || output.len() >= self.format.w()); + debug_assert!(output.len() <= self.format.w() + style.extra_bytes); + f.write_str(&output)?; + return Ok(true); + } + Ok(false) + } + + fn scientific( + &self, + f: &mut Formatter<'_>, + number: f64, + style: &NumberStyle, + require_affixes: bool, + ) -> Result { + // Allocate minimum required space. + let mut width = 6 + style.neg_suffix.width; + if number < 0.0 { + width += style.neg_prefix.width; + } + if width > self.format.w() { + return Ok(false); + } + + // Check for room for prefix and suffix. + let add_affixes = allocate_space(style.affix_width(), self.format.w(), &mut width); + if require_affixes && !add_affixes { + return Ok(false); + } + + // Figure out number of characters we can use for the fraction, if any. + // (If that turns out to be `1`, then we'll output a decimal point + // without any digits following.) 
+ let mut fraction_width = min(self.format.d as usize + 1, self.format.w() - width).min(16); + if self.format.type_ != Type::E && fraction_width == 1 { + fraction_width = 0; + } + width += fraction_width; + + let mut output = SmallString::<[u8; 40]>::new(); + if !self.trim_spaces { + for _ in width..self.format.w() { + output.push(' '); + } + } + if number < 0.0 { + output.push_str(&style.neg_prefix.s); + } + if add_affixes { + output.push_str(&style.prefix.s); + } + write!( + &mut output, + "{:.*E}", + fraction_width.saturating_sub(1), + number.abs() + ) + .unwrap(); + if fraction_width == 1 { + // Insert `.` before the `E`, to get a value like "1.E+000". + output.insert(output.find('E').unwrap(), '.'); + } + + // Rust always uses `.` as the decimal point. Translate to `,` if + // necessary. + if style.decimal == Decimal::Comma { + fix_decimal_point(&mut output); + } + + // Make exponent have exactly three digits, plus sign. + let e = output.as_bytes().iter().position(|c| *c == b'E').unwrap(); + let exponent: isize = output[e + 1..].parse().unwrap(); + if exponent.abs() > 999 { + return Ok(false); + } + output.truncate(e + 1); + write!(&mut output, "{exponent:+04}").unwrap(); + + // Add suffixes. 
+ if add_affixes { + output.push_str(&style.suffix.s); + } + if number.is_sign_negative() { + output.push_str(&style.neg_suffix.s); + } else { + for _ in 0..style.neg_suffix.width { + output.push(' '); + } + } + + println!( + "{} for {number} width={width} fraction_width={fraction_width}: {output:?}", + self.format + ); + debug_assert!(self.trim_spaces || output.len() >= self.format.w()); + debug_assert!(output.len() <= self.format.w() + style.extra_bytes); + f.write_str(&output)?; + Ok(true) + } + + fn n(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { + if number < 0.0 { + return self.missing(f); + } + + let legacy = LegacyFormat::new(number, self.format.d()); + let w = self.format.w(); + let len = legacy.len(); + if len > w { + self.overflow(f) + } else { + write!(f, "{}{legacy}", Zeros(w.saturating_sub(len))) + } + } + + fn z(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { + let legacy = LegacyFormat::new(number, self.format.d()); + let w = self.format.w(); + let len = legacy.len(); + if len > w { + self.overflow(f) + } else { + let mut s = legacy.to_small_string::<40>(); + if number < 0.0 { + if let Some(last) = s.pop() { + let last = last.to_digit(10).unwrap(); + s.push(b"}JKLMNOPQR"[last as usize] as char); + } + } + write!(f, "{}{s}", Zeros(w.saturating_sub(len))) + } + } + + fn pibhex(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { + if number < 0.0 { + self.overflow(f) + } else { + let number = number.round(); + if number >= power256(self.format.w / 2) { + self.overflow(f) + } else { + let binary = integer_to_binary(number as u64, self.format.w / 2); + output_hex(f, &binary) + } + } + } + + fn rbhex(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { + let rb = self.rb(Some(number), self.format.w() / 2); + output_hex(f, &rb) + } + + fn date(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { + const MINUTE: f64 = 60.0; + const HOUR: f64 = 60.0 * 60.0; + const DAY: f64 = 60.0 * 60.0 * 24.0; + + let (date, mut time) 
= match self.format.type_.category() { + Category::Date => { + if number < 0.0 { + return self.missing(f); + } + let Some(date) = calendar_offset_to_gregorian(number / DAY) else { + return self.missing(f); + }; + (date, number % DAY) + } + Category::Time => (NaiveDate::MIN, number), + _ => unreachable!(), + }; + + let mut output = SmallString::<[u8; 40]>::new(); + for TemplateItem { c, n } in DateTemplate::for_format(self.format).unwrap() { + match c { + 'd' if n < 3 => write!(&mut output, "{:02}", date.day()).unwrap(), + 'd' => write!(&mut output, "{:03}", day_of_year(date).unwrap_or(1)).unwrap(), + 'm' if n < 3 => write!(&mut output, "{:02}", date.month()).unwrap(), + 'm' => write!(&mut output, "{}", short_month_name(date.month()).unwrap()).unwrap(), + 'y' if n >= 4 => { + let year = date.year(); + if year <= 9999 { + write!(&mut output, "{year:04}").unwrap(); + } else if self.format.type_ == Type::DateTime + || self.format.type_ == Type::YmdHms + { + write!(&mut output, "****").unwrap(); + } else { + return self.overflow(f); + } + } + 'y' => { + let epoch = self.settings.epoch.0; + let offset = date.year() - epoch; + if !(0..=99).contains(&offset) { + return self.overflow(f); + } + write!(&mut output, "{:02}", date.year().abs() % 100).unwrap(); + } + 'q' => write!(&mut output, "{}", date.month0() / 3 + 1).unwrap(), + 'w' => write!( + &mut output, + "{:2}", + (day_of_year(date).unwrap_or(1) - 1) / 7 + 1 + ) + .unwrap(), + 'D' => { + if time < 0.0 { + output.push('-'); + } + time = time.abs(); + write!(&mut output, "{:1$.0}", (time / DAY).floor(), n).unwrap(); + time %= DAY; + } + 'H' => { + if time < 0.0 { + output.push('-'); + } + time = time.abs(); + write!(&mut output, "{:01$.0}", (time / HOUR).floor(), n).unwrap(); + time %= HOUR; + } + 'M' => { + if time < 0.0 { + output.push('-'); + } + time = time.abs(); + write!(&mut output, "{:02.0}", (time / MINUTE).floor()).unwrap(); + time %= MINUTE; + + let excess_width = self.format.w() as isize - output.len() as 
isize; + if excess_width < 0 || (self.format.type_ == Type::MTime && excess_width < 3) { + return self.overflow(f); + } + if excess_width == 3 + || excess_width == 4 + || (excess_width >= 5 && self.format.d == 0) + { + write!(&mut output, ":{:02.0}", time.floor()).unwrap(); + } else if excess_width >= 5 { + let d = min(self.format.d(), excess_width as usize - 4); + let w = d + 3; + write!(&mut output, ":{time:0w$.d$}").unwrap(); + if self.settings.decimal == Decimal::Comma { + fix_decimal_point(&mut output); + } + } + break; + } + c if n == 1 => output.push(c), + _ => unreachable!(), + } + } + if !self.trim_spaces { + write!(f, "{:>1$}", &output, self.format.w()) + } else { + f.write_str(&output) + } + } + + fn month(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { + if let Some(month) = month_name(number as u32) { + if !self.trim_spaces { + write!(f, "{month:.*}", self.format.w()) + } else { + f.write_str(month) + } + } else { + self.missing(f) + } + } + + /// Writes this object to `w`. Writes binary formats ([Type::P], + /// [Type::PIB], and so on) as binary values, and writes other output + /// formats in the given `encoding`. + /// + /// If `dv` is a [DisplayDatum], the difference between `write!(f, "{}", + /// dv)` and `dv.write(f, encoding)` is: + /// + /// * `write!` always outputs UTF-8. Binary formats are encoded as the + /// Unicode characters corresponding to their bytes. + /// + /// * `dv.write` outputs the desired `encoding`. Binary formats are not + /// encoded in `encoding` (and thus they might be invalid for the + /// encoding). 
+ pub fn write(&self, mut w: W, encoding: &'static Encoding) -> Result<(), IoError> + where + W: IoWrite, + { + match self.to_binary() { + Some(binary) => w.write_all(&binary), + None if encoding == UTF_8 => { + write!(&mut w, "{self}") + } + None => w.write_all(&encoding.encode(&self.to_small_string::<64>()).0), + } + } + + fn to_binary(&self) -> Option> { + let number = self.datum.as_number()?; + match self.format.type_() { + Type::P => Some(self.p(number)), + Type::PK => Some(self.pk(number)), + Type::IB => Some(self.ib(number)), + Type::PIB => Some(self.pib(number)), + Type::RB => Some(self.rb(number, self.format.w())), + _ => None, + } + } + + fn bcd(&self, number: Option, digits: usize) -> (bool, SmallVec<[u8; 16]>) { + let legacy = LegacyFormat::new(number.unwrap_or_default(), self.format.d()); + let len = legacy.len(); + + let mut output = SmallVec::new(); + if len > digits { + output.resize(digits.div_ceil(2), 0); + (false, output) + } else { + let mut decimal = SmallString::<[u8; 16]>::new(); + write!( + &mut decimal, + "{}{legacy}", + Zeros(digits.saturating_sub(len)) + ) + .unwrap(); + + let mut src = decimal.bytes(); + for _ in 0..digits / 2 { + let d0 = src.next().unwrap() - b'0'; + let d1 = src.next().unwrap() - b'0'; + output.push((d0 << 4) + d1); + } + if digits % 2 != 0 { + let d = src.next().unwrap() - b'0'; + output.push(d << 4); + } + (true, output) + } + } + + fn p(&self, number: Option) -> SmallVec<[u8; 16]> { + let (valid, mut output) = self.bcd(number, self.format.w() * 2 - 1); + if valid && number.is_some_and(|number| number < 0.0) { + *output.last_mut().unwrap() |= 0xd; + } else { + *output.last_mut().unwrap() |= 0xf; + } + output + } + + fn pk(&self, number: Option) -> SmallVec<[u8; 16]> { + let number = match number { + Some(number) if number < 0.0 => None, + other => other, + }; + let (_valid, output) = self.bcd(number, self.format.w() * 2); + output + } + + fn ib(&self, number: Option) -> SmallVec<[u8; 16]> { + let number = 
number.map_or(0.0, |number| (number * power10(self.format.d())).round()); + let number = if number >= power256(self.format.w) / 2.0 - 1.0 + || number < -power256(self.format.w) / 2.0 + { + 0.0 + } else { + number + }; + let integer = number.abs() as u64; + let integer = if number < 0.0 { + (-(integer as i64)) as u64 + } else { + integer + }; + endian_to_smallvec(self.endian.output, integer, self.format.w()) + } + + fn pib(&self, number: Option) -> SmallVec<[u8; 16]> { + let number = number.map_or(0.0, |number| (number * power10(self.format.d())).round()); + let number = if number >= power256(self.format.w) || number < 0.0 { + 0.0 + } else { + number + }; + let integer = number.abs() as u64; + endian_to_smallvec(self.endian.output, integer, self.format.w()) + } + + fn rb(&self, number: Option, w: usize) -> SmallVec<[u8; 16]> { + let number = number.unwrap_or(-f64::MAX); + let bytes: [u8; 8] = self.endian.output.to_bytes(number); + let mut vec = SmallVec::new(); + vec.extend_from_slice(&bytes); + vec.resize(w, 0); + vec + } +} + +struct LegacyFormat { + s: SmallVec<[u8; 40]>, + trailing_zeros: usize, +} + +impl LegacyFormat { + fn new(number: f64, d: usize) -> Self { + let mut s = SmallVec::<[u8; 40]>::new(); + write!(&mut s, "{:E}", number.abs()).unwrap(); + debug_assert!(s.is_ascii()); + + // Parse exponent. + // + // Add 1 because of the transformation we will do just below, and `d` so + // that we just need to round to the nearest integer. + let e_index = s.iter().position(|c| *c == b'E').unwrap(); + let mut exponent = unsafe { from_utf8_unchecked(&s[e_index + 1..]) } + .parse::() + .unwrap() + + 1 + + d as i32; + + // Transform `1.234E56` into `1234`. + if e_index == 1 { + // No decimals, e.g. `1E4` or `0E0`. + s.truncate(1) + } else { + s.remove(1); + s.truncate(e_index - 1); + }; + debug_assert!(s.iter().all(|c| c.is_ascii_digit())); + + if exponent >= 0 && exponent < s.len() as i32 { + // The first `exponent` digits are before the decimal point. 
We + // need to round off there. + let exp = exponent as usize; + + fn round_up(digits: &mut [u8], position: usize) -> bool { + for index in (0..position).rev() { + match digits[index] { + b'0'..=b'8' => { + digits[index] += 1; + return true; + } + b'9' => { + digits[index] = b'0'; + } + _ => unreachable!(), + } + } + false + } + + if s[exp] >= b'5' && !round_up(&mut s, exp) { + s.clear(); + s.push(b'1'); + exponent += 1; + } + } + + let exponent = exponent.max(0) as usize; + s.truncate(exponent); + s.resize(exponent, b'0'); + let trailing_zeros = exponent.saturating_sub(s.len()); + Self { s, trailing_zeros } + } + fn s(&self) -> &str { + unsafe { from_utf8_unchecked(&self.s) } + } + fn len(&self) -> usize { + self.s.len() + self.trailing_zeros + } +} + +impl Display for LegacyFormat { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{}{}", self.s(), Zeros(self.trailing_zeros)) + } +} + +struct Zeros(usize); + +impl Display for Zeros { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + let mut n = self.0; + while n > 0 { + static ZEROS: &str = "0000000000000000000000000000000000000000"; + let chunk = n.min(ZEROS.len()); + f.write_str(&ZEROS[..chunk])?; + n -= chunk; + } + Ok(()) + } +} + +fn integer_to_binary(number: u64, width: u16) -> SmallVec<[u8; 8]> { + let bytes = (number << ((8 - width) * 8)).to_be_bytes(); + SmallVec::from_slice(&bytes[..width as usize]) +} + +fn output_hex(f: &mut Formatter<'_>, bytes: &[u8]) -> FmtResult { + for byte in bytes { + write!(f, "{byte:02X}")?; + } + Ok(()) +} + +fn allocate_space(want: usize, capacity: usize, used: &mut usize) -> bool { + if *used + want <= capacity { + *used += want; + true + } else { + false + } +} + +/// A representation of a number that can be quickly rounded to any desired +/// number of decimal places (up to a specified maximum). +#[derive(Debug)] +struct Rounder { + /// Magnitude of number with excess precision. 
+ string: SmallString<[u8; 40]>, + + /// Number of digits before decimal point. + integer_digits: usize, + + /// Number of `9`s or `.`s at start of string. + leading_nines: usize, + + /// Number of `0`s or `.`s at start of string. + leading_zeros: usize, + + /// Is the number negative? + negative: bool, +} + +impl Rounder { + fn new(style: &NumberStyle, number: f64, max_decimals: u8) -> Self { + debug_assert!(number.abs() < 1e41); + debug_assert!((0..=16).contains(&max_decimals)); + + let mut string = SmallString::new(); + if max_decimals == 0 { + // Fast path. No rounding needed. + // + // We append `.00` to the integer representation because + // [Self::round_up] assumes that fractional digits are present. + write!(&mut string, "{:.0}.00", number.round().abs()).unwrap() + } else { + // Slow path. + // + // This is more difficult than it really should be because we have + // to make sure that numbers that are exactly halfway between two + // representations are always rounded away from zero. This is not + // what format! normally does (usually it rounds to even), so we + // have to fake it as best we can, by formatting with extra + // precision and then doing the rounding ourselves. + // + // We take up to two rounds to format numbers. In the first round, + // we obtain 2 digits of precision beyond those requested by the + // user. If those digits are exactly "50", then in a second round + // we format with as many digits as are significant in a "double". + // + // It might be better to directly implement our own floating-point + // formatting routine instead of relying on the system's sprintf + // implementation. But the classic Steele and White paper on + // printing floating-point numbers does not hint how to do what we + // want, and it's not obvious how to change their algorithms to do + // so. It would also be a lot of work. 
+ write!( + &mut string, + "{:.*}", + max_decimals as usize + 2, + number.abs() + ) + .unwrap(); + if string.ends_with("50") { + let (_sig, binary_exponent) = frexp(number); + let decimal_exponent = binary_exponent * 3 / 10; + let format_decimals = (f64::DIGITS as i32 + 1) - decimal_exponent; + if format_decimals > max_decimals as i32 + 2 { + string.clear(); + write!(&mut string, "{:.*}", format_decimals as usize, number.abs()).unwrap(); + } + } + }; + + if !style.leading_zero && string.starts_with("0") { + string.remove(0); + } + let leading_zeros = string + .bytes() + .take_while(|c| *c == b'0' || *c == b'.') + .count(); + let leading_nines = string + .bytes() + .take_while(|c| *c == b'9' || *c == b'.') + .count(); + let integer_digits = string.bytes().take_while(u8::is_ascii_digit).count(); + let negative = number.is_sign_negative(); + Self { + string, + integer_digits, + leading_nines, + leading_zeros, + negative, + } + } + + /// Returns a [RounderWdith] for formatting the magnitude to `decimals` + /// decimal places. `decimals` must be in `0..=16`. + fn width(&self, decimals: usize) -> RounderWidth { + // Calculate base measures. + let mut width = self.integer_digits; + if decimals > 0 { + width += decimals + 1; + } + let mut integer_digits = self.integer_digits; + let mut negative = self.negative; + + // Rounding can cause adjustments. + if self.should_round_up(decimals) { + // Rounding up leading `9s` adds a new digit (a `1`). + if self.leading_nines >= width { + width += 1; + integer_digits += 1; + } + } else { + // Rounding down. + if self.leading_zeros >= width { + // All digits that remain after rounding are zeros. Therefore + // we drop the negative sign. + negative = false; + if self.integer_digits == 0 && decimals == 0 { + // No digits at all are left. We need to display + // at least a single digit (a zero). 
+ debug_assert_eq!(width, 0); + width += 1; + integer_digits = 1; + } + } + } + RounderWidth { + width, + integer_digits, + negative, + } + } + + /// Returns true if the number should be rounded up when chopped off at + /// `decimals` decimal places, false if it should be rounded down. + fn should_round_up(&self, decimals: usize) -> bool { + let digit = self.string.as_bytes()[self.integer_digits + decimals + 1]; + debug_assert!(digit.is_ascii_digit()); + digit >= b'5' + } + + /// Formats the number, rounding to `decimals` decimal places. Exactly as + /// many characters as indicated by [Self::width(decimals)] are written. + fn format(&self, decimals: usize) -> SmallString<[u8; 40]> { + let mut output = SmallString::new(); + let mut base_width = self.integer_digits; + if decimals > 0 { + base_width += decimals + 1; + } + + if self.should_round_up(decimals) { + if self.leading_nines < base_width { + // Rounding up. This is the common case where rounding up + // doesn't add an extra digit. + output.push_str(&self.string[..base_width]); + + // SAFETY: This loop only changes ASCII characters to other + // ASCII characters. + unsafe { + for c in output.as_bytes_mut().iter_mut().rev() { + match *c { + b'9' => *c = b'0', + b'0'..=b'8' => { + *c += 1; + break; + } + b'.' => (), + _ => unreachable!(), + } + } + } + } else { + // Rounding up leading 9s causes the result to be a 1 followed + // by a number of 0s, plus a decimal point. + output.push('1'); + for _ in 0..self.integer_digits { + output.push('0'); + } + if decimals > 0 { + output.push('.'); + for _ in 0..decimals { + output.push('0'); + } + } + debug_assert_eq!(output.len(), base_width + 1); + } + } else { + // Rounding down. + if self.integer_digits != 0 || decimals != 0 { + // Common case: just copy the digits. + output.push_str(&self.string); + } else { + // No digits remain. The output is just a zero. 
+ output.push('0'); + } + } + output + } +} + +struct RounderWidth { + /// Number of characters required to format the number to a specified number + /// of decimal places. This includes integer digits and a decimal point and + /// fractional digits, if any, but it does not include any negative prefix + /// or suffix or other affixes. + width: usize, + + /// Number of digits before the decimal point, between 0 and 40. + integer_digits: usize, + + /// True if the number is negative and its rounded representation would + /// include at least one nonzero digit. + negative: bool, +} + +/// Returns `10^x`. +fn power10(x: usize) -> f64 { + const POWERS: [f64; 41] = [ + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, + 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, + 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, + ]; + POWERS + .get(x) + .copied() + .unwrap_or_else(|| 10.0_f64.powi(x as i32)) +} + +/// Returns `256^x`. +fn power256(x: u16) -> f64 { + const POWERS: [f64; 9] = [ + 1.0, + 256.0, + 65536.0, + 16777216.0, + 4294967296.0, + 1099511627776.0, + 281474976710656.0, + 72057594037927936.0, + 18446744073709551616.0, + ]; + POWERS + .get(x as usize) + .copied() + .unwrap_or_else(|| 256.0_f64.powi(x as i32)) +} + +fn fix_decimal_point(s: &mut SmallString) +where + A: Array, +{ + // SAFETY: This only changes only one ASCII character (`.`) to + // another ASCII character (`,`). 
+ unsafe { + if let Some(dot) = s.as_bytes_mut().iter_mut().find(|c| **c == b'.') { + *dot = b','; + } + } +} + +pub fn endian_to_smallvec( + endian: Endian, + mut value: u64, + n: usize, +) -> SmallVec<[u8; N]> { + debug_assert!(n <= 8); + let mut vec = SmallVec::new(); + value <<= 8 * (8 - n); + for _ in 0..n { + vec.push((value >> 56) as u8); + value <<= 8; + } + if endian == Endian::Little { + vec.reverse(); + } + vec +} diff --git a/rust/pspp/src/format/display/mod.rs b/rust/pspp/src/format/display/mod.rs deleted file mode 100644 index 5b3bbe2ccc..0000000000 --- a/rust/pspp/src/format/display/mod.rs +++ /dev/null @@ -1,1197 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . 
- -use std::{ - cmp::min, - fmt::{Display, Error as FmtError, Formatter, Result as FmtResult, Write as _}, - io::{Error as IoError, Write as IoWrite}, - str::from_utf8_unchecked, -}; - -use binrw::Endian; -use chrono::{Datelike, NaiveDate}; -use encoding_rs::{Encoding, UTF_8}; -use libm::frexp; -use smallstr::SmallString; -use smallvec::{Array, SmallVec}; - -use crate::{ - calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, - data::{ByteStr, Datum, EncodedString, QuotedDatum, WithEncoding}, - endian::ToBytes, - format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, - settings::{EndianSettings, Settings as PsppSettings}, - util::ToSmallString, -}; - -pub struct DisplayDatum<'b, B> { - format: Format, - settings: &'b Settings, - endian: EndianSettings, - datum: Datum, - - /// If true, the output will remove leading and trailing spaces from numeric - /// values, and trailing spaces from string values. (This might make the - /// output narrower than the requested width.) - trim_spaces: bool, - - /// If true, the output will include a double quote before and after string - /// values. 
- quote_strings: bool, -} - -#[cfg(test)] -mod test; - -pub trait DisplayPlain { - fn display_plain(&self) -> DisplayPlainF64; -} - -impl DisplayPlain for f64 { - fn display_plain(&self) -> DisplayPlainF64 { - DisplayPlainF64 { - value: *self, - decimal: '.', - } - } -} - -pub struct DisplayPlainF64 { - pub value: f64, - pub decimal: char, -} - -impl DisplayPlainF64 { - pub fn with_decimal(self, decimal: char) -> Self { - Self { decimal, ..self } - } -} - -impl Display for DisplayPlainF64 { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - struct Inner(f64); - - impl Display for Inner { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let value = self.0; - if (value.abs() < 0.0005 && value != 0.0) || value.abs() > 1e15 { - // Print 0s that would otherwise have lots of leading or - // trailing zeros in scientific notation with full precision. - write!(f, "{value:.e}") - } else if value == value.trunc() { - // Print integers without decimal places. - write!(f, "{value:.0}") - } else { - // Print other numbers with full precision. - write!(f, "{value:.}") - } - } - } - - match self.decimal { - '.' => write!(f, "{}", Inner(self.value)), - _ => { - let tmp = Inner(self.value).to_small_string::<64>(); - if let Some(position) = tmp.find('.') { - f.write_str(&tmp[..position])?; - f.write_char(self.decimal)?; - f.write_str(&tmp[position + 1..]) - } else { - f.write_str(&tmp) - } - } - } - } -} - -impl<'a, D> Datum -where - D: EncodedString, -{ - /// Returns an object that implements [Display] for printing this [Datum] as - /// `format`. 
- /// - /// [Display]: std::fmt::Display - pub fn display(&'a self, format: Format) -> DisplayDatum<'a, WithEncoding<&'a ByteStr>> { - DisplayDatum::new(format, self.as_borrowed()) - } - - pub fn display_plain(&self) -> QuotedDatum<'_, D> { - self.quoted() - } -} - -impl<'b, B> Display for DisplayDatum<'b, B> -where - B: EncodedString, -{ - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let number = match &self.datum { - Datum::Number(number) => *number, - Datum::String(string) => { - if self.format.type_() == Type::AHex { - for byte in string.raw_string_bytes() { - write!(f, "{byte:02x}")?; - } - } else { - let quote = if self.quote_strings { "\"" } else { "" }; - let s = string.as_str(); - let s = if self.trim_spaces { - s.trim_end_matches(' ') - } else { - &s - }; - write!(f, "{quote}{s}{quote}")?; - } - return Ok(()); - } - }; - - let Some(number) = number else { - return self.missing(f); - }; - - match self.format.type_() { - Type::F - | Type::Comma - | Type::Dot - | Type::Dollar - | Type::Pct - | Type::E - | Type::CC(_) => self.number(f, number), - Type::N => self.n(f, number), - Type::Z => self.z(f, number), - - Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => self.fmt_binary(f), - - Type::PIBHex => self.pibhex(f, number), - Type::RBHex => self.rbhex(f, number), - Type::Date - | Type::ADate - | Type::EDate - | Type::JDate - | Type::SDate - | Type::QYr - | Type::MoYr - | Type::WkYr - | Type::DateTime - | Type::YmdHms - | Type::MTime - | Type::Time - | Type::DTime - | Type::WkDay => self.date(f, number), - Type::Month => self.month(f, number), - Type::A | Type::AHex => unreachable!(), - } - } -} - -impl<'b, B> DisplayDatum<'b, B> -where - B: EncodedString, -{ - pub fn new(format: Format, datum: Datum) -> Self { - let settings = PsppSettings::global(); - Self { - format, - datum, - settings: &settings.formats, - endian: settings.endian, - trim_spaces: false, - quote_strings: false, - } - } - pub fn with_settings(self, settings: &'b Settings) -> 
Self { - Self { settings, ..self } - } - pub fn with_endian(self, endian: EndianSettings) -> Self { - Self { endian, ..self } - } - pub fn with_trimming(self) -> Self { - Self { - trim_spaces: true, - ..self - } - } - pub fn with_quoted_string(self) -> Self { - Self { - quote_strings: true, - ..self - } - } - fn fmt_binary(&self, f: &mut Formatter) -> FmtResult { - let output = self.to_binary().unwrap(); - for b in output { - f.write_char(b as char)?; - } - Ok(()) - } - fn number(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { - if number.is_finite() { - let style = self.settings.number_style(self.format.type_); - if self.format.type_ != Type::E && number.abs() < 1.5 * power10(self.format.w()) { - let rounder = Rounder::new(style, number, self.format.d); - if self.decimal(f, &rounder, style, true)? - || self.scientific(f, number, style, true)? - || self.decimal(f, &rounder, style, false)? - { - return Ok(()); - } - } - - if !self.scientific(f, number, style, false)? { - self.overflow(f)?; - } - Ok(()) - } else { - self.infinite(f, number) - } - } - - fn infinite(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { - if self.format.w >= 3 { - let s = if number.is_nan() { - "NaN" - } else if number.is_infinite() { - if number.is_sign_positive() { - "+Infinity" - } else { - "-Infinity" - } - } else { - "Unknown" - }; - let w = if self.trim_spaces { 0 } else { self.format.w() }; - write!(f, "{s:>w$.w$}") - } else { - self.overflow(f) - } - } - - fn missing(&self, f: &mut Formatter<'_>) -> FmtResult { - match self.format.type_ { - Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => return self.fmt_binary(f), - Type::RBHex => return self.rbhex(f, -f64::MAX), - _ => (), - } - - if self.trim_spaces { - return write!(f, "."); - } - - let w = self.format.w() as isize; - let d = self.format.d() as isize; - let dot_position = match self.format.type_ { - Type::N => w - 1, - Type::Pct => w - d - 2, - Type::E => w - d - 5, - _ => w - d - 1, - }; - let 
dot_position = dot_position.max(0) as u16; - - for i in 0..self.format.w { - if i == dot_position { - write!(f, ".")?; - } else { - write!(f, " ")?; - } - } - Ok(()) - } - - fn overflow(&self, f: &mut Formatter<'_>) -> FmtResult { - if self.trim_spaces { - write!(f, "*")?; - } else { - for _ in 0..self.format.w { - write!(f, "*")?; - } - } - Ok(()) - } - - fn decimal( - &self, - f: &mut Formatter<'_>, - rounder: &Rounder, - style: &NumberStyle, - require_affixes: bool, - ) -> Result { - for decimals in (0..=self.format.d).rev() { - // Make sure there's room for the number's magnitude, plus the - // negative suffix, plus (if negative) the negative prefix. - let RounderWidth { - mut width, - integer_digits, - negative, - } = rounder.width(decimals as usize); - width += style.neg_suffix.width; - if negative { - width += style.neg_prefix.width; - } - if width > self.format.w() { - continue; - } - - // If there's room for the prefix and suffix, allocate - // space. If the affixes are required, but there's no - // space, give up. - let add_affixes = allocate_space(style.affix_width(), self.format.w(), &mut width); - if !add_affixes && require_affixes { - continue; - } - - // Check whether we should include grouping characters. We need - // room for a complete set or we don't insert any at all. We don't - // include grouping characters if decimal places were requested but - // they were all dropped. - let grouping = style.grouping.filter(|_| { - integer_digits > 3 - && (self.format.d == 0 || decimals > 0) - && allocate_space((integer_digits - 1) / 3, self.format.w(), &mut width) - }); - - // Assemble number. 
- let magnitude = rounder.format(decimals as usize); - let mut output = SmallString::<[u8; 40]>::new(); - if !self.trim_spaces { - for _ in width..self.format.w() { - output.push(' '); - } - } - if negative { - output.push_str(&style.neg_prefix.s); - } - if add_affixes { - output.push_str(&style.prefix.s); - } - if let Some(grouping) = grouping { - for (i, digit) in magnitude[..integer_digits].bytes().enumerate() { - if i > 0 && (integer_digits - i) % 3 == 0 { - output.push(grouping.into()); - } - output.push(digit as char); - } - } else { - output.push_str(&magnitude[..integer_digits]); - } - if decimals > 0 { - output.push(style.decimal.into()); - let s = &magnitude[integer_digits + 1..]; - output.push_str(&s[..decimals as usize]); - } - if add_affixes { - output.push_str(&style.suffix.s); - } - if negative { - output.push_str(&style.neg_suffix.s); - } else { - for _ in 0..style.neg_suffix.width { - output.push(' '); - } - } - - debug_assert!(self.trim_spaces || output.len() >= self.format.w()); - debug_assert!(output.len() <= self.format.w() + style.extra_bytes); - f.write_str(&output)?; - return Ok(true); - } - Ok(false) - } - - fn scientific( - &self, - f: &mut Formatter<'_>, - number: f64, - style: &NumberStyle, - require_affixes: bool, - ) -> Result { - // Allocate minimum required space. - let mut width = 6 + style.neg_suffix.width; - if number < 0.0 { - width += style.neg_prefix.width; - } - if width > self.format.w() { - return Ok(false); - } - - // Check for room for prefix and suffix. - let add_affixes = allocate_space(style.affix_width(), self.format.w(), &mut width); - if require_affixes && !add_affixes { - return Ok(false); - } - - // Figure out number of characters we can use for the fraction, if any. - // (If that turns out to be `1`, then we'll output a decimal point - // without any digits following.) 
- let mut fraction_width = min(self.format.d as usize + 1, self.format.w() - width).min(16); - if self.format.type_ != Type::E && fraction_width == 1 { - fraction_width = 0; - } - width += fraction_width; - - let mut output = SmallString::<[u8; 40]>::new(); - if !self.trim_spaces { - for _ in width..self.format.w() { - output.push(' '); - } - } - if number < 0.0 { - output.push_str(&style.neg_prefix.s); - } - if add_affixes { - output.push_str(&style.prefix.s); - } - write!( - &mut output, - "{:.*E}", - fraction_width.saturating_sub(1), - number.abs() - ) - .unwrap(); - if fraction_width == 1 { - // Insert `.` before the `E`, to get a value like "1.E+000". - output.insert(output.find('E').unwrap(), '.'); - } - - // Rust always uses `.` as the decimal point. Translate to `,` if - // necessary. - if style.decimal == Decimal::Comma { - fix_decimal_point(&mut output); - } - - // Make exponent have exactly three digits, plus sign. - let e = output.as_bytes().iter().position(|c| *c == b'E').unwrap(); - let exponent: isize = output[e + 1..].parse().unwrap(); - if exponent.abs() > 999 { - return Ok(false); - } - output.truncate(e + 1); - write!(&mut output, "{exponent:+04}").unwrap(); - - // Add suffixes. 
- if add_affixes { - output.push_str(&style.suffix.s); - } - if number.is_sign_negative() { - output.push_str(&style.neg_suffix.s); - } else { - for _ in 0..style.neg_suffix.width { - output.push(' '); - } - } - - println!( - "{} for {number} width={width} fraction_width={fraction_width}: {output:?}", - self.format - ); - debug_assert!(self.trim_spaces || output.len() >= self.format.w()); - debug_assert!(output.len() <= self.format.w() + style.extra_bytes); - f.write_str(&output)?; - Ok(true) - } - - fn n(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { - if number < 0.0 { - return self.missing(f); - } - - let legacy = LegacyFormat::new(number, self.format.d()); - let w = self.format.w(); - let len = legacy.len(); - if len > w { - self.overflow(f) - } else { - write!(f, "{}{legacy}", Zeros(w.saturating_sub(len))) - } - } - - fn z(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { - let legacy = LegacyFormat::new(number, self.format.d()); - let w = self.format.w(); - let len = legacy.len(); - if len > w { - self.overflow(f) - } else { - let mut s = legacy.to_small_string::<40>(); - if number < 0.0 { - if let Some(last) = s.pop() { - let last = last.to_digit(10).unwrap(); - s.push(b"}JKLMNOPQR"[last as usize] as char); - } - } - write!(f, "{}{s}", Zeros(w.saturating_sub(len))) - } - } - - fn pibhex(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { - if number < 0.0 { - self.overflow(f) - } else { - let number = number.round(); - if number >= power256(self.format.w / 2) { - self.overflow(f) - } else { - let binary = integer_to_binary(number as u64, self.format.w / 2); - output_hex(f, &binary) - } - } - } - - fn rbhex(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { - let rb = self.rb(Some(number), self.format.w() / 2); - output_hex(f, &rb) - } - - fn date(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { - const MINUTE: f64 = 60.0; - const HOUR: f64 = 60.0 * 60.0; - const DAY: f64 = 60.0 * 60.0 * 24.0; - - let (date, mut time) 
= match self.format.type_.category() { - Category::Date => { - if number < 0.0 { - return self.missing(f); - } - let Some(date) = calendar_offset_to_gregorian(number / DAY) else { - return self.missing(f); - }; - (date, number % DAY) - } - Category::Time => (NaiveDate::MIN, number), - _ => unreachable!(), - }; - - let mut output = SmallString::<[u8; 40]>::new(); - for TemplateItem { c, n } in DateTemplate::for_format(self.format).unwrap() { - match c { - 'd' if n < 3 => write!(&mut output, "{:02}", date.day()).unwrap(), - 'd' => write!(&mut output, "{:03}", day_of_year(date).unwrap_or(1)).unwrap(), - 'm' if n < 3 => write!(&mut output, "{:02}", date.month()).unwrap(), - 'm' => write!(&mut output, "{}", short_month_name(date.month()).unwrap()).unwrap(), - 'y' if n >= 4 => { - let year = date.year(); - if year <= 9999 { - write!(&mut output, "{year:04}").unwrap(); - } else if self.format.type_ == Type::DateTime - || self.format.type_ == Type::YmdHms - { - write!(&mut output, "****").unwrap(); - } else { - return self.overflow(f); - } - } - 'y' => { - let epoch = self.settings.epoch.0; - let offset = date.year() - epoch; - if !(0..=99).contains(&offset) { - return self.overflow(f); - } - write!(&mut output, "{:02}", date.year().abs() % 100).unwrap(); - } - 'q' => write!(&mut output, "{}", date.month0() / 3 + 1).unwrap(), - 'w' => write!( - &mut output, - "{:2}", - (day_of_year(date).unwrap_or(1) - 1) / 7 + 1 - ) - .unwrap(), - 'D' => { - if time < 0.0 { - output.push('-'); - } - time = time.abs(); - write!(&mut output, "{:1$.0}", (time / DAY).floor(), n).unwrap(); - time %= DAY; - } - 'H' => { - if time < 0.0 { - output.push('-'); - } - time = time.abs(); - write!(&mut output, "{:01$.0}", (time / HOUR).floor(), n).unwrap(); - time %= HOUR; - } - 'M' => { - if time < 0.0 { - output.push('-'); - } - time = time.abs(); - write!(&mut output, "{:02.0}", (time / MINUTE).floor()).unwrap(); - time %= MINUTE; - - let excess_width = self.format.w() as isize - output.len() as 
isize; - if excess_width < 0 || (self.format.type_ == Type::MTime && excess_width < 3) { - return self.overflow(f); - } - if excess_width == 3 - || excess_width == 4 - || (excess_width >= 5 && self.format.d == 0) - { - write!(&mut output, ":{:02.0}", time.floor()).unwrap(); - } else if excess_width >= 5 { - let d = min(self.format.d(), excess_width as usize - 4); - let w = d + 3; - write!(&mut output, ":{time:0w$.d$}").unwrap(); - if self.settings.decimal == Decimal::Comma { - fix_decimal_point(&mut output); - } - } - break; - } - c if n == 1 => output.push(c), - _ => unreachable!(), - } - } - if !self.trim_spaces { - write!(f, "{:>1$}", &output, self.format.w()) - } else { - f.write_str(&output) - } - } - - fn month(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult { - if let Some(month) = month_name(number as u32) { - if !self.trim_spaces { - write!(f, "{month:.*}", self.format.w()) - } else { - f.write_str(month) - } - } else { - self.missing(f) - } - } - - /// Writes this object to `w`. Writes binary formats ([Type::P], - /// [Type::PIB], and so on) as binary values, and writes other output - /// formats in the given `encoding`. - /// - /// If `dv` is a [DisplayDatum], the difference between `write!(f, "{}", - /// dv)` and `dv.write(f, encoding)` is: - /// - /// * `write!` always outputs UTF-8. Binary formats are encoded as the - /// Unicode characters corresponding to their bytes. - /// - /// * `dv.write` outputs the desired `encoding`. Binary formats are not - /// encoded in `encoding` (and thus they might be invalid for the - /// encoding). 
- pub fn write(&self, mut w: W, encoding: &'static Encoding) -> Result<(), IoError> - where - W: IoWrite, - { - match self.to_binary() { - Some(binary) => w.write_all(&binary), - None if encoding == UTF_8 => { - write!(&mut w, "{self}") - } - None => w.write_all(&encoding.encode(&self.to_small_string::<64>()).0), - } - } - - fn to_binary(&self) -> Option> { - let number = self.datum.as_number()?; - match self.format.type_() { - Type::P => Some(self.p(number)), - Type::PK => Some(self.pk(number)), - Type::IB => Some(self.ib(number)), - Type::PIB => Some(self.pib(number)), - Type::RB => Some(self.rb(number, self.format.w())), - _ => None, - } - } - - fn bcd(&self, number: Option, digits: usize) -> (bool, SmallVec<[u8; 16]>) { - let legacy = LegacyFormat::new(number.unwrap_or_default(), self.format.d()); - let len = legacy.len(); - - let mut output = SmallVec::new(); - if len > digits { - output.resize(digits.div_ceil(2), 0); - (false, output) - } else { - let mut decimal = SmallString::<[u8; 16]>::new(); - write!( - &mut decimal, - "{}{legacy}", - Zeros(digits.saturating_sub(len)) - ) - .unwrap(); - - let mut src = decimal.bytes(); - for _ in 0..digits / 2 { - let d0 = src.next().unwrap() - b'0'; - let d1 = src.next().unwrap() - b'0'; - output.push((d0 << 4) + d1); - } - if digits % 2 != 0 { - let d = src.next().unwrap() - b'0'; - output.push(d << 4); - } - (true, output) - } - } - - fn p(&self, number: Option) -> SmallVec<[u8; 16]> { - let (valid, mut output) = self.bcd(number, self.format.w() * 2 - 1); - if valid && number.is_some_and(|number| number < 0.0) { - *output.last_mut().unwrap() |= 0xd; - } else { - *output.last_mut().unwrap() |= 0xf; - } - output - } - - fn pk(&self, number: Option) -> SmallVec<[u8; 16]> { - let number = match number { - Some(number) if number < 0.0 => None, - other => other, - }; - let (_valid, output) = self.bcd(number, self.format.w() * 2); - output - } - - fn ib(&self, number: Option) -> SmallVec<[u8; 16]> { - let number = 
number.map_or(0.0, |number| (number * power10(self.format.d())).round()); - let number = if number >= power256(self.format.w) / 2.0 - 1.0 - || number < -power256(self.format.w) / 2.0 - { - 0.0 - } else { - number - }; - let integer = number.abs() as u64; - let integer = if number < 0.0 { - (-(integer as i64)) as u64 - } else { - integer - }; - endian_to_smallvec(self.endian.output, integer, self.format.w()) - } - - fn pib(&self, number: Option) -> SmallVec<[u8; 16]> { - let number = number.map_or(0.0, |number| (number * power10(self.format.d())).round()); - let number = if number >= power256(self.format.w) || number < 0.0 { - 0.0 - } else { - number - }; - let integer = number.abs() as u64; - endian_to_smallvec(self.endian.output, integer, self.format.w()) - } - - fn rb(&self, number: Option, w: usize) -> SmallVec<[u8; 16]> { - let number = number.unwrap_or(-f64::MAX); - let bytes: [u8; 8] = self.endian.output.to_bytes(number); - let mut vec = SmallVec::new(); - vec.extend_from_slice(&bytes); - vec.resize(w, 0); - vec - } -} - -struct LegacyFormat { - s: SmallVec<[u8; 40]>, - trailing_zeros: usize, -} - -impl LegacyFormat { - fn new(number: f64, d: usize) -> Self { - let mut s = SmallVec::<[u8; 40]>::new(); - write!(&mut s, "{:E}", number.abs()).unwrap(); - debug_assert!(s.is_ascii()); - - // Parse exponent. - // - // Add 1 because of the transformation we will do just below, and `d` so - // that we just need to round to the nearest integer. - let e_index = s.iter().position(|c| *c == b'E').unwrap(); - let mut exponent = unsafe { from_utf8_unchecked(&s[e_index + 1..]) } - .parse::() - .unwrap() - + 1 - + d as i32; - - // Transform `1.234E56` into `1234`. - if e_index == 1 { - // No decimals, e.g. `1E4` or `0E0`. - s.truncate(1) - } else { - s.remove(1); - s.truncate(e_index - 1); - }; - debug_assert!(s.iter().all(|c| c.is_ascii_digit())); - - if exponent >= 0 && exponent < s.len() as i32 { - // The first `exponent` digits are before the decimal point. 
We - // need to round off there. - let exp = exponent as usize; - - fn round_up(digits: &mut [u8], position: usize) -> bool { - for index in (0..position).rev() { - match digits[index] { - b'0'..=b'8' => { - digits[index] += 1; - return true; - } - b'9' => { - digits[index] = b'0'; - } - _ => unreachable!(), - } - } - false - } - - if s[exp] >= b'5' && !round_up(&mut s, exp) { - s.clear(); - s.push(b'1'); - exponent += 1; - } - } - - let exponent = exponent.max(0) as usize; - s.truncate(exponent); - s.resize(exponent, b'0'); - let trailing_zeros = exponent.saturating_sub(s.len()); - Self { s, trailing_zeros } - } - fn s(&self) -> &str { - unsafe { from_utf8_unchecked(&self.s) } - } - fn len(&self) -> usize { - self.s.len() + self.trailing_zeros - } -} - -impl Display for LegacyFormat { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{}{}", self.s(), Zeros(self.trailing_zeros)) - } -} - -struct Zeros(usize); - -impl Display for Zeros { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let mut n = self.0; - while n > 0 { - static ZEROS: &str = "0000000000000000000000000000000000000000"; - let chunk = n.min(ZEROS.len()); - f.write_str(&ZEROS[..chunk])?; - n -= chunk; - } - Ok(()) - } -} - -fn integer_to_binary(number: u64, width: u16) -> SmallVec<[u8; 8]> { - let bytes = (number << ((8 - width) * 8)).to_be_bytes(); - SmallVec::from_slice(&bytes[..width as usize]) -} - -fn output_hex(f: &mut Formatter<'_>, bytes: &[u8]) -> FmtResult { - for byte in bytes { - write!(f, "{byte:02X}")?; - } - Ok(()) -} - -fn allocate_space(want: usize, capacity: usize, used: &mut usize) -> bool { - if *used + want <= capacity { - *used += want; - true - } else { - false - } -} - -/// A representation of a number that can be quickly rounded to any desired -/// number of decimal places (up to a specified maximum). -#[derive(Debug)] -struct Rounder { - /// Magnitude of number with excess precision. 
- string: SmallString<[u8; 40]>, - - /// Number of digits before decimal point. - integer_digits: usize, - - /// Number of `9`s or `.`s at start of string. - leading_nines: usize, - - /// Number of `0`s or `.`s at start of string. - leading_zeros: usize, - - /// Is the number negative? - negative: bool, -} - -impl Rounder { - fn new(style: &NumberStyle, number: f64, max_decimals: u8) -> Self { - debug_assert!(number.abs() < 1e41); - debug_assert!((0..=16).contains(&max_decimals)); - - let mut string = SmallString::new(); - if max_decimals == 0 { - // Fast path. No rounding needed. - // - // We append `.00` to the integer representation because - // [Self::round_up] assumes that fractional digits are present. - write!(&mut string, "{:.0}.00", number.round().abs()).unwrap() - } else { - // Slow path. - // - // This is more difficult than it really should be because we have - // to make sure that numbers that are exactly halfway between two - // representations are always rounded away from zero. This is not - // what format! normally does (usually it rounds to even), so we - // have to fake it as best we can, by formatting with extra - // precision and then doing the rounding ourselves. - // - // We take up to two rounds to format numbers. In the first round, - // we obtain 2 digits of precision beyond those requested by the - // user. If those digits are exactly "50", then in a second round - // we format with as many digits as are significant in a "double". - // - // It might be better to directly implement our own floating-point - // formatting routine instead of relying on the system's sprintf - // implementation. But the classic Steele and White paper on - // printing floating-point numbers does not hint how to do what we - // want, and it's not obvious how to change their algorithms to do - // so. It would also be a lot of work. 
- write!( - &mut string, - "{:.*}", - max_decimals as usize + 2, - number.abs() - ) - .unwrap(); - if string.ends_with("50") { - let (_sig, binary_exponent) = frexp(number); - let decimal_exponent = binary_exponent * 3 / 10; - let format_decimals = (f64::DIGITS as i32 + 1) - decimal_exponent; - if format_decimals > max_decimals as i32 + 2 { - string.clear(); - write!(&mut string, "{:.*}", format_decimals as usize, number.abs()).unwrap(); - } - } - }; - - if !style.leading_zero && string.starts_with("0") { - string.remove(0); - } - let leading_zeros = string - .bytes() - .take_while(|c| *c == b'0' || *c == b'.') - .count(); - let leading_nines = string - .bytes() - .take_while(|c| *c == b'9' || *c == b'.') - .count(); - let integer_digits = string.bytes().take_while(u8::is_ascii_digit).count(); - let negative = number.is_sign_negative(); - Self { - string, - integer_digits, - leading_nines, - leading_zeros, - negative, - } - } - - /// Returns a [RounderWdith] for formatting the magnitude to `decimals` - /// decimal places. `decimals` must be in `0..=16`. - fn width(&self, decimals: usize) -> RounderWidth { - // Calculate base measures. - let mut width = self.integer_digits; - if decimals > 0 { - width += decimals + 1; - } - let mut integer_digits = self.integer_digits; - let mut negative = self.negative; - - // Rounding can cause adjustments. - if self.should_round_up(decimals) { - // Rounding up leading `9s` adds a new digit (a `1`). - if self.leading_nines >= width { - width += 1; - integer_digits += 1; - } - } else { - // Rounding down. - if self.leading_zeros >= width { - // All digits that remain after rounding are zeros. Therefore - // we drop the negative sign. - negative = false; - if self.integer_digits == 0 && decimals == 0 { - // No digits at all are left. We need to display - // at least a single digit (a zero). 
- debug_assert_eq!(width, 0); - width += 1; - integer_digits = 1; - } - } - } - RounderWidth { - width, - integer_digits, - negative, - } - } - - /// Returns true if the number should be rounded up when chopped off at - /// `decimals` decimal places, false if it should be rounded down. - fn should_round_up(&self, decimals: usize) -> bool { - let digit = self.string.as_bytes()[self.integer_digits + decimals + 1]; - debug_assert!(digit.is_ascii_digit()); - digit >= b'5' - } - - /// Formats the number, rounding to `decimals` decimal places. Exactly as - /// many characters as indicated by [Self::width(decimals)] are written. - fn format(&self, decimals: usize) -> SmallString<[u8; 40]> { - let mut output = SmallString::new(); - let mut base_width = self.integer_digits; - if decimals > 0 { - base_width += decimals + 1; - } - - if self.should_round_up(decimals) { - if self.leading_nines < base_width { - // Rounding up. This is the common case where rounding up - // doesn't add an extra digit. - output.push_str(&self.string[..base_width]); - - // SAFETY: This loop only changes ASCII characters to other - // ASCII characters. - unsafe { - for c in output.as_bytes_mut().iter_mut().rev() { - match *c { - b'9' => *c = b'0', - b'0'..=b'8' => { - *c += 1; - break; - } - b'.' => (), - _ => unreachable!(), - } - } - } - } else { - // Rounding up leading 9s causes the result to be a 1 followed - // by a number of 0s, plus a decimal point. - output.push('1'); - for _ in 0..self.integer_digits { - output.push('0'); - } - if decimals > 0 { - output.push('.'); - for _ in 0..decimals { - output.push('0'); - } - } - debug_assert_eq!(output.len(), base_width + 1); - } - } else { - // Rounding down. - if self.integer_digits != 0 || decimals != 0 { - // Common case: just copy the digits. - output.push_str(&self.string); - } else { - // No digits remain. The output is just a zero. 
- output.push('0'); - } - } - output - } -} - -struct RounderWidth { - /// Number of characters required to format the number to a specified number - /// of decimal places. This includes integer digits and a decimal point and - /// fractional digits, if any, but it does not include any negative prefix - /// or suffix or other affixes. - width: usize, - - /// Number of digits before the decimal point, between 0 and 40. - integer_digits: usize, - - /// True if the number is negative and its rounded representation would - /// include at least one nonzero digit. - negative: bool, -} - -/// Returns `10^x`. -fn power10(x: usize) -> f64 { - const POWERS: [f64; 41] = [ - 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, - 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, - 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, - ]; - POWERS - .get(x) - .copied() - .unwrap_or_else(|| 10.0_f64.powi(x as i32)) -} - -/// Returns `256^x`. -fn power256(x: u16) -> f64 { - const POWERS: [f64; 9] = [ - 1.0, - 256.0, - 65536.0, - 16777216.0, - 4294967296.0, - 1099511627776.0, - 281474976710656.0, - 72057594037927936.0, - 18446744073709551616.0, - ]; - POWERS - .get(x as usize) - .copied() - .unwrap_or_else(|| 256.0_f64.powi(x as i32)) -} - -fn fix_decimal_point(s: &mut SmallString) -where - A: Array, -{ - // SAFETY: This only changes only one ASCII character (`.`) to - // another ASCII character (`,`). 
- unsafe { - if let Some(dot) = s.as_bytes_mut().iter_mut().find(|c| **c == b'.') { - *dot = b','; - } - } -} - -pub fn endian_to_smallvec( - endian: Endian, - mut value: u64, - n: usize, -) -> SmallVec<[u8; N]> { - debug_assert!(n <= 8); - let mut vec = SmallVec::new(); - value <<= 8 * (8 - n); - for _ in 0..n { - vec.push((value >> 56) as u8); - value <<= 8; - } - if endian == Endian::Little { - vec.reverse(); - } - vec -} diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs deleted file mode 100644 index 43ba5198b8..0000000000 --- a/rust/pspp/src/format/mod.rs +++ /dev/null @@ -1,1390 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . 
- -use std::{ - fmt::{Debug, Display, Formatter, Result as FmtResult, Write}, - ops::{Not, RangeInclusive}, - str::{Chars, FromStr}, - sync::LazyLock, -}; - -use chrono::{Datelike, Local}; -use enum_iterator::{all, Sequence}; -use enum_map::{Enum, EnumMap}; -use serde::{Deserialize, Serialize}; -use thiserror::Error as ThisError; -use unicode_width::UnicodeWidthStr; - -use crate::{ - data::{ByteString, Datum}, - sys::raw, - util::ToSmallString, - variable::{VarType, VarWidth}, -}; - -mod display; -mod parse; -pub use display::{DisplayDatum, DisplayPlain, DisplayPlainF64}; - -#[derive(Clone, ThisError, Debug, PartialEq, Eq)] -pub enum Error { - #[error("Unknown format type {value}.")] - UnknownFormat { value: u16 }, - - #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)] - OddWidthNotAllowed(UncheckedFormat), - - #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())] - BadWidth(UncheckedFormat), - - #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)] - DecimalsNotAllowedForFormat(UncheckedFormat), - - #[error("Output format {0} specifies {} decimal places, but with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)] - DecimalsNotAllowedForWidth(UncheckedFormat), - - #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)] - TooManyDecimalsForWidth { - spec: UncheckedFormat, - max_d: Decimals, - }, - - #[error("String variable is not compatible with numeric format {0}.")] - UnnamedVariableNotCompatibleWithNumericFormat(Type), - - #[error("Numeric variable is not compatible with string format {0}.")] - UnnamedVariableNotCompatibleWithStringFormat(Type), - - #[error("String variable {variable} with width {width} is not compatible with format 
{bad_spec}. Use format {good_spec} instead.")] - NamedStringVariableBadSpecWidth { - variable: String, - width: Width, - bad_spec: Format, - good_spec: Format, - }, - - #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")] - UnnamedStringVariableBadSpecWidth { - width: Width, - bad_spec: Format, - good_spec: Format, - }, -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub enum Category { - // Numeric formats. - Basic, - Custom, - Legacy, - Binary, - Hex, - Date, - Time, - DateComponent, - - // String formats. - String, -} - -impl From for Category { - fn from(source: Type) -> Self { - match source { - Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic, - Type::CC(_) => Self::Custom, - Type::N | Type::Z => Self::Legacy, - Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary, - Type::PIBHex | Type::RBHex => Self::Hex, - Type::Date - | Type::ADate - | Type::EDate - | Type::JDate - | Type::SDate - | Type::QYr - | Type::MoYr - | Type::WkYr - | Type::DateTime - | Type::YmdHms => Self::Date, - Type::MTime | Type::Time | Type::DTime => Self::Time, - Type::WkDay | Type::Month => Self::DateComponent, - Type::A | Type::AHex => Self::String, - } - } -} - -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash, Sequence, Serialize)] -pub enum CC { - A, - B, - C, - D, - E, -} - -impl CC { - pub fn as_string(&self) -> &'static str { - match self { - CC::A => "A", - CC::B => "B", - CC::C => "C", - CC::D => "D", - CC::E => "E", - } - } -} - -impl Display for CC { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", self.as_string()) - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Sequence, Serialize)] -pub enum Type { - // Basic numeric formats. - F, - Comma, - Dot, - Dollar, - Pct, - E, - - // Custom currency formats. - CC(CC), - - // Legacy numeric formats. - N, - Z, - - // Binary and hexadecimal formats. 
- P, - PK, - IB, - PIB, - PIBHex, - RB, - RBHex, - - // Time and date formats. - Date, - ADate, - EDate, - JDate, - SDate, - QYr, - MoYr, - WkYr, - DateTime, - YmdHms, - MTime, - Time, - DTime, - - // Date component formats. - WkDay, - Month, - - // String formats. - A, - AHex, -} - -pub type Width = u16; -pub type SignedWidth = i16; - -pub type Decimals = u8; - -impl Type { - pub fn max_width(self) -> Width { - match self { - Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16, - Self::IB | Self::PIB | Self::RB => 8, - Self::A => 32767, - Self::AHex => 32767 * 2, - _ => 40, - } - } - - pub fn min_width(self) -> Width { - match self { - // Basic numeric formats. - Self::F => 1, - Self::Comma => 1, - Self::Dot => 1, - Self::Dollar => 2, - Self::Pct => 2, - Self::E => 6, - - // Custom currency formats. - Self::CC(_) => 2, - - // Legacy numeric formats. - Self::N => 1, - Self::Z => 1, - - // Binary and hexadecimal formats. - Self::P => 1, - Self::PK => 1, - Self::IB => 1, - Self::PIB => 1, - Self::PIBHex => 2, - Self::RB => 2, - Self::RBHex => 4, - - // Time and date formats. - Self::Date => 9, - Self::ADate => 8, - Self::EDate => 8, - Self::JDate => 5, - Self::SDate => 8, - Self::QYr => 6, - Self::MoYr => 6, - Self::WkYr => 8, - Self::DateTime => 17, - Self::YmdHms => 16, - Self::MTime => 5, - Self::Time => 5, - Self::DTime => 8, - - // Date component formats. - Self::WkDay => 2, - Self::Month => 3, - - // String formats. 
- Self::A => 1, - Self::AHex => 2, - } - } - - pub fn width_range(self) -> RangeInclusive { - self.min_width()..=self.max_width() - } - - pub fn max_decimals(self, width: Width) -> Decimals { - let width = width.clamp(1, 40) as SignedWidth; - let max = match self { - Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1, - Self::Dollar | Self::Pct => width - 2, - Self::E => width - 7, - Self::N | Self::Z => width, - Self::P => width * 2 - 1, - Self::PK => width * 2, - Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth, - Self::PIBHex => 0, - Self::RB | Self::RBHex => 16, - Self::Date - | Self::ADate - | Self::EDate - | Self::JDate - | Self::SDate - | Self::QYr - | Self::MoYr - | Self::WkYr => 0, - Self::DateTime => width - 21, - Self::YmdHms => width - 20, - Self::MTime => width - 6, - Self::Time => width - 9, - Self::DTime => width - 12, - Self::WkDay | Self::Month | Self::A | Self::AHex => 0, - }; - max.clamp(0, 16) as Decimals - } - - pub fn takes_decimals(self) -> bool { - self.max_decimals(Width::MAX) > 0 - } - - pub fn category(self) -> Category { - self.into() - } - - pub fn width_step(self) -> Width { - if self.category() == Category::Hex || self == Self::AHex { - 2 - } else { - 1 - } - } - - pub fn clamp_width(self, width: Width) -> Width { - let (min, max) = self.width_range().into_inner(); - let width = width.clamp(min, max); - if self.width_step() == 2 { - width / 2 * 2 - } else { - width - } - } - - pub fn var_type(self) -> VarType { - match self { - Self::A | Self::AHex => VarType::String, - _ => VarType::Numeric, - } - } - - /// Checks whether this format is valid for a variable with the given - /// `var_type`. 
- pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> { - let my_type = self.var_type(); - match (my_type, var_type) { - (VarType::Numeric, VarType::String) => { - Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self)) - } - (VarType::String, VarType::Numeric) => { - Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self)) - } - _ => Ok(()), - } - } - - pub fn as_str(&self) -> &'static str { - match self { - Self::F => "F", - Self::Comma => "COMMA", - Self::Dot => "DOT", - Self::Dollar => "DOLLAR", - Self::Pct => "PCT", - Self::E => "E", - Self::CC(CC::A) => "CCA", - Self::CC(CC::B) => "CCB", - Self::CC(CC::C) => "CCC", - Self::CC(CC::D) => "CCD", - Self::CC(CC::E) => "CCE", - Self::N => "N", - Self::Z => "Z", - Self::P => "P", - Self::PK => "PK", - Self::IB => "IB", - Self::PIB => "PIB", - Self::PIBHex => "PIBHEX", - Self::RB => "RB", - Self::RBHex => "RBHEX", - Self::Date => "DATE", - Self::ADate => "ADATE", - Self::EDate => "EDATE", - Self::JDate => "JDATE", - Self::SDate => "SDATE", - Self::QYr => "QYR", - Self::MoYr => "MOYR", - Self::WkYr => "WKYR", - Self::DateTime => "DATETIME", - Self::YmdHms => "YMDHMS", - Self::MTime => "MTIME", - Self::Time => "TIME", - Self::DTime => "DTIME", - Self::WkDay => "WKDAY", - Self::Month => "MONTH", - Self::A => "A", - Self::AHex => "AHEX", - } - } - - pub fn default_value(&self) -> Datum { - match self.var_type() { - VarType::Numeric => Datum::sysmis(), - VarType::String => Datum::String(ByteString::default()), - } - } -} - -impl Display for Type { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", self.as_str()) - } -} - -impl FromStr for Type { - type Err = (); - - fn from_str(s: &str) -> Result { - for type_ in all::() { - if type_.as_str().eq_ignore_ascii_case(s) { - return Ok(type_); - } - } - Err(()) - } -} - -fn max_digits_for_bytes(bytes: usize) -> usize { - *[0, 3, 5, 8, 10, 13, 15, 17].get(bytes).unwrap_or(&20) -} - -#[derive(Debug, PartialEq, Eq, Hash)] 
-pub struct AbstractFormat { - pub name: String, - w: Width, - d: Decimals, -} - -fn split(s: &str, predicate: F) -> (&str, &str) -where - F: Fn(&char) -> bool, -{ - let rest = s.trim_start_matches(|c| predicate(&c)); - let start = &s[..s.len() - rest.len()]; - (start, rest) -} - -impl FromStr for AbstractFormat { - type Err = (); - - fn from_str(s: &str) -> Result { - let (name, s) = split(s, char::is_ascii_alphabetic); - if name.is_empty() { - return Err(()); - } - - let (w, s) = split(s, char::is_ascii_digit); - let Ok(w) = w.parse() else { - return Err(()); - }; - - let (d, rest) = if let Some(s) = s.strip_prefix('.') { - let (d, rest) = split(s, char::is_ascii_digit); - let Ok(d) = d.parse() else { - return Err(()); - }; - (d, rest) - } else { - (0, s) - }; - - if !rest.is_empty() { - return Err(()); - } - Ok(Self { - name: name.into(), - w, - d, - }) - } -} - -impl TryFrom for UncheckedFormat { - type Error = (); - - fn try_from(value: AbstractFormat) -> Result { - Ok(UncheckedFormat::new(value.name.parse()?, value.w, value.d)) - } -} - -#[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub struct Format { - type_: Type, - w: Width, - d: Decimals, -} - -impl Serialize for Format { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - self.to_small_string::<16>().serialize(serializer) - } -} - -impl Format { - pub const F40: Format = Format { - type_: Type::F, - w: 40, - d: 0, - }; - - pub const F40_1: Format = Format { - type_: Type::F, - w: 40, - d: 1, - }; - - pub const F40_2: Format = Format { - type_: Type::F, - w: 40, - d: 2, - }; - - pub const F40_3: Format = Format { - type_: Type::F, - w: 40, - d: 3, - }; - - pub const PCT40_1: Format = Format { - type_: Type::Pct, - w: 40, - d: 1, - }; - - pub const F8_2: Format = Format { - type_: Type::F, - w: 8, - d: 2, - }; - - pub const DATETIME40_0: Format = Format { - type_: Type::DateTime, - w: 40, - d: 0, - }; - - pub fn type_(self) -> Type { - self.type_ - } - pub fn w(self) -> 
usize { - self.w as usize - } - pub fn d(self) -> usize { - self.d as usize - } - - pub fn new(type_: Type, w: Width, d: Decimals) -> Option { - UncheckedFormat { type_, w, d }.try_into().ok() - } - - pub fn default_for_width(var_width: VarWidth) -> Self { - match var_width { - VarWidth::Numeric => Format { - type_: Type::F, - w: 8, - d: 2, - }, - VarWidth::String(w) => Format { - type_: Type::A, - w, - d: 0, - }, - } - } - - pub fn fixed_from(source: &UncheckedFormat) -> Self { - let UncheckedFormat { - type_: format, - w, - d, - } = *source; - let (min, max) = format.width_range().into_inner(); - let mut w = w.clamp(min, max); - if d <= format.max_decimals(Width::MAX) { - while d > format.max_decimals(w) { - w += 1; - assert!(w <= 40); - } - } - let d = d.clamp(0, format.max_decimals(w)); - Self { - type_: format, - w, - d, - } - } - - pub fn var_width(self) -> VarWidth { - match self.type_ { - Type::A => VarWidth::String(self.w), - Type::AHex => VarWidth::String(self.w / 2), - _ => VarWidth::Numeric, - } - } - - pub fn var_type(self) -> VarType { - self.type_.var_type() - } - - /// Checks whether this format specification is valid for a variable with - /// width `var_width`. - pub fn check_width_compatibility(self, var_width: VarWidth) -> Result { - // Verify that the format is right for the variable's type. 
- self.type_.check_type_compatibility(var_width.into())?; - - if let VarWidth::String(w) = var_width { - if var_width != self.var_width() { - let bad_spec = self; - let good_spec = if self.type_ == Type::A { - Format { w, ..self } - } else { - Format { w: w * 2, ..self } - }; - return Err(Error::UnnamedStringVariableBadSpecWidth { - width: w, - bad_spec, - good_spec, - }); - } - } - - Ok(self) - } - - pub fn default_value(&self) -> Datum { - match self.var_width() { - VarWidth::Numeric => Datum::sysmis(), - VarWidth::String(width) => Datum::String(ByteString::spaces(width as usize)), - } - } - - pub fn resize(&mut self, width: VarWidth) { - match (self.var_width(), width) { - (VarWidth::Numeric, VarWidth::Numeric) => {} - (VarWidth::String(_), VarWidth::String(new_width)) => { - self.w = if self.type_ == Type::AHex { - new_width * 2 - } else { - new_width - }; - } - _ => *self = Self::default_for_width(width), - } - } - - pub fn codepage_to_unicode(&mut self) { - let mut width = self.var_width(); - width.codepage_to_unicode(); - if let Some(width) = width.as_string_width() { - if self.type_ == Type::AHex { - self.w = width as u16 * 2; - } else { - self.w = width as u16; - } - } - } -} - -impl Debug for Format { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{self}") - } -} - -impl Display for Format { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}{}", self.type_, self.w)?; - if self.type_.takes_decimals() || self.d > 0 { - write!(f, ".{}", self.d)?; - } - Ok(()) - } -} - -impl TryFrom for Format { - type Error = Error; - - fn try_from(source: UncheckedFormat) -> Result { - let UncheckedFormat { - type_: format, - w, - d, - } = source; - let max_d = format.max_decimals(w); - if w % format.width_step() != 0 { - Err(Error::OddWidthNotAllowed(source)) - } else if !format.width_range().contains(&w) { - Err(Error::BadWidth(source)) - } else if d > max_d { - if format.takes_decimals() { - 
Err(Error::DecimalsNotAllowedForFormat(source)) - } else if max_d > 0 { - Err(Error::TooManyDecimalsForWidth { - spec: source, - max_d, - }) - } else { - Err(Error::DecimalsNotAllowedForWidth(source)) - } - } else { - Ok(Format { - type_: format, - w, - d, - }) - } - } -} - -impl From for u16 { - fn from(source: Type) -> Self { - match source { - Type::A => 1, - Type::AHex => 2, - Type::Comma => 3, - Type::Dollar => 4, - Type::F => 5, - Type::IB => 6, - Type::PIBHex => 7, - Type::P => 8, - Type::PIB => 9, - Type::PK => 10, - Type::RB => 11, - Type::RBHex => 12, - Type::Z => 15, - Type::N => 16, - Type::E => 17, - Type::Date => 20, - Type::Time => 21, - Type::DateTime => 22, - Type::ADate => 23, - Type::JDate => 24, - Type::DTime => 25, - Type::WkDay => 26, - Type::Month => 27, - Type::MoYr => 28, - Type::QYr => 29, - Type::WkYr => 30, - Type::Pct => 31, - Type::Dot => 32, - Type::CC(CC::A) => 33, - Type::CC(CC::B) => 34, - Type::CC(CC::C) => 35, - Type::CC(CC::D) => 36, - Type::CC(CC::E) => 37, - Type::EDate => 38, - Type::SDate => 39, - Type::MTime => 40, - Type::YmdHms => 41, - } - } -} - -impl TryFrom for Type { - type Error = Error; - - fn try_from(source: u16) -> Result { - match source { - 1 => Ok(Self::A), - 2 => Ok(Self::AHex), - 3 => Ok(Self::Comma), - 4 => Ok(Self::Dollar), - 5 => Ok(Self::F), - 6 => Ok(Self::IB), - 7 => Ok(Self::PIBHex), - 8 => Ok(Self::P), - 9 => Ok(Self::PIB), - 10 => Ok(Self::PK), - 11 => Ok(Self::RB), - 12 => Ok(Self::RBHex), - 15 => Ok(Self::Z), - 16 => Ok(Self::N), - 17 => Ok(Self::E), - 20 => Ok(Self::Date), - 21 => Ok(Self::Time), - 22 => Ok(Self::DateTime), - 23 => Ok(Self::ADate), - 24 => Ok(Self::JDate), - 25 => Ok(Self::DTime), - 26 => Ok(Self::WkDay), - 27 => Ok(Self::Month), - 28 => Ok(Self::MoYr), - 29 => Ok(Self::QYr), - 30 => Ok(Self::WkYr), - 31 => Ok(Self::Pct), - 32 => Ok(Self::Dot), - 33 => Ok(Self::CC(CC::A)), - 34 => Ok(Self::CC(CC::B)), - 35 => Ok(Self::CC(CC::C)), - 36 => Ok(Self::CC(CC::D)), - 37 => 
Ok(Self::CC(CC::E)), - 38 => Ok(Self::EDate), - 39 => Ok(Self::SDate), - 40 => Ok(Self::MTime), - 41 => Ok(Self::YmdHms), - _ => Err(Error::UnknownFormat { value: source }), - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub struct UncheckedFormat { - pub type_: Type, - - pub w: Width, - - pub d: Decimals, -} - -impl UncheckedFormat { - pub fn new(type_: Type, w: Width, d: Decimals) -> Self { - Self { type_, w, d } - } - pub fn fix(&self) -> Format { - Format::fixed_from(self) - } -} - -impl TryFrom for UncheckedFormat { - type Error = Error; - - fn try_from(raw: raw::records::RawFormat) -> Result { - let raw = raw.0; - let raw_format = (raw >> 16) as u16; - let format = raw_format.try_into()?; - let w = ((raw >> 8) & 0xff) as Width; - let d = (raw & 0xff) as Decimals; - Ok(Self { - type_: format, - w, - d, - }) - } -} - -impl Display for UncheckedFormat { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}{}", self.type_, self.w)?; - if self.type_.takes_decimals() || self.d > 0 { - write!(f, ".{}", self.d)?; - } - Ok(()) - } -} - -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Enum, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum Decimal { - #[default] - Dot, - Comma, -} - -impl Decimal { - pub fn as_str(&self) -> &'static str { - match self { - Decimal::Dot => ".", - Decimal::Comma => ",", - } - } -} - -impl From for char { - fn from(value: Decimal) -> Self { - u8::from(value).into() - } -} - -impl From for u8 { - fn from(value: Decimal) -> Self { - match value { - Decimal::Dot => b'.', - Decimal::Comma => b',', - } - } -} - -impl TryFrom for Decimal { - type Error = (); - - fn try_from(c: char) -> Result { - match c { - '.' 
=> Ok(Self::Dot), - ',' => Ok(Self::Comma), - _ => Err(()), - } - } -} - -impl Not for Decimal { - type Output = Self; - - fn not(self) -> Self::Output { - match self { - Self::Dot => Self::Comma, - Self::Comma => Self::Dot, - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] -pub struct Epoch(pub i32); - -impl Epoch { - /// Applies the epoch to `year`: - /// - /// - If `year` is 2 digits (between 0 and 99, inclusive), returns it - /// converted it to the correct year considering the epoch. - /// - /// - Otherwise, returns `year` unchanged. - pub fn apply(&self, year: i32) -> i32 { - match year { - 0..=99 => { - let century = self.0 / 100 * 100; - let offset = self.0 - century; - if year >= offset { - year + century - } else { - year + century + 100 - } - } - other => other, - } - } -} - -impl Default for Epoch { - fn default() -> Self { - static DEFAULT: LazyLock = LazyLock::new(|| Epoch(Local::now().year() - 69)); - *DEFAULT - } -} - -impl Display for Epoch { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{}", self.0) - } -} - -#[derive(Clone, Debug, Default, Serialize)] -pub struct Settings { - pub epoch: Epoch, - - /// Either `'.'` or `','`. - pub decimal: Decimal, - - /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5` - /// instead of `.5`)? - pub leading_zero: bool, - - /// Custom currency styles. 
- pub ccs: EnumMap>>, -} - -#[derive(Copy, Clone, Enum)] -struct StyleParams { - decimal: Decimal, - leading_zero: bool, -} -impl From<&Settings> for StyleParams { - fn from(value: &Settings) -> Self { - Self { - decimal: value.decimal, - leading_zero: value.leading_zero, - } - } -} - -struct StyleSet(EnumMap); - -impl StyleSet { - fn new(f: impl Fn(StyleParams) -> NumberStyle) -> Self { - Self(EnumMap::from_fn(f)) - } - fn get(&self, settings: &Settings) -> &NumberStyle { - &self.0[settings.into()] - } -} - -impl Settings { - pub fn with_cc(mut self, cc: CC, style: NumberStyle) -> Self { - self.ccs[cc] = Some(Box::new(style)); - self - } - pub fn with_leading_zero(self, leading_zero: bool) -> Self { - Self { - leading_zero, - ..self - } - } - pub fn with_epoch(self, epoch: Epoch) -> Self { - Self { epoch, ..self } - } - pub fn number_style(&self, type_: Type) -> &NumberStyle { - static DEFAULT: LazyLock = - LazyLock::new(|| NumberStyle::new("", "", Decimal::Dot, None, false)); - - match type_ { - Type::F | Type::E => { - static F: LazyLock = LazyLock::new(|| { - StyleSet::new(|p| NumberStyle::new("", "", p.decimal, None, p.leading_zero)) - }); - F.get(self) - } - Type::Comma => { - static COMMA: LazyLock = LazyLock::new(|| { - StyleSet::new(|p| { - NumberStyle::new("", "", p.decimal, Some(!p.decimal), p.leading_zero) - }) - }); - COMMA.get(self) - } - Type::Dot => { - static DOT: LazyLock = LazyLock::new(|| { - StyleSet::new(|p| { - NumberStyle::new("", "", !p.decimal, Some(p.decimal), p.leading_zero) - }) - }); - DOT.get(self) - } - Type::Dollar => { - static DOLLAR: LazyLock = LazyLock::new(|| { - StyleSet::new(|p| NumberStyle::new("$", "", p.decimal, Some(!p.decimal), false)) - }); - DOLLAR.get(self) - } - Type::Pct => { - static PCT: LazyLock = LazyLock::new(|| { - StyleSet::new(|p| NumberStyle::new("", "%", p.decimal, None, false)) - }); - PCT.get(self) - } - Type::CC(cc) => self.ccs[cc].as_deref().unwrap_or(&DEFAULT), - Type::N - | Type::Z - | Type::P - | 
Type::PK - | Type::IB - | Type::PIB - | Type::PIBHex - | Type::RB - | Type::RBHex - | Type::Date - | Type::ADate - | Type::EDate - | Type::JDate - | Type::SDate - | Type::QYr - | Type::MoYr - | Type::WkYr - | Type::DateTime - | Type::YmdHms - | Type::MTime - | Type::Time - | Type::DTime - | Type::WkDay - | Type::Month - | Type::A - | Type::AHex => &DEFAULT, - } - } -} - -/// A numeric output style. This can express numeric formats in -/// [Category::Basic] and [Category::Custom]. -#[derive(Clone, Debug, Serialize)] -pub struct NumberStyle { - pub neg_prefix: Affix, - pub prefix: Affix, - pub suffix: Affix, - pub neg_suffix: Affix, - - /// Decimal point. - pub decimal: Decimal, - - /// Grouping character. - pub grouping: Option, - - /// Format as `.5` or `0.5`? - pub leading_zero: bool, - - /// An `Affix` may require more bytes than its display width; for example, - /// U+00A5 (Â¥) is 2 bytes in UTF-8 but occupies only one display column. - /// This member is the sum of the number of bytes required by all of the - /// `Affix` members in this struct, minus their display widths. Thus, it - /// can be used to size memory allocations: for example, the formatted - /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in - /// UTF-8. - #[serde(skip)] - pub extra_bytes: usize, -} - -impl Display for NumberStyle { - /// Display this number style in the format used for custom currency. - /// - /// This format can only accurately represent number styles that include a - /// grouping character. If this number style doesn't, it will pretend that - /// the grouping character is the opposite of the decimal point character. 
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let grouping = char::from(!self.decimal); - write!( - f, - "{}{}{}{}{}{}{}", - self.neg_prefix.display(grouping), - grouping, - self.prefix.display(grouping), - grouping, - self.suffix.display(grouping), - grouping, - self.neg_suffix.display(grouping), - ) - } -} - -impl NumberStyle { - fn new( - prefix: &str, - suffix: &str, - decimal: Decimal, - grouping: Option, - leading_zero: bool, - ) -> Self { - // These assertions ensure that zero is correct for `extra_bytes`. - debug_assert!(prefix.is_ascii()); - debug_assert!(suffix.is_ascii()); - - Self { - neg_prefix: Affix::new("-"), - prefix: Affix::new(prefix), - suffix: Affix::new(suffix), - neg_suffix: Affix::new(""), - decimal, - grouping, - leading_zero, - extra_bytes: 0, - } - } - - fn affix_width(&self) -> usize { - self.prefix.width + self.suffix.width - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct Affix { - /// String contents of affix. - pub s: String, - - #[serde(skip)] - /// Display width in columns (see [unicode_width]) - pub width: usize, -} - -impl Affix { - fn new(s: impl Into) -> Self { - let s = s.into(); - Self { - width: s.width(), - s, - } - } - - fn extra_bytes(&self) -> usize { - self.s.len().checked_sub(self.width).unwrap() - } - - fn display(&self, escape: char) -> DisplayAffix<'_> { - DisplayAffix { - affix: self.s.as_str(), - escape, - } - } -} - -pub struct DisplayAffix<'a> { - affix: &'a str, - escape: char, -} - -impl Display for DisplayAffix<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - for c in self.affix.chars() { - if c == self.escape { - f.write_char('\'')?; - } - f.write_char(c)?; - } - Ok(()) - } -} - -impl FromStr for NumberStyle { - type Err = (); - - fn from_str(s: &str) -> Result { - fn find_separator(s: &str) -> Option { - // Count commas and periods. There must be exactly three of one or - // the other, except that an apostrophe escapes a following comma or - // period. 
- let mut n_commas = 0; - let mut n_periods = 0; - let s = s.as_bytes(); - for i in 0..s.len() { - if i > 0 && s[i - 1] == b'\'' { - } else if s[i] == b',' { - n_commas += 1; - } else if s[i] == b'.' { - n_periods += 1; - } - } - - if n_commas == 3 && n_periods != 3 { - Some(',') - } else if n_periods == 3 && n_commas != 3 { - Some('.') - } else { - None - } - } - - fn take_cc_token(iter: &mut Chars<'_>, grouping: char) -> Affix { - let mut s = String::new(); - let mut quote = false; - for c in iter { - if c == '\'' && !quote { - quote = true; - } else if c == grouping && !quote { - break; - } else { - s.push(c); - quote = false; - } - } - Affix::new(s) - } - - let Some(grouping) = find_separator(s) else { - return Err(()); - }; - let mut iter = s.chars(); - let neg_prefix = take_cc_token(&mut iter, grouping); - let prefix = take_cc_token(&mut iter, grouping); - let suffix = take_cc_token(&mut iter, grouping); - let neg_suffix = take_cc_token(&mut iter, grouping); - let grouping: Decimal = grouping.try_into().unwrap(); - let decimal = !grouping; - let extra_bytes = neg_prefix.extra_bytes() - + prefix.extra_bytes() - + suffix.extra_bytes() - + neg_suffix.extra_bytes(); - Ok(Self { - neg_prefix, - prefix, - suffix, - neg_suffix, - decimal, - grouping: Some(grouping), - leading_zero: false, - extra_bytes, - }) - } -} - -/// An item within a [DateTemplate]. -pub struct TemplateItem { - /// Character in the template. - pub c: char, - - /// Number of repetitions of the character. - pub n: usize, -} - -/// A template for date and time formats. -#[derive(Clone)] -pub struct DateTemplate(&'static str); - -impl DateTemplate { - /// Returns a [DateTemplate] used for date and time input and output in a - /// field of the given `type_` and `width`. 
- /// - /// `width` only affects whether a 2-digit year or a 4-digit year is used, - /// that is, whether the returned string contains `yy` or `yyyy`, and - /// whether seconds are included, that is, whether the returned string - /// contains `:SS`. A caller that doesn't care whether the returned string - /// contains `yy` or `yyyy` or `:SS` can just specify 0 to omit them. - pub fn new(type_: Type, width: usize) -> Option { - let (short, long) = match type_ { - Type::F - | Type::Comma - | Type::Dot - | Type::Dollar - | Type::Pct - | Type::E - | Type::CC(_) - | Type::N - | Type::Z - | Type::P - | Type::PK - | Type::IB - | Type::PIB - | Type::PIBHex - | Type::RB - | Type::RBHex - | Type::WkDay - | Type::Month - | Type::A - | Type::AHex => return None, - Type::Date => ("dd-mmm-yy", "dd-mmm-yyyy"), - Type::ADate => ("mm/dd/yy", "mm/dd/yyyy"), - Type::EDate => ("dd.mm.yy", "dd.mm.yyyy"), - Type::JDate => ("yyddd", "yyyyddd"), - Type::SDate => ("yy/mm/dd", "yyyy/mm/dd"), - Type::QYr => ("q Q yy", "q Q yyyy"), - Type::MoYr => ("mmm yy", "mmm yyyy"), - Type::WkYr => ("ww WK yy", "ww WK yyyy"), - Type::DateTime => ("dd-mmm-yyyy HH:MM", "dd-mmm-yyyy HH:MM:SS"), - Type::YmdHms => ("yyyy-mm-dd HH:MM", "yyyy-mm-dd HH:MM:SS"), - Type::MTime => ("MM", "MM:SS"), - Type::Time => ("HH:MM", "HH:MM:SS"), - Type::DTime => ("D HH:MM", "D HH:MM:SS"), - }; - if width >= long.len() { - Some(DateTemplate(long)) - } else { - Some(DateTemplate(short)) - } - } - - pub fn for_format(format: Format) -> Option { - Self::new(format.type_(), format.w()) - } - - #[allow(clippy::len_without_is_empty)] - pub fn len(&self) -> usize { - self.0.len() - } -} - -impl Iterator for DateTemplate { - type Item = TemplateItem; - - fn next(&mut self) -> Option { - let mut iter = self.0.chars(); - let c = iter.next()?; - self.0 = iter.as_str(); - let mut n = 1; - while iter.next() == Some(c) { - self.0 = iter.as_str(); - n += 1; - } - Some(TemplateItem { c, n }) - } -} - -#[cfg(test)] -mod tests { - use 
crate::format::{Format, Type, Width}; - - #[test] - fn codepage_to_unicode() { - fn check_format(input: Format, expected_width: Width) { - let mut output = input; - output.codepage_to_unicode(); - let expected = Format::new(input.type_, expected_width, input.d).unwrap(); - assert_eq!(output, expected); - } - check_format(Format::new(Type::A, 1, 0).unwrap(), 3); - check_format(Format::new(Type::A, 2, 0).unwrap(), 6); - check_format(Format::new(Type::A, 3, 0).unwrap(), 9); - check_format(Format::new(Type::A, 1000, 0).unwrap(), 3000); - check_format(Format::new(Type::A, 20000, 0).unwrap(), 32767); - - check_format(Format::new(Type::AHex, 2, 0).unwrap(), 6); - check_format(Format::new(Type::AHex, 4, 0).unwrap(), 12); - check_format(Format::new(Type::AHex, 6, 0).unwrap(), 18); - check_format(Format::new(Type::AHex, 2000, 0).unwrap(), 6000); - check_format(Format::new(Type::AHex, 20000, 0).unwrap(), 60000); - check_format(Format::new(Type::AHex, 30000, 0).unwrap(), 65534); - - check_format(Format::new(Type::F, 40, 0).unwrap(), 40); - } -} diff --git a/rust/pspp/src/lex.rs b/rust/pspp/src/lex.rs new file mode 100644 index 0000000000..f92407bb41 --- /dev/null +++ b/rust/pspp/src/lex.rs @@ -0,0 +1,40 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Lexical analysis for PSPP syntax. +//! +//! 
PSPP divides traditional "lexical analysis" or "tokenization" into three +//! phases: +//! +//! 1. A low level called "segmentation", implemented in the [segment] module. +//! This labels syntax strings with [Segment](segment::Segment)s. +//! +//! 2. A middle level called "scanning", implemented in the [scan] module. +//! This transforms and merges segments to form [Token]s. +//! +//! 3. A high level called "lexing", implemented in the [lexer] module. Lexing +//! brings together multiple source files and invokes macro expansion on the +//! tokens output by the scanner. + +// Warn about missing docs, but not for items declared with `#[cfg(test)]`. +#![cfg_attr(not(test), warn(missing_docs))] + +pub mod command_name; +pub mod lexer; +pub mod scan; +pub mod segment; +mod token; +pub use token::{Punct, Token}; diff --git a/rust/pspp/src/lex/mod.rs b/rust/pspp/src/lex/mod.rs deleted file mode 100644 index f92407bb41..0000000000 --- a/rust/pspp/src/lex/mod.rs +++ /dev/null @@ -1,40 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . - -//! Lexical analysis for PSPP syntax. -//! -//! PSPP divides traditional "lexical analysis" or "tokenization" into three -//! phases: -//! -//! 1. A low level called "segmentation", implemented in the [segment] module. -//! 
This labels syntax strings with [Segment](segment::Segment)s. -//! -//! 2. A middle level called "scanning", implemented in the [scan] module. -//! This transforms and merges segments to form [Token]s. -//! -//! 3. A high level called "lexing", implemented in the [lexer] module. Lexing -//! brings together multiple source files and invokes macro expansion on the -//! tokens output by the scanner. - -// Warn about missing docs, but not for items declared with `#[cfg(test)]`. -#![cfg_attr(not(test), warn(missing_docs))] - -pub mod command_name; -pub mod lexer; -pub mod scan; -pub mod segment; -mod token; -pub use token::{Punct, Token}; diff --git a/rust/pspp/src/lex/scan.rs b/rust/pspp/src/lex/scan.rs new file mode 100644 index 0000000000..fcb1bc3416 --- /dev/null +++ b/rust/pspp/src/lex/scan.rs @@ -0,0 +1,482 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Mid-level lexical analysis. +//! +//! This module implements mid-level lexical analysis using the segments +//! output by the lower-level [segmentation phase](super::segment). +//! +//! Scanning accepts as input a stream of segments, which are UTF-8 strings +//! labeled with a [segment type](super::segment::Segment). It outputs a stream +//! of [Token]s used by the PSPP parser or an error. 
+ +use crate::identifier::{Identifier, ReservedWord}; + +use super::{ + segment::{Segment, Segmenter, Syntax}, + token::{Punct, Token}, +}; +use std::collections::VecDeque; +use thiserror::Error as ThisError; + +/// Error returned by [merge_tokens]. +#[derive(ThisError, Clone, Debug, PartialEq, Eq)] +pub enum ScanError { + /// Unterminated string constant. + #[error("Unterminated string constant.")] + ExpectedQuote, + + /// Missing exponent. + #[error("Missing exponent following `{0}`")] + ExpectedExponent(String), + + /// Odd length hex string. + #[error("String of hex digits has {0} characters, which is not a multiple of 2.")] + OddLengthHexString(usize), + + /// Invalid hex digit. + #[error("Invalid hex digit {0:?}.")] + BadHexDigit(char), + + /// Incomplete UTF-8 sequence. + #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] + IncompleteUtf8 { + /// Incomplete sequence. + substring: String, + /// Offset of start of sequence. + offset: usize, + }, + + /// Bad UTF-8 sequence. + #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] + BadUtf8 { + /// Invalid sequence. + substring: String, + /// Offset of start of sequence. + offset: usize, + }, + + /// Invalid length Unicode string. + #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")] + BadLengthUnicodeString(usize), + + /// Invalid code point. + #[error("U+{0:04X} is not a valid Unicode code point.")] + BadCodePoint(u32), + + /// Expected hexadecimal Unicode code point + #[error("Expected hexadecimal Unicode code point.")] + ExpectedCodePoint, + + /// `DO REPEAT` nested too deeply. + #[error("`DO REPEAT` nested too deeply.")] + DoRepeatOverflow, + + /// Unexpected character. + #[error("Unexpected character {0:?} in input.")] + UnexpectedChar(char), +} + +/// The action returned by [merge_tokens]. 
+#[derive(Clone, Debug)] +pub enum MergeAction { + /// Copy one token literally from input to output. + Copy, + + /// Expand `n` tokens from the input into `token` in the output. + Expand { + /// Number of tokens to expand. + n: usize, + + /// Replacement token. + token: Token, + }, +} + +/// Used by [merge_tokens] to indicate that more input is needed. +#[derive(Copy, Clone, Debug)] +pub struct Incomplete; + +impl Segment { + /// Tries to transform this segment, which was obtained for `s`, into a + /// token. Returns one of: + /// + /// - `None`: This segment doesn't correspond to any token (because it is a + /// comment, white space, etc.) and can be dropped in tokenization. + /// + /// - `Some(Ok(token))`: This segment corresponds to the given token. + /// + /// - `Some(Err(error))`: The segment contains an error, which the caller + /// should report. + /// + /// The raw token (or error) that this function returns should ordinarily be + /// merged with adjacent tokens with [merge_tokens] or some higher-level + /// construct. + pub fn to_token(self, s: &str) -> Option> { + match self { + Segment::Number => Some(Ok(Token::Number(s.parse().unwrap()))), + Segment::QuotedString => { + // Trim quote mark from front and back. + let mut chars = s.chars(); + let quote = chars.next().unwrap(); + let s = chars.as_str().strip_suffix(quote).unwrap(); + + // Replace doubled quotes by single ones. + let (single_quote, double_quote) = match quote { + '\'' => ("'", "''"), + '"' => ("\"", "\"\""), + _ => unreachable!(), + }; + Some(Ok(Token::String(s.replace(double_quote, single_quote)))) + } + Segment::HexString => { + // Strip `X"` prefix and `"` suffix (or variations). 
+ let s = &s[2..s.len() - 1]; + for c in s.chars() { + if !c.is_ascii_hexdigit() { + return Some(Err(ScanError::BadHexDigit(c))); + } + } + if s.len() % 2 != 0 { + return Some(Err(ScanError::OddLengthHexString(s.len()))); + } + let bytes = s + .as_bytes() + .chunks_exact(2) + .map(|pair| { + let hi = char::from(pair[0]).to_digit(16).unwrap() as u8; + let lo = char::from(pair[1]).to_digit(16).unwrap() as u8; + hi * 16 + lo + }) + .collect::>(); + match String::from_utf8(bytes) { + Ok(string) => Some(Ok(Token::String(string))), + Err(error) => { + let details = error.utf8_error(); + let offset = details.valid_up_to() * 2; + let end = details + .error_len() + .map(|len| offset + len * 2) + .unwrap_or(s.len()); + let substring = String::from(&s[offset..end]); + Some(Err(if details.error_len().is_some() { + ScanError::BadUtf8 { substring, offset } + } else { + ScanError::IncompleteUtf8 { substring, offset } + })) + } + } + } + Segment::UnicodeString => { + // Strip `U"` prefix and `"` suffix (or variations). 
+ let s = &s[2..s.len() - 1]; + if !(1..=8).contains(&s.len()) { + return Some(Err(ScanError::BadLengthUnicodeString(s.len()))); + } + let Ok(code_point) = u32::from_str_radix(s, 16) else { + return Some(Err(ScanError::ExpectedCodePoint)); + }; + let Some(c) = char::from_u32(code_point) else { + return Some(Err(ScanError::BadCodePoint(code_point))); + }; + Some(Ok(Token::String(String::from(c)))) + } + + Segment::UnquotedString + | Segment::DoRepeatCommand + | Segment::InlineData + | Segment::Document + | Segment::MacroBody + | Segment::MacroName => Some(Ok(Token::String(String::from(s)))), + + Segment::Identifier => { + if let Ok(reserved_word) = ReservedWord::try_from(s) { + match reserved_word { + ReservedWord::And => Some(Ok(Token::Punct(Punct::And))), + ReservedWord::Or => Some(Ok(Token::Punct(Punct::Or))), + ReservedWord::Not => Some(Ok(Token::Punct(Punct::Not))), + ReservedWord::Eq => Some(Ok(Token::Punct(Punct::Eq))), + ReservedWord::Ge => Some(Ok(Token::Punct(Punct::Ge))), + ReservedWord::Gt => Some(Ok(Token::Punct(Punct::Gt))), + ReservedWord::Le => Some(Ok(Token::Punct(Punct::Le))), + ReservedWord::Lt => Some(Ok(Token::Punct(Punct::Lt))), + ReservedWord::Ne => Some(Ok(Token::Punct(Punct::Ne))), + ReservedWord::All => Some(Ok(Token::Punct(Punct::All))), + ReservedWord::By => Some(Ok(Token::Punct(Punct::By))), + ReservedWord::To => Some(Ok(Token::Punct(Punct::To))), + ReservedWord::With => Some(Ok(Token::Punct(Punct::With))), + } + } else { + Some(Ok(Token::Id(Identifier::new(s).unwrap()))) + } + } + Segment::Punct => match s { + "(" => Some(Ok(Token::Punct(Punct::LParen))), + ")" => Some(Ok(Token::Punct(Punct::RParen))), + "[" => Some(Ok(Token::Punct(Punct::LSquare))), + "]" => Some(Ok(Token::Punct(Punct::RSquare))), + "{" => Some(Ok(Token::Punct(Punct::LCurly))), + "}" => Some(Ok(Token::Punct(Punct::RCurly))), + "," => Some(Ok(Token::Punct(Punct::Comma))), + "=" => Some(Ok(Token::Punct(Punct::Equals))), + "-" => Some(Ok(Token::Punct(Punct::Dash))), + "&" 
=> Some(Ok(Token::Punct(Punct::And))), + "|" => Some(Ok(Token::Punct(Punct::Or))), + "+" => Some(Ok(Token::Punct(Punct::Plus))), + "/" => Some(Ok(Token::Punct(Punct::Slash))), + "*" => Some(Ok(Token::Punct(Punct::Asterisk))), + "<" => Some(Ok(Token::Punct(Punct::Lt))), + ">" => Some(Ok(Token::Punct(Punct::Gt))), + "~" => Some(Ok(Token::Punct(Punct::Not))), + ":" => Some(Ok(Token::Punct(Punct::Colon))), + ";" => Some(Ok(Token::Punct(Punct::Semicolon))), + "**" => Some(Ok(Token::Punct(Punct::Exp))), + "<=" => Some(Ok(Token::Punct(Punct::Le))), + "<>" => Some(Ok(Token::Punct(Punct::Ne))), + "~=" => Some(Ok(Token::Punct(Punct::Ne))), + ">=" => Some(Ok(Token::Punct(Punct::Ge))), + "!" => Some(Ok(Token::Punct(Punct::Bang))), + "%" => Some(Ok(Token::Punct(Punct::Percent))), + "?" => Some(Ok(Token::Punct(Punct::Question))), + "`" => Some(Ok(Token::Punct(Punct::Backtick))), + "_" => Some(Ok(Token::Punct(Punct::Underscore))), + "." => Some(Ok(Token::Punct(Punct::Dot))), + "!*" => Some(Ok(Token::Punct(Punct::BangAsterisk))), + _ => unreachable!("bad punctuator {s:?}"), + }, + Segment::Shbang + | Segment::Spaces + | Segment::Comment + | Segment::Newline + | Segment::CommentCommand => None, + Segment::DoRepeatOverflow => Some(Err(ScanError::DoRepeatOverflow)), + Segment::StartDocument => Some(Ok(Token::Id(Identifier::new("DOCUMENT").unwrap()))), + Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => { + Some(Ok(Token::End)) + } + Segment::ExpectedQuote => Some(Err(ScanError::ExpectedQuote)), + Segment::ExpectedExponent => Some(Err(ScanError::ExpectedExponent(String::from(s)))), + Segment::UnexpectedChar => { + Some(Err(ScanError::UnexpectedChar(s.chars().next().unwrap()))) + } + } + } +} + +/// Attempts to merge a sequence of tokens together into a single token. +/// +/// The tokens are taken from the beginning of `input`, which given +/// 0-based token index returns: +/// +/// * `Ok(Some(token))`: The token with the given index. 
+/// +/// * `Ok(None)`: End of input. +/// +/// * `Err(Incomplete)`: The given token isn't available yet (it may or may not +/// exist). +/// +/// This function returns one of: +/// +/// * `Ok(Some(MergeAction))`: How to transform one or more input tokens into an +/// output token. +/// +/// * `Ok(None)`: End of input. (Only returned if `input(0)` is `Ok(None)`.) +/// +/// * `Err(Incomplete)`: More input tokens are needed. Call again with longer +/// `input`. ([Token::End] or [Token::Punct(Punct::EndCmd)] is +/// always sufficient as extra input.) +/// +/// This performs two different kinds of token merging: +/// +/// - String concatenation, where syntax like `"a" + "b"` is converted into a +/// single string token. This is definitely needed because the parser relies +/// on it. +/// +/// - Negative number merging, where syntax like `-5` is converted from a pair +/// of tokens (a dash and a positive number) into a single token (a negative +/// number). This might not be needed anymore because the segmenter +/// directly treats a dash followed by a number, with optional intervening +/// white space, as a negative number. It's only needed if we want +/// intervening comments to be allowed or for part of the negative number +/// token to be produced by macro expansion. +pub fn merge_tokens<'a, F>(input: F) -> Result, Incomplete> +where + F: Fn(usize) -> Result, Incomplete>, +{ + let Some(token) = input(0)? else { + return Ok(None); + }; + match token { + Token::Punct(Punct::Dash) => match input(1)? 
{ + Some(Token::Number(number)) if number.is_sign_positive() => { + let number = *number; + Ok(Some(MergeAction::Expand { + n: 2, + token: Token::Number(-number), + })) + } + _ => Ok(Some(MergeAction::Copy)), + }, + Token::String(_) => { + let mut i = 0; + while matches!(input(i * 2 + 1)?, Some(Token::Punct(Punct::Plus))) + && matches!(input(i * 2 + 2)?, Some(Token::String(_))) + { + i += 1; + } + if i == 0 { + Ok(Some(MergeAction::Copy)) + } else { + let mut output = String::new(); + for i in 0..=i { + let Token::String(s) = input(i * 2).unwrap().unwrap() else { + unreachable!() + }; + output.push_str(s); + } + Ok(Some(MergeAction::Expand { + n: i * 2 + 1, + token: Token::String(output), + })) + } + } + _ => Ok(Some(MergeAction::Copy)), + } +} + +/// Too-simple lexical analyzer for strings. +/// +/// Given a string, [StringSegmenter] provides iteration over raw tokens. +/// Unlike [StringScanner], [StringSegmenter] does not merge tokens using +/// [merge_tokens]. Usually merging is desirable, so [StringScanner] should be +/// preferred. +/// +/// This is used as part of macro expansion. +pub struct StringSegmenter<'a> { + input: &'a str, + segmenter: Segmenter, +} + +impl<'a> StringSegmenter<'a> { + /// Creates a new [StringSegmenter] for `input` using syntax variant `mode`. + /// See [Segmenter::new] for an explanation of `is_snippet`. + pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self { + Self { + input, + segmenter: Segmenter::new(mode, is_snippet), + } + } +} + +impl<'a> Iterator for StringSegmenter<'a> { + type Item = (&'a str, Result); + + fn next(&mut self) -> Option { + loop { + let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap()?; + let (s, rest) = self.input.split_at(seg_len); + self.input = rest; + + if let Some(token) = seg_type.to_token(s) { + return Some((s, token)); + } + } + } +} + +/// Simple lexical analyzer for strings. +/// +/// Given a string, [StringScanner] provides iteration over tokens. 
+pub struct StringScanner<'a> { + input: &'a str, + eof: bool, + segmenter: Segmenter, + tokens: VecDeque, +} + +impl<'a> StringScanner<'a> { + /// Creates a new [StringScanner] for `input` using syntax variant `mode`. + /// See [Segmenter::new] for an explanation of `is_snippet`. + pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self { + Self { + input, + eof: false, + segmenter: Segmenter::new(mode, is_snippet), + tokens: VecDeque::with_capacity(1), + } + } + + fn merge(&mut self, eof: bool) -> Result>, Incomplete> { + match merge_tokens(|index| { + if let Some(token) = self.tokens.get(index) { + Ok(Some(token)) + } else if eof { + Ok(None) + } else { + Err(Incomplete) + } + })? { + Some(MergeAction::Copy) => Ok(Some(Ok(self.tokens.pop_front().unwrap()))), + Some(MergeAction::Expand { n, token }) => { + self.tokens.drain(..n); + Ok(Some(Ok(token))) + } + None => Ok(None), + } + } + + /// Transforms this [StringScanner] into an iterator that includes only the + /// [Token]s, omitting [ScanError]s. 
+ pub fn unwrapped(self) -> impl Iterator + use<'a> { + self.map(|scan_token| scan_token.ok().unwrap()) + } +} + +impl Iterator for StringScanner<'_> { + type Item = Result; + + fn next(&mut self) -> Option { + loop { + if let Ok(Some(token)) = self.merge(self.eof) { + return Some(token); + } + + let Some((seg_len, seg_type)) = self.segmenter.push(self.input, true).unwrap() else { + self.eof = true; + return self.merge(true).unwrap(); + }; + let (s, rest) = self.input.split_at(seg_len); + + match seg_type.to_token(s) { + Some(Err(error)) => { + if let Ok(Some(token)) = self.merge(true) { + return Some(token); + } + self.input = rest; + return Some(Err(error)); + } + Some(Ok(token)) => { + self.tokens.push_back(token); + } + None => (), + } + self.input = rest; + } + } +} + +#[cfg(test)] +mod test; diff --git a/rust/pspp/src/lex/scan/mod.rs b/rust/pspp/src/lex/scan/mod.rs deleted file mode 100644 index fcb1bc3416..0000000000 --- a/rust/pspp/src/lex/scan/mod.rs +++ /dev/null @@ -1,482 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . - -//! Mid-level lexical analysis. -//! -//! This module implements mid-level lexical analysis using the segments -//! output by the lower-level [segmentation phase](super::segment). -//! -//! 
Scanning accepts as input a stream of segments, which are UTF-8 strings -//! labeled with a [segment type](super::segment::Segment). It outputs a stream -//! of [Token]s used by the PSPP parser or an error. - -use crate::identifier::{Identifier, ReservedWord}; - -use super::{ - segment::{Segment, Segmenter, Syntax}, - token::{Punct, Token}, -}; -use std::collections::VecDeque; -use thiserror::Error as ThisError; - -/// Error returned by [merge_tokens]. -#[derive(ThisError, Clone, Debug, PartialEq, Eq)] -pub enum ScanError { - /// Unterminated string constant. - #[error("Unterminated string constant.")] - ExpectedQuote, - - /// Missing exponent. - #[error("Missing exponent following `{0}`")] - ExpectedExponent(String), - - /// Odd length hex string. - #[error("String of hex digits has {0} characters, which is not a multiple of 2.")] - OddLengthHexString(usize), - - /// Invalid hex digit. - #[error("Invalid hex digit {0:?}.")] - BadHexDigit(char), - - /// Incomplete UTF-8 sequence. - #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] - IncompleteUtf8 { - /// Incomplete sequence. - substring: String, - /// Offset of start of sequence. - offset: usize, - }, - - /// Bad UTF-8 sequence. - #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] - BadUtf8 { - /// Invalid sequence. - substring: String, - /// Offset of start of sequence. - offset: usize, - }, - - /// Invalid length Unicode string. - #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")] - BadLengthUnicodeString(usize), - - /// Invalid code point. - #[error("U+{0:04X} is not a valid Unicode code point.")] - BadCodePoint(u32), - - /// Expected hexadecimal Unicode code point - #[error("Expected hexadecimal Unicode code point.")] - ExpectedCodePoint, - - /// `DO REPEAT` nested too deeply. - #[error("`DO REPEAT` nested too deeply.")] - DoRepeatOverflow, - - /// Unexpected character. 
- #[error("Unexpected character {0:?} in input.")] - UnexpectedChar(char), -} - -/// The action returned by [merge_tokens]. -#[derive(Clone, Debug)] -pub enum MergeAction { - /// Copy one token literally from input to output. - Copy, - - /// Expand `n` tokens from the input into `token` in the output. - Expand { - /// Number of tokens to expand. - n: usize, - - /// Replacement token. - token: Token, - }, -} - -/// Used by [merge_tokens] to indicate that more input is needed. -#[derive(Copy, Clone, Debug)] -pub struct Incomplete; - -impl Segment { - /// Tries to transform this segment, which was obtained for `s`, into a - /// token. Returns one of: - /// - /// - `None`: This segment doesn't correspond to any token (because it is a - /// comment, white space, etc.) and can be dropped in tokenization. - /// - /// - `Some(Ok(token))`: This segment corresponds to the given token. - /// - /// - `Some(Err(error))`: The segment contains an error, which the caller - /// should report. - /// - /// The raw token (or error) that this function returns should ordinarily be - /// merged with adjacent tokens with [merge_tokens] or some higher-level - /// construct. - pub fn to_token(self, s: &str) -> Option> { - match self { - Segment::Number => Some(Ok(Token::Number(s.parse().unwrap()))), - Segment::QuotedString => { - // Trim quote mark from front and back. - let mut chars = s.chars(); - let quote = chars.next().unwrap(); - let s = chars.as_str().strip_suffix(quote).unwrap(); - - // Replace doubled quotes by single ones. - let (single_quote, double_quote) = match quote { - '\'' => ("'", "''"), - '"' => ("\"", "\"\""), - _ => unreachable!(), - }; - Some(Ok(Token::String(s.replace(double_quote, single_quote)))) - } - Segment::HexString => { - // Strip `X"` prefix and `"` suffix (or variations). 
- let s = &s[2..s.len() - 1]; - for c in s.chars() { - if !c.is_ascii_hexdigit() { - return Some(Err(ScanError::BadHexDigit(c))); - } - } - if s.len() % 2 != 0 { - return Some(Err(ScanError::OddLengthHexString(s.len()))); - } - let bytes = s - .as_bytes() - .chunks_exact(2) - .map(|pair| { - let hi = char::from(pair[0]).to_digit(16).unwrap() as u8; - let lo = char::from(pair[1]).to_digit(16).unwrap() as u8; - hi * 16 + lo - }) - .collect::>(); - match String::from_utf8(bytes) { - Ok(string) => Some(Ok(Token::String(string))), - Err(error) => { - let details = error.utf8_error(); - let offset = details.valid_up_to() * 2; - let end = details - .error_len() - .map(|len| offset + len * 2) - .unwrap_or(s.len()); - let substring = String::from(&s[offset..end]); - Some(Err(if details.error_len().is_some() { - ScanError::BadUtf8 { substring, offset } - } else { - ScanError::IncompleteUtf8 { substring, offset } - })) - } - } - } - Segment::UnicodeString => { - // Strip `U"` prefix and `"` suffix (or variations). 
- let s = &s[2..s.len() - 1]; - if !(1..=8).contains(&s.len()) { - return Some(Err(ScanError::BadLengthUnicodeString(s.len()))); - } - let Ok(code_point) = u32::from_str_radix(s, 16) else { - return Some(Err(ScanError::ExpectedCodePoint)); - }; - let Some(c) = char::from_u32(code_point) else { - return Some(Err(ScanError::BadCodePoint(code_point))); - }; - Some(Ok(Token::String(String::from(c)))) - } - - Segment::UnquotedString - | Segment::DoRepeatCommand - | Segment::InlineData - | Segment::Document - | Segment::MacroBody - | Segment::MacroName => Some(Ok(Token::String(String::from(s)))), - - Segment::Identifier => { - if let Ok(reserved_word) = ReservedWord::try_from(s) { - match reserved_word { - ReservedWord::And => Some(Ok(Token::Punct(Punct::And))), - ReservedWord::Or => Some(Ok(Token::Punct(Punct::Or))), - ReservedWord::Not => Some(Ok(Token::Punct(Punct::Not))), - ReservedWord::Eq => Some(Ok(Token::Punct(Punct::Eq))), - ReservedWord::Ge => Some(Ok(Token::Punct(Punct::Ge))), - ReservedWord::Gt => Some(Ok(Token::Punct(Punct::Gt))), - ReservedWord::Le => Some(Ok(Token::Punct(Punct::Le))), - ReservedWord::Lt => Some(Ok(Token::Punct(Punct::Lt))), - ReservedWord::Ne => Some(Ok(Token::Punct(Punct::Ne))), - ReservedWord::All => Some(Ok(Token::Punct(Punct::All))), - ReservedWord::By => Some(Ok(Token::Punct(Punct::By))), - ReservedWord::To => Some(Ok(Token::Punct(Punct::To))), - ReservedWord::With => Some(Ok(Token::Punct(Punct::With))), - } - } else { - Some(Ok(Token::Id(Identifier::new(s).unwrap()))) - } - } - Segment::Punct => match s { - "(" => Some(Ok(Token::Punct(Punct::LParen))), - ")" => Some(Ok(Token::Punct(Punct::RParen))), - "[" => Some(Ok(Token::Punct(Punct::LSquare))), - "]" => Some(Ok(Token::Punct(Punct::RSquare))), - "{" => Some(Ok(Token::Punct(Punct::LCurly))), - "}" => Some(Ok(Token::Punct(Punct::RCurly))), - "," => Some(Ok(Token::Punct(Punct::Comma))), - "=" => Some(Ok(Token::Punct(Punct::Equals))), - "-" => Some(Ok(Token::Punct(Punct::Dash))), - "&" 
=> Some(Ok(Token::Punct(Punct::And))), - "|" => Some(Ok(Token::Punct(Punct::Or))), - "+" => Some(Ok(Token::Punct(Punct::Plus))), - "/" => Some(Ok(Token::Punct(Punct::Slash))), - "*" => Some(Ok(Token::Punct(Punct::Asterisk))), - "<" => Some(Ok(Token::Punct(Punct::Lt))), - ">" => Some(Ok(Token::Punct(Punct::Gt))), - "~" => Some(Ok(Token::Punct(Punct::Not))), - ":" => Some(Ok(Token::Punct(Punct::Colon))), - ";" => Some(Ok(Token::Punct(Punct::Semicolon))), - "**" => Some(Ok(Token::Punct(Punct::Exp))), - "<=" => Some(Ok(Token::Punct(Punct::Le))), - "<>" => Some(Ok(Token::Punct(Punct::Ne))), - "~=" => Some(Ok(Token::Punct(Punct::Ne))), - ">=" => Some(Ok(Token::Punct(Punct::Ge))), - "!" => Some(Ok(Token::Punct(Punct::Bang))), - "%" => Some(Ok(Token::Punct(Punct::Percent))), - "?" => Some(Ok(Token::Punct(Punct::Question))), - "`" => Some(Ok(Token::Punct(Punct::Backtick))), - "_" => Some(Ok(Token::Punct(Punct::Underscore))), - "." => Some(Ok(Token::Punct(Punct::Dot))), - "!*" => Some(Ok(Token::Punct(Punct::BangAsterisk))), - _ => unreachable!("bad punctuator {s:?}"), - }, - Segment::Shbang - | Segment::Spaces - | Segment::Comment - | Segment::Newline - | Segment::CommentCommand => None, - Segment::DoRepeatOverflow => Some(Err(ScanError::DoRepeatOverflow)), - Segment::StartDocument => Some(Ok(Token::Id(Identifier::new("DOCUMENT").unwrap()))), - Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => { - Some(Ok(Token::End)) - } - Segment::ExpectedQuote => Some(Err(ScanError::ExpectedQuote)), - Segment::ExpectedExponent => Some(Err(ScanError::ExpectedExponent(String::from(s)))), - Segment::UnexpectedChar => { - Some(Err(ScanError::UnexpectedChar(s.chars().next().unwrap()))) - } - } - } -} - -/// Attempts to merge a sequence of tokens together into a single token. -/// -/// The tokens are taken from the beginning of `input`, which given -/// 0-based token index returns: -/// -/// * `Ok(Some(token))`: The token with the given index. 
-/// -/// * `Ok(None)`: End of input. -/// -/// * `Err(Incomplete)`: The given token isn't available yet (it may or may not -/// exist). -/// -/// This function returns one of: -/// -/// * `Ok(Some(MergeAction))`: How to transform one or more input tokens into an -/// output token. -/// -/// * `Ok(None)`: End of input. (Only returned if `input(0)` is `Ok(None)`.) -/// -/// * `Err(Incomplete)`: More input tokens are needed. Call again with longer -/// `input`. ([Token::End] or [Token::Punct(Punct::EndCmd)] is -/// always sufficient as extra input.) -/// -/// This performs two different kinds of token merging: -/// -/// - String concatenation, where syntax like `"a" + "b"` is converted into a -/// single string token. This is definitely needed because the parser relies -/// on it. -/// -/// - Negative number merging, where syntax like `-5` is converted from a pair -/// of tokens (a dash and a positive number) into a single token (a negative -/// number). This might not be needed anymore because the segmenter -/// directly treats a dash followed by a number, with optional intervening -/// white space, as a negative number. It's only needed if we want -/// intervening comments to be allowed or for part of the negative number -/// token to be produced by macro expansion. -pub fn merge_tokens<'a, F>(input: F) -> Result, Incomplete> -where - F: Fn(usize) -> Result, Incomplete>, -{ - let Some(token) = input(0)? else { - return Ok(None); - }; - match token { - Token::Punct(Punct::Dash) => match input(1)? 
{ - Some(Token::Number(number)) if number.is_sign_positive() => { - let number = *number; - Ok(Some(MergeAction::Expand { - n: 2, - token: Token::Number(-number), - })) - } - _ => Ok(Some(MergeAction::Copy)), - }, - Token::String(_) => { - let mut i = 0; - while matches!(input(i * 2 + 1)?, Some(Token::Punct(Punct::Plus))) - && matches!(input(i * 2 + 2)?, Some(Token::String(_))) - { - i += 1; - } - if i == 0 { - Ok(Some(MergeAction::Copy)) - } else { - let mut output = String::new(); - for i in 0..=i { - let Token::String(s) = input(i * 2).unwrap().unwrap() else { - unreachable!() - }; - output.push_str(s); - } - Ok(Some(MergeAction::Expand { - n: i * 2 + 1, - token: Token::String(output), - })) - } - } - _ => Ok(Some(MergeAction::Copy)), - } -} - -/// Too-simple lexical analyzer for strings. -/// -/// Given a string, [StringSegmenter] provides iteration over raw tokens. -/// Unlike [StringScanner], [StringSegmenter] does not merge tokens using -/// [merge_tokens]. Usually merging is desirable, so [StringScanner] should be -/// preferred. -/// -/// This is used as part of macro expansion. -pub struct StringSegmenter<'a> { - input: &'a str, - segmenter: Segmenter, -} - -impl<'a> StringSegmenter<'a> { - /// Creates a new [StringSegmenter] for `input` using syntax variant `mode`. - /// See [Segmenter::new] for an explanation of `is_snippet`. - pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self { - Self { - input, - segmenter: Segmenter::new(mode, is_snippet), - } - } -} - -impl<'a> Iterator for StringSegmenter<'a> { - type Item = (&'a str, Result); - - fn next(&mut self) -> Option { - loop { - let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap()?; - let (s, rest) = self.input.split_at(seg_len); - self.input = rest; - - if let Some(token) = seg_type.to_token(s) { - return Some((s, token)); - } - } - } -} - -/// Simple lexical analyzer for strings. -/// -/// Given a string, [StringScanner] provides iteration over tokens. 
-pub struct StringScanner<'a> { - input: &'a str, - eof: bool, - segmenter: Segmenter, - tokens: VecDeque, -} - -impl<'a> StringScanner<'a> { - /// Creates a new [StringScanner] for `input` using syntax variant `mode`. - /// See [Segmenter::new] for an explanation of `is_snippet`. - pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self { - Self { - input, - eof: false, - segmenter: Segmenter::new(mode, is_snippet), - tokens: VecDeque::with_capacity(1), - } - } - - fn merge(&mut self, eof: bool) -> Result>, Incomplete> { - match merge_tokens(|index| { - if let Some(token) = self.tokens.get(index) { - Ok(Some(token)) - } else if eof { - Ok(None) - } else { - Err(Incomplete) - } - })? { - Some(MergeAction::Copy) => Ok(Some(Ok(self.tokens.pop_front().unwrap()))), - Some(MergeAction::Expand { n, token }) => { - self.tokens.drain(..n); - Ok(Some(Ok(token))) - } - None => Ok(None), - } - } - - /// Transforms this [StringScanner] into an iterator that includes only the - /// [Token]s, omitting [ScanError]s. 
- pub fn unwrapped(self) -> impl Iterator + use<'a> { - self.map(|scan_token| scan_token.ok().unwrap()) - } -} - -impl Iterator for StringScanner<'_> { - type Item = Result; - - fn next(&mut self) -> Option { - loop { - if let Ok(Some(token)) = self.merge(self.eof) { - return Some(token); - } - - let Some((seg_len, seg_type)) = self.segmenter.push(self.input, true).unwrap() else { - self.eof = true; - return self.merge(true).unwrap(); - }; - let (s, rest) = self.input.split_at(seg_len); - - match seg_type.to_token(s) { - Some(Err(error)) => { - if let Ok(Some(token)) = self.merge(true) { - return Some(token); - } - self.input = rest; - return Some(Err(error)); - } - Some(Ok(token)) => { - self.tokens.push_back(token); - } - None => (), - } - self.input = rest; - } - } -} - -#[cfg(test)] -mod test; diff --git a/rust/pspp/src/lex/segment.rs b/rust/pspp/src/lex/segment.rs new file mode 100644 index 0000000000..5a568692b5 --- /dev/null +++ b/rust/pspp/src/lex/segment.rs @@ -0,0 +1,1442 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Low-level lexical analysis. +//! +//! PSPP divides traditional "lexical analysis" or "tokenization" into [three +//! phases](super). This module implements the low-level segmentation phase. +//! +//! Segmentation accepts a stream of UTF-8 bytes as input. 
It outputs a label +//! (a segment type) for each byte or contiguous sequence of bytes in the input. +//! It also, in a few corner cases, outputs zero-width segments that label the +//! boundary between a pair of bytes in the input. +//! +//! Some segment types correspond directly to tokens; for example, +//! [Segment::Identifier] becomes [Token::Id] later in lexical analysis. Other +//! segments contribute to tokens but do not correspond directly; for example, +//! multiple quoted strings [Segment::QuotedString] separated by +//! [Segment::Spaces] and "+" punctuators [Segment::Punct] may be combined to +//! form a single string token [Token::String]. Still other segments are +//! ignored (e.g. [Segment::Spaces]) or trigger special behavior such as error +//! messages later in tokenization (e.g. [Segment::ExpectedQuote]). +//! +//! [Token::Id]: crate::lex::token::Token::Id +//! [Token::String]: crate::lex::token::Token::String + +use std::cmp::Ordering; + +use crate::{ + identifier::{id_match, id_match_n, IdentifierChar}, + prompt::PromptStyle, +}; +use bitflags::bitflags; + +use super::command_name::{command_match, COMMAND_NAMES}; + +/// Syntax variant. +/// +/// PSPP syntax is written in one of two syntax variants, which are broadly +/// defined as follows: +/// +/// - In interactive syntax, commands end with a period at the end of the line +/// or with a blank line. +/// +/// - In batch syntax, the second and subsequent lines of a command are indented +/// from the left margin. +/// +/// The segmenter can also try to automatically detect the kind of syntax in +/// use, using a heuristic that is usually correct. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)] +pub enum Syntax { + /// Try to interpret input correctly regardless of whether it is written + /// for interactive or batch syntax. + /// + /// This is `Syntax::default()`. + #[default] + Auto, + + /// Interactive syntax. + Interactive, + + /// Batch syntax. + Batch, +} + +/// The type of a segment. 
+/// +/// A [Segment] is a label for a string slice and is normally paired with one. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Segment { + /// A number. + Number, + + /// A quoted string (`'...'` or `"..."`). + QuotedString, + + /// A hexadecimal string (`X'...'` or `X"..."`). + HexString, + + /// A Unicode string (`U'...'` or `U"..."`). + UnicodeString, + + /// An unquoted string. + /// + /// Unquoted strings appear only in a few special-case constructs, such as + /// the `FILE LABEL` command. + UnquotedString, + + /// An identifier. + Identifier, + + /// A punctuator or operator. + Punct, + + /// `#!` at the beginning of a syntax file only. + Shbang, + + /// Spaces. + Spaces, + + /// A comment (`/* ... */`). + Comment, + + /// New-line. + Newline, + + /// A comment command (`* ...` or `COMMENT ...`). + CommentCommand, + + /// In a `DO REPEAT` command, one of the lines to be repeated. + DoRepeatCommand, + + /// Indicates `DO REPEAT` nested more deeply than supported. + DoRepeatOverflow, + + /// A line of inline data inside `BEGIN DATA`...`END DATA`. + InlineData, + + /// In `!DEFINE`, an identifier for the macro being defined. + /// + /// Distinguished from [Identifier](Self::Identifier) because a `MacroName` + /// must never be macro-expanded. + MacroName, + + /// Contents of `!DEFINE`...`!ENDDEFINE`. + MacroBody, + + /// Represents the `DOCUMENT` beginning a `DOCUMENT` command. + /// + /// This token is not associated with any text: the actual `DOCUMENT` + /// keyword is part of the following [Document](Self::Document) segment. + /// This is because documents include the `DOCUMENT` keyword. + StartDocument, + + /// One of the lines of documents in a `DOCUMENT` command. + /// + /// The first line of a document includes the `DOCUMENT` keyword itself. + Document, + + /// A command separator. + /// + /// This segment is usually for `+`, `-`, or `.` at the beginning of a line. + StartCommand, + + /// A command separator. 
+ /// + /// This segment is usually for a blank line. It also appears at the end of + /// a file. + SeparateCommands, + + /// A command separator. + /// + /// This segment is for `.` at the end of a line. + EndCommand, + + /// Missing quote at the end of a line. + /// + /// This segment contains a partial quoted string. It starts with a quote + /// mark (`"` or `'`, possibly preceded by `X` or `U`) but goes to the end + /// of the line without the matching end quote mark. + ExpectedQuote, + + /// Missing exponent in number. + /// + /// This segment contains a number that ends with `E` or `E+` or `E-` + /// without a following exponent. + ExpectedExponent, + + /// Unexpected character. + /// + /// The segment is a single character that isn't valid in syntax. + UnexpectedChar, +} + +bitflags! { + #[derive(Copy, Clone, Debug)] + struct Substate: u8 { + const START_OF_LINE = 1; + const START_OF_COMMAND = 2; + } +} + +/// Used by [Segmenter] to indicate that more input is needed. +#[derive(Copy, Clone, Debug)] +pub struct Incomplete; + +/// Labels syntax input with [Segment]s. +#[derive(Copy, Clone)] +pub struct Segmenter { + state: (State, Substate), + nest: u8, + syntax: Syntax, +} + +impl Segmenter { + /// Returns a segmenter with the given `syntax`. + /// + /// If `is_snippet` is false, then the segmenter will parse as if it's being + /// given a whole file. This means, for example, that it will interpret `-` + /// or `+` at the beginning of the syntax as a separator between commands + /// (since `-` or `+` at the beginning of a line has this meaning). + /// + /// If `is_snippet` is true, then the segmenter will parse as if it's being + /// given an isolated piece of syntax. This means, for example, that + /// it will interpret `-` or `+` at the beginning of the syntax as an + /// operator token or (if followed by a digit) as part of a number. 
+ pub fn new(syntax: Syntax, is_snippet: bool) -> Self { + Self { + state: if is_snippet { + (State::General, Substate::empty()) + } else { + (State::Shbang, Substate::empty()) + }, + syntax, + nest: 0, + } + } + + /// Returns the [Syntax] variant passed in to [new](Self::new). + pub fn syntax(&self) -> Syntax { + self.syntax + } + + fn start_of_line(&self) -> bool { + self.state.1.contains(Substate::START_OF_LINE) + } + + fn start_of_command(&self) -> bool { + self.state.1.contains(Substate::START_OF_COMMAND) + } + + /// Returns the style of command prompt to display to an interactive user + /// for input in the current state. The return value is most accurate + /// with [Syntax::Interactive] syntax and at the beginning of a line (that + /// is, if [Segmenter::push] consumed as much as possible of the input up to + /// a new-line). + pub fn prompt(&self) -> PromptStyle { + match self.state.0 { + State::Shbang => PromptStyle::First, + State::General => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::Comment1 | State::Comment2 => PromptStyle::Comment, + State::Document1 | State::Document2 => PromptStyle::Document, + State::Document3 => PromptStyle::First, + State::FileLabel1 => PromptStyle::Later, + State::FileLabel2 | State::FileLabel3 => PromptStyle::First, + State::DoRepeat1 | State::DoRepeat2 => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::DoRepeat3 => PromptStyle::DoRepeat, + State::DoRepeat4 => PromptStyle::DoRepeat, + State::Define1 | State::Define2 | State::Define3 => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define, + State::BeginData1 => PromptStyle::First, + State::BeginData2 => PromptStyle::Later, + State::BeginData3 | State::BeginData4 => PromptStyle::Data, + } + } + + fn push_rest<'a>( + &mut self, + input: &'a str, + 
eof: bool, + ) -> Result, Incomplete> { + if input.is_empty() { + if eof { + return Ok(None); + } else { + return Err(Incomplete); + }; + } + + match self.state.0 { + State::Shbang => self.parse_shbang(input, eof), + State::General => { + if self.start_of_line() { + self.parse_start_of_line(input, eof) + } else { + self.parse_mid_line(input, eof) + } + } + State::Comment1 => self.parse_comment_1(input, eof), + State::Comment2 => self.parse_comment_2(input, eof), + State::Document1 => self.parse_document_1(input, eof), + State::Document2 => self.parse_document_2(input, eof), + State::Document3 => self.parse_document_3(input, eof), + State::FileLabel1 => self.parse_file_label_1(input, eof), + State::FileLabel2 => self.parse_file_label_2(input, eof), + State::FileLabel3 => self.parse_file_label_3(input, eof), + State::DoRepeat1 => self.parse_do_repeat_1(input, eof), + State::DoRepeat2 => self.parse_do_repeat_2(input, eof), + State::DoRepeat3 => self.parse_do_repeat_3(input, eof), + State::DoRepeat4 => self.parse_do_repeat_4(input), + State::Define1 => self.parse_define_1_2(input, eof), + State::Define2 => self.parse_define_1_2(input, eof), + State::Define3 => self.parse_define_3(input, eof), + State::Define4 => self.parse_define_4_5(input, eof), + State::Define5 => self.parse_define_4_5(input, eof), + State::Define6 => self.parse_define_6(input, eof), + State::BeginData1 => self.parse_begin_data_1(input, eof), + State::BeginData2 => self.parse_begin_data_2(input, eof), + State::BeginData3 => self.parse_begin_data_3(input, eof), + State::BeginData4 => self.parse_begin_data_4(input, eof), + } + } + + /// Attempts to label a prefix of the remaining input with a segment type. + /// The caller supplies a prefix of the remaining input as `input`. If + /// `eof` is true, then `input` is the entire (remainder) of the input; if + /// `eof` is false, then further input is potentially available. + /// + /// The input may contain `\n` or `\r\n` line ends in any combination. 
+ /// + /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes + /// in the segment at the beginning of `input` (a number in + /// `0..=input.len()`) and the type of that segment. The next call should + /// not include those bytes in `input`, because the segmenter has + /// (figuratively) consumed them. + /// + /// Segments can have zero length, including segment types + /// [Segment::SeparateCommands], [Segment::StartDocument], + /// [Segment::InlineData], and [Segment::Spaces]. + /// + /// Failure occurs only if the segment type of the bytes in `input` cannot + /// yet be determined. In this case, this function returns + /// `Err(Incomplete)`. If more input is available, the caller should obtain + /// some more, then call again with a longer `input`. If this is still not + /// enough, the process might need to repeat again and again. If input is + /// exhausted, then the caller may call again setting `eof` to true. This + /// function will never return `Err(Incomplete)` when `eof` is true. + /// + /// The caller must not, in a sequence of calls, supply contradictory input. + /// That is, bytes provided as part of `input` in one call, but not + /// consumed, must not be provided with *different* values on subsequent + /// calls. This is because the function must often make decisions based on + /// looking ahead beyond the bytes that it consumes. + pub fn push(&mut self, input: &str, eof: bool) -> Result, Incomplete> { + Ok(self + .push_rest(input, eof)? 
+ .map(|(rest, seg_type)| (input.len() - rest.len(), seg_type))) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum State { + Shbang, + General, + Comment1, + Comment2, + Document1, + Document2, + Document3, + FileLabel1, + FileLabel2, + FileLabel3, + DoRepeat1, + DoRepeat2, + DoRepeat3, + DoRepeat4, + Define1, + Define2, + Define3, + Define4, + Define5, + Define6, + BeginData1, + BeginData2, + BeginData3, + BeginData4, +} + +fn take(input: &str, eof: bool) -> Result<(Option, &str), Incomplete> { + let mut iter = input.chars(); + match iter.next() { + None if !eof => Err(Incomplete), + c => Ok((c, iter.as_str())), + } +} + +fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input), + '*' => { + if let (Some('/'), rest) = take(rest, eof)? { + return Ok(rest); + } + } + _ => (), + }; + input = rest; + } +} + +fn skip_matching(f: F, input: &str, eof: bool) -> Result<&str, Incomplete> +where + F: Fn(char) -> bool, +{ + let input = input.trim_start_matches(f); + if input.is_empty() && !eof { + Err(Incomplete) + } else { + Ok(input) + } +} + +fn match_char(f: F, input: &str, eof: bool) -> Result, Incomplete> +where + F: Fn(char) -> bool, +{ + if let (Some(c), rest) = take(input, eof)? { + if f(c) { + return Ok(Some(rest)); + } + } + Ok(None) +} + +fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '\r' | '\n' if is_end_of_line(input, eof)? 
=> return Ok(input), + c if c.is_whitespace() => (), + _ => return Ok(input), + } + input = rest; + } +} + +fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> { + skip_matching(|c| c.is_ascii_digit(), input, eof) +} + +fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '/' => { + let (c, rest2) = take(rest, eof)?; + match c { + Some('*') => input = skip_comment(rest2, eof)?, + Some(_) | None => return Ok(rest), + } + } + '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), + c if c.is_whitespace() => input = rest, + _ => return Ok(input), + }; + } +} + +fn is_start_of_string(input: &str, eof: bool) -> Result { + let (Some(c), rest) = take(input, eof)? else { + return Ok(false); + }; + match c { + 'x' | 'X' | 'u' | 'U' => { + let (c, _rest) = take(rest, eof)?; + Ok(c == Some('\'') || c == Some('"')) + } + '\'' | '"' => Ok(true), + '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true), + _ => Ok(false), + } +} + +fn is_end_of_line(input: &str, eof: bool) -> Result { + let (Some(c), rest) = take(input, eof)? 
else { + return Ok(true); + }; + Ok(match c { + '\n' => true, + '\r' => take(rest, eof)?.0 == Some('\n'), + _ => false, + }) +} + +fn at_end_of_line(input: &str, eof: bool) -> Result { + is_end_of_line(skip_spaces_and_comments(input, eof)?, eof) +} + +fn first(s: &str) -> char { + s.chars().next().unwrap() +} +fn get_command_name_candidates(target: &str) -> &[&'static str] { + if target.is_empty() { + return &[]; + } + let target_first = first(target).to_ascii_uppercase(); + let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first); + let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first); + &COMMAND_NAMES[low..high] +} + +fn detect_command_name(input: &str, eof: bool) -> Result { + let command_name = input + .split(|c: char| { + !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-') + }) + .next() + .unwrap(); + if !eof && command_name.len() == input.len() { + return Err(Incomplete); + } + let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.'); + for command in get_command_name_candidates(command_name) { + if let Some(m) = command_match(command, command_name) { + if m.missing_words <= 0 { + return Ok(true); + } + } + } + Ok(false) +} + +impl Segmenter { + fn parse_shbang<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + if let (Some('#'), rest) = take(input, eof)? { + if let (Some('!'), rest) = take(rest, eof)? 
{ + let rest = self.parse_full_line(rest, eof)?; + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok(Some((rest, Segment::Shbang))); + } + } + + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + self.push_rest(input, eof) + } + fn at_command_start(&self, input: &str, eof: bool) -> Result { + match self.syntax { + Syntax::Auto => detect_command_name(input, eof), + Syntax::Interactive => Ok(false), + Syntax::Batch => Ok(true), + } + } + fn parse_start_of_line<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + debug_assert_eq!(self.state.0, State::General); + debug_assert!(self.start_of_line()); + debug_assert!(!input.is_empty()); + + let (Some(c), rest) = take(input, eof).unwrap() else { + unreachable!() + }; + match c { + '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => { + // This `+` is punctuation that may separate pieces of a string. + self.state = (State::General, Substate::empty()); + return Ok(Some((rest, Segment::Punct))); + } + '+' | '-' | '.' => { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok(Some((rest, Segment::StartCommand))); + } + _ if c.is_whitespace() => { + if at_end_of_line(input, eof)? { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok(Some((input, Segment::SeparateCommands))); + } + } + _ => { + if self.at_command_start(input, eof)? + && !self.state.1.contains(Substate::START_OF_COMMAND) + { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok(Some((input, Segment::StartCommand))); + } + } + } + self.state.1 = Substate::START_OF_COMMAND; + self.parse_mid_line(input, eof) + } + fn parse_mid_line<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + debug_assert!(self.state.0 == State::General); + debug_assert!(!self.state.1.contains(Substate::START_OF_LINE)); + let (Some(c), rest) = take(input, eof)? 
else { + unreachable!() + }; + match c { + '\r' | '\n' if is_end_of_line(input, eof)? => { + self.state.1 |= Substate::START_OF_LINE; + Ok(Some(( + self.parse_newline(input, eof).unwrap().unwrap(), + Segment::Newline, + ))) + } + '/' => { + if let (Some('*'), rest) = take(rest, eof)? { + let rest = skip_comment(rest, eof)?; + Ok(Some((rest, Segment::Comment))) + } else { + self.state.1 = Substate::empty(); + Ok(Some((rest, Segment::Punct))) + } + } + '-' => { + let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?; + match c { + Some(c) if c.is_ascii_digit() => { + return self.parse_number(rest, eof); + } + Some('.') => { + if let (Some(c), _rest) = take(rest2, eof)? { + if c.is_ascii_digit() { + return self.parse_number(rest, eof); + } + } + } + None | Some(_) => (), + } + self.state.1 = Substate::empty(); + Ok(Some((rest, Segment::Punct))) + } + '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => { + self.state.1 = Substate::empty(); + Ok(Some((rest, Segment::Punct))) + } + '*' => { + if self.state.1.contains(Substate::START_OF_COMMAND) { + self.state = (State::Comment1, Substate::empty()); + self.parse_comment_1(input, eof) + } else { + self.parse_digraph(&['*'], rest, eof) + } + } + '<' => self.parse_digraph(&['=', '>'], rest, eof), + '>' => self.parse_digraph(&['='], rest, eof), + '~' => self.parse_digraph(&['='], rest, eof), + '.' if at_end_of_line(rest, eof)? => { + self.state.1 = Substate::START_OF_COMMAND; + Ok(Some((rest, Segment::EndCommand))) + } + '.' => match take(rest, eof)? { + (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof), + _ => Ok(Some((rest, Segment::Punct))), + }, + '0'..='9' => self.parse_number(input, eof), + 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof), + 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof), + '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof), + '!' 
=> { + let (c, rest2) = take(rest, eof)?; + match c { + Some('*') => Ok(Some((rest2, Segment::Punct))), + Some(_) => self.parse_id(input, eof), + None => Ok(Some((rest, Segment::Punct))), + } + } + c if c.is_whitespace() => Ok(Some((skip_spaces(rest, eof)?, Segment::Spaces))), + c if c.may_start_id() => self.parse_id(input, eof), + '#'..='~' if c != '\\' && c != '^' => { + self.state.1 = Substate::empty(); + Ok(Some((rest, Segment::Punct))) + } + _ => { + self.state.1 = Substate::empty(); + Ok(Some((rest, Segment::UnexpectedChar))) + } + } + } + fn parse_string<'a>( + &mut self, + segment: Segment, + quote: char, + mut input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + while let (Some(c), rest) = take(input, eof)? { + match c { + _ if c == quote => { + let (c, rest2) = take(rest, eof)?; + if c != Some(quote) { + self.state.1 = Substate::empty(); + return Ok(Some((rest, segment))); + } + input = rest2; + } + '\r' | '\n' if is_end_of_line(input, eof)? => break, + _ => input = rest, + } + } + self.state.1 = Substate::empty(); + Ok(Some((input, Segment::ExpectedQuote))) + } + fn maybe_parse_string<'a>( + &mut self, + segment: Segment, + input: (&'a str, &'a str), + eof: bool, + ) -> Result, Incomplete> { + match take(input.1, eof)? { + (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof), + _ => self.parse_id(input.0, eof), + } + } + fn next_id_in_command<'a>( + &self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, &'a str), Incomplete> { + let mut sub = Segmenter::new(self.syntax, true); + loop { + let Some((seg_len, seg_type)) = sub.push(input, eof)? 
else { + return Ok((input, input)); + }; + let (segment, rest) = input.split_at(seg_len); + match seg_type { + Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (), + + Segment::Identifier => return Ok((segment, rest)), + + Segment::Number + | Segment::QuotedString + | Segment::HexString + | Segment::UnicodeString + | Segment::UnquotedString + | Segment::Punct + | Segment::CommentCommand + | Segment::DoRepeatCommand + | Segment::DoRepeatOverflow + | Segment::InlineData + | Segment::MacroName + | Segment::MacroBody + | Segment::StartDocument + | Segment::Document + | Segment::StartCommand + | Segment::SeparateCommands + | Segment::EndCommand + | Segment::ExpectedQuote + | Segment::ExpectedExponent + | Segment::UnexpectedChar => return Ok(("", rest)), + } + input = rest; + } + } + fn parse_id<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (Some(_), mut end) = take(input, eof).unwrap() else { + unreachable!() + }; + while let (Some(c), rest) = take(end, eof)? { + if !c.may_continue_id() { + break; + }; + end = rest; + } + let identifier = &input[..input.len() - end.len()]; + let identifier = match identifier.strip_suffix('.') { + Some(without_dot) if at_end_of_line(end, eof)? 
=> without_dot, + _ => identifier, + }; + let rest = &input[identifier.len()..]; + + if self.state.1.contains(Substate::START_OF_COMMAND) { + if id_match_n("COMMENT", identifier, 4) { + self.state = (State::Comment1, Substate::empty()); + return self.parse_comment_1(input, eof); + } else if id_match("DOCUMENT", identifier) { + self.state = (State::Document1, Substate::empty()); + return Ok(Some((input, Segment::StartDocument))); + } else if id_match_n("DEFINE", identifier, 6) { + self.state = (State::Define1, Substate::empty()); + } else if id_match("FILE", identifier) { + if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) { + self.state = (State::FileLabel1, Substate::empty()); + return Ok(Some((rest, Segment::Identifier))); + } + } else if id_match("DO", identifier) { + if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) { + self.state = (State::DoRepeat1, Substate::empty()); + return Ok(Some((rest, Segment::Identifier))); + } + } else if id_match("BEGIN", identifier) { + let (next_id, rest2) = self.next_id_in_command(rest, eof)?; + if id_match("DATA", next_id) { + let rest2 = skip_spaces_and_comments(rest2, eof)?; + let rest2 = if let Some(s) = rest2.strip_prefix('.') { + skip_spaces_and_comments(s, eof)? + } else { + rest2 + }; + if is_end_of_line(rest2, eof)? { + let s = &input[..input.len() - rest2.len()]; + self.state = ( + if s.contains('\n') { + State::BeginData1 + } else { + State::BeginData2 + }, + Substate::empty(), + ); + return Ok(Some((rest, Segment::Identifier))); + } + } + } + } + + self.state.1 = Substate::empty(); + Ok(Some(( + rest, + if identifier != "!" 
{ + Segment::Identifier + } else { + Segment::Punct + }, + ))) + } + fn parse_digraph<'a>( + &mut self, + seconds: &[char], + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (c, rest) = take(input, eof)?; + self.state.1 = Substate::empty(); + Ok(Some(( + match c { + Some(c) if seconds.contains(&c) => rest, + _ => input, + }, + Segment::Punct, + ))) + } + fn parse_number<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let mut input = skip_digits(input, eof)?; + if let Some(rest) = match_char(|c| c == '.', input, eof)? { + let rest2 = skip_digits(rest, eof)?; + if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? { + input = rest2; + } + }; + if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? { + let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest); + let rest2 = skip_digits(rest, eof)?; + if rest2.len() == rest.len() { + self.state.1 = Substate::empty(); + return Ok(Some((rest, Segment::ExpectedExponent))); + } + input = rest2; + } + self.state.1 = Substate::empty(); + Ok(Some((input, Segment::Number))) + } + fn parse_comment_1<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + enum CommentState<'a> { + Blank, + NotBlank, + Period(&'a str), + } + let mut state = CommentState::Blank; + loop { + let (Some(c), rest) = take(input, eof)? else { + // End of file. + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok(Some((input, Segment::SeparateCommands))); + }; + match c { + '.' => state = CommentState::Period(input), + '\n' | '\r' if is_end_of_line(input, eof)? => { + match state { + CommentState::Blank => { + // Blank line ends comment command. + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok(Some((input, Segment::SeparateCommands))); + } + CommentState::Period(period) => { + // '.' at end of line ends comment command. 
+ self.state = (State::General, Substate::empty()); + return Ok(Some((period, Segment::CommentCommand))); + } + CommentState::NotBlank => { + // Comment continues onto next line. + self.state = (State::Comment2, Substate::empty()); + return Ok(Some((input, Segment::CommentCommand))); + } + } + } + c if c.is_whitespace() => (), + _ => state = CommentState::NotBlank, + } + input = rest; + } + } + fn parse_comment_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + + let new_command = match take(rest, eof)?.0 { + Some('+') | Some('-') | Some('.') => true, + Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?, + None | Some(_) => false, + }; + if new_command { + self.state = ( + State::General, + Substate::START_OF_LINE | Substate::START_OF_COMMAND, + ); + } else { + self.state = (State::Comment1, Substate::empty()); + } + Ok(Some((rest, Segment::Newline))) + } + fn parse_document_1<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let mut end_cmd = false; + loop { + let (Some(c), rest) = take(input, eof)? else { + self.state = (State::Document3, Substate::empty()); + return Ok(Some((input, Segment::Document))); + }; + match c { + '.' => end_cmd = true, + '\n' | '\r' if is_end_of_line(input, eof)? 
=> { + self.state.0 = if end_cmd { + State::Document3 + } else { + State::Document2 + }; + return Ok(Some((input, Segment::Document))); + } + c if !c.is_whitespace() => end_cmd = false, + _ => (), + } + input = rest; + } + } + fn parse_document_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + self.state = (State::Document1, Substate::empty()); + Ok(Some((rest, Segment::Newline))) + } + fn parse_document_3<'a>( + &mut self, + input: &'a str, + _eof: bool, + ) -> Result, Incomplete> { + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + Ok(Some((input, Segment::EndCommand))) + } + fn quoted_file_label(input: &str, eof: bool) -> Result { + let input = skip_spaces_and_comments(input, eof)?; + match take(input, eof)?.0 { + Some('\'') | Some('"') | Some('\n') => Ok(true), + _ => Ok(false), + } + } + fn parse_file_label_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let mut sub = Segmenter { + state: (State::General, self.state.1), + ..*self + }; + let (rest, segment) = sub.push_rest(input, eof)?.unwrap(); + if segment == Segment::Identifier { + let id = &input[..input.len() - rest.len()]; + debug_assert!(id_match("LABEL", id), "{id} should be LABEL"); + if Self::quoted_file_label(rest, eof)? 
{ + *self = sub; + } else { + self.state.0 = State::FileLabel2; + } + } else { + self.state.1 = sub.state.1; + } + Ok(Some((rest, segment))) + } + fn parse_file_label_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let input = skip_spaces(input, eof)?; + self.state = (State::FileLabel3, Substate::empty()); + Ok(Some((input, Segment::Spaces))) + } + fn parse_file_label_3<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let mut end_cmd = None; + loop { + let (c, rest) = take(input, eof)?; + match c { + None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => { + self.state = (State::General, Substate::empty()); + return Ok(Some((end_cmd.unwrap_or(input), Segment::UnquotedString))); + } + None => unreachable!(), + Some('.') => end_cmd = Some(input), + Some(c) if !c.is_whitespace() => end_cmd = None, + Some(_) => (), + } + input = rest; + } + } + fn subparse<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let mut sub = Segmenter { + syntax: self.syntax, + state: (State::General, self.state.1), + nest: 0, + }; + let result = sub.push_rest(input, eof)?; + self.state.1 = sub.state.1; + Ok(result) + } + /// We are segmenting a `DO REPEAT` command, currently reading the syntax + /// that defines the stand-in variables (the head) before the lines of + /// syntax to be repeated (the body). + fn parse_do_repeat_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (rest, segment) = self.subparse(input, eof)?.unwrap(); + if segment == Segment::SeparateCommands { + // We reached a blank line that separates the head from the body. + self.state.0 = State::DoRepeat2; + } else if segment == Segment::EndCommand || segment == Segment::StartCommand { + // We reached the body. 
+ self.state.0 = State::DoRepeat3; + self.nest = 1; + } + Ok(Some((rest, segment))) + } + /// We are segmenting a `DO REPEAT` command, currently reading a blank line + /// that separates the head from the body. + fn parse_do_repeat_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (rest, segment) = self.subparse(input, eof)?.unwrap(); + if segment == Segment::Newline { + // We reached the body. + self.state.0 = State::DoRepeat3; + self.nest = 1; + } + Ok(Some((rest, segment))) + } + fn parse_newline<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (Some(c), rest) = take(input, eof)? else { + return Ok(None); + }; + match c { + '\n' => Ok(Some(rest)), + '\r' => { + if let (Some('\n'), rest) = take(rest, eof)? { + Ok(Some(rest)) + } else { + Ok(None) + } + } + _ => Ok(None), + } + } + + fn parse_full_line<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<&'a str, Incomplete> { + loop { + if is_end_of_line(input, eof)? { + return Ok(input); + } + input = take(input, eof).unwrap().1; + } + } + fn check_repeat_command(&mut self, input: &str, eof: bool) -> Result { + let input = input.strip_prefix(['-', '+']).unwrap_or(input); + let (id1, input) = self.next_id_in_command(input, eof)?; + if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) { + Ok(1) + } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) + { + Ok(-1) + } else { + Ok(0) + } + } + /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that + /// are to be repeated. Report each line of syntax as a single + /// [`Type::DoRepeatCommand`]. + /// + /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT` + /// blocks inside the lines we're segmenting. `self.nest` counts the + /// nesting level, starting at 1. 
+ fn parse_do_repeat_3<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + if let Some(rest) = self.parse_newline(input, eof)? { + return Ok(Some((rest, Segment::Newline))); + } + let rest = self.parse_full_line(input, eof)?; + match self.check_repeat_command(input, eof)?.cmp(&0) { + Ordering::Greater => { + if let Some(nest) = self.nest.checked_add(1) { + self.nest = nest; + } else { + self.state.0 = State::DoRepeat4; + } + } + Ordering::Less => { + self.nest -= 1; + if self.nest == 0 { + // Nesting level dropped to 0, so we've finished reading the `DO + // REPEAT` body. + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + return self.push_rest(input, eof); + } + } + Ordering::Equal => (), + } + Ok(Some((rest, Segment::DoRepeatCommand))) + } + fn parse_do_repeat_4<'a>( + &mut self, + input: &'a str, + ) -> Result, Incomplete> { + self.state.0 = State::DoRepeat3; + Ok(Some((input, Segment::DoRepeatOverflow))) + } + /// We are segmenting a `DEFINE` command, which consists of: + /// + /// - The `DEFINE` keyword. + /// + /// - An identifier. We transform this into `Type::MacroName` instead of + /// `Type::Identifier` because this identifier must never be macro-expanded. + /// + /// - Anything but `(`. + /// + /// - `(` followed by a sequence of tokens possibly including balanced + /// parentheses up to a final `)`. + /// + /// - A sequence of any number of lines, one string per line, ending with + /// `!ENDDEFINE`. The first line is usually blank (that is, a newline + /// follows the `(`). The last line usually just has `!ENDDEFINE.` on + /// it, but it can start with other tokens. The whole + /// DEFINE...!ENDDEFINE can be on a single line, even. 
+ fn parse_define_1_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (rest, segment) = self.subparse(input, eof)?.unwrap(); + match segment { + Segment::Identifier if self.state.0 == State::Define1 => { + self.state.0 = State::Define2; + return Ok(Some((rest, Segment::MacroName))); + } + Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => { + // The DEFINE command is malformed because we reached its end + // without ever hitting a `(` token. Transition back to general + // parsing. + self.state.0 = State::General; + } + Segment::Punct if input.starts_with('(') => { + self.state.0 = State::Define3; + self.nest = 1; + } + _ => (), + } + Ok(Some((rest, segment))) + } + fn parse_define_3<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (rest, segment) = self.subparse(input, eof)?.unwrap(); + match segment { + Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => { + // The DEFINE command is malformed because we reached its end + // without ever hitting a `(` token. Transition back to general + // parsing. + self.state.0 = State::General; + } + Segment::Punct if input.starts_with('(') => { + self.nest += 1; + } + Segment::Punct if input.starts_with(')') => { + self.nest -= 1; + if self.nest == 0 { + self.state = (State::Define4, Substate::empty()); + } + } + _ => (), + } + Ok(Some((rest, segment))) + } + fn find_enddefine(mut input: &str) -> Option<&str> { + loop { + input = skip_spaces_and_comments(input, true).unwrap(); + let (Some(c), rest) = take(input, true).unwrap() else { + return None; + }; + match c { + '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => { + return Some(input) + } + '\'' | '"' => { + let index = rest.find(c)?; + input = &rest[index + 1..]; + } + _ => input = rest, + } + } + } + + /// We are in the body of a macro definition, looking for additional lines + /// of the body or `!ENDDEFINE`. 
+ /// + /// In `State::Define4`, we're parsing the first line of the macro body (the + /// same line as the closing parenthesis in the argument definition). In + /// `State::Define5`, we're on a later line. + fn parse_define_4_5<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let rest = self.parse_full_line(input, eof)?; + let line = &input[..input.len() - rest.len()]; + if let Some(end) = Self::find_enddefine(line) { + // Macro ends at the !ENDDEFINE on this line. + self.state = (State::General, Substate::empty()); + let (prefix, rest) = input.split_at(line.len() - end.len()); + if prefix.is_empty() { + // Line starts with `!ENDDEFINE`. + self.push_rest(input, eof) + } else if prefix.trim_start().is_empty() { + // Line starts with spaces followed by `!ENDDEFINE`. + Ok(Some((rest, Segment::Spaces))) + } else { + // Line starts with some content followed by `!ENDDEFINE`. + Ok(Some((rest, Segment::MacroBody))) + } + } else { + // No `!ENDDEFINE`. We have a full line of macro body. + // + // If the first line of the macro body is blank, we just report it + // as spaces, or not at all if there are no spaces, because it's not + // significant. + // + // However, if it's a later line, we need to report it because blank + // lines can have significance. 
+ let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() { + if line.is_empty() { + return self.parse_define_6(input, eof); + } + Segment::Spaces + } else { + Segment::MacroBody + }; + self.state.0 = State::Define6; + Ok(Some((rest, segment))) + } + } + fn parse_define_6<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + self.state.0 = State::Define5; + Ok(Some((rest, Segment::Newline))) + } + fn parse_begin_data_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (rest, segment) = self.subparse(input, eof)?.unwrap(); + if segment == Segment::Newline { + self.state.0 = State::BeginData2; + } + Ok(Some((rest, segment))) + } + fn parse_begin_data_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (rest, segment) = self.subparse(input, eof)?.unwrap(); + if segment == Segment::Newline { + self.state.0 = State::BeginData3; + } + Ok(Some((rest, segment))) + } + fn is_end_data(line: &str) -> bool { + let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else { + return false; + }; + let (Some(c), rest) = take(rest, true).unwrap() else { + return false; + }; + if !c.is_whitespace() { + return false; + }; + let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else { + return false; + }; + + let mut endcmd = false; + for c in rest.chars() { + match c { + '.' if endcmd => return false, + '.' 
=> endcmd = true, + c if c.is_whitespace() => (), + _ => return false, + } + } + true + } + fn parse_begin_data_3<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let rest = self.parse_full_line(input, eof)?; + let line = &input[..input.len() - rest.len()]; + if Self::is_end_data(line) { + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + self.push_rest(input, eof) + } else { + self.state.0 = State::BeginData4; + Ok(Some((rest, Segment::InlineData))) + } + } + fn parse_begin_data_4<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + self.state.0 = State::BeginData3; + Ok(Some((rest, Segment::Newline))) + } +} + +fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> { + line.get(..pattern.len()).and_then(|prefix| { + prefix + .eq_ignore_ascii_case(pattern) + .then(|| &line[pattern.len()..]) + }) +} + +#[cfg(test)] +mod test; diff --git a/rust/pspp/src/lex/segment/mod.rs b/rust/pspp/src/lex/segment/mod.rs deleted file mode 100644 index 5a568692b5..0000000000 --- a/rust/pspp/src/lex/segment/mod.rs +++ /dev/null @@ -1,1442 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . - -//! 
Low-level lexical analysis. -//! -//! PSPP divides traditional "lexical analysis" or "tokenization" into [three -//! phases](super). This module implements the low-level segmentation phase. -//! -//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label -//! (a segment type) for each byte or contiguous sequence of bytes in the input. -//! It also, in a few corner cases, outputs zero-width segments that label the -//! boundary between a pair of bytes in the input. -//! -//! Some segment types correspond directly to tokens; for example, -//! [Segment::Identifier] becomes [Token::Id] later in lexical analysis. Other -//! segments contribute to tokens but do not correspond directly; for example, -//! multiple quoted string [Segment::QuotedString] separated by -//! [Segment::Spaces] and "+" punctuators [Segment::Punct] may be combined to -//! form a single string token [Token::String]. Still other segments are -//! ignored (e.g. [Segment::Spaces]) or trigger special behavior such as error -//! messages later in tokenization (e.g. [Segment::ExpectedQuote]). -//! -//! [Token::Id]: crate::lex::token::Token::Id -//! [Token::String]: crate::lex::token::Token::String - -use std::cmp::Ordering; - -use crate::{ - identifier::{id_match, id_match_n, IdentifierChar}, - prompt::PromptStyle, -}; -use bitflags::bitflags; - -use super::command_name::{command_match, COMMAND_NAMES}; - -/// Syntax variant. -/// -/// PSPP syntax is written in one of two syntax variant which are broadly -/// defined as follows: -/// -/// - In interactive syntax, commands end with a period at the end of the line -/// or with a blank line. -/// -/// - In batch syntax, the second and subsequent lines of a command are indented -/// from the left margin. -/// -/// The segmenter can also try to automatically detect the kind of syntax in -/// use, using a heuristic that is usually correct. 
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)] -pub enum Syntax { - /// Try to interpret input correctly regardless of whether it is written - /// for interactive or batch syntax. - /// - /// This is `Syntax::default()`. - #[default] - Auto, - - /// Interactive syntax. - Interactive, - - /// Batch syntax. - Batch, -} - -/// The type of a segment. -/// -/// A [Segment] is a label for a string slice and is normally paired with one. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Segment { - /// A number. - Number, - - /// A quoted string (`'...'` or `"..."`).. - QuotedString, - - /// A hexadecimal string (`X'...'` or `X"..."`). - HexString, - - /// A Unicode string (`U'...'` or `U"..."`). - UnicodeString, - - /// An unquoted string. - /// - /// Unquoted strings appear only in a few special-case constructs, such as - /// the `FILE LABEL` command. - UnquotedString, - - /// An identifier. - Identifier, - - /// A punctuator or operator. - Punct, - - /// `#!` at the beginning of a syntax file only. - Shbang, - - /// Spaces. - Spaces, - - /// A comment (`/* ... */`). - Comment, - - /// New-line. - Newline, - - /// A comment command (`* ...` or `COMMENT ...`). - CommentCommand, - - /// In a `DO REPEAT` command, one of the lines to be repeated. - DoRepeatCommand, - - /// Indicates `DO REPEAT` nested more deeply than supported. - DoRepeatOverflow, - - /// A line of inline data inside `BEGIN DATA`...`END DATA`. - InlineData, - - /// In `!DEFINE`, an identifier for the macro being defined. - /// - /// Distinguished from [Identifier](Self::Identifier) because a `MacroName` - /// must never be macro-expanded. - MacroName, - - /// Contents of `!DEFINE`...`!ENDDEFINE`. - MacroBody, - - /// Represents the `DOCUMENT` beginning a `DOCUMENT` command. - /// - /// This token is not associated with any text: the actual `DOCUMENT` - /// keyword is part of the following [Document](Self::Document) segment. - /// This is because documents include the `DOCUMENT` keyword. 
- StartDocument, - - /// One of the lines of documents in a `DOCUMENT` command. - /// - /// The first line of a document includes the `DOCUMENT` keyword itself. - Document, - - /// A command separator. - /// - /// This segment is usually for `+`, `-`, or `.` at the beginning of a line. - StartCommand, - - /// A command separator. - /// - /// This segment is usually for a blank line. It also appears at the end of - /// a file. - SeparateCommands, - - /// A command separator. - /// - /// This segment is for `.` at the end of a line. - EndCommand, - - /// Missing quote at the end of a line. - /// - /// This segment contains a partial quoted string. It starts with a quote - /// mark (`"` or `'`, possibly preceded by `X` or `U`) but goes to the end - /// of the line without the matching end quote mark. - ExpectedQuote, - - /// Missing exponent in number. - /// - /// This segment contains a number that ends with `E` or `E+` or `E-` - /// without a following exponent. - ExpectedExponent, - - /// Unexpected character. - /// - /// The segment is a single character that isn't valid in syntax. - UnexpectedChar, -} - -bitflags! { - #[derive(Copy, Clone, Debug)] - struct Substate: u8 { - const START_OF_LINE = 1; - const START_OF_COMMAND = 2; - } -} - -/// Used by [Segmenter] to indicate that more input is needed. -#[derive(Copy, Clone, Debug)] -pub struct Incomplete; - -/// Labels syntax input with [Segment]s. -#[derive(Copy, Clone)] -pub struct Segmenter { - state: (State, Substate), - nest: u8, - syntax: Syntax, -} - -impl Segmenter { - /// Returns a segmenter with the given `syntax`. - /// - /// If `is_snippet` is false, then the segmenter will parse as if it's being - /// given a whole file. This means, for example, that it will interpret `-` - /// or `+` at the beginning of the syntax as a separator between commands - /// (since `-` or `+` at the beginning of a line has this meaning). 
- /// - /// If `is_snippet` is true, then the segmenter will parse as if it's being - /// given an isolated piece of syntax. This means that, for example, that - /// it will interpret `-` or `+` at the beginning of the syntax as an - /// operator token or (if followed by a digit) as part of a number. - pub fn new(syntax: Syntax, is_snippet: bool) -> Self { - Self { - state: if is_snippet { - (State::General, Substate::empty()) - } else { - (State::Shbang, Substate::empty()) - }, - syntax, - nest: 0, - } - } - - /// Returns the [Syntax] variant passed in to [new](Self::new). - pub fn syntax(&self) -> Syntax { - self.syntax - } - - fn start_of_line(&self) -> bool { - self.state.1.contains(Substate::START_OF_LINE) - } - - fn start_of_command(&self) -> bool { - self.state.1.contains(Substate::START_OF_COMMAND) - } - - /// Returns the style of command prompt to display to an interactive user - /// for input in the current state.. The return value is most accurate in - /// with [Syntax::Interactive] syntax and at the beginning of a line (that - /// is, if [Segmenter::push] consumed as much as possible of the input up to - /// a new-line). 
- pub fn prompt(&self) -> PromptStyle { - match self.state.0 { - State::Shbang => PromptStyle::First, - State::General => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::Comment1 | State::Comment2 => PromptStyle::Comment, - State::Document1 | State::Document2 => PromptStyle::Document, - State::Document3 => PromptStyle::First, - State::FileLabel1 => PromptStyle::Later, - State::FileLabel2 | State::FileLabel3 => PromptStyle::First, - State::DoRepeat1 | State::DoRepeat2 => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::DoRepeat3 => PromptStyle::DoRepeat, - State::DoRepeat4 => PromptStyle::DoRepeat, - State::Define1 | State::Define2 | State::Define3 => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define, - State::BeginData1 => PromptStyle::First, - State::BeginData2 => PromptStyle::Later, - State::BeginData3 | State::BeginData4 => PromptStyle::Data, - } - } - - fn push_rest<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - if input.is_empty() { - if eof { - return Ok(None); - } else { - return Err(Incomplete); - }; - } - - match self.state.0 { - State::Shbang => self.parse_shbang(input, eof), - State::General => { - if self.start_of_line() { - self.parse_start_of_line(input, eof) - } else { - self.parse_mid_line(input, eof) - } - } - State::Comment1 => self.parse_comment_1(input, eof), - State::Comment2 => self.parse_comment_2(input, eof), - State::Document1 => self.parse_document_1(input, eof), - State::Document2 => self.parse_document_2(input, eof), - State::Document3 => self.parse_document_3(input, eof), - State::FileLabel1 => self.parse_file_label_1(input, eof), - State::FileLabel2 => self.parse_file_label_2(input, eof), - State::FileLabel3 => self.parse_file_label_3(input, eof), - State::DoRepeat1 => 
self.parse_do_repeat_1(input, eof), - State::DoRepeat2 => self.parse_do_repeat_2(input, eof), - State::DoRepeat3 => self.parse_do_repeat_3(input, eof), - State::DoRepeat4 => self.parse_do_repeat_4(input), - State::Define1 => self.parse_define_1_2(input, eof), - State::Define2 => self.parse_define_1_2(input, eof), - State::Define3 => self.parse_define_3(input, eof), - State::Define4 => self.parse_define_4_5(input, eof), - State::Define5 => self.parse_define_4_5(input, eof), - State::Define6 => self.parse_define_6(input, eof), - State::BeginData1 => self.parse_begin_data_1(input, eof), - State::BeginData2 => self.parse_begin_data_2(input, eof), - State::BeginData3 => self.parse_begin_data_3(input, eof), - State::BeginData4 => self.parse_begin_data_4(input, eof), - } - } - - /// Attempts to label a prefix of the remaining input with a segment type. - /// The caller supplies a prefix of the remaining input as `input`. If - /// `eof` is true, then `input` is the entire (remainder) of the input; if - /// `eof` is false, then further input is potentially available. - /// - /// The input may contain `\n` or `\r\n` line ends in any combination. - /// - /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes - /// in the segment at the beginning of `input` (a number in - /// `0..=input.len()`) and the type of that segment. The next call should - /// not include those bytes in `input`, because the segmenter has - /// (figuratively) consumed them. - /// - /// Segments can have zero length, including segment types - /// [Segment::SeparateCommands], [Segment::StartDocument], - /// [Segment::InlineData], and [Segment::Spaces]. - /// - /// Failure occurs only if the segment type of the bytes in `input` cannot - /// yet be determined. In this case, this function returns - /// `Err(Incomplete)`. If more input is available, the caller should obtain - /// some more, then call again with a longer `input`. 
If this is still not - /// enough, the process might need to repeat again and again. If input is - /// exhausted, then the caller may call again setting `eof` to true. This - /// function will never return `Err(Incomplete)` when `eof` is true. - /// - /// The caller must not, in a sequence of calls, supply contradictory input. - /// That is, bytes provided as part of `input` in one call, but not - /// consumed, must not be provided with *different* values on subsequent - /// calls. This is because the function must often make decisions based on - /// looking ahead beyond the bytes that it consumes. - pub fn push(&mut self, input: &str, eof: bool) -> Result, Incomplete> { - Ok(self - .push_rest(input, eof)? - .map(|(rest, seg_type)| (input.len() - rest.len(), seg_type))) - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -enum State { - Shbang, - General, - Comment1, - Comment2, - Document1, - Document2, - Document3, - FileLabel1, - FileLabel2, - FileLabel3, - DoRepeat1, - DoRepeat2, - DoRepeat3, - DoRepeat4, - Define1, - Define2, - Define3, - Define4, - Define5, - Define6, - BeginData1, - BeginData2, - BeginData3, - BeginData4, -} - -fn take(input: &str, eof: bool) -> Result<(Option, &str), Incomplete> { - let mut iter = input.chars(); - match iter.next() { - None if !eof => Err(Incomplete), - c => Ok((c, iter.as_str())), - } -} - -fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - return Ok(input); - }; - match c { - '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input), - '*' => { - if let (Some('/'), rest) = take(rest, eof)? 
{ - return Ok(rest); - } - } - _ => (), - }; - input = rest; - } -} - -fn skip_matching(f: F, input: &str, eof: bool) -> Result<&str, Incomplete> -where - F: Fn(char) -> bool, -{ - let input = input.trim_start_matches(f); - if input.is_empty() && !eof { - Err(Incomplete) - } else { - Ok(input) - } -} - -fn match_char(f: F, input: &str, eof: bool) -> Result, Incomplete> -where - F: Fn(char) -> bool, -{ - if let (Some(c), rest) = take(input, eof)? { - if f(c) { - return Ok(Some(rest)); - } - } - Ok(None) -} - -fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - return Ok(input); - }; - match c { - '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), - c if c.is_whitespace() => (), - _ => return Ok(input), - } - input = rest; - } -} - -fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> { - skip_matching(|c| c.is_ascii_digit(), input, eof) -} - -fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - return Ok(input); - }; - match c { - '/' => { - let (c, rest2) = take(rest, eof)?; - match c { - Some('*') => input = skip_comment(rest2, eof)?, - Some(_) | None => return Ok(rest), - } - } - '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), - c if c.is_whitespace() => input = rest, - _ => return Ok(input), - }; - } -} - -fn is_start_of_string(input: &str, eof: bool) -> Result { - let (Some(c), rest) = take(input, eof)? else { - return Ok(false); - }; - match c { - 'x' | 'X' | 'u' | 'U' => { - let (c, _rest) = take(rest, eof)?; - Ok(c == Some('\'') || c == Some('"')) - } - '\'' | '"' => Ok(true), - '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true), - _ => Ok(false), - } -} - -fn is_end_of_line(input: &str, eof: bool) -> Result { - let (Some(c), rest) = take(input, eof)? 
else { - return Ok(true); - }; - Ok(match c { - '\n' => true, - '\r' => take(rest, eof)?.0 == Some('\n'), - _ => false, - }) -} - -fn at_end_of_line(input: &str, eof: bool) -> Result { - is_end_of_line(skip_spaces_and_comments(input, eof)?, eof) -} - -fn first(s: &str) -> char { - s.chars().next().unwrap() -} -fn get_command_name_candidates(target: &str) -> &[&'static str] { - if target.is_empty() { - return &[]; - } - let target_first = first(target).to_ascii_uppercase(); - let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first); - let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first); - &COMMAND_NAMES[low..high] -} - -fn detect_command_name(input: &str, eof: bool) -> Result { - let command_name = input - .split(|c: char| { - !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-') - }) - .next() - .unwrap(); - if !eof && command_name.len() == input.len() { - return Err(Incomplete); - } - let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.'); - for command in get_command_name_candidates(command_name) { - if let Some(m) = command_match(command, command_name) { - if m.missing_words <= 0 { - return Ok(true); - } - } - } - Ok(false) -} - -impl Segmenter { - fn parse_shbang<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - if let (Some('#'), rest) = take(input, eof)? { - if let (Some('!'), rest) = take(rest, eof)? 
{ - let rest = self.parse_full_line(rest, eof)?; - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok(Some((rest, Segment::Shbang))); - } - } - - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - self.push_rest(input, eof) - } - fn at_command_start(&self, input: &str, eof: bool) -> Result { - match self.syntax { - Syntax::Auto => detect_command_name(input, eof), - Syntax::Interactive => Ok(false), - Syntax::Batch => Ok(true), - } - } - fn parse_start_of_line<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - debug_assert_eq!(self.state.0, State::General); - debug_assert!(self.start_of_line()); - debug_assert!(!input.is_empty()); - - let (Some(c), rest) = take(input, eof).unwrap() else { - unreachable!() - }; - match c { - '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => { - // This `+` is punctuation that may separate pieces of a string. - self.state = (State::General, Substate::empty()); - return Ok(Some((rest, Segment::Punct))); - } - '+' | '-' | '.' => { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok(Some((rest, Segment::StartCommand))); - } - _ if c.is_whitespace() => { - if at_end_of_line(input, eof)? { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok(Some((input, Segment::SeparateCommands))); - } - } - _ => { - if self.at_command_start(input, eof)? - && !self.state.1.contains(Substate::START_OF_COMMAND) - { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok(Some((input, Segment::StartCommand))); - } - } - } - self.state.1 = Substate::START_OF_COMMAND; - self.parse_mid_line(input, eof) - } - fn parse_mid_line<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - debug_assert!(self.state.0 == State::General); - debug_assert!(!self.state.1.contains(Substate::START_OF_LINE)); - let (Some(c), rest) = take(input, eof)? 
else { - unreachable!() - }; - match c { - '\r' | '\n' if is_end_of_line(input, eof)? => { - self.state.1 |= Substate::START_OF_LINE; - Ok(Some(( - self.parse_newline(input, eof).unwrap().unwrap(), - Segment::Newline, - ))) - } - '/' => { - if let (Some('*'), rest) = take(rest, eof)? { - let rest = skip_comment(rest, eof)?; - Ok(Some((rest, Segment::Comment))) - } else { - self.state.1 = Substate::empty(); - Ok(Some((rest, Segment::Punct))) - } - } - '-' => { - let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?; - match c { - Some(c) if c.is_ascii_digit() => { - return self.parse_number(rest, eof); - } - Some('.') => { - if let (Some(c), _rest) = take(rest2, eof)? { - if c.is_ascii_digit() { - return self.parse_number(rest, eof); - } - } - } - None | Some(_) => (), - } - self.state.1 = Substate::empty(); - Ok(Some((rest, Segment::Punct))) - } - '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => { - self.state.1 = Substate::empty(); - Ok(Some((rest, Segment::Punct))) - } - '*' => { - if self.state.1.contains(Substate::START_OF_COMMAND) { - self.state = (State::Comment1, Substate::empty()); - self.parse_comment_1(input, eof) - } else { - self.parse_digraph(&['*'], rest, eof) - } - } - '<' => self.parse_digraph(&['=', '>'], rest, eof), - '>' => self.parse_digraph(&['='], rest, eof), - '~' => self.parse_digraph(&['='], rest, eof), - '.' if at_end_of_line(rest, eof)? => { - self.state.1 = Substate::START_OF_COMMAND; - Ok(Some((rest, Segment::EndCommand))) - } - '.' => match take(rest, eof)? { - (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof), - _ => Ok(Some((rest, Segment::Punct))), - }, - '0'..='9' => self.parse_number(input, eof), - 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof), - 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof), - '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof), - '!' 
=> { - let (c, rest2) = take(rest, eof)?; - match c { - Some('*') => Ok(Some((rest2, Segment::Punct))), - Some(_) => self.parse_id(input, eof), - None => Ok(Some((rest, Segment::Punct))), - } - } - c if c.is_whitespace() => Ok(Some((skip_spaces(rest, eof)?, Segment::Spaces))), - c if c.may_start_id() => self.parse_id(input, eof), - '#'..='~' if c != '\\' && c != '^' => { - self.state.1 = Substate::empty(); - Ok(Some((rest, Segment::Punct))) - } - _ => { - self.state.1 = Substate::empty(); - Ok(Some((rest, Segment::UnexpectedChar))) - } - } - } - fn parse_string<'a>( - &mut self, - segment: Segment, - quote: char, - mut input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - while let (Some(c), rest) = take(input, eof)? { - match c { - _ if c == quote => { - let (c, rest2) = take(rest, eof)?; - if c != Some(quote) { - self.state.1 = Substate::empty(); - return Ok(Some((rest, segment))); - } - input = rest2; - } - '\r' | '\n' if is_end_of_line(input, eof)? => break, - _ => input = rest, - } - } - self.state.1 = Substate::empty(); - Ok(Some((input, Segment::ExpectedQuote))) - } - fn maybe_parse_string<'a>( - &mut self, - segment: Segment, - input: (&'a str, &'a str), - eof: bool, - ) -> Result, Incomplete> { - match take(input.1, eof)? { - (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof), - _ => self.parse_id(input.0, eof), - } - } - fn next_id_in_command<'a>( - &self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, &'a str), Incomplete> { - let mut sub = Segmenter::new(self.syntax, true); - loop { - let Some((seg_len, seg_type)) = sub.push(input, eof)? 
else { - return Ok((input, input)); - }; - let (segment, rest) = input.split_at(seg_len); - match seg_type { - Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (), - - Segment::Identifier => return Ok((segment, rest)), - - Segment::Number - | Segment::QuotedString - | Segment::HexString - | Segment::UnicodeString - | Segment::UnquotedString - | Segment::Punct - | Segment::CommentCommand - | Segment::DoRepeatCommand - | Segment::DoRepeatOverflow - | Segment::InlineData - | Segment::MacroName - | Segment::MacroBody - | Segment::StartDocument - | Segment::Document - | Segment::StartCommand - | Segment::SeparateCommands - | Segment::EndCommand - | Segment::ExpectedQuote - | Segment::ExpectedExponent - | Segment::UnexpectedChar => return Ok(("", rest)), - } - input = rest; - } - } - fn parse_id<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (Some(_), mut end) = take(input, eof).unwrap() else { - unreachable!() - }; - while let (Some(c), rest) = take(end, eof)? { - if !c.may_continue_id() { - break; - }; - end = rest; - } - let identifier = &input[..input.len() - end.len()]; - let identifier = match identifier.strip_suffix('.') { - Some(without_dot) if at_end_of_line(end, eof)? 
=> without_dot, - _ => identifier, - }; - let rest = &input[identifier.len()..]; - - if self.state.1.contains(Substate::START_OF_COMMAND) { - if id_match_n("COMMENT", identifier, 4) { - self.state = (State::Comment1, Substate::empty()); - return self.parse_comment_1(input, eof); - } else if id_match("DOCUMENT", identifier) { - self.state = (State::Document1, Substate::empty()); - return Ok(Some((input, Segment::StartDocument))); - } else if id_match_n("DEFINE", identifier, 6) { - self.state = (State::Define1, Substate::empty()); - } else if id_match("FILE", identifier) { - if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) { - self.state = (State::FileLabel1, Substate::empty()); - return Ok(Some((rest, Segment::Identifier))); - } - } else if id_match("DO", identifier) { - if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) { - self.state = (State::DoRepeat1, Substate::empty()); - return Ok(Some((rest, Segment::Identifier))); - } - } else if id_match("BEGIN", identifier) { - let (next_id, rest2) = self.next_id_in_command(rest, eof)?; - if id_match("DATA", next_id) { - let rest2 = skip_spaces_and_comments(rest2, eof)?; - let rest2 = if let Some(s) = rest2.strip_prefix('.') { - skip_spaces_and_comments(s, eof)? - } else { - rest2 - }; - if is_end_of_line(rest2, eof)? { - let s = &input[..input.len() - rest2.len()]; - self.state = ( - if s.contains('\n') { - State::BeginData1 - } else { - State::BeginData2 - }, - Substate::empty(), - ); - return Ok(Some((rest, Segment::Identifier))); - } - } - } - } - - self.state.1 = Substate::empty(); - Ok(Some(( - rest, - if identifier != "!" 
{ - Segment::Identifier - } else { - Segment::Punct - }, - ))) - } - fn parse_digraph<'a>( - &mut self, - seconds: &[char], - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (c, rest) = take(input, eof)?; - self.state.1 = Substate::empty(); - Ok(Some(( - match c { - Some(c) if seconds.contains(&c) => rest, - _ => input, - }, - Segment::Punct, - ))) - } - fn parse_number<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let mut input = skip_digits(input, eof)?; - if let Some(rest) = match_char(|c| c == '.', input, eof)? { - let rest2 = skip_digits(rest, eof)?; - if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? { - input = rest2; - } - }; - if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? { - let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest); - let rest2 = skip_digits(rest, eof)?; - if rest2.len() == rest.len() { - self.state.1 = Substate::empty(); - return Ok(Some((rest, Segment::ExpectedExponent))); - } - input = rest2; - } - self.state.1 = Substate::empty(); - Ok(Some((input, Segment::Number))) - } - fn parse_comment_1<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - enum CommentState<'a> { - Blank, - NotBlank, - Period(&'a str), - } - let mut state = CommentState::Blank; - loop { - let (Some(c), rest) = take(input, eof)? else { - // End of file. - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok(Some((input, Segment::SeparateCommands))); - }; - match c { - '.' => state = CommentState::Period(input), - '\n' | '\r' if is_end_of_line(input, eof)? => { - match state { - CommentState::Blank => { - // Blank line ends comment command. - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok(Some((input, Segment::SeparateCommands))); - } - CommentState::Period(period) => { - // '.' at end of line ends comment command. 
- self.state = (State::General, Substate::empty()); - return Ok(Some((period, Segment::CommentCommand))); - } - CommentState::NotBlank => { - // Comment continues onto next line. - self.state = (State::Comment2, Substate::empty()); - return Ok(Some((input, Segment::CommentCommand))); - } - } - } - c if c.is_whitespace() => (), - _ => state = CommentState::NotBlank, - } - input = rest; - } - } - fn parse_comment_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - - let new_command = match take(rest, eof)?.0 { - Some('+') | Some('-') | Some('.') => true, - Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?, - None | Some(_) => false, - }; - if new_command { - self.state = ( - State::General, - Substate::START_OF_LINE | Substate::START_OF_COMMAND, - ); - } else { - self.state = (State::Comment1, Substate::empty()); - } - Ok(Some((rest, Segment::Newline))) - } - fn parse_document_1<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let mut end_cmd = false; - loop { - let (Some(c), rest) = take(input, eof)? else { - self.state = (State::Document3, Substate::empty()); - return Ok(Some((input, Segment::Document))); - }; - match c { - '.' => end_cmd = true, - '\n' | '\r' if is_end_of_line(input, eof)? 
=> { - self.state.0 = if end_cmd { - State::Document3 - } else { - State::Document2 - }; - return Ok(Some((input, Segment::Document))); - } - c if !c.is_whitespace() => end_cmd = false, - _ => (), - } - input = rest; - } - } - fn parse_document_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state = (State::Document1, Substate::empty()); - Ok(Some((rest, Segment::Newline))) - } - fn parse_document_3<'a>( - &mut self, - input: &'a str, - _eof: bool, - ) -> Result, Incomplete> { - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - Ok(Some((input, Segment::EndCommand))) - } - fn quoted_file_label(input: &str, eof: bool) -> Result { - let input = skip_spaces_and_comments(input, eof)?; - match take(input, eof)?.0 { - Some('\'') | Some('"') | Some('\n') => Ok(true), - _ => Ok(false), - } - } - fn parse_file_label_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let mut sub = Segmenter { - state: (State::General, self.state.1), - ..*self - }; - let (rest, segment) = sub.push_rest(input, eof)?.unwrap(); - if segment == Segment::Identifier { - let id = &input[..input.len() - rest.len()]; - debug_assert!(id_match("LABEL", id), "{id} should be LABEL"); - if Self::quoted_file_label(rest, eof)? 
{ - *self = sub; - } else { - self.state.0 = State::FileLabel2; - } - } else { - self.state.1 = sub.state.1; - } - Ok(Some((rest, segment))) - } - fn parse_file_label_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let input = skip_spaces(input, eof)?; - self.state = (State::FileLabel3, Substate::empty()); - Ok(Some((input, Segment::Spaces))) - } - fn parse_file_label_3<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let mut end_cmd = None; - loop { - let (c, rest) = take(input, eof)?; - match c { - None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => { - self.state = (State::General, Substate::empty()); - return Ok(Some((end_cmd.unwrap_or(input), Segment::UnquotedString))); - } - None => unreachable!(), - Some('.') => end_cmd = Some(input), - Some(c) if !c.is_whitespace() => end_cmd = None, - Some(_) => (), - } - input = rest; - } - } - fn subparse<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let mut sub = Segmenter { - syntax: self.syntax, - state: (State::General, self.state.1), - nest: 0, - }; - let result = sub.push_rest(input, eof)?; - self.state.1 = sub.state.1; - Ok(result) - } - /// We are segmenting a `DO REPEAT` command, currently reading the syntax - /// that defines the stand-in variables (the head) before the lines of - /// syntax to be repeated (the body). - fn parse_do_repeat_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (rest, segment) = self.subparse(input, eof)?.unwrap(); - if segment == Segment::SeparateCommands { - // We reached a blank line that separates the head from the body. - self.state.0 = State::DoRepeat2; - } else if segment == Segment::EndCommand || segment == Segment::StartCommand { - // We reached the body. 
- self.state.0 = State::DoRepeat3; - self.nest = 1; - } - Ok(Some((rest, segment))) - } - /// We are segmenting a `DO REPEAT` command, currently reading a blank line - /// that separates the head from the body. - fn parse_do_repeat_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (rest, segment) = self.subparse(input, eof)?.unwrap(); - if segment == Segment::Newline { - // We reached the body. - self.state.0 = State::DoRepeat3; - self.nest = 1; - } - Ok(Some((rest, segment))) - } - fn parse_newline<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (Some(c), rest) = take(input, eof)? else { - return Ok(None); - }; - match c { - '\n' => Ok(Some(rest)), - '\r' => { - if let (Some('\n'), rest) = take(rest, eof)? { - Ok(Some(rest)) - } else { - Ok(None) - } - } - _ => Ok(None), - } - } - - fn parse_full_line<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<&'a str, Incomplete> { - loop { - if is_end_of_line(input, eof)? { - return Ok(input); - } - input = take(input, eof).unwrap().1; - } - } - fn check_repeat_command(&mut self, input: &str, eof: bool) -> Result { - let input = input.strip_prefix(['-', '+']).unwrap_or(input); - let (id1, input) = self.next_id_in_command(input, eof)?; - if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) { - Ok(1) - } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) - { - Ok(-1) - } else { - Ok(0) - } - } - /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that - /// are to be repeated. Report each line of syntax as a single - /// [`Type::DoRepeatCommand`]. - /// - /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT` - /// blocks inside the lines we're segmenting. `self.nest` counts the - /// nesting level, starting at 1. 
- fn parse_do_repeat_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - if let Some(rest) = self.parse_newline(input, eof)? { - return Ok(Some((rest, Segment::Newline))); - } - let rest = self.parse_full_line(input, eof)?; - match self.check_repeat_command(input, eof)?.cmp(&0) { - Ordering::Greater => { - if let Some(nest) = self.nest.checked_add(1) { - self.nest = nest; - } else { - self.state.0 = State::DoRepeat4; - } - } - Ordering::Less => { - self.nest -= 1; - if self.nest == 0 { - // Nesting level dropped to 0, so we've finished reading the `DO - // REPEAT` body. - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - return self.push_rest(input, eof); - } - } - Ordering::Equal => (), - } - Ok(Some((rest, Segment::DoRepeatCommand))) - } - fn parse_do_repeat_4<'a>( - &mut self, - input: &'a str, - ) -> Result, Incomplete> { - self.state.0 = State::DoRepeat3; - Ok(Some((input, Segment::DoRepeatOverflow))) - } - /// We are segmenting a `DEFINE` command, which consists of: - /// - /// - The `DEFINE` keyword. - /// - /// - An identifier. We transform this into `Type::MacroName` instead of - /// `Type::Identifier` because this identifier must never be macro-expanded. - /// - /// - Anything but `(`. - /// - /// - `(` followed by a sequence of tokens possibly including balanced - /// parentheses up to a final `)`. - /// - /// - A sequence of any number of lines, one string per line, ending with - /// `!ENDDEFINE`. The first line is usually blank (that is, a newline - /// follows the `(`). The last line usually just has `!ENDDEFINE.` on - /// it, but it can start with other tokens. The whole - /// DEFINE...!ENDDEFINE can be on a single line, even. 
- fn parse_define_1_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (rest, segment) = self.subparse(input, eof)?.unwrap(); - match segment { - Segment::Identifier if self.state.0 == State::Define1 => { - self.state.0 = State::Define2; - return Ok(Some((rest, Segment::MacroName))); - } - Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => { - // The DEFINE command is malformed because we reached its end - // without ever hitting a `(` token. Transition back to general - // parsing. - self.state.0 = State::General; - } - Segment::Punct if input.starts_with('(') => { - self.state.0 = State::Define3; - self.nest = 1; - } - _ => (), - } - Ok(Some((rest, segment))) - } - fn parse_define_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (rest, segment) = self.subparse(input, eof)?.unwrap(); - match segment { - Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => { - // The DEFINE command is malformed because we reached its end - // without ever hitting a `(` token. Transition back to general - // parsing. - self.state.0 = State::General; - } - Segment::Punct if input.starts_with('(') => { - self.nest += 1; - } - Segment::Punct if input.starts_with(')') => { - self.nest -= 1; - if self.nest == 0 { - self.state = (State::Define4, Substate::empty()); - } - } - _ => (), - } - Ok(Some((rest, segment))) - } - fn find_enddefine(mut input: &str) -> Option<&str> { - loop { - input = skip_spaces_and_comments(input, true).unwrap(); - let (Some(c), rest) = take(input, true).unwrap() else { - return None; - }; - match c { - '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => { - return Some(input) - } - '\'' | '"' => { - let index = rest.find(c)?; - input = &rest[index + 1..]; - } - _ => input = rest, - } - } - } - - /// We are in the body of a macro definition, looking for additional lines - /// of the body or `!ENDDEFINE`. 
- /// - /// In `State::Define4`, we're parsing the first line of the macro body (the - /// same line as the closing parenthesis in the argument definition). In - /// `State::Define5`, we're on a later line. - fn parse_define_4_5<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let rest = self.parse_full_line(input, eof)?; - let line = &input[..input.len() - rest.len()]; - if let Some(end) = Self::find_enddefine(line) { - // Macro ends at the !ENDDEFINE on this line. - self.state = (State::General, Substate::empty()); - let (prefix, rest) = input.split_at(line.len() - end.len()); - if prefix.is_empty() { - // Line starts with `!ENDDEFINE`. - self.push_rest(input, eof) - } else if prefix.trim_start().is_empty() { - // Line starts with spaces followed by `!ENDDEFINE`. - Ok(Some((rest, Segment::Spaces))) - } else { - // Line starts with some content followed by `!ENDDEFINE`. - Ok(Some((rest, Segment::MacroBody))) - } - } else { - // No `!ENDDEFINE`. We have a full line of macro body. - // - // If the first line of the macro body is blank, we just report it - // as spaces, or not at all if there are no spaces, because it's not - // significant. - // - // However, if it's a later line, we need to report it because blank - // lines can have significance. 
- let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() { - if line.is_empty() { - return self.parse_define_6(input, eof); - } - Segment::Spaces - } else { - Segment::MacroBody - }; - self.state.0 = State::Define6; - Ok(Some((rest, segment))) - } - } - fn parse_define_6<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state.0 = State::Define5; - Ok(Some((rest, Segment::Newline))) - } - fn parse_begin_data_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (rest, segment) = self.subparse(input, eof)?.unwrap(); - if segment == Segment::Newline { - self.state.0 = State::BeginData2; - } - Ok(Some((rest, segment))) - } - fn parse_begin_data_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (rest, segment) = self.subparse(input, eof)?.unwrap(); - if segment == Segment::Newline { - self.state.0 = State::BeginData3; - } - Ok(Some((rest, segment))) - } - fn is_end_data(line: &str) -> bool { - let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else { - return false; - }; - let (Some(c), rest) = take(rest, true).unwrap() else { - return false; - }; - if !c.is_whitespace() { - return false; - }; - let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else { - return false; - }; - - let mut endcmd = false; - for c in rest.chars() { - match c { - '.' if endcmd => return false, - '.' 
=> endcmd = true, - c if c.is_whitespace() => (), - _ => return false, - } - } - true - } - fn parse_begin_data_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let rest = self.parse_full_line(input, eof)?; - let line = &input[..input.len() - rest.len()]; - if Self::is_end_data(line) { - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - self.push_rest(input, eof) - } else { - self.state.0 = State::BeginData4; - Ok(Some((rest, Segment::InlineData))) - } - } - fn parse_begin_data_4<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state.0 = State::BeginData3; - Ok(Some((rest, Segment::Newline))) - } -} - -fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> { - line.get(..pattern.len()).and_then(|prefix| { - prefix - .eq_ignore_ascii_case(pattern) - .then(|| &line[pattern.len()..]) - }) -} - -#[cfg(test)] -mod test; diff --git a/rust/pspp/src/output.rs b/rust/pspp/src/output.rs new file mode 100644 index 0000000000..c1e061ed9b --- /dev/null +++ b/rust/pspp/src/output.rs @@ -0,0 +1,317 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . 
+ +#![allow(dead_code)] +use std::{ + borrow::Cow, + sync::{Arc, OnceLock}, +}; + +use enum_map::EnumMap; +use pivot::PivotTable; +use serde::Serialize; + +use crate::{ + message::Diagnostic, + output::pivot::{Axis3, BorderStyle, Dimension, Group, Look}, +}; + +use self::pivot::Value; + +pub mod cairo; +pub mod csv; +pub mod driver; +pub mod html; +pub mod json; +pub mod page; +pub mod pivot; +pub mod render; +pub mod spv; +pub mod table; +pub mod text; +pub mod text_line; + +/// A single output item. +#[derive(Serialize)] +pub struct Item { + /// The localized label for the item that appears in the outline pane in the + /// output viewer and in PDF outlines. This is `None` if no label has been + /// explicitly set. + label: Option, + + /// A locale-invariant identifier for the command that produced the output, + /// which may be `None` if unknown or if a command did not produce this + /// output. + command_name: Option, + + /// For a group item, this is true if the group's subtree should + /// be expanded in an outline view, false otherwise. + /// + /// For other kinds of output items, this is true to show the item's + /// content, false to hide it. The item's label is always shown in an + /// outline view. + show: bool, + + /// Item details. + details: Details, +} + +impl Item { + pub fn new(details: impl Into
) -> Self { + let details = details.into(); + Self { + label: None, + command_name: details.command_name().cloned(), + show: true, + details, + } + } + + pub fn label(&self) -> Cow<'static, str> { + match &self.label { + Some(label) => Cow::from(label.clone()), + None => self.details.label(), + } + } +} + +impl From for Item +where + T: Into
, +{ + fn from(value: T) -> Self { + Self::new(value) + } +} + +#[derive(Serialize)] +pub enum Details { + Chart, + Image, + Group(Vec>), + Message(Box), + PageBreak, + Table(Box), + Text(Box), +} + +impl Details { + pub fn as_group(&self) -> Option<&[Arc]> { + match self { + Self::Group(children) => Some(children.as_slice()), + _ => None, + } + } + + pub fn command_name(&self) -> Option<&String> { + match self { + Details::Chart + | Details::Image + | Details::Group(_) + | Details::Message(_) + | Details::PageBreak + | Details::Text(_) => None, + Details::Table(pivot_table) => pivot_table.command_c.as_ref(), + } + } + + pub fn label(&self) -> Cow<'static, str> { + match self { + Details::Chart => todo!(), + Details::Image => todo!(), + Details::Group(_) => Cow::from("Group"), + Details::Message(diagnostic) => Cow::from(diagnostic.severity.as_title_str()), + Details::PageBreak => Cow::from("Page Break"), + Details::Table(pivot_table) => Cow::from(pivot_table.label()), + Details::Text(text) => Cow::from(text.type_.as_str()), + } + } + + pub fn is_page_break(&self) -> bool { + matches!(self, Self::PageBreak) + } +} + +impl FromIterator for Details +where + A: Into>, +{ + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + Self::Group(iter.into_iter().map(|value| value.into()).collect()) + } +} + +impl From for Details { + fn from(value: Diagnostic) -> Self { + Self::Message(Box::new(value)) + } +} + +impl From> for Details { + fn from(value: Box) -> Self { + Self::Message(value) + } +} + +impl From for Details { + fn from(value: PivotTable) -> Self { + Self::Table(Box::new(value)) + } +} + +impl From> for Details { + fn from(value: Box) -> Self { + Self::Table(value) + } +} + +impl From for Details { + fn from(value: Text) -> Self { + Self::Text(Box::new(value)) + } +} + +impl From> for Details { + fn from(value: Box) -> Self { + Self::Text(value) + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct Text { + type_: TextType, + + content: Value, +} + 
+impl Text { + pub fn new_log(value: impl Into) -> Self { + Self { + type_: TextType::Log, + content: value.into(), + } + } +} + +fn text_item_table_look() -> Arc { + static LOOK: OnceLock> = OnceLock::new(); + LOOK.get_or_init(|| { + Arc::new({ + let mut look = Look::default().with_borders(EnumMap::from_fn(|_| BorderStyle::none())); + for style in look.areas.values_mut() { + style.cell_style.margins = EnumMap::from_fn(|_| [0, 0]); + } + look + }) + }) + .clone() +} + +impl From for PivotTable { + fn from(value: Text) -> Self { + let dimension = + Dimension::new(Group::new(Value::new_text("Text")).with(Value::new_user_text("null"))) + .with_all_labels_hidden(); + PivotTable::new([(Axis3::Y, dimension)]) + .with_look(text_item_table_look()) + .with_data([(&[0], value.content)]) + .with_subtype(Value::new_user_text("Text")) + } +} + +impl From<&Diagnostic> for Text { + fn from(value: &Diagnostic) -> Self { + Self::new_log(value.to_string()) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum TextType { + /// `TITLE` and `SUBTITLE` commands. + PageTitle, + + /// Title, + Title, + + /// Syntax printback logging. + Syntax, + + /// Other logging. 
+ Log, +} + +impl TextType { + pub fn as_str(&self) -> &'static str { + match self { + TextType::PageTitle => "Page Title", + TextType::Title => "Title", + TextType::Syntax => "Log", + TextType::Log => "Log", + } + } + + pub fn as_xml_str(&self) -> &'static str { + match self { + TextType::PageTitle => "page-title", + TextType::Title => "title", + TextType::Syntax | TextType::Log => "log", + } + } +} + +pub struct ItemCursor { + cur: Option>, + stack: Vec<(Arc, usize)>, +} + +impl ItemCursor { + pub fn new(start: Arc) -> Self { + Self { + cur: Some(start), + stack: Vec::new(), + } + } + + pub fn cur(&self) -> Option<&Arc> { + self.cur.as_ref() + } + + pub fn next(&mut self) { + let Some(cur) = self.cur.take() else { + return; + }; + match cur.details { + Details::Group(ref children) if !children.is_empty() => { + self.cur = Some(children[0].clone()); + self.stack.push((cur, 1)); + } + _ => { + while let Some((item, index)) = self.stack.pop() { + let children = item.details.as_group().unwrap(); + if index < children.len() { + self.cur = Some(children[index].clone()); + self.stack.push((item, index + 1)); + return; + } + } + } + } + } +} diff --git a/rust/pspp/src/output/cairo.rs b/rust/pspp/src/output/cairo.rs new file mode 100644 index 0000000000..0d6782f142 --- /dev/null +++ b/rust/pspp/src/output/cairo.rs @@ -0,0 +1,52 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +use pango::SCALE; + +use crate::output::pivot::HorzAlign; + +mod driver; +pub mod fsm; +pub mod pager; + +pub use driver::{CairoConfig, CairoDriver}; + +/// Conversion from 1/96" units ("pixels") to Cairo/Pango units. +fn px_to_xr(x: usize) -> usize { + x * 3 * (SCALE as usize * 72 / 96) / 3 +} + +fn xr_to_pt(x: usize) -> f64 { + x as f64 / SCALE as f64 +} + +fn horz_align_to_pango(horz_align: HorzAlign) -> pango::Alignment { + match horz_align { + HorzAlign::Right | HorzAlign::Decimal { .. } => pango::Alignment::Right, + HorzAlign::Left => pango::Alignment::Left, + HorzAlign::Center => pango::Alignment::Center, + } +} + +#[cfg(test)] +mod test { + use crate::output::cairo::{CairoConfig, CairoDriver}; + + #[test] + fn create() { + CairoDriver::new(&CairoConfig::new("test.pdf")).unwrap(); + } +} diff --git a/rust/pspp/src/output/cairo/mod.rs b/rust/pspp/src/output/cairo/mod.rs deleted file mode 100644 index 0d6782f142..0000000000 --- a/rust/pspp/src/output/cairo/mod.rs +++ /dev/null @@ -1,52 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . 
- -use pango::SCALE; - -use crate::output::pivot::HorzAlign; - -mod driver; -pub mod fsm; -pub mod pager; - -pub use driver::{CairoConfig, CairoDriver}; - -/// Conversion from 1/96" units ("pixels") to Cairo/Pango units. -fn px_to_xr(x: usize) -> usize { - x * 3 * (SCALE as usize * 72 / 96) / 3 -} - -fn xr_to_pt(x: usize) -> f64 { - x as f64 / SCALE as f64 -} - -fn horz_align_to_pango(horz_align: HorzAlign) -> pango::Alignment { - match horz_align { - HorzAlign::Right | HorzAlign::Decimal { .. } => pango::Alignment::Right, - HorzAlign::Left => pango::Alignment::Left, - HorzAlign::Center => pango::Alignment::Center, - } -} - -#[cfg(test)] -mod test { - use crate::output::cairo::{CairoConfig, CairoDriver}; - - #[test] - fn create() { - CairoDriver::new(&CairoConfig::new("test.pdf")).unwrap(); - } -} diff --git a/rust/pspp/src/output/mod.rs b/rust/pspp/src/output/mod.rs deleted file mode 100644 index c1e061ed9b..0000000000 --- a/rust/pspp/src/output/mod.rs +++ /dev/null @@ -1,317 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . 
- -#![allow(dead_code)] -use std::{ - borrow::Cow, - sync::{Arc, OnceLock}, -}; - -use enum_map::EnumMap; -use pivot::PivotTable; -use serde::Serialize; - -use crate::{ - message::Diagnostic, - output::pivot::{Axis3, BorderStyle, Dimension, Group, Look}, -}; - -use self::pivot::Value; - -pub mod cairo; -pub mod csv; -pub mod driver; -pub mod html; -pub mod json; -pub mod page; -pub mod pivot; -pub mod render; -pub mod spv; -pub mod table; -pub mod text; -pub mod text_line; - -/// A single output item. -#[derive(Serialize)] -pub struct Item { - /// The localized label for the item that appears in the outline pane in the - /// output viewer and in PDF outlines. This is `None` if no label has been - /// explicitly set. - label: Option, - - /// A locale-invariant identifier for the command that produced the output, - /// which may be `None` if unknown or if a command did not produce this - /// output. - command_name: Option, - - /// For a group item, this is true if the group's subtree should - /// be expanded in an outline view, false otherwise. - /// - /// For other kinds of output items, this is true to show the item's - /// content, false to hide it. The item's label is always shown in an - /// outline view. - show: bool, - - /// Item details. - details: Details, -} - -impl Item { - pub fn new(details: impl Into
) -> Self { - let details = details.into(); - Self { - label: None, - command_name: details.command_name().cloned(), - show: true, - details, - } - } - - pub fn label(&self) -> Cow<'static, str> { - match &self.label { - Some(label) => Cow::from(label.clone()), - None => self.details.label(), - } - } -} - -impl From for Item -where - T: Into
, -{ - fn from(value: T) -> Self { - Self::new(value) - } -} - -#[derive(Serialize)] -pub enum Details { - Chart, - Image, - Group(Vec>), - Message(Box), - PageBreak, - Table(Box), - Text(Box), -} - -impl Details { - pub fn as_group(&self) -> Option<&[Arc]> { - match self { - Self::Group(children) => Some(children.as_slice()), - _ => None, - } - } - - pub fn command_name(&self) -> Option<&String> { - match self { - Details::Chart - | Details::Image - | Details::Group(_) - | Details::Message(_) - | Details::PageBreak - | Details::Text(_) => None, - Details::Table(pivot_table) => pivot_table.command_c.as_ref(), - } - } - - pub fn label(&self) -> Cow<'static, str> { - match self { - Details::Chart => todo!(), - Details::Image => todo!(), - Details::Group(_) => Cow::from("Group"), - Details::Message(diagnostic) => Cow::from(diagnostic.severity.as_title_str()), - Details::PageBreak => Cow::from("Page Break"), - Details::Table(pivot_table) => Cow::from(pivot_table.label()), - Details::Text(text) => Cow::from(text.type_.as_str()), - } - } - - pub fn is_page_break(&self) -> bool { - matches!(self, Self::PageBreak) - } -} - -impl FromIterator for Details -where - A: Into>, -{ - fn from_iter(iter: T) -> Self - where - T: IntoIterator, - { - Self::Group(iter.into_iter().map(|value| value.into()).collect()) - } -} - -impl From for Details { - fn from(value: Diagnostic) -> Self { - Self::Message(Box::new(value)) - } -} - -impl From> for Details { - fn from(value: Box) -> Self { - Self::Message(value) - } -} - -impl From for Details { - fn from(value: PivotTable) -> Self { - Self::Table(Box::new(value)) - } -} - -impl From> for Details { - fn from(value: Box) -> Self { - Self::Table(value) - } -} - -impl From for Details { - fn from(value: Text) -> Self { - Self::Text(Box::new(value)) - } -} - -impl From> for Details { - fn from(value: Box) -> Self { - Self::Text(value) - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct Text { - type_: TextType, - - content: Value, -} - 
-impl Text { - pub fn new_log(value: impl Into) -> Self { - Self { - type_: TextType::Log, - content: value.into(), - } - } -} - -fn text_item_table_look() -> Arc { - static LOOK: OnceLock> = OnceLock::new(); - LOOK.get_or_init(|| { - Arc::new({ - let mut look = Look::default().with_borders(EnumMap::from_fn(|_| BorderStyle::none())); - for style in look.areas.values_mut() { - style.cell_style.margins = EnumMap::from_fn(|_| [0, 0]); - } - look - }) - }) - .clone() -} - -impl From for PivotTable { - fn from(value: Text) -> Self { - let dimension = - Dimension::new(Group::new(Value::new_text("Text")).with(Value::new_user_text("null"))) - .with_all_labels_hidden(); - PivotTable::new([(Axis3::Y, dimension)]) - .with_look(text_item_table_look()) - .with_data([(&[0], value.content)]) - .with_subtype(Value::new_user_text("Text")) - } -} - -impl From<&Diagnostic> for Text { - fn from(value: &Diagnostic) -> Self { - Self::new_log(value.to_string()) - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum TextType { - /// `TITLE` and `SUBTITLE` commands. - PageTitle, - - /// Title, - Title, - - /// Syntax printback logging. - Syntax, - - /// Other logging. 
- Log, -} - -impl TextType { - pub fn as_str(&self) -> &'static str { - match self { - TextType::PageTitle => "Page Title", - TextType::Title => "Title", - TextType::Syntax => "Log", - TextType::Log => "Log", - } - } - - pub fn as_xml_str(&self) -> &'static str { - match self { - TextType::PageTitle => "page-title", - TextType::Title => "title", - TextType::Syntax | TextType::Log => "log", - } - } -} - -pub struct ItemCursor { - cur: Option>, - stack: Vec<(Arc, usize)>, -} - -impl ItemCursor { - pub fn new(start: Arc) -> Self { - Self { - cur: Some(start), - stack: Vec::new(), - } - } - - pub fn cur(&self) -> Option<&Arc> { - self.cur.as_ref() - } - - pub fn next(&mut self) { - let Some(cur) = self.cur.take() else { - return; - }; - match cur.details { - Details::Group(ref children) if !children.is_empty() => { - self.cur = Some(children[0].clone()); - self.stack.push((cur, 1)); - } - _ => { - while let Some((item, index)) = self.stack.pop() { - let children = item.details.as_group().unwrap(); - if index < children.len() { - self.cur = Some(children[index].clone()); - self.stack.push((item, index + 1)); - return; - } - } - } - } - } -} diff --git a/rust/pspp/src/output/pivot.rs b/rust/pspp/src/output/pivot.rs new file mode 100644 index 0000000000..92133e2c51 --- /dev/null +++ b/rust/pspp/src/output/pivot.rs @@ -0,0 +1,2859 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see <https://www.gnu.org/licenses/>. + +//! Pivot tables. +//! +//! Pivot tables are PSPP's primary form of output. They are analogous to the +//! pivot tables you might be familiar with from spreadsheets and databases. +//! See <https://en.wikipedia.org/wiki/Pivot_table> for a brief introduction to +//! the overall concept of a pivot table. +//! +//! In PSPP, the most important internal pieces of a pivot table are: +//! +//! - Title. Every pivot table has a title that is displayed above it. It also +//! has an optional caption (displayed below it) and corner text (displayed in +//! the upper left corner). +//! +//! - Dimensions. A dimension consists of zero or more categories. A category +//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The +//! categories are the leaves of a tree whose non-leaf nodes form groups of +//! categories. The tree always has a root group whose label is the name of +//! the dimension. +//! +//! - Axes. A table has three axes: column, row, and layer. Each dimension is +//! assigned to an axis, and each axis has zero or more dimensions. When an +//! axis has more than one dimension, they are ordered from innermost to +//! outermost. +//! +//! - Data. A table's data consists of zero or more cells. Each cell maps from +//! a category for each dimension to a value, which is commonly a number but +//! could also be a variable name or an arbitrary text string. 
+ +use std::{ + collections::HashMap, + fmt::{Debug, Display, Write}, + io::Read, + iter::{once, repeat, repeat_n, FusedIterator}, + ops::{Index, IndexMut, Not, Range, RangeInclusive}, + str::{from_utf8, FromStr, Utf8Error}, + sync::{Arc, OnceLock}, +}; + +use binrw::Error as BinError; +use chrono::NaiveDateTime; +pub use color::ParseError as ParseColorError; +use color::{palette::css::TRANSPARENT, AlphaColor, Rgba8, Srgb}; +use enum_iterator::Sequence; +use enum_map::{enum_map, Enum, EnumMap}; +use look_xml::TableProperties; +use quick_xml::{de::from_str, DeError}; +use serde::{ + de::Visitor, + ser::{SerializeMap, SerializeStruct}, + Deserialize, Serialize, Serializer, +}; +use smallstr::SmallString; +use smallvec::SmallVec; +use thiserror::Error as ThisError; +use tlo::parse_tlo; + +use crate::{ + data::{ByteString, Datum, EncodedString, RawString}, + format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, + settings::{Settings, Show}, + util::ToSmallString, + variable::{VarType, Variable}, +}; + +pub mod output; + +mod look_xml; +#[cfg(test)] +pub mod test; +mod tlo; + +/// Areas of a pivot table for styling purposes. +#[derive(Copy, Clone, Debug, Default, Enum, PartialEq, Eq)] +pub enum Area { + Title, + Caption, + + /// Footnotes, + Footer, + + // Top-left corner. + Corner, + + /// Labels for columns ([Axis2::X]) and rows ([Axis2::Y]). + Labels(Axis2), + + #[default] + Data, + + /// Layer indication. 
+ Layers, +} + +impl Display for Area { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Area::Title => write!(f, "title"), + Area::Caption => write!(f, "caption"), + Area::Footer => write!(f, "footer"), + Area::Corner => write!(f, "corner"), + Area::Labels(axis2) => write!(f, "labels({axis2})"), + Area::Data => write!(f, "data"), + Area::Layers => write!(f, "layers"), + } + } +} + +impl Serialize for Area { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_str(&self.to_small_string::<16>()) + } +} + +impl Area { + fn default_cell_style(self) -> CellStyle { + use HorzAlign::*; + use VertAlign::*; + let (horz_align, vert_align, hmargins, vmargins) = match self { + Area::Title => (Some(Center), Middle, [8, 11], [1, 8]), + Area::Caption => (Some(Left), Top, [8, 11], [1, 1]), + Area::Footer => (Some(Left), Top, [11, 8], [2, 3]), + Area::Corner => (Some(Left), Bottom, [8, 11], [1, 1]), + Area::Labels(Axis2::X) => (Some(Center), Top, [8, 11], [1, 3]), + Area::Labels(Axis2::Y) => (Some(Left), Top, [8, 11], [1, 3]), + Area::Data => (None, Top, [8, 11], [1, 1]), + Area::Layers => (Some(Left), Bottom, [8, 11], [1, 3]), + }; + CellStyle { + horz_align, + vert_align, + margins: enum_map! { Axis2::X => hmargins, Axis2::Y => vmargins }, + } + } + + fn default_font_style(self) -> FontStyle { + FontStyle { + bold: self == Area::Title, + italic: false, + underline: false, + markup: false, + font: String::from("Sans Serif"), + fg: [Color::BLACK; 2], + bg: [Color::WHITE; 2], + size: 9, + } + } + + fn default_area_style(self) -> AreaStyle { + AreaStyle { + cell_style: self.default_cell_style(), + font_style: self.default_font_style(), + } + } +} + +/// Table borders for styling purposes. 
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)] +pub enum Border { + Title, + OuterFrame(BoxBorder), + InnerFrame(BoxBorder), + Dimension(RowColBorder), + Category(RowColBorder), + DataLeft, + DataTop, +} + +impl Border { + pub fn default_stroke(self) -> Stroke { + match self { + Self::InnerFrame(_) | Self::DataLeft | Self::DataTop => Stroke::Thick, + Self::Dimension( + RowColBorder(HeadingRegion::Columns, _) | RowColBorder(_, Axis2::X), + ) + | Self::Category(RowColBorder(HeadingRegion::Columns, _)) => Stroke::Solid, + _ => Stroke::None, + } + } + pub fn default_border_style(self) -> BorderStyle { + BorderStyle { + stroke: self.default_stroke(), + color: Color::BLACK, + } + } + + fn fallback(self) -> Self { + match self { + Self::Title + | Self::OuterFrame(_) + | Self::InnerFrame(_) + | Self::DataLeft + | Self::DataTop + | Self::Category(_) => self, + Self::Dimension(row_col_border) => Self::Category(row_col_border), + } + } +} + +impl Display for Border { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Border::Title => write!(f, "title"), + Border::OuterFrame(box_border) => write!(f, "outer_frame({box_border})"), + Border::InnerFrame(box_border) => write!(f, "inner_frame({box_border})"), + Border::Dimension(row_col_border) => write!(f, "dimension({row_col_border})"), + Border::Category(row_col_border) => write!(f, "category({row_col_border})"), + Border::DataLeft => write!(f, "data(left)"), + Border::DataTop => write!(f, "data(top)"), + } + } +} + +impl Serialize for Border { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_str(&self.to_small_string::<32>()) + } +} + +/// The borders on a box. 
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum BoxBorder { + Left, + Top, + Right, + Bottom, +} + +impl BoxBorder { + fn as_str(&self) -> &'static str { + match self { + BoxBorder::Left => "left", + BoxBorder::Top => "top", + BoxBorder::Right => "right", + BoxBorder::Bottom => "bottom", + } + } +} + +impl Display for BoxBorder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +/// Borders between rows and columns. +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub struct RowColBorder( + /// Row or column headings. + pub HeadingRegion, + /// Horizontal ([Axis2::X]) or vertical ([Axis2::Y]) borders. + pub Axis2, +); + +impl Display for RowColBorder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}", self.0, self.1) + } +} + +/// Sizing for rows or columns of a rendered table. +/// +/// The comments below talk about columns and their widths but they apply +/// equally to rows and their heights. +#[derive(Default, Clone, Debug, Serialize)] +pub struct Sizing { + /// Specific column widths, in 1/96" units. + widths: Vec, + + /// Specific page breaks: 0-based columns after which a page break must + /// occur, e.g. a value of 1 requests a break after the second column. + breaks: Vec, + + /// Keeps: columns to keep together on a page if possible. + keeps: Vec>, +} + +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Sequence, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum Axis3 { + X, + Y, + Z, +} + +impl Axis3 { + fn transpose(&self) -> Option { + match self { + Axis3::X => Some(Axis3::Y), + Axis3::Y => Some(Axis3::X), + Axis3::Z => None, + } + } +} + +impl From for Axis3 { + fn from(axis2: Axis2) -> Self { + match axis2 { + Axis2::X => Self::X, + Axis2::Y => Self::Y, + } + } +} + +/// An axis within a pivot table. 
+#[derive(Clone, Debug, Default, Serialize)] +pub struct Axis { + /// `dimensions[0]` is the innermost dimension. + pub dimensions: Vec, +} + +pub struct AxisIterator { + indexes: SmallVec<[usize; 4]>, + lengths: SmallVec<[usize; 4]>, + done: bool, +} + +impl FusedIterator for AxisIterator {} +impl Iterator for AxisIterator { + type Item = SmallVec<[usize; 4]>; + + fn next(&mut self) -> Option { + if self.done { + None + } else { + let retval = self.indexes.clone(); + for (index, len) in self.indexes.iter_mut().zip(self.lengths.iter().copied()) { + *index += 1; + if *index < len { + return Some(retval); + }; + *index = 0; + } + self.done = true; + Some(retval) + } + } +} + +impl PivotTable { + pub fn with_look(mut self, look: Arc) -> Self { + self.look = look; + self + } + pub fn insert_number(&mut self, data_indexes: &[usize], number: Option, class: Class) { + let format = match class { + Class::Other => Settings::global().default_format, + Class::Integer => Format::F40, + Class::Correlations => Format::F40_3, + Class::Significance => Format::F40_3, + Class::Percent => Format::PCT40_1, + Class::Residual => Format::F40_2, + Class::Count => Format::F40, // XXX + }; + let value = Value::new(ValueInner::Number(NumberValue { + show: None, + format, + honor_small: class == Class::Other, + value: number, + variable: None, + value_label: None, + })); + self.insert(data_indexes, value); + } + + pub fn with_footnotes(mut self, footnotes: Footnotes) -> Self { + debug_assert!(self.footnotes.is_empty()); + self.footnotes = footnotes; + self + } + fn axis_values(&self, axis: Axis3) -> AxisIterator { + AxisIterator { + indexes: repeat_n(0, self.axes[axis].dimensions.len()).collect(), + lengths: self.axis_dimensions(axis).map(|d| d.len()).collect(), + done: self.axis_extent(axis) == 0, + } + } + + fn axis_extent(&self, axis: Axis3) -> usize { + self.axis_dimensions(axis).map(|d| d.len()).product() + } +} + +/// Dimensions. 
+/// +/// A [Dimension] identifies the categories associated with a single dimension +/// within a multidimensional pivot table. +/// +/// A dimension contains a collection of categories, which are the leaves in a +/// tree of groups. +/// +/// (A dimension or a group can contain zero categories, but this is unusual. +/// If a dimension contains no categories, then its table cannot contain any +/// data.) +#[derive(Clone, Debug, Serialize)] +pub struct Dimension { + /// Hierarchy of categories within the dimension. The groups and categories + /// are sorted in the order that should be used for display. This might be + /// different from the original order produced for output if the user + /// adjusted it. + /// + /// The root must always be a group, although it is allowed to have no + /// subcategories. + pub root: Group, + + /// Ordering of leaves for presentation. + /// + /// This is a permutation of `0..n` where `n` is the number of leaves. It + /// maps from an index in presentation order to an index in data order. + pub presentation_order: Vec, + + /// Display. + pub hide_all_labels: bool, +} + +pub type GroupVec<'a> = SmallVec<[&'a Group; 4]>; +pub struct Path<'a> { + groups: GroupVec<'a>, + leaf: &'a Leaf, +} + +impl Dimension { + pub fn new(root: Group) -> Self { + Dimension { + presentation_order: (0..root.len()).collect(), + root, + hide_all_labels: false, + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of (leaf) categories in this dimension. 
+ pub fn len(&self) -> usize { + self.root.len() + } + + pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> { + self.root.nth_leaf(index) + } + + pub fn leaf_path(&self, index: usize) -> Option> { + self.root.leaf_path(index, SmallVec::new()) + } + + pub fn with_all_labels_hidden(self) -> Self { + Self { + hide_all_labels: true, + ..self + } + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct Group { + #[serde(skip)] + len: usize, + pub name: Box, + + /// The child categories. + /// + /// A group usually has multiple children, but it is allowed to have + /// only one or even (pathologically) none. + pub children: Vec, + + /// Whether to show the group's label. + pub show_label: bool, +} + +impl Group { + pub fn new(name: impl Into) -> Self { + Self::with_capacity(name, 0) + } + + pub fn with_capacity(name: impl Into, capacity: usize) -> Self { + Self { + len: 0, + name: Box::new(name.into()), + children: Vec::with_capacity(capacity), + show_label: false, + } + } + + pub fn push(&mut self, child: impl Into) { + let mut child = child.into(); + if let Category::Group(group) = &mut child { + group.show_label = true; + } + self.len += child.len(); + self.children.push(child); + } + + pub fn with(mut self, child: impl Into) -> Self { + self.push(child); + self + } + + pub fn with_multiple(mut self, children: impl IntoIterator) -> Self + where + C: Into, + { + self.extend(children); + self + } + + pub fn with_label_shown(self) -> Self { + self.with_show_label(true) + } + + pub fn with_show_label(mut self, show_label: bool) -> Self { + self.show_label = show_label; + self + } + + pub fn nth_leaf(&self, mut index: usize) -> Option<&Leaf> { + for child in &self.children { + let len = child.len(); + if index < len { + return child.nth_leaf(index); + } + index -= len; + } + None + } + + pub fn leaf_path<'a>(&'a self, mut index: usize, mut groups: GroupVec<'a>) -> Option> { + for child in &self.children { + let len = child.len(); + if index < len { + groups.push(self); 
+ return child.leaf_path(index, groups); + } + index -= len; + } + None + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn name(&self) -> &Value { + &self.name + } +} + +impl Extend for Group +where + C: Into, +{ + fn extend>(&mut self, children: T) { + let children = children.into_iter(); + self.children.reserve(children.size_hint().0); + for child in children { + self.push(child); + } + } +} + +#[derive(Clone, Debug, Default, Serialize)] +pub struct Footnotes(pub Vec>); + +impl Footnotes { + pub fn new() -> Self { + Self::default() + } + + pub fn push(&mut self, footnote: Footnote) -> Arc { + let footnote = Arc::new(footnote.with_index(self.0.len())); + self.0.push(footnote.clone()); + footnote + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +#[derive(Clone, Debug)] +pub struct Leaf { + name: Box, +} + +impl Leaf { + pub fn new(name: Value) -> Self { + Self { + name: Box::new(name), + } + } + pub fn name(&self) -> &Value { + &self.name + } +} + +impl Serialize for Leaf { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.name.serialize(serializer) + } +} + +/// Pivot result classes. +/// +/// These are used to mark [Leaf] categories as having particular types of data, +/// to set their numeric formats. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Class { + Other, + Integer, + Correlations, + Significance, + Percent, + Residual, + Count, +} + +/// A pivot_category is a leaf (a category) or a group. 
+#[derive(Clone, Debug, Serialize)] +pub enum Category { + Group(Group), + Leaf(Leaf), +} + +impl Category { + pub fn name(&self) -> &Value { + match self { + Category::Group(group) => &group.name, + Category::Leaf(leaf) => &leaf.name, + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn len(&self) -> usize { + match self { + Category::Group(group) => group.len, + Category::Leaf(_) => 1, + } + } + + pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> { + match self { + Category::Group(group) => group.nth_leaf(index), + Category::Leaf(leaf) => { + if index == 0 { + Some(leaf) + } else { + None + } + } + } + } + + pub fn leaf_path<'a>(&'a self, index: usize, groups: GroupVec<'a>) -> Option> { + match self { + Category::Group(group) => group.leaf_path(index, groups), + Category::Leaf(leaf) => { + if index == 0 { + Some(Path { groups, leaf }) + } else { + None + } + } + } + } + + pub fn show_label(&self) -> bool { + match self { + Category::Group(group) => group.show_label, + Category::Leaf(_) => true, + } + } +} + +impl From for Category { + fn from(group: Group) -> Self { + Self::Group(group) + } +} + +impl From for Category { + fn from(group: Leaf) -> Self { + Self::Leaf(group) + } +} + +impl From for Category { + fn from(name: Value) -> Self { + Leaf::new(name).into() + } +} + +impl From<&Variable> for Category { + fn from(variable: &Variable) -> Self { + Value::new_variable(variable).into() + } +} + +impl From<&str> for Category { + fn from(name: &str) -> Self { + Self::Leaf(Leaf::new(Value::new_text(name))) + } +} + +impl From for Category { + fn from(name: String) -> Self { + Self::Leaf(Leaf::new(Value::new_text(name))) + } +} + +impl From<&String> for Category { + fn from(name: &String) -> Self { + Self::Leaf(Leaf::new(Value::new_text(name))) + } +} + +/// Styling for a pivot table. +/// +/// The division between this and the style information in [PivotTable] seems +/// fairly arbitrary. 
The ultimate reason for the division is simply because +/// that's how SPSS documentation and file formats do it. +#[derive(Clone, Debug, Serialize)] +pub struct Look { + pub name: Option, + + /// Whether to hide rows or columns whose cells are all empty. + pub hide_empty: bool, + + pub row_label_position: LabelPosition, + + /// Ranges of column widths in the two heading regions, in 1/96" units. + pub heading_widths: EnumMap>, + + /// Kind of markers to use for footnotes. + pub footnote_marker_type: FootnoteMarkerType, + + /// Where to put the footnote markers. + pub footnote_marker_position: FootnoteMarkerPosition, + + /// Styles for areas of the pivot table. + pub areas: EnumMap, + + /// Styles for borders in the pivot table. + pub borders: EnumMap, + + pub print_all_layers: bool, + + pub paginate_layers: bool, + + pub shrink_to_fit: EnumMap, + + pub top_continuation: bool, + + pub bottom_continuation: bool, + + pub continuation: Option, + + pub n_orphan_lines: usize, +} + +impl Look { + pub fn with_omit_empty(mut self, omit_empty: bool) -> Self { + self.hide_empty = omit_empty; + self + } + pub fn with_row_label_position(mut self, row_label_position: LabelPosition) -> Self { + self.row_label_position = row_label_position; + self + } + pub fn with_borders(mut self, borders: EnumMap) -> Self { + self.borders = borders; + self + } +} + +impl Default for Look { + fn default() -> Self { + Self { + name: None, + hide_empty: true, + row_label_position: LabelPosition::default(), + heading_widths: EnumMap::from_fn(|region| match region { + HeadingRegion::Rows => 36..=72, + HeadingRegion::Columns => 36..=120, + }), + footnote_marker_type: FootnoteMarkerType::default(), + footnote_marker_position: FootnoteMarkerPosition::default(), + areas: EnumMap::from_fn(Area::default_area_style), + borders: EnumMap::from_fn(Border::default_border_style), + print_all_layers: false, + paginate_layers: false, + shrink_to_fit: EnumMap::from_fn(|_| false), + top_continuation: false, + 
bottom_continuation: false, + continuation: None, + n_orphan_lines: 0, + } + } +} + +#[derive(ThisError, Debug)] +pub enum ParseLookError { + #[error(transparent)] + XmlError(#[from] DeError), + + #[error(transparent)] + Utf8Error(#[from] Utf8Error), + + #[error(transparent)] + BinError(#[from] BinError), + + #[error(transparent)] + IoError(#[from] std::io::Error), +} + +impl Look { + pub fn shared_default() -> Arc { + static LOOK: OnceLock> = OnceLock::new(); + LOOK.get_or_init(|| Arc::new(Look::default())).clone() + } + + pub fn from_xml(xml: &str) -> Result { + Ok(from_str::(xml) + .map_err(ParseLookError::from)? + .into()) + } + + pub fn from_binary(tlo: &[u8]) -> Result { + parse_tlo(tlo).map_err(ParseLookError::from) + } + + pub fn from_data(data: &[u8]) -> Result { + if data.starts_with(b"\xff\xff\0\0") { + Self::from_binary(data) + } else { + Self::from_xml(from_utf8(data).map_err(ParseLookError::from)?) + } + } + + pub fn from_reader(mut reader: R) -> Result + where + R: Read, + { + let mut buffer = Vec::new(); + reader + .read_to_end(&mut buffer) + .map_err(ParseLookError::from)?; + Self::from_data(&buffer) + } +} + +/// Position for group labels. +#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] +pub enum LabelPosition { + /// Hierarachically enclosing the categories. + /// + /// For column labels, group labels appear above the categories. For row + /// labels, group labels appear to the left of the categories. + /// + /// ```text + /// ┌────┬──────────────┐ ┌─────────┬──────────┐ + /// │ │ nested │ │ │ columns │ + /// │ ├────┬────┬────┤ ├──────┬──┼──────────┤ + /// │ │ a1 │ a2 │ a3 │ │ │a1│...data...│ + /// ├────┼────┼────┼────┤ │nested│a2│...data...│ + /// │ │data│data│data│ │ │a3│...data...│ + /// │ │ . │ . │ . │ └──────┴──┴──────────┘ + /// │rows│ . │ . │ . │ + /// │ │ . │ . │ . │ + /// └────┴────┴────┴────┘ + /// ``` + #[serde(rename = "nested")] + Nested, + + /// In the corner (row labels only). 
+ /// + /// ```text + /// ┌──────┬──────────┐ + /// │corner│ columns │ + /// ├──────┼──────────┤ + /// │ a1│...data...│ + /// │ a2│...data...│ + /// │ a3│...data...│ + /// └──────┴──────────┘ + /// ``` + #[default] + #[serde(rename = "inCorner")] + Corner, +} + +/// The heading region of a rendered pivot table: +/// +/// ```text +/// ┌──────────────────┬─────────────────────────────────────────────────┐ +/// │ │ column headings │ +/// │ ├─────────────────────────────────────────────────┤ +/// │ corner │ │ +/// │ and │ │ +/// │ row headings │ data │ +/// │ │ │ +/// │ │ │ +/// └──────────────────┴─────────────────────────────────────────────────┘ +/// ``` +#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum HeadingRegion { + Rows, + Columns, +} + +impl HeadingRegion { + pub fn as_str(&self) -> &'static str { + match self { + HeadingRegion::Rows => "rows", + HeadingRegion::Columns => "columns", + } + } +} + +impl Display for HeadingRegion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl From for HeadingRegion { + fn from(axis: Axis2) -> Self { + match axis { + Axis2::X => HeadingRegion::Columns, + Axis2::Y => HeadingRegion::Rows, + } + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct AreaStyle { + pub cell_style: CellStyle, + pub font_style: FontStyle, +} + +#[derive(Clone, Debug, Serialize)] +pub struct CellStyle { + /// `None` means "mixed" alignment: align strings to the left, numbers to + /// the right. + pub horz_align: Option, + pub vert_align: VertAlign, + + /// Margins in 1/96" units. + /// + /// `margins[Axis2::X][0]` is the left margin. + /// `margins[Axis2::X][1]` is the right margin. + /// `margins[Axis2::Y][0]` is the top margin. + /// `margins[Axis2::Y][1]` is the bottom margin. 
+ pub margins: EnumMap, +} + +#[derive(Copy, Clone, Debug, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum HorzAlign { + /// Right aligned. + Right, + + /// Left aligned. + Left, + + /// Centered. + Center, + + /// Align the decimal point at the specified position. + Decimal { + /// Decimal offset from the right side of the cell, in 1/96" units. + offset: f64, + + /// Decimal character. + decimal: Decimal, + }, +} + +impl HorzAlign { + pub fn for_mixed(var_type: VarType) -> Self { + match var_type { + VarType::Numeric => Self::Right, + VarType::String => Self::Left, + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum VertAlign { + /// Top alignment. + Top, + + /// Centered, + Middle, + + /// Bottom alignment. + Bottom, +} + +#[derive(Clone, Debug, Serialize)] +pub struct FontStyle { + pub bold: bool, + pub italic: bool, + pub underline: bool, + pub markup: bool, + pub font: String, + + /// `fg[0]` is the usual foreground color. + /// + /// `fg[1]` is used only in [Area::Data] for odd-numbered rows. + pub fg: [Color; 2], + + /// `bg[0]` is the usual background color. + /// + /// `bg[1]` is used only in [Area::Data] for odd-numbered rows. + pub bg: [Color; 2], + + /// In 1/72" units. 
+ pub size: i32, +} + +#[derive(Copy, Clone, PartialEq, Eq)] +pub struct Color { + pub alpha: u8, + pub r: u8, + pub g: u8, + pub b: u8, +} + +impl Color { + pub const BLACK: Color = Color::new(0, 0, 0); + pub const WHITE: Color = Color::new(255, 255, 255); + pub const RED: Color = Color::new(255, 0, 0); + pub const BLUE: Color = Color::new(0, 0, 255); + pub const TRANSPARENT: Color = Color::new(0, 0, 0).with_alpha(0); + + pub const fn new(r: u8, g: u8, b: u8) -> Self { + Self { + alpha: 255, + r, + g, + b, + } + } + + pub const fn with_alpha(self, alpha: u8) -> Self { + Self { alpha, ..self } + } + + pub const fn without_alpha(self) -> Self { + self.with_alpha(255) + } + + pub fn display_css(&self) -> DisplayCss { + DisplayCss(*self) + } +} + +impl Debug for Color { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.display_css()) + } +} + +impl From for Color { + fn from(Rgba8 { r, g, b, a }: Rgba8) -> Self { + Self::new(r, g, b).with_alpha(a) + } +} + +impl FromStr for Color { + type Err = ParseColorError; + + fn from_str(s: &str) -> Result { + fn is_bare_hex(s: &str) -> bool { + let s = s.trim(); + s.chars().count() == 6 && s.chars().all(|c| c.is_ascii_hexdigit()) + } + let color: AlphaColor = match s.parse() { + Err(ParseColorError::UnknownColorSyntax) if is_bare_hex(s) => { + ("#".to_owned() + s).parse() + } + Err(ParseColorError::UnknownColorSyntax) + if s.trim().eq_ignore_ascii_case("transparent") => + { + Ok(TRANSPARENT) + } + other => other, + }?; + Ok(color.to_rgba8().into()) + } +} + +impl Serialize for Color { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_str(&self.display_css().to_small_string::<32>()) + } +} + +impl<'de> Deserialize<'de> for Color { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct ColorVisitor; + + impl<'de> Visitor<'de> for ColorVisitor { + type Value = Color; + + fn expecting(&self, 
formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("\"#rrggbb\" or \"rrggbb\" or web color name") + } + + fn visit_borrowed_str(self, v: &'de str) -> Result + where + E: serde::de::Error, + { + v.parse().map_err(E::custom) + } + } + + deserializer.deserialize_str(ColorVisitor) + } +} + +pub struct DisplayCss(Color); + +impl Display for DisplayCss { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Color { alpha, r, g, b } = self.0; + match alpha { + 255 => write!(f, "#{r:02x}{g:02x}{b:02x}"), + _ => write!(f, "rgb({r}, {g}, {b}, {:.2})", alpha as f64 / 255.0), + } + } +} + +#[derive(Copy, Clone, Debug, Deserialize)] +pub struct BorderStyle { + #[serde(rename = "@borderStyleType")] + pub stroke: Stroke, + + #[serde(rename = "@color")] + pub color: Color, +} + +impl Serialize for BorderStyle { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = serializer.serialize_struct("BorderStyle", 2)?; + s.serialize_field("stroke", &self.stroke)?; + s.serialize_field("color", &self.color)?; + s.end() + } +} + +impl BorderStyle { + pub const fn none() -> Self { + Self { + stroke: Stroke::None, + color: Color::BLACK, + } + } + + pub fn is_none(&self) -> bool { + self.stroke.is_none() + } + + /// Returns a border style that "combines" the two arguments, that is, that + /// gives a reasonable choice for a rule for different reasons should have + /// both styles. 
+ pub fn combine(self, other: BorderStyle) -> Self { + Self { + stroke: self.stroke.combine(other.stroke), + color: self.color, + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Enum, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub enum Stroke { + None, + Solid, + Dashed, + Thick, + Thin, + Double, +} + +impl Stroke { + pub fn is_none(&self) -> bool { + self == &Self::None + } + + /// Returns a stroke that "combines" the two arguments, that is, that gives + /// a reasonable stroke choice for a rule for different reasons should have + /// both styles. + pub fn combine(self, other: Stroke) -> Self { + self.max(other) + } +} + +/// An axis of a 2-dimensional table. +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Axis2 { + X, + Y, +} + +impl Axis2 { + pub fn new_enum(x: T, y: T) -> EnumMap { + EnumMap::from_array([x, y]) + } + + pub fn as_str(&self) -> &'static str { + match self { + Axis2::X => "x", + Axis2::Y => "y", + } + } +} + +impl Display for Axis2 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl Not for Axis2 { + type Output = Self; + + fn not(self) -> Self::Output { + match self { + Self::X => Self::Y, + Self::Y => Self::X, + } + } +} + +/// A 2-dimensional `(x,y)` pair. +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)] +pub struct Coord2(pub EnumMap); + +impl Coord2 { + pub fn new(x: usize, y: usize) -> Self { + use Axis2::*; + Self(enum_map! 
{ + X => x, + Y => y + }) + } + + pub fn for_axis((a, az): (Axis2, usize), bz: usize) -> Self { + let mut coord = Self::default(); + coord[a] = az; + coord[!a] = bz; + coord + } + + pub fn from_fn(f: F) -> Self + where + F: FnMut(Axis2) -> usize, + { + Self(EnumMap::from_fn(f)) + } + + pub fn x(&self) -> usize { + self.0[Axis2::X] + } + + pub fn y(&self) -> usize { + self.0[Axis2::Y] + } + + pub fn get(&self, axis: Axis2) -> usize { + self.0[axis] + } +} + +impl From> for Coord2 { + fn from(value: EnumMap) -> Self { + Self(value) + } +} + +impl Index for Coord2 { + type Output = usize; + + fn index(&self, index: Axis2) -> &Self::Output { + &self.0[index] + } +} + +impl IndexMut for Coord2 { + fn index_mut(&mut self, index: Axis2) -> &mut Self::Output { + &mut self.0[index] + } +} + +#[derive(Clone, Debug, Default)] +pub struct Rect2(pub EnumMap>); + +impl Rect2 { + pub fn new(x_range: Range, y_range: Range) -> Self { + Self(enum_map! { + Axis2::X => x_range.clone(), + Axis2::Y => y_range.clone(), + }) + } + pub fn for_cell(cell: Coord2) -> Self { + Self::new(cell.x()..cell.x() + 1, cell.y()..cell.y() + 1) + } + pub fn for_ranges((a, a_range): (Axis2, Range), b_range: Range) -> Self { + let b = !a; + let mut ranges = EnumMap::default(); + ranges[a] = a_range; + ranges[b] = b_range; + Self(ranges) + } + pub fn top_left(&self) -> Coord2 { + use Axis2::*; + Coord2::new(self[X].start, self[Y].start) + } + pub fn from_fn(f: F) -> Self + where + F: FnMut(Axis2) -> Range, + { + Self(EnumMap::from_fn(f)) + } + pub fn translate(self, offset: Coord2) -> Rect2 { + Self::from_fn(|axis| self[axis].start + offset[axis]..self[axis].end + offset[axis]) + } + pub fn is_empty(&self) -> bool { + self[Axis2::X].is_empty() || self[Axis2::Y].is_empty() + } +} + +impl From>> for Rect2 { + fn from(value: EnumMap>) -> Self { + Self(value) + } +} + +impl Index for Rect2 { + type Output = Range; + + fn index(&self, index: Axis2) -> &Self::Output { + &self.0[index] + } +} + +impl IndexMut for 
Rect2 { + fn index_mut(&mut self, index: Axis2) -> &mut Self::Output { + &mut self.0[index] + } +} + +#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub enum FootnoteMarkerType { + /// a, b, c, ... + #[default] + Alphabetic, + + /// 1, 2, 3, ... + Numeric, +} + +#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub enum FootnoteMarkerPosition { + /// Subscripts. + #[default] + Subscript, + + /// Superscripts. + Superscript, +} + +#[derive(Copy, Clone, Debug)] +pub struct ValueOptions { + pub show_values: Option, + + pub show_variables: Option, + + pub small: f64, + + /// Where to put the footnote markers. + pub footnote_marker_type: FootnoteMarkerType, +} + +impl Default for ValueOptions { + fn default() -> Self { + Self { + show_values: None, + show_variables: None, + small: 0.0001, + footnote_marker_type: FootnoteMarkerType::default(), + } + } +} + +pub trait IntoValueOptions { + fn into_value_options(self) -> ValueOptions; +} + +impl IntoValueOptions for () { + fn into_value_options(self) -> ValueOptions { + ValueOptions::default() + } +} + +impl IntoValueOptions for &PivotTable { + fn into_value_options(self) -> ValueOptions { + self.value_options() + } +} + +impl IntoValueOptions for &ValueOptions { + fn into_value_options(self) -> ValueOptions { + *self + } +} + +impl IntoValueOptions for ValueOptions { + fn into_value_options(self) -> ValueOptions { + self + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct PivotTable { + pub look: Arc, + + pub rotate_inner_column_labels: bool, + + pub rotate_outer_row_labels: bool, + + pub show_grid_lines: bool, + + pub show_title: bool, + + pub show_caption: bool, + + pub show_values: Option, + + pub show_variables: Option, + + pub weight_format: Format, + + /// Current layer indexes, with `axes[Axis3::Z].dimensions.len()` elements. 
+ /// `current_layer[i]` is an offset into + /// `axes[Axis3::Z].dimensions[i].data_leaves[]`, except that a dimension + /// can have zero leaves, in which case `current_layer[i]` is zero and + /// there's no corresponding leaf. + pub current_layer: Vec, + + /// Column and row sizing and page breaks. + pub sizing: EnumMap>>, + + /// Format settings. + pub settings: FormatSettings, + + /// Numeric grouping character (usually `.` or `,`). + pub grouping: Option, + + pub small: f64, + + pub command_local: Option, + pub command_c: Option, + pub language: Option, + pub locale: Option, + pub dataset: Option, + pub datafile: Option, + pub date: Option, + pub footnotes: Footnotes, + pub title: Option>, + pub subtype: Option>, + pub corner_text: Option>, + pub caption: Option>, + pub notes: Option, + pub dimensions: Vec, + pub axes: EnumMap, + pub cells: HashMap, +} + +impl PivotTable { + pub fn with_title(mut self, title: impl Into) -> Self { + self.title = Some(Box::new(title.into())); + self.show_title = true; + self + } + + pub fn with_caption(mut self, caption: impl Into) -> Self { + self.caption = Some(Box::new(caption.into())); + self.show_caption = true; + self + } + + pub fn with_corner_text(mut self, corner_text: impl Into) -> Self { + self.corner_text = Some(Box::new(corner_text.into())); + self + } + + pub fn with_subtype(self, subtype: impl Into) -> Self { + Self { + subtype: Some(Box::new(subtype.into())), + ..self + } + } + + pub fn with_show_title(mut self, show_title: bool) -> Self { + self.show_title = show_title; + self + } + + pub fn with_show_caption(mut self, show_caption: bool) -> Self { + self.show_caption = show_caption; + self + } + + pub fn with_layer(mut self, layer: &[usize]) -> Self { + debug_assert_eq!(layer.len(), self.current_layer.len()); + if self.look.print_all_layers { + self.look_mut().print_all_layers = false; + } + self.current_layer.clear(); + self.current_layer.extend_from_slice(layer); + self + } + + pub fn with_all_layers(mut 
self) -> Self { + if !self.look.print_all_layers { + self.look_mut().print_all_layers = true; + } + self + } + + pub fn look_mut(&mut self) -> &mut Look { + Arc::make_mut(&mut self.look) + } + + pub fn with_show_empty(mut self) -> Self { + if self.look.hide_empty { + self.look_mut().hide_empty = false; + } + self + } + + pub fn with_hide_empty(mut self) -> Self { + if !self.look.hide_empty { + self.look_mut().hide_empty = true; + } + self + } + + pub fn label(&self) -> String { + match &self.title { + Some(title) => title.display(self).to_string(), + None => String::from("Table"), + } + } + + pub fn title(&self) -> &Value { + match &self.title { + Some(title) => title, + None => { + static EMPTY: Value = Value::empty(); + &EMPTY + } + } + } + + pub fn subtype(&self) -> &Value { + match &self.subtype { + Some(subtype) => subtype, + None => { + static EMPTY: Value = Value::empty(); + &EMPTY + } + } + } +} + +impl Default for PivotTable { + fn default() -> Self { + Self { + look: Look::shared_default(), + rotate_inner_column_labels: false, + rotate_outer_row_labels: false, + show_grid_lines: false, + show_title: true, + show_caption: true, + show_values: None, + show_variables: None, + weight_format: Format::F40, + current_layer: Vec::new(), + sizing: EnumMap::default(), + settings: FormatSettings::default(), // XXX from settings + grouping: None, + small: 0.0001, // XXX from settings. + command_local: None, + command_c: None, // XXX from current command name. 
+ language: None, + locale: None, + dataset: None, + datafile: None, + date: None, + footnotes: Footnotes::new(), + subtype: None, + title: None, + corner_text: None, + caption: None, + notes: None, + dimensions: Vec::new(), + axes: EnumMap::default(), + cells: HashMap::new(), + } + } +} + +fn cell_index(data_indexes: &[usize], dimensions: I) -> usize +where + I: ExactSizeIterator, +{ + debug_assert_eq!(data_indexes.len(), dimensions.len()); + let mut index = 0; + for (dimension, data_index) in dimensions.zip(data_indexes.iter()) { + debug_assert!(*data_index < dimension); + index = dimension * index + data_index; + } + index +} + +impl PivotTable { + pub fn new(axes_and_dimensions: impl IntoIterator) -> Self { + let mut dimensions = Vec::new(); + let mut axes = EnumMap::::default(); + for (axis, dimension) in axes_and_dimensions { + axes[axis].dimensions.push(dimensions.len()); + dimensions.push(dimension); + } + Self { + look: Settings::global().look.clone(), + current_layer: repeat_n(0, axes[Axis3::Z].dimensions.len()).collect(), + axes, + dimensions, + ..Self::default() + } + } + fn cell_index(&self, data_indexes: &[usize]) -> usize { + cell_index(data_indexes, self.dimensions.iter().map(|d| d.len())) + } + + pub fn insert(&mut self, data_indexes: &[usize], value: impl Into) { + self.cells + .insert(self.cell_index(data_indexes), value.into()); + } + + pub fn get(&self, data_indexes: &[usize]) -> Option<&Value> { + self.cells.get(&self.cell_index(data_indexes)) + } + + pub fn with_data(mut self, iter: impl IntoIterator) -> Self + where + I: AsRef<[usize]>, + { + self.extend(iter); + self + } + + /// Converts per-axis presentation-order indexes in `presentation_indexes`, + /// into data indexes for each dimension. 
+ fn convert_indexes_ptod( + &self, + presentation_indexes: EnumMap, + ) -> SmallVec<[usize; 4]> { + let mut data_indexes = SmallVec::from_elem(0, self.dimensions.len()); + for (axis, presentation_indexes) in presentation_indexes { + for (&dim_index, &pindex) in self.axes[axis] + .dimensions + .iter() + .zip(presentation_indexes.iter()) + { + data_indexes[dim_index] = self.dimensions[dim_index].presentation_order[pindex]; + } + } + data_indexes + } + + /// Returns an iterator for the layer axis: + /// + /// - If `print` is true and `self.look.print_all_layers`, then the iterator + /// will visit all values of the layer axis. + /// + /// - Otherwise, the iterator will just visit `self.current_layer`. + pub fn layers(&self, print: bool) -> Box>> { + if print && self.look.print_all_layers { + Box::new(self.axis_values(Axis3::Z)) + } else { + Box::new(once(SmallVec::from_slice(&self.current_layer))) + } + } + + pub fn value_options(&self) -> ValueOptions { + ValueOptions { + show_values: self.show_values, + show_variables: self.show_variables, + small: self.small, + footnote_marker_type: self.look.footnote_marker_type, + } + } + + pub fn transpose(&mut self) { + self.axes.swap(Axis3::X, Axis3::Y); + } + + pub fn axis_dimensions( + &self, + axis: Axis3, + ) -> impl DoubleEndedIterator + ExactSizeIterator { + self.axes[axis] + .dimensions + .iter() + .copied() + .map(|index| &self.dimensions[index]) + } + + fn find_dimension(&self, dim_index: usize) -> Option<(Axis3, usize)> { + debug_assert!(dim_index < self.dimensions.len()); + for axis in enum_iterator::all::() { + for (position, dimension) in self.axes[axis].dimensions.iter().copied().enumerate() { + if dimension == dim_index { + return Some((axis, position)); + } + } + } + None + } + pub fn move_dimension(&mut self, dim_index: usize, new_axis: Axis3, new_position: usize) { + let (old_axis, old_position) = self.find_dimension(dim_index).unwrap(); + if old_axis == new_axis && old_position == new_position { + return; + 
} + + // Update the current layer, if necessary. If we're moving within the + // layer axis, preserve the current layer. + match (old_axis, new_axis) { + (Axis3::Z, Axis3::Z) => { + // Rearrange the layer axis. + if old_position < new_position { + self.current_layer[old_position..=new_position].rotate_left(1); + } else { + self.current_layer[new_position..=old_position].rotate_right(1); + } + } + (Axis3::Z, _) => { + // A layer is becoming a row or column. + self.current_layer.remove(old_position); + } + (_, Axis3::Z) => { + // A row or column is becoming a layer. + self.current_layer.insert(new_position, 0); + } + _ => (), + } + + self.axes[old_axis].dimensions.remove(old_position); + self.axes[new_axis] + .dimensions + .insert(new_position, dim_index); + } +} + +impl Extend<(I, Value)> for PivotTable +where + I: AsRef<[usize]>, +{ + fn extend>(&mut self, iter: T) { + for (data_indexes, value) in iter { + self.insert(data_indexes.as_ref(), value); + } + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct Footnote { + #[serde(skip)] + index: usize, + pub content: Box, + pub marker: Option>, + pub show: bool, +} + +impl Footnote { + pub fn new(content: impl Into) -> Self { + Self { + index: 0, + content: Box::new(content.into()), + marker: None, + show: true, + } + } + pub fn with_marker(mut self, marker: impl Into) -> Self { + self.marker = Some(Box::new(marker.into())); + self + } + + pub fn with_show(mut self, show: bool) -> Self { + self.show = show; + self + } + + pub fn with_index(mut self, index: usize) -> Self { + self.index = index; + self + } + + pub fn display_marker(&self, options: impl IntoValueOptions) -> DisplayMarker<'_> { + DisplayMarker { + footnote: self, + options: options.into_value_options(), + } + } + + pub fn display_content(&self, options: impl IntoValueOptions) -> DisplayValue<'_> { + self.content.display(options) + } + + pub fn index(&self) -> usize { + self.index + } +} + +pub struct DisplayMarker<'a> { + footnote: &'a Footnote, + 
options: ValueOptions, +} + +impl Display for DisplayMarker<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(marker) = &self.footnote.marker { + write!(f, "{}", marker.display(self.options).without_suffixes()) + } else { + let i = self.footnote.index + 1; + match self.options.footnote_marker_type { + FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic::new_lowercase(i)), + FootnoteMarkerType::Numeric => write!(f, "{i}"), + } + } + } +} + +/// Displays a number in 26adic notation. +/// +/// Zero is displayed as the empty string, 1 through 26 as `a` through `z`, 27 +/// through 52 as `aa` through `az`, and so on. +pub struct Display26Adic { + value: usize, + base: u8, +} + +impl Display26Adic { + /// Constructs a `Display26Adic` for `value`, with letters in lowercase. + pub fn new_lowercase(value: usize) -> Self { + Self { value, base: b'a' } + } + + /// Constructs a `Display26Adic` for `value`, with letters in uppercase. + pub fn new_uppercase(value: usize) -> Self { + Self { value, base: b'A' } + } +} + +impl Display for Display26Adic { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut output = SmallVec::<[u8; 16]>::new(); + let mut number = self.value; + while number > 0 { + number -= 1; + let digit = (number % 26) as u8; + output.push(digit + self.base); + number /= 26; + } + output.reverse(); + write!(f, "{}", from_utf8(&output).unwrap()) + } +} + +/// The content of a single pivot table cell. +/// +/// A [Value] is also a pivot table's title, caption, footnote marker and +/// contents, and so on. +/// +/// A given [Value] is one of: +/// +/// 1. A number resulting from a calculation. +/// +/// A number has an associated display format (usually [F] or [Pct]). This +/// format can be set directly, but that is not usually the easiest way. +/// Instead, it is usually true that all of the values in a single category +/// should have the same format (e.g. 
all "Significance" values might use +/// format `F40.3`), so PSPP makes it easy to set the default format for a +/// category while creating the category. See pivot_dimension_create() for +/// more details. +/// +/// [F]: crate::format::Type::F +/// [Pct]: crate::format::Type::Pct +/// +/// 2. A numeric or string value obtained from data ([ValueInner::Number] or +/// [ValueInner::String]). If such a value corresponds to a variable, then the +/// variable's name can be attached to the pivot_value. If the value has a +/// value label, then that can also be attached. When a label is present, +/// the user can control whether to show the value or the label or both. +/// +/// 3. A variable name ([ValueInner::Variable]). The variable label, if any, can +/// be attached too, and again the user can control whether to show the value +/// or the label or both. +/// +/// 4. A text string ([ValueInner::Text). The value stores the string in English +/// and translated into the output language (localized). Use +/// pivot_value_new_text() or pivot_value_new_text_format() for those cases. +/// In some cases, only an English or a localized version is available for +/// one reason or another, although this is regrettable; in those cases, use +/// pivot_value_new_user_text() or pivot_value_new_user_text_nocopy(). +/// +/// 5. A template. PSPP doesn't create these itself yet, but it can read and +/// interpret those created by SPSS. +#[derive(Clone, Default)] +pub struct Value { + pub inner: ValueInner, + pub styling: Option>, +} + +impl Serialize for Value { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.inner.serialize(serializer) + } +} + +/// Wrapper for [Value] that uses [Value::serialize_bare] for serialization. 
+#[derive(Serialize)] +struct BareValue<'a>(#[serde(serialize_with = "Value::serialize_bare")] pub &'a Value); + +impl Value { + pub fn serialize_bare(&self, serializer: S) -> Result + where + S: Serializer, + { + match &self.inner { + ValueInner::Number(number_value) => number_value.serialize_bare(serializer), + ValueInner::String(string_value) => string_value.s.serialize(serializer), + ValueInner::Variable(variable_value) => variable_value.var_name.serialize(serializer), + ValueInner::Text(text_value) => text_value.localized.serialize(serializer), + ValueInner::Template(template_value) => template_value.localized.serialize(serializer), + ValueInner::Empty => serializer.serialize_none(), + } + } + + fn new(inner: ValueInner) -> Self { + Self { + inner, + styling: None, + } + } + pub fn new_number_with_format(x: Option, format: Format) -> Self { + Self::new(ValueInner::Number(NumberValue { + show: None, + format, + honor_small: false, + value: x, + variable: None, + value_label: None, + })) + } + pub fn new_variable(variable: &Variable) -> Self { + Self::new(ValueInner::Variable(VariableValue { + show: None, + var_name: String::from(variable.name.as_str()), + variable_label: variable.label.clone(), + })) + } + pub fn new_datum(value: &Datum) -> Self + where + B: EncodedString, + { + match value { + Datum::Number(number) => Self::new_number(*number), + Datum::String(string) => Self::new_user_text(string.as_str()), + } + } + pub fn new_variable_value(variable: &Variable, value: &Datum) -> Self { + let var_name = Some(variable.name.as_str().into()); + let value_label = variable.value_labels.get(value).map(String::from); + match value { + Datum::Number(number) => Self::new(ValueInner::Number(NumberValue { + show: None, + format: match variable.print_format.var_type() { + VarType::Numeric => variable.print_format, + VarType::String => { + #[cfg(debug_assertions)] + panic!("cannot create numeric pivot value with string format"); + + #[cfg(not(debug_assertions))] + 
Format::F8_2 + } + }, + honor_small: false, + value: *number, + variable: var_name, + value_label, + })), + Datum::String(string) => Self::new(ValueInner::String(StringValue { + show: None, + hex: variable.print_format.type_() == Type::AHex, + s: string + .as_ref() + .with_encoding(variable.encoding()) + .into_string(), + var_name, + value_label, + })), + } + } + pub fn new_number(x: Option) -> Self { + Self::new_number_with_format(x, Format::F8_2) + } + pub fn new_integer(x: Option) -> Self { + Self::new_number_with_format(x, Format::F40) + } + pub fn new_text(s: impl Into) -> Self { + Self::new_user_text(s) + } + pub fn new_user_text(s: impl Into) -> Self { + let s: String = s.into(); + if s.is_empty() { + Self::default() + } else { + Self::new(ValueInner::Text(TextValue { + user_provided: true, + localized: s.clone(), + c: None, + id: None, + })) + } + } + pub fn with_footnote(mut self, footnote: &Arc) -> Self { + self.add_footnote(footnote); + self + } + pub fn add_footnote(&mut self, footnote: &Arc) { + let footnotes = &mut self.styling.get_or_insert_default().footnotes; + footnotes.push(footnote.clone()); + footnotes.sort_by_key(|f| f.index); + } + pub fn with_show_value_label(mut self, show: Option) -> Self { + let new_show = show; + match &mut self.inner { + ValueInner::Number(NumberValue { show, .. }) + | ValueInner::String(StringValue { show, .. }) => { + *show = new_show; + } + _ => (), + } + self + } + pub fn with_show_variable_label(mut self, show: Option) -> Self { + if let ValueInner::Variable(variable_value) = &mut self.inner { + variable_value.show = show; + } + self + } + pub fn with_value_label(mut self, label: Option) -> Self { + match &mut self.inner { + ValueInner::Number(NumberValue { value_label, .. }) + | ValueInner::String(StringValue { value_label, .. 
}) => *value_label = label.clone(), + _ => (), + } + self + } + pub const fn empty() -> Self { + Value { + inner: ValueInner::Empty, + styling: None, + } + } + pub const fn is_empty(&self) -> bool { + self.inner.is_empty() && self.styling.is_none() + } +} + +impl From<&str> for Value { + fn from(value: &str) -> Self { + Self::new_text(value) + } +} + +impl From for Value { + fn from(value: String) -> Self { + Self::new_text(value) + } +} + +impl From<&Variable> for Value { + fn from(variable: &Variable) -> Self { + Self::new_variable(variable) + } +} + +pub struct DisplayValue<'a> { + inner: &'a ValueInner, + markup: bool, + subscripts: &'a [String], + footnotes: &'a [Arc], + options: ValueOptions, + show_value: bool, + show_label: Option<&'a str>, +} + +impl<'a> DisplayValue<'a> { + pub fn subscripts(&self) -> impl Iterator { + self.subscripts.iter().map(String::as_str) + } + + pub fn has_subscripts(&self) -> bool { + !self.subscripts.is_empty() + } + + pub fn footnotes(&self) -> impl Iterator> { + self.footnotes + .iter() + .filter(|f| f.show) + .map(|f| f.display_marker(self.options)) + } + + pub fn has_footnotes(&self) -> bool { + self.footnotes().next().is_some() + } + + pub fn without_suffixes(self) -> Self { + Self { + subscripts: &[], + footnotes: &[], + ..self + } + } + + /// Returns this display split into `(body, suffixes)` where `suffixes` is + /// subscripts and footnotes and `body` is everything else. 
+ pub fn split_suffixes(self) -> (Self, Self) { + let suffixes = Self { + inner: &ValueInner::Empty, + ..self + }; + (self.without_suffixes(), suffixes) + } + + pub fn with_styling(mut self, styling: &'a ValueStyle) -> Self { + if let Some(area_style) = &styling.style { + self.markup = area_style.font_style.markup; + } + self.subscripts = styling.subscripts.as_slice(); + self.footnotes = styling.footnotes.as_slice(); + self + } + + pub fn with_font_style(self, font_style: &FontStyle) -> Self { + Self { + markup: font_style.markup, + ..self + } + } + + pub fn with_subscripts(self, subscripts: &'a [String]) -> Self { + Self { subscripts, ..self } + } + + pub fn with_footnotes(self, footnotes: &'a [Arc]) -> Self { + Self { footnotes, ..self } + } + + pub fn is_empty(&self) -> bool { + self.inner.is_empty() && self.subscripts.is_empty() && self.footnotes.is_empty() + } + + fn small(&self) -> f64 { + self.options.small + } + + pub fn var_type(&self) -> VarType { + match self.inner { + ValueInner::Number(NumberValue { .. 
}) if self.show_label.is_none() => VarType::Numeric, + _ => VarType::String, + } + } + + fn template( + &self, + f: &mut std::fmt::Formatter<'_>, + template: &str, + args: &[Vec], + ) -> std::fmt::Result { + let mut iter = template.as_bytes().iter(); + while let Some(c) = iter.next() { + match c { + b'\\' => { + let c = *iter.next().unwrap_or(&b'\\') as char; + let c = if c == 'n' { '\n' } else { c }; + write!(f, "{c}")?; + } + b'^' => { + let (index, rest) = consume_int(iter.as_slice()); + iter = rest.iter(); + let Some(arg) = args.get(index.wrapping_sub(1)) else { + continue; + }; + if let Some(arg) = arg.first() { + write!(f, "{}", arg.display(self.options))?; + } + } + b'[' => { + let (a, rest) = extract_inner_template(iter.as_slice()); + let (b, rest) = extract_inner_template(rest); + let rest = rest.strip_prefix(b"]").unwrap_or(rest); + let (index, rest) = consume_int(rest); + iter = rest.iter(); + + let Some(mut args) = args.get(index.wrapping_sub(1)).map(|vec| vec.as_slice()) + else { + continue; + }; + let (mut template, mut escape) = + if !a.is_empty() { (a, b'%') } else { (b, b'^') }; + while !args.is_empty() { + let n_consumed = self.inner_template(f, template, escape, args)?; + if n_consumed == 0 { + break; + } + args = &args[n_consumed..]; + + template = b; + escape = b'^'; + } + } + c => write!(f, "{c}")?, + } + } + Ok(()) + } + + fn inner_template( + &self, + f: &mut std::fmt::Formatter<'_>, + template: &[u8], + escape: u8, + args: &[Value], + ) -> Result { + let mut iter = template.iter(); + let mut args_consumed = 0; + while let Some(c) = iter.next() { + match c { + b'\\' => { + let c = *iter.next().unwrap_or(&b'\\') as char; + let c = if c == 'n' { '\n' } else { c }; + write!(f, "{c}")?; + } + c if *c == escape => { + let (index, rest) = consume_int(iter.as_slice()); + iter = rest.iter(); + let Some(arg) = args.get(index.wrapping_sub(1)) else { + continue; + }; + args_consumed = args_consumed.max(index); + write!(f, "{}", 
arg.display(self.options))?; + } + c => write!(f, "{c}")?, + } + } + Ok(args_consumed) + } +} + +fn consume_int(input: &[u8]) -> (usize, &[u8]) { + let mut n = 0; + for (index, c) in input.iter().enumerate() { + if !c.is_ascii_digit() { + return (n, &input[index..]); + } + n = n * 10 + (c - b'0') as usize; + } + (n, &[]) +} + +fn extract_inner_template(input: &[u8]) -> (&[u8], &[u8]) { + for (index, c) in input.iter().copied().enumerate() { + if c == b':' && (index == 0 || input[index - 1] != b'\\') { + return input.split_at(index); + } + } + (input, &[]) +} + +fn interpret_show( + global_show: impl Fn() -> Show, + table_show: Option, + value_show: Option, + label: &str, +) -> (bool, Option<&str>) { + match value_show.or(table_show).unwrap_or_else(global_show) { + Show::Value => (true, None), + Show::Label => (false, Some(label)), + Show::Both => (true, Some(label)), + } +} + +impl Display for DisplayValue<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.inner { + ValueInner::Number(NumberValue { + format, + honor_small, + value, + .. + }) => { + if self.show_value { + let format = if format.type_() == Type::F + && *honor_small + && value.is_some_and(|value| value != 0.0 && value.abs() < self.small()) + { + UncheckedFormat::new(Type::E, 40, format.d() as u8).fix() + } else { + *format + }; + let mut buf = SmallString::<[u8; 40]>::new(); + write!( + &mut buf, + "{}", + Datum::<&str>::Number(*value).display(format) + ) + .unwrap(); + write!(f, "{}", buf.trim_start_matches(' '))?; + } + if let Some(label) = self.show_label { + if self.show_value { + write!(f, " ")?; + } + f.write_str(label)?; + } + Ok(()) + } + + ValueInner::String(StringValue { s, .. }) + | ValueInner::Variable(VariableValue { var_name: s, .. 
}) => { + match (self.show_value, self.show_label) { + (true, None) => write!(f, "{s}"), + (false, Some(label)) => write!(f, "{label}"), + (true, Some(label)) => write!(f, "{s} {label}"), + (false, None) => unreachable!(), + } + } + + ValueInner::Text(TextValue { + localized: local, .. + }) => { + /* + if self + .inner + .styling + .as_ref() + .is_some_and(|styling| styling.style.font_style.markup) + { + todo!(); + }*/ + f.write_str(local) + } + + ValueInner::Template(TemplateValue { + args, + localized: local, + .. + }) => self.template(f, local, args), + + ValueInner::Empty => Ok(()), + }?; + + for (subscript, delimiter) in self.subscripts.iter().zip(once('_').chain(repeat(','))) { + write!(f, "{delimiter}{subscript}")?; + } + + for footnote in self.footnotes { + write!(f, "[{}]", footnote.display_marker(self.options))?; + } + + Ok(()) + } +} + +impl Value { + // Returns an object that will format this value, including subscripts and + // superscripts and footnotes. `options` controls whether variable and + // value labels are included. + pub fn display(&self, options: impl IntoValueOptions) -> DisplayValue<'_> { + let display = self.inner.display(options.into_value_options()); + match &self.styling { + Some(styling) => display.with_styling(styling), + None => display, + } + } +} + +impl Debug for Value { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.display(()).to_string()) + } +} + +#[derive(Clone, Debug)] +pub struct NumberValue { + /// The numerical value, or `None` if it is a missing value. 
+ pub value: Option, + pub format: Format, + pub show: Option, + pub honor_small: bool, + pub variable: Option, + pub value_label: Option, +} + +impl Serialize for NumberValue { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if self.format.type_() == Type::F && self.variable.is_none() && self.value_label.is_none() { + self.value.serialize(serializer) + } else { + let mut s = serializer.serialize_map(None)?; + s.serialize_entry("value", &self.value)?; + s.serialize_entry("format", &self.format)?; + if let Some(show) = self.show { + s.serialize_entry("show", &show)?; + } + if self.honor_small { + s.serialize_entry("honor_small", &self.honor_small)?; + } + if let Some(variable) = &self.variable { + s.serialize_entry("variable", variable)?; + } + if let Some(value_label) = &self.value_label { + s.serialize_entry("value_label", value_label)?; + } + s.end() + } + } +} + +impl NumberValue { + pub fn serialize_bare(&self, serializer: S) -> Result + where + S: Serializer, + { + if let Some(number) = self.value + && number.trunc() == number + && number >= -(1i64 << 53) as f64 + && number <= (1i64 << 53) as f64 + { + (number as u64).serialize(serializer) + } else { + self.value.serialize(serializer) + } + } +} + +#[derive(Serialize)] +pub struct BareNumberValue<'a>( + #[serde(serialize_with = "NumberValue::serialize_bare")] pub &'a NumberValue, +); + +#[derive(Clone, Debug, Serialize)] +pub struct StringValue { + /// The string value. + /// + /// If `hex` is true, this should contain hex digits, not raw binary data + /// (otherwise it would be impossible to encode non-UTF-8 data). + pub s: String, + + /// True if `s` is hex digits. 
+ pub hex: bool, + + pub show: Option, + + pub var_name: Option, + pub value_label: Option, +} + +#[derive(Clone, Debug, Serialize)] +pub struct VariableValue { + pub show: Option, + pub var_name: String, + pub variable_label: Option, +} + +#[derive(Clone, Debug)] +pub struct TextValue { + pub user_provided: bool, + /// Localized. + pub localized: String, + /// English. + pub c: Option, + /// Identifier. + pub id: Option, +} + +impl Serialize for TextValue { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if self.user_provided && self.c.is_none() && self.id.is_none() { + serializer.serialize_str(&self.localized) + } else { + let mut s = serializer.serialize_struct( + "TextValue", + 2 + self.c.is_some() as usize + self.id.is_some() as usize, + )?; + s.serialize_field("user_provided", &self.user_provided)?; + s.serialize_field("localized", &self.localized)?; + if let Some(c) = &self.c { + s.serialize_field("c", &c)?; + } + if let Some(id) = &self.id { + s.serialize_field("id", &id)?; + } + s.end() + } + } +} + +impl TextValue { + pub fn localized(&self) -> &str { + self.localized.as_str() + } + pub fn c(&self) -> &str { + self.c.as_ref().unwrap_or(&self.localized).as_str() + } + pub fn id(&self) -> &str { + self.id.as_ref().unwrap_or(&self.localized).as_str() + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct TemplateValue { + pub args: Vec>, + pub localized: String, + pub id: String, +} + +#[derive(Clone, Debug, Default, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ValueInner { + Number(NumberValue), + String(StringValue), + Variable(VariableValue), + Text(TextValue), + Template(TemplateValue), + + #[default] + Empty, +} + +impl ValueInner { + pub const fn is_empty(&self) -> bool { + matches!(self, Self::Empty) + } + fn show(&self) -> Option { + match self { + ValueInner::Number(NumberValue { show, .. }) + | ValueInner::String(StringValue { show, .. }) + | ValueInner::Variable(VariableValue { show, .. 
}) => *show, + _ => None, + } + } + + fn label(&self) -> Option<&str> { + self.value_label().or_else(|| self.variable_label()) + } + + fn value_label(&self) -> Option<&str> { + match self { + ValueInner::Number(NumberValue { value_label, .. }) + | ValueInner::String(StringValue { value_label, .. }) => { + value_label.as_ref().map(String::as_str) + } + _ => None, + } + } + + fn variable_label(&self) -> Option<&str> { + match self { + ValueInner::Variable(VariableValue { variable_label, .. }) => { + variable_label.as_ref().map(String::as_str) + } + _ => None, + } + } +} + +#[derive(Clone, Debug, Default)] +pub struct ValueStyle { + pub style: Option, + pub subscripts: Vec, + pub footnotes: Vec>, +} + +impl ValueStyle { + pub fn is_empty(&self) -> bool { + self.style.is_none() && self.subscripts.is_empty() && self.footnotes.is_empty() + } +} + +impl ValueInner { + // Returns an object that will format this value. Settings on `options` + // control whether variable and value labels are included. 
+ pub fn display(&self, options: impl IntoValueOptions) -> DisplayValue<'_> { + let options = options.into_value_options(); + let (show_value, show_label) = if let Some(value_label) = self.value_label() { + interpret_show( + || Settings::global().show_values, + options.show_values, + self.show(), + value_label, + ) + } else if let Some(variable_label) = self.variable_label() { + interpret_show( + || Settings::global().show_variables, + options.show_variables, + self.show(), + variable_label, + ) + } else { + (true, None) + }; + DisplayValue { + inner: self, + markup: false, + subscripts: &[], + footnotes: &[], + options, + show_value, + show_label, + } + } +} + +pub struct MetadataEntry { + pub name: Value, + pub value: MetadataValue, +} + +pub enum MetadataValue { + Leaf(Value), + Group(Vec), +} + +impl MetadataEntry { + pub fn into_pivot_table(self) -> PivotTable { + let mut data = Vec::new(); + let group = match self.visit(&mut data) { + Category::Group(group) => group, + Category::Leaf(leaf) => Group::new("Metadata").with(leaf).with_label_shown(), + }; + PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( + data.into_iter() + .enumerate() + .filter(|(_row, value)| !value.is_empty()) + .map(|(row, value)| ([row], value)), + ) + } + fn visit(self, data: &mut Vec) -> Category { + match self.value { + MetadataValue::Leaf(value) => { + data.push(value); + Leaf::new(self.name).into() + } + MetadataValue::Group(items) => Group::with_capacity(self.name, items.len()) + .with_multiple(items.into_iter().map(|item| item.visit(data))) + .into(), + } + } +} + +impl Serialize for MetadataValue { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + MetadataValue::Leaf(value) => value.serialize_bare(serializer), + MetadataValue::Group(items) => { + let mut map = serializer.serialize_map(Some(items.len()))?; + for item in items { + let name = item.name.display(()).to_string(); + map.serialize_entry(&name, &item.value)?; 
+ } + map.end() + } + } + } +} +impl Serialize for MetadataEntry { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match &self.value { + MetadataValue::Leaf(value) => { + let mut map = serializer.serialize_map(Some(1))?; + let name = self.name.display(()).to_string(); + map.serialize_entry(&name, &BareValue(value))?; + map.end() + } + MetadataValue::Group(items) => { + let mut map = serializer.serialize_map(Some(items.len()))?; + for item in items { + let name = item.name.display(()).to_string(); + map.serialize_entry(&name, &item.value)?; + } + map.end() + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::output::pivot::{Display26Adic, MetadataEntry, MetadataValue, Value}; + + #[test] + fn display_26adic() { + for (number, lowercase, uppercase) in [ + (0, "", ""), + (1, "a", "A"), + (2, "b", "B"), + (26, "z", "Z"), + (27, "aa", "AA"), + (28, "ab", "AB"), + (29, "ac", "AC"), + (18278, "zzz", "ZZZ"), + (18279, "aaaa", "AAAA"), + (19010, "abcd", "ABCD"), + ] { + assert_eq!(Display26Adic::new_lowercase(number).to_string(), lowercase); + assert_eq!(Display26Adic::new_uppercase(number).to_string(), uppercase); + } + } + + #[test] + fn metadata_entry() { + let tree = MetadataEntry { + name: Value::from("Group"), + value: MetadataValue::Group(vec![ + MetadataEntry { + name: Value::from("Name 1"), + value: MetadataValue::Leaf(Value::from("Value 1")), + }, + MetadataEntry { + name: Value::from("Subgroup 1"), + value: MetadataValue::Group(vec![ + MetadataEntry { + name: Value::from("Subname 1"), + value: MetadataValue::Leaf(Value::from("Subvalue 1")), + }, + MetadataEntry { + name: Value::from("Subname 2"), + value: MetadataValue::Leaf(Value::from("Subvalue 2")), + }, + MetadataEntry { + name: Value::from("Subname 3"), + value: MetadataValue::Leaf(Value::new_integer(Some(3.0))), + }, + ]), + }, + MetadataEntry { + name: Value::from("Name 2"), + value: MetadataValue::Leaf(Value::from("Value 2")), + }, + ]), + }; + assert_eq!( + 
serde_json::to_string_pretty(&tree).unwrap(), + r#"{ + "Name 1": "Value 1", + "Subgroup 1": { + "Subname 1": "Subvalue 1", + "Subname 2": "Subvalue 2", + "Subname 3": 3 + }, + "Name 2": "Value 2" +}"# + ); + + assert_eq!( + tree.into_pivot_table().to_string(), + r#"╭────────────────────┬──────────╮ +│ Name 1 │Value 1 │ +├────────────────────┼──────────┤ +│Subgroup 1 Subname 1│Subvalue 1│ +│ Subname 2│Subvalue 2│ +│ Subname 3│ 3│ +├────────────────────┼──────────┤ +│ Name 2 │Value 2 │ +╰────────────────────┴──────────╯ +"# + ); + } +} diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs deleted file mode 100644 index 92133e2c51..0000000000 --- a/rust/pspp/src/output/pivot/mod.rs +++ /dev/null @@ -1,2859 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . - -//! Pivot tables. -//! -//! Pivot tables are PSPP's primary form of output. They are analogous to the -//! pivot tables you might be familiar with from spreadsheets and databases. -//! See for a brief introduction to -//! the overall concept of a pivot table. -//! -//! In PSPP, the most important internal pieces of a pivot table are: -//! -//! - Title. Every pivot table has a title that is displayed above it. It also -//! has an optional caption (displayed below it) and corner text (displayed in -//! 
the upper left corner). -//! -//! - Dimensions. A dimension consists of zero or more categories. A category -//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The -//! categories are the leaves of a tree whose non-leaf nodes form groups of -//! categories. The tree always has a root group whose label is the name of -//! the dimension. -//! -//! - Axes. A table has three axes: column, row, and layer. Each dimension is -//! assigned to an axis, and each axis has zero or more dimensions. When an -//! axis has more than one dimension, they are ordered from innermost to -//! outermost. -//! -//! - Data. A table's data consists of zero or more cells. Each cell maps from -//! a category for each dimension to a value, which is commonly a number but -//! could also be a variable name or an arbitrary text string. - -use std::{ - collections::HashMap, - fmt::{Debug, Display, Write}, - io::Read, - iter::{once, repeat, repeat_n, FusedIterator}, - ops::{Index, IndexMut, Not, Range, RangeInclusive}, - str::{from_utf8, FromStr, Utf8Error}, - sync::{Arc, OnceLock}, -}; - -use binrw::Error as BinError; -use chrono::NaiveDateTime; -pub use color::ParseError as ParseColorError; -use color::{palette::css::TRANSPARENT, AlphaColor, Rgba8, Srgb}; -use enum_iterator::Sequence; -use enum_map::{enum_map, Enum, EnumMap}; -use look_xml::TableProperties; -use quick_xml::{de::from_str, DeError}; -use serde::{ - de::Visitor, - ser::{SerializeMap, SerializeStruct}, - Deserialize, Serialize, Serializer, -}; -use smallstr::SmallString; -use smallvec::SmallVec; -use thiserror::Error as ThisError; -use tlo::parse_tlo; - -use crate::{ - data::{ByteString, Datum, EncodedString, RawString}, - format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, - settings::{Settings, Show}, - util::ToSmallString, - variable::{VarType, Variable}, -}; - -pub mod output; - -mod look_xml; -#[cfg(test)] -pub mod test; -mod tlo; - -/// Areas of a pivot table for styling 
purposes. -#[derive(Copy, Clone, Debug, Default, Enum, PartialEq, Eq)] -pub enum Area { - Title, - Caption, - - /// Footnotes, - Footer, - - // Top-left corner. - Corner, - - /// Labels for columns ([Axis2::X]) and rows ([Axis2::Y]). - Labels(Axis2), - - #[default] - Data, - - /// Layer indication. - Layers, -} - -impl Display for Area { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Area::Title => write!(f, "title"), - Area::Caption => write!(f, "caption"), - Area::Footer => write!(f, "footer"), - Area::Corner => write!(f, "corner"), - Area::Labels(axis2) => write!(f, "labels({axis2})"), - Area::Data => write!(f, "data"), - Area::Layers => write!(f, "layers"), - } - } -} - -impl Serialize for Area { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - serializer.serialize_str(&self.to_small_string::<16>()) - } -} - -impl Area { - fn default_cell_style(self) -> CellStyle { - use HorzAlign::*; - use VertAlign::*; - let (horz_align, vert_align, hmargins, vmargins) = match self { - Area::Title => (Some(Center), Middle, [8, 11], [1, 8]), - Area::Caption => (Some(Left), Top, [8, 11], [1, 1]), - Area::Footer => (Some(Left), Top, [11, 8], [2, 3]), - Area::Corner => (Some(Left), Bottom, [8, 11], [1, 1]), - Area::Labels(Axis2::X) => (Some(Center), Top, [8, 11], [1, 3]), - Area::Labels(Axis2::Y) => (Some(Left), Top, [8, 11], [1, 3]), - Area::Data => (None, Top, [8, 11], [1, 1]), - Area::Layers => (Some(Left), Bottom, [8, 11], [1, 3]), - }; - CellStyle { - horz_align, - vert_align, - margins: enum_map! 
{ Axis2::X => hmargins, Axis2::Y => vmargins }, - } - } - - fn default_font_style(self) -> FontStyle { - FontStyle { - bold: self == Area::Title, - italic: false, - underline: false, - markup: false, - font: String::from("Sans Serif"), - fg: [Color::BLACK; 2], - bg: [Color::WHITE; 2], - size: 9, - } - } - - fn default_area_style(self) -> AreaStyle { - AreaStyle { - cell_style: self.default_cell_style(), - font_style: self.default_font_style(), - } - } -} - -/// Table borders for styling purposes. -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)] -pub enum Border { - Title, - OuterFrame(BoxBorder), - InnerFrame(BoxBorder), - Dimension(RowColBorder), - Category(RowColBorder), - DataLeft, - DataTop, -} - -impl Border { - pub fn default_stroke(self) -> Stroke { - match self { - Self::InnerFrame(_) | Self::DataLeft | Self::DataTop => Stroke::Thick, - Self::Dimension( - RowColBorder(HeadingRegion::Columns, _) | RowColBorder(_, Axis2::X), - ) - | Self::Category(RowColBorder(HeadingRegion::Columns, _)) => Stroke::Solid, - _ => Stroke::None, - } - } - pub fn default_border_style(self) -> BorderStyle { - BorderStyle { - stroke: self.default_stroke(), - color: Color::BLACK, - } - } - - fn fallback(self) -> Self { - match self { - Self::Title - | Self::OuterFrame(_) - | Self::InnerFrame(_) - | Self::DataLeft - | Self::DataTop - | Self::Category(_) => self, - Self::Dimension(row_col_border) => Self::Category(row_col_border), - } - } -} - -impl Display for Border { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Border::Title => write!(f, "title"), - Border::OuterFrame(box_border) => write!(f, "outer_frame({box_border})"), - Border::InnerFrame(box_border) => write!(f, "inner_frame({box_border})"), - Border::Dimension(row_col_border) => write!(f, "dimension({row_col_border})"), - Border::Category(row_col_border) => write!(f, "category({row_col_border})"), - Border::DataLeft => write!(f, "data(left)"), - Border::DataTop => write!(f, 
"data(top)"), - } - } -} - -impl Serialize for Border { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - serializer.serialize_str(&self.to_small_string::<32>()) - } -} - -/// The borders on a box. -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum BoxBorder { - Left, - Top, - Right, - Bottom, -} - -impl BoxBorder { - fn as_str(&self) -> &'static str { - match self { - BoxBorder::Left => "left", - BoxBorder::Top => "top", - BoxBorder::Right => "right", - BoxBorder::Bottom => "bottom", - } - } -} - -impl Display for BoxBorder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.as_str()) - } -} - -/// Borders between rows and columns. -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)] -#[serde(rename_all = "snake_case")] -pub struct RowColBorder( - /// Row or column headings. - pub HeadingRegion, - /// Horizontal ([Axis2::X]) or vertical ([Axis2::Y]) borders. - pub Axis2, -); - -impl Display for RowColBorder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}:{}", self.0, self.1) - } -} - -/// Sizing for rows or columns of a rendered table. -/// -/// The comments below talk about columns and their widths but they apply -/// equally to rows and their heights. -#[derive(Default, Clone, Debug, Serialize)] -pub struct Sizing { - /// Specific column widths, in 1/96" units. - widths: Vec, - - /// Specific page breaks: 0-based columns after which a page break must - /// occur, e.g. a value of 1 requests a break after the second column. - breaks: Vec, - - /// Keeps: columns to keep together on a page if possible. 
- keeps: Vec>, -} - -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Sequence, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum Axis3 { - X, - Y, - Z, -} - -impl Axis3 { - fn transpose(&self) -> Option { - match self { - Axis3::X => Some(Axis3::Y), - Axis3::Y => Some(Axis3::X), - Axis3::Z => None, - } - } -} - -impl From for Axis3 { - fn from(axis2: Axis2) -> Self { - match axis2 { - Axis2::X => Self::X, - Axis2::Y => Self::Y, - } - } -} - -/// An axis within a pivot table. -#[derive(Clone, Debug, Default, Serialize)] -pub struct Axis { - /// `dimensions[0]` is the innermost dimension. - pub dimensions: Vec, -} - -pub struct AxisIterator { - indexes: SmallVec<[usize; 4]>, - lengths: SmallVec<[usize; 4]>, - done: bool, -} - -impl FusedIterator for AxisIterator {} -impl Iterator for AxisIterator { - type Item = SmallVec<[usize; 4]>; - - fn next(&mut self) -> Option { - if self.done { - None - } else { - let retval = self.indexes.clone(); - for (index, len) in self.indexes.iter_mut().zip(self.lengths.iter().copied()) { - *index += 1; - if *index < len { - return Some(retval); - }; - *index = 0; - } - self.done = true; - Some(retval) - } - } -} - -impl PivotTable { - pub fn with_look(mut self, look: Arc) -> Self { - self.look = look; - self - } - pub fn insert_number(&mut self, data_indexes: &[usize], number: Option, class: Class) { - let format = match class { - Class::Other => Settings::global().default_format, - Class::Integer => Format::F40, - Class::Correlations => Format::F40_3, - Class::Significance => Format::F40_3, - Class::Percent => Format::PCT40_1, - Class::Residual => Format::F40_2, - Class::Count => Format::F40, // XXX - }; - let value = Value::new(ValueInner::Number(NumberValue { - show: None, - format, - honor_small: class == Class::Other, - value: number, - variable: None, - value_label: None, - })); - self.insert(data_indexes, value); - } - - pub fn with_footnotes(mut self, footnotes: Footnotes) -> Self { - 
debug_assert!(self.footnotes.is_empty()); - self.footnotes = footnotes; - self - } - fn axis_values(&self, axis: Axis3) -> AxisIterator { - AxisIterator { - indexes: repeat_n(0, self.axes[axis].dimensions.len()).collect(), - lengths: self.axis_dimensions(axis).map(|d| d.len()).collect(), - done: self.axis_extent(axis) == 0, - } - } - - fn axis_extent(&self, axis: Axis3) -> usize { - self.axis_dimensions(axis).map(|d| d.len()).product() - } -} - -/// Dimensions. -/// -/// A [Dimension] identifies the categories associated with a single dimension -/// within a multidimensional pivot table. -/// -/// A dimension contains a collection of categories, which are the leaves in a -/// tree of groups. -/// -/// (A dimension or a group can contain zero categories, but this is unusual. -/// If a dimension contains no categories, then its table cannot contain any -/// data.) -#[derive(Clone, Debug, Serialize)] -pub struct Dimension { - /// Hierarchy of categories within the dimension. The groups and categories - /// are sorted in the order that should be used for display. This might be - /// different from the original order produced for output if the user - /// adjusted it. - /// - /// The root must always be a group, although it is allowed to have no - /// subcategories. - pub root: Group, - - /// Ordering of leaves for presentation. - /// - /// This is a permutation of `0..n` where `n` is the number of leaves. It - /// maps from an index in presentation order to an index in data order. - pub presentation_order: Vec, - - /// Display. - pub hide_all_labels: bool, -} - -pub type GroupVec<'a> = SmallVec<[&'a Group; 4]>; -pub struct Path<'a> { - groups: GroupVec<'a>, - leaf: &'a Leaf, -} - -impl Dimension { - pub fn new(root: Group) -> Self { - Dimension { - presentation_order: (0..root.len()).collect(), - root, - hide_all_labels: false, - } - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Returns the number of (leaf) categories in this dimension. 
- pub fn len(&self) -> usize { - self.root.len() - } - - pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> { - self.root.nth_leaf(index) - } - - pub fn leaf_path(&self, index: usize) -> Option> { - self.root.leaf_path(index, SmallVec::new()) - } - - pub fn with_all_labels_hidden(self) -> Self { - Self { - hide_all_labels: true, - ..self - } - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct Group { - #[serde(skip)] - len: usize, - pub name: Box, - - /// The child categories. - /// - /// A group usually has multiple children, but it is allowed to have - /// only one or even (pathologically) none. - pub children: Vec, - - /// Whether to show the group's label. - pub show_label: bool, -} - -impl Group { - pub fn new(name: impl Into) -> Self { - Self::with_capacity(name, 0) - } - - pub fn with_capacity(name: impl Into, capacity: usize) -> Self { - Self { - len: 0, - name: Box::new(name.into()), - children: Vec::with_capacity(capacity), - show_label: false, - } - } - - pub fn push(&mut self, child: impl Into) { - let mut child = child.into(); - if let Category::Group(group) = &mut child { - group.show_label = true; - } - self.len += child.len(); - self.children.push(child); - } - - pub fn with(mut self, child: impl Into) -> Self { - self.push(child); - self - } - - pub fn with_multiple(mut self, children: impl IntoIterator) -> Self - where - C: Into, - { - self.extend(children); - self - } - - pub fn with_label_shown(self) -> Self { - self.with_show_label(true) - } - - pub fn with_show_label(mut self, show_label: bool) -> Self { - self.show_label = show_label; - self - } - - pub fn nth_leaf(&self, mut index: usize) -> Option<&Leaf> { - for child in &self.children { - let len = child.len(); - if index < len { - return child.nth_leaf(index); - } - index -= len; - } - None - } - - pub fn leaf_path<'a>(&'a self, mut index: usize, mut groups: GroupVec<'a>) -> Option> { - for child in &self.children { - let len = child.len(); - if index < len { - groups.push(self); 
- return child.leaf_path(index, groups); - } - index -= len; - } - None - } - - pub fn len(&self) -> usize { - self.len - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn name(&self) -> &Value { - &self.name - } -} - -impl Extend for Group -where - C: Into, -{ - fn extend>(&mut self, children: T) { - let children = children.into_iter(); - self.children.reserve(children.size_hint().0); - for child in children { - self.push(child); - } - } -} - -#[derive(Clone, Debug, Default, Serialize)] -pub struct Footnotes(pub Vec>); - -impl Footnotes { - pub fn new() -> Self { - Self::default() - } - - pub fn push(&mut self, footnote: Footnote) -> Arc { - let footnote = Arc::new(footnote.with_index(self.0.len())); - self.0.push(footnote.clone()); - footnote - } - - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } -} - -#[derive(Clone, Debug)] -pub struct Leaf { - name: Box, -} - -impl Leaf { - pub fn new(name: Value) -> Self { - Self { - name: Box::new(name), - } - } - pub fn name(&self) -> &Value { - &self.name - } -} - -impl Serialize for Leaf { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - self.name.serialize(serializer) - } -} - -/// Pivot result classes. -/// -/// These are used to mark [Leaf] categories as having particular types of data, -/// to set their numeric formats. -#[derive(Clone, Debug, PartialEq, Eq)] -pub enum Class { - Other, - Integer, - Correlations, - Significance, - Percent, - Residual, - Count, -} - -/// A pivot_category is a leaf (a category) or a group. 
-#[derive(Clone, Debug, Serialize)] -pub enum Category { - Group(Group), - Leaf(Leaf), -} - -impl Category { - pub fn name(&self) -> &Value { - match self { - Category::Group(group) => &group.name, - Category::Leaf(leaf) => &leaf.name, - } - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn len(&self) -> usize { - match self { - Category::Group(group) => group.len, - Category::Leaf(_) => 1, - } - } - - pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> { - match self { - Category::Group(group) => group.nth_leaf(index), - Category::Leaf(leaf) => { - if index == 0 { - Some(leaf) - } else { - None - } - } - } - } - - pub fn leaf_path<'a>(&'a self, index: usize, groups: GroupVec<'a>) -> Option> { - match self { - Category::Group(group) => group.leaf_path(index, groups), - Category::Leaf(leaf) => { - if index == 0 { - Some(Path { groups, leaf }) - } else { - None - } - } - } - } - - pub fn show_label(&self) -> bool { - match self { - Category::Group(group) => group.show_label, - Category::Leaf(_) => true, - } - } -} - -impl From for Category { - fn from(group: Group) -> Self { - Self::Group(group) - } -} - -impl From for Category { - fn from(group: Leaf) -> Self { - Self::Leaf(group) - } -} - -impl From for Category { - fn from(name: Value) -> Self { - Leaf::new(name).into() - } -} - -impl From<&Variable> for Category { - fn from(variable: &Variable) -> Self { - Value::new_variable(variable).into() - } -} - -impl From<&str> for Category { - fn from(name: &str) -> Self { - Self::Leaf(Leaf::new(Value::new_text(name))) - } -} - -impl From for Category { - fn from(name: String) -> Self { - Self::Leaf(Leaf::new(Value::new_text(name))) - } -} - -impl From<&String> for Category { - fn from(name: &String) -> Self { - Self::Leaf(Leaf::new(Value::new_text(name))) - } -} - -/// Styling for a pivot table. -/// -/// The division between this and the style information in [PivotTable] seems -/// fairly arbitrary. 
The ultimate reason for the division is simply because -/// that's how SPSS documentation and file formats do it. -#[derive(Clone, Debug, Serialize)] -pub struct Look { - pub name: Option, - - /// Whether to hide rows or columns whose cells are all empty. - pub hide_empty: bool, - - pub row_label_position: LabelPosition, - - /// Ranges of column widths in the two heading regions, in 1/96" units. - pub heading_widths: EnumMap>, - - /// Kind of markers to use for footnotes. - pub footnote_marker_type: FootnoteMarkerType, - - /// Where to put the footnote markers. - pub footnote_marker_position: FootnoteMarkerPosition, - - /// Styles for areas of the pivot table. - pub areas: EnumMap, - - /// Styles for borders in the pivot table. - pub borders: EnumMap, - - pub print_all_layers: bool, - - pub paginate_layers: bool, - - pub shrink_to_fit: EnumMap, - - pub top_continuation: bool, - - pub bottom_continuation: bool, - - pub continuation: Option, - - pub n_orphan_lines: usize, -} - -impl Look { - pub fn with_omit_empty(mut self, omit_empty: bool) -> Self { - self.hide_empty = omit_empty; - self - } - pub fn with_row_label_position(mut self, row_label_position: LabelPosition) -> Self { - self.row_label_position = row_label_position; - self - } - pub fn with_borders(mut self, borders: EnumMap) -> Self { - self.borders = borders; - self - } -} - -impl Default for Look { - fn default() -> Self { - Self { - name: None, - hide_empty: true, - row_label_position: LabelPosition::default(), - heading_widths: EnumMap::from_fn(|region| match region { - HeadingRegion::Rows => 36..=72, - HeadingRegion::Columns => 36..=120, - }), - footnote_marker_type: FootnoteMarkerType::default(), - footnote_marker_position: FootnoteMarkerPosition::default(), - areas: EnumMap::from_fn(Area::default_area_style), - borders: EnumMap::from_fn(Border::default_border_style), - print_all_layers: false, - paginate_layers: false, - shrink_to_fit: EnumMap::from_fn(|_| false), - top_continuation: false, - 
bottom_continuation: false, - continuation: None, - n_orphan_lines: 0, - } - } -} - -#[derive(ThisError, Debug)] -pub enum ParseLookError { - #[error(transparent)] - XmlError(#[from] DeError), - - #[error(transparent)] - Utf8Error(#[from] Utf8Error), - - #[error(transparent)] - BinError(#[from] BinError), - - #[error(transparent)] - IoError(#[from] std::io::Error), -} - -impl Look { - pub fn shared_default() -> Arc { - static LOOK: OnceLock> = OnceLock::new(); - LOOK.get_or_init(|| Arc::new(Look::default())).clone() - } - - pub fn from_xml(xml: &str) -> Result { - Ok(from_str::(xml) - .map_err(ParseLookError::from)? - .into()) - } - - pub fn from_binary(tlo: &[u8]) -> Result { - parse_tlo(tlo).map_err(ParseLookError::from) - } - - pub fn from_data(data: &[u8]) -> Result { - if data.starts_with(b"\xff\xff\0\0") { - Self::from_binary(data) - } else { - Self::from_xml(from_utf8(data).map_err(ParseLookError::from)?) - } - } - - pub fn from_reader(mut reader: R) -> Result - where - R: Read, - { - let mut buffer = Vec::new(); - reader - .read_to_end(&mut buffer) - .map_err(ParseLookError::from)?; - Self::from_data(&buffer) - } -} - -/// Position for group labels. -#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] -pub enum LabelPosition { - /// Hierarachically enclosing the categories. - /// - /// For column labels, group labels appear above the categories. For row - /// labels, group labels appear to the left of the categories. - /// - /// ```text - /// ┌────┬──────────────┐ ┌─────────┬──────────┐ - /// │ │ nested │ │ │ columns │ - /// │ ├────┬────┬────┤ ├──────┬──┼──────────┤ - /// │ │ a1 │ a2 │ a3 │ │ │a1│...data...│ - /// ├────┼────┼────┼────┤ │nested│a2│...data...│ - /// │ │data│data│data│ │ │a3│...data...│ - /// │ │ . │ . │ . │ └──────┴──┴──────────┘ - /// │rows│ . │ . │ . │ - /// │ │ . │ . │ . │ - /// └────┴────┴────┴────┘ - /// ``` - #[serde(rename = "nested")] - Nested, - - /// In the corner (row labels only). 
- /// - /// ```text - /// ┌──────┬──────────┐ - /// │corner│ columns │ - /// ├──────┼──────────┤ - /// │ a1│...data...│ - /// │ a2│...data...│ - /// │ a3│...data...│ - /// └──────┴──────────┘ - /// ``` - #[default] - #[serde(rename = "inCorner")] - Corner, -} - -/// The heading region of a rendered pivot table: -/// -/// ```text -/// ┌──────────────────┬─────────────────────────────────────────────────┐ -/// │ │ column headings │ -/// │ ├─────────────────────────────────────────────────┤ -/// │ corner │ │ -/// │ and │ │ -/// │ row headings │ data │ -/// │ │ │ -/// │ │ │ -/// └──────────────────┴─────────────────────────────────────────────────┘ -/// ``` -#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum HeadingRegion { - Rows, - Columns, -} - -impl HeadingRegion { - pub fn as_str(&self) -> &'static str { - match self { - HeadingRegion::Rows => "rows", - HeadingRegion::Columns => "columns", - } - } -} - -impl Display for HeadingRegion { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -impl From for HeadingRegion { - fn from(axis: Axis2) -> Self { - match axis { - Axis2::X => HeadingRegion::Columns, - Axis2::Y => HeadingRegion::Rows, - } - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct AreaStyle { - pub cell_style: CellStyle, - pub font_style: FontStyle, -} - -#[derive(Clone, Debug, Serialize)] -pub struct CellStyle { - /// `None` means "mixed" alignment: align strings to the left, numbers to - /// the right. - pub horz_align: Option, - pub vert_align: VertAlign, - - /// Margins in 1/96" units. - /// - /// `margins[Axis2::X][0]` is the left margin. - /// `margins[Axis2::X][1]` is the right margin. - /// `margins[Axis2::Y][0]` is the top margin. - /// `margins[Axis2::Y][1]` is the bottom margin. 
- pub margins: EnumMap, -} - -#[derive(Copy, Clone, Debug, PartialEq, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum HorzAlign { - /// Right aligned. - Right, - - /// Left aligned. - Left, - - /// Centered. - Center, - - /// Align the decimal point at the specified position. - Decimal { - /// Decimal offset from the right side of the cell, in 1/96" units. - offset: f64, - - /// Decimal character. - decimal: Decimal, - }, -} - -impl HorzAlign { - pub fn for_mixed(var_type: VarType) -> Self { - match var_type { - VarType::Numeric => Self::Right, - VarType::String => Self::Left, - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum VertAlign { - /// Top alignment. - Top, - - /// Centered, - Middle, - - /// Bottom alignment. - Bottom, -} - -#[derive(Clone, Debug, Serialize)] -pub struct FontStyle { - pub bold: bool, - pub italic: bool, - pub underline: bool, - pub markup: bool, - pub font: String, - - /// `fg[0]` is the usual foreground color. - /// - /// `fg[1]` is used only in [Area::Data] for odd-numbered rows. - pub fg: [Color; 2], - - /// `bg[0]` is the usual background color. - /// - /// `bg[1]` is used only in [Area::Data] for odd-numbered rows. - pub bg: [Color; 2], - - /// In 1/72" units. 
- pub size: i32, -} - -#[derive(Copy, Clone, PartialEq, Eq)] -pub struct Color { - pub alpha: u8, - pub r: u8, - pub g: u8, - pub b: u8, -} - -impl Color { - pub const BLACK: Color = Color::new(0, 0, 0); - pub const WHITE: Color = Color::new(255, 255, 255); - pub const RED: Color = Color::new(255, 0, 0); - pub const BLUE: Color = Color::new(0, 0, 255); - pub const TRANSPARENT: Color = Color::new(0, 0, 0).with_alpha(0); - - pub const fn new(r: u8, g: u8, b: u8) -> Self { - Self { - alpha: 255, - r, - g, - b, - } - } - - pub const fn with_alpha(self, alpha: u8) -> Self { - Self { alpha, ..self } - } - - pub const fn without_alpha(self) -> Self { - self.with_alpha(255) - } - - pub fn display_css(&self) -> DisplayCss { - DisplayCss(*self) - } -} - -impl Debug for Color { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.display_css()) - } -} - -impl From for Color { - fn from(Rgba8 { r, g, b, a }: Rgba8) -> Self { - Self::new(r, g, b).with_alpha(a) - } -} - -impl FromStr for Color { - type Err = ParseColorError; - - fn from_str(s: &str) -> Result { - fn is_bare_hex(s: &str) -> bool { - let s = s.trim(); - s.chars().count() == 6 && s.chars().all(|c| c.is_ascii_hexdigit()) - } - let color: AlphaColor = match s.parse() { - Err(ParseColorError::UnknownColorSyntax) if is_bare_hex(s) => { - ("#".to_owned() + s).parse() - } - Err(ParseColorError::UnknownColorSyntax) - if s.trim().eq_ignore_ascii_case("transparent") => - { - Ok(TRANSPARENT) - } - other => other, - }?; - Ok(color.to_rgba8().into()) - } -} - -impl Serialize for Color { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - serializer.serialize_str(&self.display_css().to_small_string::<32>()) - } -} - -impl<'de> Deserialize<'de> for Color { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct ColorVisitor; - - impl<'de> Visitor<'de> for ColorVisitor { - type Value = Color; - - fn expecting(&self, 
formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - formatter.write_str("\"#rrggbb\" or \"rrggbb\" or web color name") - } - - fn visit_borrowed_str(self, v: &'de str) -> Result - where - E: serde::de::Error, - { - v.parse().map_err(E::custom) - } - } - - deserializer.deserialize_str(ColorVisitor) - } -} - -pub struct DisplayCss(Color); - -impl Display for DisplayCss { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let Color { alpha, r, g, b } = self.0; - match alpha { - 255 => write!(f, "#{r:02x}{g:02x}{b:02x}"), - _ => write!(f, "rgb({r}, {g}, {b}, {:.2})", alpha as f64 / 255.0), - } - } -} - -#[derive(Copy, Clone, Debug, Deserialize)] -pub struct BorderStyle { - #[serde(rename = "@borderStyleType")] - pub stroke: Stroke, - - #[serde(rename = "@color")] - pub color: Color, -} - -impl Serialize for BorderStyle { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - let mut s = serializer.serialize_struct("BorderStyle", 2)?; - s.serialize_field("stroke", &self.stroke)?; - s.serialize_field("color", &self.color)?; - s.end() - } -} - -impl BorderStyle { - pub const fn none() -> Self { - Self { - stroke: Stroke::None, - color: Color::BLACK, - } - } - - pub fn is_none(&self) -> bool { - self.stroke.is_none() - } - - /// Returns a border style that "combines" the two arguments, that is, that - /// gives a reasonable choice for a rule for different reasons should have - /// both styles. 
- pub fn combine(self, other: BorderStyle) -> Self { - Self { - stroke: self.stroke.combine(other.stroke), - color: self.color, - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Enum, Deserialize, Serialize)] -#[serde(rename_all = "camelCase")] -pub enum Stroke { - None, - Solid, - Dashed, - Thick, - Thin, - Double, -} - -impl Stroke { - pub fn is_none(&self) -> bool { - self == &Self::None - } - - /// Returns a stroke that "combines" the two arguments, that is, that gives - /// a reasonable stroke choice for a rule for different reasons should have - /// both styles. - pub fn combine(self, other: Stroke) -> Self { - self.max(other) - } -} - -/// An axis of a 2-dimensional table. -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum Axis2 { - X, - Y, -} - -impl Axis2 { - pub fn new_enum(x: T, y: T) -> EnumMap { - EnumMap::from_array([x, y]) - } - - pub fn as_str(&self) -> &'static str { - match self { - Axis2::X => "x", - Axis2::Y => "y", - } - } -} - -impl Display for Axis2 { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -impl Not for Axis2 { - type Output = Self; - - fn not(self) -> Self::Output { - match self { - Self::X => Self::Y, - Self::Y => Self::X, - } - } -} - -/// A 2-dimensional `(x,y)` pair. -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)] -pub struct Coord2(pub EnumMap); - -impl Coord2 { - pub fn new(x: usize, y: usize) -> Self { - use Axis2::*; - Self(enum_map! 
{ - X => x, - Y => y - }) - } - - pub fn for_axis((a, az): (Axis2, usize), bz: usize) -> Self { - let mut coord = Self::default(); - coord[a] = az; - coord[!a] = bz; - coord - } - - pub fn from_fn(f: F) -> Self - where - F: FnMut(Axis2) -> usize, - { - Self(EnumMap::from_fn(f)) - } - - pub fn x(&self) -> usize { - self.0[Axis2::X] - } - - pub fn y(&self) -> usize { - self.0[Axis2::Y] - } - - pub fn get(&self, axis: Axis2) -> usize { - self.0[axis] - } -} - -impl From> for Coord2 { - fn from(value: EnumMap) -> Self { - Self(value) - } -} - -impl Index for Coord2 { - type Output = usize; - - fn index(&self, index: Axis2) -> &Self::Output { - &self.0[index] - } -} - -impl IndexMut for Coord2 { - fn index_mut(&mut self, index: Axis2) -> &mut Self::Output { - &mut self.0[index] - } -} - -#[derive(Clone, Debug, Default)] -pub struct Rect2(pub EnumMap>); - -impl Rect2 { - pub fn new(x_range: Range, y_range: Range) -> Self { - Self(enum_map! { - Axis2::X => x_range.clone(), - Axis2::Y => y_range.clone(), - }) - } - pub fn for_cell(cell: Coord2) -> Self { - Self::new(cell.x()..cell.x() + 1, cell.y()..cell.y() + 1) - } - pub fn for_ranges((a, a_range): (Axis2, Range), b_range: Range) -> Self { - let b = !a; - let mut ranges = EnumMap::default(); - ranges[a] = a_range; - ranges[b] = b_range; - Self(ranges) - } - pub fn top_left(&self) -> Coord2 { - use Axis2::*; - Coord2::new(self[X].start, self[Y].start) - } - pub fn from_fn(f: F) -> Self - where - F: FnMut(Axis2) -> Range, - { - Self(EnumMap::from_fn(f)) - } - pub fn translate(self, offset: Coord2) -> Rect2 { - Self::from_fn(|axis| self[axis].start + offset[axis]..self[axis].end + offset[axis]) - } - pub fn is_empty(&self) -> bool { - self[Axis2::X].is_empty() || self[Axis2::Y].is_empty() - } -} - -impl From>> for Rect2 { - fn from(value: EnumMap>) -> Self { - Self(value) - } -} - -impl Index for Rect2 { - type Output = Range; - - fn index(&self, index: Axis2) -> &Self::Output { - &self.0[index] - } -} - -impl IndexMut for 
Rect2 { - fn index_mut(&mut self, index: Axis2) -> &mut Self::Output { - &mut self.0[index] - } -} - -#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -pub enum FootnoteMarkerType { - /// a, b, c, ... - #[default] - Alphabetic, - - /// 1, 2, 3, ... - Numeric, -} - -#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -pub enum FootnoteMarkerPosition { - /// Subscripts. - #[default] - Subscript, - - /// Superscripts. - Superscript, -} - -#[derive(Copy, Clone, Debug)] -pub struct ValueOptions { - pub show_values: Option, - - pub show_variables: Option, - - pub small: f64, - - /// Where to put the footnote markers. - pub footnote_marker_type: FootnoteMarkerType, -} - -impl Default for ValueOptions { - fn default() -> Self { - Self { - show_values: None, - show_variables: None, - small: 0.0001, - footnote_marker_type: FootnoteMarkerType::default(), - } - } -} - -pub trait IntoValueOptions { - fn into_value_options(self) -> ValueOptions; -} - -impl IntoValueOptions for () { - fn into_value_options(self) -> ValueOptions { - ValueOptions::default() - } -} - -impl IntoValueOptions for &PivotTable { - fn into_value_options(self) -> ValueOptions { - self.value_options() - } -} - -impl IntoValueOptions for &ValueOptions { - fn into_value_options(self) -> ValueOptions { - *self - } -} - -impl IntoValueOptions for ValueOptions { - fn into_value_options(self) -> ValueOptions { - self - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct PivotTable { - pub look: Arc, - - pub rotate_inner_column_labels: bool, - - pub rotate_outer_row_labels: bool, - - pub show_grid_lines: bool, - - pub show_title: bool, - - pub show_caption: bool, - - pub show_values: Option, - - pub show_variables: Option, - - pub weight_format: Format, - - /// Current layer indexes, with `axes[Axis3::Z].dimensions.len()` elements. 
- /// `current_layer[i]` is an offset into - /// `axes[Axis3::Z].dimensions[i].data_leaves[]`, except that a dimension - /// can have zero leaves, in which case `current_layer[i]` is zero and - /// there's no corresponding leaf. - pub current_layer: Vec, - - /// Column and row sizing and page breaks. - pub sizing: EnumMap>>, - - /// Format settings. - pub settings: FormatSettings, - - /// Numeric grouping character (usually `.` or `,`). - pub grouping: Option, - - pub small: f64, - - pub command_local: Option, - pub command_c: Option, - pub language: Option, - pub locale: Option, - pub dataset: Option, - pub datafile: Option, - pub date: Option, - pub footnotes: Footnotes, - pub title: Option>, - pub subtype: Option>, - pub corner_text: Option>, - pub caption: Option>, - pub notes: Option, - pub dimensions: Vec, - pub axes: EnumMap, - pub cells: HashMap, -} - -impl PivotTable { - pub fn with_title(mut self, title: impl Into) -> Self { - self.title = Some(Box::new(title.into())); - self.show_title = true; - self - } - - pub fn with_caption(mut self, caption: impl Into) -> Self { - self.caption = Some(Box::new(caption.into())); - self.show_caption = true; - self - } - - pub fn with_corner_text(mut self, corner_text: impl Into) -> Self { - self.corner_text = Some(Box::new(corner_text.into())); - self - } - - pub fn with_subtype(self, subtype: impl Into) -> Self { - Self { - subtype: Some(Box::new(subtype.into())), - ..self - } - } - - pub fn with_show_title(mut self, show_title: bool) -> Self { - self.show_title = show_title; - self - } - - pub fn with_show_caption(mut self, show_caption: bool) -> Self { - self.show_caption = show_caption; - self - } - - pub fn with_layer(mut self, layer: &[usize]) -> Self { - debug_assert_eq!(layer.len(), self.current_layer.len()); - if self.look.print_all_layers { - self.look_mut().print_all_layers = false; - } - self.current_layer.clear(); - self.current_layer.extend_from_slice(layer); - self - } - - pub fn with_all_layers(mut 
self) -> Self { - if !self.look.print_all_layers { - self.look_mut().print_all_layers = true; - } - self - } - - pub fn look_mut(&mut self) -> &mut Look { - Arc::make_mut(&mut self.look) - } - - pub fn with_show_empty(mut self) -> Self { - if self.look.hide_empty { - self.look_mut().hide_empty = false; - } - self - } - - pub fn with_hide_empty(mut self) -> Self { - if !self.look.hide_empty { - self.look_mut().hide_empty = true; - } - self - } - - pub fn label(&self) -> String { - match &self.title { - Some(title) => title.display(self).to_string(), - None => String::from("Table"), - } - } - - pub fn title(&self) -> &Value { - match &self.title { - Some(title) => title, - None => { - static EMPTY: Value = Value::empty(); - &EMPTY - } - } - } - - pub fn subtype(&self) -> &Value { - match &self.subtype { - Some(subtype) => subtype, - None => { - static EMPTY: Value = Value::empty(); - &EMPTY - } - } - } -} - -impl Default for PivotTable { - fn default() -> Self { - Self { - look: Look::shared_default(), - rotate_inner_column_labels: false, - rotate_outer_row_labels: false, - show_grid_lines: false, - show_title: true, - show_caption: true, - show_values: None, - show_variables: None, - weight_format: Format::F40, - current_layer: Vec::new(), - sizing: EnumMap::default(), - settings: FormatSettings::default(), // XXX from settings - grouping: None, - small: 0.0001, // XXX from settings. - command_local: None, - command_c: None, // XXX from current command name. 
- language: None, - locale: None, - dataset: None, - datafile: None, - date: None, - footnotes: Footnotes::new(), - subtype: None, - title: None, - corner_text: None, - caption: None, - notes: None, - dimensions: Vec::new(), - axes: EnumMap::default(), - cells: HashMap::new(), - } - } -} - -fn cell_index(data_indexes: &[usize], dimensions: I) -> usize -where - I: ExactSizeIterator, -{ - debug_assert_eq!(data_indexes.len(), dimensions.len()); - let mut index = 0; - for (dimension, data_index) in dimensions.zip(data_indexes.iter()) { - debug_assert!(*data_index < dimension); - index = dimension * index + data_index; - } - index -} - -impl PivotTable { - pub fn new(axes_and_dimensions: impl IntoIterator) -> Self { - let mut dimensions = Vec::new(); - let mut axes = EnumMap::::default(); - for (axis, dimension) in axes_and_dimensions { - axes[axis].dimensions.push(dimensions.len()); - dimensions.push(dimension); - } - Self { - look: Settings::global().look.clone(), - current_layer: repeat_n(0, axes[Axis3::Z].dimensions.len()).collect(), - axes, - dimensions, - ..Self::default() - } - } - fn cell_index(&self, data_indexes: &[usize]) -> usize { - cell_index(data_indexes, self.dimensions.iter().map(|d| d.len())) - } - - pub fn insert(&mut self, data_indexes: &[usize], value: impl Into) { - self.cells - .insert(self.cell_index(data_indexes), value.into()); - } - - pub fn get(&self, data_indexes: &[usize]) -> Option<&Value> { - self.cells.get(&self.cell_index(data_indexes)) - } - - pub fn with_data(mut self, iter: impl IntoIterator) -> Self - where - I: AsRef<[usize]>, - { - self.extend(iter); - self - } - - /// Converts per-axis presentation-order indexes in `presentation_indexes`, - /// into data indexes for each dimension. 
- fn convert_indexes_ptod( - &self, - presentation_indexes: EnumMap, - ) -> SmallVec<[usize; 4]> { - let mut data_indexes = SmallVec::from_elem(0, self.dimensions.len()); - for (axis, presentation_indexes) in presentation_indexes { - for (&dim_index, &pindex) in self.axes[axis] - .dimensions - .iter() - .zip(presentation_indexes.iter()) - { - data_indexes[dim_index] = self.dimensions[dim_index].presentation_order[pindex]; - } - } - data_indexes - } - - /// Returns an iterator for the layer axis: - /// - /// - If `print` is true and `self.look.print_all_layers`, then the iterator - /// will visit all values of the layer axis. - /// - /// - Otherwise, the iterator will just visit `self.current_layer`. - pub fn layers(&self, print: bool) -> Box>> { - if print && self.look.print_all_layers { - Box::new(self.axis_values(Axis3::Z)) - } else { - Box::new(once(SmallVec::from_slice(&self.current_layer))) - } - } - - pub fn value_options(&self) -> ValueOptions { - ValueOptions { - show_values: self.show_values, - show_variables: self.show_variables, - small: self.small, - footnote_marker_type: self.look.footnote_marker_type, - } - } - - pub fn transpose(&mut self) { - self.axes.swap(Axis3::X, Axis3::Y); - } - - pub fn axis_dimensions( - &self, - axis: Axis3, - ) -> impl DoubleEndedIterator + ExactSizeIterator { - self.axes[axis] - .dimensions - .iter() - .copied() - .map(|index| &self.dimensions[index]) - } - - fn find_dimension(&self, dim_index: usize) -> Option<(Axis3, usize)> { - debug_assert!(dim_index < self.dimensions.len()); - for axis in enum_iterator::all::() { - for (position, dimension) in self.axes[axis].dimensions.iter().copied().enumerate() { - if dimension == dim_index { - return Some((axis, position)); - } - } - } - None - } - pub fn move_dimension(&mut self, dim_index: usize, new_axis: Axis3, new_position: usize) { - let (old_axis, old_position) = self.find_dimension(dim_index).unwrap(); - if old_axis == new_axis && old_position == new_position { - return; - 
} - - // Update the current layer, if necessary. If we're moving within the - // layer axis, preserve the current layer. - match (old_axis, new_axis) { - (Axis3::Z, Axis3::Z) => { - // Rearrange the layer axis. - if old_position < new_position { - self.current_layer[old_position..=new_position].rotate_left(1); - } else { - self.current_layer[new_position..=old_position].rotate_right(1); - } - } - (Axis3::Z, _) => { - // A layer is becoming a row or column. - self.current_layer.remove(old_position); - } - (_, Axis3::Z) => { - // A row or column is becoming a layer. - self.current_layer.insert(new_position, 0); - } - _ => (), - } - - self.axes[old_axis].dimensions.remove(old_position); - self.axes[new_axis] - .dimensions - .insert(new_position, dim_index); - } -} - -impl Extend<(I, Value)> for PivotTable -where - I: AsRef<[usize]>, -{ - fn extend>(&mut self, iter: T) { - for (data_indexes, value) in iter { - self.insert(data_indexes.as_ref(), value); - } - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct Footnote { - #[serde(skip)] - index: usize, - pub content: Box, - pub marker: Option>, - pub show: bool, -} - -impl Footnote { - pub fn new(content: impl Into) -> Self { - Self { - index: 0, - content: Box::new(content.into()), - marker: None, - show: true, - } - } - pub fn with_marker(mut self, marker: impl Into) -> Self { - self.marker = Some(Box::new(marker.into())); - self - } - - pub fn with_show(mut self, show: bool) -> Self { - self.show = show; - self - } - - pub fn with_index(mut self, index: usize) -> Self { - self.index = index; - self - } - - pub fn display_marker(&self, options: impl IntoValueOptions) -> DisplayMarker<'_> { - DisplayMarker { - footnote: self, - options: options.into_value_options(), - } - } - - pub fn display_content(&self, options: impl IntoValueOptions) -> DisplayValue<'_> { - self.content.display(options) - } - - pub fn index(&self) -> usize { - self.index - } -} - -pub struct DisplayMarker<'a> { - footnote: &'a Footnote, - 
options: ValueOptions, -} - -impl Display for DisplayMarker<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if let Some(marker) = &self.footnote.marker { - write!(f, "{}", marker.display(self.options).without_suffixes()) - } else { - let i = self.footnote.index + 1; - match self.options.footnote_marker_type { - FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic::new_lowercase(i)), - FootnoteMarkerType::Numeric => write!(f, "{i}"), - } - } - } -} - -/// Displays a number in 26adic notation. -/// -/// Zero is displayed as the empty string, 1 through 26 as `a` through `z`, 27 -/// through 52 as `aa` through `az`, and so on. -pub struct Display26Adic { - value: usize, - base: u8, -} - -impl Display26Adic { - /// Constructs a `Display26Adic` for `value`, with letters in lowercase. - pub fn new_lowercase(value: usize) -> Self { - Self { value, base: b'a' } - } - - /// Constructs a `Display26Adic` for `value`, with letters in uppercase. - pub fn new_uppercase(value: usize) -> Self { - Self { value, base: b'A' } - } -} - -impl Display for Display26Adic { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let mut output = SmallVec::<[u8; 16]>::new(); - let mut number = self.value; - while number > 0 { - number -= 1; - let digit = (number % 26) as u8; - output.push(digit + self.base); - number /= 26; - } - output.reverse(); - write!(f, "{}", from_utf8(&output).unwrap()) - } -} - -/// The content of a single pivot table cell. -/// -/// A [Value] is also a pivot table's title, caption, footnote marker and -/// contents, and so on. -/// -/// A given [Value] is one of: -/// -/// 1. A number resulting from a calculation. -/// -/// A number has an associated display format (usually [F] or [Pct]). This -/// format can be set directly, but that is not usually the easiest way. -/// Instead, it is usually true that all of the values in a single category -/// should have the same format (e.g. 
all "Significance" values might use -/// format `F40.3`), so PSPP makes it easy to set the default format for a -/// category while creating the category. See pivot_dimension_create() for -/// more details. -/// -/// [F]: crate::format::Type::F -/// [Pct]: crate::format::Type::Pct -/// -/// 2. A numeric or string value obtained from data ([ValueInner::Number] or -/// [ValueInner::String]). If such a value corresponds to a variable, then the -/// variable's name can be attached to the pivot_value. If the value has a -/// value label, then that can also be attached. When a label is present, -/// the user can control whether to show the value or the label or both. -/// -/// 3. A variable name ([ValueInner::Variable]). The variable label, if any, can -/// be attached too, and again the user can control whether to show the value -/// or the label or both. -/// -/// 4. A text string ([ValueInner::Text). The value stores the string in English -/// and translated into the output language (localized). Use -/// pivot_value_new_text() or pivot_value_new_text_format() for those cases. -/// In some cases, only an English or a localized version is available for -/// one reason or another, although this is regrettable; in those cases, use -/// pivot_value_new_user_text() or pivot_value_new_user_text_nocopy(). -/// -/// 5. A template. PSPP doesn't create these itself yet, but it can read and -/// interpret those created by SPSS. -#[derive(Clone, Default)] -pub struct Value { - pub inner: ValueInner, - pub styling: Option>, -} - -impl Serialize for Value { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - self.inner.serialize(serializer) - } -} - -/// Wrapper for [Value] that uses [Value::serialize_bare] for serialization. 
-#[derive(Serialize)] -struct BareValue<'a>(#[serde(serialize_with = "Value::serialize_bare")] pub &'a Value); - -impl Value { - pub fn serialize_bare(&self, serializer: S) -> Result - where - S: Serializer, - { - match &self.inner { - ValueInner::Number(number_value) => number_value.serialize_bare(serializer), - ValueInner::String(string_value) => string_value.s.serialize(serializer), - ValueInner::Variable(variable_value) => variable_value.var_name.serialize(serializer), - ValueInner::Text(text_value) => text_value.localized.serialize(serializer), - ValueInner::Template(template_value) => template_value.localized.serialize(serializer), - ValueInner::Empty => serializer.serialize_none(), - } - } - - fn new(inner: ValueInner) -> Self { - Self { - inner, - styling: None, - } - } - pub fn new_number_with_format(x: Option, format: Format) -> Self { - Self::new(ValueInner::Number(NumberValue { - show: None, - format, - honor_small: false, - value: x, - variable: None, - value_label: None, - })) - } - pub fn new_variable(variable: &Variable) -> Self { - Self::new(ValueInner::Variable(VariableValue { - show: None, - var_name: String::from(variable.name.as_str()), - variable_label: variable.label.clone(), - })) - } - pub fn new_datum(value: &Datum) -> Self - where - B: EncodedString, - { - match value { - Datum::Number(number) => Self::new_number(*number), - Datum::String(string) => Self::new_user_text(string.as_str()), - } - } - pub fn new_variable_value(variable: &Variable, value: &Datum) -> Self { - let var_name = Some(variable.name.as_str().into()); - let value_label = variable.value_labels.get(value).map(String::from); - match value { - Datum::Number(number) => Self::new(ValueInner::Number(NumberValue { - show: None, - format: match variable.print_format.var_type() { - VarType::Numeric => variable.print_format, - VarType::String => { - #[cfg(debug_assertions)] - panic!("cannot create numeric pivot value with string format"); - - #[cfg(not(debug_assertions))] - 
Format::F8_2 - } - }, - honor_small: false, - value: *number, - variable: var_name, - value_label, - })), - Datum::String(string) => Self::new(ValueInner::String(StringValue { - show: None, - hex: variable.print_format.type_() == Type::AHex, - s: string - .as_ref() - .with_encoding(variable.encoding()) - .into_string(), - var_name, - value_label, - })), - } - } - pub fn new_number(x: Option) -> Self { - Self::new_number_with_format(x, Format::F8_2) - } - pub fn new_integer(x: Option) -> Self { - Self::new_number_with_format(x, Format::F40) - } - pub fn new_text(s: impl Into) -> Self { - Self::new_user_text(s) - } - pub fn new_user_text(s: impl Into) -> Self { - let s: String = s.into(); - if s.is_empty() { - Self::default() - } else { - Self::new(ValueInner::Text(TextValue { - user_provided: true, - localized: s.clone(), - c: None, - id: None, - })) - } - } - pub fn with_footnote(mut self, footnote: &Arc) -> Self { - self.add_footnote(footnote); - self - } - pub fn add_footnote(&mut self, footnote: &Arc) { - let footnotes = &mut self.styling.get_or_insert_default().footnotes; - footnotes.push(footnote.clone()); - footnotes.sort_by_key(|f| f.index); - } - pub fn with_show_value_label(mut self, show: Option) -> Self { - let new_show = show; - match &mut self.inner { - ValueInner::Number(NumberValue { show, .. }) - | ValueInner::String(StringValue { show, .. }) => { - *show = new_show; - } - _ => (), - } - self - } - pub fn with_show_variable_label(mut self, show: Option) -> Self { - if let ValueInner::Variable(variable_value) = &mut self.inner { - variable_value.show = show; - } - self - } - pub fn with_value_label(mut self, label: Option) -> Self { - match &mut self.inner { - ValueInner::Number(NumberValue { value_label, .. }) - | ValueInner::String(StringValue { value_label, .. 
}) => *value_label = label.clone(), - _ => (), - } - self - } - pub const fn empty() -> Self { - Value { - inner: ValueInner::Empty, - styling: None, - } - } - pub const fn is_empty(&self) -> bool { - self.inner.is_empty() && self.styling.is_none() - } -} - -impl From<&str> for Value { - fn from(value: &str) -> Self { - Self::new_text(value) - } -} - -impl From for Value { - fn from(value: String) -> Self { - Self::new_text(value) - } -} - -impl From<&Variable> for Value { - fn from(variable: &Variable) -> Self { - Self::new_variable(variable) - } -} - -pub struct DisplayValue<'a> { - inner: &'a ValueInner, - markup: bool, - subscripts: &'a [String], - footnotes: &'a [Arc], - options: ValueOptions, - show_value: bool, - show_label: Option<&'a str>, -} - -impl<'a> DisplayValue<'a> { - pub fn subscripts(&self) -> impl Iterator { - self.subscripts.iter().map(String::as_str) - } - - pub fn has_subscripts(&self) -> bool { - !self.subscripts.is_empty() - } - - pub fn footnotes(&self) -> impl Iterator> { - self.footnotes - .iter() - .filter(|f| f.show) - .map(|f| f.display_marker(self.options)) - } - - pub fn has_footnotes(&self) -> bool { - self.footnotes().next().is_some() - } - - pub fn without_suffixes(self) -> Self { - Self { - subscripts: &[], - footnotes: &[], - ..self - } - } - - /// Returns this display split into `(body, suffixes)` where `suffixes` is - /// subscripts and footnotes and `body` is everything else. 
- pub fn split_suffixes(self) -> (Self, Self) { - let suffixes = Self { - inner: &ValueInner::Empty, - ..self - }; - (self.without_suffixes(), suffixes) - } - - pub fn with_styling(mut self, styling: &'a ValueStyle) -> Self { - if let Some(area_style) = &styling.style { - self.markup = area_style.font_style.markup; - } - self.subscripts = styling.subscripts.as_slice(); - self.footnotes = styling.footnotes.as_slice(); - self - } - - pub fn with_font_style(self, font_style: &FontStyle) -> Self { - Self { - markup: font_style.markup, - ..self - } - } - - pub fn with_subscripts(self, subscripts: &'a [String]) -> Self { - Self { subscripts, ..self } - } - - pub fn with_footnotes(self, footnotes: &'a [Arc]) -> Self { - Self { footnotes, ..self } - } - - pub fn is_empty(&self) -> bool { - self.inner.is_empty() && self.subscripts.is_empty() && self.footnotes.is_empty() - } - - fn small(&self) -> f64 { - self.options.small - } - - pub fn var_type(&self) -> VarType { - match self.inner { - ValueInner::Number(NumberValue { .. 
}) if self.show_label.is_none() => VarType::Numeric, - _ => VarType::String, - } - } - - fn template( - &self, - f: &mut std::fmt::Formatter<'_>, - template: &str, - args: &[Vec], - ) -> std::fmt::Result { - let mut iter = template.as_bytes().iter(); - while let Some(c) = iter.next() { - match c { - b'\\' => { - let c = *iter.next().unwrap_or(&b'\\') as char; - let c = if c == 'n' { '\n' } else { c }; - write!(f, "{c}")?; - } - b'^' => { - let (index, rest) = consume_int(iter.as_slice()); - iter = rest.iter(); - let Some(arg) = args.get(index.wrapping_sub(1)) else { - continue; - }; - if let Some(arg) = arg.first() { - write!(f, "{}", arg.display(self.options))?; - } - } - b'[' => { - let (a, rest) = extract_inner_template(iter.as_slice()); - let (b, rest) = extract_inner_template(rest); - let rest = rest.strip_prefix(b"]").unwrap_or(rest); - let (index, rest) = consume_int(rest); - iter = rest.iter(); - - let Some(mut args) = args.get(index.wrapping_sub(1)).map(|vec| vec.as_slice()) - else { - continue; - }; - let (mut template, mut escape) = - if !a.is_empty() { (a, b'%') } else { (b, b'^') }; - while !args.is_empty() { - let n_consumed = self.inner_template(f, template, escape, args)?; - if n_consumed == 0 { - break; - } - args = &args[n_consumed..]; - - template = b; - escape = b'^'; - } - } - c => write!(f, "{c}")?, - } - } - Ok(()) - } - - fn inner_template( - &self, - f: &mut std::fmt::Formatter<'_>, - template: &[u8], - escape: u8, - args: &[Value], - ) -> Result { - let mut iter = template.iter(); - let mut args_consumed = 0; - while let Some(c) = iter.next() { - match c { - b'\\' => { - let c = *iter.next().unwrap_or(&b'\\') as char; - let c = if c == 'n' { '\n' } else { c }; - write!(f, "{c}")?; - } - c if *c == escape => { - let (index, rest) = consume_int(iter.as_slice()); - iter = rest.iter(); - let Some(arg) = args.get(index.wrapping_sub(1)) else { - continue; - }; - args_consumed = args_consumed.max(index); - write!(f, "{}", 
arg.display(self.options))?; - } - c => write!(f, "{c}")?, - } - } - Ok(args_consumed) - } -} - -fn consume_int(input: &[u8]) -> (usize, &[u8]) { - let mut n = 0; - for (index, c) in input.iter().enumerate() { - if !c.is_ascii_digit() { - return (n, &input[index..]); - } - n = n * 10 + (c - b'0') as usize; - } - (n, &[]) -} - -fn extract_inner_template(input: &[u8]) -> (&[u8], &[u8]) { - for (index, c) in input.iter().copied().enumerate() { - if c == b':' && (index == 0 || input[index - 1] != b'\\') { - return input.split_at(index); - } - } - (input, &[]) -} - -fn interpret_show( - global_show: impl Fn() -> Show, - table_show: Option, - value_show: Option, - label: &str, -) -> (bool, Option<&str>) { - match value_show.or(table_show).unwrap_or_else(global_show) { - Show::Value => (true, None), - Show::Label => (false, Some(label)), - Show::Both => (true, Some(label)), - } -} - -impl Display for DisplayValue<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self.inner { - ValueInner::Number(NumberValue { - format, - honor_small, - value, - .. - }) => { - if self.show_value { - let format = if format.type_() == Type::F - && *honor_small - && value.is_some_and(|value| value != 0.0 && value.abs() < self.small()) - { - UncheckedFormat::new(Type::E, 40, format.d() as u8).fix() - } else { - *format - }; - let mut buf = SmallString::<[u8; 40]>::new(); - write!( - &mut buf, - "{}", - Datum::<&str>::Number(*value).display(format) - ) - .unwrap(); - write!(f, "{}", buf.trim_start_matches(' '))?; - } - if let Some(label) = self.show_label { - if self.show_value { - write!(f, " ")?; - } - f.write_str(label)?; - } - Ok(()) - } - - ValueInner::String(StringValue { s, .. }) - | ValueInner::Variable(VariableValue { var_name: s, .. 
}) => { - match (self.show_value, self.show_label) { - (true, None) => write!(f, "{s}"), - (false, Some(label)) => write!(f, "{label}"), - (true, Some(label)) => write!(f, "{s} {label}"), - (false, None) => unreachable!(), - } - } - - ValueInner::Text(TextValue { - localized: local, .. - }) => { - /* - if self - .inner - .styling - .as_ref() - .is_some_and(|styling| styling.style.font_style.markup) - { - todo!(); - }*/ - f.write_str(local) - } - - ValueInner::Template(TemplateValue { - args, - localized: local, - .. - }) => self.template(f, local, args), - - ValueInner::Empty => Ok(()), - }?; - - for (subscript, delimiter) in self.subscripts.iter().zip(once('_').chain(repeat(','))) { - write!(f, "{delimiter}{subscript}")?; - } - - for footnote in self.footnotes { - write!(f, "[{}]", footnote.display_marker(self.options))?; - } - - Ok(()) - } -} - -impl Value { - // Returns an object that will format this value, including subscripts and - // superscripts and footnotes. `options` controls whether variable and - // value labels are included. - pub fn display(&self, options: impl IntoValueOptions) -> DisplayValue<'_> { - let display = self.inner.display(options.into_value_options()); - match &self.styling { - Some(styling) => display.with_styling(styling), - None => display, - } - } -} - -impl Debug for Value { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.display(()).to_string()) - } -} - -#[derive(Clone, Debug)] -pub struct NumberValue { - /// The numerical value, or `None` if it is a missing value. 
- pub value: Option, - pub format: Format, - pub show: Option, - pub honor_small: bool, - pub variable: Option, - pub value_label: Option, -} - -impl Serialize for NumberValue { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if self.format.type_() == Type::F && self.variable.is_none() && self.value_label.is_none() { - self.value.serialize(serializer) - } else { - let mut s = serializer.serialize_map(None)?; - s.serialize_entry("value", &self.value)?; - s.serialize_entry("format", &self.format)?; - if let Some(show) = self.show { - s.serialize_entry("show", &show)?; - } - if self.honor_small { - s.serialize_entry("honor_small", &self.honor_small)?; - } - if let Some(variable) = &self.variable { - s.serialize_entry("variable", variable)?; - } - if let Some(value_label) = &self.value_label { - s.serialize_entry("value_label", value_label)?; - } - s.end() - } - } -} - -impl NumberValue { - pub fn serialize_bare(&self, serializer: S) -> Result - where - S: Serializer, - { - if let Some(number) = self.value - && number.trunc() == number - && number >= -(1i64 << 53) as f64 - && number <= (1i64 << 53) as f64 - { - (number as u64).serialize(serializer) - } else { - self.value.serialize(serializer) - } - } -} - -#[derive(Serialize)] -pub struct BareNumberValue<'a>( - #[serde(serialize_with = "NumberValue::serialize_bare")] pub &'a NumberValue, -); - -#[derive(Clone, Debug, Serialize)] -pub struct StringValue { - /// The string value. - /// - /// If `hex` is true, this should contain hex digits, not raw binary data - /// (otherwise it would be impossible to encode non-UTF-8 data). - pub s: String, - - /// True if `s` is hex digits. 
- pub hex: bool, - - pub show: Option, - - pub var_name: Option, - pub value_label: Option, -} - -#[derive(Clone, Debug, Serialize)] -pub struct VariableValue { - pub show: Option, - pub var_name: String, - pub variable_label: Option, -} - -#[derive(Clone, Debug)] -pub struct TextValue { - pub user_provided: bool, - /// Localized. - pub localized: String, - /// English. - pub c: Option, - /// Identifier. - pub id: Option, -} - -impl Serialize for TextValue { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if self.user_provided && self.c.is_none() && self.id.is_none() { - serializer.serialize_str(&self.localized) - } else { - let mut s = serializer.serialize_struct( - "TextValue", - 2 + self.c.is_some() as usize + self.id.is_some() as usize, - )?; - s.serialize_field("user_provided", &self.user_provided)?; - s.serialize_field("localized", &self.localized)?; - if let Some(c) = &self.c { - s.serialize_field("c", &c)?; - } - if let Some(id) = &self.id { - s.serialize_field("id", &id)?; - } - s.end() - } - } -} - -impl TextValue { - pub fn localized(&self) -> &str { - self.localized.as_str() - } - pub fn c(&self) -> &str { - self.c.as_ref().unwrap_or(&self.localized).as_str() - } - pub fn id(&self) -> &str { - self.id.as_ref().unwrap_or(&self.localized).as_str() - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct TemplateValue { - pub args: Vec>, - pub localized: String, - pub id: String, -} - -#[derive(Clone, Debug, Default, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum ValueInner { - Number(NumberValue), - String(StringValue), - Variable(VariableValue), - Text(TextValue), - Template(TemplateValue), - - #[default] - Empty, -} - -impl ValueInner { - pub const fn is_empty(&self) -> bool { - matches!(self, Self::Empty) - } - fn show(&self) -> Option { - match self { - ValueInner::Number(NumberValue { show, .. }) - | ValueInner::String(StringValue { show, .. }) - | ValueInner::Variable(VariableValue { show, .. 
}) => *show, - _ => None, - } - } - - fn label(&self) -> Option<&str> { - self.value_label().or_else(|| self.variable_label()) - } - - fn value_label(&self) -> Option<&str> { - match self { - ValueInner::Number(NumberValue { value_label, .. }) - | ValueInner::String(StringValue { value_label, .. }) => { - value_label.as_ref().map(String::as_str) - } - _ => None, - } - } - - fn variable_label(&self) -> Option<&str> { - match self { - ValueInner::Variable(VariableValue { variable_label, .. }) => { - variable_label.as_ref().map(String::as_str) - } - _ => None, - } - } -} - -#[derive(Clone, Debug, Default)] -pub struct ValueStyle { - pub style: Option, - pub subscripts: Vec, - pub footnotes: Vec>, -} - -impl ValueStyle { - pub fn is_empty(&self) -> bool { - self.style.is_none() && self.subscripts.is_empty() && self.footnotes.is_empty() - } -} - -impl ValueInner { - // Returns an object that will format this value. Settings on `options` - // control whether variable and value labels are included. 
- pub fn display(&self, options: impl IntoValueOptions) -> DisplayValue<'_> { - let options = options.into_value_options(); - let (show_value, show_label) = if let Some(value_label) = self.value_label() { - interpret_show( - || Settings::global().show_values, - options.show_values, - self.show(), - value_label, - ) - } else if let Some(variable_label) = self.variable_label() { - interpret_show( - || Settings::global().show_variables, - options.show_variables, - self.show(), - variable_label, - ) - } else { - (true, None) - }; - DisplayValue { - inner: self, - markup: false, - subscripts: &[], - footnotes: &[], - options, - show_value, - show_label, - } - } -} - -pub struct MetadataEntry { - pub name: Value, - pub value: MetadataValue, -} - -pub enum MetadataValue { - Leaf(Value), - Group(Vec), -} - -impl MetadataEntry { - pub fn into_pivot_table(self) -> PivotTable { - let mut data = Vec::new(); - let group = match self.visit(&mut data) { - Category::Group(group) => group, - Category::Leaf(leaf) => Group::new("Metadata").with(leaf).with_label_shown(), - }; - PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( - data.into_iter() - .enumerate() - .filter(|(_row, value)| !value.is_empty()) - .map(|(row, value)| ([row], value)), - ) - } - fn visit(self, data: &mut Vec) -> Category { - match self.value { - MetadataValue::Leaf(value) => { - data.push(value); - Leaf::new(self.name).into() - } - MetadataValue::Group(items) => Group::with_capacity(self.name, items.len()) - .with_multiple(items.into_iter().map(|item| item.visit(data))) - .into(), - } - } -} - -impl Serialize for MetadataValue { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - match self { - MetadataValue::Leaf(value) => value.serialize_bare(serializer), - MetadataValue::Group(items) => { - let mut map = serializer.serialize_map(Some(items.len()))?; - for item in items { - let name = item.name.display(()).to_string(); - map.serialize_entry(&name, &item.value)?; 
- } - map.end() - } - } - } -} -impl Serialize for MetadataEntry { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - match &self.value { - MetadataValue::Leaf(value) => { - let mut map = serializer.serialize_map(Some(1))?; - let name = self.name.display(()).to_string(); - map.serialize_entry(&name, &BareValue(value))?; - map.end() - } - MetadataValue::Group(items) => { - let mut map = serializer.serialize_map(Some(items.len()))?; - for item in items { - let name = item.name.display(()).to_string(); - map.serialize_entry(&name, &item.value)?; - } - map.end() - } - } - } -} - -#[cfg(test)] -mod tests { - use crate::output::pivot::{Display26Adic, MetadataEntry, MetadataValue, Value}; - - #[test] - fn display_26adic() { - for (number, lowercase, uppercase) in [ - (0, "", ""), - (1, "a", "A"), - (2, "b", "B"), - (26, "z", "Z"), - (27, "aa", "AA"), - (28, "ab", "AB"), - (29, "ac", "AC"), - (18278, "zzz", "ZZZ"), - (18279, "aaaa", "AAAA"), - (19010, "abcd", "ABCD"), - ] { - assert_eq!(Display26Adic::new_lowercase(number).to_string(), lowercase); - assert_eq!(Display26Adic::new_uppercase(number).to_string(), uppercase); - } - } - - #[test] - fn metadata_entry() { - let tree = MetadataEntry { - name: Value::from("Group"), - value: MetadataValue::Group(vec![ - MetadataEntry { - name: Value::from("Name 1"), - value: MetadataValue::Leaf(Value::from("Value 1")), - }, - MetadataEntry { - name: Value::from("Subgroup 1"), - value: MetadataValue::Group(vec![ - MetadataEntry { - name: Value::from("Subname 1"), - value: MetadataValue::Leaf(Value::from("Subvalue 1")), - }, - MetadataEntry { - name: Value::from("Subname 2"), - value: MetadataValue::Leaf(Value::from("Subvalue 2")), - }, - MetadataEntry { - name: Value::from("Subname 3"), - value: MetadataValue::Leaf(Value::new_integer(Some(3.0))), - }, - ]), - }, - MetadataEntry { - name: Value::from("Name 2"), - value: MetadataValue::Leaf(Value::from("Value 2")), - }, - ]), - }; - assert_eq!( - 
serde_json::to_string_pretty(&tree).unwrap(), - r#"{ - "Name 1": "Value 1", - "Subgroup 1": { - "Subname 1": "Subvalue 1", - "Subname 2": "Subvalue 2", - "Subname 3": 3 - }, - "Name 2": "Value 2" -}"# - ); - - assert_eq!( - tree.into_pivot_table().to_string(), - r#"╭────────────────────┬──────────╮ -│ Name 1 │Value 1 │ -├────────────────────┼──────────┤ -│Subgroup 1 Subname 1│Subvalue 1│ -│ Subname 2│Subvalue 2│ -│ Subname 3│ 3│ -├────────────────────┼──────────┤ -│ Name 2 │Value 2 │ -╰────────────────────┴──────────╯ -"# - ); - } -} diff --git a/rust/pspp/src/sys.rs b/rust/pspp/src/sys.rs new file mode 100644 index 0000000000..4f59614100 --- /dev/null +++ b/rust/pspp/src/sys.rs @@ -0,0 +1,55 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Reading and writing system files. +//! +//! This module enables reading and writing "system files", the binary format +//! for SPSS data files. The system file format dates back 40+ years and has +//! evolved greatly over that time to support new features, but in a way to +//! facilitate interchange between even the oldest and newest versions of +//! software. +//! +//! Use [ReadOptions] to read a system file in the simplest way. +//! Use [WriteOptions] to write a system file. 
+ +// Warn about missing docs, but not for items declared with `#[cfg(test)]`. +#![cfg_attr(not(test), warn(missing_docs))] + +mod cooked; +use binrw::Endian; +pub use cooked::*; +pub mod encoding; +pub mod raw; + +#[cfg(test)] +pub mod sack; + +mod write; +use serde::Serializer; +pub use write::{SystemFileVersion, WriteOptions, Writer}; + +#[cfg(test)] +mod test; + +fn serialize_endian(endian: &Endian, serializer: S) -> Result +where + S: Serializer, +{ + match endian { + Endian::Big => serializer.serialize_unit_variant("Endian", 0, "Big"), + Endian::Little => serializer.serialize_unit_variant("Endian", 1, "Little"), + } +} diff --git a/rust/pspp/src/sys/mod.rs b/rust/pspp/src/sys/mod.rs deleted file mode 100644 index 4f59614100..0000000000 --- a/rust/pspp/src/sys/mod.rs +++ /dev/null @@ -1,55 +0,0 @@ -// PSPP - a program for statistical analysis. -// Copyright (C) 2025 Free Software Foundation, Inc. -// -// This program is free software: you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free Software -// Foundation, either version 3 of the License, or (at your option) any later -// version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -// details. -// -// You should have received a copy of the GNU General Public License along with -// this program. If not, see . - -//! Reading and writing system files. -//! -//! This module enables reading and writing "system files", the binary format -//! for SPSS data files. The system file format dates back 40+ years and has -//! evolved greatly over that time to support new features, but in a way to -//! facilitate interchange between even the oldest and newest versions of -//! software. -//! -//! Use [ReadOptions] to read a system file in the simplest way. -//! 
Use [WriteOptions] to write a system file. - -// Warn about missing docs, but not for items declared with `#[cfg(test)]`. -#![cfg_attr(not(test), warn(missing_docs))] - -mod cooked; -use binrw::Endian; -pub use cooked::*; -pub mod encoding; -pub mod raw; - -#[cfg(test)] -pub mod sack; - -mod write; -use serde::Serializer; -pub use write::{SystemFileVersion, WriteOptions, Writer}; - -#[cfg(test)] -mod test; - -fn serialize_endian(endian: &Endian, serializer: S) -> Result -where - S: Serializer, -{ - match endian { - Endian::Big => serializer.serialize_unit_variant("Endian", 0, "Big"), - Endian::Little => serializer.serialize_unit_variant("Endian", 1, "Little"), - } -}