--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+#![allow(dead_code)]
+use std::{
+ fmt::{Debug, Write},
+ ops::RangeFrom,
+ sync::OnceLock,
+};
+
+use crosstabs::crosstabs_command;
+use ctables::ctables_command;
+use data_list::data_list_command;
+use descriptives::descriptives_command;
+use either::Either;
+use flagset::{flags, FlagSet};
+use pspp_derive::FromTokens;
+
+use crate::{
+ format::AbstractFormat,
+ identifier::Identifier,
+ integer::ToInteger,
+ lex::{
+ command_name::CommandMatcher,
+ lexer::{LexToken, TokenSlice},
+ Punct, Token,
+ },
+ message::{Diagnostic, Diagnostics},
+};
+
+pub mod crosstabs;
+pub mod ctables;
+pub mod data_list;
+pub mod descriptives;
+
+// Parser states.  Declared through `flags!` so that individual states can be
+// combined into a `FlagSet<State>`, as `Command::allowed_states` does.
+flags! {
+ enum State: u8 {
+ /// No active dataset yet defined.
+ Initial,
+
+ /// Active dataset has been defined.
+ Data,
+
+ /// Inside `INPUT PROGRAM`.
+ InputProgram,
+
+ /// Inside `FILE TYPE`.
+ FileType,
+
+ /// State nested inside `LOOP` or `DO IF`, inside [State::Data].
+ NestedData,
+
+ /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram].
+ NestedInputProgram,
+ }
+}
+
+/// A command that the parser can recognize by name and then run.
+struct Command {
+ /// Set of [State]s associated with the command.
+ // NOTE(review): presumably the states in which the command is allowed;
+ // nothing in the visible code checks it yet -- confirm.
+ allowed_states: FlagSet<State>,
+ // NOTE(review): the three flags below are stored but never read in the
+ // visible code; their exact meaning should be confirmed against the
+ // command matcher / original C implementation.
+ enhanced_only: bool,
+ testing_only: bool,
+ no_abbrev: bool,
+ /// Command name; `find_best_match` registers it with `CommandMatcher`.
+ name: &'static str,
+ /// Callback that `parse_in_state` invokes with a [Context] whose lexer
+ /// starts after the tokens attributed to the command name.
+ run: Box<dyn Fn(&mut Context) + Send + Sync>, //-> Box<dyn ParsedCommand> + Send + Sync>,
+}
+
+/// Why a parse attempt failed.
+#[derive(Debug)]
+enum ParseError {
+ /// Failure that the combinators in this file hand straight back to their
+ /// caller (see the `Option`, `Either`, `Punctuated`, and `Seq*` parsers).
+ Error(Diagnostics),
+ /// Failure that the combinators treat as "this alternative did not
+ /// match": `Option` turns it into `None`, `Either` tries its right-hand
+ /// side, and the repeating parsers stop repeating.
+ Mismatch(Diagnostics),
+}
+
+/// Outcome of a successful parse.
+#[derive(Debug)]
+struct Parsed<T> {
+ /// The parsed value.
+ value: T,
+ /// The tokens that remain after the ones that produced `value`; callers
+ /// feed this to the next parser.
+ rest: TokenSlice,
+ /// Diagnostics collected while parsing (referred to as "warnings" by
+ /// [Parsed::new] and [Parsed::warn]).
+ diagnostics: Diagnostics,
+}
+
+impl<T> Parsed<T> {
+    /// Builds a successful parse result carrying `warnings` as its
+    /// diagnostics.
+    pub fn new(value: T, rest: TokenSlice, warnings: Diagnostics) -> Self {
+        let diagnostics = warnings;
+        Self {
+            value,
+            rest,
+            diagnostics,
+        }
+    }
+
+    /// Builds a successful parse result with default (empty) diagnostics.
+    pub fn ok(value: T, rest: TokenSlice) -> Self {
+        Self::new(value, rest, Diagnostics::default())
+    }
+
+    /// Splits `self` into `(value, rest, diagnostics)`.
+    pub fn into_tuple(self) -> (T, TokenSlice, Diagnostics) {
+        let Self {
+            value,
+            rest,
+            diagnostics,
+        } = self;
+        (value, rest, diagnostics)
+    }
+
+    /// Moves this result's diagnostics onto the end of `d` and returns the
+    /// remaining `(value, rest)` pair.
+    pub fn take_diagnostics(self, d: &mut Diagnostics) -> (T, TokenSlice) {
+        let Self {
+            value,
+            rest,
+            diagnostics: mut own,
+        } = self;
+        d.0.append(&mut own.0);
+        (value, rest)
+    }
+
+    /// Transforms the parsed value with `f`, keeping `rest` and the
+    /// diagnostics as they are.
+    pub fn map<F, R>(self, f: F) -> Parsed<R>
+    where
+        F: FnOnce(T) -> R,
+    {
+        let Self {
+            value,
+            rest,
+            diagnostics,
+        } = self;
+        Parsed {
+            value: f(value),
+            rest,
+            diagnostics,
+        }
+    }
+
+    /// Returns `self` with `warnings` appended after its existing
+    /// diagnostics.
+    pub fn warn(mut self, mut warnings: Diagnostics) -> Self {
+        self.diagnostics.0.append(&mut warnings.0);
+        self
+    }
+}
+
+/// Result of running a parser: a [Parsed] value on success, a [ParseError]
+/// on failure.
+type ParseResult<T> = Result<Parsed<T>, ParseError>;
+
+/// Extension for parse results that upgrades a recoverable mismatch into a
+/// hard error.
+trait MismatchToError {
+    /// Converts [ParseError::Mismatch] into [ParseError::Error], keeping the
+    /// diagnostics; every other outcome is returned unchanged.
+    fn mismatch_to_error(self) -> Self;
+}
+
+impl<T> MismatchToError for ParseResult<T> {
+    fn mismatch_to_error(self) -> Self {
+        self.map_err(|failure| match failure {
+            ParseError::Mismatch(diagnostics) => ParseError::Error(diagnostics),
+            fatal @ ParseError::Error(_) => fatal,
+        })
+    }
+}
+
+/// A type that can be parsed from the front of a [TokenSlice].
+trait FromTokens {
+ /// Attempts to parse `Self` from `input`.  On success the result holds
+ /// the value, the tokens that were not consumed, and any diagnostics; on
+ /// failure it holds a [ParseError].
+ fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+ where
+ Self: Sized;
+}
+
+impl<T> FromTokens for Option<T>
+where
+    T: FromTokens,
+{
+    /// Parses an optional `T`.
+    ///
+    /// A successful `T` becomes `Some`.  A [ParseError::Mismatch] from `T`
+    /// is not a failure: it yields `None` with all of `input` still
+    /// remaining, and the mismatch diagnostics are dropped.  A
+    /// [ParseError::Error] is passed through.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        T::from_tokens(input)
+            .map(|parsed| parsed.map(Some))
+            .or_else(|failure| match failure {
+                ParseError::Mismatch(_) => Ok(Parsed::ok(None, input.clone())),
+                fatal @ ParseError::Error(_) => Err(fatal),
+            })
+    }
+}
+
+impl<L, R> FromTokens for Either<L, R>
+where
+    L: FromTokens,
+    R: FromTokens,
+{
+    /// Parses an `L`, falling back to an `R` parsed from the same `input`
+    /// when `L` reports a [ParseError::Mismatch].
+    ///
+    /// A [ParseError::Error] from `L` is returned without trying `R`; any
+    /// failure from `R` is returned as is.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let failure = match L::from_tokens(input) {
+            Ok(left) => return Ok(left.map(Either::Left)),
+            Err(failure) => failure,
+        };
+        if matches!(failure, ParseError::Error(_)) {
+            return Err(failure);
+        }
+        // `L` merely did not match, so its diagnostics are discarded.
+        R::from_tokens(input).map(|right| right.map(Either::Right))
+    }
+}
+
+impl<A, B> FromTokens for (A, B)
+where
+    A: FromTokens,
+    B: FromTokens,
+{
+    /// Parses an `A` followed by a `B`, concatenating their diagnostics in
+    /// that order.  The first failure of either parser is returned as is.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let (first, after_first, mut diagnostics) = A::from_tokens(input)?.into_tuple();
+        let (second, remainder) = B::from_tokens(&after_first)?.take_diagnostics(&mut diagnostics);
+        Ok(Parsed::new((first, second), remainder, diagnostics))
+    }
+}
+
+impl<A, B, C> FromTokens for (A, B, C)
+where
+    A: FromTokens,
+    B: FromTokens,
+    C: FromTokens,
+{
+    /// Parses an `A`, a `B`, and a `C` in sequence, concatenating their
+    /// diagnostics in that order.  The first failure of any of the three
+    /// parsers is returned as is.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let (first, after_first, mut diagnostics) = A::from_tokens(input)?.into_tuple();
+        let (second, after_second) =
+            B::from_tokens(&after_first)?.take_diagnostics(&mut diagnostics);
+        let (third, remainder) = C::from_tokens(&after_second)?.take_diagnostics(&mut diagnostics);
+        Ok(Parsed::new((first, second, third), remainder, diagnostics))
+    }
+}
+
+/// Marker type for the `/` token.
+// NOTE(review): the parser is generated by `pspp_derive::FromTokens`, whose
+// expansion is not visible here; presumably it matches the `syntax` string
+// below -- confirm against the derive macro.
+#[derive(Debug, pspp_derive::FromTokens)]
+#[pspp(syntax = "/")]
+pub struct Slash;
+
+/// Marker type for the `,` token.
+#[derive(Debug)]
+pub struct Comma;
+
+impl FromTokens for Comma {
+    /// Succeeds when `_parse_token` accepts a comma at the start of `input`;
+    /// otherwise returns that function's [ParseError::Mismatch].
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let comma = _parse_token(input, &Token::Punct(Punct::Comma))?;
+        Ok(comma.map(|_token| Comma))
+    }
+}
+
+// Marker types for individual punctuation tokens and keywords.  Their
+// parsers come from `pspp_derive::FromTokens`.
+// NOTE(review): the derive's expansion is not visible here; presumably each
+// type matches the string given in its `#[pspp(syntax = ...)]` attribute --
+// confirm against the derive macro.
+
+/// Marker type for `=`.
+#[derive(Debug, pspp_derive::FromTokens)]
+#[pspp(syntax = "=")]
+pub struct Equals;
+
+/// Marker type for `&`.
+#[derive(Debug, pspp_derive::FromTokens)]
+#[pspp(syntax = "&")]
+pub struct And;
+
+/// Marker type for `>`.
+#[derive(Debug, pspp_derive::FromTokens)]
+#[pspp(syntax = ">")]
+pub struct Gt;
+
+/// Marker type for `+`.
+#[derive(Debug, pspp_derive::FromTokens)]
+#[pspp(syntax = "+")]
+pub struct Plus;
+
+/// Marker type for `-`.
+#[derive(Debug, pspp_derive::FromTokens)]
+#[pspp(syntax = "-")]
+pub struct Dash;
+
+/// Marker type for `*`.
+#[derive(Debug, pspp_derive::FromTokens)]
+#[pspp(syntax = "*")]
+pub struct Asterisk;
+
+/// Marker type for `**`.
+#[derive(Debug, pspp_derive::FromTokens)]
+#[pspp(syntax = "**")]
+pub struct Exp;
+
+// NOTE(review): no `syntax` attribute here; presumably the derive falls back
+// to the type name (`BY`) as the keyword -- confirm.
+#[derive(Debug, pspp_derive::FromTokens)]
+struct By;
+
+/// A sequence of `T` items separated by `P` separators (by default an
+/// optional [Comma]).
+pub struct Punctuated<T, P = Option<Comma>> {
+ /// Items that were each followed by a separator, paired with it.
+ head: Vec<(T, P)>,
+ /// Final item that was not followed by a separator, if any.
+ tail: Option<T>,
+}
+
+impl<T, P> Debug for Punctuated<T, P>
+where
+    T: Debug,
+{
+    /// Formats the items (head items first, then the tail) as
+    /// `[a, b, c]`; separators are not shown.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("[")?;
+        let items = self.head.iter().map(|(item, _separator)| item);
+        for (position, item) in items.chain(&self.tail).enumerate() {
+            if position != 0 {
+                f.write_str(", ")?;
+            }
+            write!(f, "{item:?}")?;
+        }
+        f.write_str("]")
+    }
+}
+
+impl<T, P> FromTokens for Punctuated<T, P>
+where
+    T: FromTokens,
+    P: FromTokens,
+{
+    /// Parses alternating `T` items and `P` separators until one of them
+    /// reports a [ParseError::Mismatch].
+    ///
+    /// A mismatch on an item ends the list with no tail; a mismatch on a
+    /// separator makes the item just parsed the tail.  With a separator type
+    /// that never mismatches (such as `Option<Comma>`), every item therefore
+    /// lands in `head`.  Any [ParseError::Error] aborts the whole parse.
+    /// This parser itself never returns a mismatch: an empty list is a
+    /// success.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let mut head = Vec::new();
+        let mut diagnostics = Diagnostics(Vec::new());
+        let mut cursor = input.clone();
+        let tail = loop {
+            let item = match T::from_tokens(&cursor) {
+                Ok(parsed) => {
+                    let (item, rest) = parsed.take_diagnostics(&mut diagnostics);
+                    cursor = rest;
+                    item
+                }
+                Err(ParseError::Mismatch(_)) => break None,
+                Err(fatal @ ParseError::Error(_)) => return Err(fatal),
+            };
+            let separator = match P::from_tokens(&cursor) {
+                Ok(parsed) => {
+                    let (separator, rest) = parsed.take_diagnostics(&mut diagnostics);
+                    cursor = rest;
+                    separator
+                }
+                Err(ParseError::Mismatch(_)) => break Some(item),
+                Err(fatal @ ParseError::Error(_)) => return Err(fatal),
+            };
+            head.push((item, separator));
+        };
+        Ok(Parsed::new(Punctuated { head, tail }, cursor, diagnostics))
+    }
+}
+
+impl<T> FromTokens for Box<T>
+where
+    T: FromTokens,
+{
+    /// Parses a `T` and moves the value into a `Box`; the remaining tokens,
+    /// diagnostics, and any failure are those of `T`.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let parsed = T::from_tokens(input)?;
+        Ok(parsed.map(Box::new))
+    }
+}
+
+/// The subcommands of a command, each parsed as a `T`.
+pub struct Subcommands<T>(Vec<T>);
+
+impl<T> Debug for Subcommands<T>
+where
+    T: Debug,
+{
+    /// Formats as `Subcommands[...]` with the items separated by a comma
+    /// and a newline.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self(items) = self;
+        f.write_str("Subcommands[")?;
+        let mut first = true;
+        for item in items {
+            if !first {
+                f.write_str(",\n")?;
+            }
+            first = false;
+            write!(f, "{item:?}")?;
+        }
+        f.write_str("]")
+    }
+}
+
+impl<T> FromTokens for Subcommands<T>
+where
+    T: FromTokens,
+{
+    /// Parses every subcommand in `input` as a `T`.
+    ///
+    /// The input is cut into pieces at `/` tokens and each piece is parsed
+    /// on its own.  A piece that parses contributes its value; if it leaves
+    /// tokens unconsumed, a "Syntax error expecting end of subcommand."
+    /// warning is recorded.  A piece that fails (with either kind of
+    /// [ParseError]) contributes only its diagnostics and is otherwise
+    /// skipped, so this parser always returns `Ok`.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let mut items = Vec::new();
+        let mut diagnostics = Vec::new();
+        let mut input = input.clone();
+        loop {
+            // NOTE(review): `skip_until` presumably drops the leading `/`
+            // tokens and `skip_to` presumably returns the slice starting at
+            // the next `/` -- confirm against `TokenSlice`.
+            let start = input.skip_until(|token| token != &Token::Punct(Punct::Slash));
+            if start.is_empty() {
+                break;
+            }
+            let end = start.skip_to(&Token::Punct(Punct::Slash));
+            // Tokens of this subcommand only: `start` minus everything from
+            // `end` onward.
+            let subcommand = start.subslice(0..start.len() - end.len());
+            match T::from_tokens(&subcommand) {
+                Ok(parsed) => {
+                    let (value, rest, mut d) = parsed.into_tuple();
+                    items.push(value);
+                    diagnostics.append(&mut d.0);
+                    if !rest.is_empty() {
+                        diagnostics.push(rest.warning("Syntax error expecting end of subcommand."));
+                    }
+                }
+                Err(ParseError::Error(mut d) | ParseError::Mismatch(mut d)) => {
+                    diagnostics.append(&mut d.0);
+                }
+            }
+            input = end;
+        }
+        // The diagnostics are returned to the caller below; they are no
+        // longer also dumped to stdout, which was leftover debugging output.
+        Ok(Parsed {
+            value: Subcommands(items),
+            rest: input,
+            diagnostics: Diagnostics(diagnostics),
+        })
+    }
+}
+
+/// Zero or more consecutive `T`s with no separator.
+#[derive(Debug)]
+pub struct Seq0<T>(Vec<T>);
+
+impl<T> FromTokens for Seq0<T>
+where
+    T: FromTokens,
+{
+    /// Parses `T` repeatedly until the input is empty, `T` reports a
+    /// [ParseError::Mismatch], or `T` succeeds without consuming a token.
+    ///
+    /// A [ParseError::Error] from `T` aborts the parse.  An empty sequence
+    /// is a success.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let mut values = Vec::new();
+        let mut diagnostics = Vec::new();
+        let mut cursor = input.clone();
+        while !cursor.is_empty() {
+            let (value, rest, mut warnings) = match T::from_tokens(&cursor) {
+                Ok(parsed) => parsed.into_tuple(),
+                Err(ParseError::Mismatch(_)) => break,
+                Err(fatal @ ParseError::Error(_)) => return Err(fatal),
+            };
+            diagnostics.append(&mut warnings.0);
+            // A parse that made no progress would repeat forever; stop and
+            // discard its value.
+            if rest.len() == cursor.len() {
+                break;
+            }
+            values.push(value);
+            cursor = rest;
+        }
+        Ok(Parsed::new(Seq0(values), cursor, Diagnostics(diagnostics)))
+    }
+}
+
+/// One or more consecutive `T`s with no separator.
+#[derive(Debug)]
+pub struct Seq1<T>(Vec<T>);
+
+impl<T> FromTokens for Seq1<T>
+where
+    T: FromTokens,
+{
+    /// Parses `T` repeatedly until the input is empty, `T` reports a
+    /// [ParseError::Mismatch], or `T` succeeds without consuming a token.
+    ///
+    /// A [ParseError::Error] from `T` aborts the parse.  If no value at all
+    /// was collected, the result is a [ParseError::Mismatch].
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let mut values = Vec::new();
+        let mut diagnostics = Vec::new();
+        let mut cursor = input.clone();
+        while !cursor.is_empty() {
+            let (value, rest, mut warnings) = match T::from_tokens(&cursor) {
+                Ok(parsed) => parsed.into_tuple(),
+                Err(ParseError::Mismatch(_)) => break,
+                Err(fatal @ ParseError::Error(_)) => return Err(fatal),
+            };
+            diagnostics.append(&mut warnings.0);
+            // A parse that made no progress would repeat forever; stop and
+            // discard its value.
+            if rest.len() == cursor.len() {
+                break;
+            }
+            values.push(value);
+            cursor = rest;
+        }
+        if values.is_empty() {
+            Err(ParseError::Mismatch(cursor.error("Syntax error.").into()))
+        } else {
+            Ok(Parsed::new(Seq1(values), cursor, Diagnostics(diagnostics)))
+        }
+    }
+}
+
+/*
+impl<T> FromTokens for Vec<T>
+where
+ T: FromTokens,
+{
+ fn from_tokens(mut input: &TokenSlice) -> ParseResult<Self>
+ where
+ Self: Sized,
+ {
+ let mut values_vec = Vec::new();
+ let mut warnings_vec = Vec::new();
+ while !input.is_empty() {
+ match T::from_tokens(input) {
+ Ok(Parsed {
+ value,
+ rest,
+ diagnostics: mut warnings,
+ }) => {
+ values_vec.push(value);
+ warnings_vec.append(&mut warnings.0);
+ input = rest;
+ }
+ Err(ParseError::Mismatch(_)) => break,
+ Err(ParseError::Error(e)) => return Err(ParseError::Error(e)),
+ }
+ }
+ Ok(Parsed {
+ value: values_vec,
+ rest: input,
+ diagnostics: Diagnostics(warnings_vec),
+ })
+ }
+}*/
+
+impl FromTokens for TokenSlice {
+    /// Always succeeds: the value is a clone of all of `input`, and the
+    /// remaining tokens are whatever `input.end()` returns.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let everything = input.clone();
+        let nothing_left = input.end();
+        Ok(Parsed::ok(everything, nothing_left))
+    }
+}
+
+/// A single subcommand, delimited by `/` tokens, parsed as a `T`.
+#[derive(Debug)]
+struct Subcommand<T>(pub T);
+
+impl<T> FromTokens for Subcommand<T>
+where
+    T: FromTokens,
+{
+    /// Parses the next subcommand of `input` as a `T`.
+    ///
+    /// Fails with a [ParseError::Error] when nothing is left after the
+    /// leading slashes are skipped.  Otherwise `T` is run on the tokens
+    /// before the next `/` only; its failures are returned unchanged, and
+    /// tokens it leaves unconsumed produce a warning.  The returned `rest`
+    /// is the slice that `skip_to` produced for the next `/`.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let body_start = input.skip_until(|token| token != &Token::Punct(Punct::Slash));
+        if body_start.is_empty() {
+            let diagnostic = input.error("Syntax error at end of input.");
+            return Err(ParseError::Error(diagnostic.into()));
+        }
+        let following = body_start.skip_to(&Token::Punct(Punct::Slash));
+        let body_len = body_start.len() - following.len();
+        let body = body_start.subslice(0..body_len);
+        let Parsed {
+            value,
+            rest: leftover,
+            mut diagnostics,
+        } = T::from_tokens(&body)?;
+        if !leftover.is_empty() {
+            diagnostics
+                .0
+                .push(leftover.warning("Syntax error expecting end of subcommand."));
+        }
+        Ok(Parsed {
+            value: Self(value),
+            rest: following,
+            diagnostics,
+        })
+    }
+}
+
+/// A `T` enclosed in parentheses.
+#[derive(Debug)]
+struct InParens<T>(pub T);
+
+impl<T> FromTokens for InParens<T>
+where
+    T: FromTokens,
+{
+    /// Parses `(`, then a `T`, then `)`.  A failure of any of the three
+    /// steps is returned unchanged; only `T`'s diagnostics are kept.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let after_open = parse_token(input, &Token::Punct(Punct::LParen))?.rest;
+        let (value, after_value, warnings) = T::from_tokens(&after_open)?.into_tuple();
+        let after_close = parse_token(&after_value, &Token::Punct(Punct::RParen))?.rest;
+        Ok(Parsed::new(Self(value), after_close, warnings))
+    }
+}
+
+/// A `T` enclosed in square brackets.
+#[derive(Debug)]
+struct InSquares<T>(pub T);
+
+impl<T> FromTokens for InSquares<T>
+where
+    T: FromTokens,
+{
+    /// Parses `[`, then a `T`, then `]`.  A failure of any of the three
+    /// steps is returned unchanged; only `T`'s diagnostics are kept.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        let after_open = parse_token(input, &Token::Punct(Punct::LSquare))?.rest;
+        let (value, after_value, warnings) = T::from_tokens(&after_open)?.into_tuple();
+        let after_close = parse_token(&after_value, &Token::Punct(Punct::RSquare))?.rest;
+        Ok(Parsed::new(Self(value), after_close, warnings))
+    }
+}
+
+/// Consumes the first token of `input` if `parse` maps it to `Some` value.
+///
+/// On success, returns that value with the tokens after the first as the
+/// remainder.  If `input` has no first token or `parse` returns `None`, the
+/// result is a [ParseError::Mismatch] carrying default (empty) diagnostics.
+fn parse_token_if<F, R>(input: &TokenSlice, parse: F) -> ParseResult<R>
+where
+    F: Fn(&Token) -> Option<R>,
+{
+    match input.get_token(0).and_then(|token| parse(token)) {
+        Some(result) => Ok(Parsed::ok(result, input.subslice(1..input.len()))),
+        None => Err(ParseError::Mismatch(Diagnostics::default())),
+    }
+}
+
+/// Like [parse_token], but on success the parsed value is a clone of the
+/// first token of `input` instead of `()`.
+///
+/// When `input.skip(token)` yields nothing, the result is a
+/// [ParseError::Mismatch] with an "expecting ..." diagnostic.
+fn _parse_token(input: &TokenSlice, token: &Token) -> ParseResult<Token> {
+    input
+        .skip(token)
+        .map(|rest| Parsed::ok(input.first().token.clone(), rest))
+        .ok_or_else(|| ParseError::Mismatch(input.error(format!("expecting {token}")).into()))
+}
+
+/// Consumes `token` from the start of `input`.
+///
+/// Succeeds with the slice returned by `input.skip(token)` as the
+/// remainder; when `skip` yields nothing, the result is a
+/// [ParseError::Mismatch] with an "expecting ..." diagnostic.
+fn parse_token(input: &TokenSlice, token: &Token) -> ParseResult<()> {
+    match input.skip(token) {
+        Some(rest) => Ok(Parsed::ok((), rest)),
+        None => {
+            let diagnostic = input.error(format!("expecting {token}"));
+            Err(ParseError::Mismatch(diagnostic.into()))
+        }
+    }
+}
+
+/// Consumes the tokens described by `syntax` from the start of `input`.
+///
+/// Succeeds with the slice returned by `input.skip_syntax(syntax)` as the
+/// remainder; when `skip_syntax` yields nothing, the result is a
+/// [ParseError::Mismatch] with an "expecting ..." diagnostic.
+fn parse_syntax(input: &TokenSlice, syntax: &str) -> ParseResult<()> {
+    match input.skip_syntax(syntax) {
+        Some(rest) => Ok(Parsed::ok((), rest)),
+        None => {
+            let diagnostic = input.error(format!("expecting {syntax}"));
+            Err(ParseError::Mismatch(diagnostic.into()))
+        }
+    }
+}
+
+/// A list of [VarRange]s with the default separator of [Punctuated] (an
+/// optional comma).
+pub type VarList = Punctuated<VarRange>;
+
+/// A numeric token's value.
+pub struct Number(f64);
+
+impl Debug for Number {
+    /// Formats as the bare `f64`, without a `Number(...)` wrapper.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Number(value) = self;
+        write!(f, "{value:?}")
+    }
+}
+
+impl FromTokens for Number {
+    /// Consumes one token for which `Token::as_number` returns a value;
+    /// otherwise reports a [ParseError::Mismatch] saying "expecting number".
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        match parse_token_if(input, |token| token.as_number().map(Number)) {
+            Ok(parsed) => Ok(parsed),
+            Err(_) => {
+                let diagnostic = input.error(String::from("expecting number"));
+                Err(ParseError::Mismatch(diagnostic.into()))
+            }
+        }
+    }
+}
+
+/// An integer token's value.
+#[derive(Debug)]
+pub struct Integer(i64);
+
+impl FromTokens for Integer {
+    /// Consumes one token for which `Token::as_integer` returns a value;
+    /// otherwise reports a [ParseError::Mismatch] saying "expecting integer".
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        match parse_token_if(input, |token| token.as_integer().map(Integer)) {
+            Ok(parsed) => Ok(parsed),
+            Err(_) => {
+                let diagnostic = input.error(String::from("expecting integer"));
+                Err(ParseError::Mismatch(diagnostic.into()))
+            }
+        }
+    }
+}
+
+/// One element of a variable list.
+pub enum VarRange {
+    /// A single variable name.
+    Single(Identifier),
+    /// `first TO last`.
+    Range(Identifier, Identifier),
+    /// The `ALL` keyword.
+    All,
+}
+
+impl Debug for VarRange {
+    /// Formats in syntax-like form: the name, `from TO to`, or `ALL`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::All => f.write_str("ALL"),
+            Self::Range(from, to) => write!(f, "{from:?} TO {to:?}"),
+            Self::Single(var) => write!(f, "{var:?}"),
+        }
+    }
+}
+
+impl FromTokens for VarRange {
+    /// Parses `ALL`, `name TO name`, or a single `name`.
+    ///
+    /// If `TO` is not followed by an identifier, the `TO` is left unconsumed
+    /// and the result is the single leading name.  Fails (with the error
+    /// from `parse_id`) only when the input starts with neither `ALL` nor an
+    /// identifier.
+    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+    where
+        Self: Sized,
+    {
+        if let Ok(all) = parse_token(input, &Token::Punct(Punct::All)) {
+            return Ok(Parsed::ok(Self::All, all.rest));
+        }
+        let (from, after_from, _) = parse_id(input)?.into_tuple();
+        let upper_bound = parse_token(&after_from, &Token::Punct(Punct::To))
+            .and_then(|to_keyword| parse_id(&to_keyword.rest));
+        match upper_bound {
+            Ok(parsed) => Ok(parsed.map(|to| Self::Range(from, to))),
+            Err(_) => Ok(Parsed::ok(Self::Single(from), after_from)),
+        }
+    }
+}
+
+/// Consumes one identifier token from the start of `input` and returns a
+/// clone of the identifier.
+///
+/// Anything else (including empty input) is a [ParseError::Mismatch].
+fn parse_id(input: &TokenSlice) -> ParseResult<Identifier> {
+    let mut tokens = input.iter();
+    match tokens.next() {
+        Some(LexToken {
+            token: Token::Id(id),
+            ..
+        }) => Ok(Parsed::ok(id.clone(), tokens.remainder())),
+        _ => {
+            let diagnostic = input.error("Syntax error expecting identifier.");
+            Err(ParseError::Mismatch(diagnostic.into()))
+        }
+    }
+}
+
+/// Consumes one identifier token from the start of `input` whose text
+/// parses as an [AbstractFormat].
+///
+/// Returns a [ParseError::Mismatch] when the first token is not an
+/// identifier or when the identifier's text is not a valid format.  The
+/// diagnostic names a format rather than an identifier, because the second
+/// case is reached precisely when an identifier *was* present.
+fn parse_format(input: &TokenSlice) -> ParseResult<AbstractFormat> {
+    let mut iter = input.iter();
+    if let Some(LexToken {
+        token: Token::Id(id),
+        ..
+    }) = iter.next()
+    {
+        if let Ok(format) = id.0.as_ref().parse() {
+            return Ok(Parsed::ok(format, iter.remainder()));
+        }
+    }
+    Err(ParseError::Mismatch(
+        input.error("Syntax error expecting format.").into(),
+    ))
+}
+
+/// Consumes one string token from the start of `input` and returns a clone
+/// of its contents.
+///
+/// Anything else (including empty input) is a [ParseError::Mismatch] whose
+/// diagnostic says that a string was expected.
+fn parse_string(input: &TokenSlice) -> ParseResult<String> {
+    let mut iter = input.iter();
+    if let Some(LexToken {
+        token: Token::String(s),
+        ..
+    }) = iter.next()
+    {
+        Ok(Parsed::ok(s.clone(), iter.remainder()))
+    } else {
+        // This used to claim that an identifier was expected, a copy-paste
+        // slip from `parse_id`.
+        Err(ParseError::Mismatch(
+            input.error("Syntax error expecting string.").into(),
+        ))
+    }
+}
+
+/// Parses an identifier token; see `parse_id`.
+impl FromTokens for Identifier {
+ fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+ where
+ Self: Sized,
+ {
+ parse_id(input)
+ }
+}
+
+/// Parses a string token; see `parse_string`.
+impl FromTokens for String {
+ fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+ where
+ Self: Sized,
+ {
+ parse_string(input)
+ }
+}
+
+/// Parses an identifier token whose text is a format; see `parse_format`.
+impl FromTokens for AbstractFormat {
+ fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
+ where
+ Self: Sized,
+ {
+ parse_format(input)
+ }
+}
+
+/// Splits `src` at `/` tokens and returns the non-empty pieces in order.
+fn collect_subcommands(src: TokenSlice) -> Vec<TokenSlice> {
+    let mut subcommands = Vec::new();
+    for piece in src.split(|token| token.token == Token::Punct(Punct::Slash)) {
+        if !piece.is_empty() {
+            subcommands.push(piece);
+        }
+    }
+    subcommands
+}
+
+/// Returns the table of known commands, building it on first use and
+/// reusing the same table afterwards.
+fn commands() -> &'static [Command] {
+    static COMMANDS: OnceLock<Vec<Command>> = OnceLock::new();
+    COMMANDS
+        .get_or_init(|| {
+            // `ECHO` has no implementation yet: running it panics via
+            // `todo!()`.
+            let echo = Command {
+                allowed_states: FlagSet::full(),
+                enhanced_only: false,
+                testing_only: false,
+                no_abbrev: false,
+                name: "ECHO",
+                run: Box::new(|_context| todo!()),
+            };
+            vec![
+                descriptives_command(),
+                crosstabs_command(),
+                ctables_command(),
+                data_list_command(),
+                echo,
+            ]
+        })
+        .as_slice()
+}
+
+/// Appends token number `n` of `lexer` to the command name being built in
+/// `s`, returning whether the token could be part of a command name.
+///
+/// A dash is appended directly.  An identifier or a non-negative number that
+/// converts exactly to `usize` is appended after a single space, except
+/// that no space is added when `s` is empty or ends in a dash.  Any other
+/// token (or no token) leaves `s` unchanged and yields `false`.
+fn parse_command_word(lexer: &mut TokenSlice, s: &mut String, n: usize) -> bool {
+    let separator = if s.is_empty() || s.ends_with('-') {
+        ""
+    } else {
+        " "
+    };
+
+    match lexer.get_token(n) {
+        Some(Token::Punct(Punct::Dash)) => s.push('-'),
+        Some(Token::Id(id)) => write!(s, "{separator}{id}").unwrap(),
+        Some(Token::Number(number)) if number.is_sign_positive() => {
+            match number.to_exact_usize() {
+                Some(integer) => write!(s, "{separator}{integer}").unwrap(),
+                None => return false,
+            }
+        }
+        _ => return false,
+    }
+    true
+}
+
+/// Feeds every known command to a `CommandMatcher` for `s` and returns the
+/// matcher's verdict: the matched command, if any, and a signed word count.
+// NOTE(review): callers treat a positive count as "more words are needed";
+// confirm the exact meaning against `CommandMatcher::get_match`.
+fn find_best_match(s: &str) -> (Option<&'static Command>, isize) {
+    let mut matcher = CommandMatcher::new(s);
+    commands().iter().for_each(|command| {
+        matcher.add(command.name, command);
+    });
+    matcher.get_match()
+}
+
+/// Identifies the command named by the tokens at the start of `lexer`.
+///
+/// Words are accumulated one token at a time (see `parse_command_word`) and
+/// matched against the command table after each one, until the matcher no
+/// longer asks for more words or the next token cannot be part of a name.
+///
+/// On success, returns the command and a token count computed as
+/// `word + 1 + missing_words`.  On failure, reports a diagnostic through
+/// `error` and returns `Err(())`.
+fn parse_command_name(
+    lexer: &mut TokenSlice,
+    error: &dyn Fn(Diagnostic),
+) -> Result<(&'static Command, usize), ()> {
+    let mut s = String::new();
+    let mut word = 0;
+    let mut missing_words = 0;
+    let mut command = None;
+    while parse_command_word(lexer, &mut s, word) {
+        (command, missing_words) = find_best_match(&s);
+        if missing_words <= 0 {
+            break;
+        }
+        word += 1;
+    }
+    if command.is_none() && missing_words > 0 {
+        // Retry with a trailing " ." appended, then restore `s` so that the
+        // error message below shows only what the user wrote.
+        // NOTE(review): presumably this lets the matcher treat the name as
+        // complete -- confirm against `CommandMatcher`.
+        s.push_str(" .");
+        (command, missing_words) = find_best_match(&s);
+        s.truncate(s.len() - 2);
+    }
+
+    match command {
+        Some(command) => Ok((command, ((word as isize + 1) + missing_words) as usize)),
+        None => {
+            let diagnostic = if word == 0 {
+                lexer
+                    .subslice(0..1)
+                    .error("Syntax error expecting command name")
+            } else {
+                // `format!` is required for `{s}` to be replaced by the
+                // command text; a bare string literal would print the
+                // braces verbatim.
+                lexer
+                    .subslice(0..word + 1)
+                    .error(format!("Unknown command `{s}`."))
+            };
+            error(diagnostic);
+            Err(())
+        }
+    }
+}
+
+/// Successful outcomes of running a command.
+// NOTE(review): only `Success::Success` is produced in the visible code (by
+// `end_of_command`); the meaning of `Eof` and `Finish` should be confirmed
+// against their users.
+pub enum Success {
+ Success,
+ Eof,
+ Finish,
+}
+
+/// Checks that the command ends at token index `range.start` of the
+/// context's lexer.
+///
+/// Returns `Ok(Success::Success)` if there is no token at that index or the
+/// token is `Token::End`.  Otherwise reports "Syntax error expecting end of
+/// command." for the tokens from that index onward and returns `Err(())`.
+pub fn end_of_command(context: &Context, range: RangeFrom<usize>) -> Result<Success, ()> {
+    let RangeFrom { start } = range;
+    if matches!(context.lexer.get_token(start), None | Some(Token::End)) {
+        return Ok(Success::Success);
+    }
+    let trailing = context.lexer.subslice(start..context.lexer.len());
+    context.error(trailing.error("Syntax error expecting end of command."));
+    Err(())
+}
+
+/// Parses and runs the command at the start of `lexer`.
+///
+/// Does nothing if `lexer` has no first token or starts with `Token::End`.
+/// Otherwise the command name is looked up and, if it is recognized, the
+/// command's `run` callback is invoked with a [Context] whose lexer starts
+/// after the tokens attributed to the command name.  `_state` is currently
+/// unused.
+fn parse_in_state(mut lexer: TokenSlice, error: &dyn Fn(Diagnostic), _state: State) {
+    match lexer.get_token(0) {
+        None | Some(Token::End) => (),
+        _ => {
+            // On failure `parse_command_name` has already reported a
+            // diagnostic through `error`, and its `Err` payload is just
+            // `()`, so there is nothing further to report here.
+            if let Ok((command, n_tokens)) = parse_command_name(&mut lexer, error) {
+                let mut context = Context {
+                    error,
+                    lexer: lexer.subslice(n_tokens..lexer.len()),
+                    command_name: Some(command.name),
+                };
+                (command.run)(&mut context);
+            }
+        }
+    }
+}
+
+/// Parses and runs the command at the start of `lexer`, reporting
+/// diagnostics through `error`.  Always parses in [State::Initial].
+pub fn parse_command(lexer: TokenSlice, error: &dyn Fn(Diagnostic)) {
+ parse_in_state(lexer, error, State::Initial)
+}
+
+/// What a command's `run` callback receives.
+pub struct Context<'a> {
+ /// Callback that receives each diagnostic passed to [Context::error].
+ error: &'a dyn Fn(Diagnostic),
+ /// The command's tokens; `parse_in_state` sets this to the tokens that
+ /// follow the command name.
+ lexer: TokenSlice,
+ /// Name of the command being run; `parse_in_state` sets it to the matched
+ /// command's `name`.
+ command_name: Option<&'static str>,
+}
+
+impl Context<'_> {
+ /// Passes `diagnostic` to the context's error callback.
+ pub fn error(&self, diagnostic: Diagnostic) {
+ (self.error)(diagnostic);
+ }
+}
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-#![allow(dead_code)]
-use std::{
- fmt::{Debug, Write},
- ops::RangeFrom,
- sync::OnceLock,
-};
-
-use crosstabs::crosstabs_command;
-use ctables::ctables_command;
-use data_list::data_list_command;
-use descriptives::descriptives_command;
-use either::Either;
-use flagset::{flags, FlagSet};
-use pspp_derive::FromTokens;
-
-use crate::{
- format::AbstractFormat,
- identifier::Identifier,
- integer::ToInteger,
- lex::{
- command_name::CommandMatcher,
- lexer::{LexToken, TokenSlice},
- Punct, Token,
- },
- message::{Diagnostic, Diagnostics},
-};
-
-pub mod crosstabs;
-pub mod ctables;
-pub mod data_list;
-pub mod descriptives;
-
-flags! {
- enum State: u8 {
- /// No active dataset yet defined.
- Initial,
-
- /// Active dataset has been defined.
- Data,
-
- /// Inside `INPUT PROGRAM`.
- InputProgram,
-
- /// Inside `FILE TYPE`.
- FileType,
-
- /// State nested inside `LOOP` or `DO IF`, inside [State::Data].
- NestedData,
-
- /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram].
- NestedInputProgram,
- }
-}
-
-struct Command {
- allowed_states: FlagSet<State>,
- enhanced_only: bool,
- testing_only: bool,
- no_abbrev: bool,
- name: &'static str,
- run: Box<dyn Fn(&mut Context) + Send + Sync>, //-> Box<dyn ParsedCommand> + Send + Sync>,
-}
-
-#[derive(Debug)]
-enum ParseError {
- Error(Diagnostics),
- Mismatch(Diagnostics),
-}
-
-#[derive(Debug)]
-struct Parsed<T> {
- value: T,
- rest: TokenSlice,
- diagnostics: Diagnostics,
-}
-
-impl<T> Parsed<T> {
- pub fn new(value: T, rest: TokenSlice, warnings: Diagnostics) -> Self {
- Self {
- value,
- rest,
- diagnostics: warnings,
- }
- }
- pub fn ok(value: T, rest: TokenSlice) -> Self {
- Self {
- value,
- rest,
- diagnostics: Diagnostics::default(),
- }
- }
- pub fn into_tuple(self) -> (T, TokenSlice, Diagnostics) {
- (self.value, self.rest, self.diagnostics)
- }
- pub fn take_diagnostics(self, d: &mut Diagnostics) -> (T, TokenSlice) {
- let (value, rest, mut diagnostics) = self.into_tuple();
- d.0.append(&mut diagnostics.0);
- (value, rest)
- }
- pub fn map<F, R>(self, f: F) -> Parsed<R>
- where
- F: FnOnce(T) -> R,
- {
- Parsed {
- value: f(self.value),
- rest: self.rest,
- diagnostics: self.diagnostics,
- }
- }
- pub fn warn(self, mut warnings: Diagnostics) -> Self {
- Self {
- value: self.value,
- rest: self.rest,
- diagnostics: {
- let mut vec = self.diagnostics.0;
- vec.append(&mut warnings.0);
- Diagnostics(vec)
- },
- }
- }
-}
-
-type ParseResult<T> = Result<Parsed<T>, ParseError>;
-
-trait MismatchToError {
- fn mismatch_to_error(self) -> Self;
-}
-
-impl<T> MismatchToError for ParseResult<T> {
- fn mismatch_to_error(self) -> Self {
- match self {
- Err(ParseError::Mismatch(diagnostic)) => Err(ParseError::Error(diagnostic)),
- rest => rest,
- }
- }
-}
-
-trait FromTokens {
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized;
-}
-
-impl<T> FromTokens for Option<T>
-where
- T: FromTokens,
-{
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- match T::from_tokens(input) {
- Ok(p) => Ok(p.map(Some)),
- Err(ParseError::Mismatch(_)) => Ok(Parsed::ok(None, input.clone())),
- Err(ParseError::Error(error)) => Err(ParseError::Error(error)),
- }
- }
-}
-
-impl<L, R> FromTokens for Either<L, R>
-where
- L: FromTokens,
- R: FromTokens,
-{
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- match L::from_tokens(input) {
- Ok(p) => Ok(p.map(Either::Left)),
- Err(ParseError::Mismatch(_)) => Ok(R::from_tokens(input)?.map(Either::Right)),
- Err(ParseError::Error(error)) => Err(ParseError::Error(error)),
- }
- }
-}
-
-impl<A, B> FromTokens for (A, B)
-where
- A: FromTokens,
- B: FromTokens,
-{
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- let (a, input, mut diagnostics) = A::from_tokens(input)?.into_tuple();
- let (b, rest, mut diagnostics2) = B::from_tokens(&input)?.into_tuple();
- diagnostics.0.append(&mut diagnostics2.0);
- Ok(Parsed::new((a, b), rest, diagnostics))
- }
-}
-
-impl<A, B, C> FromTokens for (A, B, C)
-where
- A: FromTokens,
- B: FromTokens,
- C: FromTokens,
-{
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- let (a, input, mut diagnostics) = A::from_tokens(input)?.into_tuple();
- let (b, input, mut diagnostics2) = B::from_tokens(&input)?.into_tuple();
- let (c, rest, mut diagnostics3) = C::from_tokens(&input)?.into_tuple();
- diagnostics.0.append(&mut diagnostics2.0);
- diagnostics.0.append(&mut diagnostics3.0);
- Ok(Parsed::new((a, b, c), rest, diagnostics))
- }
-}
-
-#[derive(Debug, pspp_derive::FromTokens)]
-#[pspp(syntax = "/")]
-pub struct Slash;
-
-#[derive(Debug)]
-pub struct Comma;
-
-impl FromTokens for Comma {
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- _parse_token(input, &Token::Punct(Punct::Comma)).map(|p| p.map(|_| Comma))
- }
-}
-
-#[derive(Debug, pspp_derive::FromTokens)]
-#[pspp(syntax = "=")]
-pub struct Equals;
-
-#[derive(Debug, pspp_derive::FromTokens)]
-#[pspp(syntax = "&")]
-pub struct And;
-
-#[derive(Debug, pspp_derive::FromTokens)]
-#[pspp(syntax = ">")]
-pub struct Gt;
-
-#[derive(Debug, pspp_derive::FromTokens)]
-#[pspp(syntax = "+")]
-pub struct Plus;
-
-#[derive(Debug, pspp_derive::FromTokens)]
-#[pspp(syntax = "-")]
-pub struct Dash;
-
-#[derive(Debug, pspp_derive::FromTokens)]
-#[pspp(syntax = "*")]
-pub struct Asterisk;
-
-#[derive(Debug, pspp_derive::FromTokens)]
-#[pspp(syntax = "**")]
-pub struct Exp;
-
-#[derive(Debug, pspp_derive::FromTokens)]
-struct By;
-
-pub struct Punctuated<T, P = Option<Comma>> {
- head: Vec<(T, P)>,
- tail: Option<T>,
-}
-
-impl<T, P> Debug for Punctuated<T, P>
-where
- T: Debug,
-{
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(f, "[")?;
- for (index, item) in self
- .head
- .iter()
- .map(|(t, _p)| t)
- .chain(self.tail.iter())
- .enumerate()
- {
- if index > 0 {
- write!(f, ", ")?;
- }
- write!(f, "{item:?}")?;
- }
- write!(f, "]")
- }
-}
-
-impl<T, P> FromTokens for Punctuated<T, P>
-where
- T: FromTokens,
- P: FromTokens,
-{
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- let mut head = Vec::new();
- let mut warnings_vec = Vec::new();
- let mut input = input.clone();
- let tail = loop {
- let t = match T::from_tokens(&input) {
- Ok(Parsed {
- value,
- rest,
- diagnostics: mut warnings,
- }) => {
- warnings_vec.append(&mut warnings.0);
- input = rest;
- value
- }
- Err(ParseError::Mismatch(_)) => break None,
- Err(ParseError::Error(e)) => return Err(ParseError::Error(e)),
- };
- let p = match P::from_tokens(&input) {
- Ok(Parsed {
- value,
- rest,
- diagnostics: mut warnings,
- }) => {
- warnings_vec.append(&mut warnings.0);
- input = rest;
- value
- }
- Err(ParseError::Mismatch(_)) => break Some(t),
- Err(ParseError::Error(e)) => return Err(ParseError::Error(e)),
- };
- head.push((t, p));
- };
- Ok(Parsed {
- value: Punctuated { head, tail },
- rest: input,
- diagnostics: Diagnostics(warnings_vec),
- })
- }
-}
-
-impl<T> FromTokens for Box<T>
-where
- T: FromTokens,
-{
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- T::from_tokens(input).map(|p| p.map(|value| Box::new(value)))
- }
-}
-
-pub struct Subcommands<T>(Vec<T>);
-
-impl<T> Debug for Subcommands<T>
-where
- T: Debug,
-{
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(f, "Subcommands[")?;
- for (index, item) in self.0.iter().enumerate() {
- if index > 0 {
- writeln!(f, ",")?;
- }
- write!(f, "{item:?}")?;
- }
- write!(f, "]")
- }
-}
-
-impl<T> FromTokens for Subcommands<T>
-where
- T: FromTokens,
-{
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- let mut items = Vec::new();
- let mut diagnostics = Vec::new();
- let mut input = input.clone();
- loop {
- let start = input.skip_until(|token| token != &Token::Punct(Punct::Slash));
- if start.is_empty() {
- break;
- }
- let end = start.skip_to(&Token::Punct(Punct::Slash));
- let subcommand = start.subslice(0..start.len() - end.len());
- match T::from_tokens(&subcommand) {
- Ok(p) => {
- let (value, rest, mut d) = p.into_tuple();
- items.push(value);
- diagnostics.append(&mut d.0);
- if !rest.is_empty() {
- diagnostics.push(rest.warning("Syntax error expecting end of subcommand."));
- }
- }
- Err(ParseError::Error(mut d) | ParseError::Mismatch(mut d)) => {
- diagnostics.append(&mut d.0);
- }
- }
- input = end;
- }
- println!("{diagnostics:?}");
- Ok(Parsed {
- value: Subcommands(items),
- rest: input,
- diagnostics: Diagnostics(diagnostics),
- })
- }
-}
-
-#[derive(Debug)]
-pub struct Seq0<T>(Vec<T>);
-
-impl<T> FromTokens for Seq0<T>
-where
- T: FromTokens,
-{
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- let mut values_vec = Vec::new();
- let mut warnings_vec = Vec::new();
- let mut input = input.clone();
- while !input.is_empty() {
- match T::from_tokens(&input) {
- Ok(Parsed {
- value,
- rest,
- diagnostics: mut warnings,
- }) => {
- warnings_vec.append(&mut warnings.0);
- if input.len() == rest.len() {
- break;
- }
- values_vec.push(value);
- input = rest;
- }
- Err(ParseError::Mismatch(_)) => break,
- Err(ParseError::Error(e)) => return Err(ParseError::Error(e)),
- }
- }
- Ok(Parsed {
- value: Seq0(values_vec),
- rest: input,
- diagnostics: Diagnostics(warnings_vec),
- })
- }
-}
-
-/// One or more consecutive `T`s.
-///
-/// Unlike [Seq0], an empty sequence is reported as a mismatch.
-#[derive(Debug)]
-pub struct Seq1<T>(Vec<T>);
-
-impl<T> FromTokens for Seq1<T>
-where
-    T: FromTokens,
-{
-    /// Collects `T`s until the input is exhausted, `T` reports a mismatch, or
-    /// `T` succeeds without consuming any token.  Returns
-    /// [ParseError::Mismatch] if nothing was collected, and propagates a
-    /// [ParseError::Error] from `T` unchanged.
-    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
-    where
-        Self: Sized,
-    {
-        let mut values = Vec::new();
-        let mut notes = Vec::new();
-        let mut cursor = input.clone();
-        loop {
-            if cursor.is_empty() {
-                break;
-            }
-            match T::from_tokens(&cursor) {
-                Err(ParseError::Error(fatal)) => return Err(ParseError::Error(fatal)),
-                Err(ParseError::Mismatch(_)) => break,
-                Ok(Parsed {
-                    value,
-                    rest,
-                    diagnostics: mut found,
-                }) => {
-                    notes.append(&mut found.0);
-                    if rest.len() == cursor.len() {
-                        // Nothing consumed: give up instead of spinning.
-                        break;
-                    }
-                    values.push(value);
-                    cursor = rest;
-                }
-            }
-        }
-        if values.is_empty() {
-            return Err(ParseError::Mismatch(cursor.error("Syntax error.").into()));
-        }
-        Ok(Parsed {
-            value: Seq1(values),
-            rest: cursor,
-            diagnostics: Diagnostics(notes),
-        })
-    }
-}
-
-/*
-impl<T> FromTokens for Vec<T>
-where
- T: FromTokens,
-{
- fn from_tokens(mut input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- let mut values_vec = Vec::new();
- let mut warnings_vec = Vec::new();
- while !input.is_empty() {
- match T::from_tokens(input) {
- Ok(Parsed {
- value,
- rest,
- diagnostics: mut warnings,
- }) => {
- values_vec.push(value);
- warnings_vec.append(&mut warnings.0);
- input = rest;
- }
- Err(ParseError::Mismatch(_)) => break,
- Err(ParseError::Error(e)) => return Err(ParseError::Error(e)),
- }
- }
- Ok(Parsed {
- value: values_vec,
- rest: input,
- diagnostics: Diagnostics(warnings_vec),
- })
- }
-}*/
-
-impl FromTokens for TokenSlice {
-    /// Accepts the whole of `input` as the parsed value; the remainder is
-    /// whatever `input.end()` yields.
-    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
-    where
-        Self: Sized,
-    {
-        let everything = input.clone();
-        let remainder = input.end();
-        Ok(Parsed::ok(everything, remainder))
-    }
-}
-
-/// A single subcommand: the tokens up to the next `/`, parsed as a `T`.
-#[derive(Debug)]
-struct Subcommand<T>(pub T);
-
-impl<T> FromTokens for Subcommand<T>
-where
-    T: FromTokens,
-{
-    /// Skips leading slashes, parses the tokens before the next slash as `T`,
-    /// and warns if `T` did not consume all of them.  Fails with
-    /// [ParseError::Error] when nothing follows the leading slashes, and
-    /// propagates any failure of `T`.
-    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
-    where
-        Self: Sized,
-    {
-        let start = input.skip_until(|token| token != &Token::Punct(Punct::Slash));
-        if start.is_empty() {
-            let diagnostic = input.error("Syntax error at end of input.");
-            return Err(ParseError::Error(diagnostic.into()));
-        }
-
-        // The subcommand body is the part of `start` that precedes `end`.
-        let end = start.skip_to(&Token::Punct(Punct::Slash));
-        let body_len = start.len() - end.len();
-        let body = start.subslice(0..body_len);
-
-        let parsed = T::from_tokens(&body)?;
-        let (value, leftover, mut diagnostics) = parsed.into_tuple();
-        if !leftover.is_empty() {
-            let warning = leftover.warning("Syntax error expecting end of subcommand.");
-            diagnostics.0.push(warning);
-        }
-        Ok(Parsed::new(Self(value), end, diagnostics))
-    }
-}
-
-/// A `T` enclosed in parentheses.
-#[derive(Debug)]
-struct InParens<T>(pub T);
-
-impl<T> FromTokens for InParens<T>
-where
-    T: FromTokens,
-{
-    /// Parses `(`, then a `T`, then `)`.  Only the diagnostics produced by `T`
-    /// are returned.
-    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
-    where
-        Self: Sized,
-    {
-        let Parsed {
-            rest: after_open, ..
-        } = parse_token(input, &Token::Punct(Punct::LParen))?;
-        let Parsed {
-            value,
-            rest: before_close,
-            diagnostics,
-        } = T::from_tokens(&after_open)?;
-        let Parsed { rest, .. } = parse_token(&before_close, &Token::Punct(Punct::RParen))?;
-        Ok(Parsed {
-            value: Self(value),
-            rest,
-            diagnostics,
-        })
-    }
-}
-
-/// A `T` enclosed in square brackets.
-#[derive(Debug)]
-struct InSquares<T>(pub T);
-
-impl<T> FromTokens for InSquares<T>
-where
-    T: FromTokens,
-{
-    /// Parses `[`, then a `T`, then `]`.  Only the diagnostics produced by `T`
-    /// are returned.
-    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
-    where
-        Self: Sized,
-    {
-        let open = parse_token(input, &Token::Punct(Punct::LSquare))?;
-        let inner = T::from_tokens(&open.rest)?;
-        let close = parse_token(&inner.rest, &Token::Punct(Punct::RSquare))?;
-        Ok(Parsed {
-            value: Self(inner.value),
-            rest: close.rest,
-            diagnostics: inner.diagnostics,
-        })
-    }
-}
-
-/// Applies `parse` to the first token of `input`.
-///
-/// If there is a first token and `parse` returns `Some` for it, the result is
-/// returned and that one token is consumed.  Otherwise this is a mismatch that
-/// carries no diagnostics.
-fn parse_token_if<F, R>(input: &TokenSlice, parse: F) -> ParseResult<R>
-where
-    F: Fn(&Token) -> Option<R>,
-{
-    match input.get_token(0).and_then(|token| parse(token)) {
-        Some(result) => Ok(Parsed::ok(result, input.subslice(1..input.len()))),
-        None => Err(ParseError::Mismatch(Diagnostics::default())),
-    }
-}
-
-/// Like [parse_token], but on success returns a clone of the first token of
-/// `input` instead of `()`.
-fn _parse_token(input: &TokenSlice, token: &Token) -> ParseResult<Token> {
-    let Some(rest) = input.skip(token) else {
-        let diagnostic = input.error(format!("expecting {token}"));
-        return Err(ParseError::Mismatch(diagnostic.into()));
-    };
-    Ok(Parsed::ok(input.first().token.clone(), rest))
-}
-
-/// Consumes `token` from the front of `input`.
-///
-/// Succeeds with the rest of the input when `input.skip(token)` does;
-/// otherwise reports a mismatch whose diagnostic says `expecting <token>`.
-fn parse_token(input: &TokenSlice, token: &Token) -> ParseResult<()> {
-    input
-        .skip(token)
-        .map(|rest| Parsed::ok((), rest))
-        .ok_or_else(|| ParseError::Mismatch(input.error(format!("expecting {token}")).into()))
-}
-
-/// Consumes the tokens that `input.skip_syntax(syntax)` recognizes.
-///
-/// On failure reports a mismatch whose diagnostic says `expecting <syntax>`.
-fn parse_syntax(input: &TokenSlice, syntax: &str) -> ParseResult<()> {
-    match input.skip_syntax(syntax) {
-        Some(rest) => Ok(Parsed::ok((), rest)),
-        None => {
-            let diagnostic = input.error(format!("expecting {syntax}"));
-            Err(ParseError::Mismatch(diagnostic.into()))
-        }
-    }
-}
-
-/// A list of variables: a [Punctuated] sequence of [VarRange]s.
-pub type VarList = Punctuated<VarRange>;
-
-/// A numeric token.
-pub struct Number(f64);
-
-impl Debug for Number {
-    /// Formats as the bare `f64`, without a `Number(...)` wrapper.
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let Number(value) = self;
-        write!(f, "{:?}", value)
-    }
-}
-
-impl FromTokens for Number {
-    /// Consumes one token for which `Token::as_number` returns a value;
-    /// anything else is a mismatch with an `expecting number` diagnostic.
-    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
-    where
-        Self: Sized,
-    {
-        let parsed = parse_token_if(input, |token| token.as_number().map(Number));
-        parsed.map_err(|_| {
-            let diagnostic = input.error(String::from("expecting number"));
-            ParseError::Mismatch(diagnostic.into())
-        })
-    }
-}
-
-/// An integer token.
-#[derive(Debug)]
-pub struct Integer(i64);
-
-impl FromTokens for Integer {
-    /// Consumes one token for which `Token::as_integer` returns a value;
-    /// anything else is a mismatch with an `expecting integer` diagnostic.
-    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
-    where
-        Self: Sized,
-    {
-        match parse_token_if(input, |token| token.as_integer().map(Integer)) {
-            Ok(parsed) => Ok(parsed),
-            Err(_) => Err(ParseError::Mismatch(
-                input.error(String::from("expecting integer")).into(),
-            )),
-        }
-    }
-}
-
-/// One element of a variable list.
-pub enum VarRange {
-    /// A single variable name.
-    Single(Identifier),
-    /// An inclusive range written `from TO to`.
-    Range(Identifier, Identifier),
-    /// The keyword `ALL`.
-    All,
-}
-
-impl Debug for VarRange {
-    /// Formats the element roughly the way it is written in syntax.
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::All => f.write_str("ALL"),
-            Self::Single(var) => write!(f, "{var:?}"),
-            Self::Range(from, to) => write!(f, "{from:?} TO {to:?}"),
-        }
-    }
-}
-
-impl FromTokens for VarRange {
-    /// Parses `ALL`, `id TO id`, or a lone `id`.
-    ///
-    /// If `TO` is not followed by an identifier, only the first identifier is
-    /// consumed and it is returned as [VarRange::Single].
-    fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
-    where
-        Self: Sized,
-    {
-        if let Ok(Parsed { rest, .. }) = parse_token(input, &Token::Punct(Punct::All)) {
-            return Ok(Parsed::ok(Self::All, rest));
-        }
-
-        let (from, after_from, _) = parse_id(input)?.into_tuple();
-        let range_end = parse_token(&after_from, &Token::Punct(Punct::To))
-            .ok()
-            .and_then(|to_keyword| parse_id(&to_keyword.rest).ok());
-        Ok(match range_end {
-            Some(to) => to.map(|to| Self::Range(from, to)),
-            None => Parsed::ok(Self::Single(from), after_from),
-        })
-    }
-}
-
-/// Consumes one identifier token from the front of `input`.
-///
-/// Returns a mismatch if `input` is empty or starts with any other token.
-fn parse_id(input: &TokenSlice) -> ParseResult<Identifier> {
-    let mut tokens = input.iter();
-    match tokens.next() {
-        Some(LexToken {
-            token: Token::Id(id),
-            ..
-        }) => Ok(Parsed::ok(id.clone(), tokens.remainder())),
-        _ => Err(ParseError::Mismatch(
-            input.error("Syntax error expecting identifier.").into(),
-        )),
-    }
-}
-
-/// Consumes one identifier token and parses its text as an [AbstractFormat].
-///
-/// Returns a mismatch if `input` does not start with an identifier or if the
-/// identifier's text does not parse as a format.  The diagnostic now names a
-/// format; it previously said "identifier", which was misleading when an
-/// identifier was present but was not a valid format.
-fn parse_format(input: &TokenSlice) -> ParseResult<AbstractFormat> {
-    let mut iter = input.iter();
-    if let Some(LexToken {
-        token: Token::Id(id),
-        ..
-    }) = iter.next()
-    {
-        if let Ok(format) = id.0.as_ref().parse() {
-            return Ok(Parsed::ok(format, iter.remainder()));
-        }
-    }
-    Err(ParseError::Mismatch(
-        input.error("Syntax error expecting format.").into(),
-    ))
-}
-
-/// Consumes one string token from the front of `input`.
-///
-/// Returns a mismatch if `input` is empty or starts with any other token.  The
-/// diagnostic now says a string was expected; it previously said
-/// "identifier".
-fn parse_string(input: &TokenSlice) -> ParseResult<String> {
-    let mut iter = input.iter();
-    if let Some(LexToken {
-        token: Token::String(s),
-        ..
-    }) = iter.next()
-    {
-        Ok(Parsed::ok(s.clone(), iter.remainder()))
-    } else {
-        Err(ParseError::Mismatch(
-            input.error("Syntax error expecting string.").into(),
-        ))
-    }
-}
-
-impl FromTokens for Identifier {
- /// Parses a single identifier token; delegates to [parse_id].
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- parse_id(input)
- }
-}
-
-impl FromTokens for String {
- /// Parses a single string token; delegates to [parse_string].
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- parse_string(input)
- }
-}
-
-impl FromTokens for AbstractFormat {
- /// Parses an identifier token as a format; delegates to [parse_format].
- fn from_tokens(input: &TokenSlice) -> ParseResult<Self>
- where
- Self: Sized,
- {
- parse_format(input)
- }
-}
-
-/// Splits `src` at every `/` token and returns the non-empty pieces in order.
-fn collect_subcommands(src: TokenSlice) -> Vec<TokenSlice> {
-    let mut subcommands = Vec::new();
-    for piece in src.split(|token| token.token == Token::Punct(Punct::Slash)) {
-        if !piece.is_empty() {
-            subcommands.push(piece);
-        }
-    }
-    subcommands
-}
-
-/// Returns the table of known commands.
-///
-/// The table is built on first use and then shared for the life of the
-/// process.  `ECHO` is registered, but running it hits `todo!()`.
-fn commands() -> &'static [Command] {
-    static COMMANDS: OnceLock<Vec<Command>> = OnceLock::new();
-    COMMANDS
-        .get_or_init(|| {
-            vec![
-                descriptives_command(),
-                crosstabs_command(),
-                ctables_command(),
-                data_list_command(),
-                Command {
-                    allowed_states: FlagSet::full(),
-                    enhanced_only: false,
-                    testing_only: false,
-                    no_abbrev: false,
-                    name: "ECHO",
-                    run: Box::new(|_context| todo!()),
-                },
-            ]
-        })
-        .as_slice()
-}
-
-/// Appends token `n` of `lexer` to the command name accumulated in `s`.
-///
-/// A dash is appended as-is.  An identifier, or a number that is non-negative
-/// and converts exactly to `usize`, is appended after a single space, unless
-/// `s` is empty or ends in a dash.  Returns `false`, leaving `s` untouched,
-/// for any other token or when there is no token `n`.
-fn parse_command_word(lexer: &mut TokenSlice, s: &mut String, n: usize) -> bool {
-    let separator = if s.is_empty() || s.ends_with('-') {
-        ""
-    } else {
-        " "
-    };
-
-    let Some(token) = lexer.get_token(n) else {
-        return false;
-    };
-    match token {
-        Token::Punct(Punct::Dash) => s.push('-'),
-        Token::Id(id) => write!(s, "{separator}{id}").unwrap(),
-        Token::Number(number) if number.is_sign_positive() => match number.to_exact_usize() {
-            Some(integer) => write!(s, "{separator}{integer}").unwrap(),
-            None => return false,
-        },
-        _ => return false,
-    }
-    true
-}
-
-/// Matches `s` against the names of all registered commands.
-///
-/// Every command from [commands] is offered to a [CommandMatcher] built for
-/// `s`, and the matcher's verdict is returned unchanged: the matching command,
-/// if any, and a word count that [parse_command_name] treats as "more words
-/// needed" when positive.
-fn find_best_match(s: &str) -> (Option<&'static Command>, isize) {
- let mut cm = CommandMatcher::new(s);
- for command in commands() {
- cm.add(command.name, command);
- }
- cm.get_match()
-}
-
-/// Identifies the command named by the leading tokens of `lexer`.
-///
-/// Words are accumulated one token at a time (see [parse_command_word]) and
-/// matched against the command table until the matcher no longer asks for
-/// more words.  On success returns the command and the number of tokens that
-/// make up its name.  On failure reports a diagnostic through `error` and
-/// returns `Err(())`.
-fn parse_command_name(
-    lexer: &mut TokenSlice,
-    error: &dyn Fn(Diagnostic),
-) -> Result<(&'static Command, usize), ()> {
-    let mut s = String::new();
-    let mut word = 0;
-    let mut missing_words = 0;
-    let mut command = None;
-    while parse_command_word(lexer, &mut s, word) {
-        (command, missing_words) = find_best_match(&s);
-        if missing_words <= 0 {
-            break;
-        }
-        word += 1;
-    }
-    if command.is_none() && missing_words > 0 {
-        // Retry with a trailing " ." appended, then restore `s` so that the
-        // diagnostic below shows only what the user wrote.
-        s.push_str(" .");
-        (command, missing_words) = find_best_match(&s);
-        s.truncate(s.len() - 2);
-    }
-
-    match command {
-        Some(command) => Ok((command, ((word as isize + 1) + missing_words) as usize)),
-        None => {
-            if word == 0 {
-                error(
-                    lexer
-                        .subslice(0..1)
-                        .error("Syntax error expecting command name"),
-                )
-            } else {
-                // `format!` is required here: a plain string literal would
-                // print `{s}` verbatim instead of the command text.
-                error(
-                    lexer
-                        .subslice(0..word + 1)
-                        .error(format!("Unknown command `{s}`.")),
-                )
-            };
-            Err(())
-        }
-    }
-}
-
-/// Successful outcome of a command.
-///
-/// NOTE(review): only `Success` is constructed in the code visible here (by
-/// [end_of_command]); the intended meaning of `Eof` and `Finish` should be
-/// confirmed against their users.
-pub enum Success {
- Success,
- Eof,
- Finish,
-}
-
-/// Checks that the command ends at token `range.start` of `context.lexer`.
-///
-/// Succeeds if there is no such token or it is [Token::End].  Otherwise
-/// reports "Syntax error expecting end of command." for the remaining tokens
-/// and returns `Err(())`.
-pub fn end_of_command(context: &Context, range: RangeFrom<usize>) -> Result<Success, ()> {
-    let first = range.start;
-    if matches!(context.lexer.get_token(first), None | Some(Token::End)) {
-        return Ok(Success::Success);
-    }
-
-    let trailing = context.lexer.subslice(first..context.lexer.len());
-    context.error(trailing.error("Syntax error expecting end of command."));
-    Err(())
-}
-
-/// Parses and runs the single command held in `lexer`.
-///
-/// Does nothing for an empty command.  Otherwise the command name is looked
-/// up and the command's `run` callback is invoked with a [Context] whose
-/// lexer holds the tokens after the name.  `_state` is currently unused.
-fn parse_in_state(mut lexer: TokenSlice, error: &dyn Fn(Diagnostic), _state: State) {
-    if matches!(lexer.get_token(0), None | Some(Token::End)) {
-        return;
-    }
-
-    // On failure `parse_command_name` has already reported a diagnostic
-    // through `error`, so there is nothing more to do (and nothing useful to
-    // print: the error value is `()`).
-    if let Ok((command, n_tokens)) = parse_command_name(&mut lexer, error) {
-        let mut context = Context {
-            error,
-            lexer: lexer.subslice(n_tokens..lexer.len()),
-            command_name: Some(command.name),
-        };
-        (command.run)(&mut context);
-    }
-}
-
-/// Parses and runs the command in `lexer`, reporting diagnostics through
-/// `error`.  Parsing starts in [State::Initial].
-pub fn parse_command(lexer: TokenSlice, error: &dyn Fn(Diagnostic)) {
- parse_in_state(lexer, error, State::Initial)
-}
-
-/// State handed to a command's `run` callback.
-pub struct Context<'a> {
- /// Callback that receives each diagnostic the command reports.
- error: &'a dyn Fn(Diagnostic),
- /// The command's tokens; [parse_in_state] passes those that follow the
- /// command name.
- lexer: TokenSlice,
- /// Name of the command being run, as registered in the command table.
- command_name: Option<&'static str>,
-}
-
-impl Context<'_> {
- /// Reports `diagnostic` through the error callback.
- pub fn error(&self, diagnostic: Diagnostic) {
- (self.error)(diagnostic);
- }
-}
--- /dev/null
+//! # Decryption for SPSS encrypted files
+//!
+//! SPSS supports encryption using a password for data, viewer, and syntax
+//! files. The encryption mechanism is poorly designed, so this module provides
+//! support for decrypting, but not encrypting, the SPSS format.
+//! Use [EncryptedFile] as the starting point for reading an encrypted file.
+//!
+//! SPSS also supports what it calls "encrypted passwords". Use [EncodedPassword]
+//! to encode and decode these passwords.
+
+// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
+#![cfg_attr(not(test), warn(missing_docs))]
+
+use aes::{
+ cipher::{generic_array::GenericArray, BlockDecrypt, KeyInit},
+ Aes256, Aes256Dec,
+};
+use cmac::{Cmac, Mac};
+use smallvec::SmallVec;
+use std::{
+ fmt::Debug,
+ io::{BufRead, Error as IoError, ErrorKind, Read, Seek, SeekFrom},
+};
+use thiserror::Error as ThisError;
+
+use binrw::{io::NoSeek, BinRead};
+
+/// Error reading an encrypted file.
+#[derive(Clone, Debug, ThisError)]
+pub enum Error {
+ /// I/O error.
+ ///
+ /// Only the [ErrorKind] of the underlying error is kept.
+ #[error("I/O error reading encrypted file wrapper ({0})")]
+ IoError(ErrorKind),
+
+ /// Invalid padding in final encrypted data block.
+ ///
+ /// NOTE(review): nothing in the code visible here constructs this variant;
+ /// bad padding makes unlocking fail by handing the file back instead.
+ #[error("Invalid padding in final encrypted data block")]
+ InvalidPadding,
+
+ /// Not an encrypted file.
+ #[error("Not an encrypted file")]
+ NotEncrypted,
+
+ /// Encrypted file has invalid length.
+ ///
+ /// The payload is the offending length in bytes.
+ #[error("Encrypted file has invalid length {0} (expected 4 more than a multiple of 16).")]
+ InvalidLength(u64),
+
+ /// Unknown file type.
+ ///
+ /// The payload is the three-byte type tag from the header, as text.
+ #[error("Unknown file type {0:?}.")]
+ UnknownFileType(String),
+}
+
+impl From<std::io::Error> for Error {
+ /// Converts an I/O error into [Error::IoError], keeping only its kind.
+ fn from(value: std::io::Error) -> Self {
+ Self::IoError(value.kind())
+ }
+}
+
+/// The fixed header at the start of an encrypted file.
+///
+/// The fields and the magic string add up to 36 bytes (8 + 9 + 3 + 16), which
+/// is the offset at which the ciphertext starts.
+#[derive(BinRead)]
+struct EncryptedHeader {
+ /// Fixed as `1c 00 00 00 00 00 00 00` in practice.
+ _ignore: [u8; 8],
+
+ /// File type.
+ ///
+ /// Read only after the 9-byte magic `ENCRYPTED` has been matched.
+ #[br(magic = b"ENCRYPTED")]
+ file_type: [u8; 3],
+
+ /// Fixed as `15 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00` in practice.
+ _ignore2: [u8; 16],
+}
+
+/// An encrypted file.
+pub struct EncryptedFile<R> {
+ /// Underlying reader, left positioned at the start of the ciphertext.
+ reader: R,
+
+ /// Type of file, from the header.
+ file_type: FileType,
+
+ /// Total length of the file in bytes, including the 36-byte header. The
+ /// ciphertext occupies the remaining `length - 36` bytes.
+ length: u64,
+
+ /// First block of ciphertext, for verifying that any password the user
+ /// tries is correct.
+ first_block: [u8; 16],
+
+ /// Last block of ciphertext, for checking padding and determining the
+ /// plaintext length.
+ last_block: [u8; 16],
+}
+
+/// Type of encrypted file.
+///
+/// The type is taken from the three-byte tag that follows `ENCRYPTED` in the
+/// file header.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum FileType {
+ /// A `.sps` syntax file (header tag `SPS`).
+ Syntax,
+
+ /// A `.spv` viewer file (header tag `SPV`).
+ Viewer,
+
+ /// A `.sav` data file (header tag `SAV`).
+ Data,
+}
+
+impl<R> EncryptedFile<R>
+where
+    R: Read + Seek,
+{
+    /// Opens `reader` as an encrypted file.
+    ///
+    /// This reads enough of the file to verify that it is in the expected
+    /// format and returns an error if it cannot be read or is not the expected
+    /// format.
+    ///
+    /// `reader` doesn't need to be [BufRead], and probably should not be.  The
+    /// [EncryptedReader] returned by [unlock] or [unlock_literal] will be
+    /// [BufRead].
+    ///
+    /// # Errors
+    ///
+    /// - [Error::NotEncrypted] if the `ENCRYPTED` magic is missing.
+    /// - [Error::UnknownFileType] if the type tag is not `SAV`, `SPV`, or
+    ///   `SPS`.
+    /// - [Error::InvalidLength], carrying the file's length, if the file does
+    ///   not hold a whole number of 16-byte blocks (at least one) after the
+    ///   36-byte header.
+    /// - [Error::IoError] if reading or seeking fails.
+    ///
+    /// [unlock]: Self::unlock
+    /// [unlock_literal]: Self::unlock_literal
+    pub fn new(mut reader: R) -> Result<Self, Error> {
+        let header =
+            EncryptedHeader::read_le(&mut NoSeek::new(&mut reader)).map_err(
+                |error| match error {
+                    binrw::Error::BadMagic { .. } => Error::NotEncrypted,
+                    binrw::Error::Io(error) => Error::IoError(error.kind()),
+                    _ => unreachable!(),
+                },
+            )?;
+        let file_type = match &header.file_type {
+            b"SAV" => FileType::Data,
+            b"SPV" => FileType::Viewer,
+            b"SPS" => FileType::Syntax,
+            _ => {
+                return Err(Error::UnknownFileType(
+                    header.file_type.iter().map(|b| *b as char).collect(),
+                ))
+            }
+        };
+        let mut first_block = [0; 16];
+        reader.read_exact(&mut first_block)?;
+
+        // Seeking to 16 bytes before the end yields `file length - 16`, so
+        // `length` is the length of the whole file, header included.
+        let length = reader.seek(SeekFrom::End(-16))? + 16;
+        if length < 36 + 16 || (length - 36) % 16 != 0 {
+            // Report the length as it is; it already includes the header.
+            return Err(Error::InvalidLength(length));
+        }
+        let mut last_block = [0; 16];
+        reader.read_exact(&mut last_block)?;
+
+        // Leave the reader at the start of the ciphertext.
+        reader.seek(SeekFrom::Start(36))?;
+        Ok(Self {
+            reader,
+            file_type,
+            length,
+            first_block,
+            last_block,
+        })
+    }
+
+    /// Tries to unlock the encrypted file using both `password` and with
+    /// `password` decoded with [EncodedPassword::decode].  If successful,
+    /// returns an [EncryptedReader] for the file; on failure, returns the
+    /// [EncryptedFile] again for another try.
+    pub fn unlock(self, password: &[u8]) -> Result<EncryptedReader<R>, Self> {
+        self.unlock_literal(password).or_else(|this| {
+            match EncodedPassword::from_encoded(password) {
+                Some(encoded) => this.unlock_literal(&encoded.decode()),
+                None => Err(this),
+            }
+        })
+    }
+
+    /// Tries to unlock the encrypted file using just `password`.  If
+    /// successful, returns an [EncryptedReader] for the file; on failure,
+    /// returns the [EncryptedFile] again for another try.
+    ///
+    /// If the password itself might be encoded ("encrypted"), instead use
+    /// [Self::unlock] to try it both ways.
+    pub fn unlock_literal(self, password: &[u8]) -> Result<EncryptedReader<R>, Self> {
+        // NIST SP 800-108 fixed data.
+        #[rustfmt::skip]
+        static FIXED: &[u8] = &[
+            // i
+            0x00, 0x00, 0x00, 0x01,
+
+            // label
+            0x35, 0x27, 0x13, 0xcc, 0x53, 0xa7, 0x78, 0x89,
+            0x87, 0x53, 0x22, 0x11, 0xd6, 0x5b, 0x31, 0x58,
+            0xdc, 0xfe, 0x2e, 0x7e, 0x94, 0xda, 0x2f, 0x00,
+            0xcc, 0x15, 0x71, 0x80, 0x0a, 0x6c, 0x63, 0x53,
+
+            // delimiter
+            0x00,
+
+            // context
+            0x38, 0xc3, 0x38, 0xac, 0x22, 0xf3, 0x63, 0x62,
+            0x0e, 0xce, 0x85, 0x3f, 0xb8, 0x07, 0x4c, 0x4e,
+            0x2b, 0x77, 0xc7, 0x21, 0xf5, 0x1a, 0x80, 0x1d,
+            0x67, 0xfb, 0xe1, 0xe1, 0x83, 0x07, 0xd8, 0x0d,
+
+            // L
+            0x00, 0x00, 0x01, 0x00,
+        ];
+
+        // Truncate password to at most 10 bytes.
+        let password = password.get(..10).unwrap_or(password);
+        let n = password.len();
+
+        // padded_password = password padded with zeros to 32 bytes.
+        let mut padded_password = [0; 32];
+        padded_password[..n].copy_from_slice(password);
+
+        // cmac = CMAC(padded_password, fixed).
+        let mut cmac = <Cmac<Aes256> as Mac>::new_from_slice(&padded_password).unwrap();
+        cmac.update(FIXED);
+        let cmac = cmac.finalize().into_bytes();
+
+        // The key is the cmac repeated twice.
+        let mut key = [0; 32];
+        key[..16].copy_from_slice(cmac.as_slice());
+        key[16..].copy_from_slice(cmac.as_slice());
+
+        // Use key to initialize AES.
+        let aes = <Aes256Dec as KeyInit>::new_from_slice(&key).unwrap();
+
+        // Decrypt first block to verify password.
+        let mut out = [0; 16];
+        aes.decrypt_block_b2b(
+            GenericArray::from_slice(&self.first_block),
+            GenericArray::from_mut_slice(&mut out),
+        );
+        static MAGIC: &[&[u8]] = &[
+            b"$FL2@(#)",
+            b"$FL3@(#)",
+            b"* Encoding",
+            b"PK\x03\x04\x14\0\x08",
+        ];
+        if !MAGIC.iter().any(|magic| out.starts_with(magic)) {
+            return Err(self);
+        }
+
+        // Decrypt last block to check padding and get final length.
+        aes.decrypt_block_b2b(
+            GenericArray::from_slice(&self.last_block),
+            GenericArray::from_mut_slice(&mut out),
+        );
+        let Some(padding_length) = parse_padding(&out) else {
+            return Err(self);
+        };
+
+        // Plaintext length = file length - header - padding.
+        Ok(EncryptedReader::new(
+            self.reader,
+            aes,
+            self.file_type,
+            self.length - 36 - padding_length as u64,
+        ))
+    }
+
+    /// Returns the type of encrypted file.
+    pub fn file_type(&self) -> FileType {
+        self.file_type
+    }
+}
+
+/// Validates the padding at the end of a decrypted final block.
+///
+/// The last byte gives the padding length `pad`, which must be in `1..=16`,
+/// and each of the last `pad` bytes must equal that value.  Returns the
+/// padding length, or `None` if the padding is malformed.
+fn parse_padding(block: &[u8; 16]) -> Option<usize> {
+    let last = block[15];
+    let pad = usize::from(last);
+    if !(1..=16).contains(&pad) {
+        return None;
+    }
+    block[16 - pad..]
+        .iter()
+        .all(|&byte| byte == last)
+        .then_some(pad)
+}
+
+impl<R> Debug for EncryptedFile<R>
+where
+ R: Read,
+{
+ /// Formats as `EncryptedFile(<file type>)`; the reader and the cached
+ /// ciphertext blocks are not shown.
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "EncryptedFile({:?})", &self.file_type)
+ }
+}
+
+/// Encrypted file reader.
+///
+/// This implements [Read] and [Seek] for SPSS encrypted files. To construct an
+/// [EncryptedReader], call [EncryptedFile::new], then [EncryptedFile::unlock].
+pub struct EncryptedReader<R> {
+ /// Underlying reader.
+ reader: R,
+
+ /// AES-256 decryption key.
+ aes: Aes256Dec,
+
+ /// Type of file.
+ file_type: FileType,
+
+ /// Plaintext file length (not including the file header or padding).
+ length: u64,
+
+ /// Plaintext data buffer.
+ buffer: Box<[u8; 4096]>,
+
+ /// Plaintext offset of the byte in `buffer[0]`. A multiple of 16 less than
+ /// or equal to `length`.
+ start: u64,
+
+ /// Number of bytes in buffer (`0 <= head <= 4096`).
+ head: usize,
+
+ /// Offset in buffer of the next byte to read (normally `tail <= head`).
+ /// The current plaintext position is `start + tail`.
+ tail: usize,
+}
+
+impl<R> EncryptedReader<R> {
+    /// Creates a reader with an empty buffer positioned at plaintext offset 0.
+    /// `length` is the plaintext length.
+    fn new(reader: R, aes: Aes256Dec, file_type: FileType, length: u64) -> Self {
+        let buffer = Box::new([0; 4096]);
+        Self {
+            reader,
+            aes,
+            file_type,
+            length,
+            buffer,
+            start: 0,
+            head: 0,
+            tail: 0,
+        }
+    }
+
+    /// Copies as many buffered plaintext bytes as fit into `buf`, advances
+    /// past them, and returns how many were copied.
+    fn read_buffer(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
+        let available = self.head - self.tail;
+        let n = buf.len().min(available);
+        let end = self.tail + n;
+        buf[..n].copy_from_slice(&self.buffer[self.tail..end]);
+        self.tail = end;
+        Ok(n)
+    }
+
+    /// Returns the type of encrypted file.
+    pub fn file_type(&self) -> FileType {
+        self.file_type
+    }
+}
+
+impl<R> EncryptedReader<R>
+where
+    R: Read,
+{
+    /// Refills the buffer so that the next byte read is the one at plaintext
+    /// `offset`.
+    ///
+    /// The underlying reader must already be positioned at the ciphertext
+    /// block that contains `offset`.  Up to 4096 bytes are read (always whole
+    /// 16-byte blocks) and decrypted in place.  `head` is limited to the
+    /// plaintext length, so padding is never exposed.
+    ///
+    /// If `offset` lies beyond the end of the plaintext, nothing may be left
+    /// to read; the buffer then ends up empty rather than the length
+    /// computation underflowing.
+    fn fill_buffer(&mut self, offset: u64) -> Result<(), IoError> {
+        self.start = offset / 16 * 16;
+        self.head = 0;
+        self.tail = (offset % 16) as usize;
+
+        // Saturate so that a position past the end yields 0 instead of
+        // wrapping, and take the minimum as `u64` so that the cast to `usize`
+        // cannot truncate.
+        let remaining = self.length.saturating_sub(self.start);
+        let n = remaining.min(self.buffer.len() as u64) as usize;
+
+        // Ciphertext comes in whole blocks, so round the read up.
+        let n_cipher = n.next_multiple_of(16);
+        self.reader.read_exact(&mut self.buffer[..n_cipher])?;
+        for block in self.buffer[..n_cipher].chunks_exact_mut(16) {
+            self.aes.decrypt_block(GenericArray::from_mut_slice(block));
+        }
+        self.head = n;
+        Ok(())
+    }
+}
+
+impl<R> Read for EncryptedReader<R>
+where
+    R: Read,
+{
+    /// Reads decrypted bytes into `buf`.
+    ///
+    /// Serves from the buffer while it has unread bytes; otherwise refills it
+    /// from the position just past the buffered data, or returns 0 once that
+    /// position has reached the plaintext length.
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
+        if self.tail >= self.head {
+            let offset = self.start + self.head as u64;
+            if offset >= self.length {
+                return Ok(0);
+            }
+            self.fill_buffer(offset)?;
+        }
+        self.read_buffer(buf)
+    }
+}
+
+impl<R> Seek for EncryptedReader<R>
+where
+    R: Read + Seek,
+{
+    /// Moves to a plaintext offset and returns it.
+    ///
+    /// Offsets that overflow, or that are not below `u64::MAX - 36`, are
+    /// rejected with [ErrorKind::InvalidInput].  If the position changes, the
+    /// underlying reader is moved to the enclosing ciphertext block (36 bytes
+    /// of header are skipped) and the buffer is refilled from there.
+    fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
+        let current = self.start + self.tail as u64;
+        let target = match pos {
+            SeekFrom::Start(offset) => Some(offset),
+            SeekFrom::End(relative) => self.length.checked_add_signed(relative),
+            SeekFrom::Current(relative) => current.checked_add_signed(relative),
+        };
+        let offset = match target {
+            Some(offset) if offset < u64::MAX - 36 => offset,
+            _ => return Err(IoError::from(ErrorKind::InvalidInput)),
+        };
+        if offset != current {
+            let block_start = offset / 16 * 16;
+            self.reader.seek(SeekFrom::Start(block_start + 36))?;
+            self.fill_buffer(offset)?;
+        }
+        Ok(offset)
+    }
+}
+
+impl<R> BufRead for EncryptedReader<R>
+where
+    R: Read + Seek,
+{
+    /// Returns the unread part of the plaintext buffer, refilling it first if
+    /// it is exhausted and more plaintext remains.  Returns an empty slice at
+    /// or beyond the end of the plaintext.
+    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
+        if self.tail >= self.head {
+            let offset = self.start + self.head as u64;
+            if offset < self.length {
+                self.fill_buffer(offset)?;
+            }
+        }
+        // After a seek past the end of the plaintext, `tail` can exceed
+        // `head`.  Clamp it so that the result is an empty slice instead of a
+        // panic on an inverted range.
+        let tail = self.tail.min(self.head);
+        Ok(&self.buffer[tail..self.head])
+    }
+
+    /// Marks `amount` buffered bytes as read.
+    fn consume(&mut self, amount: usize) {
+        self.tail += amount;
+        debug_assert!(self.tail <= self.head);
+    }
+}
+
+/// Returns a `u16` with only bit `x` set. Used to write the bitmask tables
+/// below readably.
+const fn b(x: i32) -> u16 {
+ 1 << x
+}
+
+// Nibble translation tables for encoded passwords.
+//
+// Each row is `[encoded, plaintext]`: two bitmasks over nibble values 0..=15.
+// An encoded nibble whose bit is set in `encoded` may stand for any plaintext
+// nibble whose bit is set in `plaintext`.  Decoding intersects the candidate
+// sets obtained from the two characters of a pair, which must leave exactly
+// one plaintext nibble.
+
+/// High nibble of the first character of a pair.
+static AH: [[u16; 2]; 4] = [
+ [b(2), b(2) | b(3) | b(6) | b(7)],
+ [b(3), b(0) | b(1) | b(4) | b(5)],
+ [b(4) | b(7), b(8) | b(9) | b(12) | b(13)],
+ [b(5) | b(6), b(10) | b(11) | b(14) | b(15)],
+];
+
+/// Low nibble of the first character of a pair.
+static AL: [[u16; 2]; 4] = [
+ [b(0) | b(3) | b(12) | b(15), b(0) | b(1) | b(4) | b(5)],
+ [b(1) | b(2) | b(13) | b(14), b(2) | b(3) | b(6) | b(7)],
+ [b(4) | b(7) | b(8) | b(11), b(8) | b(9) | b(12) | b(13)],
+ [b(5) | b(6) | b(9) | b(10), b(10) | b(11) | b(14) | b(15)],
+];
+
+/// High nibble of the second character of a pair.
+static BH: [[u16; 2]; 4] = [
+ [b(2), b(1) | b(3) | b(9) | b(11)],
+ [b(3), b(0) | b(2) | b(8) | b(10)],
+ [b(4) | b(7), b(4) | b(6) | b(12) | b(14)],
+ [b(5) | b(6), b(5) | b(7) | b(13) | b(15)],
+];
+
+/// Low nibble of the second character of a pair.
+static BL: [[u16; 2]; 4] = [
+ [b(0) | b(3) | b(12) | b(15), b(0) | b(2) | b(8) | b(10)],
+ [b(1) | b(2) | b(13) | b(14), b(1) | b(3) | b(9) | b(11)],
+ [b(4) | b(7) | b(8) | b(11), b(4) | b(6) | b(12) | b(14)],
+ [b(5) | b(6) | b(9) | b(10), b(5) | b(7) | b(13) | b(15)],
+];
+
+/// Looks up encoded `nibble` in `table`.
+///
+/// Returns the plaintext-candidate bitmask of the first row whose encoded
+/// bitmask contains `nibble`, or 0 if no row does.
+fn decode_nibble(table: &[[u16; 2]; 4], nibble: u8) -> u16 {
+    let mask: u16 = 1 << nibble;
+    table
+        .iter()
+        .find(|section| section[0] & mask != 0)
+        .map_or(0, |section| section[1])
+}
+
+/// Returns the index of the single 1-bit in `x`, or `None` if `x` does not
+/// have exactly one bit set.
+fn find_1bit(x: u16) -> Option<u8> {
+    if x.count_ones() == 1 {
+        Some(x.trailing_zeros() as u8)
+    } else {
+        None
+    }
+}
+
+/// Decodes the encoded character pair `a`, `b` into one plaintext byte.
+///
+/// Each plaintext nibble is the unique candidate shared by the two
+/// characters' lookups.  Returns `None` if either nibble is not uniquely
+/// determined.
+fn decode_pair(a: u8, b: u8) -> Option<u8> {
+    let high = decode_nibble(&AH, a >> 4) & decode_nibble(&BH, b >> 4);
+    let low = decode_nibble(&AL, a & 15) & decode_nibble(&BL, b & 15);
+    find_1bit(high)
+        .zip(find_1bit(low))
+        .map(|(x, y)| (x << 4) | y)
+}
+
+/// Returns, in ascending order, every encoded nibble that `table` allows for
+/// plaintext `nibble`.
+///
+/// Panics if no row of `table` covers `nibble`.
+fn encode_nibble(table: &[[u16; 2]; 4], nibble: u8) -> Vec<u8> {
+    let Some(section) = table.iter().find(|section| section[1] & (1 << nibble) != 0) else {
+        unreachable!()
+    };
+    let mut outputs = Vec::with_capacity(4);
+    outputs.extend((0..16u8).filter(|bit| section[0] & (1u16 << *bit) != 0));
+    outputs
+}
+
+/// Returns every character that can encode `byte` with the given pair of
+/// high-nibble and low-nibble tables.
+///
+/// Candidates are all combinations of an allowed high nibble with an allowed
+/// low nibble, high nibble varying slowest, except DEL (127).
+fn encode_byte(hi_table: &[[u16; 2]; 4], lo_table: &[[u16; 2]; 4], byte: u8) -> Vec<char> {
+    let hi_variants = encode_nibble(hi_table, byte >> 4);
+    let lo_variants = encode_nibble(lo_table, byte & 15);
+    let mut variants = Vec::with_capacity(hi_variants.len() * lo_variants.len());
+    variants.extend(
+        hi_variants
+            .iter()
+            .flat_map(|&hi| lo_variants.iter().map(move |&lo| (hi << 4) | lo))
+            .filter(|&code| code != 127)
+            .map(char::from),
+    );
+    variants
+}
+
+/// An encoded password.
+///
+/// SPSS calls these "encrypted passwords", but they are not encrypted. They
+/// are encoded with a simple scheme, analogous to base64 encoding but
+/// one-to-many: any plaintext password maps to many possible encoded passwords.
+///
+/// The encoding scheme maps each plaintext password byte to 2 ASCII characters,
+/// using only at most the first 10 bytes of the plaintext password. Thus, an
+/// encoded password is always a multiple of 2 characters long, and never longer
+/// than 20 characters. The characters that the encoder produces are always in
+/// the ASCII range 32 (space) through 126. Each successive pair of characters
+/// in the password encodes a single byte in the plaintext password.
+///
+/// This struct supports both encoding and decoding passwords.
+///
+/// Internally there is one entry per encoded character position, listing the
+/// characters that may appear at that position.
+#[derive(Clone, Debug)]
+pub struct EncodedPassword(Vec<Vec<char>>);
+
+impl EncodedPassword {
+    /// Creates an [EncodedPassword] from an already-encoded password `encoded`.
+    /// Returns `None` if `encoded` is not a valid encoded password, that is,
+    /// if it is longer than 20 bytes, has odd length, or contains a byte
+    /// outside `32..=127`.
+    pub fn from_encoded(encoded: &[u8]) -> Option<Self> {
+        let well_formed = encoded.len() <= 20
+            && encoded.len() % 2 == 0
+            && encoded.iter().all(|byte| (32..=127).contains(byte));
+        if !well_formed {
+            return None;
+        }
+
+        let positions = encoded
+            .iter()
+            .map(|&byte| vec![char::from(byte)])
+            .collect();
+        Some(EncodedPassword(positions))
+    }
+
+    /// Returns an [EncodedPassword] as an encoded version of the given
+    /// `plaintext` password.  Only the first 10 bytes, at most, of the
+    /// plaintext password is used.
+    pub fn from_plaintext(plaintext: &[u8]) -> EncodedPassword {
+        let used = &plaintext[..plaintext.len().min(10)];
+        let mut positions = Vec::with_capacity(used.len() * 2);
+        for &byte in used {
+            positions.push(encode_byte(&AH, &AL, byte));
+            positions.push(encode_byte(&BH, &BL, byte));
+        }
+        EncodedPassword(positions)
+    }
+
+    /// Returns the number of variations of this encoded password.
+    ///
+    /// An [EncodedPassword] created by [EncodedPassword::from_plaintext] has
+    /// many variations: between `16**n` and `32**n` for an `n`-byte plaintext
+    /// password, so up to `32**10` (about 1e15) for the 10-byte longest
+    /// plaintext passwords.
+    ///
+    /// An [EncodedPassword] created by [EncodedPassword::from_encoded] has only
+    /// a single variation, the one passed in by that function.
+    pub fn n_variants(&self) -> u64 {
+        let Self(positions) = self;
+        positions.iter().map(|choices| choices.len() as u64).product()
+    }
+
+    /// Returns one variation of this encoded password, numbered `index`.  All
+    /// variations decode the same way.
+    pub fn variant(&self, mut index: u64) -> String {
+        let mut output = String::with_capacity(20);
+        // Treat `index` as a mixed-radix number with one digit per position.
+        output.extend(self.0.iter().map(|choices| {
+            let n = choices.len() as u64;
+            let choice = choices[(index % n) as usize];
+            index /= n;
+            choice
+        }));
+        output
+    }
+
+    /// Returns the decoded version of this encoded password.
+    ///
+    /// Only the first variation of each character position is used.
+    pub fn decode(&self) -> SmallVec<[u8; 10]> {
+        let (pairs, _) = self.0.as_chunks::<2>();
+        let mut output = SmallVec::new();
+        for [first, second] in pairs {
+            let byte = decode_pair(first[0] as u8, second[0] as u8).unwrap();
+            output.push(byte);
+        }
+        output
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::{io::Cursor, path::Path};
+
+    use crate::crypto::{EncodedPassword, EncryptedFile, FileType};
+
+    /// Decrypts `input_name` with `password` and checks that the file type is
+    /// `file_type` and that the plaintext equals the contents of
+    /// `expected_name`.  Both names are relative to the test data directory.
+    fn test_decrypt(input_name: &Path, expected_name: &Path, password: &str, file_type: FileType) {
+        let testdata = Path::new("src/crypto/testdata");
+
+        let ciphertext = std::fs::read(testdata.join(input_name)).unwrap();
+        let mut cursor = Cursor::new(&ciphertext);
+        let file = EncryptedFile::new(&mut cursor).unwrap();
+        assert_eq!(file.file_type(), file_type);
+        let mut reader = file.unlock_literal(password.as_bytes()).unwrap();
+        assert_eq!(reader.file_type(), file_type);
+        let mut actual = Vec::new();
+        std::io::copy(&mut reader, &mut actual).unwrap();
+
+        let expected = std::fs::read(testdata.join(expected_name)).unwrap();
+        // A plain `assert!` keeps a failure from dumping both files.
+        assert!(actual == expected);
+    }
+
+    #[test]
+    fn sys_file() {
+        let (input, expected) = (Path::new("test-encrypted.sav"), Path::new("test.sav"));
+        test_decrypt(input, expected, "pspp", FileType::Data);
+    }
+
+    #[test]
+    fn syntax_file() {
+        let (input, expected) = (Path::new("test-encrypted.sps"), Path::new("test.sps"));
+        test_decrypt(input, expected, "password", FileType::Syntax);
+    }
+
+    #[test]
+    fn spv_file() {
+        let (input, expected) = (Path::new("test-encrypted.spv"), Path::new("test.spv"));
+        test_decrypt(input, expected, "Password1", FileType::Viewer);
+    }
+
+    #[test]
+    fn password_encoding() {
+        /// Decodes an encoded password that is known to be well formed.
+        fn decode(encoded: &[u8]) -> Vec<u8> {
+            EncodedPassword::from_encoded(encoded)
+                .unwrap()
+                .decode()
+                .to_vec()
+        }
+
+        // Decode a few specific passwords.
+        assert_eq!(decode(b"-|").as_slice(), b"b");
+        assert_eq!(decode(b" A").as_slice(), b"a");
+
+        // Check that the encoding and decoding algorithms are inverses
+        // for individual characters at least.
+        for plaintext in 0..=255 {
+            let encoded = EncodedPassword::from_plaintext(&[plaintext]);
+            for index in 0..encoded.n_variants() {
+                let text = encoded.variant(index);
+                assert_eq!(&[plaintext], decode(text.as_bytes()).as_slice());
+            }
+        }
+    }
+}
+++ /dev/null
-//! # Decryption for SPSS encrypted files
-//!
-//! SPSS supports encryption using a password for data, viewer, and syntax
-//! files. The encryption mechanism is poorly designed, so this module provides
-//! support for decrypting, but not encrypting, the SPSS format.
-//! Use [EncryptedFile] as the starting point for reading an encrypted file.
-//!
-//! SPSS also supports what it calls "encrypted passwords". Use [EncodedPassword]
-//! to encode and decode these passwords.
-
-// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
-#![cfg_attr(not(test), warn(missing_docs))]
-
-use aes::{
- cipher::{generic_array::GenericArray, BlockDecrypt, KeyInit},
- Aes256, Aes256Dec,
-};
-use cmac::{Cmac, Mac};
-use smallvec::SmallVec;
-use std::{
- fmt::Debug,
- io::{BufRead, Error as IoError, ErrorKind, Read, Seek, SeekFrom},
-};
-use thiserror::Error as ThisError;
-
-use binrw::{io::NoSeek, BinRead};
-
-/// Error reading an encrypted file.
-#[derive(Clone, Debug, ThisError)]
-pub enum Error {
- /// I/O error.
- ///
- /// Only the [ErrorKind] of the underlying error is kept.
- #[error("I/O error reading encrypted file wrapper ({0})")]
- IoError(ErrorKind),
-
- /// Invalid padding in final encrypted data block.
- #[error("Invalid padding in final encrypted data block")]
- InvalidPadding,
-
- /// Not an encrypted file.
- #[error("Not an encrypted file")]
- NotEncrypted,
-
- /// Encrypted file has invalid length.
- ///
- /// The payload is the offending length in bytes.
- #[error("Encrypted file has invalid length {0} (expected 4 more than a multiple of 16).")]
- InvalidLength(u64),
-
- /// Unknown file type.
- ///
- /// The payload is the three-byte type tag from the header, as text.
- #[error("Unknown file type {0:?}.")]
- UnknownFileType(String),
-}
-
-impl From<std::io::Error> for Error {
- /// Converts an I/O error into [Error::IoError], keeping only its kind.
- fn from(value: std::io::Error) -> Self {
- Self::IoError(value.kind())
- }
-}
-
-/// The fixed header at the start of an encrypted file.
-///
-/// The fields and the magic string add up to 36 bytes (8 + 9 + 3 + 16), which
-/// is the offset at which the ciphertext starts.
-#[derive(BinRead)]
-struct EncryptedHeader {
- /// Fixed as `1c 00 00 00 00 00 00 00` in practice.
- _ignore: [u8; 8],
-
- /// File type.
- ///
- /// Read only after the 9-byte magic `ENCRYPTED` has been matched.
- #[br(magic = b"ENCRYPTED")]
- file_type: [u8; 3],
-
- /// Fixed as `15 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00` in practice.
- _ignore2: [u8; 16],
-}
-
-/// An encrypted file.
-pub struct EncryptedFile<R> {
- /// Underlying reader, left positioned at the start of the ciphertext.
- reader: R,
-
- /// Type of file, from the header.
- file_type: FileType,
-
- /// Total length of the file in bytes, including the 36-byte header. The
- /// ciphertext occupies the remaining `length - 36` bytes.
- length: u64,
-
- /// First block of ciphertext, for verifying that any password the user
- /// tries is correct.
- first_block: [u8; 16],
-
- /// Last block of ciphertext, for checking padding and determining the
- /// plaintext length.
- last_block: [u8; 16],
-}
-
-/// Type of encrypted file.
-///
-/// The type is taken from the three-byte tag that follows `ENCRYPTED` in the
-/// file header.
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum FileType {
- /// A `.sps` syntax file (header tag `SPS`).
- Syntax,
-
- /// A `.spv` viewer file (header tag `SPV`).
- Viewer,
-
- /// A `.sav` data file (header tag `SAV`).
- Data,
-}
-
-impl<R> EncryptedFile<R>
-where
-    R: Read + Seek,
-{
-    /// Opens `reader` as an encrypted file.
-    ///
-    /// This reads enough of the file to verify that it is in the expected
-    /// format and returns an error if it cannot be read or is not the expected
-    /// format.
-    ///
-    /// `reader` doesn't need to be [BufRead], and probably should not be.  The
-    /// [EncryptedReader] returned by [unlock] or [unlock_literal] will be
-    /// [BufRead].
-    ///
-    /// # Errors
-    ///
-    /// - [Error::NotEncrypted] if the `ENCRYPTED` magic is missing.
-    /// - [Error::UnknownFileType] if the 3-byte type tag is not `SAV`, `SPV`,
-    ///   or `SPS`.
-    /// - [Error::InvalidLength] if the total file length is less than 52 bytes
-    ///   or is not 36 plus a multiple of 16.
-    /// - [Error::IoError] if reading or seeking fails.
-    ///
-    /// [unlock]: Self::unlock
-    /// [unlock_literal]: Self::unlock_literal
-    pub fn new(mut reader: R) -> Result<Self, Error> {
-        let header =
-            EncryptedHeader::read_le(&mut NoSeek::new(&mut reader)).map_err(
-                |error| match error {
-                    binrw::Error::BadMagic { .. } => Error::NotEncrypted,
-                    binrw::Error::Io(error) => Error::IoError(error.kind()),
-                    _ => unreachable!(),
-                },
-            )?;
-        let file_type = match &header.file_type {
-            b"SAV" => FileType::Data,
-            b"SPV" => FileType::Viewer,
-            b"SPS" => FileType::Syntax,
-            _ => {
-                return Err(Error::UnknownFileType(
-                    header.file_type.iter().map(|b| *b as char).collect(),
-                ))
-            }
-        };
-
-        // The first ciphertext block immediately follows the header.
-        let mut first_block = [0; 16];
-        reader.read_exact(&mut first_block)?;
-
-        // Seeking to 16 bytes before the end yields the position of the last
-        // block; adding 16 back gives the total file length, header included.
-        let length = reader.seek(SeekFrom::End(-16))? + 16;
-        if length < 36 + 16 || (length - 36) % 16 != 0 {
-            // `length` already includes the header, so report it unchanged.
-            return Err(Error::InvalidLength(length));
-        }
-        let mut last_block = [0; 16];
-        reader.read_exact(&mut last_block)?;
-
-        // Leave the reader at the start of the ciphertext for the eventual
-        // `EncryptedReader`.
-        reader.seek(SeekFrom::Start(36))?;
-        Ok(Self {
-            reader,
-            file_type,
-            length,
-            first_block,
-            last_block,
-        })
-    }
-
-    /// Tries to unlock the encrypted file using both `password` and with
-    /// `password` decoded with [EncodedPassword::decode].  If successful,
-    /// returns an [EncryptedReader] for the file; on failure, returns the
-    /// [EncryptedFile] again for another try.
-    pub fn unlock(self, password: &[u8]) -> Result<EncryptedReader<R>, Self> {
-        self.unlock_literal(password).or_else(|this| {
-            match EncodedPassword::from_encoded(password) {
-                Some(encoded) => this.unlock_literal(&encoded.decode()),
-                None => Err(this),
-            }
-        })
-    }
-
-    /// Tries to unlock the encrypted file using just `password`.  If
-    /// successful, returns an [EncryptedReader] for the file; on failure,
-    /// returns the [EncryptedFile] again for another try.
-    ///
-    /// Only the first 10 bytes of `password` are used.  The password is
-    /// accepted if the first ciphertext block decrypts to one of a few known
-    /// file signatures and the last block decrypts to valid padding.
-    ///
-    /// If the password itself might be encoded ("encrypted"), instead use
-    /// [Self::unlock] to try it both ways.
-    pub fn unlock_literal(self, password: &[u8]) -> Result<EncryptedReader<R>, Self> {
-        // NIST SP 800-108 fixed data.
-        #[rustfmt::skip]
-        static FIXED: &[u8] = &[
-            // i
-            0x00, 0x00, 0x00, 0x01,
-
-            // label
-            0x35, 0x27, 0x13, 0xcc, 0x53, 0xa7, 0x78, 0x89,
-            0x87, 0x53, 0x22, 0x11, 0xd6, 0x5b, 0x31, 0x58,
-            0xdc, 0xfe, 0x2e, 0x7e, 0x94, 0xda, 0x2f, 0x00,
-            0xcc, 0x15, 0x71, 0x80, 0x0a, 0x6c, 0x63, 0x53,
-
-            // delimiter
-            0x00,
-
-            // context
-            0x38, 0xc3, 0x38, 0xac, 0x22, 0xf3, 0x63, 0x62,
-            0x0e, 0xce, 0x85, 0x3f, 0xb8, 0x07, 0x4c, 0x4e,
-            0x2b, 0x77, 0xc7, 0x21, 0xf5, 0x1a, 0x80, 0x1d,
-            0x67, 0xfb, 0xe1, 0xe1, 0x83, 0x07, 0xd8, 0x0d,
-
-            // L
-            0x00, 0x00, 0x01, 0x00,
-        ];
-
-        // Truncate password to at most 10 bytes.
-        let password = password.get(..10).unwrap_or(password);
-        let n = password.len();
-
-        // padded_password = password padded with zeros to 32 bytes.
-        let mut padded_password = [0; 32];
-        padded_password[..n].copy_from_slice(password);
-
-        // cmac = CMAC(padded_password, fixed).
-        let mut cmac = <Cmac<Aes256> as Mac>::new_from_slice(&padded_password).unwrap();
-        cmac.update(FIXED);
-        let cmac = cmac.finalize().into_bytes();
-
-        // The key is the cmac repeated twice.
-        let mut key = [0; 32];
-        key[..16].copy_from_slice(cmac.as_slice());
-        key[16..].copy_from_slice(cmac.as_slice());
-
-        // Use key to initialize AES.
-        let aes = <Aes256Dec as KeyInit>::new_from_slice(&key).unwrap();
-
-        // Decrypt first block to verify password.
-        let mut out = [0; 16];
-        aes.decrypt_block_b2b(
-            GenericArray::from_slice(&self.first_block),
-            GenericArray::from_mut_slice(&mut out),
-        );
-        static MAGIC: &[&[u8]] = &[
-            b"$FL2@(#)",
-            b"$FL3@(#)",
-            b"* Encoding",
-            b"PK\x03\x04\x14\0\x08",
-        ];
-        if !MAGIC.iter().any(|magic| out.starts_with(magic)) {
-            return Err(self);
-        }
-
-        // Decrypt last block to check padding and get final length.
-        aes.decrypt_block_b2b(
-            GenericArray::from_slice(&self.last_block),
-            GenericArray::from_mut_slice(&mut out),
-        );
-        let Some(padding_length) = parse_padding(&out) else {
-            return Err(self);
-        };
-
-        // Plaintext length = file length - header - padding.
-        Ok(EncryptedReader::new(
-            self.reader,
-            aes,
-            self.file_type,
-            self.length - 36 - padding_length as u64,
-        ))
-    }
-
-    /// Returns the type of encrypted file.
-    pub fn file_type(&self) -> FileType {
-        self.file_type
-    }
-}
-
-/// Validates the padding at the end of a decrypted `block`.
-///
-/// The last byte gives the padding length, which must be between 1 and 16,
-/// and every one of that many trailing bytes must hold the same value.
-/// Returns the padding length, or `None` if the padding is malformed.
-fn parse_padding(block: &[u8; 16]) -> Option<usize> {
-    let marker = block[15];
-    let count = marker as usize;
-    if count == 0 || count > 16 {
-        return None;
-    }
-    let (_, padding) = block.split_at(16 - count);
-    padding.iter().all(|&byte| byte == marker).then_some(count)
-}
-
-impl<R> Debug for EncryptedFile<R>
-where
-    R: Read,
-{
-    /// Formats as `EncryptedFile(<file type>)`; the reader and the cached
-    /// ciphertext blocks are not shown.
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let file_type = &self.file_type;
-        write!(f, "EncryptedFile({file_type:?})")
-    }
-}
-
-/// Encrypted file reader.
-///
-/// This implements [Read] and [Seek] for SPSS encrypted files.  To construct an
-/// [EncryptedReader], call [EncryptedFile::new], then [EncryptedFile::unlock].
-pub struct EncryptedReader<R> {
-    /// Underlying reader.
-    reader: R,
-
-    /// AES-256 decryption key.
-    aes: Aes256Dec,
-
-    /// Type of file.
-    file_type: FileType,
-
-    /// Plaintext file length (not including the file header or padding).
-    length: u64,
-
-    /// Plaintext data buffer.
-    buffer: Box<[u8; 4096]>,
-
-    /// Plaintext offset of the byte in `buffer[0]`.  Always a multiple of 16.
-    start: u64,
-
-    /// Number of valid plaintext bytes in `buffer` (`0 <= head <= 4096`).
-    head: usize,
-
-    /// Offset in `buffer` of the next byte to read.  Buffered data is
-    /// available only while `tail < head`; when `tail >= head` the buffer is
-    /// exhausted and must be refilled before more data can be returned.
-    tail: usize,
-}
-
-impl<R> EncryptedReader<R> {
-    /// Builds a reader over `reader` with an empty buffer positioned at
-    /// plaintext offset 0.  `length` is the plaintext length.
-    fn new(reader: R, aes: Aes256Dec, file_type: FileType, length: u64) -> Self {
-        let buffer = Box::new([0; 4096]);
-        Self {
-            reader,
-            aes,
-            file_type,
-            length,
-            buffer,
-            start: 0,
-            head: 0,
-            tail: 0,
-        }
-    }
-
-    /// Copies as many already-decrypted bytes as fit into `buf`, advances past
-    /// them, and returns how many were copied.
-    fn read_buffer(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
-        let pending = self.head - self.tail;
-        let count = buf.len().min(pending);
-        let end = count + self.tail;
-        buf[..count].copy_from_slice(&self.buffer[self.tail..end]);
-        self.tail += count;
-        Ok(count)
-    }
-
-    /// Returns the type of encrypted file.
-    pub fn file_type(&self) -> FileType {
-        self.file_type
-    }
-}
-
-impl<R> EncryptedReader<R>
-where
-    R: Read,
-{
-    /// Refills the plaintext buffer so that the next byte read is the one at
-    /// plaintext `offset`.
-    ///
-    /// The underlying reader must already be positioned at the ciphertext
-    /// block containing `offset`.  Whole 16-byte blocks are read and
-    /// decrypted, but only bytes that lie within the plaintext length are made
-    /// available.  If `offset` lies at or beyond the end of the plaintext, the
-    /// buffer is simply left without any readable data.
-    fn fill_buffer(&mut self, offset: u64) -> Result<(), IoError> {
-        self.start = offset / 16 * 16;
-        self.head = 0;
-        self.tail = (offset % 16) as usize;
-
-        // `start` can exceed `length` after a seek past the end, so saturate
-        // instead of underflowing.  Take the minimum in `u64` before narrowing
-        // so that the cast cannot truncate.
-        let remaining = self.length.saturating_sub(self.start);
-        let n = (self.buffer.len() as u64).min(remaining) as usize;
-
-        // The ciphertext is padded to a whole number of blocks.
-        let padded = n.next_multiple_of(16);
-        self.reader.read_exact(&mut self.buffer[..padded])?;
-        for block in self.buffer[..padded].chunks_exact_mut(16) {
-            self.aes.decrypt_block(GenericArray::from_mut_slice(block));
-        }
-        self.head = n;
-        Ok(())
-    }
-}
-
-impl<R> Read for EncryptedReader<R>
-where
-    R: Read,
-{
-    /// Reads decrypted bytes into `buf`, refilling the internal buffer from
-    /// the underlying reader when it has been used up.  Returns `Ok(0)` once
-    /// the end of the plaintext has been reached.
-    fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
-        if self.tail >= self.head {
-            // Buffer exhausted: continue with the plaintext that follows it.
-            let next = self.start + self.head as u64;
-            if next >= self.length {
-                return Ok(0);
-            }
-            self.fill_buffer(next)?;
-        }
-        self.read_buffer(buf)
-    }
-}
-
-impl<R> Seek for EncryptedReader<R>
-where
-    R: Read + Seek,
-{
-    /// Moves the plaintext read position to `pos` and returns the new
-    /// position.
-    ///
-    /// Positions are plaintext offsets.  A target that overflows, is negative,
-    /// or is not below `u64::MAX - 36` is rejected with
-    /// [ErrorKind::InvalidInput].  Seeking to the current position does
-    /// nothing; any other seek repositions the underlying reader and refills
-    /// the buffer.
-    fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
-        let current = self.start + self.tail as u64;
-        let requested = match pos {
-            SeekFrom::Start(absolute) => Some(absolute),
-            SeekFrom::End(delta) => self.length.checked_add_signed(delta),
-            SeekFrom::Current(delta) => current.checked_add_signed(delta),
-        };
-        let offset = match requested {
-            Some(offset) if offset < u64::MAX - 36 => offset,
-            _ => return Err(IoError::from(ErrorKind::InvalidInput)),
-        };
-        if offset != current {
-            // Ciphertext blocks start 36 bytes into the file.
-            let block_start = offset / 16 * 16;
-            self.reader.seek(SeekFrom::Start(block_start + 36))?;
-            self.fill_buffer(offset)?;
-        }
-        Ok(offset)
-    }
-}
-
-impl<R> BufRead for EncryptedReader<R>
-where
-    R: Read + Seek,
-{
-    /// Returns the decrypted bytes that are buffered and not yet consumed,
-    /// refilling the buffer first if it is exhausted and more plaintext
-    /// remains.  Returns an empty slice at the end of the plaintext.
-    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
-        if self.tail >= self.head {
-            let offset = self.start + self.head as u64;
-            if offset < self.length {
-                self.fill_buffer(offset)?;
-            }
-        }
-        // After a seek beyond the buffered data, `tail` can be greater than
-        // `head`; clamp the slice start so that this yields an empty slice
-        // rather than panicking.
-        let begin = self.tail.min(self.head);
-        Ok(&self.buffer[begin..self.head])
-    }
-
-    /// Marks `amount` buffered bytes as consumed.  `amount` must not exceed
-    /// the length of the slice most recently returned by `fill_buf`.
-    fn consume(&mut self, amount: usize) {
-        self.tail += amount;
-        debug_assert!(self.tail <= self.head);
-    }
-}
-
-/// Returns a `u16` with only bit number `x` set.
-const fn b(x: i32) -> u16 {
-    1u16 << x
-}
-
-// Password encoding tables.
-//
-// Each table has four rows of the form `[encoded, plain]`, where both
-// elements are bit-sets over nibble values: an encoded nibble whose bit is set
-// in `encoded` may stand for any plaintext nibble whose bit is set in `plain`.
-// The `A*` tables apply to the first character of an encoded pair and the `B*`
-// tables to the second; `*H` tables apply to high nibbles and `*L` tables to
-// low nibbles.
-
-/// High nibble of the first character of a pair.
-static AH: [[u16; 2]; 4] = [
-    [b(2), b(2) | b(3) | b(6) | b(7)],
-    [b(3), b(0) | b(1) | b(4) | b(5)],
-    [b(4) | b(7), b(8) | b(9) | b(12) | b(13)],
-    [b(5) | b(6), b(10) | b(11) | b(14) | b(15)],
-];
-
-/// Low nibble of the first character of a pair.
-static AL: [[u16; 2]; 4] = [
-    [b(0) | b(3) | b(12) | b(15), b(0) | b(1) | b(4) | b(5)],
-    [b(1) | b(2) | b(13) | b(14), b(2) | b(3) | b(6) | b(7)],
-    [b(4) | b(7) | b(8) | b(11), b(8) | b(9) | b(12) | b(13)],
-    [b(5) | b(6) | b(9) | b(10), b(10) | b(11) | b(14) | b(15)],
-];
-
-/// High nibble of the second character of a pair.
-static BH: [[u16; 2]; 4] = [
-    [b(2), b(1) | b(3) | b(9) | b(11)],
-    [b(3), b(0) | b(2) | b(8) | b(10)],
-    [b(4) | b(7), b(4) | b(6) | b(12) | b(14)],
-    [b(5) | b(6), b(5) | b(7) | b(13) | b(15)],
-];
-
-/// Low nibble of the second character of a pair.
-static BL: [[u16; 2]; 4] = [
-    [b(0) | b(3) | b(12) | b(15), b(0) | b(2) | b(8) | b(10)],
-    [b(1) | b(2) | b(13) | b(14), b(1) | b(3) | b(9) | b(11)],
-    [b(4) | b(7) | b(8) | b(11), b(4) | b(6) | b(12) | b(14)],
-    [b(5) | b(6) | b(9) | b(10), b(5) | b(7) | b(13) | b(15)],
-];
-
-/// Looks up an encoded `nibble` in `table`.
-///
-/// Returns the second element of the first row whose first element has bit
-/// `nibble` set, or 0 if no row matches.
-fn decode_nibble(table: &[[u16; 2]; 4], nibble: u8) -> u16 {
-    table
-        .iter()
-        .find(|row| row[0] & (1 << nibble) != 0)
-        .map_or(0, |row| row[1])
-}
-
-/// If exactly one bit is set in `x`, returns that bit's index; otherwise
-/// returns `None`.
-fn find_1bit(x: u16) -> Option<u8> {
-    if x.count_ones() == 1 {
-        Some(x.trailing_zeros() as u8)
-    } else {
-        None
-    }
-}
-
-/// Decodes the encoded character pair `a`, `b` into one plaintext byte.
-///
-/// Each nibble of each character narrows the plaintext nibble down to a set
-/// of candidates; the two characters together must leave exactly one candidate
-/// for the high nibble and one for the low nibble, otherwise `None` is
-/// returned.
-fn decode_pair(a: u8, b: u8) -> Option<u8> {
-    let high_candidates = decode_nibble(&AH, a >> 4) & decode_nibble(&BH, b >> 4);
-    let high = find_1bit(high_candidates)?;
-    let low_candidates = decode_nibble(&AL, a & 15) & decode_nibble(&BL, b & 15);
-    let low = find_1bit(low_candidates)?;
-    Some((high << 4) | low)
-}
-
-/// Returns, in ascending order, every encoded nibble that `table` allows for
-/// the plaintext `nibble`.
-///
-/// # Panics
-///
-/// Panics if no row of `table` covers `nibble`.
-fn encode_nibble(table: &[[u16; 2]; 4], nibble: u8) -> Vec<u8> {
-    let Some(row) = table.iter().find(|row| row[1] & (1 << nibble) != 0) else {
-        unreachable!()
-    };
-    let encoded = row[0];
-    (0..16u8).filter(|bit| encoded & (1 << *bit) != 0).collect()
-}
-
-/// Returns every character that can encode the plaintext `byte` under the
-/// given high-nibble and low-nibble tables.
-///
-/// All combinations of the allowed high and low encoded nibbles are produced,
-/// high nibble varying slowest, except that character code 127 is left out.
-fn encode_byte(hi_table: &[[u16; 2]; 4], lo_table: &[[u16; 2]; 4], byte: u8) -> Vec<char> {
-    let highs = encode_nibble(hi_table, byte >> 4);
-    let lows = encode_nibble(lo_table, byte & 15);
-    let mut variants = Vec::with_capacity(highs.len() * lows.len());
-    variants.extend(
-        highs
-            .iter()
-            .flat_map(|&hi| lows.iter().map(move |&lo| (hi << 4) | lo))
-            .filter(|&code| code != 127)
-            .map(|code| code as char),
-    );
-    variants
-}
-
-/// An encoded password.
-///
-/// SPSS calls these "encrypted passwords", but they are not encrypted.  They
-/// are encoded with a simple scheme, analogous to base64 encoding but
-/// one-to-many: any plaintext password maps to many possible encoded passwords.
-///
-/// The encoding scheme maps each plaintext password byte to 2 ASCII characters,
-/// using only at most the first 10 bytes of the plaintext password.  Thus, an
-/// encoded password is always a multiple of 2 characters long, and never longer
-/// than 20 characters.  The characters produced by encoding are always in the
-/// ASCII range 32 (space) through 126 (`~`); when parsing an already-encoded
-/// password, bytes 32 through 127 are accepted.  Each successive pair of
-/// characters in the password encodes a single byte in the plaintext password.
-///
-/// This struct supports both encoding and decoding passwords.
-///
-/// Internally, each element holds the alternative characters allowed at one
-/// position of the encoded password.
-#[derive(Clone, Debug)]
-pub struct EncodedPassword(Vec<Vec<char>>);
-
-impl EncodedPassword {
-    /// Creates an [EncodedPassword] from an already-encoded password `encoded`.
-    ///
-    /// Returns `None` if `encoded` is not a valid encoded password: that is,
-    /// if it is longer than 20 bytes, has odd length, or contains a byte
-    /// outside the range 32 through 127.
-    pub fn from_encoded(encoded: &[u8]) -> Option<Self> {
-        if encoded.len() > 20
-            || encoded.len() % 2 != 0
-            || !encoded.iter().all(|byte| (32..=127).contains(byte))
-        {
-            return None;
-        }
-
-        // Exactly one alternative per position.
-        Some(EncodedPassword(
-            encoded.iter().map(|byte| vec![*byte as char]).collect(),
-        ))
-    }
-
-    /// Returns an [EncodedPassword] as an encoded version of the given
-    /// `plaintext` password.  Only the first 10 bytes, at most, of the
-    /// plaintext password is used.
-    pub fn from_plaintext(plaintext: &[u8]) -> EncodedPassword {
-        let input = plaintext.get(..10).unwrap_or(plaintext);
-        EncodedPassword(
-            input
-                .iter()
-                .copied()
-                // Each plaintext byte yields two positions: the first uses the
-                // `A` tables and the second uses the `B` tables.
-                .flat_map(|byte| [encode_byte(&AH, &AL, byte), encode_byte(&BH, &BL, byte)])
-                .collect(),
-        )
-    }
-
-    /// Returns the number of variations of this encoded password.
-    ///
-    /// An [EncodedPassword] created by [EncodedPassword::from_plaintext] has
-    /// many variations.  Each of the two characters that encode a plaintext
-    /// byte has between 4 and 8 alternatives, so an `n`-byte plaintext
-    /// password has between `16**n` and `64**n` variations: up to `64**10`
-    /// (about 1e18) for the 10-byte longest plaintext passwords, which still
-    /// fits in a `u64`.
-    ///
-    /// An [EncodedPassword] created by [EncodedPassword::from_encoded] has only
-    /// a single variation, the one passed in by that function.
-    pub fn n_variants(&self) -> u64 {
-        self.0
-            .iter()
-            .map(|variants| variants.len() as u64)
-            .product()
-    }
-
-    /// Returns one variation of this encoded password, numbered `index`.  All
-    /// variations decode the same way.
-    ///
-    /// `index` is interpreted as a mixed-radix number whose digits select the
-    /// alternative used at each position, least-significant digit first.
-    pub fn variant(&self, mut index: u64) -> String {
-        let mut output = String::with_capacity(20);
-        for variants in &self.0 {
-            let n = variants.len() as u64;
-            output.push(variants[(index % n) as usize]);
-            index /= n;
-        }
-        output
-    }
-
-    /// Returns the decoded version of this encoded password.
-    ///
-    /// Only the first alternative at each position is consulted, since all
-    /// alternatives decode the same way.
-    pub fn decode(&self) -> SmallVec<[u8; 10]> {
-        let mut output = SmallVec::new();
-        for [a, b] in self.0.as_chunks::<2>().0 {
-            output.push(decode_pair(a[0] as u8, b[0] as u8).unwrap());
-        }
-        output
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::{io::Cursor, path::Path};
-
-    use crate::crypto::{EncodedPassword, EncryptedFile, FileType};
-
-    /// Decrypts test data file `input_name` with `password`, checking that it
-    /// is reported as `file_type` and that the plaintext matches the contents
-    /// of `expected_name`.
-    fn test_decrypt(input_name: &Path, expected_name: &Path, password: &str, file_type: FileType) {
-        let ciphertext = std::fs::read(Path::new("src/crypto/testdata").join(input_name)).unwrap();
-        let mut cursor = Cursor::new(&ciphertext);
-
-        let file = EncryptedFile::new(&mut cursor).unwrap();
-        assert_eq!(file.file_type(), file_type);
-
-        let mut reader = file.unlock_literal(password.as_bytes()).unwrap();
-        assert_eq!(reader.file_type(), file_type);
-
-        let mut plaintext = Vec::new();
-        std::io::copy(&mut reader, &mut plaintext).unwrap();
-
-        let expected = std::fs::read(Path::new("src/crypto/testdata").join(expected_name)).unwrap();
-        // Deliberately not `assert_eq!`, which would dump both buffers.
-        if plaintext != expected {
-            panic!();
-        }
-    }
-
-    #[test]
-    fn sys_file() {
-        let (input, expected) = (Path::new("test-encrypted.sav"), Path::new("test.sav"));
-        test_decrypt(input, expected, "pspp", FileType::Data);
-    }
-
-    #[test]
-    fn syntax_file() {
-        let (input, expected) = (Path::new("test-encrypted.sps"), Path::new("test.sps"));
-        test_decrypt(input, expected, "password", FileType::Syntax);
-    }
-
-    #[test]
-    fn spv_file() {
-        let (input, expected) = (Path::new("test-encrypted.spv"), Path::new("test.spv"));
-        test_decrypt(input, expected, "Password1", FileType::Viewer);
-    }
-
-    #[test]
-    fn password_encoding() {
-        // Decode a few specific passwords.
-        let decoded = EncodedPassword::from_encoded(b"-|").unwrap().decode();
-        assert_eq!(decoded.as_slice(), b"b");
-        let decoded = EncodedPassword::from_encoded(b" A").unwrap().decode();
-        assert_eq!(decoded.as_slice(), b"a");
-
-        // Check that the encoding and decoding algorithms are inverses
-        // for individual characters at least.
-        for plaintext in 0..=255 {
-            let encoded = EncodedPassword::from_plaintext(&[plaintext]);
-            for index in 0..encoded.n_variants() {
-                let text = encoded.variant(index);
-                let decoded = EncodedPassword::from_encoded(text.as_bytes())
-                    .unwrap()
-                    .decode();
-                assert_eq!(&[plaintext], decoded.as_slice());
-            }
-        }
-    }
-}
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+use std::{
+ fmt::{Debug, Display, Formatter, Result as FmtResult, Write},
+ ops::{Not, RangeInclusive},
+ str::{Chars, FromStr},
+ sync::LazyLock,
+};
+
+use chrono::{Datelike, Local};
+use enum_iterator::{all, Sequence};
+use enum_map::{Enum, EnumMap};
+use serde::{Deserialize, Serialize};
+use thiserror::Error as ThisError;
+use unicode_width::UnicodeWidthStr;
+
+use crate::{
+ data::{ByteString, Datum},
+ sys::raw,
+ util::ToSmallString,
+ variable::{VarType, VarWidth},
+};
+
+mod display;
+mod parse;
+pub use display::{DisplayDatum, DisplayPlain, DisplayPlainF64};
+
+/// Reasons that a format specification is invalid or does not suit a
+/// variable.
+#[derive(Clone, ThisError, Debug, PartialEq, Eq)]
+pub enum Error {
+    /// A numeric format type code does not correspond to any [Type].
+    #[error("Unknown format type {value}.")]
+    UnknownFormat { value: u16 },
+
+    /// The width is odd, but the format type requires an even width.
+    #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)]
+    OddWidthNotAllowed(UncheckedFormat),
+
+    /// The width is outside the range that the format type allows.
+    #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())]
+    BadWidth(UncheckedFormat),
+
+    /// Decimal places were given for a format type that never has any.
+    #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)]
+    DecimalsNotAllowedForFormat(UncheckedFormat),
+
+    /// Decimal places were given, but the width is too narrow to allow any.
+    #[error("Output format {0} specifies {} decimal places, but with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)]
+    DecimalsNotAllowedForWidth(UncheckedFormat),
+
+    /// More decimal places were given than the width allows.
+    #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)]
+    TooManyDecimalsForWidth {
+        /// The offending specification.
+        spec: UncheckedFormat,
+        /// The most decimal places allowed at that width.
+        max_d: Decimals,
+    },
+
+    /// A numeric format was applied to a string variable.
+    #[error("String variable is not compatible with numeric format {0}.")]
+    UnnamedVariableNotCompatibleWithNumericFormat(Type),
+
+    /// A string format was applied to a numeric variable.
+    #[error("Numeric variable is not compatible with string format {0}.")]
+    UnnamedVariableNotCompatibleWithStringFormat(Type),
+
+    /// The width of a string format does not match the width of the named
+    /// string variable.
+    #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
+    NamedStringVariableBadSpecWidth {
+        /// Name of the variable.
+        variable: String,
+        /// Width of the variable.
+        width: Width,
+        /// The format that was rejected.
+        bad_spec: Format,
+        /// A format of the same type that would fit.
+        good_spec: Format,
+    },
+
+    /// The width of a string format does not match the width of a string
+    /// variable whose name is not reported.
+    #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
+    UnnamedStringVariableBadSpecWidth {
+        /// Width of the variable.
+        width: Width,
+        /// The format that was rejected.
+        bad_spec: Format,
+        /// A format of the same type that would fit.
+        good_spec: Format,
+    },
+}
+
+/// Broad grouping of format [Type]s.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum Category {
+    // Numeric formats.
+    /// `F`, `COMMA`, `DOT`, `DOLLAR`, `PCT`, and `E`.
+    Basic,
+    /// The custom currency formats.
+    Custom,
+    /// `N` and `Z`.
+    Legacy,
+    /// `P`, `PK`, `IB`, `PIB`, and `RB`.
+    Binary,
+    /// `PIBHEX` and `RBHEX`.
+    Hex,
+    /// Formats for dates and for dates with times.
+    Date,
+    /// `MTIME`, `TIME`, and `DTIME`.
+    Time,
+    /// `WKDAY` and `MONTH`.
+    DateComponent,
+
+    // String formats.
+    /// `A` and `AHEX`.
+    String,
+}
+
+impl From<Type> for Category {
+    /// Returns the category that format type `type_` belongs to.
+    fn from(type_: Type) -> Self {
+        match type_ {
+            Type::A | Type::AHex => Category::String,
+            Type::WkDay | Type::Month => Category::DateComponent,
+            Type::MTime | Type::Time | Type::DTime => Category::Time,
+            Type::Date
+            | Type::ADate
+            | Type::EDate
+            | Type::JDate
+            | Type::SDate
+            | Type::QYr
+            | Type::MoYr
+            | Type::WkYr
+            | Type::DateTime
+            | Type::YmdHms => Category::Date,
+            Type::PIBHex | Type::RBHex => Category::Hex,
+            Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Category::Binary,
+            Type::N | Type::Z => Category::Legacy,
+            Type::CC(_) => Category::Custom,
+            Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => {
+                Category::Basic
+            }
+        }
+    }
+}
+
+/// Identifies one of the five custom currency formats, `CCA` through `CCE`.
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash, Sequence, Serialize)]
+pub enum CC {
+    /// `CCA`.
+    A,
+    /// `CCB`.
+    B,
+    /// `CCC`.
+    C,
+    /// `CCD`.
+    D,
+    /// `CCE`.
+    E,
+}
+
+impl CC {
+    /// Returns the single-letter name of this custom currency format: `"A"`
+    /// for [CC::A], and so on.
+    pub fn as_string(&self) -> &'static str {
+        match *self {
+            Self::A => "A",
+            Self::B => "B",
+            Self::C => "C",
+            Self::D => "D",
+            Self::E => "E",
+        }
+    }
+}
+
+impl Display for CC {
+    /// Writes the single-letter name returned by [CC::as_string].
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        f.write_str(self.as_string())
+    }
+}
+
+/// A format type, that is, the part of a format specification other than its
+/// width and decimals.
+///
+/// [Type::as_str] gives the name used for each type in syntax.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Sequence, Serialize)]
+pub enum Type {
+    // Basic numeric formats.
+    F,
+    Comma,
+    Dot,
+    Dollar,
+    Pct,
+    E,
+
+    // Custom currency formats.
+    /// One of `CCA` through `CCE`, selected by the [CC] value.
+    CC(CC),
+
+    // Legacy numeric formats.
+    N,
+    Z,
+
+    // Binary and hexadecimal formats.
+    P,
+    PK,
+    IB,
+    PIB,
+    PIBHex,
+    RB,
+    RBHex,
+
+    // Time and date formats.
+    Date,
+    ADate,
+    EDate,
+    JDate,
+    SDate,
+    QYr,
+    MoYr,
+    WkYr,
+    DateTime,
+    YmdHms,
+    MTime,
+    Time,
+    DTime,
+
+    // Date component formats.
+    WkDay,
+    Month,
+
+    // String formats.
+    A,
+    AHex,
+}
+
+/// Width of a format.
+pub type Width = u16;
+
+/// Signed counterpart of [Width], for intermediate arithmetic that can go
+/// negative.
+pub type SignedWidth = i16;
+
+/// Number of decimal places in a format.
+pub type Decimals = u8;
+
+impl Type {
+    /// Returns the largest width allowed for this format type.
+    pub fn max_width(self) -> Width {
+        match self {
+            Self::A => 32767,
+            Self::AHex => 32767 * 2,
+            Self::IB | Self::PIB | Self::RB => 8,
+            Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16,
+            _ => 40,
+        }
+    }
+
+    /// Returns the smallest width allowed for this format type.
+    pub fn min_width(self) -> Width {
+        match self {
+            Self::F
+            | Self::Comma
+            | Self::Dot
+            | Self::N
+            | Self::Z
+            | Self::P
+            | Self::PK
+            | Self::IB
+            | Self::PIB
+            | Self::A => 1,
+            Self::Dollar
+            | Self::Pct
+            | Self::CC(_)
+            | Self::PIBHex
+            | Self::RB
+            | Self::WkDay
+            | Self::AHex => 2,
+            Self::Month => 3,
+            Self::RBHex => 4,
+            Self::JDate | Self::MTime | Self::Time => 5,
+            Self::E | Self::QYr | Self::MoYr => 6,
+            Self::ADate | Self::EDate | Self::SDate | Self::WkYr | Self::DTime => 8,
+            Self::Date => 9,
+            Self::YmdHms => 16,
+            Self::DateTime => 17,
+        }
+    }
+
+    /// Returns the range of widths allowed for this format type.
+    pub fn width_range(self) -> RangeInclusive<Width> {
+        let lowest = self.min_width();
+        let highest = self.max_width();
+        lowest..=highest
+    }
+
+    /// Returns the largest number of decimal places that this format type
+    /// allows at the given `width`.
+    ///
+    /// `width` is clamped to the range 1 to 40 before use, and the result is
+    /// never more than 16.
+    pub fn max_decimals(self, width: Width) -> Decimals {
+        let w = width.clamp(1, 40) as SignedWidth;
+        let limit: SignedWidth = match self {
+            Self::F | Self::Comma | Self::Dot | Self::CC(_) => w - 1,
+            Self::Dollar | Self::Pct => w - 2,
+            Self::E => w - 7,
+            Self::N | Self::Z => w,
+            Self::P => w * 2 - 1,
+            Self::PK => w * 2,
+            Self::IB | Self::PIB => max_digits_for_bytes(w as usize) as SignedWidth,
+            Self::RB | Self::RBHex => 16,
+            Self::DateTime => w - 21,
+            Self::YmdHms => w - 20,
+            Self::MTime => w - 6,
+            Self::Time => w - 9,
+            Self::DTime => w - 12,
+            Self::PIBHex
+            | Self::Date
+            | Self::ADate
+            | Self::EDate
+            | Self::JDate
+            | Self::SDate
+            | Self::QYr
+            | Self::MoYr
+            | Self::WkYr
+            | Self::WkDay
+            | Self::Month
+            | Self::A
+            | Self::AHex => 0,
+        };
+        limit.clamp(0, 16) as Decimals
+    }
+
+    /// Returns true if this format type allows decimal places at some width.
+    pub fn takes_decimals(self) -> bool {
+        self.max_decimals(Width::MAX) != 0
+    }
+
+    /// Returns the [Category] that this format type belongs to.
+    pub fn category(self) -> Category {
+        Category::from(self)
+    }
+
+    /// Returns the number that widths for this format type must be a multiple
+    /// of: 2 for the hexadecimal formats (including `AHEX`), otherwise 1.
+    pub fn width_step(self) -> Width {
+        match (self.category(), self) {
+            (Category::Hex, _) | (_, Self::AHex) => 2,
+            _ => 1,
+        }
+    }
+
+    /// Forces `width` into this format type's allowed range, then rounds it
+    /// down to a multiple of [Self::width_step].
+    pub fn clamp_width(self, width: Width) -> Width {
+        let clamped = width.clamp(self.min_width(), self.max_width());
+        match self.width_step() {
+            2 => clamped - clamped % 2,
+            _ => clamped,
+        }
+    }
+
+    /// Returns the kind of variable that this format type applies to: string
+    /// for `A` and `AHEX`, numeric for everything else.
+    pub fn var_type(self) -> VarType {
+        if matches!(self, Self::A | Self::AHex) {
+            VarType::String
+        } else {
+            VarType::Numeric
+        }
+    }
+
+    /// Checks whether this format is valid for a variable with the given
+    /// `var_type`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a numeric format is paired with a string variable
+    /// or a string format with a numeric variable.
+    pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> {
+        let mismatch = match (self.var_type(), var_type) {
+            (VarType::Numeric, VarType::String) => {
+                Some(Error::UnnamedVariableNotCompatibleWithNumericFormat(self))
+            }
+            (VarType::String, VarType::Numeric) => {
+                Some(Error::UnnamedVariableNotCompatibleWithStringFormat(self))
+            }
+            _ => None,
+        };
+        match mismatch {
+            Some(error) => Err(error),
+            None => Ok(()),
+        }
+    }
+
+    /// Returns the upper-case name of this format type.
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            // Basic numeric formats.
+            Self::F => "F",
+            Self::Comma => "COMMA",
+            Self::Dot => "DOT",
+            Self::Dollar => "DOLLAR",
+            Self::Pct => "PCT",
+            Self::E => "E",
+
+            // Custom currency formats.
+            Self::CC(cc) => match cc {
+                CC::A => "CCA",
+                CC::B => "CCB",
+                CC::C => "CCC",
+                CC::D => "CCD",
+                CC::E => "CCE",
+            },
+
+            // Legacy numeric formats.
+            Self::N => "N",
+            Self::Z => "Z",
+
+            // Binary and hexadecimal formats.
+            Self::P => "P",
+            Self::PK => "PK",
+            Self::IB => "IB",
+            Self::PIB => "PIB",
+            Self::PIBHex => "PIBHEX",
+            Self::RB => "RB",
+            Self::RBHex => "RBHEX",
+
+            // Time and date formats.
+            Self::Date => "DATE",
+            Self::ADate => "ADATE",
+            Self::EDate => "EDATE",
+            Self::JDate => "JDATE",
+            Self::SDate => "SDATE",
+            Self::QYr => "QYR",
+            Self::MoYr => "MOYR",
+            Self::WkYr => "WKYR",
+            Self::DateTime => "DATETIME",
+            Self::YmdHms => "YMDHMS",
+            Self::MTime => "MTIME",
+            Self::Time => "TIME",
+            Self::DTime => "DTIME",
+
+            // Date component formats.
+            Self::WkDay => "WKDAY",
+            Self::Month => "MONTH",
+
+            // String formats.
+            Self::A => "A",
+            Self::AHex => "AHEX",
+        }
+    }
+
+    /// Returns a default value for a variable of this format's type: the
+    /// system-missing value for numeric formats, or an empty string for string
+    /// formats.
+    pub fn default_value(&self) -> Datum<ByteString> {
+        let var_type = self.var_type();
+        match var_type {
+            VarType::String => Datum::String(ByteString::default()),
+            VarType::Numeric => Datum::sysmis(),
+        }
+    }
+}
+
+impl Display for Type {
+    /// Writes the name returned by [Type::as_str].
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        f.write_str(self.as_str())
+    }
+}
+
+impl FromStr for Type {
+    type Err = ();
+
+    /// Parses a format type name, ignoring ASCII case.  Fails if `s` is not
+    /// the name of any format type.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        all::<Type>()
+            .find(|candidate| candidate.as_str().eq_ignore_ascii_case(s))
+            .ok_or(())
+    }
+}
+
+/// Maps a byte count to a digit limit using a fixed table for counts 0
+/// through 7; any larger count yields 20.
+fn max_digits_for_bytes(bytes: usize) -> usize {
+    const DIGITS: [usize; 8] = [0, 3, 5, 8, 10, 13, 15, 17];
+    DIGITS.get(bytes).copied().unwrap_or(20)
+}
+
+/// A format specification as written, split into a name, a width, and a
+/// number of decimals, with the name not yet matched against the known
+/// format [Type]s.
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct AbstractFormat {
+    /// Alphabetic name part of the specification.
+    pub name: String,
+    /// Width.
+    w: Width,
+    /// Number of decimals, 0 if none were written.
+    d: Decimals,
+}
+
+/// Splits `s` into its longest prefix whose characters all satisfy
+/// `predicate`, and the remainder.
+fn split<F>(s: &str, predicate: F) -> (&str, &str)
+where
+    F: Fn(&char) -> bool,
+{
+    let boundary = s.find(|c: char| !predicate(&c)).unwrap_or(s.len());
+    s.split_at(boundary)
+}
+
+impl FromStr for AbstractFormat {
+    type Err = ();
+
+    /// Parses a specification of the form `<letters><digits>` optionally
+    /// followed by `.<digits>`, such as `F8.2` or `A10`.
+    ///
+    /// Fails if the name or the width is missing, if a `.` is not followed by
+    /// digits, if a number is out of range, or if anything follows.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let (name, after_name) = split(s, char::is_ascii_alphabetic);
+        if name.is_empty() {
+            return Err(());
+        }
+
+        let (width, after_width) = split(after_name, char::is_ascii_digit);
+        let w = width.parse::<Width>().map_err(|_| ())?;
+
+        let (d, trailing) = match after_width.strip_prefix('.') {
+            Some(after_dot) => {
+                let (decimals, trailing) = split(after_dot, char::is_ascii_digit);
+                (decimals.parse::<Decimals>().map_err(|_| ())?, trailing)
+            }
+            None => (0, after_width),
+        };
+
+        if trailing.is_empty() {
+            Ok(Self {
+                name: name.into(),
+                w,
+                d,
+            })
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<AbstractFormat> for UncheckedFormat {
+    type Error = ();
+
+    /// Resolves the name in `value` to a [Type].  Fails if the name is not a
+    /// known format type; the width and decimals are not validated.
+    fn try_from(value: AbstractFormat) -> Result<Self, Self::Error> {
+        let AbstractFormat { name, w, d } = value;
+        let type_ = name.parse::<Type>()?;
+        Ok(UncheckedFormat::new(type_, w, d))
+    }
+}
+
+/// A format specification: a format type together with a width and a number
+/// of decimal places.
+///
+/// The fields are private; [Format::new] and `TryFrom<UncheckedFormat>`
+/// validate them before constructing a value.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct Format {
+    /// Format type.
+    type_: Type,
+    /// Width.
+    w: Width,
+    /// Number of decimal places.
+    d: Decimals,
+}
+
+impl Serialize for Format {
+    /// Serializes the format as its textual form.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let text = self.to_small_string::<16>();
+        text.serialize(serializer)
+    }
+}
+
+impl Format {
+    /// `F40.0`.
+    pub const F40: Format = Format {
+        type_: Type::F,
+        w: 40,
+        d: 0,
+    };
+
+    /// `F40.1`.
+    pub const F40_1: Format = Format {
+        type_: Type::F,
+        w: 40,
+        d: 1,
+    };
+
+    /// `F40.2`.
+    pub const F40_2: Format = Format {
+        type_: Type::F,
+        w: 40,
+        d: 2,
+    };
+
+    /// `F40.3`.
+    pub const F40_3: Format = Format {
+        type_: Type::F,
+        w: 40,
+        d: 3,
+    };
+
+    /// `PCT40.1`.
+    pub const PCT40_1: Format = Format {
+        type_: Type::Pct,
+        w: 40,
+        d: 1,
+    };
+
+    /// `F8.2`.
+    pub const F8_2: Format = Format {
+        type_: Type::F,
+        w: 8,
+        d: 2,
+    };
+
+    /// `DATETIME40.0`.
+    pub const DATETIME40_0: Format = Format {
+        type_: Type::DateTime,
+        w: 40,
+        d: 0,
+    };
+
+    /// Returns the format type.
+    pub fn type_(self) -> Type {
+        self.type_
+    }
+
+    /// Returns the width.
+    pub fn w(self) -> usize {
+        self.w as usize
+    }
+
+    /// Returns the number of decimal places.
+    pub fn d(self) -> usize {
+        self.d as usize
+    }
+
+    /// Constructs a format from its parts, or returns `None` if the
+    /// combination is not valid.
+    pub fn new(type_: Type, w: Width, d: Decimals) -> Option<Self> {
+        UncheckedFormat { type_, w, d }.try_into().ok()
+    }
+
+    /// Returns the default format for a variable of the given width: `F8.2`
+    /// for numeric variables, or `A` with the variable's width for string
+    /// variables.
+    pub fn default_for_width(var_width: VarWidth) -> Self {
+        match var_width {
+            VarWidth::Numeric => Self::F8_2,
+            VarWidth::String(w) => Format {
+                type_: Type::A,
+                w,
+                d: 0,
+            },
+        }
+    }
+
+    /// Converts `source` into a valid format by adjusting its width and
+    /// decimals as needed.
+    ///
+    /// The width is first forced into the range allowed for the format type
+    /// and rounded down to a multiple of the type's width step.  Then, if the
+    /// requested number of decimals is achievable at some width, the width is
+    /// increased until it is.  Finally the decimals are reduced to what the
+    /// resulting width allows.
+    pub fn fixed_from(source: &UncheckedFormat) -> Self {
+        let UncheckedFormat {
+            type_: format,
+            w,
+            d,
+        } = *source;
+
+        // `clamp_width` also rounds down to the width step, so that an odd
+        // width for a hexadecimal format cannot slip through.
+        let mut w = format.clamp_width(w);
+        if d <= format.max_decimals(Width::MAX) {
+            while d > format.max_decimals(w) {
+                w += 1;
+                assert!(w <= 40);
+            }
+        }
+        let d = d.clamp(0, format.max_decimals(w));
+        Self {
+            type_: format,
+            w,
+            d,
+        }
+    }
+
+    /// Returns the width of variable that this format applies to.  For `AHEX`
+    /// the variable is half as wide as the format.
+    pub fn var_width(self) -> VarWidth {
+        match self.type_ {
+            Type::A => VarWidth::String(self.w),
+            Type::AHex => VarWidth::String(self.w / 2),
+            _ => VarWidth::Numeric,
+        }
+    }
+
+    /// Returns the type of variable that this format applies to.
+    pub fn var_type(self) -> VarType {
+        self.type_.var_type()
+    }
+
+    /// Checks whether this format specification is valid for a variable with
+    /// width `var_width`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the format's type does not match the variable's
+    /// type, or if `var_width` is a string width that differs from this
+    /// format's [Self::var_width].  In the latter case the error suggests a
+    /// format of the same type that fits.
+    pub fn check_width_compatibility(self, var_width: VarWidth) -> Result<Self, Error> {
+        // Verify that the format is right for the variable's type.
+        self.type_.check_type_compatibility(var_width.into())?;
+
+        if let VarWidth::String(w) = var_width {
+            if var_width != self.var_width() {
+                let bad_spec = self;
+                let good_spec = if self.type_ == Type::A {
+                    Format { w, ..self }
+                } else {
+                    Format { w: w * 2, ..self }
+                };
+                return Err(Error::UnnamedStringVariableBadSpecWidth {
+                    width: w,
+                    bad_spec,
+                    good_spec,
+                });
+            }
+        }
+
+        Ok(self)
+    }
+
+    /// Returns a default value for a variable with this format: the
+    /// system-missing value for numeric formats, or a string of spaces as
+    /// wide as the variable for string formats.
+    pub fn default_value(&self) -> Datum<ByteString> {
+        match self.var_width() {
+            VarWidth::Numeric => Datum::sysmis(),
+            VarWidth::String(width) => Datum::String(ByteString::spaces(width as usize)),
+        }
+    }
+
+    /// Adjusts this format to suit a variable of the given `width`.
+    ///
+    /// A numeric format stays as it is for a numeric variable, and a string
+    /// format keeps its type but takes on the new width.  If the variable's
+    /// type differs from the format's, the format is replaced by the default
+    /// for `width`.
+    pub fn resize(&mut self, width: VarWidth) {
+        match (self.var_width(), width) {
+            (VarWidth::Numeric, VarWidth::Numeric) => {}
+            (VarWidth::String(_), VarWidth::String(new_width)) => {
+                self.w = if self.type_ == Type::AHex {
+                    new_width * 2
+                } else {
+                    new_width
+                };
+            }
+            _ => *self = Self::default_for_width(width),
+        }
+    }
+
+    /// Applies [VarWidth::codepage_to_unicode] to this format's variable
+    /// width and, for string formats, updates the format width to match.
+    pub fn codepage_to_unicode(&mut self) {
+        let mut width = self.var_width();
+        width.codepage_to_unicode();
+        if let Some(width) = width.as_string_width() {
+            if self.type_ == Type::AHex {
+                self.w = width as u16 * 2;
+            } else {
+                self.w = width as u16;
+            }
+        }
+    }
+}
+
+impl Debug for Format {
+    /// Formats exactly as [Display] does, for example `F8.2`.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        f.write_fmt(format_args!("{self}"))
+    }
+}
+
+impl Display for Format {
+    /// Writes the type name and width, followed by `.` and the decimals if
+    /// the format type takes decimals or the decimals are nonzero.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let Self { type_, w, d } = *self;
+        write!(f, "{type_}{w}")?;
+        if d > 0 || type_.takes_decimals() {
+            write!(f, ".{d}")
+        } else {
+            Ok(())
+        }
+    }
+}
+
+impl TryFrom<UncheckedFormat> for Format {
+    type Error = Error;
+
+    /// Validates `source`, returning it as a [Format] if its width and
+    /// decimals are acceptable for its type.
+    ///
+    /// # Errors
+    ///
+    /// - [Error::OddWidthNotAllowed] if the width is not a multiple of the
+    ///   type's width step.
+    /// - [Error::BadWidth] if the width is out of range for the type.
+    /// - [Error::DecimalsNotAllowedForFormat] if decimals are specified but
+    ///   the type never takes decimals.
+    /// - [Error::TooManyDecimalsForWidth] if the type allows some decimals at
+    ///   this width, but fewer than specified.
+    /// - [Error::DecimalsNotAllowedForWidth] if the type takes decimals in
+    ///   general but allows none at this width.
+    fn try_from(source: UncheckedFormat) -> Result<Self, Self::Error> {
+        let UncheckedFormat {
+            type_: format,
+            w,
+            d,
+        } = source;
+        let max_d = format.max_decimals(w);
+        if w % format.width_step() != 0 {
+            Err(Error::OddWidthNotAllowed(source))
+        } else if !format.width_range().contains(&w) {
+            Err(Error::BadWidth(source))
+        } else if d > max_d {
+            // Pick the most specific explanation for the excess decimals.
+            if !format.takes_decimals() {
+                Err(Error::DecimalsNotAllowedForFormat(source))
+            } else if max_d > 0 {
+                Err(Error::TooManyDecimalsForWidth {
+                    spec: source,
+                    max_d,
+                })
+            } else {
+                Err(Error::DecimalsNotAllowedForWidth(source))
+            }
+        } else {
+            Ok(Format {
+                type_: format,
+                w,
+                d,
+            })
+        }
+    }
+}
+
+impl From<Type> for u16 {
+    /// Returns the numeric code for format type `type_`.  This is the inverse
+    /// of `TryFrom<u16> for Type`.
+    fn from(type_: Type) -> Self {
+        match type_ {
+            Type::A => 1,
+            Type::AHex => 2,
+            Type::Comma => 3,
+            Type::Dollar => 4,
+            Type::F => 5,
+            Type::IB => 6,
+            Type::PIBHex => 7,
+            Type::P => 8,
+            Type::PIB => 9,
+            Type::PK => 10,
+            Type::RB => 11,
+            Type::RBHex => 12,
+            // 13 and 14 are not assigned.
+            Type::Z => 15,
+            Type::N => 16,
+            Type::E => 17,
+            // 18 and 19 are not assigned.
+            Type::Date => 20,
+            Type::Time => 21,
+            Type::DateTime => 22,
+            Type::ADate => 23,
+            Type::JDate => 24,
+            Type::DTime => 25,
+            Type::WkDay => 26,
+            Type::Month => 27,
+            Type::MoYr => 28,
+            Type::QYr => 29,
+            Type::WkYr => 30,
+            Type::Pct => 31,
+            Type::Dot => 32,
+            Type::CC(cc) => match cc {
+                CC::A => 33,
+                CC::B => 34,
+                CC::C => 35,
+                CC::D => 36,
+                CC::E => 37,
+            },
+            Type::EDate => 38,
+            Type::SDate => 39,
+            Type::MTime => 40,
+            Type::YmdHms => 41,
+        }
+    }
+}
+
+impl TryFrom<u16> for Type {
+    type Error = Error;
+
+    /// Converts a numeric format code into a [Type].
+    ///
+    /// # Errors
+    ///
+    /// Returns [Error::UnknownFormat] if `source` is not a known code.
+    fn try_from(source: u16) -> Result<Self, Self::Error> {
+        let type_ = match source {
+            1 => Self::A,
+            2 => Self::AHex,
+            3 => Self::Comma,
+            4 => Self::Dollar,
+            5 => Self::F,
+            6 => Self::IB,
+            7 => Self::PIBHex,
+            8 => Self::P,
+            9 => Self::PIB,
+            10 => Self::PK,
+            11 => Self::RB,
+            12 => Self::RBHex,
+            15 => Self::Z,
+            16 => Self::N,
+            17 => Self::E,
+            20 => Self::Date,
+            21 => Self::Time,
+            22 => Self::DateTime,
+            23 => Self::ADate,
+            24 => Self::JDate,
+            25 => Self::DTime,
+            26 => Self::WkDay,
+            27 => Self::Month,
+            28 => Self::MoYr,
+            29 => Self::QYr,
+            30 => Self::WkYr,
+            31 => Self::Pct,
+            32 => Self::Dot,
+            33 => Self::CC(CC::A),
+            34 => Self::CC(CC::B),
+            35 => Self::CC(CC::C),
+            36 => Self::CC(CC::D),
+            37 => Self::CC(CC::E),
+            38 => Self::EDate,
+            39 => Self::SDate,
+            40 => Self::MTime,
+            41 => Self::YmdHms,
+            _ => return Err(Error::UnknownFormat { value: source }),
+        };
+        Ok(type_)
+    }
+}
+
+/// A format specification (type, width, and decimals) that has not been
+/// validated.
+///
+/// Converting it to a [Format] with `TryFrom` checks the width and decimals
+/// against what the type allows.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct UncheckedFormat {
+ /// Format type.
+ pub type_: Type,
+
+ /// Width.
+ pub w: Width,
+
+ /// Number of decimal places.
+ pub d: Decimals,
+}
+
+impl UncheckedFormat {
+    /// Builds an unvalidated format from its type, width, and decimals.
+    pub fn new(type_: Type, w: Width, d: Decimals) -> Self {
+        UncheckedFormat { type_, w, d }
+    }
+
+    /// Returns the result of [Format::fixed_from] for this specification.
+    pub fn fix(&self) -> Format {
+        let unchecked: &Self = self;
+        Format::fixed_from(unchecked)
+    }
+}
+
+impl TryFrom<raw::records::RawFormat> for UncheckedFormat {
+    type Error = Error;
+
+    /// Unpacks a raw format word: the low 8 bits are the decimals, the next 8
+    /// bits are the width, and the bits from 16 upward are the type code.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the type code is not one that `TryFrom<u16> for Type` accepts.
+    fn try_from(raw: raw::records::RawFormat) -> Result<Self, Self::Error> {
+        let bits = raw.0;
+        let type_code = (bits >> 16) as u16;
+        let type_: Type = type_code.try_into()?;
+        Ok(Self {
+            type_,
+            w: ((bits >> 8) & 0xff) as Width,
+            d: (bits & 0xff) as Decimals,
+        })
+    }
+}
+
+impl Display for UncheckedFormat {
+    /// Writes the type followed by the width, then `.` and the number of
+    /// decimals if the type takes decimals or the decimal count is nonzero.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let Self { type_, w, d } = self;
+        write!(f, "{}{}", type_, w)?;
+        let show_decimals = type_.takes_decimals() || *d > 0;
+        if show_decimals {
+            write!(f, ".{}", d)?;
+        }
+        Ok(())
+    }
+}
+
+/// A character used as a decimal point or grouping character: either `.` or
+/// `,`.
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Enum, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Decimal {
+ /// The `.` character (the default).
+ #[default]
+ Dot,
+ /// The `,` character.
+ Comma,
+}
+
+impl Decimal {
+    /// Returns this character as a one-character string, `"."` or `","`.
+    pub fn as_str(&self) -> &'static str {
+        match *self {
+            Self::Comma => ",",
+            Self::Dot => ".",
+        }
+    }
+}
+
+impl From<Decimal> for char {
+    /// Returns `'.'` or `','`, by way of the `u8` conversion.
+    fn from(value: Decimal) -> Self {
+        let byte = u8::from(value);
+        char::from(byte)
+    }
+}
+
+impl From<Decimal> for u8 {
+    /// Returns `b'.'` or `b','`.
+    fn from(value: Decimal) -> Self {
+        match value {
+            Decimal::Comma => b',',
+            Decimal::Dot => b'.',
+        }
+    }
+}
+
+impl TryFrom<char> for Decimal {
+    type Error = ();
+
+    /// Accepts `'.'` and `','`; any other character is an error.
+    fn try_from(c: char) -> Result<Self, Self::Error> {
+        let decimal = match c {
+            ',' => Self::Comma,
+            '.' => Self::Dot,
+            _ => return Err(()),
+        };
+        Ok(decimal)
+    }
+}
+
+impl Not for Decimal {
+    type Output = Self;
+
+    /// Returns the other character: `!Dot` is `Comma` and `!Comma` is `Dot`.
+    fn not(self) -> Self::Output {
+        if self == Self::Dot {
+            Self::Comma
+        } else {
+            Self::Dot
+        }
+    }
+}
+
+/// A year used to decide which century a 2-digit year belongs to (see
+/// [Epoch::apply]).
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)]
+pub struct Epoch(pub i32);
+
+impl Epoch {
+    /// Applies the epoch to `year`:
+    ///
+    /// - If `year` is 2 digits (between 0 and 99, inclusive), returns it
+    ///   converted to the correct year considering the epoch.
+    ///
+    /// - Otherwise, returns `year` unchanged.
+    pub fn apply(&self, year: i32) -> i32 {
+        if !(0..=99).contains(&year) {
+            return year;
+        }
+
+        let Self(epoch) = *self;
+        let century = epoch / 100 * 100;
+        let offset = epoch - century;
+        let same_century = year + century;
+        if year >= offset {
+            same_century
+        } else {
+            // Two-digit years below the epoch's offset fall in the following
+            // century.
+            same_century + 100
+        }
+    }
+}
+
+impl Default for Epoch {
+    /// Returns an epoch 69 years before the local year at the time of the
+    /// first call. The value is computed once and then reused.
+    fn default() -> Self {
+        static DEFAULT_EPOCH: LazyLock<Epoch> = LazyLock::new(|| {
+            let this_year = Local::now().year();
+            Epoch(this_year - 69)
+        });
+        *DEFAULT_EPOCH
+    }
+}
+
+impl Display for Epoch {
+    /// Writes the epoch year as a plain integer.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        let Self(year) = self;
+        write!(f, "{}", year)
+    }
+}
+
+/// Settings that affect how numbers and dates are formatted.
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct Settings {
+ /// Epoch for 2-digit years.
+ pub epoch: Epoch,
+
+ /// Either `'.'` or `','`.
+ pub decimal: Decimal,
+
+ /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5`
+ /// instead of `.5`)?
+ pub leading_zero: bool,
+
+ /// Custom currency styles.
+ pub ccs: EnumMap<CC, Option<Box<NumberStyle>>>,
+}
+
+/// The parts of [Settings] that select a built-in [NumberStyle].
+///
+/// Deriving `Enum` lets this be used as the key of the `EnumMap` in
+/// [StyleSet].
+#[derive(Copy, Clone, Enum)]
+struct StyleParams {
+ /// Decimal point character.
+ decimal: Decimal,
+ /// Whether to format with a leading zero.
+ leading_zero: bool,
+}
+impl From<&Settings> for StyleParams {
+    /// Copies `decimal` and `leading_zero` out of `value`.
+    fn from(value: &Settings) -> Self {
+        let Settings {
+            decimal,
+            leading_zero,
+            ..
+        } = value;
+        Self {
+            decimal: *decimal,
+            leading_zero: *leading_zero,
+        }
+    }
+}
+
+/// One [NumberStyle] for every possible [StyleParams] value.
+struct StyleSet(EnumMap<StyleParams, NumberStyle>);
+
+impl StyleSet {
+    /// Builds the set by calling `f` for each [StyleParams] value.
+    fn new(f: impl Fn(StyleParams) -> NumberStyle) -> Self {
+        let styles = EnumMap::from_fn(f);
+        Self(styles)
+    }
+
+    /// Returns the style that matches the parameters in `settings`.
+    fn get(&self, settings: &Settings) -> &NumberStyle {
+        let Self(styles) = self;
+        let params = StyleParams::from(settings);
+        &styles[params]
+    }
+}
+
+impl Settings {
+    /// Returns these settings with custom currency style `cc` set to `style`.
+    pub fn with_cc(mut self, cc: CC, style: NumberStyle) -> Self {
+        let boxed = Box::new(style);
+        self.ccs[cc] = Some(boxed);
+        self
+    }
+
+    /// Returns these settings with `leading_zero` replaced.
+    pub fn with_leading_zero(mut self, leading_zero: bool) -> Self {
+        self.leading_zero = leading_zero;
+        self
+    }
+
+    /// Returns these settings with `epoch` replaced.
+    pub fn with_epoch(mut self, epoch: Epoch) -> Self {
+        self.epoch = epoch;
+        self
+    }
+
+    /// Returns the number style to use for `type_` under these settings.
+    ///
+    /// `F`, `E`, `COMMA`, `DOT`, `DOLLAR`, and `PCT` use built-in styles
+    /// selected by `decimal` and `leading_zero`. Custom currency types use the
+    /// style stored in `ccs`, if any. Everything else, including a custom
+    /// currency type with no stored style, gets a plain default style.
+    pub fn number_style(&self, type_: Type) -> &NumberStyle {
+        static DEFAULT: LazyLock<NumberStyle> =
+            LazyLock::new(|| NumberStyle::new("", "", Decimal::Dot, None, false));
+        static F: LazyLock<StyleSet> = LazyLock::new(|| {
+            StyleSet::new(|p| NumberStyle::new("", "", p.decimal, None, p.leading_zero))
+        });
+        static COMMA: LazyLock<StyleSet> = LazyLock::new(|| {
+            StyleSet::new(|p| NumberStyle::new("", "", p.decimal, Some(!p.decimal), p.leading_zero))
+        });
+        static DOT: LazyLock<StyleSet> = LazyLock::new(|| {
+            StyleSet::new(|p| NumberStyle::new("", "", !p.decimal, Some(p.decimal), p.leading_zero))
+        });
+        static DOLLAR: LazyLock<StyleSet> = LazyLock::new(|| {
+            StyleSet::new(|p| NumberStyle::new("$", "", p.decimal, Some(!p.decimal), false))
+        });
+        static PCT: LazyLock<StyleSet> = LazyLock::new(|| {
+            StyleSet::new(|p| NumberStyle::new("", "%", p.decimal, None, false))
+        });
+
+        match type_ {
+            Type::F | Type::E => F.get(self),
+            Type::Comma => COMMA.get(self),
+            Type::Dot => DOT.get(self),
+            Type::Dollar => DOLLAR.get(self),
+            Type::Pct => PCT.get(self),
+            Type::CC(cc) => self.ccs[cc].as_deref().unwrap_or(&DEFAULT),
+            Type::N
+            | Type::Z
+            | Type::P
+            | Type::PK
+            | Type::IB
+            | Type::PIB
+            | Type::PIBHex
+            | Type::RB
+            | Type::RBHex
+            | Type::Date
+            | Type::ADate
+            | Type::EDate
+            | Type::JDate
+            | Type::SDate
+            | Type::QYr
+            | Type::MoYr
+            | Type::WkYr
+            | Type::DateTime
+            | Type::YmdHms
+            | Type::MTime
+            | Type::Time
+            | Type::DTime
+            | Type::WkDay
+            | Type::Month
+            | Type::A
+            | Type::AHex => &DEFAULT,
+        }
+    }
+}
+
+/// A numeric output style. This can express numeric formats in
+/// [Category::Basic] and [Category::Custom].
+#[derive(Clone, Debug, Serialize)]
+pub struct NumberStyle {
+ /// Written before negative numbers, ahead of `prefix`.
+ pub neg_prefix: Affix,
+ /// Written before the number.
+ pub prefix: Affix,
+ /// Written after the number.
+ pub suffix: Affix,
+ /// Written after negative numbers, following `suffix`.
+ pub neg_suffix: Affix,
+
+ /// Decimal point.
+ pub decimal: Decimal,
+
+ /// Grouping character.
+ pub grouping: Option<Decimal>,
+
+ /// Format as `.5` or `0.5`?
+ pub leading_zero: bool,
+
+ /// An `Affix` may require more bytes than its display width; for example,
+ /// U+00A5 (¥) is 2 bytes in UTF-8 but occupies only one display column.
+ /// This member is the sum of the number of bytes required by all of the
+ /// `Affix` members in this struct, minus their display widths. Thus, it
+ /// can be used to size memory allocations: for example, the formatted
+ /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in
+ /// UTF-8.
+ #[serde(skip)]
+ pub extra_bytes: usize,
+}
+
+impl Display for NumberStyle {
+    /// Display this number style in the format used for custom currency: the
+    /// four affixes (negative prefix, prefix, suffix, negative suffix)
+    /// separated by the grouping character.
+    ///
+    /// This format can only accurately represent number styles that include a
+    /// grouping character. If this number style doesn't, it will pretend that
+    /// the grouping character is the opposite of the decimal point character.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        let separator = char::from(!self.decimal);
+        let affixes = [
+            &self.neg_prefix,
+            &self.prefix,
+            &self.suffix,
+            &self.neg_suffix,
+        ];
+        for (index, affix) in affixes.into_iter().enumerate() {
+            if index > 0 {
+                write!(f, "{}", separator)?;
+            }
+            write!(f, "{}", affix.display(separator))?;
+        }
+        Ok(())
+    }
+}
+
+impl NumberStyle {
+    /// Builds a style with the given `prefix` and `suffix`, a `-` negative
+    /// prefix, and an empty negative suffix.
+    ///
+    /// `prefix` and `suffix` are expected to be ASCII (checked in debug
+    /// builds), so that `extra_bytes` can be zero.
+    fn new(
+        prefix: &str,
+        suffix: &str,
+        decimal: Decimal,
+        grouping: Option<Decimal>,
+        leading_zero: bool,
+    ) -> Self {
+        // ASCII affixes take exactly as many bytes as display columns, which
+        // is what makes zero correct for `extra_bytes`.
+        debug_assert!(prefix.is_ascii());
+        debug_assert!(suffix.is_ascii());
+
+        let neg_prefix = Affix::new("-");
+        let prefix = Affix::new(prefix);
+        let suffix = Affix::new(suffix);
+        let neg_suffix = Affix::new("");
+        Self {
+            neg_prefix,
+            prefix,
+            suffix,
+            neg_suffix,
+            decimal,
+            grouping,
+            leading_zero,
+            extra_bytes: 0,
+        }
+    }
+
+    /// Returns the combined display width of the prefix and suffix.
+    fn affix_width(&self) -> usize {
+        let Self { prefix, suffix, .. } = self;
+        prefix.width + suffix.width
+    }
+}
+
+/// A prefix or suffix string together with its display width.
+#[derive(Clone, Debug, Serialize)]
+pub struct Affix {
+ /// String contents of affix.
+ pub s: String,
+
+ #[serde(skip)]
+ /// Display width in columns (see [unicode_width])
+ pub width: usize,
+}
+
+impl Affix {
+    /// Builds an affix from `s`, recording its display width.
+    fn new(s: impl Into<String>) -> Self {
+        let s: String = s.into();
+        let width = s.width();
+        Self { s, width }
+    }
+
+    /// Returns how many more bytes the string occupies than display columns.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the display width exceeds the byte length.
+    fn extra_bytes(&self) -> usize {
+        let n_bytes = self.s.len();
+        n_bytes.checked_sub(self.width).unwrap()
+    }
+
+    /// Returns a displayable form of the affix in which each occurrence of
+    /// `escape` is preceded by an apostrophe.
+    fn display(&self, escape: char) -> DisplayAffix<'_> {
+        let affix = self.s.as_str();
+        DisplayAffix { affix, escape }
+    }
+}
+
+/// Helper for displaying an affix with one character escaped.
+pub struct DisplayAffix<'a> {
+    /// Affix text to write.
+    affix: &'a str,
+    /// Character that must be preceded by an apostrophe in the output.
+    escape: char,
+}
+
+impl Display for DisplayAffix<'_> {
+    /// Writes the affix, inserting `'` before each occurrence of the escape
+    /// character.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        self.affix.chars().try_for_each(|c| {
+            if c == self.escape {
+                f.write_char('\'')?;
+            }
+            f.write_char(c)
+        })
+    }
+}
+
+impl FromStr for NumberStyle {
+    type Err = ();
+
+    /// Parses a custom currency specification: four affixes (negative prefix,
+    /// prefix, suffix, negative suffix) separated by three grouping
+    /// characters, which are either all commas or all periods.
+    ///
+    /// The separator becomes the grouping character and the other of the two
+    /// becomes the decimal point. `leading_zero` is always false in the
+    /// result. Fails if the separator cannot be determined.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        /// Determines which of `,` and `.` is the separator in `s`.
+        ///
+        /// Counts commas and periods, skipping any byte that directly follows
+        /// an apostrophe. Exactly one of the two counts must be three.
+        fn find_separator(s: &str) -> Option<char> {
+            let mut n_commas = 0;
+            let mut n_periods = 0;
+            let mut previous = None;
+            for &byte in s.as_bytes() {
+                if previous != Some(b'\'') {
+                    match byte {
+                        b',' => n_commas += 1,
+                        b'.' => n_periods += 1,
+                        _ => (),
+                    }
+                }
+                previous = Some(byte);
+            }
+
+            match (n_commas == 3, n_periods == 3) {
+                (true, false) => Some(','),
+                (false, true) => Some('.'),
+                _ => None,
+            }
+        }
+
+        /// Consumes characters from `iter` up to and including the next
+        /// unquoted `grouping` character (or the end of input) and returns
+        /// what came before it as an [Affix]. An apostrophe quotes the
+        /// character that follows it and is itself dropped.
+        fn take_cc_token(iter: &mut Chars<'_>, grouping: char) -> Affix {
+            let mut token = String::new();
+            let mut quoted = false;
+            while let Some(c) = iter.next() {
+                match c {
+                    '\'' if !quoted => quoted = true,
+                    c if c == grouping && !quoted => break,
+                    c => {
+                        token.push(c);
+                        quoted = false;
+                    }
+                }
+            }
+            Affix::new(token)
+        }
+
+        let grouping = find_separator(s).ok_or(())?;
+
+        let mut chars = s.chars();
+        let neg_prefix = take_cc_token(&mut chars, grouping);
+        let prefix = take_cc_token(&mut chars, grouping);
+        let suffix = take_cc_token(&mut chars, grouping);
+        let neg_suffix = take_cc_token(&mut chars, grouping);
+
+        let grouping = Decimal::try_from(grouping).unwrap();
+        let extra_bytes = neg_prefix.extra_bytes()
+            + prefix.extra_bytes()
+            + suffix.extra_bytes()
+            + neg_suffix.extra_bytes();
+        Ok(Self {
+            neg_prefix,
+            prefix,
+            suffix,
+            neg_suffix,
+            decimal: !grouping,
+            grouping: Some(grouping),
+            leading_zero: false,
+            extra_bytes,
+        })
+    }
+}
+
+/// An item within a [DateTemplate]: a run of one repeated character.
+pub struct TemplateItem {
+ /// Character in the template.
+ pub c: char,
+
+ /// Number of repetitions of the character.
+ pub n: usize,
+}
+
+/// A template for date and time formats.
+///
+/// Iterating over a template yields one [TemplateItem] per run of identical
+/// characters, consuming the template as it goes.
+#[derive(Clone)]
+pub struct DateTemplate(&'static str);
+
+impl DateTemplate {
+    /// Returns a [DateTemplate] used for date and time input and output in a
+    /// field of the given `type_` and `width`, or `None` if `type_` has no
+    /// template.
+    ///
+    /// `width` only affects whether a 2-digit year or a 4-digit year is used,
+    /// that is, whether the returned string contains `yy` or `yyyy`, and
+    /// whether seconds are included, that is, whether the returned string
+    /// contains `:SS`. A caller that doesn't care whether the returned string
+    /// contains `yy` or `yyyy` or `:SS` can just specify 0 to omit them.
+    pub fn new(type_: Type, width: usize) -> Option<Self> {
+        let (short, long) = match type_ {
+            Type::Date => ("dd-mmm-yy", "dd-mmm-yyyy"),
+            Type::ADate => ("mm/dd/yy", "mm/dd/yyyy"),
+            Type::EDate => ("dd.mm.yy", "dd.mm.yyyy"),
+            Type::JDate => ("yyddd", "yyyyddd"),
+            Type::SDate => ("yy/mm/dd", "yyyy/mm/dd"),
+            Type::QYr => ("q Q yy", "q Q yyyy"),
+            Type::MoYr => ("mmm yy", "mmm yyyy"),
+            Type::WkYr => ("ww WK yy", "ww WK yyyy"),
+            Type::DateTime => ("dd-mmm-yyyy HH:MM", "dd-mmm-yyyy HH:MM:SS"),
+            Type::YmdHms => ("yyyy-mm-dd HH:MM", "yyyy-mm-dd HH:MM:SS"),
+            Type::MTime => ("MM", "MM:SS"),
+            Type::Time => ("HH:MM", "HH:MM:SS"),
+            Type::DTime => ("D HH:MM", "D HH:MM:SS"),
+            Type::F
+            | Type::Comma
+            | Type::Dot
+            | Type::Dollar
+            | Type::Pct
+            | Type::E
+            | Type::CC(_)
+            | Type::N
+            | Type::Z
+            | Type::P
+            | Type::PK
+            | Type::IB
+            | Type::PIB
+            | Type::PIBHex
+            | Type::RB
+            | Type::RBHex
+            | Type::WkDay
+            | Type::Month
+            | Type::A
+            | Type::AHex => return None,
+        };
+        // Use the long form only if the field is wide enough to hold it.
+        let template = if width >= long.len() { long } else { short };
+        Some(DateTemplate(template))
+    }
+
+    /// Returns the template for `format`'s type and width, if there is one.
+    pub fn for_format(format: Format) -> Option<Self> {
+        let type_ = format.type_();
+        let width = format.w();
+        Self::new(type_, width)
+    }
+
+    /// Returns the length in bytes of the part of the template that has not
+    /// yet been consumed by iteration.
+    #[allow(clippy::len_without_is_empty)]
+    pub fn len(&self) -> usize {
+        let Self(template) = self;
+        template.len()
+    }
+}
+
+impl Iterator for DateTemplate {
+    type Item = TemplateItem;
+
+    /// Removes the leading run of identical characters from the template and
+    /// returns it as a [TemplateItem], or returns `None` if the template is
+    /// empty.
+    fn next(&mut self) -> Option<Self::Item> {
+        let c = self.0.chars().next()?;
+        let n = self.0.chars().take_while(|&other| other == c).count();
+        // The run consists of `n` copies of `c`, so this is its byte length.
+        self.0 = &self.0[n * c.len_utf8()..];
+        Some(TemplateItem { c, n })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::format::{Format, Type, Width};
+
+    /// Checks the width that `Format::codepage_to_unicode` produces for a
+    /// range of string formats, plus one numeric format that must not change.
+    #[test]
+    fn codepage_to_unicode() {
+        // (type, input width, expected output width)
+        let cases: [(Type, Width, Width); 12] = [
+            (Type::A, 1, 3),
+            (Type::A, 2, 6),
+            (Type::A, 3, 9),
+            (Type::A, 1000, 3000),
+            (Type::A, 20000, 32767),
+            (Type::AHex, 2, 6),
+            (Type::AHex, 4, 12),
+            (Type::AHex, 6, 18),
+            (Type::AHex, 2000, 6000),
+            (Type::AHex, 20000, 60000),
+            (Type::AHex, 30000, 65534),
+            (Type::F, 40, 40),
+        ];
+        for (type_, width, expected_width) in cases {
+            let input = Format::new(type_, width, 0).unwrap();
+            let mut output = input;
+            output.codepage_to_unicode();
+            let expected = Format::new(input.type_, expected_width, input.d).unwrap();
+            assert_eq!(output, expected);
+        }
+    }
+}
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+use std::{
+ cmp::min,
+ fmt::{Display, Error as FmtError, Formatter, Result as FmtResult, Write as _},
+ io::{Error as IoError, Write as IoWrite},
+ str::from_utf8_unchecked,
+};
+
+use binrw::Endian;
+use chrono::{Datelike, NaiveDate};
+use encoding_rs::{Encoding, UTF_8};
+use libm::frexp;
+use smallstr::SmallString;
+use smallvec::{Array, SmallVec};
+
+use crate::{
+ calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name},
+ data::{ByteStr, Datum, EncodedString, QuotedDatum, WithEncoding},
+ endian::ToBytes,
+ format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type},
+ settings::{EndianSettings, Settings as PsppSettings},
+ util::ToSmallString,
+};
+
+/// A [Datum] paired with the [Format] and settings used to display it.
+///
+/// Its [Display] implementation writes the formatted value.
+pub struct DisplayDatum<'b, B> {
+ /// Format to display the datum in.
+ format: Format,
+ /// Format settings (number styles, epoch, decimal point).
+ settings: &'b Settings,
+ /// Endianness settings.
+ endian: EndianSettings,
+ /// The value to display.
+ datum: Datum<B>,
+
+ /// If true, the output will remove leading and trailing spaces from numeric
+ /// values, and trailing spaces from string values. (This might make the
+ /// output narrower than the requested width.)
+ trim_spaces: bool,
+
+ /// If true, the output will include a double quote before and after string
+ /// values.
+ quote_strings: bool,
+}
+
+#[cfg(test)]
+mod test;
+
+/// Conversion to a [DisplayPlainF64], which implements [Display].
+pub trait DisplayPlain {
+ /// Returns an object that displays `self` as a plain number.
+ fn display_plain(&self) -> DisplayPlainF64;
+}
+
+impl DisplayPlain for f64 {
+    /// Wraps this number for plain display with `.` as the decimal point.
+    fn display_plain(&self) -> DisplayPlainF64 {
+        let value = *self;
+        DisplayPlainF64 {
+            value,
+            decimal: '.',
+        }
+    }
+}
+
+/// A number to be displayed plainly, with a configurable decimal point
+/// character.
+pub struct DisplayPlainF64 {
+ /// The number to display.
+ pub value: f64,
+ /// Character written in place of the first `.` in the formatted number.
+ pub decimal: char,
+}
+
+impl DisplayPlainF64 {
+    /// Returns this object with its decimal point character replaced by
+    /// `decimal`.
+    pub fn with_decimal(mut self, decimal: char) -> Self {
+        self.decimal = decimal;
+        self
+    }
+}
+
+impl Display for DisplayPlainF64 {
+    /// Writes the number, using scientific notation for very small or very
+    /// large magnitudes, no decimals for integers, and full precision
+    /// otherwise. If the decimal character is not `.`, it replaces the first
+    /// `.` in the output.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        /// Formats the number with `.` as the decimal point.
+        struct Inner(f64);
+
+        impl Display for Inner {
+            fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+                let value = self.0;
+                let magnitude = value.abs();
+                let very_small = magnitude < 0.0005 && value != 0.0;
+                let very_large = magnitude > 1e15;
+                if very_small || very_large {
+                    // Print numbers that would otherwise have lots of leading
+                    // or trailing zeros in scientific notation with full
+                    // precision.
+                    write!(f, "{value:.e}")
+                } else if value == value.trunc() {
+                    // Print integers without decimal places.
+                    write!(f, "{value:.0}")
+                } else {
+                    // Print other numbers with full precision.
+                    write!(f, "{value:.}")
+                }
+            }
+        }
+
+        let plain = Inner(self.value);
+        if self.decimal == '.' {
+            return write!(f, "{}", plain);
+        }
+
+        let text = plain.to_small_string::<64>();
+        match text.split_once('.') {
+            Some((integer, fraction)) => {
+                f.write_str(integer)?;
+                f.write_char(self.decimal)?;
+                f.write_str(fraction)
+            }
+            None => f.write_str(&text),
+        }
+    }
+}
+
+impl<'a, D> Datum<D>
+where
+    D: EncodedString,
+{
+    /// Returns an object that implements [Display] for printing this [Datum] as
+    /// `format`.
+    ///
+    /// [Display]: std::fmt::Display
+    pub fn display(&'a self, format: Format) -> DisplayDatum<'a, WithEncoding<&'a ByteStr>> {
+        let borrowed = self.as_borrowed();
+        DisplayDatum::new(format, borrowed)
+    }
+
+    /// Returns the result of [Datum::quoted] for this datum.
+    pub fn display_plain(&self) -> QuotedDatum<'_, D> {
+        let quoted = self.quoted();
+        quoted
+    }
+}
+
+impl<'b, B> Display for DisplayDatum<'b, B>
+where
+    B: EncodedString,
+{
+    /// Writes the datum according to its format.
+    ///
+    /// String values are written as hex digits for [Type::AHex], and otherwise
+    /// as text, optionally with trailing spaces trimmed and with surrounding
+    /// double quotes. Missing numeric values are written by `missing`. Other
+    /// numeric values are dispatched by format type.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        let number = match &self.datum {
+            Datum::Number(number) => *number,
+            Datum::String(string) => {
+                if self.format.type_() == Type::AHex {
+                    for byte in string.raw_string_bytes() {
+                        write!(f, "{byte:02x}")?;
+                    }
+                } else {
+                    let quote = if self.quote_strings { "\"" } else { "" };
+                    let s = string.as_str();
+                    let s = if self.trim_spaces {
+                        s.trim_end_matches(' ')
+                    } else {
+                        &s
+                    };
+                    write!(f, "{quote}{s}{quote}")?;
+                }
+                return Ok(());
+            }
+        };
+
+        let Some(number) = number else {
+            return self.missing(f);
+        };
+
+        match self.format.type_() {
+            Type::F
+            | Type::Comma
+            | Type::Dot
+            | Type::Dollar
+            | Type::Pct
+            | Type::E
+            | Type::CC(_) => self.number(f, number),
+            Type::N => self.n(f, number),
+            Type::Z => self.z(f, number),
+
+            Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => self.fmt_binary(f),
+
+            Type::PIBHex => self.pibhex(f, number),
+            Type::RBHex => self.rbhex(f, number),
+            Type::Date
+            | Type::ADate
+            | Type::EDate
+            | Type::JDate
+            | Type::SDate
+            | Type::QYr
+            | Type::MoYr
+            | Type::WkYr
+            | Type::DateTime
+            | Type::YmdHms
+            | Type::MTime
+            | Type::Time
+            | Type::DTime => self.date(f, number),
+            Type::WkDay => {
+                // `WKDAY` has no `DateTemplate`, so it cannot go through
+                // `date`. Format it like `month` does: the name truncated to
+                // the field width, or the missing value if out of range.
+                const WEEKDAYS: [&str; 7] = [
+                    "SUNDAY",
+                    "MONDAY",
+                    "TUESDAY",
+                    "WEDNESDAY",
+                    "THURSDAY",
+                    "FRIDAY",
+                    "SATURDAY",
+                ];
+                if (1.0..8.0).contains(&number) {
+                    let name = WEEKDAYS[number as usize - 1];
+                    if !self.trim_spaces {
+                        write!(f, "{name:.*}", self.format.w())
+                    } else {
+                        f.write_str(name)
+                    }
+                } else {
+                    self.missing(f)
+                }
+            }
+            Type::Month => self.month(f, number),
+            Type::A | Type::AHex => unreachable!(),
+        }
+    }
+}
+
+impl<'b, B> DisplayDatum<'b, B>
+where
+ B: EncodedString,
+{
+    /// Creates a display object for `datum` in `format`, taking the format
+    /// settings and endianness from the global settings, with trimming and
+    /// string quoting both disabled.
+    pub fn new(format: Format, datum: Datum<B>) -> Self {
+        let global = PsppSettings::global();
+        let settings = &global.formats;
+        let endian = global.endian;
+        Self {
+            format,
+            settings,
+            endian,
+            datum,
+            trim_spaces: false,
+            quote_strings: false,
+        }
+    }
+    /// Returns this object with its format settings replaced by `settings`.
+    pub fn with_settings(mut self, settings: &'b Settings) -> Self {
+        self.settings = settings;
+        self
+    }
+    /// Returns this object with its endianness settings replaced by `endian`.
+    pub fn with_endian(mut self, endian: EndianSettings) -> Self {
+        self.endian = endian;
+        self
+    }
+    /// Returns this object with space trimming enabled.
+    pub fn with_trimming(mut self) -> Self {
+        self.trim_spaces = true;
+        self
+    }
+    /// Returns this object with double quotes enabled around string values.
+    pub fn with_quoted_string(mut self) -> Self {
+        self.quote_strings = true;
+        self
+    }
+    /// Writes the binary form of the datum, one `char` per byte.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `to_binary` returns `None`.
+    fn fmt_binary(&self, f: &mut Formatter) -> FmtResult {
+        self.to_binary()
+            .unwrap()
+            .into_iter()
+            .try_for_each(|byte| f.write_char(byte as char))
+    }
+    /// Writes `number` in a basic or custom currency numeric format.
+    ///
+    /// Non-finite values go to `infinite`. For formats other than `E`, if the
+    /// magnitude is small enough for the width, this tries in order: decimal
+    /// notation with affixes, scientific notation with affixes, and decimal
+    /// notation without affixes. Failing those, it tries scientific notation
+    /// without affixes, and finally writes the overflow indication.
+    fn number(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
+        if !number.is_finite() {
+            return self.infinite(f, number);
+        }
+
+        let style = self.settings.number_style(self.format.type_);
+        let try_decimal =
+            self.format.type_ != Type::E && number.abs() < 1.5 * power10(self.format.w());
+        if try_decimal {
+            let rounder = Rounder::new(style, number, self.format.d);
+            let written = self.decimal(f, &rounder, style, true)?
+                || self.scientific(f, number, style, true)?
+                || self.decimal(f, &rounder, style, false)?;
+            if written {
+                return Ok(());
+            }
+        }
+
+        if self.scientific(f, number, style, false)? {
+            Ok(())
+        } else {
+            self.overflow(f)
+        }
+    }
+
+    /// Writes a non-finite `number` as `NaN`, `+Infinity`, or `-Infinity`.
+    ///
+    /// The text is truncated to the format width. Unless spaces are being
+    /// trimmed, it is also right-aligned in that width. If the format width is
+    /// less than 3, writes the overflow indication instead.
+    fn infinite(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
+        if self.format.w >= 3 {
+            let s = if number.is_nan() {
+                "NaN"
+            } else if number.is_infinite() {
+                if number.is_sign_positive() {
+                    "+Infinity"
+                } else {
+                    "-Infinity"
+                }
+            } else {
+                "Unknown"
+            };
+            // Trimming drops only the padding. The precision must stay at the
+            // format width, because a precision of 0 would truncate the text
+            // to nothing.
+            let max_width = self.format.w();
+            let pad_width = if self.trim_spaces { 0 } else { max_width };
+            write!(f, "{s:>pad_width$.max_width$}")
+        } else {
+            self.overflow(f)
+        }
+    }
+
+    /// Writes the representation of a missing value.
+    ///
+    /// Binary formats and `RBHEX` write a binary encoding (`RBHEX` uses
+    /// `-f64::MAX`). With trimming, the output is a single `.`. Otherwise the
+    /// output is a field of spaces with a `.` placed where the decimal point
+    /// would fall for the format.
+    fn missing(&self, f: &mut Formatter<'_>) -> FmtResult {
+        let type_ = self.format.type_;
+        if matches!(
+            type_,
+            Type::P | Type::PK | Type::IB | Type::PIB | Type::RB
+        ) {
+            return self.fmt_binary(f);
+        }
+        if matches!(type_, Type::RBHex) {
+            return self.rbhex(f, -f64::MAX);
+        }
+
+        if self.trim_spaces {
+            return f.write_str(".");
+        }
+
+        let w = self.format.w() as isize;
+        let d = self.format.d() as isize;
+        // Number of columns at and after the dot, depending on the format.
+        let reserved = match type_ {
+            Type::N => 1,
+            Type::Pct => d + 2,
+            Type::E => d + 5,
+            _ => d + 1,
+        };
+        let dot_position = (w - reserved).max(0) as u16;
+
+        (0..self.format.w).try_for_each(|column| {
+            let c = if column == dot_position { "." } else { " " };
+            f.write_str(c)
+        })
+    }
+
+    /// Writes the overflow indication: a single `*` when trimming spaces,
+    /// otherwise a field of `*` as wide as the format.
+    fn overflow(&self, f: &mut Formatter<'_>) -> FmtResult {
+        let n_stars = if self.trim_spaces { 1 } else { self.format.w };
+        (0..n_stars).try_for_each(|_| f.write_str("*"))
+    }
+
+ /// Tries to write the number held by `rounder` in ordinary decimal
+ /// notation within the format width.
+ ///
+ /// Candidate numbers of decimal places are tried from `self.format.d` down
+ /// to 0, and the first that fits is used. If `require_affixes` is true, a
+ /// candidate is accepted only if the style's prefix and suffix fit as
+ /// well.
+ ///
+ /// Returns `Ok(true)` if the number was written, `Ok(false)` if no
+ /// candidate fit (in which case nothing is written).
+ fn decimal(
+ &self,
+ f: &mut Formatter<'_>,
+ rounder: &Rounder,
+ style: &NumberStyle,
+ require_affixes: bool,
+ ) -> Result<bool, FmtError> {
+ for decimals in (0..=self.format.d).rev() {
+ // Make sure there's room for the number's magnitude, plus the
+ // negative suffix, plus (if negative) the negative prefix.
+ let RounderWidth {
+ mut width,
+ integer_digits,
+ negative,
+ } = rounder.width(decimals as usize);
+ width += style.neg_suffix.width;
+ if negative {
+ width += style.neg_prefix.width;
+ }
+ if width > self.format.w() {
+ continue;
+ }
+
+ // If there's room for the prefix and suffix, allocate
+ // space. If the affixes are required, but there's no
+ // space, give up.
+ let add_affixes = allocate_space(style.affix_width(), self.format.w(), &mut width);
+ if !add_affixes && require_affixes {
+ continue;
+ }
+
+ // Check whether we should include grouping characters. We need
+ // room for a complete set or we don't insert any at all. We don't
+ // include grouping characters if decimal places were requested but
+ // they were all dropped.
+ let grouping = style.grouping.filter(|_| {
+ integer_digits > 3
+ && (self.format.d == 0 || decimals > 0)
+ && allocate_space((integer_digits - 1) / 3, self.format.w(), &mut width)
+ });
+
+ // Assemble number.
+ let magnitude = rounder.format(decimals as usize);
+ let mut output = SmallString::<[u8; 40]>::new();
+ // Left-pad with spaces up to the format width unless trimming.
+ if !self.trim_spaces {
+ for _ in width..self.format.w() {
+ output.push(' ');
+ }
+ }
+ if negative {
+ output.push_str(&style.neg_prefix.s);
+ }
+ if add_affixes {
+ output.push_str(&style.prefix.s);
+ }
+ if let Some(grouping) = grouping {
+ // Insert a grouping character before every group of three
+ // integer digits, counting from the right.
+ for (i, digit) in magnitude[..integer_digits].bytes().enumerate() {
+ if i > 0 && (integer_digits - i) % 3 == 0 {
+ output.push(grouping.into());
+ }
+ output.push(digit as char);
+ }
+ } else {
+ output.push_str(&magnitude[..integer_digits]);
+ }
+ if decimals > 0 {
+ output.push(style.decimal.into());
+ // Skip the decimal point in `magnitude` and copy the fraction.
+ let s = &magnitude[integer_digits + 1..];
+ output.push_str(&s[..decimals as usize]);
+ }
+ if add_affixes {
+ output.push_str(&style.suffix.s);
+ }
+ if negative {
+ output.push_str(&style.neg_suffix.s);
+ } else {
+ // Keep the columns aligned with negative numbers.
+ for _ in 0..style.neg_suffix.width {
+ output.push(' ');
+ }
+ }
+
+ debug_assert!(self.trim_spaces || output.len() >= self.format.w());
+ debug_assert!(output.len() <= self.format.w() + style.extra_bytes);
+ f.write_str(&output)?;
+ return Ok(true);
+ }
+ Ok(false)
+ }
+
+    /// Tries to write `number` in scientific notation within the format
+    /// width.
+    ///
+    /// If `require_affixes` is true, this succeeds only if the style's prefix
+    /// and suffix fit as well.
+    ///
+    /// Returns `Ok(true)` if the number was written. Returns `Ok(false)`,
+    /// writing nothing, if it does not fit or if the exponent has more than
+    /// three digits.
+    fn scientific(
+        &self,
+        f: &mut Formatter<'_>,
+        number: f64,
+        style: &NumberStyle,
+        require_affixes: bool,
+    ) -> Result<bool, FmtError> {
+        // Use one sign test for both the negative prefix and the negative
+        // suffix, so they are always written (or omitted) together.
+        let negative = number < 0.0;
+
+        // Allocate minimum required space.
+        let mut width = 6 + style.neg_suffix.width;
+        if negative {
+            width += style.neg_prefix.width;
+        }
+        if width > self.format.w() {
+            return Ok(false);
+        }
+
+        // Check for room for prefix and suffix.
+        let add_affixes = allocate_space(style.affix_width(), self.format.w(), &mut width);
+        if require_affixes && !add_affixes {
+            return Ok(false);
+        }
+
+        // Figure out number of characters we can use for the fraction, if any.
+        // (If that turns out to be `1`, then we'll output a decimal point
+        // without any digits following.)
+        let mut fraction_width = min(self.format.d as usize + 1, self.format.w() - width).min(16);
+        if self.format.type_ != Type::E && fraction_width == 1 {
+            fraction_width = 0;
+        }
+        width += fraction_width;
+
+        let mut output = SmallString::<[u8; 40]>::new();
+        if !self.trim_spaces {
+            for _ in width..self.format.w() {
+                output.push(' ');
+            }
+        }
+        if negative {
+            output.push_str(&style.neg_prefix.s);
+        }
+        if add_affixes {
+            output.push_str(&style.prefix.s);
+        }
+        write!(
+            &mut output,
+            "{:.*E}",
+            fraction_width.saturating_sub(1),
+            number.abs()
+        )
+        .unwrap();
+        if fraction_width == 1 {
+            // Insert `.` before the `E`, to get a value like "1.E+000".
+            output.insert(output.find('E').unwrap(), '.');
+        }
+
+        // Rust always uses `.` as the decimal point. Translate to `,` if
+        // necessary.
+        if style.decimal == Decimal::Comma {
+            fix_decimal_point(&mut output);
+        }
+
+        // Make exponent have exactly three digits, plus sign.
+        let e = output.as_bytes().iter().position(|c| *c == b'E').unwrap();
+        let exponent: isize = output[e + 1..].parse().unwrap();
+        if exponent.abs() > 999 {
+            return Ok(false);
+        }
+        output.truncate(e + 1);
+        write!(&mut output, "{exponent:+04}").unwrap();
+
+        // Add suffixes.
+        if add_affixes {
+            output.push_str(&style.suffix.s);
+        }
+        if negative {
+            output.push_str(&style.neg_suffix.s);
+        } else {
+            for _ in 0..style.neg_suffix.width {
+                output.push(' ');
+            }
+        }
+
+        debug_assert!(self.trim_spaces || output.len() >= self.format.w());
+        debug_assert!(output.len() <= self.format.w() + style.extra_bytes);
+        f.write_str(&output)?;
+        Ok(true)
+    }
+
+    /// Writes `number` in `N` format: zero-padded on the left to the format
+    /// width.
+    ///
+    /// Negative numbers are written as missing. Numbers too long for the
+    /// width are written as overflow.
+    fn n(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
+        if number < 0.0 {
+            return self.missing(f);
+        }
+
+        let legacy = LegacyFormat::new(number, self.format.d());
+        match self.format.w().checked_sub(legacy.len()) {
+            Some(n_zeros) => write!(f, "{}{legacy}", Zeros(n_zeros)),
+            None => self.overflow(f),
+        }
+    }
+
+    /// Writes `number` in `Z` format: zero-padded on the left to the format
+    /// width, with the last digit of a negative number replaced by the
+    /// corresponding character from `}JKLMNOPQR`.
+    ///
+    /// Numbers too long for the width are written as overflow.
+    fn z(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
+        let legacy = LegacyFormat::new(number, self.format.d());
+        let Some(n_zeros) = self.format.w().checked_sub(legacy.len()) else {
+            return self.overflow(f);
+        };
+
+        let mut digits = legacy.to_small_string::<40>();
+        if number < 0.0 {
+            if let Some(last) = digits.pop() {
+                let last = last.to_digit(10).unwrap();
+                digits.push(b"}JKLMNOPQR"[last as usize] as char);
+            }
+        }
+        write!(f, "{}{digits}", Zeros(n_zeros))
+    }
+
+    /// Writes `number` in `PIBHEX` format: the rounded value as an unsigned
+    /// binary integer of half the format width in bytes, in hexadecimal.
+    ///
+    /// Negative numbers and numbers too large for that many bytes are written
+    /// as overflow.
+    fn pibhex(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
+        if number < 0.0 {
+            return self.overflow(f);
+        }
+
+        let n_bytes = self.format.w / 2;
+        let number = number.round();
+        if number >= power256(n_bytes) {
+            return self.overflow(f);
+        }
+
+        let binary = integer_to_binary(number as u64, n_bytes);
+        output_hex(f, &binary)
+    }
+
+    /// Writes `number` in `RBHEX` format: the `rb` encoding in half the format
+    /// width in bytes, written in hexadecimal.
+    fn rbhex(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
+        let n_bytes = self.format.w() / 2;
+        let binary = self.rb(Some(number), n_bytes);
+        output_hex(f, &binary)
+    }
+
+ /// Writes `number` as a date or time by following the [DateTemplate] for
+ /// this format.
+ ///
+ /// For formats in [Category::Date], `number` is split into a date
+ /// (converted from `number / DAY`) and the remainder within the day. A
+ /// negative number or a failed conversion is written as missing. For
+ /// formats in [Category::Time], all of `number` is treated as a time.
+ ///
+ /// Unless spaces are being trimmed, the result is right-aligned in the
+ /// format width.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the format is in neither category, or if it has no
+ /// [DateTemplate].
+ fn date(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
+ // Lengths of time units, in seconds.
+ const MINUTE: f64 = 60.0;
+ const HOUR: f64 = 60.0 * 60.0;
+ const DAY: f64 = 60.0 * 60.0 * 24.0;
+
+ let (date, mut time) = match self.format.type_.category() {
+ Category::Date => {
+ if number < 0.0 {
+ return self.missing(f);
+ }
+ let Some(date) = calendar_offset_to_gregorian(number / DAY) else {
+ return self.missing(f);
+ };
+ (date, number % DAY)
+ }
+ Category::Time => (NaiveDate::MIN, number),
+ _ => unreachable!(),
+ };
+
+ let mut output = SmallString::<[u8; 40]>::new();
+ for TemplateItem { c, n } in DateTemplate::for_format(self.format).unwrap() {
+ match c {
+ // `dd` is the day of the month; `ddd` is the day of the year.
+ 'd' if n < 3 => write!(&mut output, "{:02}", date.day()).unwrap(),
+ 'd' => write!(&mut output, "{:03}", day_of_year(date).unwrap_or(1)).unwrap(),
+ // `mm` is the month number; `mmm` is the short month name.
+ 'm' if n < 3 => write!(&mut output, "{:02}", date.month()).unwrap(),
+ 'm' => write!(&mut output, "{}", short_month_name(date.month()).unwrap()).unwrap(),
+ 'y' if n >= 4 => {
+ let year = date.year();
+ if year <= 9999 {
+ write!(&mut output, "{year:04}").unwrap();
+ } else if self.format.type_ == Type::DateTime
+ || self.format.type_ == Type::YmdHms
+ {
+ write!(&mut output, "****").unwrap();
+ } else {
+ return self.overflow(f);
+ }
+ }
+ 'y' => {
+ // A 2-digit year can only represent the 100 years starting
+ // at the epoch.
+ let epoch = self.settings.epoch.0;
+ let offset = date.year() - epoch;
+ if !(0..=99).contains(&offset) {
+ return self.overflow(f);
+ }
+ write!(&mut output, "{:02}", date.year().abs() % 100).unwrap();
+ }
+ 'q' => write!(&mut output, "{}", date.month0() / 3 + 1).unwrap(),
+ 'w' => write!(
+ &mut output,
+ "{:2}",
+ (day_of_year(date).unwrap_or(1) - 1) / 7 + 1
+ )
+ .unwrap(),
+ 'D' => {
+ if time < 0.0 {
+ output.push('-');
+ }
+ time = time.abs();
+ write!(&mut output, "{:1$.0}", (time / DAY).floor(), n).unwrap();
+ time %= DAY;
+ }
+ 'H' => {
+ if time < 0.0 {
+ output.push('-');
+ }
+ time = time.abs();
+ write!(&mut output, "{:01$.0}", (time / HOUR).floor(), n).unwrap();
+ time %= HOUR;
+ }
+ 'M' => {
+ if time < 0.0 {
+ output.push('-');
+ }
+ time = time.abs();
+ write!(&mut output, "{:02.0}", (time / MINUTE).floor()).unwrap();
+ time %= MINUTE;
+
+ // Seconds are handled here rather than through the template:
+ // whether and how they are written depends on how much of
+ // the format width is left. The loop ends afterward.
+ let excess_width = self.format.w() as isize - output.len() as isize;
+ if excess_width < 0 || (self.format.type_ == Type::MTime && excess_width < 3) {
+ return self.overflow(f);
+ }
+ if excess_width == 3
+ || excess_width == 4
+ || (excess_width >= 5 && self.format.d == 0)
+ {
+ write!(&mut output, ":{:02.0}", time.floor()).unwrap();
+ } else if excess_width >= 5 {
+ let d = min(self.format.d(), excess_width as usize - 4);
+ let w = d + 3;
+ write!(&mut output, ":{time:0w$.d$}").unwrap();
+ if self.settings.decimal == Decimal::Comma {
+ fix_decimal_point(&mut output);
+ }
+ }
+ break;
+ }
+ // Any other template character stands for itself.
+ c if n == 1 => output.push(c),
+ _ => unreachable!(),
+ }
+ }
+ if !self.trim_spaces {
+ write!(f, "{:>1$}", &output, self.format.w())
+ } else {
+ f.write_str(&output)
+ }
+ }
+
+    /// Writes the name of the month numbered `number`, truncated to the format
+    /// width unless spaces are being trimmed.
+    ///
+    /// If `month_name` has no name for the number, writes the missing value.
+    fn month(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
+        match month_name(number as u32) {
+            Some(month) if self.trim_spaces => f.write_str(month),
+            Some(month) => write!(f, "{month:.*}", self.format.w()),
+            None => self.missing(f),
+        }
+    }
+
+ /// Writes this object to `w`.
+ ///
+ /// Binary formats (those for which `to_binary` produces bytes, such as
+ /// [Type::P] and [Type::PIB]) are written as raw bytes, without regard to
+ /// `encoding`, so they might not be valid in that encoding.  Every other
+ /// format is rendered as text and written in `encoding`.
+ ///
+ /// By contrast, formatting a [DisplayDatum] with `write!(f, "{}", dv)`
+ /// always yields UTF-8, with each byte of a binary format represented by
+ /// the Unicode character that has the same code point.
+ pub fn write<W>(&self, mut w: W, encoding: &'static Encoding) -> Result<(), IoError>
+ where
+     W: IoWrite,
+ {
+     if let Some(raw) = self.to_binary() {
+         return w.write_all(&raw);
+     }
+     if encoding == UTF_8 {
+         // The `Display` output is already UTF-8, so stream it directly.
+         write!(&mut w, "{self}")
+     } else {
+         let text = self.to_small_string::<64>();
+         w.write_all(&encoding.encode(&text).0)
+     }
+ }
+
+ /// Returns the raw bytes for this datum if it is numeric and the format is
+ /// one of the binary formats ([Type::P], [Type::PK], [Type::IB],
+ /// [Type::PIB], [Type::RB]), or `None` otherwise.
+ fn to_binary(&self) -> Option<SmallVec<[u8; 16]>> {
+     let number = self.datum.as_number()?;
+     let bytes = match self.format.type_() {
+         Type::P => self.p(number),
+         Type::PK => self.pk(number),
+         Type::IB => self.ib(number),
+         Type::PIB => self.pib(number),
+         Type::RB => self.rb(number, self.format.w()),
+         _ => return None,
+     };
+     Some(bytes)
+ }
+
+ /// Packs `number` (or 0, if it is `None`) into `digits` binary-coded
+ /// decimal digits, two per byte with the first digit in the high nibble.
+ /// If `digits` is odd, the low nibble of the last byte is left as 0.
+ ///
+ /// Returns `(true, bytes)` on success.  If the number needs more than
+ /// `digits` digits, returns `(false, bytes)` with all the bytes zero.
+ fn bcd(&self, number: Option<f64>, digits: usize) -> (bool, SmallVec<[u8; 16]>) {
+     let legacy = LegacyFormat::new(number.unwrap_or_default(), self.format.d());
+     let len = legacy.len();
+
+     let mut output = SmallVec::new();
+     if len > digits {
+         output.resize(digits.div_ceil(2), 0);
+         return (false, output);
+     }
+
+     // Left-pad with zeros to exactly `digits` ASCII digits.
+     let mut decimal = SmallString::<[u8; 16]>::new();
+     write!(
+         &mut decimal,
+         "{}{legacy}",
+         Zeros(digits.saturating_sub(len))
+     )
+     .unwrap();
+
+     let mut nibbles = decimal.bytes().map(|digit| digit - b'0');
+     for _ in 0..digits / 2 {
+         let high = nibbles.next().unwrap();
+         let low = nibbles.next().unwrap();
+         output.push((high << 4) + low);
+     }
+     if digits % 2 != 0 {
+         let high = nibbles.next().unwrap();
+         output.push(high << 4);
+     }
+     (true, output)
+ }
+
+ /// Encodes `number` for [Type::P]: `2 * w - 1` BCD digits followed by a
+ /// sign nibble in the low half of the last byte.  The sign nibble is
+ /// `0xd` for a negative number that fit, and `0xf` in every other case.
+ fn p(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
+     let (valid, mut output) = self.bcd(number, self.format.w() * 2 - 1);
+     let negative = valid && number.is_some_and(|number| number < 0.0);
+     let sign = if negative { 0xd } else { 0xf };
+     *output.last_mut().unwrap() |= sign;
+     output
+ }
+
+ /// Encodes `number` for [Type::PK]: `2 * w` BCD digits with no sign.
+ /// Negative numbers are encoded the same way as a `None` number.
+ fn pk(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
+     // Written as `!(x < 0.0)` rather than `x >= 0.0` so that NaN is kept,
+     // not discarded.
+     let number = number.filter(|number| !(*number < 0.0));
+     self.bcd(number, self.format.w() * 2).1
+ }
+
+ /// Encodes `number` for [Type::IB], as a `w`-byte two's complement integer
+ /// in the output byte order.
+ ///
+ /// The number is first scaled by `10^d` and rounded.  `None`, and scaled
+ /// values outside the range checked below, are encoded as 0.
+ fn ib(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
+     let number = number.map_or(0.0, |number| (number * power10(self.format.d())).round());
+     let number = if number >= power256(self.format.w) / 2.0 - 1.0
+         || number < -power256(self.format.w) / 2.0
+     {
+         0.0
+     } else {
+         number
+     };
+     let magnitude = number.abs() as u64;
+     let integer = if number < 0.0 {
+         // Two's complement negation.  `wrapping_neg` is used instead of
+         // negating an `i64` because the most negative 8-byte value, -2^63,
+         // passes the range check above and `-(i64::MIN)` overflows.
+         magnitude.wrapping_neg()
+     } else {
+         magnitude
+     };
+     endian_to_smallvec(self.endian.output, integer, self.format.w())
+ }
+
+ /// Encodes `number` for [Type::PIB], as a `w`-byte unsigned integer in the
+ /// output byte order.
+ ///
+ /// The number is first scaled by `10^d` and rounded.  `None`, negative
+ /// values, and values of `256^w` or more are encoded as 0.
+ fn pib(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
+     let scaled = match number {
+         Some(number) => (number * power10(self.format.d())).round(),
+         None => 0.0,
+     };
+     let out_of_range = scaled >= power256(self.format.w) || scaled < 0.0;
+     let clamped = if out_of_range { 0.0 } else { scaled };
+     endian_to_smallvec(self.endian.output, clamped.abs() as u64, self.format.w())
+ }
+
+ /// Encodes `number` for [Type::RB]: the 8 bytes of the `f64` in the output
+ /// byte order, truncated or zero-extended to `w` bytes.  `None` is encoded
+ /// as `-f64::MAX`.
+ fn rb(&self, number: Option<f64>, w: usize) -> SmallVec<[u8; 16]> {
+     let value = number.unwrap_or(-f64::MAX);
+     let raw: [u8; 8] = self.endian.output.to_bytes(value);
+     let mut output = SmallVec::from_slice(&raw);
+     output.resize(w, 0);
+     output
+ }
+}
+
+/// The magnitude of a number, scaled by a power of 10 and rounded to an
+/// integer, as a string of decimal digits.  Its [Display] output is the
+/// digits in `s` followed by `trailing_zeros` zeros.
+struct LegacyFormat {
+ /// ASCII decimal digits.
+ s: SmallVec<[u8; 40]>,
+ /// Number of `0` digits that follow those in `s`.
+ trailing_zeros: usize,
+}
+
+impl LegacyFormat {
+    /// Returns the decimal digits of `|number| * 10^d` rounded to an integer,
+    /// rounding up whenever the first dropped digit is 5 or greater.
+    ///
+    /// A value that rounds to zero yields no digits at all, except that an
+    /// exact zero yields `d + 1` zero digits.
+    ///
+    /// An infinite or NaN `number` has no decimal digits.  It yields a value
+    /// whose [Self::len] is `usize::MAX`, so that it is too long for any
+    /// field; callers must check the length before displaying it.
+    fn new(number: f64, d: usize) -> Self {
+        if !number.is_finite() {
+            // `{:E}` would format these as `inf` or `NaN`, which has no `E`
+            // to parse below.
+            return Self {
+                s: SmallVec::new(),
+                trailing_zeros: usize::MAX,
+            };
+        }
+
+        let mut s = SmallVec::<[u8; 40]>::new();
+        write!(&mut s, "{:E}", number.abs()).unwrap();
+        debug_assert!(s.is_ascii());
+
+        // Parse exponent.
+        //
+        // Add 1 because of the transformation we will do just below, and `d` so
+        // that we just need to round to the nearest integer.
+        let e_index = s.iter().position(|c| *c == b'E').unwrap();
+        // SAFETY: `s` holds the output of formatting a finite `f64`, which is
+        // ASCII (see the assertion above), and `e_index + 1` is in bounds.
+        let mut exponent = unsafe { from_utf8_unchecked(&s[e_index + 1..]) }
+            .parse::<i32>()
+            .unwrap()
+            + 1
+            + d as i32;
+
+        // Transform `1.234E56` into `1234`.
+        if e_index == 1 {
+            // No decimals, e.g. `1E4` or `0E0`.
+            s.truncate(1)
+        } else {
+            s.remove(1);
+            s.truncate(e_index - 1);
+        };
+        debug_assert!(s.iter().all(|c| c.is_ascii_digit()));
+
+        if exponent >= 0 && exponent < s.len() as i32 {
+            // The first `exponent` digits are before the decimal point.  We
+            // need to round off there.
+            let exp = exponent as usize;
+
+            /// Adds 1 to the number formed by `digits[..position]`.  Returns
+            /// false if that carried out of the first digit (in which case
+            /// all of those digits are now `0`), true otherwise.
+            fn round_up(digits: &mut [u8], position: usize) -> bool {
+                for index in (0..position).rev() {
+                    match digits[index] {
+                        b'0'..=b'8' => {
+                            digits[index] += 1;
+                            return true;
+                        }
+                        b'9' => {
+                            digits[index] = b'0';
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+                false
+            }
+
+            if s[exp] >= b'5' && !round_up(&mut s, exp) {
+                // The carry produced a new leading digit: the result is a
+                // `1` followed by zeros, one digit longer than before.
+                s.clear();
+                s.push(b'1');
+                exponent += 1;
+            }
+        }
+
+        // Keep exactly `exponent` digits, dropping the fraction or padding
+        // with zeros as needed.
+        let exponent = exponent.max(0) as usize;
+        s.truncate(exponent);
+        s.resize(exponent, b'0');
+        let trailing_zeros = exponent.saturating_sub(s.len());
+        Self { s, trailing_zeros }
+    }
+
+    /// Returns the digits stored in `s` as a string.
+    fn s(&self) -> &str {
+        // SAFETY: `new` only ever stores ASCII digits in `s`.
+        unsafe { from_utf8_unchecked(&self.s) }
+    }
+
+    /// Returns the total number of digits, including trailing zeros.
+    fn len(&self) -> usize {
+        self.s.len() + self.trailing_zeros
+    }
+}
+
+impl Display for LegacyFormat {
+    /// Writes the stored digits followed by the trailing zeros.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        f.write_str(self.s())?;
+        Zeros(self.trailing_zeros).fmt(f)
+    }
+}
+
+/// Displays as a run of ASCII `0` characters whose length is the wrapped
+/// count.
+struct Zeros(usize);
+
+impl Display for Zeros {
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        // Written in chunks borrowed from a fixed string, so that no
+        // allocation is needed however long the run is.
+        const ZEROS: &str = "0000000000000000000000000000000000000000";
+        let full_chunks = self.0 / ZEROS.len();
+        let remainder = self.0 % ZEROS.len();
+        for _ in 0..full_chunks {
+            f.write_str(ZEROS)?;
+        }
+        if remainder > 0 {
+            f.write_str(&ZEROS[..remainder])?;
+        }
+        Ok(())
+    }
+}
+
+/// Returns the `width` low-order bytes of `number`, most significant byte
+/// first.  `width` must be no more than 8; a `width` of 0 yields no bytes.
+fn integer_to_binary(number: u64, width: u16) -> SmallVec<[u8; 8]> {
+    let width = usize::from(width);
+    debug_assert!(width <= 8);
+    // Slicing the tail of the big-endian bytes avoids the 64-bit shift that
+    // shifting the value into place would need when `width` is 0.
+    let bytes = number.to_be_bytes();
+    SmallVec::from_slice(&bytes[bytes.len() - width..])
+}
+
+/// Writes each of `bytes` to `f` as two uppercase hexadecimal digits.
+fn output_hex(f: &mut Formatter<'_>, bytes: &[u8]) -> FmtResult {
+    bytes.iter().try_for_each(|byte| write!(f, "{byte:02X}"))
+}
+
+/// Tries to reserve `want` more units out of `capacity`, of which `*used` are
+/// already taken.  On success, adds `want` to `*used` and returns true;
+/// otherwise leaves `*used` alone and returns false.
+fn allocate_space(want: usize, capacity: usize, used: &mut usize) -> bool {
+    let total = *used + want;
+    let fits = total <= capacity;
+    if fits {
+        *used = total;
+    }
+    fits
+}
+
+/// A representation of a number that can be quickly rounded to any desired
+/// number of decimal places (up to a specified maximum).
+#[derive(Debug)]
+struct Rounder {
+ /// Magnitude of number with excess precision, as formatted by
+ /// [Rounder::new].  A leading `0` is removed if the number style does not
+ /// want one.
+ string: SmallString<[u8; 40]>,
+
+ /// Number of digits before decimal point in `string`.
+ integer_digits: usize,
+
+ /// Number of `9`s or `.`s at start of string.
+ leading_nines: usize,
+
+ /// Number of `0`s or `.`s at start of string.
+ leading_zeros: usize,
+
+ /// Is the number negative?  (This is the sign bit of the original number,
+ /// so it is also set for negative zero.)
+ negative: bool,
+}
+
+impl Rounder {
+ /// Creates a `Rounder` for `number` that can afterward be rounded to any
+ /// number of decimal places from 0 up to `max_decimals`.
+ ///
+ /// `style.leading_zero` determines whether a leading `0` before the
+ /// decimal point is kept.  Debug builds assert that `number`'s magnitude
+ /// is less than `1e41` and that `max_decimals` is at most 16.
+ fn new(style: &NumberStyle, number: f64, max_decimals: u8) -> Self {
+ debug_assert!(number.abs() < 1e41);
+ debug_assert!((0..=16).contains(&max_decimals));
+
+ let mut string = SmallString::new();
+ if max_decimals == 0 {
+ // Fast path. No rounding needed.
+ //
+ // We append `.00` to the integer representation because
+ // [Self::should_round_up] assumes that fractional digits are
+ // present.
+ write!(&mut string, "{:.0}.00", number.round().abs()).unwrap()
+ } else {
+ // Slow path.
+ //
+ // This is more difficult than it really should be because we have
+ // to make sure that numbers that are exactly halfway between two
+ // representations are always rounded away from zero. This is not
+ // what format! normally does (usually it rounds to even), so we
+ // have to fake it as best we can, by formatting with extra
+ // precision and then doing the rounding ourselves.
+ //
+ // We take up to two rounds to format numbers. In the first round,
+ // we obtain 2 digits of precision beyond those requested by the
+ // user. If those digits are exactly "50", then in a second round
+ // we format with as many digits as are significant in a "double".
+ //
+ // It might be better to directly implement our own floating-point
+ // formatting routine instead of relying on the system's sprintf
+ // implementation. But the classic Steele and White paper on
+ // printing floating-point numbers does not hint how to do what we
+ // want, and it's not obvious how to change their algorithms to do
+ // so. It would also be a lot of work.
+ write!(
+ &mut string,
+ "{:.*}",
+ max_decimals as usize + 2,
+ number.abs()
+ )
+ .unwrap();
+ if string.ends_with("50") {
+ // Estimate the decimal exponent from the binary one
+ // (3/10 approximates log10(2)).
+ let (_sig, binary_exponent) = frexp(number);
+ let decimal_exponent = binary_exponent * 3 / 10;
+ let format_decimals = (f64::DIGITS as i32 + 1) - decimal_exponent;
+ if format_decimals > max_decimals as i32 + 2 {
+ string.clear();
+ write!(&mut string, "{:.*}", format_decimals as usize, number.abs()).unwrap();
+ }
+ }
+ };
+
+ if !style.leading_zero && string.starts_with("0") {
+ string.remove(0);
+ }
+ // These counts are taken after any leading zero was removed, so
+ // they describe `string` as stored.
+ let leading_zeros = string
+ .bytes()
+ .take_while(|c| *c == b'0' || *c == b'.')
+ .count();
+ let leading_nines = string
+ .bytes()
+ .take_while(|c| *c == b'9' || *c == b'.')
+ .count();
+ let integer_digits = string.bytes().take_while(u8::is_ascii_digit).count();
+ let negative = number.is_sign_negative();
+ Self {
+ string,
+ integer_digits,
+ leading_nines,
+ leading_zeros,
+ negative,
+ }
+ }
+
+ /// Returns a [RounderWidth] for formatting the magnitude to `decimals`
+ /// decimal places. `decimals` must be in `0..=16` and must not exceed the
+ /// `max_decimals` that was passed to [Rounder::new].
+ fn width(&self, decimals: usize) -> RounderWidth {
+ // Calculate base measures.
+ let mut width = self.integer_digits;
+ if decimals > 0 {
+ // Room for the decimal point plus the fractional digits.
+ width += decimals + 1;
+ }
+ let mut integer_digits = self.integer_digits;
+ let mut negative = self.negative;
+
+ // Rounding can cause adjustments.
+ if self.should_round_up(decimals) {
+ // Rounding up leading `9s` adds a new digit (a `1`).
+ if self.leading_nines >= width {
+ width += 1;
+ integer_digits += 1;
+ }
+ } else {
+ // Rounding down.
+ if self.leading_zeros >= width {
+ // All digits that remain after rounding are zeros. Therefore
+ // we drop the negative sign.
+ negative = false;
+ if self.integer_digits == 0 && decimals == 0 {
+ // No digits at all are left. We need to display
+ // at least a single digit (a zero).
+ debug_assert_eq!(width, 0);
+ width += 1;
+ integer_digits = 1;
+ }
+ }
+ }
+ RounderWidth {
+ width,
+ integer_digits,
+ negative,
+ }
+ }
+
+ /// Reports the rounding direction for cutting the number off after
+ /// `decimals` decimal places: true to round up (the first dropped digit is
+ /// 5 or greater), false to round down.
+ fn should_round_up(&self, decimals: usize) -> bool {
+     // Skip the integer digits, the decimal point, and the decimals that
+     // are kept, to reach the first digit that is dropped.
+     let index = self.integer_digits + decimals + 1;
+     let dropped = self.string.as_bytes()[index];
+     debug_assert!(dropped.is_ascii_digit());
+     dropped >= b'5'
+ }
+
+ /// Formats the number, rounding to `decimals` decimal places. Exactly as
+ /// many characters as indicated by [Self::width(decimals)] are written.
+ ///
+ /// The output holds the integer digits and, if `decimals` is nonzero, a
+ /// `.` and `decimals` fractional digits.  It never includes a sign.
+ fn format(&self, decimals: usize) -> SmallString<[u8; 40]> {
+     let mut output = SmallString::new();
+     let mut base_width = self.integer_digits;
+     if decimals > 0 {
+         base_width += decimals + 1;
+     }
+
+     if self.should_round_up(decimals) {
+         if self.leading_nines < base_width {
+             // Rounding up. This is the common case where rounding up
+             // doesn't add an extra digit.
+             output.push_str(&self.string[..base_width]);
+
+             // Add 1 in the last place, carrying leftward through any
+             // `9`s and skipping over the decimal point.
+             //
+             // SAFETY: This loop only changes ASCII characters to other
+             // ASCII characters.
+             unsafe {
+                 for c in output.as_bytes_mut().iter_mut().rev() {
+                     match *c {
+                         b'9' => *c = b'0',
+                         b'0'..=b'8' => {
+                             *c += 1;
+                             break;
+                         }
+                         b'.' => (),
+                         _ => unreachable!(),
+                     }
+                 }
+             }
+         } else {
+             // Rounding up leading 9s causes the result to be a 1 followed
+             // by a number of 0s, plus a decimal point.
+             output.push('1');
+             for _ in 0..self.integer_digits {
+                 output.push('0');
+             }
+             if decimals > 0 {
+                 output.push('.');
+                 for _ in 0..decimals {
+                     output.push('0');
+                 }
+             }
+             debug_assert_eq!(output.len(), base_width + 1);
+         }
+     } else if self.integer_digits != 0 || decimals != 0 {
+         // Rounding down, common case: copy just the digits that are
+         // kept, leaving out the excess precision that follows them.
+         output.push_str(&self.string[..base_width]);
+     } else {
+         // Rounding down and no digits remain. The output is just a zero.
+         output.push('0');
+     }
+     output
+ }
+}
+
+/// The space needed to format a [Rounder] to a particular number of decimal
+/// places, as computed by [Rounder::width].
+struct RounderWidth {
+ /// Number of characters required to format the number to a specified number
+ /// of decimal places. This includes integer digits and a decimal point and
+ /// fractional digits, if any, but it does not include any negative prefix
+ /// or suffix or other affixes.
+ width: usize,
+
+ /// Number of digits before the decimal point, between 0 and 40.
+ integer_digits: usize,
+
+ /// True if the number is negative and its rounded representation would
+ /// include at least one nonzero digit.
+ negative: bool,
+}
+
+/// Returns 10 raised to the power `x`.
+///
+/// Exponents up through 40 come from a table of literals; larger ones are
+/// computed with `powi`.
+fn power10(x: usize) -> f64 {
+    const POWERS: [f64; 41] = [
+        1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16,
+        1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31,
+        1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40,
+    ];
+    match POWERS.get(x) {
+        Some(&power) => power,
+        None => 10.0_f64.powi(x as i32),
+    }
+}
+
+/// Returns 256 raised to the power `x`.
+///
+/// Exponents up through 8 come from a table of literals; larger ones are
+/// computed with `powi`.
+fn power256(x: u16) -> f64 {
+    const POWERS: [f64; 9] = [
+        1.0,
+        256.0,
+        65536.0,
+        16777216.0,
+        4294967296.0,
+        1099511627776.0,
+        281474976710656.0,
+        72057594037927936.0,
+        18446744073709551616.0,
+    ];
+    match POWERS.get(x as usize) {
+        Some(&power) => power,
+        None => 256.0_f64.powi(x as i32),
+    }
+}
+
+/// Replaces the first `.` in `s`, if there is one, by `,`.
+fn fix_decimal_point<A>(s: &mut SmallString<A>)
+where
+    A: Array<Item = u8>,
+{
+    // SAFETY: The only byte written replaces one ASCII character (`.`) by
+    // another ASCII character (`,`), so the string remains valid UTF-8.
+    let bytes = unsafe { s.as_bytes_mut() };
+    if let Some(index) = bytes.iter().position(|&byte| byte == b'.') {
+        bytes[index] = b',';
+    }
+}
+
+/// Returns the `n` low-order bytes of `value` in byte order `endian`.
+///
+/// `n` must be no more than 8.  If `n` is 0, the result is empty.
+pub fn endian_to_smallvec<const N: usize>(
+    endian: Endian,
+    mut value: u64,
+    n: usize,
+) -> SmallVec<[u8; N]> {
+    debug_assert!(n <= 8);
+    let mut vec = SmallVec::new();
+    // Move the `n` bytes of interest to the top of `value`.  With `n == 0`
+    // that is a shift by all 64 bits, which a plain `<<` does not allow, so
+    // use a checked shift; no bytes get pushed in that case anyway.
+    value = value.checked_shl(8 * (8 - n) as u32).unwrap_or(0);
+    // Peel bytes off the top, most significant first.
+    for _ in 0..n {
+        vec.push((value >> 56) as u8);
+        value <<= 8;
+    }
+    if endian == Endian::Little {
+        vec.reverse();
+    }
+    vec
+}
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-use std::{
- cmp::min,
- fmt::{Display, Error as FmtError, Formatter, Result as FmtResult, Write as _},
- io::{Error as IoError, Write as IoWrite},
- str::from_utf8_unchecked,
-};
-
-use binrw::Endian;
-use chrono::{Datelike, NaiveDate};
-use encoding_rs::{Encoding, UTF_8};
-use libm::frexp;
-use smallstr::SmallString;
-use smallvec::{Array, SmallVec};
-
-use crate::{
- calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name},
- data::{ByteStr, Datum, EncodedString, QuotedDatum, WithEncoding},
- endian::ToBytes,
- format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type},
- settings::{EndianSettings, Settings as PsppSettings},
- util::ToSmallString,
-};
-
-pub struct DisplayDatum<'b, B> {
- format: Format,
- settings: &'b Settings,
- endian: EndianSettings,
- datum: Datum<B>,
-
- /// If true, the output will remove leading and trailing spaces from numeric
- /// values, and trailing spaces from string values. (This might make the
- /// output narrower than the requested width.)
- trim_spaces: bool,
-
- /// If true, the output will include a double quote before and after string
- /// values.
- quote_strings: bool,
-}
-
-#[cfg(test)]
-mod test;
-
-pub trait DisplayPlain {
- fn display_plain(&self) -> DisplayPlainF64;
-}
-
-impl DisplayPlain for f64 {
- fn display_plain(&self) -> DisplayPlainF64 {
- DisplayPlainF64 {
- value: *self,
- decimal: '.',
- }
- }
-}
-
-pub struct DisplayPlainF64 {
- pub value: f64,
- pub decimal: char,
-}
-
-impl DisplayPlainF64 {
- pub fn with_decimal(self, decimal: char) -> Self {
- Self { decimal, ..self }
- }
-}
-
-impl Display for DisplayPlainF64 {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- struct Inner(f64);
-
- impl Display for Inner {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- let value = self.0;
- if (value.abs() < 0.0005 && value != 0.0) || value.abs() > 1e15 {
- // Print 0s that would otherwise have lots of leading or
- // trailing zeros in scientific notation with full precision.
- write!(f, "{value:.e}")
- } else if value == value.trunc() {
- // Print integers without decimal places.
- write!(f, "{value:.0}")
- } else {
- // Print other numbers with full precision.
- write!(f, "{value:.}")
- }
- }
- }
-
- match self.decimal {
- '.' => write!(f, "{}", Inner(self.value)),
- _ => {
- let tmp = Inner(self.value).to_small_string::<64>();
- if let Some(position) = tmp.find('.') {
- f.write_str(&tmp[..position])?;
- f.write_char(self.decimal)?;
- f.write_str(&tmp[position + 1..])
- } else {
- f.write_str(&tmp)
- }
- }
- }
- }
-}
-
-impl<'a, D> Datum<D>
-where
- D: EncodedString,
-{
- /// Returns an object that implements [Display] for printing this [Datum] as
- /// `format`.
- ///
- /// [Display]: std::fmt::Display
- pub fn display(&'a self, format: Format) -> DisplayDatum<'a, WithEncoding<&'a ByteStr>> {
- DisplayDatum::new(format, self.as_borrowed())
- }
-
- pub fn display_plain(&self) -> QuotedDatum<'_, D> {
- self.quoted()
- }
-}
-
-impl<'b, B> Display for DisplayDatum<'b, B>
-where
- B: EncodedString,
-{
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- let number = match &self.datum {
- Datum::Number(number) => *number,
- Datum::String(string) => {
- if self.format.type_() == Type::AHex {
- for byte in string.raw_string_bytes() {
- write!(f, "{byte:02x}")?;
- }
- } else {
- let quote = if self.quote_strings { "\"" } else { "" };
- let s = string.as_str();
- let s = if self.trim_spaces {
- s.trim_end_matches(' ')
- } else {
- &s
- };
- write!(f, "{quote}{s}{quote}")?;
- }
- return Ok(());
- }
- };
-
- let Some(number) = number else {
- return self.missing(f);
- };
-
- match self.format.type_() {
- Type::F
- | Type::Comma
- | Type::Dot
- | Type::Dollar
- | Type::Pct
- | Type::E
- | Type::CC(_) => self.number(f, number),
- Type::N => self.n(f, number),
- Type::Z => self.z(f, number),
-
- Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => self.fmt_binary(f),
-
- Type::PIBHex => self.pibhex(f, number),
- Type::RBHex => self.rbhex(f, number),
- Type::Date
- | Type::ADate
- | Type::EDate
- | Type::JDate
- | Type::SDate
- | Type::QYr
- | Type::MoYr
- | Type::WkYr
- | Type::DateTime
- | Type::YmdHms
- | Type::MTime
- | Type::Time
- | Type::DTime
- | Type::WkDay => self.date(f, number),
- Type::Month => self.month(f, number),
- Type::A | Type::AHex => unreachable!(),
- }
- }
-}
-
-impl<'b, B> DisplayDatum<'b, B>
-where
- B: EncodedString,
-{
- pub fn new(format: Format, datum: Datum<B>) -> Self {
- let settings = PsppSettings::global();
- Self {
- format,
- datum,
- settings: &settings.formats,
- endian: settings.endian,
- trim_spaces: false,
- quote_strings: false,
- }
- }
- pub fn with_settings(self, settings: &'b Settings) -> Self {
- Self { settings, ..self }
- }
- pub fn with_endian(self, endian: EndianSettings) -> Self {
- Self { endian, ..self }
- }
- pub fn with_trimming(self) -> Self {
- Self {
- trim_spaces: true,
- ..self
- }
- }
- pub fn with_quoted_string(self) -> Self {
- Self {
- quote_strings: true,
- ..self
- }
- }
- fn fmt_binary(&self, f: &mut Formatter) -> FmtResult {
- let output = self.to_binary().unwrap();
- for b in output {
- f.write_char(b as char)?;
- }
- Ok(())
- }
- fn number(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
- if number.is_finite() {
- let style = self.settings.number_style(self.format.type_);
- if self.format.type_ != Type::E && number.abs() < 1.5 * power10(self.format.w()) {
- let rounder = Rounder::new(style, number, self.format.d);
- if self.decimal(f, &rounder, style, true)?
- || self.scientific(f, number, style, true)?
- || self.decimal(f, &rounder, style, false)?
- {
- return Ok(());
- }
- }
-
- if !self.scientific(f, number, style, false)? {
- self.overflow(f)?;
- }
- Ok(())
- } else {
- self.infinite(f, number)
- }
- }
-
- fn infinite(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
- if self.format.w >= 3 {
- let s = if number.is_nan() {
- "NaN"
- } else if number.is_infinite() {
- if number.is_sign_positive() {
- "+Infinity"
- } else {
- "-Infinity"
- }
- } else {
- "Unknown"
- };
- let w = if self.trim_spaces { 0 } else { self.format.w() };
- write!(f, "{s:>w$.w$}")
- } else {
- self.overflow(f)
- }
- }
-
- fn missing(&self, f: &mut Formatter<'_>) -> FmtResult {
- match self.format.type_ {
- Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => return self.fmt_binary(f),
- Type::RBHex => return self.rbhex(f, -f64::MAX),
- _ => (),
- }
-
- if self.trim_spaces {
- return write!(f, ".");
- }
-
- let w = self.format.w() as isize;
- let d = self.format.d() as isize;
- let dot_position = match self.format.type_ {
- Type::N => w - 1,
- Type::Pct => w - d - 2,
- Type::E => w - d - 5,
- _ => w - d - 1,
- };
- let dot_position = dot_position.max(0) as u16;
-
- for i in 0..self.format.w {
- if i == dot_position {
- write!(f, ".")?;
- } else {
- write!(f, " ")?;
- }
- }
- Ok(())
- }
-
- fn overflow(&self, f: &mut Formatter<'_>) -> FmtResult {
- if self.trim_spaces {
- write!(f, "*")?;
- } else {
- for _ in 0..self.format.w {
- write!(f, "*")?;
- }
- }
- Ok(())
- }
-
- fn decimal(
- &self,
- f: &mut Formatter<'_>,
- rounder: &Rounder,
- style: &NumberStyle,
- require_affixes: bool,
- ) -> Result<bool, FmtError> {
- for decimals in (0..=self.format.d).rev() {
- // Make sure there's room for the number's magnitude, plus the
- // negative suffix, plus (if negative) the negative prefix.
- let RounderWidth {
- mut width,
- integer_digits,
- negative,
- } = rounder.width(decimals as usize);
- width += style.neg_suffix.width;
- if negative {
- width += style.neg_prefix.width;
- }
- if width > self.format.w() {
- continue;
- }
-
- // If there's room for the prefix and suffix, allocate
- // space. If the affixes are required, but there's no
- // space, give up.
- let add_affixes = allocate_space(style.affix_width(), self.format.w(), &mut width);
- if !add_affixes && require_affixes {
- continue;
- }
-
- // Check whether we should include grouping characters. We need
- // room for a complete set or we don't insert any at all. We don't
- // include grouping characters if decimal places were requested but
- // they were all dropped.
- let grouping = style.grouping.filter(|_| {
- integer_digits > 3
- && (self.format.d == 0 || decimals > 0)
- && allocate_space((integer_digits - 1) / 3, self.format.w(), &mut width)
- });
-
- // Assemble number.
- let magnitude = rounder.format(decimals as usize);
- let mut output = SmallString::<[u8; 40]>::new();
- if !self.trim_spaces {
- for _ in width..self.format.w() {
- output.push(' ');
- }
- }
- if negative {
- output.push_str(&style.neg_prefix.s);
- }
- if add_affixes {
- output.push_str(&style.prefix.s);
- }
- if let Some(grouping) = grouping {
- for (i, digit) in magnitude[..integer_digits].bytes().enumerate() {
- if i > 0 && (integer_digits - i) % 3 == 0 {
- output.push(grouping.into());
- }
- output.push(digit as char);
- }
- } else {
- output.push_str(&magnitude[..integer_digits]);
- }
- if decimals > 0 {
- output.push(style.decimal.into());
- let s = &magnitude[integer_digits + 1..];
- output.push_str(&s[..decimals as usize]);
- }
- if add_affixes {
- output.push_str(&style.suffix.s);
- }
- if negative {
- output.push_str(&style.neg_suffix.s);
- } else {
- for _ in 0..style.neg_suffix.width {
- output.push(' ');
- }
- }
-
- debug_assert!(self.trim_spaces || output.len() >= self.format.w());
- debug_assert!(output.len() <= self.format.w() + style.extra_bytes);
- f.write_str(&output)?;
- return Ok(true);
- }
- Ok(false)
- }
-
- fn scientific(
- &self,
- f: &mut Formatter<'_>,
- number: f64,
- style: &NumberStyle,
- require_affixes: bool,
- ) -> Result<bool, FmtError> {
- // Allocate minimum required space.
- let mut width = 6 + style.neg_suffix.width;
- if number < 0.0 {
- width += style.neg_prefix.width;
- }
- if width > self.format.w() {
- return Ok(false);
- }
-
- // Check for room for prefix and suffix.
- let add_affixes = allocate_space(style.affix_width(), self.format.w(), &mut width);
- if require_affixes && !add_affixes {
- return Ok(false);
- }
-
- // Figure out number of characters we can use for the fraction, if any.
- // (If that turns out to be `1`, then we'll output a decimal point
- // without any digits following.)
- let mut fraction_width = min(self.format.d as usize + 1, self.format.w() - width).min(16);
- if self.format.type_ != Type::E && fraction_width == 1 {
- fraction_width = 0;
- }
- width += fraction_width;
-
- let mut output = SmallString::<[u8; 40]>::new();
- if !self.trim_spaces {
- for _ in width..self.format.w() {
- output.push(' ');
- }
- }
- if number < 0.0 {
- output.push_str(&style.neg_prefix.s);
- }
- if add_affixes {
- output.push_str(&style.prefix.s);
- }
- write!(
- &mut output,
- "{:.*E}",
- fraction_width.saturating_sub(1),
- number.abs()
- )
- .unwrap();
- if fraction_width == 1 {
- // Insert `.` before the `E`, to get a value like "1.E+000".
- output.insert(output.find('E').unwrap(), '.');
- }
-
- // Rust always uses `.` as the decimal point. Translate to `,` if
- // necessary.
- if style.decimal == Decimal::Comma {
- fix_decimal_point(&mut output);
- }
-
- // Make exponent have exactly three digits, plus sign.
- let e = output.as_bytes().iter().position(|c| *c == b'E').unwrap();
- let exponent: isize = output[e + 1..].parse().unwrap();
- if exponent.abs() > 999 {
- return Ok(false);
- }
- output.truncate(e + 1);
- write!(&mut output, "{exponent:+04}").unwrap();
-
- // Add suffixes.
- if add_affixes {
- output.push_str(&style.suffix.s);
- }
- if number.is_sign_negative() {
- output.push_str(&style.neg_suffix.s);
- } else {
- for _ in 0..style.neg_suffix.width {
- output.push(' ');
- }
- }
-
- println!(
- "{} for {number} width={width} fraction_width={fraction_width}: {output:?}",
- self.format
- );
- debug_assert!(self.trim_spaces || output.len() >= self.format.w());
- debug_assert!(output.len() <= self.format.w() + style.extra_bytes);
- f.write_str(&output)?;
- Ok(true)
- }
-
- fn n(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
- if number < 0.0 {
- return self.missing(f);
- }
-
- let legacy = LegacyFormat::new(number, self.format.d());
- let w = self.format.w();
- let len = legacy.len();
- if len > w {
- self.overflow(f)
- } else {
- write!(f, "{}{legacy}", Zeros(w.saturating_sub(len)))
- }
- }
-
- fn z(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
- let legacy = LegacyFormat::new(number, self.format.d());
- let w = self.format.w();
- let len = legacy.len();
- if len > w {
- self.overflow(f)
- } else {
- let mut s = legacy.to_small_string::<40>();
- if number < 0.0 {
- if let Some(last) = s.pop() {
- let last = last.to_digit(10).unwrap();
- s.push(b"}JKLMNOPQR"[last as usize] as char);
- }
- }
- write!(f, "{}{s}", Zeros(w.saturating_sub(len)))
- }
- }
-
- fn pibhex(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
- if number < 0.0 {
- self.overflow(f)
- } else {
- let number = number.round();
- if number >= power256(self.format.w / 2) {
- self.overflow(f)
- } else {
- let binary = integer_to_binary(number as u64, self.format.w / 2);
- output_hex(f, &binary)
- }
- }
- }
-
- fn rbhex(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
- let rb = self.rb(Some(number), self.format.w() / 2);
- output_hex(f, &rb)
- }
-
- fn date(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
- const MINUTE: f64 = 60.0;
- const HOUR: f64 = 60.0 * 60.0;
- const DAY: f64 = 60.0 * 60.0 * 24.0;
-
- let (date, mut time) = match self.format.type_.category() {
- Category::Date => {
- if number < 0.0 {
- return self.missing(f);
- }
- let Some(date) = calendar_offset_to_gregorian(number / DAY) else {
- return self.missing(f);
- };
- (date, number % DAY)
- }
- Category::Time => (NaiveDate::MIN, number),
- _ => unreachable!(),
- };
-
- let mut output = SmallString::<[u8; 40]>::new();
- for TemplateItem { c, n } in DateTemplate::for_format(self.format).unwrap() {
- match c {
- 'd' if n < 3 => write!(&mut output, "{:02}", date.day()).unwrap(),
- 'd' => write!(&mut output, "{:03}", day_of_year(date).unwrap_or(1)).unwrap(),
- 'm' if n < 3 => write!(&mut output, "{:02}", date.month()).unwrap(),
- 'm' => write!(&mut output, "{}", short_month_name(date.month()).unwrap()).unwrap(),
- 'y' if n >= 4 => {
- let year = date.year();
- if year <= 9999 {
- write!(&mut output, "{year:04}").unwrap();
- } else if self.format.type_ == Type::DateTime
- || self.format.type_ == Type::YmdHms
- {
- write!(&mut output, "****").unwrap();
- } else {
- return self.overflow(f);
- }
- }
- 'y' => {
- let epoch = self.settings.epoch.0;
- let offset = date.year() - epoch;
- if !(0..=99).contains(&offset) {
- return self.overflow(f);
- }
- write!(&mut output, "{:02}", date.year().abs() % 100).unwrap();
- }
- 'q' => write!(&mut output, "{}", date.month0() / 3 + 1).unwrap(),
- 'w' => write!(
- &mut output,
- "{:2}",
- (day_of_year(date).unwrap_or(1) - 1) / 7 + 1
- )
- .unwrap(),
- 'D' => {
- if time < 0.0 {
- output.push('-');
- }
- time = time.abs();
- write!(&mut output, "{:1$.0}", (time / DAY).floor(), n).unwrap();
- time %= DAY;
- }
- 'H' => {
- if time < 0.0 {
- output.push('-');
- }
- time = time.abs();
- write!(&mut output, "{:01$.0}", (time / HOUR).floor(), n).unwrap();
- time %= HOUR;
- }
- 'M' => {
- if time < 0.0 {
- output.push('-');
- }
- time = time.abs();
- write!(&mut output, "{:02.0}", (time / MINUTE).floor()).unwrap();
- time %= MINUTE;
-
- let excess_width = self.format.w() as isize - output.len() as isize;
- if excess_width < 0 || (self.format.type_ == Type::MTime && excess_width < 3) {
- return self.overflow(f);
- }
- if excess_width == 3
- || excess_width == 4
- || (excess_width >= 5 && self.format.d == 0)
- {
- write!(&mut output, ":{:02.0}", time.floor()).unwrap();
- } else if excess_width >= 5 {
- let d = min(self.format.d(), excess_width as usize - 4);
- let w = d + 3;
- write!(&mut output, ":{time:0w$.d$}").unwrap();
- if self.settings.decimal == Decimal::Comma {
- fix_decimal_point(&mut output);
- }
- }
- break;
- }
- c if n == 1 => output.push(c),
- _ => unreachable!(),
- }
- }
- if !self.trim_spaces {
- write!(f, "{:>1$}", &output, self.format.w())
- } else {
- f.write_str(&output)
- }
- }
-
- fn month(&self, f: &mut Formatter<'_>, number: f64) -> FmtResult {
- if let Some(month) = month_name(number as u32) {
- if !self.trim_spaces {
- write!(f, "{month:.*}", self.format.w())
- } else {
- f.write_str(month)
- }
- } else {
- self.missing(f)
- }
- }
-
- /// Writes this object to `w`.  Binary formats ([Type::P], [Type::PIB], and
- /// so on) are written as raw binary values; every other output format is
- /// written as text in the given `encoding`.
- ///
- /// If `dv` is a [DisplayDatum], then `write!(f, "{}", dv)` and
- /// `dv.write(f, encoding)` differ as follows:
- ///
- /// * `write!` always outputs UTF-8.  Binary formats are encoded as the
- ///   Unicode characters corresponding to their bytes.
- ///
- /// * `dv.write` outputs the desired `encoding`.  Binary formats are not
- ///   encoded in `encoding` (and thus they might be invalid for the
- ///   encoding).
- pub fn write<W>(&self, mut w: W, encoding: &'static Encoding) -> Result<(), IoError>
- where
-     W: IoWrite,
- {
-     if let Some(binary) = self.to_binary() {
-         return w.write_all(&binary);
-     }
-     if encoding == UTF_8 {
-         write!(&mut w, "{self}")
-     } else {
-         let text = self.to_small_string::<64>();
-         let encoded = encoding.encode(&text).0;
-         w.write_all(&encoded)
-     }
- }
-
- /// Returns the binary encoding of the datum if it is a number and the
- /// format is one of the binary types (`P`, `PK`, `IB`, `PIB`, `RB`);
- /// otherwise returns `None`.
- fn to_binary(&self) -> Option<SmallVec<[u8; 16]>> {
-     let number = self.datum.as_number()?;
-     let bytes = match self.format.type_() {
-         Type::P => self.p(number),
-         Type::PK => self.pk(number),
-         Type::IB => self.ib(number),
-         Type::PIB => self.pib(number),
-         Type::RB => self.rb(number, self.format.w()),
-         _ => return None,
-     };
-     Some(bytes)
- }
-
- /// Encodes `number` as `digits` binary-coded decimal digits, packed two per
- /// byte with the earlier digit in the high nibble.  A missing `number` is
- /// encoded as zero.
- ///
- /// Returns `(true, bytes)` on success.  If the decimal representation
- /// needs more than `digits` digits, returns `(false, bytes)` where `bytes`
- /// is all zeros.  When `digits` is odd, the low nibble of the last byte is
- /// left as zero for the caller to fill in.
- fn bcd(&self, number: Option<f64>, digits: usize) -> (bool, SmallVec<[u8; 16]>) {
- let legacy = LegacyFormat::new(number.unwrap_or_default(), self.format.d());
- let len = legacy.len();
-
- let mut output = SmallVec::new();
- if len > digits {
- // Overflow: the value does not fit in `digits` digits.
- output.resize(digits.div_ceil(2), 0);
- (false, output)
- } else {
- // Left-pad with zeros to exactly `digits` ASCII digits.
- let mut decimal = SmallString::<[u8; 16]>::new();
- write!(
- &mut decimal,
- "{}{legacy}",
- Zeros(digits.saturating_sub(len))
- )
- .unwrap();
-
- // Pack pairs of digits into bytes.
- let mut src = decimal.bytes();
- for _ in 0..digits / 2 {
- let d0 = src.next().unwrap() - b'0';
- let d1 = src.next().unwrap() - b'0';
- output.push((d0 << 4) + d1);
- }
- if digits % 2 != 0 {
- // Odd digit count: the final digit occupies only the high nibble.
- let d = src.next().unwrap() - b'0';
- output.push(d << 4);
- }
- (true, output)
- }
- }
-
- /// Encodes `number` in `P` format: `2 * w - 1` BCD digits followed by a
- /// sign nibble, which is `0xd` for a successfully encoded negative number
- /// and `0xf` otherwise.
- fn p(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
-     let (valid, mut output) = self.bcd(number, self.format.w() * 2 - 1);
-     let negative = valid && number.is_some_and(|value| value < 0.0);
-     let sign_nibble = if negative { 0xd } else { 0xf };
-     *output.last_mut().unwrap() |= sign_nibble;
-     output
- }
-
- /// Encodes `number` in `PK` format: `2 * w` BCD digits with no sign.
- /// Negative numbers are treated the same as a missing number.
- fn pk(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
-     let unsigned = number.filter(|value| !(*value < 0.0));
-     self.bcd(unsigned, self.format.w() * 2).1
- }
-
- /// Encodes `number` in `IB` format: a signed integer of `w` bytes in the
- /// output byte order.  The number is scaled by `10^d` and rounded; a
- /// missing number, or one that falls outside the range checked below, is
- /// encoded as zero.
- fn ib(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
- let number = number.map_or(0.0, |number| (number * power10(self.format.d())).round());
- // Range check for a `w`-byte signed integer.
- let number = if number >= power256(self.format.w) / 2.0 - 1.0
- || number < -power256(self.format.w) / 2.0
- {
- 0.0
- } else {
- number
- };
- // Convert the magnitude first, then negate as an integer so that
- // negative values get their two's complement bit pattern.
- let integer = number.abs() as u64;
- let integer = if number < 0.0 {
- (-(integer as i64)) as u64
- } else {
- integer
- };
- endian_to_smallvec(self.endian.output, integer, self.format.w())
- }
-
- /// Encodes `number` in `PIB` format: an unsigned integer of `w` bytes in
- /// the output byte order.  The number is scaled by `10^d` and rounded; a
- /// missing, negative, or too-large number is encoded as zero.
- fn pib(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
-     let scaled = number.map_or(0.0, |value| (value * power10(self.format.d())).round());
-     let in_range = scaled >= 0.0 && scaled < power256(self.format.w);
-     let integer = if in_range { scaled.abs() as u64 } else { 0 };
-     endian_to_smallvec(self.endian.output, integer, self.format.w())
- }
-
- /// Encodes `number` in `RB` format: the 8 bytes produced by the output
- /// endianness's `to_bytes`, truncated or zero-padded to `w` bytes.  A
- /// missing number is encoded as `-f64::MAX`.
- fn rb(&self, number: Option<f64>, w: usize) -> SmallVec<[u8; 16]> {
-     let value = number.unwrap_or(-f64::MAX);
-     let bytes: [u8; 8] = self.endian.output.to_bytes(value);
-     let mut vec = SmallVec::from_slice(&bytes);
-     vec.resize(w, 0);
-     vec
- }
-}
-
-/// Decimal digits of a nonnegative integer, as produced by
-/// `LegacyFormat::new`.
-struct LegacyFormat {
- /// ASCII decimal digits.
- s: SmallVec<[u8; 40]>,
- /// Number of `0` digits that logically follow `s`.
- trailing_zeros: usize,
-}
-
-impl LegacyFormat {
-    /// Computes the decimal digits of `number.abs() * 10^d`, rounded to an
-    /// integer with a following digit of 5 or more rounding up.
-    ///
-    /// Panics if `number` does not format in `{:E}` notation with an
-    /// exponent (presumably non-finite values; callers are expected to
-    /// filter those out).
-    fn new(number: f64, d: usize) -> Self {
-        let mut s = SmallVec::<[u8; 40]>::new();
-        write!(&mut s, "{:E}", number.abs()).unwrap();
-        debug_assert!(s.is_ascii());
-
-        // Parse exponent.
-        //
-        // Add 1 because of the transformation we will do just below, and `d` so
-        // that we just need to round to the nearest integer.
-        let e_index = s.iter().position(|c| *c == b'E').unwrap();
-        // SAFETY: `s` is all ASCII, as asserted above, so any subslice is
-        // valid UTF-8.
-        let mut exponent = unsafe { from_utf8_unchecked(&s[e_index + 1..]) }
-            .parse::<i32>()
-            .unwrap()
-            + 1
-            + d as i32;
-
-        // Transform `1.234E56` into `1234`.
-        if e_index == 1 {
-            // No decimals, e.g. `1E4` or `0E0`.
-            s.truncate(1)
-        } else {
-            s.remove(1);
-            s.truncate(e_index - 1);
-        };
-        debug_assert!(s.iter().all(|c| c.is_ascii_digit()));
-
-        if exponent >= 0 && exponent < s.len() as i32 {
-            // The first `exponent` digits are before the decimal point.  We
-            // need to round off there.
-            let exp = exponent as usize;
-
-            /// Increments the decimal number in `digits[..position]`.
-            /// Returns false if every digit was a `9`, so that the carry
-            /// ran off the front.
-            fn round_up(digits: &mut [u8], position: usize) -> bool {
-                for index in (0..position).rev() {
-                    match digits[index] {
-                        b'0'..=b'8' => {
-                            digits[index] += 1;
-                            return true;
-                        }
-                        b'9' => {
-                            digits[index] = b'0';
-                        }
-                        _ => unreachable!(),
-                    }
-                }
-                false
-            }
-
-            if s[exp] >= b'5' && !round_up(&mut s, exp) {
-                // The carry overflowed: the result is a 1 followed by one
-                // more zero than there were digits.
-                s.clear();
-                s.push(b'1');
-                exponent += 1;
-            }
-        }
-
-        // Keep only the first `exponent` digits.  Any shortfall is recorded
-        // in `trailing_zeros` instead of being materialized in `s`, so that
-        // large magnitudes do not grow `s` beyond its inline capacity.
-        let exponent = exponent.max(0) as usize;
-        s.truncate(exponent);
-        let trailing_zeros = exponent.saturating_sub(s.len());
-        Self { s, trailing_zeros }
-    }
-
-    /// Returns the stored digits, not including the implied trailing zeros.
-    fn s(&self) -> &str {
-        // SAFETY: `new` only ever stores ASCII digits in `s`.
-        unsafe { from_utf8_unchecked(&self.s) }
-    }
-
-    /// Returns the total number of digits, including implied trailing zeros.
-    fn len(&self) -> usize {
-        self.s.len() + self.trailing_zeros
-    }
-}
-
-impl Display for LegacyFormat {
-    /// Writes the stored digits followed by the implied trailing zeros.
-    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
-        f.write_str(self.s())?;
-        write!(f, "{}", Zeros(self.trailing_zeros))
-    }
-}
-
-/// Displays as the given number of `0` characters.
-struct Zeros(usize);
-
-impl Display for Zeros {
-    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
-        // Zeros are emitted in chunks sliced from this constant so that no
-        // allocation is needed.
-        const ZEROS: &str = "0000000000000000000000000000000000000000";
-
-        let mut remaining = self.0;
-        while remaining != 0 {
-            let chunk = remaining.min(ZEROS.len());
-            f.write_str(&ZEROS[..chunk])?;
-            remaining -= chunk;
-        }
-        Ok(())
-    }
-}
-
-/// Returns the low `width` bytes of `number` in big-endian order.
-///
-/// A `width` of 0 yields an empty vector.
-///
-/// # Panics
-///
-/// Panics if `width` is greater than 8.
-fn integer_to_binary(number: u64, width: u16) -> SmallVec<[u8; 8]> {
-    // Slicing the tail of the big-endian bytes avoids shifting by
-    // `(8 - width) * 8` bits, which overflows when `width` is 0.
-    let bytes = number.to_be_bytes();
-    let skip = bytes.len() - usize::from(width);
-    SmallVec::from_slice(&bytes[skip..])
-}
-
-/// Writes each of `bytes` to `f` as two uppercase hexadecimal digits.
-fn output_hex(f: &mut Formatter<'_>, bytes: &[u8]) -> FmtResult {
-    bytes.iter().try_for_each(|byte| write!(f, "{byte:02X}"))
-}
-
-/// Tries to reserve `want` more units out of `capacity`, of which `*used` are
-/// already taken.
-///
-/// On success, adds `want` to `*used` and returns true.  If the request does
-/// not fit (including when `*used + want` would overflow `usize`), leaves
-/// `*used` unchanged and returns false.
-fn allocate_space(want: usize, capacity: usize, used: &mut usize) -> bool {
-    match used.checked_add(want) {
-        Some(total) if total <= capacity => {
-            *used = total;
-            true
-        }
-        _ => false,
-    }
-}
-
-/// A representation of a number that can be quickly rounded to any desired
-/// number of decimal places (up to a specified maximum).
-///
-/// The number is held as a decimal string with more digits than will be
-/// displayed, so rounding is done by inspecting and adjusting digits.
-#[derive(Debug)]
-struct Rounder {
- /// Magnitude of number with excess precision.
- string: SmallString<[u8; 40]>,
-
- /// Number of digits before decimal point.
- integer_digits: usize,
-
- /// Number of `9`s or `.`s at start of string.
- leading_nines: usize,
-
- /// Number of `0`s or `.`s at start of string.
- leading_zeros: usize,
-
- /// Is the number negative?
- negative: bool,
-}
-
-impl Rounder {
-    /// Prepares `number` for rounding to anywhere from 0 to `max_decimals`
-    /// decimal places.  The leading zero of a number whose formatted
-    /// magnitude starts with `0` is dropped unless `style.leading_zero` is
-    /// set.
-    fn new(style: &NumberStyle, number: f64, max_decimals: u8) -> Self {
-        debug_assert!(number.abs() < 1e41);
-        debug_assert!((0..=16).contains(&max_decimals));
-
-        let mut string = SmallString::new();
-        if max_decimals == 0 {
-            // Fast path.  No rounding needed.
-            //
-            // We append `.00` to the integer representation because
-            // [Self::should_round_up] assumes that fractional digits are
-            // present.
-            write!(&mut string, "{:.0}.00", number.round().abs()).unwrap()
-        } else {
-            // Slow path.
-            //
-            // This is more difficult than it really should be because we have
-            // to make sure that numbers that are exactly halfway between two
-            // representations are always rounded away from zero.  This is not
-            // what format! normally does (usually it rounds to even), so we
-            // have to fake it as best we can, by formatting with extra
-            // precision and then doing the rounding ourselves.
-            //
-            // We take up to two rounds to format numbers.  In the first round,
-            // we obtain 2 digits of precision beyond those requested by the
-            // user.  If those digits are exactly "50", then in a second round
-            // we format with as many digits as are significant in a "double".
-            //
-            // It might be better to directly implement our own floating-point
-            // formatting routine instead of relying on the system's sprintf
-            // implementation.  But the classic Steele and White paper on
-            // printing floating-point numbers does not hint how to do what we
-            // want, and it's not obvious how to change their algorithms to do
-            // so.  It would also be a lot of work.
-            write!(
-                &mut string,
-                "{:.*}",
-                max_decimals as usize + 2,
-                number.abs()
-            )
-            .unwrap();
-            if string.ends_with("50") {
-                let (_sig, binary_exponent) = frexp(number);
-                let decimal_exponent = binary_exponent * 3 / 10;
-                let format_decimals = (f64::DIGITS as i32 + 1) - decimal_exponent;
-                if format_decimals > max_decimals as i32 + 2 {
-                    string.clear();
-                    write!(&mut string, "{:.*}", format_decimals as usize, number.abs()).unwrap();
-                }
-            }
-        };
-
-        if !style.leading_zero && string.starts_with("0") {
-            string.remove(0);
-        }
-        let leading_zeros = string
-            .bytes()
-            .take_while(|c| *c == b'0' || *c == b'.')
-            .count();
-        let leading_nines = string
-            .bytes()
-            .take_while(|c| *c == b'9' || *c == b'.')
-            .count();
-        let integer_digits = string.bytes().take_while(u8::is_ascii_digit).count();
-        let negative = number.is_sign_negative();
-        Self {
-            string,
-            integer_digits,
-            leading_nines,
-            leading_zeros,
-            negative,
-        }
-    }
-
-    /// Returns a [RounderWidth] for formatting the magnitude to `decimals`
-    /// decimal places.  `decimals` must be in `0..=16`.
-    fn width(&self, decimals: usize) -> RounderWidth {
-        // Calculate base measures.
-        let mut width = self.integer_digits;
-        if decimals > 0 {
-            width += decimals + 1;
-        }
-        let mut integer_digits = self.integer_digits;
-        let mut negative = self.negative;
-
-        // Rounding can cause adjustments.
-        if self.should_round_up(decimals) {
-            // Rounding up leading `9s` adds a new digit (a `1`).
-            if self.leading_nines >= width {
-                width += 1;
-                integer_digits += 1;
-            }
-        } else {
-            // Rounding down.
-            if self.leading_zeros >= width {
-                // All digits that remain after rounding are zeros.  Therefore
-                // we drop the negative sign.
-                negative = false;
-                if self.integer_digits == 0 && decimals == 0 {
-                    // No digits at all are left.  We need to display
-                    // at least a single digit (a zero).
-                    debug_assert_eq!(width, 0);
-                    width += 1;
-                    integer_digits = 1;
-                }
-            }
-        }
-        RounderWidth {
-            width,
-            integer_digits,
-            negative,
-        }
-    }
-
-    /// Returns true if the number should be rounded up when chopped off at
-    /// `decimals` decimal places, false if it should be rounded down.
-    fn should_round_up(&self, decimals: usize) -> bool {
-        // The first digit that would be dropped: skip the integer digits, the
-        // decimal point, and `decimals` fractional digits.
-        let digit = self.string.as_bytes()[self.integer_digits + decimals + 1];
-        debug_assert!(digit.is_ascii_digit());
-        digit >= b'5'
-    }
-
-    /// Formats the number, rounding to `decimals` decimal places.  Exactly as
-    /// many characters as indicated by [Self::width(decimals)] are written.
-    fn format(&self, decimals: usize) -> SmallString<[u8; 40]> {
-        let mut output = SmallString::new();
-        let mut base_width = self.integer_digits;
-        if decimals > 0 {
-            base_width += decimals + 1;
-        }
-
-        if self.should_round_up(decimals) {
-            if self.leading_nines < base_width {
-                // Rounding up.  This is the common case where rounding up
-                // doesn't add an extra digit.
-                output.push_str(&self.string[..base_width]);
-
-                // SAFETY: This loop only changes ASCII characters to other
-                // ASCII characters.
-                unsafe {
-                    for c in output.as_bytes_mut().iter_mut().rev() {
-                        match *c {
-                            b'9' => *c = b'0',
-                            b'0'..=b'8' => {
-                                *c += 1;
-                                break;
-                            }
-                            b'.' => (),
-                            _ => unreachable!(),
-                        }
-                    }
-                }
-            } else {
-                // Rounding up leading 9s causes the result to be a 1 followed
-                // by a number of 0s, plus a decimal point.
-                output.push('1');
-                for _ in 0..self.integer_digits {
-                    output.push('0');
-                }
-                if decimals > 0 {
-                    output.push('.');
-                    for _ in 0..decimals {
-                        output.push('0');
-                    }
-                }
-                debug_assert_eq!(output.len(), base_width + 1);
-            }
-        } else {
-            // Rounding down.
-            if self.integer_digits != 0 || decimals != 0 {
-                // Common case: just copy the digits that are kept.  The
-                // string holds excess precision, so it must be cut off at
-                // `base_width` rather than copied whole.
-                output.push_str(&self.string[..base_width]);
-            } else {
-                // No digits remain.  The output is just a zero.
-                output.push('0');
-            }
-        }
-        output
-    }
-}
-
-/// Measurements of a number rounded to a particular number of decimal
-/// places, as computed by `Rounder::width`.
-struct RounderWidth {
- /// Number of characters required to format the number to a specified number
- /// of decimal places. This includes integer digits and a decimal point and
- /// fractional digits, if any, but it does not include any negative prefix
- /// or suffix or other affixes.
- width: usize,
-
- /// Number of digits before the decimal point, between 0 and 40.
- integer_digits: usize,
-
- /// True if the number is negative and its rounded representation would
- /// include at least one nonzero digit.
- negative: bool,
-}
-
-/// Returns `10^x`, using a lookup table for `x` up to 40 and `powi` beyond
-/// that.
-fn power10(x: usize) -> f64 {
-    const POWERS: [f64; 41] = [
-        1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16,
-        1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31,
-        1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40,
-    ];
-    match POWERS.get(x) {
-        Some(&power) => power,
-        None => 10.0_f64.powi(x as i32),
-    }
-}
-
-/// Returns `256^x`, using a lookup table for `x` up to 8 and `powi` beyond
-/// that.
-fn power256(x: u16) -> f64 {
-    const POWERS: [f64; 9] = [
-        1.0,
-        256.0,
-        65536.0,
-        16777216.0,
-        4294967296.0,
-        1099511627776.0,
-        281474976710656.0,
-        72057594037927936.0,
-        18446744073709551616.0,
-    ];
-    match POWERS.get(usize::from(x)) {
-        Some(&power) => power,
-        None => 256.0_f64.powi(i32::from(x)),
-    }
-}
-
-/// Replaces the first `.` in `s`, if there is one, by `,`.
-fn fix_decimal_point<A>(s: &mut SmallString<A>)
-where
-    A: Array<Item = u8>,
-{
-    if let Some(index) = s.find('.') {
-        // SAFETY: This only changes only one ASCII character (`.`) to
-        // another ASCII character (`,`).
-        unsafe {
-            s.as_bytes_mut()[index] = b',';
-        }
-    }
-}
-
-/// Returns the low `n` bytes of `value` in the byte order given by `endian`.
-///
-/// `n` must be no greater than 8.  An `n` of 0 yields an empty vector.
-pub fn endian_to_smallvec<const N: usize>(
-    endian: Endian,
-    value: u64,
-    n: usize,
-) -> SmallVec<[u8; N]> {
-    debug_assert!(n <= 8);
-    // Slicing the tail of the big-endian bytes avoids shifting by
-    // `8 * (8 - n)` bits, which overflows when `n` is 0.
-    let bytes = value.to_be_bytes();
-    let mut vec = SmallVec::from_slice(&bytes[bytes.len() - n..]);
-    if endian == Endian::Little {
-        vec.reverse();
-    }
-    vec
-}
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-use std::{
- fmt::{Debug, Display, Formatter, Result as FmtResult, Write},
- ops::{Not, RangeInclusive},
- str::{Chars, FromStr},
- sync::LazyLock,
-};
-
-use chrono::{Datelike, Local};
-use enum_iterator::{all, Sequence};
-use enum_map::{Enum, EnumMap};
-use serde::{Deserialize, Serialize};
-use thiserror::Error as ThisError;
-use unicode_width::UnicodeWidthStr;
-
-use crate::{
- data::{ByteString, Datum},
- sys::raw,
- util::ToSmallString,
- variable::{VarType, VarWidth},
-};
-
-mod display;
-mod parse;
-pub use display::{DisplayDatum, DisplayPlain, DisplayPlainF64};
-
-/// Errors in format specifications and in matching formats to variables.
-#[derive(Clone, ThisError, Debug, PartialEq, Eq)]
-pub enum Error {
- /// A numeric format code does not correspond to any [Type].
- #[error("Unknown format type {value}.")]
- UnknownFormat { value: u16 },
-
- /// The width is not a multiple of the step that the type requires.
- #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)]
- OddWidthNotAllowed(UncheckedFormat),
-
- /// The width is outside the range that the type allows.
- #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())]
- BadWidth(UncheckedFormat),
-
- /// Decimals were specified for a type that never allows them.
- #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)]
- DecimalsNotAllowedForFormat(UncheckedFormat),
-
- /// Decimals were specified, but the width leaves no room for any.
- #[error("Output format {0} specifies {} decimal places, but with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)]
- DecimalsNotAllowedForWidth(UncheckedFormat),
-
- /// More decimals were specified than the width leaves room for.
- #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)]
- TooManyDecimalsForWidth {
- spec: UncheckedFormat,
- max_d: Decimals,
- },
-
- /// A numeric format was applied to a string variable.
- #[error("String variable is not compatible with numeric format {0}.")]
- UnnamedVariableNotCompatibleWithNumericFormat(Type),
-
- /// A string format was applied to a numeric variable.
- #[error("Numeric variable is not compatible with string format {0}.")]
- UnnamedVariableNotCompatibleWithStringFormat(Type),
-
- /// A string format's width does not match the named string variable's
- /// width.
- #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
- NamedStringVariableBadSpecWidth {
- variable: String,
- width: Width,
- bad_spec: Format,
- good_spec: Format,
- },
-
- /// A string format's width does not match the string variable's width.
- #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
- UnnamedStringVariableBadSpecWidth {
- width: Width,
- bad_spec: Format,
- good_spec: Format,
- },
-}
-
-/// A grouping of related format [Type]s.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub enum Category {
- // Numeric formats.
- Basic,
- Custom,
- Legacy,
- Binary,
- Hex,
- Date,
- Time,
- DateComponent,
-
- // String formats.
- String,
-}
-
-impl From<Type> for Category {
- /// Returns the category that format type `source` belongs to.
- fn from(source: Type) -> Self {
- match source {
- Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic,
- Type::CC(_) => Self::Custom,
- Type::N | Type::Z => Self::Legacy,
- Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary,
- Type::PIBHex | Type::RBHex => Self::Hex,
- Type::Date
- | Type::ADate
- | Type::EDate
- | Type::JDate
- | Type::SDate
- | Type::QYr
- | Type::MoYr
- | Type::WkYr
- | Type::DateTime
- | Type::YmdHms => Self::Date,
- Type::MTime | Type::Time | Type::DTime => Self::Time,
- Type::WkDay | Type::Month => Self::DateComponent,
- Type::A | Type::AHex => Self::String,
- }
- }
-}
-
-/// Identifies one of the five custom currency formats, `CCA` through `CCE`.
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash, Sequence, Serialize)]
-pub enum CC {
- A,
- B,
- C,
- D,
- E,
-}
-
-impl CC {
- /// Returns the single uppercase letter that names this custom currency
- /// format.
- pub fn as_string(&self) -> &'static str {
- match self {
- CC::A => "A",
- CC::B => "B",
- CC::C => "C",
- CC::D => "D",
- CC::E => "E",
- }
- }
-}
-
-impl Display for CC {
-    /// Writes the letter returned by [CC::as_string].
-    fn fmt(&self, f: &mut Formatter) -> FmtResult {
-        f.write_str(self.as_string())
-    }
-}
-
-/// The type of a format, without any width or decimals.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Sequence, Serialize)]
-pub enum Type {
- // Basic numeric formats.
- F,
- Comma,
- Dot,
- Dollar,
- Pct,
- E,
-
- // Custom currency formats.
- CC(CC),
-
- // Legacy numeric formats.
- N,
- Z,
-
- // Binary and hexadecimal formats.
- P,
- PK,
- IB,
- PIB,
- PIBHex,
- RB,
- RBHex,
-
- // Time and date formats.
- Date,
- ADate,
- EDate,
- JDate,
- SDate,
- QYr,
- MoYr,
- WkYr,
- DateTime,
- YmdHms,
- MTime,
- Time,
- DTime,
-
- // Date component formats.
- WkDay,
- Month,
-
- // String formats.
- A,
- AHex,
-}
-
-/// Type of a format's width (`w`).
-pub type Width = u16;
-/// Signed counterpart of [Width], for arithmetic whose intermediate results
-/// may be negative.
-pub type SignedWidth = i16;
-
-/// Type of a format's number of decimal places (`d`).
-pub type Decimals = u8;
-
-impl Type {
- /// Returns the maximum width allowed for this format type.
- pub fn max_width(self) -> Width {
- match self {
- Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16,
- Self::IB | Self::PIB | Self::RB => 8,
- Self::A => 32767,
- Self::AHex => 32767 * 2,
- _ => 40,
- }
- }
-
- /// Returns the minimum width allowed for this format type.
- pub fn min_width(self) -> Width {
- match self {
- // Basic numeric formats.
- Self::F => 1,
- Self::Comma => 1,
- Self::Dot => 1,
- Self::Dollar => 2,
- Self::Pct => 2,
- Self::E => 6,
-
- // Custom currency formats.
- Self::CC(_) => 2,
-
- // Legacy numeric formats.
- Self::N => 1,
- Self::Z => 1,
-
- // Binary and hexadecimal formats.
- Self::P => 1,
- Self::PK => 1,
- Self::IB => 1,
- Self::PIB => 1,
- Self::PIBHex => 2,
- Self::RB => 2,
- Self::RBHex => 4,
-
- // Time and date formats.
- Self::Date => 9,
- Self::ADate => 8,
- Self::EDate => 8,
- Self::JDate => 5,
- Self::SDate => 8,
- Self::QYr => 6,
- Self::MoYr => 6,
- Self::WkYr => 8,
- Self::DateTime => 17,
- Self::YmdHms => 16,
- Self::MTime => 5,
- Self::Time => 5,
- Self::DTime => 8,
-
- // Date component formats.
- Self::WkDay => 2,
- Self::Month => 3,
-
- // String formats.
- Self::A => 1,
- Self::AHex => 2,
- }
- }
-
- /// Returns the range of widths allowed for this format type, from
- /// [Self::min_width] to [Self::max_width] inclusive.
- pub fn width_range(self) -> RangeInclusive<Width> {
- self.min_width()..=self.max_width()
- }
-
- /// Returns the maximum number of decimal places allowed for this format
- /// type at the given `width`. `width` is clamped to `1..=40` before use
- /// and the result is always in `0..=16`.
- pub fn max_decimals(self, width: Width) -> Decimals {
- let width = width.clamp(1, 40) as SignedWidth;
- let max = match self {
- Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1,
- Self::Dollar | Self::Pct => width - 2,
- Self::E => width - 7,
- Self::N | Self::Z => width,
- Self::P => width * 2 - 1,
- Self::PK => width * 2,
- Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth,
- Self::PIBHex => 0,
- Self::RB | Self::RBHex => 16,
- Self::Date
- | Self::ADate
- | Self::EDate
- | Self::JDate
- | Self::SDate
- | Self::QYr
- | Self::MoYr
- | Self::WkYr => 0,
- Self::DateTime => width - 21,
- Self::YmdHms => width - 20,
- Self::MTime => width - 6,
- Self::Time => width - 9,
- Self::DTime => width - 12,
- Self::WkDay | Self::Month | Self::A | Self::AHex => 0,
- };
- // Narrow widths can make `max` negative, so clamp from below too.
- max.clamp(0, 16) as Decimals
- }
-
- /// Returns true if this format type allows decimal places at its widest
- /// width.
- pub fn takes_decimals(self) -> bool {
- self.max_decimals(Width::MAX) > 0
- }
-
- /// Returns the [Category] that this format type belongs to.
- pub fn category(self) -> Category {
- self.into()
- }
-
- /// Returns the increment between valid widths: 2 for the hexadecimal
- /// formats (including `AHEX`), otherwise 1.
- pub fn width_step(self) -> Width {
- if self.category() == Category::Hex || self == Self::AHex {
- 2
- } else {
- 1
- }
- }
-
- /// Returns `width` clamped into [Self::width_range] and, for formats
- /// whose [Self::width_step] is 2, rounded down to an even number.
- pub fn clamp_width(self, width: Width) -> Width {
- let (min, max) = self.width_range().into_inner();
- let width = width.clamp(min, max);
- if self.width_step() == 2 {
- width / 2 * 2
- } else {
- width
- }
- }
-
- /// Returns the type of variable that this format type applies to: string
- /// for `A` and `AHEX`, numeric for everything else.
- pub fn var_type(self) -> VarType {
- match self {
- Self::A | Self::AHex => VarType::String,
- _ => VarType::Numeric,
- }
- }
-
- /// Checks whether this format is valid for a variable with the given
- /// `var_type`.
- pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> {
- let my_type = self.var_type();
- match (my_type, var_type) {
- (VarType::Numeric, VarType::String) => {
- Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self))
- }
- (VarType::String, VarType::Numeric) => {
- Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self))
- }
- _ => Ok(()),
- }
- }
-
- /// Returns the uppercase name of this format type.
- pub fn as_str(&self) -> &'static str {
- match self {
- Self::F => "F",
- Self::Comma => "COMMA",
- Self::Dot => "DOT",
- Self::Dollar => "DOLLAR",
- Self::Pct => "PCT",
- Self::E => "E",
- Self::CC(CC::A) => "CCA",
- Self::CC(CC::B) => "CCB",
- Self::CC(CC::C) => "CCC",
- Self::CC(CC::D) => "CCD",
- Self::CC(CC::E) => "CCE",
- Self::N => "N",
- Self::Z => "Z",
- Self::P => "P",
- Self::PK => "PK",
- Self::IB => "IB",
- Self::PIB => "PIB",
- Self::PIBHex => "PIBHEX",
- Self::RB => "RB",
- Self::RBHex => "RBHEX",
- Self::Date => "DATE",
- Self::ADate => "ADATE",
- Self::EDate => "EDATE",
- Self::JDate => "JDATE",
- Self::SDate => "SDATE",
- Self::QYr => "QYR",
- Self::MoYr => "MOYR",
- Self::WkYr => "WKYR",
- Self::DateTime => "DATETIME",
- Self::YmdHms => "YMDHMS",
- Self::MTime => "MTIME",
- Self::Time => "TIME",
- Self::DTime => "DTIME",
- Self::WkDay => "WKDAY",
- Self::Month => "MONTH",
- Self::A => "A",
- Self::AHex => "AHEX",
- }
- }
-
- /// Returns a default value for a variable of this format's type: the
- /// system-missing value for numeric types, or an empty string for string
- /// types.
- pub fn default_value(&self) -> Datum<ByteString> {
- match self.var_type() {
- VarType::Numeric => Datum::sysmis(),
- VarType::String => Datum::String(ByteString::default()),
- }
- }
-}
-
-impl Display for Type {
-    /// Writes the name returned by [Type::as_str].
-    fn fmt(&self, f: &mut Formatter) -> FmtResult {
-        f.write_str(self.as_str())
-    }
-}
-
-impl FromStr for Type {
-    type Err = ();
-
-    /// Parses a format type name, ignoring ASCII case.  Fails if `s` does
-    /// not name any format type.
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        all::<Type>()
-            .find(|candidate| candidate.as_str().eq_ignore_ascii_case(s))
-            .ok_or(())
-    }
-}
-
-/// Maps a byte count to a digit count using a fixed table: 0 → 0, 1 → 3,
-/// 2 → 5, 3 → 8, 4 → 10, 5 → 13, 6 → 15, 7 → 17, and 20 for anything larger.
-fn max_digits_for_bytes(bytes: usize) -> usize {
-    match bytes {
-        0 => 0,
-        1 => 3,
-        2 => 5,
-        3 => 8,
-        4 => 10,
-        5 => 13,
-        6 => 15,
-        7 => 17,
-        _ => 20,
-    }
-}
-
-/// A format specification whose type is held as an unvalidated name, with a
-/// width and a number of decimals.
-#[derive(Debug, PartialEq, Eq, Hash)]
-pub struct AbstractFormat {
- /// Name of the format type, not yet checked against the known types.
- pub name: String,
- /// Width.
- w: Width,
- /// Number of decimal places.
- d: Decimals,
-}
-
-/// Splits `s` in two: the longest prefix whose characters all satisfy
-/// `predicate`, and whatever follows it.
-fn split<F>(s: &str, predicate: F) -> (&str, &str)
-where
-    F: Fn(&char) -> bool,
-{
-    let boundary = s.find(|c: char| !predicate(&c)).unwrap_or(s.len());
-    s.split_at(boundary)
-}
-
-impl FromStr for AbstractFormat {
-    type Err = ();
-
-    /// Parses a specification of the form `NAMEw` or `NAMEw.d`, where `NAME`
-    /// is one or more ASCII letters and `w` and `d` are decimal integers.
-    /// The name is not checked against the known format types.
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let (name, rest) = split(s, char::is_ascii_alphabetic);
-        if name.is_empty() {
-            return Err(());
-        }
-
-        let (width, rest) = split(rest, char::is_ascii_digit);
-        let w = width.parse::<Width>().map_err(|_| ())?;
-
-        let (d, rest) = match rest.strip_prefix('.') {
-            Some(fraction) => {
-                let (decimals, rest) = split(fraction, char::is_ascii_digit);
-                (decimals.parse::<Decimals>().map_err(|_| ())?, rest)
-            }
-            None => (0, rest),
-        };
-
-        // Anything left over makes the specification invalid.
-        if !rest.is_empty() {
-            return Err(());
-        }
-        Ok(Self {
-            name: name.into(),
-            w,
-            d,
-        })
-    }
-}
-
-impl TryFrom<AbstractFormat> for UncheckedFormat {
-    type Error = ();
-
-    /// Resolves the format's name to a [Type].  Fails if the name is not a
-    /// known format type; the width and decimals are not validated.
-    fn try_from(value: AbstractFormat) -> Result<Self, Self::Error> {
-        let AbstractFormat { name, w, d } = value;
-        let type_ = name.parse::<Type>()?;
-        Ok(UncheckedFormat::new(type_, w, d))
-    }
-}
-
-/// A format specification: a type, a width, and a number of decimal places.
-/// The fields are private; see [UncheckedFormat] for the variant with public
-/// fields.
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct Format {
- /// Format type.
- type_: Type,
- /// Width.
- w: Width,
- /// Number of decimal places.
- d: Decimals,
-}
-
-impl Serialize for Format {
-    /// Serializes the format as its displayed string form.
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        let text = self.to_small_string::<16>();
-        text.serialize(serializer)
-    }
-}
-
-impl Format {
- /// `F40.0`.
- pub const F40: Format = Format {
- type_: Type::F,
- w: 40,
- d: 0,
- };
-
- /// `F40.1`.
- pub const F40_1: Format = Format {
- type_: Type::F,
- w: 40,
- d: 1,
- };
-
- /// `F40.2`.
- pub const F40_2: Format = Format {
- type_: Type::F,
- w: 40,
- d: 2,
- };
-
- /// `F40.3`.
- pub const F40_3: Format = Format {
- type_: Type::F,
- w: 40,
- d: 3,
- };
-
- /// `PCT40.1`.
- pub const PCT40_1: Format = Format {
- type_: Type::Pct,
- w: 40,
- d: 1,
- };
-
- /// `F8.2`.
- pub const F8_2: Format = Format {
- type_: Type::F,
- w: 8,
- d: 2,
- };
-
- /// `DATETIME40.0`.
- pub const DATETIME40_0: Format = Format {
- type_: Type::DateTime,
- w: 40,
- d: 0,
- };
-
- /// Returns the format's type.
- pub fn type_(self) -> Type {
- self.type_
- }
- /// Returns the format's width.
- pub fn w(self) -> usize {
- self.w as usize
- }
- /// Returns the format's number of decimal places.
- pub fn d(self) -> usize {
- self.d as usize
- }
-
- /// Constructs a format from its parts, returning `None` if the
- /// conversion from [UncheckedFormat] rejects the combination.
- pub fn new(type_: Type, w: Width, d: Decimals) -> Option<Self> {
- UncheckedFormat { type_, w, d }.try_into().ok()
- }
-
- /// Returns the default format for a variable of the given width: `F8.2`
- /// for a numeric variable, or `A` with the variable's width for a string
- /// variable.
- pub fn default_for_width(var_width: VarWidth) -> Self {
- match var_width {
- VarWidth::Numeric => Format {
- type_: Type::F,
- w: 8,
- d: 2,
- },
- VarWidth::String(w) => Format {
- type_: Type::A,
- w,
- d: 0,
- },
- }
- }
-
- /// Converts `source` into a format by adjusting its width and decimals
- /// as needed instead of reporting an error.
- pub fn fixed_from(source: &UncheckedFormat) -> Self {
- let UncheckedFormat {
- type_: format,
- w,
- d,
- } = *source;
- let (min, max) = format.width_range().into_inner();
- let mut w = w.clamp(min, max);
- // If the type can accommodate `d` decimals at some width, widen the
- // format until they fit.
- if d <= format.max_decimals(Width::MAX) {
- while d > format.max_decimals(w) {
- w += 1;
- assert!(w <= 40);
- }
- }
- // Otherwise, drop the decimals that cannot be accommodated.
- let d = d.clamp(0, format.max_decimals(w));
- Self {
- type_: format,
- w,
- d,
- }
- }
-
- /// Returns the width of variable that this format applies to. For
- /// `AHEX`, this is half the format's width.
- pub fn var_width(self) -> VarWidth {
- match self.type_ {
- Type::A => VarWidth::String(self.w),
- Type::AHex => VarWidth::String(self.w / 2),
- _ => VarWidth::Numeric,
- }
- }
-
- /// Returns the type of variable that this format applies to.
- pub fn var_type(self) -> VarType {
- self.type_.var_type()
- }
-
- /// Checks whether this format specification is valid for a variable with
- /// width `var_width`.
- pub fn check_width_compatibility(self, var_width: VarWidth) -> Result<Self, Error> {
- // Verify that the format is right for the variable's type.
- self.type_.check_type_compatibility(var_width.into())?;
-
- if let VarWidth::String(w) = var_width {
- if var_width != self.var_width() {
- let bad_spec = self;
- // Suggest the same format type with the matching width.
- let good_spec = if self.type_ == Type::A {
- Format { w, ..self }
- } else {
- Format { w: w * 2, ..self }
- };
- return Err(Error::UnnamedStringVariableBadSpecWidth {
- width: w,
- bad_spec,
- good_spec,
- });
- }
- }
-
- Ok(self)
- }
-
- /// Returns a default value for a variable with this format: the
- /// system-missing value for a numeric format, or a string of spaces of
- /// the variable's width for a string format.
- pub fn default_value(&self) -> Datum<ByteString> {
- match self.var_width() {
- VarWidth::Numeric => Datum::sysmis(),
- VarWidth::String(width) => Datum::String(ByteString::spaces(width as usize)),
- }
- }
-
- /// Adjusts this format to suit a variable of the given `width`. A
- /// numeric format is unchanged for a numeric variable, and a string
- /// format keeps its type with a new width for a string variable. When
- /// the variable type changes, the format is replaced by the default for
- /// `width`.
- pub fn resize(&mut self, width: VarWidth) {
- match (self.var_width(), width) {
- (VarWidth::Numeric, VarWidth::Numeric) => {}
- (VarWidth::String(_), VarWidth::String(new_width)) => {
- self.w = if self.type_ == Type::AHex {
- new_width * 2
- } else {
- new_width
- };
- }
- _ => *self = Self::default_for_width(width),
- }
- }
-
- /// Applies `VarWidth::codepage_to_unicode` to this format's variable
- /// width and, for a string format, updates the format width to match.
- // NOTE(review): presumably widens strings for re-encoding to Unicode;
- // confirm against `VarWidth::codepage_to_unicode`.
- pub fn codepage_to_unicode(&mut self) {
- let mut width = self.var_width();
- width.codepage_to_unicode();
- if let Some(width) = width.as_string_width() {
- if self.type_ == Type::AHex {
- self.w = width as u16 * 2;
- } else {
- self.w = width as u16;
- }
- }
- }
-}
-
-impl Debug for Format {
-    /// Formats the same way as [Display].
-    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
-        Display::fmt(self, f)
-    }
-}
-
-impl Display for Format {
-    /// Writes the type name followed by the width, then `.` and the decimals
-    /// if the type can take decimals or the decimals are nonzero.
-    fn fmt(&self, f: &mut Formatter) -> FmtResult {
-        let Self { type_, w, d } = *self;
-        write!(f, "{type_}{w}")?;
-        if d > 0 || type_.takes_decimals() {
-            write!(f, ".{d}")?;
-        }
-        Ok(())
-    }
-}
-
-impl TryFrom<UncheckedFormat> for Format {
-    type Error = Error;
-
-    /// Validates `source`, checking in order that:
-    ///
-    /// - The width is a multiple of the type's width step.
-    ///
-    /// - The width is within the type's width range.
-    ///
-    /// - The number of decimals does not exceed what the type allows at that
-    ///   width.
-    fn try_from(source: UncheckedFormat) -> Result<Self, Self::Error> {
-        let UncheckedFormat {
-            type_: format,
-            w,
-            d,
-        } = source;
-        let max_d = format.max_decimals(w);
-        if w % format.width_step() != 0 {
-            Err(Error::OddWidthNotAllowed(source))
-        } else if !format.width_range().contains(&w) {
-            Err(Error::BadWidth(source))
-        } else if d > max_d {
-            if !format.takes_decimals() {
-                // The type never allows decimals, regardless of width.
-                Err(Error::DecimalsNotAllowedForFormat(source))
-            } else if max_d > 0 {
-                // The type allows decimals, just not this many at this width.
-                Err(Error::TooManyDecimalsForWidth {
-                    spec: source,
-                    max_d,
-                })
-            } else {
-                // The type allows decimals, but only at a greater width.
-                Err(Error::DecimalsNotAllowedForWidth(source))
-            }
-        } else {
-            Ok(Format {
-                type_: format,
-                w,
-                d,
-            })
-        }
-    }
-}
-
-impl From<Type> for u16 {
- /// Returns the numeric code for format type `source`. This is the
- /// inverse of the `TryFrom<u16>` conversion for [Type].
- fn from(source: Type) -> Self {
- match source {
- Type::A => 1,
- Type::AHex => 2,
- Type::Comma => 3,
- Type::Dollar => 4,
- Type::F => 5,
- Type::IB => 6,
- Type::PIBHex => 7,
- Type::P => 8,
- Type::PIB => 9,
- Type::PK => 10,
- Type::RB => 11,
- Type::RBHex => 12,
- Type::Z => 15,
- Type::N => 16,
- Type::E => 17,
- Type::Date => 20,
- Type::Time => 21,
- Type::DateTime => 22,
- Type::ADate => 23,
- Type::JDate => 24,
- Type::DTime => 25,
- Type::WkDay => 26,
- Type::Month => 27,
- Type::MoYr => 28,
- Type::QYr => 29,
- Type::WkYr => 30,
- Type::Pct => 31,
- Type::Dot => 32,
- Type::CC(CC::A) => 33,
- Type::CC(CC::B) => 34,
- Type::CC(CC::C) => 35,
- Type::CC(CC::D) => 36,
- Type::CC(CC::E) => 37,
- Type::EDate => 38,
- Type::SDate => 39,
- Type::MTime => 40,
- Type::YmdHms => 41,
- }
- }
-}
-
-impl TryFrom<u16> for Type {
- type Error = Error;
-
- /// Returns the format type for numeric code `source`, or
- /// [Error::UnknownFormat] if the code is not assigned to any type.
- fn try_from(source: u16) -> Result<Self, Self::Error> {
- match source {
- 1 => Ok(Self::A),
- 2 => Ok(Self::AHex),
- 3 => Ok(Self::Comma),
- 4 => Ok(Self::Dollar),
- 5 => Ok(Self::F),
- 6 => Ok(Self::IB),
- 7 => Ok(Self::PIBHex),
- 8 => Ok(Self::P),
- 9 => Ok(Self::PIB),
- 10 => Ok(Self::PK),
- 11 => Ok(Self::RB),
- 12 => Ok(Self::RBHex),
- 15 => Ok(Self::Z),
- 16 => Ok(Self::N),
- 17 => Ok(Self::E),
- 20 => Ok(Self::Date),
- 21 => Ok(Self::Time),
- 22 => Ok(Self::DateTime),
- 23 => Ok(Self::ADate),
- 24 => Ok(Self::JDate),
- 25 => Ok(Self::DTime),
- 26 => Ok(Self::WkDay),
- 27 => Ok(Self::Month),
- 28 => Ok(Self::MoYr),
- 29 => Ok(Self::QYr),
- 30 => Ok(Self::WkYr),
- 31 => Ok(Self::Pct),
- 32 => Ok(Self::Dot),
- 33 => Ok(Self::CC(CC::A)),
- 34 => Ok(Self::CC(CC::B)),
- 35 => Ok(Self::CC(CC::C)),
- 36 => Ok(Self::CC(CC::D)),
- 37 => Ok(Self::CC(CC::E)),
- 38 => Ok(Self::EDate),
- 39 => Ok(Self::SDate),
- 40 => Ok(Self::MTime),
- 41 => Ok(Self::YmdHms),
- _ => Err(Error::UnknownFormat { value: source }),
- }
- }
-}
-
-/// A format specification with public fields, so that any combination of
-/// type, width, and decimals can be constructed.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct UncheckedFormat {
- /// Format type.
- pub type_: Type,
-
- /// Width.
- pub w: Width,
-
- /// Number of decimal places.
- pub d: Decimals,
-}
-
-impl UncheckedFormat {
- /// Constructs an `UncheckedFormat` from its parts, without any
- /// validation.
- pub fn new(type_: Type, w: Width, d: Decimals) -> Self {
- Self { type_, w, d }
- }
- /// Converts this into a [Format] using [Format::fixed_from], which
- /// adjusts the width and decimals instead of failing.
- pub fn fix(&self) -> Format {
- Format::fixed_from(self)
- }
-}
-
-impl TryFrom<raw::records::RawFormat> for UncheckedFormat {
-    type Error = Error;
-
-    /// Unpacks a raw format: the type code is taken from bits 16 and up, the
-    /// width from bits 8 to 15, and the decimals from bits 0 to 7.  Fails if
-    /// the type code is unknown.
-    fn try_from(raw: raw::records::RawFormat) -> Result<Self, Self::Error> {
-        let packed = raw.0;
-        let type_ = Type::try_from((packed >> 16) as u16)?;
-        Ok(Self {
-            type_,
-            w: ((packed >> 8) & 0xff) as Width,
-            d: (packed & 0xff) as Decimals,
-        })
-    }
-}
-
-impl Display for UncheckedFormat {
-    /// Writes the type name followed by the width, then `.` and the decimals
-    /// if the type can take decimals or the decimals are nonzero.
-    fn fmt(&self, f: &mut Formatter) -> FmtResult {
-        let Self { type_, w, d } = *self;
-        write!(f, "{type_}{w}")?;
-        if d > 0 || type_.takes_decimals() {
-            write!(f, ".{d}")?;
-        }
-        Ok(())
-    }
-}
-
-#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Enum, Serialize, Deserialize)]
-#[serde(rename_all = "snake_case")]
-pub enum Decimal {
- #[default]
- Dot,
- Comma,
-}
-
-impl Decimal {
- pub fn as_str(&self) -> &'static str {
- match self {
- Decimal::Dot => ".",
- Decimal::Comma => ",",
- }
- }
-}
-
-impl From<Decimal> for char {
- fn from(value: Decimal) -> Self {
- u8::from(value).into()
- }
-}
-
-impl From<Decimal> for u8 {
- fn from(value: Decimal) -> Self {
- match value {
- Decimal::Dot => b'.',
- Decimal::Comma => b',',
- }
- }
-}
-
-impl TryFrom<char> for Decimal {
- type Error = ();
-
- fn try_from(c: char) -> Result<Self, Self::Error> {
- match c {
- '.' => Ok(Self::Dot),
- ',' => Ok(Self::Comma),
- _ => Err(()),
- }
- }
-}
-
-impl Not for Decimal {
- type Output = Self;
-
- fn not(self) -> Self::Output {
- match self {
- Self::Dot => Self::Comma,
- Self::Comma => Self::Dot,
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)]
-pub struct Epoch(pub i32);
-
-impl Epoch {
- /// Applies the epoch to `year`:
- ///
- /// - If `year` is 2 digits (between 0 and 99, inclusive), returns it
- /// converted it to the correct year considering the epoch.
- ///
- /// - Otherwise, returns `year` unchanged.
- pub fn apply(&self, year: i32) -> i32 {
- match year {
- 0..=99 => {
- let century = self.0 / 100 * 100;
- let offset = self.0 - century;
- if year >= offset {
- year + century
- } else {
- year + century + 100
- }
- }
- other => other,
- }
- }
-}
-
-impl Default for Epoch {
- fn default() -> Self {
- static DEFAULT: LazyLock<Epoch> = LazyLock::new(|| Epoch(Local::now().year() - 69));
- *DEFAULT
- }
-}
-
-impl Display for Epoch {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", self.0)
- }
-}
-
-#[derive(Clone, Debug, Default, Serialize)]
-pub struct Settings {
- pub epoch: Epoch,
-
- /// Either `'.'` or `','`.
- pub decimal: Decimal,
-
- /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5`
- /// instead of `.5`)?
- pub leading_zero: bool,
-
- /// Custom currency styles.
- pub ccs: EnumMap<CC, Option<Box<NumberStyle>>>,
-}
-
-#[derive(Copy, Clone, Enum)]
-struct StyleParams {
- decimal: Decimal,
- leading_zero: bool,
-}
-impl From<&Settings> for StyleParams {
- fn from(value: &Settings) -> Self {
- Self {
- decimal: value.decimal,
- leading_zero: value.leading_zero,
- }
- }
-}
-
-struct StyleSet(EnumMap<StyleParams, NumberStyle>);
-
-impl StyleSet {
- fn new(f: impl Fn(StyleParams) -> NumberStyle) -> Self {
- Self(EnumMap::from_fn(f))
- }
- fn get(&self, settings: &Settings) -> &NumberStyle {
- &self.0[settings.into()]
- }
-}
-
-impl Settings {
- pub fn with_cc(mut self, cc: CC, style: NumberStyle) -> Self {
- self.ccs[cc] = Some(Box::new(style));
- self
- }
- pub fn with_leading_zero(self, leading_zero: bool) -> Self {
- Self {
- leading_zero,
- ..self
- }
- }
- pub fn with_epoch(self, epoch: Epoch) -> Self {
- Self { epoch, ..self }
- }
- pub fn number_style(&self, type_: Type) -> &NumberStyle {
- static DEFAULT: LazyLock<NumberStyle> =
- LazyLock::new(|| NumberStyle::new("", "", Decimal::Dot, None, false));
-
- match type_ {
- Type::F | Type::E => {
- static F: LazyLock<StyleSet> = LazyLock::new(|| {
- StyleSet::new(|p| NumberStyle::new("", "", p.decimal, None, p.leading_zero))
- });
- F.get(self)
- }
- Type::Comma => {
- static COMMA: LazyLock<StyleSet> = LazyLock::new(|| {
- StyleSet::new(|p| {
- NumberStyle::new("", "", p.decimal, Some(!p.decimal), p.leading_zero)
- })
- });
- COMMA.get(self)
- }
- Type::Dot => {
- static DOT: LazyLock<StyleSet> = LazyLock::new(|| {
- StyleSet::new(|p| {
- NumberStyle::new("", "", !p.decimal, Some(p.decimal), p.leading_zero)
- })
- });
- DOT.get(self)
- }
- Type::Dollar => {
- static DOLLAR: LazyLock<StyleSet> = LazyLock::new(|| {
- StyleSet::new(|p| NumberStyle::new("$", "", p.decimal, Some(!p.decimal), false))
- });
- DOLLAR.get(self)
- }
- Type::Pct => {
- static PCT: LazyLock<StyleSet> = LazyLock::new(|| {
- StyleSet::new(|p| NumberStyle::new("", "%", p.decimal, None, false))
- });
- PCT.get(self)
- }
- Type::CC(cc) => self.ccs[cc].as_deref().unwrap_or(&DEFAULT),
- Type::N
- | Type::Z
- | Type::P
- | Type::PK
- | Type::IB
- | Type::PIB
- | Type::PIBHex
- | Type::RB
- | Type::RBHex
- | Type::Date
- | Type::ADate
- | Type::EDate
- | Type::JDate
- | Type::SDate
- | Type::QYr
- | Type::MoYr
- | Type::WkYr
- | Type::DateTime
- | Type::YmdHms
- | Type::MTime
- | Type::Time
- | Type::DTime
- | Type::WkDay
- | Type::Month
- | Type::A
- | Type::AHex => &DEFAULT,
- }
- }
-}
-
-/// A numeric output style. This can express numeric formats in
-/// [Category::Basic] and [Category::Custom].
-#[derive(Clone, Debug, Serialize)]
-pub struct NumberStyle {
- pub neg_prefix: Affix,
- pub prefix: Affix,
- pub suffix: Affix,
- pub neg_suffix: Affix,
-
- /// Decimal point.
- pub decimal: Decimal,
-
- /// Grouping character.
- pub grouping: Option<Decimal>,
-
- /// Format as `.5` or `0.5`?
- pub leading_zero: bool,
-
- /// An `Affix` may require more bytes than its display width; for example,
- /// U+00A5 (¥) is 2 bytes in UTF-8 but occupies only one display column.
- /// This member is the sum of the number of bytes required by all of the
- /// `Affix` members in this struct, minus their display widths. Thus, it
- /// can be used to size memory allocations: for example, the formatted
- /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in
- /// UTF-8.
- #[serde(skip)]
- pub extra_bytes: usize,
-}
-
-impl Display for NumberStyle {
- /// Display this number style in the format used for custom currency.
- ///
- /// This format can only accurately represent number styles that include a
- /// grouping character. If this number style doesn't, it will pretend that
- /// the grouping character is the opposite of the decimal point character.
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- let grouping = char::from(!self.decimal);
- write!(
- f,
- "{}{}{}{}{}{}{}",
- self.neg_prefix.display(grouping),
- grouping,
- self.prefix.display(grouping),
- grouping,
- self.suffix.display(grouping),
- grouping,
- self.neg_suffix.display(grouping),
- )
- }
-}
-
-impl NumberStyle {
- fn new(
- prefix: &str,
- suffix: &str,
- decimal: Decimal,
- grouping: Option<Decimal>,
- leading_zero: bool,
- ) -> Self {
- // These assertions ensure that zero is correct for `extra_bytes`.
- debug_assert!(prefix.is_ascii());
- debug_assert!(suffix.is_ascii());
-
- Self {
- neg_prefix: Affix::new("-"),
- prefix: Affix::new(prefix),
- suffix: Affix::new(suffix),
- neg_suffix: Affix::new(""),
- decimal,
- grouping,
- leading_zero,
- extra_bytes: 0,
- }
- }
-
- fn affix_width(&self) -> usize {
- self.prefix.width + self.suffix.width
- }
-}
-
-#[derive(Clone, Debug, Serialize)]
-pub struct Affix {
- /// String contents of affix.
- pub s: String,
-
- #[serde(skip)]
- /// Display width in columns (see [unicode_width])
- pub width: usize,
-}
-
-impl Affix {
- fn new(s: impl Into<String>) -> Self {
- let s = s.into();
- Self {
- width: s.width(),
- s,
- }
- }
-
- fn extra_bytes(&self) -> usize {
- self.s.len().checked_sub(self.width).unwrap()
- }
-
- fn display(&self, escape: char) -> DisplayAffix<'_> {
- DisplayAffix {
- affix: self.s.as_str(),
- escape,
- }
- }
-}
-
-pub struct DisplayAffix<'a> {
- affix: &'a str,
- escape: char,
-}
-
-impl Display for DisplayAffix<'_> {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- for c in self.affix.chars() {
- if c == self.escape {
- f.write_char('\'')?;
- }
- f.write_char(c)?;
- }
- Ok(())
- }
-}
-
-impl FromStr for NumberStyle {
- type Err = ();
-
- fn from_str(s: &str) -> Result<Self, Self::Err> {
- fn find_separator(s: &str) -> Option<char> {
- // Count commas and periods. There must be exactly three of one or
- // the other, except that an apostrophe escapes a following comma or
- // period.
- let mut n_commas = 0;
- let mut n_periods = 0;
- let s = s.as_bytes();
- for i in 0..s.len() {
- if i > 0 && s[i - 1] == b'\'' {
- } else if s[i] == b',' {
- n_commas += 1;
- } else if s[i] == b'.' {
- n_periods += 1;
- }
- }
-
- if n_commas == 3 && n_periods != 3 {
- Some(',')
- } else if n_periods == 3 && n_commas != 3 {
- Some('.')
- } else {
- None
- }
- }
-
- fn take_cc_token(iter: &mut Chars<'_>, grouping: char) -> Affix {
- let mut s = String::new();
- let mut quote = false;
- for c in iter {
- if c == '\'' && !quote {
- quote = true;
- } else if c == grouping && !quote {
- break;
- } else {
- s.push(c);
- quote = false;
- }
- }
- Affix::new(s)
- }
-
- let Some(grouping) = find_separator(s) else {
- return Err(());
- };
- let mut iter = s.chars();
- let neg_prefix = take_cc_token(&mut iter, grouping);
- let prefix = take_cc_token(&mut iter, grouping);
- let suffix = take_cc_token(&mut iter, grouping);
- let neg_suffix = take_cc_token(&mut iter, grouping);
- let grouping: Decimal = grouping.try_into().unwrap();
- let decimal = !grouping;
- let extra_bytes = neg_prefix.extra_bytes()
- + prefix.extra_bytes()
- + suffix.extra_bytes()
- + neg_suffix.extra_bytes();
- Ok(Self {
- neg_prefix,
- prefix,
- suffix,
- neg_suffix,
- decimal,
- grouping: Some(grouping),
- leading_zero: false,
- extra_bytes,
- })
- }
-}
-
-/// An item within a [DateTemplate].
-pub struct TemplateItem {
- /// Character in the template.
- pub c: char,
-
- /// Number of repetitions of the character.
- pub n: usize,
-}
-
-/// A template for date and time formats.
-#[derive(Clone)]
-pub struct DateTemplate(&'static str);
-
-impl DateTemplate {
- /// Returns a [DateTemplate] used for date and time input and output in a
- /// field of the given `type_` and `width`.
- ///
- /// `width` only affects whether a 2-digit year or a 4-digit year is used,
- /// that is, whether the returned string contains `yy` or `yyyy`, and
- /// whether seconds are included, that is, whether the returned string
- /// contains `:SS`. A caller that doesn't care whether the returned string
- /// contains `yy` or `yyyy` or `:SS` can just specify 0 to omit them.
- pub fn new(type_: Type, width: usize) -> Option<Self> {
- let (short, long) = match type_ {
- Type::F
- | Type::Comma
- | Type::Dot
- | Type::Dollar
- | Type::Pct
- | Type::E
- | Type::CC(_)
- | Type::N
- | Type::Z
- | Type::P
- | Type::PK
- | Type::IB
- | Type::PIB
- | Type::PIBHex
- | Type::RB
- | Type::RBHex
- | Type::WkDay
- | Type::Month
- | Type::A
- | Type::AHex => return None,
- Type::Date => ("dd-mmm-yy", "dd-mmm-yyyy"),
- Type::ADate => ("mm/dd/yy", "mm/dd/yyyy"),
- Type::EDate => ("dd.mm.yy", "dd.mm.yyyy"),
- Type::JDate => ("yyddd", "yyyyddd"),
- Type::SDate => ("yy/mm/dd", "yyyy/mm/dd"),
- Type::QYr => ("q Q yy", "q Q yyyy"),
- Type::MoYr => ("mmm yy", "mmm yyyy"),
- Type::WkYr => ("ww WK yy", "ww WK yyyy"),
- Type::DateTime => ("dd-mmm-yyyy HH:MM", "dd-mmm-yyyy HH:MM:SS"),
- Type::YmdHms => ("yyyy-mm-dd HH:MM", "yyyy-mm-dd HH:MM:SS"),
- Type::MTime => ("MM", "MM:SS"),
- Type::Time => ("HH:MM", "HH:MM:SS"),
- Type::DTime => ("D HH:MM", "D HH:MM:SS"),
- };
- if width >= long.len() {
- Some(DateTemplate(long))
- } else {
- Some(DateTemplate(short))
- }
- }
-
- pub fn for_format(format: Format) -> Option<Self> {
- Self::new(format.type_(), format.w())
- }
-
- #[allow(clippy::len_without_is_empty)]
- pub fn len(&self) -> usize {
- self.0.len()
- }
-}
-
-impl Iterator for DateTemplate {
- type Item = TemplateItem;
-
- fn next(&mut self) -> Option<Self::Item> {
- let mut iter = self.0.chars();
- let c = iter.next()?;
- self.0 = iter.as_str();
- let mut n = 1;
- while iter.next() == Some(c) {
- self.0 = iter.as_str();
- n += 1;
- }
- Some(TemplateItem { c, n })
- }
-}
-
-#[cfg(test)]
-mod tests {
- use crate::format::{Format, Type, Width};
-
- #[test]
- fn codepage_to_unicode() {
- fn check_format(input: Format, expected_width: Width) {
- let mut output = input;
- output.codepage_to_unicode();
- let expected = Format::new(input.type_, expected_width, input.d).unwrap();
- assert_eq!(output, expected);
- }
- check_format(Format::new(Type::A, 1, 0).unwrap(), 3);
- check_format(Format::new(Type::A, 2, 0).unwrap(), 6);
- check_format(Format::new(Type::A, 3, 0).unwrap(), 9);
- check_format(Format::new(Type::A, 1000, 0).unwrap(), 3000);
- check_format(Format::new(Type::A, 20000, 0).unwrap(), 32767);
-
- check_format(Format::new(Type::AHex, 2, 0).unwrap(), 6);
- check_format(Format::new(Type::AHex, 4, 0).unwrap(), 12);
- check_format(Format::new(Type::AHex, 6, 0).unwrap(), 18);
- check_format(Format::new(Type::AHex, 2000, 0).unwrap(), 6000);
- check_format(Format::new(Type::AHex, 20000, 0).unwrap(), 60000);
- check_format(Format::new(Type::AHex, 30000, 0).unwrap(), 65534);
-
- check_format(Format::new(Type::F, 40, 0).unwrap(), 40);
- }
-}
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Lexical analysis for PSPP syntax.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into three
+//! phases:
+//!
+//! 1. A low level called "segmentation", implemented in the [segment] module.
+//! This labels syntax strings with [Segment](segment::Segment)s.
+//!
+//! 2. A middle level called "scanning", implemented in the [scan] module.
+//! This transforms and merges segments to form [Token]s.
+//!
+//! 3. A high level called "lexing", implemented in the [lexer] module. Lexing
+//! brings together multiple source files and invokes macro expansion on the
+//! tokens output by the scanner.
+
+// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
+#![cfg_attr(not(test), warn(missing_docs))]
+
+pub mod command_name;
+pub mod lexer;
+pub mod scan;
+pub mod segment;
+mod token;
+pub use token::{Punct, Token};
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-//! Lexical analysis for PSPP syntax.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into three
-//! phases:
-//!
-//! 1. A low level called "segmentation", implemented in the [segment] module.
-//! This labels syntax strings with [Segment](segment::Segment)s.
-//!
-//! 2. A middle level called "scanning", implemented in the [scan] module.
-//! This transforms and merges segments to form [Token]s.
-//!
-//! 3. A high level called "lexing", implemented in the [lexer] module. Lexing
-//! brings together multiple source files and invokes macro expansion on the
-//! tokens output by the scanner.
-
-// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
-#![cfg_attr(not(test), warn(missing_docs))]
-
-pub mod command_name;
-pub mod lexer;
-pub mod scan;
-pub mod segment;
-mod token;
-pub use token::{Punct, Token};
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Mid-level lexical analysis.
+//!
+//! This module implements mid-level lexical analysis using the segments
+//! output by the lower-level [segmentation phase](super::segment).
+//!
+//! Scanning accepts as input a stream of segments, which are UTF-8 strings
+//! labeled with a [segment type](super::segment::Segment). It outputs a stream
+//! of [Token]s used by the PSPP parser or an error.
+
+use crate::identifier::{Identifier, ReservedWord};
+
+use super::{
+ segment::{Segment, Segmenter, Syntax},
+ token::{Punct, Token},
+};
+use std::collections::VecDeque;
+use thiserror::Error as ThisError;
+
+/// Error returned by [Segment::to_token] when a segment cannot be converted
+/// into a [Token].
+///
+/// [merge_tokens] itself never produces a [ScanError]; it only reports
+/// [Incomplete].
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
+pub enum ScanError {
+ /// Unterminated string constant.
+ #[error("Unterminated string constant.")]
+ ExpectedQuote,
+
+ /// Missing exponent. The payload is the text of the offending segment.
+ #[error("Missing exponent following `{0}`")]
+ ExpectedExponent(String),
+
+ /// Odd length hex string. The payload is the number of hex digits found
+ /// between the quotes.
+ #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
+ OddLengthHexString(usize),
+
+ /// Invalid hex digit. The payload is the first character in the hex string
+ /// that is not an ASCII hex digit.
+ #[error("Invalid hex digit {0:?}.")]
+ BadHexDigit(char),
+
+ /// Incomplete UTF-8 sequence: the bytes encoded by a hex string end in the
+ /// middle of a UTF-8 sequence.
+ #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
+ IncompleteUtf8 {
+ /// Incomplete sequence, as the hex digits from `offset` to the end of
+ /// the hex string.
+ substring: String,
+ /// Offset of start of sequence, counted in hex digits.
+ offset: usize,
+ },
+
+ /// Bad UTF-8 sequence: the bytes encoded by a hex string contain a
+ /// sequence that is not valid UTF-8.
+ #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
+ BadUtf8 {
+ /// Invalid sequence, as hex digits.
+ substring: String,
+ /// Offset of start of sequence, counted in hex digits.
+ offset: usize,
+ },
+
+ /// Invalid length Unicode string. The payload is the length in bytes of
+ /// the text between the quotes.
+ #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
+ BadLengthUnicodeString(usize),
+
+ /// Invalid code point. The payload is the number that was parsed but that
+ /// is not a Unicode scalar value.
+ #[error("U+{0:04X} is not a valid Unicode code point.")]
+ BadCodePoint(u32),
+
+ /// Expected hexadecimal Unicode code point.
+ #[error("Expected hexadecimal Unicode code point.")]
+ ExpectedCodePoint,
+
+ /// `DO REPEAT` nested too deeply.
+ #[error("`DO REPEAT` nested too deeply.")]
+ DoRepeatOverflow,
+
+ /// Unexpected character. The payload is the first character of the
+ /// offending segment.
+ #[error("Unexpected character {0:?} in input.")]
+ UnexpectedChar(char),
+}
+
+/// The action returned by [merge_tokens].
+///
+/// It tells the caller how to turn the tokens at the front of its input into
+/// one output token.
+#[derive(Clone, Debug)]
+pub enum MergeAction {
+ /// Copy one token literally from input to output.
+ Copy,
+
+ /// Expand `n` tokens from the input into `token` in the output.
+ ///
+ /// That is, the first `n` input tokens are consumed and the single `token`
+ /// is emitted in their place.
+ Expand {
+ /// Number of input tokens that `token` replaces.
+ n: usize,
+
+ /// Replacement token.
+ token: Token,
+ },
+}
+
+/// Used by [merge_tokens] to indicate that more input is needed.
+///
+/// The `input` callback passed to [merge_tokens] returns it for a token index
+/// that is not available yet, and [merge_tokens] passes it on to its own
+/// caller.
+#[derive(Copy, Clone, Debug)]
+pub struct Incomplete;
+
+impl Segment {
+    /// Tries to transform this segment, which was obtained for `s`, into a
+    /// token.  Returns one of:
+    ///
+    /// - `None`: This segment doesn't correspond to any token (because it is a
+    ///   comment, white space, etc.) and can be dropped in tokenization.
+    ///
+    /// - `Some(Ok(token))`: This segment corresponds to the given token.
+    ///
+    /// - `Some(Err(error))`: The segment contains an error, which the caller
+    ///   should report.
+    ///
+    /// The raw token (or error) that this function returns should ordinarily be
+    /// merged with adjacent tokens with [merge_tokens] or some higher-level
+    /// construct.
+    ///
+    /// # Panics
+    ///
+    /// The `unwrap` and `unreachable!` calls below rely on `s` having the form
+    /// that goes with this segment type (for example, a `Number` segment must
+    /// parse as a number, and a quoted string must start and end with the same
+    /// quote character).  Passing text that does not match the segment type
+    /// panics.
+    pub fn to_token(self, s: &str) -> Option<Result<Token, ScanError>> {
+        match self {
+            Segment::Number => Some(Ok(Token::Number(s.parse().unwrap()))),
+            Segment::QuotedString => {
+                // Trim quote mark from front and back.
+                let mut chars = s.chars();
+                let quote = chars.next().unwrap();
+                let s = chars.as_str().strip_suffix(quote).unwrap();
+
+                // Replace doubled quotes by single ones.
+                let (single_quote, double_quote) = match quote {
+                    '\'' => ("'", "''"),
+                    '"' => ("\"", "\"\""),
+                    _ => unreachable!(),
+                };
+                Some(Ok(Token::String(s.replace(double_quote, single_quote))))
+            }
+            Segment::HexString => {
+                // Strip `X"` prefix and `"` suffix (or variations).
+                let s = &s[2..s.len() - 1];
+
+                // Report the first non-hex character, if any.  After this
+                // check `s` is pure ASCII, so byte offsets equal digit counts.
+                if let Some(c) = s.chars().find(|c| !c.is_ascii_hexdigit()) {
+                    return Some(Err(ScanError::BadHexDigit(c)));
+                }
+                if s.len() % 2 != 0 {
+                    return Some(Err(ScanError::OddLengthHexString(s.len())));
+                }
+
+                // Decode each pair of hex digits into one byte.
+                let bytes = s
+                    .as_bytes()
+                    .chunks_exact(2)
+                    .map(|pair| {
+                        let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
+                        let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
+                        hi * 16 + lo
+                    })
+                    .collect::<Vec<_>>();
+                match String::from_utf8(bytes) {
+                    Ok(string) => Some(Ok(Token::String(string))),
+                    Err(error) => {
+                        // Translate byte positions in the decoded data back
+                        // into hex-digit positions (two digits per byte).
+                        let details = error.utf8_error();
+                        let offset = details.valid_up_to() * 2;
+                        let end = details
+                            .error_len()
+                            .map(|len| offset + len * 2)
+                            .unwrap_or(s.len());
+                        let substring = String::from(&s[offset..end]);
+                        // `error_len()` is `None` when the data ended in the
+                        // middle of a sequence rather than being invalid.
+                        Some(Err(if details.error_len().is_some() {
+                            ScanError::BadUtf8 { substring, offset }
+                        } else {
+                            ScanError::IncompleteUtf8 { substring, offset }
+                        }))
+                    }
+                }
+            }
+            Segment::UnicodeString => {
+                // Strip `U"` prefix and `"` suffix (or variations).
+                let s = &s[2..s.len() - 1];
+                if !(1..=8).contains(&s.len()) {
+                    return Some(Err(ScanError::BadLengthUnicodeString(s.len())));
+                }
+                // `u32::from_str_radix` accepts an optional leading `+`, which
+                // is not part of a code point, so insist on hex digits only
+                // before parsing.
+                if !s.bytes().all(|b| b.is_ascii_hexdigit()) {
+                    return Some(Err(ScanError::ExpectedCodePoint));
+                }
+                let Ok(code_point) = u32::from_str_radix(s, 16) else {
+                    return Some(Err(ScanError::ExpectedCodePoint));
+                };
+                let Some(c) = char::from_u32(code_point) else {
+                    return Some(Err(ScanError::BadCodePoint(code_point)));
+                };
+                Some(Ok(Token::String(String::from(c))))
+            }
+
+            // These segments become string tokens with their text unchanged.
+            Segment::UnquotedString
+            | Segment::DoRepeatCommand
+            | Segment::InlineData
+            | Segment::Document
+            | Segment::MacroBody
+            | Segment::MacroName => Some(Ok(Token::String(String::from(s)))),
+
+            Segment::Identifier => {
+                // Reserved words map to the same tokens as the corresponding
+                // punctuators; anything else is an ordinary identifier.
+                if let Ok(reserved_word) = ReservedWord::try_from(s) {
+                    match reserved_word {
+                        ReservedWord::And => Some(Ok(Token::Punct(Punct::And))),
+                        ReservedWord::Or => Some(Ok(Token::Punct(Punct::Or))),
+                        ReservedWord::Not => Some(Ok(Token::Punct(Punct::Not))),
+                        ReservedWord::Eq => Some(Ok(Token::Punct(Punct::Eq))),
+                        ReservedWord::Ge => Some(Ok(Token::Punct(Punct::Ge))),
+                        ReservedWord::Gt => Some(Ok(Token::Punct(Punct::Gt))),
+                        ReservedWord::Le => Some(Ok(Token::Punct(Punct::Le))),
+                        ReservedWord::Lt => Some(Ok(Token::Punct(Punct::Lt))),
+                        ReservedWord::Ne => Some(Ok(Token::Punct(Punct::Ne))),
+                        ReservedWord::All => Some(Ok(Token::Punct(Punct::All))),
+                        ReservedWord::By => Some(Ok(Token::Punct(Punct::By))),
+                        ReservedWord::To => Some(Ok(Token::Punct(Punct::To))),
+                        ReservedWord::With => Some(Ok(Token::Punct(Punct::With))),
+                    }
+                } else {
+                    Some(Ok(Token::Id(Identifier::new(s).unwrap())))
+                }
+            }
+            Segment::Punct => match s {
+                "(" => Some(Ok(Token::Punct(Punct::LParen))),
+                ")" => Some(Ok(Token::Punct(Punct::RParen))),
+                "[" => Some(Ok(Token::Punct(Punct::LSquare))),
+                "]" => Some(Ok(Token::Punct(Punct::RSquare))),
+                "{" => Some(Ok(Token::Punct(Punct::LCurly))),
+                "}" => Some(Ok(Token::Punct(Punct::RCurly))),
+                "," => Some(Ok(Token::Punct(Punct::Comma))),
+                "=" => Some(Ok(Token::Punct(Punct::Equals))),
+                "-" => Some(Ok(Token::Punct(Punct::Dash))),
+                "&" => Some(Ok(Token::Punct(Punct::And))),
+                "|" => Some(Ok(Token::Punct(Punct::Or))),
+                "+" => Some(Ok(Token::Punct(Punct::Plus))),
+                "/" => Some(Ok(Token::Punct(Punct::Slash))),
+                "*" => Some(Ok(Token::Punct(Punct::Asterisk))),
+                "<" => Some(Ok(Token::Punct(Punct::Lt))),
+                ">" => Some(Ok(Token::Punct(Punct::Gt))),
+                "~" => Some(Ok(Token::Punct(Punct::Not))),
+                ":" => Some(Ok(Token::Punct(Punct::Colon))),
+                ";" => Some(Ok(Token::Punct(Punct::Semicolon))),
+                "**" => Some(Ok(Token::Punct(Punct::Exp))),
+                "<=" => Some(Ok(Token::Punct(Punct::Le))),
+                "<>" => Some(Ok(Token::Punct(Punct::Ne))),
+                "~=" => Some(Ok(Token::Punct(Punct::Ne))),
+                ">=" => Some(Ok(Token::Punct(Punct::Ge))),
+                "!" => Some(Ok(Token::Punct(Punct::Bang))),
+                "%" => Some(Ok(Token::Punct(Punct::Percent))),
+                "?" => Some(Ok(Token::Punct(Punct::Question))),
+                "`" => Some(Ok(Token::Punct(Punct::Backtick))),
+                "_" => Some(Ok(Token::Punct(Punct::Underscore))),
+                "." => Some(Ok(Token::Punct(Punct::Dot))),
+                "!*" => Some(Ok(Token::Punct(Punct::BangAsterisk))),
+                _ => unreachable!("bad punctuator {s:?}"),
+            },
+            // Segments that produce no token at all.
+            Segment::Shbang
+            | Segment::Spaces
+            | Segment::Comment
+            | Segment::Newline
+            | Segment::CommentCommand => None,
+            Segment::DoRepeatOverflow => Some(Err(ScanError::DoRepeatOverflow)),
+            Segment::StartDocument => Some(Ok(Token::Id(Identifier::new("DOCUMENT").unwrap()))),
+            Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
+                Some(Ok(Token::End))
+            }
+            Segment::ExpectedQuote => Some(Err(ScanError::ExpectedQuote)),
+            Segment::ExpectedExponent => Some(Err(ScanError::ExpectedExponent(String::from(s)))),
+            Segment::UnexpectedChar => {
+                Some(Err(ScanError::UnexpectedChar(s.chars().next().unwrap())))
+            }
+        }
+    }
+}
+
+/// Tries to combine the tokens at the front of the input into one token.
+///
+/// `input` is a lookup function: given a 0-based token index, it returns
+///
+/// * `Ok(Some(token))`: The token with the given index.
+///
+/// * `Ok(None)`: End of input.
+///
+/// * `Err(Incomplete)`: The given token isn't available yet (it may or may not
+///   exist).
+///
+/// The result is one of:
+///
+/// * `Ok(Some(MergeAction))`: How to transform one or more input tokens into an
+///   output token.
+///
+/// * `Ok(None)`: End of input.  (Only returned if `input(0)` is `Ok(None)`.)
+///
+/// * `Err(Incomplete)`: More input tokens are needed.  Call again with longer
+///   `input`.  ([Token::End] or `Token::Punct(Punct::EndCmd)` is always
+///   sufficient as extra input.)
+///
+/// Two kinds of merging are performed:
+///
+/// - String concatenation, where syntax like `"a" + "b"` is converted into a
+///   single string token.  This is definitely needed because the parser relies
+///   on it.
+///
+/// - Negative number merging, where syntax like `-5` is converted from a pair
+///   of tokens (a dash and a positive number) into a single token (a negative
+///   number).  This might not be needed anymore because the segmenter
+///   directly treats a dash followed by a number, with optional intervening
+///   white space, as a negative number.  It's only needed if we want
+///   intervening comments to be allowed or for part of the negative number
+///   token to be produced by macro expansion.
+pub fn merge_tokens<'a, F>(input: F) -> Result<Option<MergeAction>, Incomplete>
+where
+    F: Fn(usize) -> Result<Option<&'a Token>, Incomplete>,
+{
+    let Some(first) = input(0)? else {
+        return Ok(None);
+    };
+
+    let action = match first {
+        // A dash directly followed by a positive number becomes a negative
+        // number.
+        Token::Punct(Punct::Dash) => match input(1)? {
+            Some(Token::Number(value)) if value.is_sign_positive() => {
+                let negated = -*value;
+                MergeAction::Expand {
+                    n: 2,
+                    token: Token::Number(negated),
+                }
+            }
+            _ => MergeAction::Copy,
+        },
+
+        Token::String(_) => {
+            // Count how many `+ string` pairs follow the leading string.
+            let mut n_pairs = 0;
+            loop {
+                let plus_follows =
+                    matches!(input(2 * n_pairs + 1)?, Some(Token::Punct(Punct::Plus)));
+                if !plus_follows {
+                    break;
+                }
+                let string_follows = matches!(input(2 * n_pairs + 2)?, Some(Token::String(_)));
+                if !string_follows {
+                    break;
+                }
+                n_pairs += 1;
+            }
+
+            if n_pairs == 0 {
+                MergeAction::Copy
+            } else {
+                // The strings sit at the even indexes 0, 2, ..., 2 * n_pairs,
+                // all of which were already seen above.
+                let mut merged = String::new();
+                for k in 0..=n_pairs {
+                    let Token::String(piece) = input(2 * k).unwrap().unwrap() else {
+                        unreachable!()
+                    };
+                    merged.push_str(piece);
+                }
+                MergeAction::Expand {
+                    n: 2 * n_pairs + 1,
+                    token: Token::String(merged),
+                }
+            }
+        }
+
+        _ => MergeAction::Copy,
+    };
+    Ok(Some(action))
+}
+
+/// Too-simple lexical analyzer for strings.
+///
+/// Given a string, [StringSegmenter] provides iteration over raw tokens.
+/// Unlike [StringScanner], [StringSegmenter] does not merge tokens using
+/// [merge_tokens]. Usually merging is desirable, so [StringScanner] should be
+/// preferred.
+///
+/// Each item pairs the source text of a segment with the token, or the
+/// [ScanError], that [Segment::to_token] produced for it.
+///
+/// This is used as part of macro expansion.
+pub struct StringSegmenter<'a> {
+ /// The part of the input that has not been segmented yet.
+ input: &'a str,
+ /// Segmenter that splits `input` into segments.
+ segmenter: Segmenter,
+}
+
+impl<'a> StringSegmenter<'a> {
+    /// Builds a [StringSegmenter] that reads `input` as syntax variant `mode`.
+    /// The meaning of `is_snippet` is explained at [Segmenter::new].
+    pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self {
+        let segmenter = Segmenter::new(mode, is_snippet);
+        Self { input, segmenter }
+    }
+}
+
+impl<'a> Iterator for StringSegmenter<'a> {
+    type Item = (&'a str, Result<Token, ScanError>);
+
+    /// Returns the text and token (or error) for the next segment that
+    /// corresponds to a token, skipping segments that yield no token, or
+    /// `None` once the segmenter reports no further segments.
+    fn next(&mut self) -> Option<Self::Item> {
+        // The whole input is available, so the segmenter is always told that
+        // it has reached end of input.
+        while let Some((length, segment)) = self.segmenter.push(self.input, true).unwrap() {
+            let (text, remainder) = self.input.split_at(length);
+            self.input = remainder;
+
+            if let Some(token) = segment.to_token(text) {
+                return Some((text, token));
+            }
+        }
+        None
+    }
+}
+
+/// Simple lexical analyzer for strings.
+///
+/// Given a string, [StringScanner] provides iteration over tokens. Each item
+/// is either a [Token], after merging with [merge_tokens], or a [ScanError].
+pub struct StringScanner<'a> {
+ /// The part of the input that has not been consumed yet.
+ input: &'a str,
+ /// Set once the segmenter has reported that there are no more segments.
+ eof: bool,
+ /// Segmenter that splits `input` into segments.
+ segmenter: Segmenter,
+ /// Tokens obtained from segments that have not yet been merged and
+ /// returned.
+ tokens: VecDeque<Token>,
+}
+
+impl<'a> StringScanner<'a> {
+    /// Builds a [StringScanner] that reads `input` as syntax variant `mode`.
+    /// The meaning of `is_snippet` is explained at [Segmenter::new].
+    pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self {
+        let segmenter = Segmenter::new(mode, is_snippet);
+        Self {
+            input,
+            eof: false,
+            segmenter,
+            tokens: VecDeque::with_capacity(1),
+        }
+    }
+
+    /// Runs [merge_tokens] over the queued tokens and removes the tokens it
+    /// consumed from the queue.
+    ///
+    /// When `eof` is true, running past the end of the queue counts as end of
+    /// input; otherwise it is reported as [Incomplete].
+    fn merge(&mut self, eof: bool) -> Result<Option<Result<Token, ScanError>>, Incomplete> {
+        let action = merge_tokens(|index| match self.tokens.get(index) {
+            Some(token) => Ok(Some(token)),
+            None if eof => Ok(None),
+            None => Err(Incomplete),
+        })?;
+
+        Ok(match action {
+            Some(MergeAction::Copy) => Some(Ok(self.tokens.pop_front().unwrap())),
+            Some(MergeAction::Expand { n, token }) => {
+                self.tokens.drain(..n);
+                Some(Ok(token))
+            }
+            None => None,
+        })
+    }
+
+    /// Transforms this [StringScanner] into an iterator that yields bare
+    /// [Token]s.
+    ///
+    /// # Panics
+    ///
+    /// The returned iterator unwraps every item, so it panics when the
+    /// scanner produces a [ScanError].
+    pub fn unwrapped(self) -> impl Iterator<Item = Token> + use<'a> {
+        self.map(|result| result.ok().unwrap())
+    }
+}
+
+impl Iterator for StringScanner<'_> {
+ type Item = Result<Token, ScanError>;
+
+ /// Returns the next merged token or scan error, or `None` when the input
+ /// and the queue of pending tokens are both exhausted.
+ fn next(&mut self) -> Option<Self::Item> {
+ loop {
+ // First try to produce a token from what is already queued. Before
+ // end of input this fails with `Incomplete` whenever merging would
+ // need to look past the end of the queue, in which case we go on to
+ // read another segment.
+ if let Ok(Some(token)) = self.merge(self.eof) {
+ return Some(token);
+ }
+
+ let Some((seg_len, seg_type)) = self.segmenter.push(self.input, true).unwrap() else {
+ // No more segments. With `eof` set to true, `merge` never
+ // reports `Incomplete`, so the `unwrap` cannot fail.
+ self.eof = true;
+ return self.merge(true).unwrap();
+ };
+ let (s, rest) = self.input.split_at(seg_len);
+
+ match seg_type.to_token(s) {
+ Some(Err(error)) => {
+ // Hand out the tokens queued ahead of the error first. In
+ // that case `self.input` is not advanced, so the same
+ // segment is examined again on the next call.
+ // NOTE(review): `push` has already run on the segmenter at
+ // this point; confirm that pushing the same input again
+ // yields the same segment.
+ if let Ok(Some(token)) = self.merge(true) {
+ return Some(token);
+ }
+ self.input = rest;
+ return Some(Err(error));
+ }
+ Some(Ok(token)) => {
+ self.tokens.push_back(token);
+ }
+ // The segment does not correspond to any token; just skip it.
+ None => (),
+ }
+ self.input = rest;
+ }
+ }
+}
+
+#[cfg(test)]
+mod test;
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-//! Mid-level lexical analysis.
-//!
-//! This module implements mid-level lexical analysis using the segments
-//! output by the lower-level [segmentation phase](super::segment).
-//!
-//! Scanning accepts as input a stream of segments, which are UTF-8 strings
-//! labeled with a [segment type](super::segment::Segment). It outputs a stream
-//! of [Token]s used by the PSPP parser or an error.
-
-use crate::identifier::{Identifier, ReservedWord};
-
-use super::{
- segment::{Segment, Segmenter, Syntax},
- token::{Punct, Token},
-};
-use std::collections::VecDeque;
-use thiserror::Error as ThisError;
-
-/// Error returned by [merge_tokens].
-#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
-pub enum ScanError {
- /// Unterminated string constant.
- #[error("Unterminated string constant.")]
- ExpectedQuote,
-
- /// Missing exponent.
- #[error("Missing exponent following `{0}`")]
- ExpectedExponent(String),
-
- /// Odd length hex string.
- #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
- OddLengthHexString(usize),
-
- /// Invalid hex digit.
- #[error("Invalid hex digit {0:?}.")]
- BadHexDigit(char),
-
- /// Incomplete UTF-8 sequence.
- #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
- IncompleteUtf8 {
- /// Incomplete sequence.
- substring: String,
- /// Offset of start of sequence.
- offset: usize,
- },
-
- /// Bad UTF-8 sequence.
- #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
- BadUtf8 {
- /// Invalid sequence.
- substring: String,
- /// Offset of start of sequence.
- offset: usize,
- },
-
- /// Invalid length Unicode string.
- #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
- BadLengthUnicodeString(usize),
-
- /// Invalid code point.
- #[error("U+{0:04X} is not a valid Unicode code point.")]
- BadCodePoint(u32),
-
- /// Expected hexadecimal Unicode code point
- #[error("Expected hexadecimal Unicode code point.")]
- ExpectedCodePoint,
-
- /// `DO REPEAT` nested too deeply.
- #[error("`DO REPEAT` nested too deeply.")]
- DoRepeatOverflow,
-
- /// Unexpected character.
- #[error("Unexpected character {0:?} in input.")]
- UnexpectedChar(char),
-}
-
-/// The action returned by [merge_tokens].
-#[derive(Clone, Debug)]
-pub enum MergeAction {
- /// Copy one token literally from input to output.
- Copy,
-
- /// Expand `n` tokens from the input into `token` in the output.
- Expand {
- /// Number of tokens to expand.
- n: usize,
-
- /// Replacement token.
- token: Token,
- },
-}
-
-/// Used by [merge_tokens] to indicate that more input is needed.
-#[derive(Copy, Clone, Debug)]
-pub struct Incomplete;
-
-impl Segment {
- /// Tries to transform this segment, which was obtained for `s`, into a
- /// token. Returns one of:
- ///
- /// - `None`: This segment doesn't correspond to any token (because it is a
- /// comment, white space, etc.) and can be dropped in tokenization.
- ///
- /// - `Some(Ok(token))`: This segment corresponds to the given token.
- ///
- /// - `Some(Err(error))`: The segment contains an error, which the caller
- /// should report.
- ///
- /// The raw token (or error) that this function returns should ordinarily be
- /// merged with adjacent tokens with [merge_tokens] or some higher-level
- /// construct.
- pub fn to_token(self, s: &str) -> Option<Result<Token, ScanError>> {
- match self {
- Segment::Number => Some(Ok(Token::Number(s.parse().unwrap()))),
- Segment::QuotedString => {
- // Trim quote mark from front and back.
- let mut chars = s.chars();
- let quote = chars.next().unwrap();
- let s = chars.as_str().strip_suffix(quote).unwrap();
-
- // Replace doubled quotes by single ones.
- let (single_quote, double_quote) = match quote {
- '\'' => ("'", "''"),
- '"' => ("\"", "\"\""),
- _ => unreachable!(),
- };
- Some(Ok(Token::String(s.replace(double_quote, single_quote))))
- }
- Segment::HexString => {
- // Strip `X"` prefix and `"` suffix (or variations).
- let s = &s[2..s.len() - 1];
- for c in s.chars() {
- if !c.is_ascii_hexdigit() {
- return Some(Err(ScanError::BadHexDigit(c)));
- }
- }
- if s.len() % 2 != 0 {
- return Some(Err(ScanError::OddLengthHexString(s.len())));
- }
- let bytes = s
- .as_bytes()
- .chunks_exact(2)
- .map(|pair| {
- let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
- let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
- hi * 16 + lo
- })
- .collect::<Vec<_>>();
- match String::from_utf8(bytes) {
- Ok(string) => Some(Ok(Token::String(string))),
- Err(error) => {
- let details = error.utf8_error();
- let offset = details.valid_up_to() * 2;
- let end = details
- .error_len()
- .map(|len| offset + len * 2)
- .unwrap_or(s.len());
- let substring = String::from(&s[offset..end]);
- Some(Err(if details.error_len().is_some() {
- ScanError::BadUtf8 { substring, offset }
- } else {
- ScanError::IncompleteUtf8 { substring, offset }
- }))
- }
- }
- }
- Segment::UnicodeString => {
- // Strip `U"` prefix and `"` suffix (or variations).
- let s = &s[2..s.len() - 1];
- if !(1..=8).contains(&s.len()) {
- return Some(Err(ScanError::BadLengthUnicodeString(s.len())));
- }
- let Ok(code_point) = u32::from_str_radix(s, 16) else {
- return Some(Err(ScanError::ExpectedCodePoint));
- };
- let Some(c) = char::from_u32(code_point) else {
- return Some(Err(ScanError::BadCodePoint(code_point)));
- };
- Some(Ok(Token::String(String::from(c))))
- }
-
- Segment::UnquotedString
- | Segment::DoRepeatCommand
- | Segment::InlineData
- | Segment::Document
- | Segment::MacroBody
- | Segment::MacroName => Some(Ok(Token::String(String::from(s)))),
-
- Segment::Identifier => {
- if let Ok(reserved_word) = ReservedWord::try_from(s) {
- match reserved_word {
- ReservedWord::And => Some(Ok(Token::Punct(Punct::And))),
- ReservedWord::Or => Some(Ok(Token::Punct(Punct::Or))),
- ReservedWord::Not => Some(Ok(Token::Punct(Punct::Not))),
- ReservedWord::Eq => Some(Ok(Token::Punct(Punct::Eq))),
- ReservedWord::Ge => Some(Ok(Token::Punct(Punct::Ge))),
- ReservedWord::Gt => Some(Ok(Token::Punct(Punct::Gt))),
- ReservedWord::Le => Some(Ok(Token::Punct(Punct::Le))),
- ReservedWord::Lt => Some(Ok(Token::Punct(Punct::Lt))),
- ReservedWord::Ne => Some(Ok(Token::Punct(Punct::Ne))),
- ReservedWord::All => Some(Ok(Token::Punct(Punct::All))),
- ReservedWord::By => Some(Ok(Token::Punct(Punct::By))),
- ReservedWord::To => Some(Ok(Token::Punct(Punct::To))),
- ReservedWord::With => Some(Ok(Token::Punct(Punct::With))),
- }
- } else {
- Some(Ok(Token::Id(Identifier::new(s).unwrap())))
- }
- }
- Segment::Punct => match s {
- "(" => Some(Ok(Token::Punct(Punct::LParen))),
- ")" => Some(Ok(Token::Punct(Punct::RParen))),
- "[" => Some(Ok(Token::Punct(Punct::LSquare))),
- "]" => Some(Ok(Token::Punct(Punct::RSquare))),
- "{" => Some(Ok(Token::Punct(Punct::LCurly))),
- "}" => Some(Ok(Token::Punct(Punct::RCurly))),
- "," => Some(Ok(Token::Punct(Punct::Comma))),
- "=" => Some(Ok(Token::Punct(Punct::Equals))),
- "-" => Some(Ok(Token::Punct(Punct::Dash))),
- "&" => Some(Ok(Token::Punct(Punct::And))),
- "|" => Some(Ok(Token::Punct(Punct::Or))),
- "+" => Some(Ok(Token::Punct(Punct::Plus))),
- "/" => Some(Ok(Token::Punct(Punct::Slash))),
- "*" => Some(Ok(Token::Punct(Punct::Asterisk))),
- "<" => Some(Ok(Token::Punct(Punct::Lt))),
- ">" => Some(Ok(Token::Punct(Punct::Gt))),
- "~" => Some(Ok(Token::Punct(Punct::Not))),
- ":" => Some(Ok(Token::Punct(Punct::Colon))),
- ";" => Some(Ok(Token::Punct(Punct::Semicolon))),
- "**" => Some(Ok(Token::Punct(Punct::Exp))),
- "<=" => Some(Ok(Token::Punct(Punct::Le))),
- "<>" => Some(Ok(Token::Punct(Punct::Ne))),
- "~=" => Some(Ok(Token::Punct(Punct::Ne))),
- ">=" => Some(Ok(Token::Punct(Punct::Ge))),
- "!" => Some(Ok(Token::Punct(Punct::Bang))),
- "%" => Some(Ok(Token::Punct(Punct::Percent))),
- "?" => Some(Ok(Token::Punct(Punct::Question))),
- "`" => Some(Ok(Token::Punct(Punct::Backtick))),
- "_" => Some(Ok(Token::Punct(Punct::Underscore))),
- "." => Some(Ok(Token::Punct(Punct::Dot))),
- "!*" => Some(Ok(Token::Punct(Punct::BangAsterisk))),
- _ => unreachable!("bad punctuator {s:?}"),
- },
- Segment::Shbang
- | Segment::Spaces
- | Segment::Comment
- | Segment::Newline
- | Segment::CommentCommand => None,
- Segment::DoRepeatOverflow => Some(Err(ScanError::DoRepeatOverflow)),
- Segment::StartDocument => Some(Ok(Token::Id(Identifier::new("DOCUMENT").unwrap()))),
- Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
- Some(Ok(Token::End))
- }
- Segment::ExpectedQuote => Some(Err(ScanError::ExpectedQuote)),
- Segment::ExpectedExponent => Some(Err(ScanError::ExpectedExponent(String::from(s)))),
- Segment::UnexpectedChar => {
- Some(Err(ScanError::UnexpectedChar(s.chars().next().unwrap())))
- }
- }
- }
-}
-
-/// Attempts to merge a sequence of tokens together into a single token.
-///
-/// The tokens are taken from the beginning of `input`, which given
-/// 0-based token index returns:
-///
-/// * `Ok(Some(token))`: The token with the given index.
-///
-/// * `Ok(None)`: End of input.
-///
-/// * `Err(Incomplete)`: The given token isn't available yet (it may or may not
-/// exist).
-///
-/// This function returns one of:
-///
-/// * `Ok(Some(MergeAction))`: How to transform one or more input tokens into an
-/// output token.
-///
-/// * `Ok(None)`: End of input. (Only returned if `input(0)` is `Ok(None)`.)
-///
-/// * `Err(Incomplete)`: More input tokens are needed. Call again with longer
-/// `input`. ([Token::End] or [Token::Punct(Punct::EndCmd)] is
-/// always sufficient as extra input.)
-///
-/// This performs two different kinds of token merging:
-///
-/// - String concatenation, where syntax like `"a" + "b"` is converted into a
-/// single string token. This is definitely needed because the parser relies
-/// on it.
-///
-/// - Negative number merging, where syntax like `-5` is converted from a pair
-/// of tokens (a dash and a positive number) into a single token (a negative
-/// number). This might not be needed anymore because the segmenter
-/// directly treats a dash followed by a number, with optional intervening
-/// white space, as a negative number. It's only needed if we want
-/// intervening comments to be allowed or for part of the negative number
-/// token to be produced by macro expansion.
-pub fn merge_tokens<'a, F>(input: F) -> Result<Option<MergeAction>, Incomplete>
-where
- F: Fn(usize) -> Result<Option<&'a Token>, Incomplete>,
-{
- let Some(token) = input(0)? else {
- return Ok(None);
- };
- match token {
- Token::Punct(Punct::Dash) => match input(1)? {
- Some(Token::Number(number)) if number.is_sign_positive() => {
- let number = *number;
- Ok(Some(MergeAction::Expand {
- n: 2,
- token: Token::Number(-number),
- }))
- }
- _ => Ok(Some(MergeAction::Copy)),
- },
- Token::String(_) => {
- let mut i = 0;
- while matches!(input(i * 2 + 1)?, Some(Token::Punct(Punct::Plus)))
- && matches!(input(i * 2 + 2)?, Some(Token::String(_)))
- {
- i += 1;
- }
- if i == 0 {
- Ok(Some(MergeAction::Copy))
- } else {
- let mut output = String::new();
- for i in 0..=i {
- let Token::String(s) = input(i * 2).unwrap().unwrap() else {
- unreachable!()
- };
- output.push_str(s);
- }
- Ok(Some(MergeAction::Expand {
- n: i * 2 + 1,
- token: Token::String(output),
- }))
- }
- }
- _ => Ok(Some(MergeAction::Copy)),
- }
-}
-
-/// Too-simple lexical analyzer for strings.
-///
-/// Given a string, [StringSegmenter] provides iteration over raw tokens.
-/// Unlike [StringScanner], [StringSegmenter] does not merge tokens using
-/// [merge_tokens]. Usually merging is desirable, so [StringScanner] should be
-/// preferred.
-///
-/// This is used as part of macro expansion.
-pub struct StringSegmenter<'a> {
- input: &'a str,
- segmenter: Segmenter,
-}
-
-impl<'a> StringSegmenter<'a> {
- /// Creates a new [StringSegmenter] for `input` using syntax variant `mode`.
- /// See [Segmenter::new] for an explanation of `is_snippet`.
- pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self {
- Self {
- input,
- segmenter: Segmenter::new(mode, is_snippet),
- }
- }
-}
-
-impl<'a> Iterator for StringSegmenter<'a> {
- type Item = (&'a str, Result<Token, ScanError>);
-
- fn next(&mut self) -> Option<Self::Item> {
- loop {
- let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap()?;
- let (s, rest) = self.input.split_at(seg_len);
- self.input = rest;
-
- if let Some(token) = seg_type.to_token(s) {
- return Some((s, token));
- }
- }
- }
-}
-
-/// Simple lexical analyzer for strings.
-///
-/// Given a string, [StringScanner] provides iteration over tokens.
-pub struct StringScanner<'a> {
- input: &'a str,
- eof: bool,
- segmenter: Segmenter,
- tokens: VecDeque<Token>,
-}
-
-impl<'a> StringScanner<'a> {
- /// Creates a new [StringScanner] for `input` using syntax variant `mode`.
- /// See [Segmenter::new] for an explanation of `is_snippet`.
- pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self {
- Self {
- input,
- eof: false,
- segmenter: Segmenter::new(mode, is_snippet),
- tokens: VecDeque::with_capacity(1),
- }
- }
-
- fn merge(&mut self, eof: bool) -> Result<Option<Result<Token, ScanError>>, Incomplete> {
- match merge_tokens(|index| {
- if let Some(token) = self.tokens.get(index) {
- Ok(Some(token))
- } else if eof {
- Ok(None)
- } else {
- Err(Incomplete)
- }
- })? {
- Some(MergeAction::Copy) => Ok(Some(Ok(self.tokens.pop_front().unwrap()))),
- Some(MergeAction::Expand { n, token }) => {
- self.tokens.drain(..n);
- Ok(Some(Ok(token)))
- }
- None => Ok(None),
- }
- }
-
- /// Transforms this [StringScanner] into an iterator that includes only the
- /// [Token]s, omitting [ScanError]s.
- pub fn unwrapped(self) -> impl Iterator<Item = Token> + use<'a> {
- self.map(|scan_token| scan_token.ok().unwrap())
- }
-}
-
-impl Iterator for StringScanner<'_> {
- type Item = Result<Token, ScanError>;
-
- fn next(&mut self) -> Option<Self::Item> {
- loop {
- if let Ok(Some(token)) = self.merge(self.eof) {
- return Some(token);
- }
-
- let Some((seg_len, seg_type)) = self.segmenter.push(self.input, true).unwrap() else {
- self.eof = true;
- return self.merge(true).unwrap();
- };
- let (s, rest) = self.input.split_at(seg_len);
-
- match seg_type.to_token(s) {
- Some(Err(error)) => {
- if let Ok(Some(token)) = self.merge(true) {
- return Some(token);
- }
- self.input = rest;
- return Some(Err(error));
- }
- Some(Ok(token)) => {
- self.tokens.push_back(token);
- }
- None => (),
- }
- self.input = rest;
- }
- }
-}
-
-#[cfg(test)]
-mod test;
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Low-level lexical analysis.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into [three
+//! phases](super). This module implements the low-level segmentation phase.
+//!
+//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
+//! (a segment type) for each byte or contiguous sequence of bytes in the input.
+//! It also, in a few corner cases, outputs zero-width segments that label the
+//! boundary between a pair of bytes in the input.
+//!
+//! Some segment types correspond directly to tokens; for example,
+//! [Segment::Identifier] becomes [Token::Id] later in lexical analysis. Other
+//! segments contribute to tokens but do not correspond directly; for example,
+//! multiple quoted strings [Segment::QuotedString] separated by
+//! [Segment::Spaces] and "+" punctuators [Segment::Punct] may be combined to
+//! form a single string token [Token::String]. Still other segments are
+//! ignored (e.g. [Segment::Spaces]) or trigger special behavior such as error
+//! messages later in tokenization (e.g. [Segment::ExpectedQuote]).
+//!
+//! [Token::Id]: crate::lex::token::Token::Id
+//! [Token::String]: crate::lex::token::Token::String
+
+use std::cmp::Ordering;
+
+use crate::{
+ identifier::{id_match, id_match_n, IdentifierChar},
+ prompt::PromptStyle,
+};
+use bitflags::bitflags;
+
+use super::command_name::{command_match, COMMAND_NAMES};
+
+/// Syntax variant.
+///
+/// PSPP syntax is written in one of two syntax variants, which are broadly
+/// defined as follows:
+///
+/// - In interactive syntax, commands end with a period at the end of the line
+/// or with a blank line.
+///
+/// - In batch syntax, the second and subsequent lines of a command are indented
+/// from the left margin.
+///
+/// The segmenter can also try to automatically detect the kind of syntax in
+/// use, using a heuristic that is usually correct.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+pub enum Syntax {
+ /// Try to interpret input correctly regardless of whether it is written
+ /// for interactive or batch syntax.
+ ///
+ /// This is `Syntax::default()`.
+ #[default]
+ Auto,
+
+ /// Interactive syntax.
+ Interactive,
+
+ /// Batch syntax.
+ Batch,
+}
+
+/// The type of a segment.
+///
+/// A [Segment] is a label for a string slice and is normally paired with one.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Segment {
+ /// A number.
+ Number,
+
+ /// A quoted string (`'...'` or `"..."`).
+ QuotedString,
+
+ /// A hexadecimal string (`X'...'` or `X"..."`).
+ HexString,
+
+ /// A Unicode string (`U'...'` or `U"..."`).
+ UnicodeString,
+
+ /// An unquoted string.
+ ///
+ /// Unquoted strings appear only in a few special-case constructs, such as
+ /// the `FILE LABEL` command.
+ UnquotedString,
+
+ /// An identifier.
+ Identifier,
+
+ /// A punctuator or operator.
+ Punct,
+
+ /// `#!` at the beginning of a syntax file only.
+ Shbang,
+
+ /// Spaces.
+ Spaces,
+
+ /// A comment (`/* ... */`).
+ Comment,
+
+ /// New-line.
+ Newline,
+
+ /// A comment command (`* ...` or `COMMENT ...`).
+ CommentCommand,
+
+ /// In a `DO REPEAT` command, one of the lines to be repeated.
+ DoRepeatCommand,
+
+ /// Indicates `DO REPEAT` nested more deeply than supported.
+ DoRepeatOverflow,
+
+ /// A line of inline data inside `BEGIN DATA`...`END DATA`.
+ InlineData,
+
+ /// In `!DEFINE`, an identifier for the macro being defined.
+ ///
+ /// Distinguished from [Identifier](Self::Identifier) because a `MacroName`
+ /// must never be macro-expanded.
+ MacroName,
+
+ /// Contents of `!DEFINE`...`!ENDDEFINE`.
+ MacroBody,
+
+ /// Represents the `DOCUMENT` beginning a `DOCUMENT` command.
+ ///
+ /// This token is not associated with any text: the actual `DOCUMENT`
+ /// keyword is part of the following [Document](Self::Document) segment.
+ /// This is because documents include the `DOCUMENT` keyword.
+ StartDocument,
+
+ /// One of the lines of documents in a `DOCUMENT` command.
+ ///
+ /// The first line of a document includes the `DOCUMENT` keyword itself.
+ Document,
+
+ /// A command separator.
+ ///
+ /// This segment is usually for `+`, `-`, or `.` at the beginning of a line.
+ StartCommand,
+
+ /// A command separator.
+ ///
+ /// This segment is usually for a blank line. It also appears at the end of
+ /// a file.
+ SeparateCommands,
+
+ /// A command separator.
+ ///
+ /// This segment is for `.` at the end of a line.
+ EndCommand,
+
+ /// Missing quote at the end of a line.
+ ///
+ /// This segment contains a partial quoted string. It starts with a quote
+ /// mark (`"` or `'`, possibly preceded by `X` or `U`) but goes to the end
+ /// of the line without the matching end quote mark.
+ ExpectedQuote,
+
+ /// Missing exponent in number.
+ ///
+ /// This segment contains a number that ends with `E` or `E+` or `E-`
+ /// without a following exponent.
+ ExpectedExponent,
+
+ /// Unexpected character.
+ ///
+ /// The segment is a single character that isn't valid in syntax.
+ UnexpectedChar,
+}
+
+// Flags that refine the segmenter's major `State`; the two are stored
+// together as `Segmenter::state`.
+bitflags! {
+ #[derive(Copy, Clone, Debug)]
+ struct Substate: u8 {
+ // The next input is at the beginning of a line.  In the general state
+ // this routes input to `parse_start_of_line` instead of `parse_mid_line`.
+ const START_OF_LINE = 1;
+ // Set when a command separator has just been produced; `prompt()` uses
+ // it to choose the "first line" prompt over the "later line" prompt.
+ const START_OF_COMMAND = 2;
+ }
+}
+
+/// Used by [Segmenter] to indicate that more input is needed.
+///
+/// Returned as the `Err` value by [Segmenter::push] and the scanning helpers
+/// in this module when the supplied input is too short to make a decision and
+/// the caller did not signal end of input.
+#[derive(Copy, Clone, Debug)]
+pub struct Incomplete;
+
+/// Labels syntax input with [Segment]s.
+///
+/// A `Segmenter` is `Copy`, so its state can be saved and restored by value.
+#[derive(Copy, Clone)]
+pub struct Segmenter {
+ // Major state plus the `Substate` flags that refine it.
+ state: (State, Substate),
+ // Starts at 0 in `new`.  Presumably a nesting depth (e.g. for `DO REPEAT`)
+ // -- TODO confirm against the parse methods that update it.
+ nest: u8,
+ // Syntax variant chosen at construction; reported by `syntax()`.
+ syntax: Syntax,
+}
+
+impl Segmenter {
+    /// Creates a segmenter that reads the given `syntax` variant.
+    ///
+    /// `is_snippet` chooses how the very beginning of the input is treated:
+    ///
+    /// - `false`: the input is parsed as a whole file.  For example, `-` or
+    ///   `+` at the beginning of the syntax is a separator between commands,
+    ///   just as it would be at the beginning of any other line.
+    ///
+    /// - `true`: the input is parsed as an isolated piece of syntax.  For
+    ///   example, `-` or `+` at the beginning of the syntax is an operator
+    ///   token or (if followed by a digit) part of a number.
+    pub fn new(syntax: Syntax, is_snippet: bool) -> Self {
+        let initial = if is_snippet {
+            State::General
+        } else {
+            State::Shbang
+        };
+        Self {
+            state: (initial, Substate::empty()),
+            nest: 0,
+            syntax,
+        }
+    }
+
+    /// Returns the [Syntax] variant that was passed to [new](Self::new).
+    pub fn syntax(&self) -> Syntax {
+        self.syntax
+    }
+
+    /// Whether the segmenter is positioned at the beginning of a line.
+    fn start_of_line(&self) -> bool {
+        let (_, flags) = self.state;
+        flags.contains(Substate::START_OF_LINE)
+    }
+
+    /// Whether the segmenter is positioned at the beginning of a command.
+    fn start_of_command(&self) -> bool {
+        let (_, flags) = self.state;
+        flags.contains(Substate::START_OF_COMMAND)
+    }
+
+    /// Returns the style of command prompt to display to an interactive user
+    /// for input in the current state.
+    ///
+    /// The return value is most accurate with [Syntax::Interactive] syntax and
+    /// at the beginning of a line (that is, if [Segmenter::push] consumed as
+    /// much as possible of the input up to a new-line).
+    pub fn prompt(&self) -> PromptStyle {
+        // Several states prompt differently depending on whether a command
+        // has been started yet.
+        let first_or_later = if self.start_of_command() {
+            PromptStyle::First
+        } else {
+            PromptStyle::Later
+        };
+
+        match self.state.0 {
+            State::General
+            | State::DoRepeat1
+            | State::DoRepeat2
+            | State::Define1
+            | State::Define2
+            | State::Define3 => first_or_later,
+
+            State::Shbang
+            | State::Document3
+            | State::FileLabel2
+            | State::FileLabel3
+            | State::BeginData1 => PromptStyle::First,
+
+            State::FileLabel1 | State::BeginData2 => PromptStyle::Later,
+
+            State::Comment1 | State::Comment2 => PromptStyle::Comment,
+            State::Document1 | State::Document2 => PromptStyle::Document,
+            State::DoRepeat3 | State::DoRepeat4 => PromptStyle::DoRepeat,
+            State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
+            State::BeginData3 | State::BeginData4 => PromptStyle::Data,
+        }
+    }
+
+    /// Labels the segment at the start of `input`, returning the input that
+    /// follows the segment together with the segment's type.
+    ///
+    /// Empty input yields `Ok(None)` at end of input and `Err(Incomplete)`
+    /// otherwise; anything else is dispatched on the current major state.
+    fn push_rest<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        if input.is_empty() {
+            return if eof { Ok(None) } else { Err(Incomplete) };
+        }
+
+        match self.state.0 {
+            State::Shbang => self.parse_shbang(input, eof),
+            State::General if self.start_of_line() => self.parse_start_of_line(input, eof),
+            State::General => self.parse_mid_line(input, eof),
+            State::Comment1 => self.parse_comment_1(input, eof),
+            State::Comment2 => self.parse_comment_2(input, eof),
+            State::Document1 => self.parse_document_1(input, eof),
+            State::Document2 => self.parse_document_2(input, eof),
+            State::Document3 => self.parse_document_3(input, eof),
+            State::FileLabel1 => self.parse_file_label_1(input, eof),
+            State::FileLabel2 => self.parse_file_label_2(input, eof),
+            State::FileLabel3 => self.parse_file_label_3(input, eof),
+            State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
+            State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
+            State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
+            State::DoRepeat4 => self.parse_do_repeat_4(input),
+            State::Define1 | State::Define2 => self.parse_define_1_2(input, eof),
+            State::Define3 => self.parse_define_3(input, eof),
+            State::Define4 | State::Define5 => self.parse_define_4_5(input, eof),
+            State::Define6 => self.parse_define_6(input, eof),
+            State::BeginData1 => self.parse_begin_data_1(input, eof),
+            State::BeginData2 => self.parse_begin_data_2(input, eof),
+            State::BeginData3 => self.parse_begin_data_3(input, eof),
+            State::BeginData4 => self.parse_begin_data_4(input, eof),
+        }
+    }
+
+    /// Attempts to label a prefix of the remaining input with a segment type.
+    ///
+    /// The caller supplies a prefix of the remaining input as `input`.  If
+    /// `eof` is true, `input` is the entire remainder of the input; if `eof`
+    /// is false, further input is potentially available.  The input may
+    /// contain `\n` or `\r\n` line ends in any combination.
+    ///
+    /// # Return value
+    ///
+    /// - `Ok(Some((n, type)))`: the first `n` bytes of `input` (a number in
+    ///   `0..=input.len()`) form a segment of the given type.  The segmenter
+    ///   has (figuratively) consumed those bytes, so the next call must not
+    ///   include them in `input`.  Segments can have zero length, including
+    ///   segment types [Segment::SeparateCommands], [Segment::StartDocument],
+    ///   [Segment::InlineData], and [Segment::Spaces].
+    ///
+    /// - `Ok(None)`: `input` is empty and `eof` is true, so there are no more
+    ///   segments.
+    ///
+    /// - `Err(Incomplete)`: the segment type of the bytes in `input` cannot
+    ///   be determined yet.  If more input is available, the caller should
+    ///   obtain some and call again with a longer `input`, repeating as often
+    ///   as necessary.  Once input is exhausted, the caller may call again
+    ///   with `eof` set to true; this function never returns
+    ///   `Err(Incomplete)` when `eof` is true.
+    ///
+    /// # Consistency requirement
+    ///
+    /// The caller must not, in a sequence of calls, supply contradictory
+    /// input: bytes provided as part of `input` in one call, but not
+    /// consumed, must not be provided with *different* values on subsequent
+    /// calls.  The function often makes decisions by looking ahead beyond the
+    /// bytes that it consumes.
+    pub fn push(&mut self, input: &str, eof: bool) -> Result<Option<(usize, Segment)>, Incomplete> {
+        let Some((rest, seg_type)) = self.push_rest(input, eof)? else {
+            return Ok(None);
+        };
+        // `rest` is a suffix of `input`, so the difference in lengths is the
+        // number of bytes in the segment.
+        let seg_len = input.len() - rest.len();
+        Ok(Some((seg_len, seg_type)))
+    }
+}
+
+// Major state of a `Segmenter`.
+//
+// `Segmenter::push_rest` dispatches on this value to the `parse_*` method of
+// the corresponding name, and `Segmenter::prompt` maps it to a prompt style.
+// The numeric suffixes presumably denote successive stages within one
+// construct (comment command, `DOCUMENT`, `FILE LABEL`, `DO REPEAT`,
+// `DEFINE`, `BEGIN DATA`) -- confirm against the individual parse methods.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum State {
+ // Initial state when the input is a whole file rather than a snippet.
+ Shbang,
+ // Ordinary syntax; also the initial state for snippets.
+ General,
+ Comment1,
+ Comment2,
+ Document1,
+ Document2,
+ Document3,
+ FileLabel1,
+ FileLabel2,
+ FileLabel3,
+ DoRepeat1,
+ DoRepeat2,
+ DoRepeat3,
+ DoRepeat4,
+ Define1,
+ Define2,
+ Define3,
+ Define4,
+ Define5,
+ Define6,
+ BeginData1,
+ BeginData2,
+ BeginData3,
+ BeginData4,
+}
+
+/// Splits the first character off `input`.
+///
+/// Returns the character (or `None` if `input` is empty) together with the
+/// rest of `input`.  Empty input is only acceptable at end of input: when
+/// `eof` is false it yields `Err(Incomplete)` instead.
+fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
+    let mut chars = input.chars();
+    let c = chars.next();
+    if c.is_none() && !eof {
+        Err(Incomplete)
+    } else {
+        Ok((c, chars.as_str()))
+    }
+}
+
+/// Skips the body of a `/* ... */` comment whose opening `/*` has already
+/// been consumed.
+///
+/// Returns the input following the closing `*/`.  A comment does not extend
+/// past the end of its line: if a line end or the end of input comes first,
+/// the returned input starts there.
+fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    loop {
+        let (c, rest) = take(input, eof)?;
+        match c {
+            None => return Ok(input),
+            Some('\n' | '\r') if is_end_of_line(input, eof)? => return Ok(input),
+            Some('*') => {
+                if let (Some('/'), after) = take(rest, eof)? {
+                    return Ok(after);
+                }
+            }
+            Some(_) => (),
+        }
+        input = rest;
+    }
+}
+
+/// Skips the leading characters of `input` for which `f` returns true.
+///
+/// Returns the remaining input.  If everything was skipped and `eof` is
+/// false, more matching characters might still follow, so the result is
+/// `Err(Incomplete)`.
+fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
+where
+    F: Fn(char) -> bool,
+{
+    match input.trim_start_matches(f) {
+        "" if !eof => Err(Incomplete),
+        rest => Ok(rest),
+    }
+}
+
+/// Consumes the first character of `input` if `f` accepts it.
+///
+/// Returns `Ok(Some(rest))` with the input after that character on a match,
+/// and `Ok(None)` if the first character is rejected or the input has ended.
+fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
+where
+    F: Fn(char) -> bool,
+{
+    match take(input, eof)? {
+        (Some(c), rest) if f(c) => Ok(Some(rest)),
+        _ => Ok(None),
+    }
+}
+
+/// Skips white space at the start of `input`, stopping before a line end.
+///
+/// Returns the input beginning at the first character that is not white
+/// space, at a `\n` or `\r\n` line end, or at the end of input.  A `\r` that
+/// is not followed by `\n` counts as ordinary white space.
+fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    while let (Some(c), rest) = take(input, eof)? {
+        let at_line_end = matches!(c, '\r' | '\n') && is_end_of_line(input, eof)?;
+        if at_line_end || !c.is_whitespace() {
+            break;
+        }
+        input = rest;
+    }
+    Ok(input)
+}
+
+/// Skips leading ASCII digits in `input` and returns what follows them.
+///
+/// If the digits run to the end of `input` and `eof` is false, more digits
+/// might follow, so the result is `Err(Incomplete)`.
+fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
+    let rest = input.trim_start_matches(|c: char| c.is_ascii_digit());
+    if rest.is_empty() && !eof {
+        Err(Incomplete)
+    } else {
+        Ok(rest)
+    }
+}
+
+/// Skips white space and `/* ... */` comments at the start of `input`.
+///
+/// Returns the input beginning at the first character that is neither white
+/// space nor part of a comment.  Nothing else is consumed: in particular a
+/// `\n` or `\r\n` line end is left in place, and so is a `/` that does not
+/// begin a comment.
+///
+/// Returns `Err(Incomplete)` if more input is needed to decide and `eof` is
+/// false.
+fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+    loop {
+        let (Some(c), rest) = take(input, eof)? else {
+            return Ok(input);
+        };
+        match c {
+            '/' => match take(rest, eof)? {
+                (Some('*'), comment) => input = skip_comment(comment, eof)?,
+                // A `/` that does not open a comment is a punctuator.  Stop
+                // *at* it rather than after it; otherwise callers such as
+                // `at_end_of_line` would treat a line holding only `/` as
+                // blank.
+                _ => return Ok(input),
+            },
+            '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
+            c if c.is_whitespace() => input = rest,
+            _ => return Ok(input),
+        };
+    }
+}
+
+/// Reports whether `input` begins like a string literal.
+///
+/// That is the case if `input` starts with a quote mark (`'` or `"`), or with
+/// `X` or `U` (in either case) directly followed by a quote mark.  A line end
+/// at the start of `input` is also reported as true.  Empty input at end of
+/// input is reported as false.
+fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let (c, rest) = take(input, eof)?;
+    Ok(match c {
+        None => false,
+        Some('\'' | '"') => true,
+        Some('x' | 'X' | 'u' | 'U') => matches!(take(rest, eof)?.0, Some('\'' | '"')),
+        Some('\n' | '\r') => is_end_of_line(input, eof)?,
+        Some(_) => false,
+    })
+}
+
+/// Reports whether `input` is positioned at the end of a line.
+///
+/// True if `input` starts with `\n` or `\r\n`, or if it is empty at end of
+/// input.  A `\r` that is not followed by `\n` is not a line end.
+fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    match take(input, eof)? {
+        (None, _) | (Some('\n'), _) => Ok(true),
+        (Some('\r'), rest) => Ok(take(rest, eof)?.0 == Some('\n')),
+        _ => Ok(false),
+    }
+}
+
+/// Reports whether only white space and comments separate the start of
+/// `input` from the end of its line.
+fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let rest = skip_spaces_and_comments(input, eof)?;
+    is_end_of_line(rest, eof)
+}
+
+/// Returns the first character of `s`.
+///
+/// # Panics
+///
+/// Panics if `s` is empty.
+fn first(s: &str) -> char {
+    let mut chars = s.chars();
+    chars.next().unwrap()
+}
+/// Returns the run of [COMMAND_NAMES] entries whose first character equals
+/// the first character of `target`, compared after ASCII-uppercasing the
+/// latter.  Returns an empty slice if `target` is empty.
+///
+/// The bounds are found by binary search, which relies on [COMMAND_NAMES]
+/// being ordered by first character.
+fn get_command_name_candidates(target: &str) -> &[&'static str] {
+    let Some(initial) = target.chars().next() else {
+        return &[];
+    };
+    let initial = initial.to_ascii_uppercase();
+    let start = COMMAND_NAMES.partition_point(|name| first(name) < initial);
+    let end = COMMAND_NAMES.partition_point(|name| first(name) <= initial);
+    &COMMAND_NAMES[start..end]
+}
+
+/// Reports whether `input` starts with the name of a command.
+///
+/// The candidate name is the longest prefix of `input` made up of white
+/// space other than `\n`, identifier characters other than `.`, and `-`.
+/// Trailing white space and periods are trimmed from it.  It is a command
+/// name if [command_match] matches it against one of the candidates from
+/// [get_command_name_candidates] with no missing words.
+///
+/// If the candidate name runs to the end of `input` and `eof` is false, more
+/// of the name might follow, so the result is `Err(Incomplete)`.
+fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
+    let is_name_char = |c: char| {
+        (c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-'
+    };
+    let end = input
+        .find(|c: char| !is_name_char(c))
+        .unwrap_or(input.len());
+    if !eof && end == input.len() {
+        return Err(Incomplete);
+    }
+
+    let command_name = input[..end].trim_end_matches(|c: char| c.is_whitespace() || c == '.');
+    let found = get_command_name_candidates(command_name)
+        .iter()
+        .any(|command| {
+            command_match(command, command_name).is_some_and(|m| m.missing_words <= 0)
+        });
+    Ok(found)
+}
+
+impl Segmenter {
+ /// Handles [State::Shbang], the initial state for whole-file input.
+ ///
+ /// If `input` starts with `#!`, labels the text selected by
+ /// `parse_full_line` as a [Segment::Shbang].  Otherwise, switches to the
+ /// general state at the start of a line and of a command, and labels the
+ /// same input from there.
+ fn parse_shbang<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+ if let (Some('#'), rest) = take(input, eof)? {
+ if let (Some('!'), rest) = take(rest, eof)? {
+ let rest = self.parse_full_line(rest, eof)?;
+ // START_OF_LINE is not set here.  Presumably `parse_full_line`
+ // stops before the new-line, which is then labeled (and the flag
+ // set) by `parse_mid_line` -- TODO confirm.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok(Some((rest, Segment::Shbang)));
+ }
+ }
+
+ // No `#!`: nothing is consumed, and the input is re-dispatched in the
+ // general state.
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ self.push_rest(input, eof)
+ }
+ /// Decides, according to the syntax variant, whether `input` begins a new
+ /// command.
+ ///
+ /// - [Syntax::Auto]: true if `input` starts with a command name, as
+ ///   determined by [detect_command_name].
+ /// - [Syntax::Interactive]: always false.
+ /// - [Syntax::Batch]: always true.
+ fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
+     let syntax = self.syntax;
+     if syntax == Syntax::Auto {
+         return detect_command_name(input, eof);
+     }
+     Ok(syntax == Syntax::Batch)
+ }
+ /// Labels the segment at the start of a line in the general state.
+ ///
+ /// Recognizes the constructs that are special only at the start of a line
+ /// (command separators and blank lines).  Anything else is handed to
+ /// `parse_mid_line`.
+ fn parse_start_of_line<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+ debug_assert_eq!(self.state.0, State::General);
+ debug_assert!(self.start_of_line());
+ debug_assert!(!input.is_empty());
+
+ // `take` fails or yields `None` only for empty input, which the caller
+ // has ruled out.
+ let (Some(c), rest) = take(input, eof).unwrap() else {
+ unreachable!()
+ };
+ match c {
+ '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
+ // This `+` is punctuation that may separate pieces of a string.
+ self.state = (State::General, Substate::empty());
+ return Ok(Some((rest, Segment::Punct)));
+ }
+ '+' | '-' | '.' => {
+ // Any other leading `+`, `-` or `.` is consumed as a command
+ // separator.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok(Some((rest, Segment::StartCommand)));
+ }
+ _ if c.is_whitespace() => {
+ // A line holding only white space and comments separates commands.
+ // Returning `input` itself makes the segment zero-length.
+ if at_end_of_line(input, eof)? {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok(Some((input, Segment::SeparateCommands)));
+ }
+ }
+ _ => {
+ // A line that begins a new command gets a zero-length separator in
+ // front of it, unless one has just been emitted.
+ // NOTE(review): `at_command_start` runs before the cheaper flag
+ // test, so it can report `Incomplete` even when the flag is
+ // already set -- confirm this ordering is intended.
+ if self.at_command_start(input, eof)?
+ && !self.state.1.contains(Substate::START_OF_COMMAND)
+ {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok(Some((input, Segment::StartCommand)));
+ }
+ }
+ }
+ // Nothing special: replace the flags (clearing START_OF_LINE) and label
+ // the input as ordinary mid-line syntax.
+ self.state.1 = Substate::START_OF_COMMAND;
+ self.parse_mid_line(input, eof)
+ }
+    /// Segments one token-like piece of input in [State::General] when not at
+    /// the start of a line.
+    ///
+    /// Dispatches on the first character of `input` and returns the remaining
+    /// input together with the type of the segment just recognized. Most arms
+    /// that produce a token clear the substate; arms that delegate leave that
+    /// to the helper they call.
+    fn parse_mid_line<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        debug_assert!(self.state.0 == State::General);
+        debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
+        let (Some(c), rest) = take(input, eof)? else {
+            unreachable!()
+        };
+        match c {
+            // New-line: the next segment starts a line.
+            '\r' | '\n' if is_end_of_line(input, eof)? => {
+                self.state.1 |= Substate::START_OF_LINE;
+                Ok(Some((
+                    self.parse_newline(input, eof).unwrap().unwrap(),
+                    Segment::Newline,
+                )))
+            }
+            // `/*` starts a comment; a lone `/` is punctuation.
+            '/' => {
+                if let (Some('*'), rest) = take(rest, eof)? {
+                    let rest = skip_comment(rest, eof)?;
+                    Ok(Some((rest, Segment::Comment)))
+                } else {
+                    self.state.1 = Substate::empty();
+                    Ok(Some((rest, Segment::Punct)))
+                }
+            }
+            // `-` is part of a number if, after optional spaces, a digit or
+            // `.` plus digit follows; otherwise it is punctuation.
+            '-' => {
+                let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?;
+                match c {
+                    Some(c) if c.is_ascii_digit() => {
+                        return self.parse_number(rest, eof);
+                    }
+                    Some('.') => {
+                        if let (Some(c), _rest) = take(rest2, eof)? {
+                            if c.is_ascii_digit() {
+                                return self.parse_number(rest, eof);
+                            }
+                        }
+                    }
+                    None | Some(_) => (),
+                }
+                self.state.1 = Substate::empty();
+                Ok(Some((rest, Segment::Punct)))
+            }
+            // Single-character punctuators.
+            '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
+                self.state.1 = Substate::empty();
+                Ok(Some((rest, Segment::Punct)))
+            }
+            // `*` at the start of a command begins a comment command;
+            // elsewhere it is `*` or `**`.
+            '*' => {
+                if self.state.1.contains(Substate::START_OF_COMMAND) {
+                    self.state = (State::Comment1, Substate::empty());
+                    self.parse_comment_1(input, eof)
+                } else {
+                    self.parse_digraph(&['*'], rest, eof)
+                }
+            }
+            // Operators that may be one or two characters long.
+            '<' => self.parse_digraph(&['=', '>'], rest, eof),
+            '>' => self.parse_digraph(&['='], rest, eof),
+            '~' => self.parse_digraph(&['='], rest, eof),
+            // `.` followed only by spaces and comments up to the end of the
+            // line ends the command.
+            '.' if at_end_of_line(rest, eof)? => {
+                self.state.1 = Substate::START_OF_COMMAND;
+                Ok(Some((rest, Segment::EndCommand)))
+            }
+            // Otherwise `.` starts a number if a digit follows, else it is
+            // punctuation.
+            '.' => match take(rest, eof)? {
+                (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
+                _ => Ok(Some((rest, Segment::Punct))),
+            },
+            '0'..='9' => self.parse_number(input, eof),
+            // `U'...'` and `X'...'` strings, or identifiers starting with
+            // those letters.
+            'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof),
+            'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof),
+            '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof),
+            // `!*` is punctuation, `!` followed by anything else is handed to
+            // `parse_id`, and `!` at end of input is punctuation.
+            '!' => {
+                let (c, rest2) = take(rest, eof)?;
+                match c {
+                    Some('*') => Ok(Some((rest2, Segment::Punct))),
+                    Some(_) => self.parse_id(input, eof),
+                    None => Ok(Some((rest, Segment::Punct))),
+                }
+            }
+            c if c.is_whitespace() => Ok(Some((skip_spaces(rest, eof)?, Segment::Spaces))),
+            c if c.may_start_id() => self.parse_id(input, eof),
+            // Remaining printable ASCII, except `\` and `^`, is punctuation.
+            '#'..='~' if c != '\\' && c != '^' => {
+                self.state.1 = Substate::empty();
+                Ok(Some((rest, Segment::Punct)))
+            }
+            // Anything else is reported as a one-character error segment.
+            _ => {
+                self.state.1 = Substate::empty();
+                Ok(Some((rest, Segment::UnexpectedChar)))
+            }
+        }
+    }
+    /// Scans the body of a string whose opening `quote` has already been
+    /// consumed, so `input` starts just after it.
+    ///
+    /// A doubled `quote` inside the string is skipped over as part of the
+    /// body. On finding the closing quote, returns the input after it labeled
+    /// as `segment`. If the line or the input ends first, returns the input at
+    /// that point labeled [Segment::ExpectedQuote]. The substate is cleared in
+    /// both cases.
+    fn parse_string<'a>(
+        &mut self,
+        segment: Segment,
+        quote: char,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        loop {
+            let (Some(c), after) = take(input, eof)? else {
+                break;
+            };
+            if c == quote {
+                let (next, after_pair) = take(after, eof)?;
+                if next == Some(quote) {
+                    // Doubled quote: part of the string body.
+                    input = after_pair;
+                    continue;
+                }
+                self.state.1 = Substate::empty();
+                return Ok(Some((after, segment)));
+            }
+            if matches!(c, '\r' | '\n') && is_end_of_line(input, eof)? {
+                break;
+            }
+            input = after;
+        }
+
+        // Ran out of line or input before the closing quote.
+        self.state.1 = Substate::empty();
+        Ok(Some((input, Segment::ExpectedQuote)))
+    }
+    /// Handles a letter that may be a string prefix (as in `X'...'`).
+    ///
+    /// `input.0` is the input starting at the letter and `input.1` is the
+    /// input just after it. If a quote follows the letter, the string is
+    /// parsed as `segment`; otherwise the letter starts an identifier.
+    fn maybe_parse_string<'a>(
+        &mut self,
+        segment: Segment,
+        input: (&'a str, &'a str),
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let (from_letter, after_letter) = input;
+        if let (Some(quote @ ('\'' | '"')), body) = take(after_letter, eof)? {
+            self.parse_string(segment, quote, body, eof)
+        } else {
+            self.parse_id(from_letter, eof)
+        }
+    }
+    /// Looks ahead in `input` for the next identifier, using a fresh snippet
+    /// segmenter so that `self` is not disturbed.
+    ///
+    /// Shbang, space, comment, and new-line segments are skipped. Returns
+    /// `(identifier, rest)` if the first other segment is an identifier,
+    /// `("", rest)` if it is any other kind of segment, and `(input, input)`
+    /// with whatever input remains if the input runs out first.
+    fn next_id_in_command<'a>(
+        &self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<(&'a str, &'a str), Incomplete> {
+        let mut scanner = Segmenter::new(self.syntax, true);
+        while let Some((len, kind)) = scanner.push(input, eof)? {
+            let (text, remainder) = input.split_at(len);
+            match kind {
+                // Insignificant segments: keep looking.
+                Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => {
+                    input = remainder;
+                }
+
+                Segment::Identifier => return Ok((text, remainder)),
+
+                // Anything else means no identifier comes next.
+                Segment::Number
+                | Segment::QuotedString
+                | Segment::HexString
+                | Segment::UnicodeString
+                | Segment::UnquotedString
+                | Segment::Punct
+                | Segment::CommentCommand
+                | Segment::DoRepeatCommand
+                | Segment::DoRepeatOverflow
+                | Segment::InlineData
+                | Segment::MacroName
+                | Segment::MacroBody
+                | Segment::StartDocument
+                | Segment::Document
+                | Segment::StartCommand
+                | Segment::SeparateCommands
+                | Segment::EndCommand
+                | Segment::ExpectedQuote
+                | Segment::ExpectedExponent
+                | Segment::UnexpectedChar => return Ok(("", remainder)),
+            }
+        }
+        Ok((input, input))
+    }
+    /// Segments an identifier starting at the beginning of `input`.
+    ///
+    /// The identifier extends over all following characters for which
+    /// `may_continue_id` holds, except that a final `.` is left out when it is
+    /// followed only by spaces and comments up to the end of the line.
+    ///
+    /// At the start of a command, certain keywords switch the segmenter into
+    /// a special state: `COMMENT`, `DOCUMENT`, `DEFINE`, `FILE LABEL`,
+    /// `DO REPEAT`, and `BEGIN DATA`. The exact abbreviation rules are those
+    /// of `id_match` and `id_match_n`, which are defined elsewhere.
+    fn parse_id<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        // The caller has already seen the first character, so skip it
+        // unconditionally and extend over continuation characters.
+        let (Some(_), mut end) = take(input, eof).unwrap() else {
+            unreachable!()
+        };
+        while let (Some(c), rest) = take(end, eof)? {
+            if !c.may_continue_id() {
+                break;
+            };
+            end = rest;
+        }
+        let identifier = &input[..input.len() - end.len()];
+        // A trailing `.` at the end of the line is not part of the identifier.
+        let identifier = match identifier.strip_suffix('.') {
+            Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
+            _ => identifier,
+        };
+        let rest = &input[identifier.len()..];
+
+        if self.state.1.contains(Substate::START_OF_COMMAND) {
+            if id_match_n("COMMENT", identifier, 4) {
+                // The keyword is part of the comment command segment, so
+                // rescan from the start of `input`.
+                self.state = (State::Comment1, Substate::empty());
+                return self.parse_comment_1(input, eof);
+            } else if id_match("DOCUMENT", identifier) {
+                // Zero-width segment: nothing is consumed here.
+                self.state = (State::Document1, Substate::empty());
+                return Ok(Some((input, Segment::StartDocument)));
+            } else if id_match_n("DEFINE", identifier, 6) {
+                // Falls through to report the keyword as an identifier below.
+                self.state = (State::Define1, Substate::empty());
+            } else if id_match("FILE", identifier) {
+                if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
+                    self.state = (State::FileLabel1, Substate::empty());
+                    return Ok(Some((rest, Segment::Identifier)));
+                }
+            } else if id_match("DO", identifier) {
+                if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
+                    self.state = (State::DoRepeat1, Substate::empty());
+                    return Ok(Some((rest, Segment::Identifier)));
+                }
+            } else if id_match("BEGIN", identifier) {
+                let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
+                if id_match("DATA", next_id) {
+                    // `BEGIN DATA` only counts if nothing but an optional `.`
+                    // (plus spaces and comments) follows on the line.
+                    let rest2 = skip_spaces_and_comments(rest2, eof)?;
+                    let rest2 = if let Some(s) = rest2.strip_prefix('.') {
+                        skip_spaces_and_comments(s, eof)?
+                    } else {
+                        rest2
+                    };
+                    if is_end_of_line(rest2, eof)? {
+                        // If a new-line separates `BEGIN` from the end of the
+                        // command, use the state that expects one more
+                        // new-line before the data.
+                        let s = &input[..input.len() - rest2.len()];
+                        self.state = (
+                            if s.contains('\n') {
+                                State::BeginData1
+                            } else {
+                                State::BeginData2
+                            },
+                            Substate::empty(),
+                        );
+                        return Ok(Some((rest, Segment::Identifier)));
+                    }
+                }
+            }
+        }
+
+        self.state.1 = Substate::empty();
+        Ok(Some((
+            rest,
+            // A bare `!` is reported as punctuation, not an identifier.
+            if identifier != "!" {
+                Segment::Identifier
+            } else {
+                Segment::Punct
+            },
+        )))
+    }
+    /// Completes a punctuator that is one or two characters long.
+    ///
+    /// `input` starts just after the first character. If the next character
+    /// is one of `seconds`, it is included in the segment; otherwise the
+    /// segment ends at `input`. Either way the result is [Segment::Punct] and
+    /// the substate is cleared.
+    fn parse_digraph<'a>(
+        &mut self,
+        seconds: &[char],
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let (next, after_next) = take(input, eof)?;
+        self.state.1 = Substate::empty();
+        let end = if next.is_some_and(|c| seconds.contains(&c)) {
+            after_next
+        } else {
+            input
+        };
+        Ok(Some((end, Segment::Punct)))
+    }
+    /// Segments a number: digits, an optional `.` with fraction digits, and an
+    /// optional exponent (`e` or `E`, optional sign, digits).
+    ///
+    /// A `.` with no digits after it is left out of the number when it is
+    /// followed only by spaces and comments up to the end of the line. An
+    /// exponent marker with no digits yields [Segment::ExpectedExponent].
+    /// The substate is cleared whenever a segment is returned.
+    fn parse_number<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let mut end = skip_digits(input, eof)?;
+
+        if let Some(after_dot) = match_char(|c| c == '.', end, eof)? {
+            let after_fraction = skip_digits(after_dot, eof)?;
+            let has_fraction_digits = after_fraction.len() < after_dot.len();
+            if has_fraction_digits || !at_end_of_line(after_fraction, eof)? {
+                end = after_fraction;
+            }
+        }
+
+        let (end, segment) = match match_char(|c| matches!(c, 'e' | 'E'), end, eof)? {
+            None => (end, Segment::Number),
+            Some(after_e) => {
+                let after_sign = match match_char(|c| matches!(c, '+' | '-'), after_e, eof)? {
+                    Some(after_sign) => after_sign,
+                    None => after_e,
+                };
+                let after_exponent = skip_digits(after_sign, eof)?;
+                if after_exponent.len() == after_sign.len() {
+                    (after_sign, Segment::ExpectedExponent)
+                } else {
+                    (after_exponent, Segment::Number)
+                }
+            }
+        };
+
+        self.state.1 = Substate::empty();
+        Ok(Some((end, segment)))
+    }
+    /// Segments one line of a comment command.
+    ///
+    /// Scans to the end of the line, tracking whether the line is blank, has
+    /// content, or has a `.` as its last non-whitespace character:
+    ///
+    /// - A blank line, or end of input, ends the comment command with a
+    ///   zero-width [Segment::SeparateCommands].
+    ///
+    /// - A line whose last non-whitespace character is `.` ends the comment
+    ///   command; the [Segment::CommentCommand] stops just before that `.`.
+    ///
+    /// - Any other line is reported as [Segment::CommentCommand] and the
+    ///   comment continues on the next line.
+    fn parse_comment_1<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        enum Line<'s> {
+            Blank,
+            HasText,
+            EndsWithPeriod(&'s str),
+        }
+
+        let mut line = Line::Blank;
+        loop {
+            let (Some(c), after) = take(input, eof)? else {
+                // End of file.
+                self.state = (State::General, Substate::START_OF_COMMAND);
+                return Ok(Some((input, Segment::SeparateCommands)));
+            };
+            if c == '.' {
+                line = Line::EndsWithPeriod(input);
+            } else if matches!(c, '\n' | '\r') && is_end_of_line(input, eof)? {
+                let (next_state, end, segment) = match line {
+                    // Blank line ends comment command.
+                    Line::Blank => (
+                        (State::General, Substate::START_OF_COMMAND),
+                        input,
+                        Segment::SeparateCommands,
+                    ),
+                    // '.' at end of line ends comment command.
+                    Line::EndsWithPeriod(period) => (
+                        (State::General, Substate::empty()),
+                        period,
+                        Segment::CommentCommand,
+                    ),
+                    // Comment continues onto next line.
+                    Line::HasText => (
+                        (State::Comment2, Substate::empty()),
+                        input,
+                        Segment::CommentCommand,
+                    ),
+                };
+                self.state = next_state;
+                return Ok(Some((end, segment)));
+            } else if !c.is_whitespace() {
+                line = Line::HasText;
+            }
+            input = after;
+        }
+    }
+    /// Segments the new-line between two lines of a comment command and
+    /// decides whether the next line continues the comment.
+    ///
+    /// The next line starts a new command if it begins with `+`, `-`, or `.`,
+    /// or if it begins with a non-space character and `at_command_start` says
+    /// so. Otherwise the comment command continues.
+    ///
+    /// `input` must start with a new-line.
+    fn parse_comment_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let next_line = self.parse_newline(input, eof)?.unwrap();
+
+        let starts_command = match take(next_line, eof)?.0 {
+            Some('+' | '-' | '.') => true,
+            Some(c) if !c.is_whitespace() => self.at_command_start(next_line, eof)?,
+            None | Some(_) => false,
+        };
+        self.state = if starts_command {
+            (
+                State::General,
+                Substate::START_OF_LINE | Substate::START_OF_COMMAND,
+            )
+        } else {
+            (State::Comment1, Substate::empty())
+        };
+        Ok(Some((next_line, Segment::Newline)))
+    }
+    /// Segments one line of a `DOCUMENT` command as [Segment::Document].
+    ///
+    /// The line runs up to, but not including, the new-line. If its last
+    /// non-whitespace character is `.`, or the input ends, the command is
+    /// over and the next state is `Document3`; otherwise it is `Document2`.
+    fn parse_document_1<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let mut ends_with_period = false;
+        loop {
+            let (Some(c), after) = take(input, eof)? else {
+                // End of input also ends the document.
+                self.state = (State::Document3, Substate::empty());
+                return Ok(Some((input, Segment::Document)));
+            };
+            if c == '.' {
+                ends_with_period = true;
+            } else if matches!(c, '\n' | '\r') && is_end_of_line(input, eof)? {
+                self.state.0 = match ends_with_period {
+                    true => State::Document3,
+                    false => State::Document2,
+                };
+                return Ok(Some((input, Segment::Document)));
+            } else if !c.is_whitespace() {
+                ends_with_period = false;
+            }
+            input = after;
+        }
+    }
+    /// Segments the new-line between two lines of a `DOCUMENT` command and
+    /// returns to `Document1` for the next line.
+    ///
+    /// `input` must start with a new-line.
+    fn parse_document_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let next_line = self.parse_newline(input, eof)?.unwrap();
+        self.state = (State::Document1, Substate::empty());
+        Ok(Some((next_line, Segment::Newline)))
+    }
+    /// Finishes a `DOCUMENT` command by emitting a zero-width
+    /// [Segment::EndCommand] and returning to general parsing at the start of
+    /// a line and of a command. No input is consumed.
+    fn parse_document_3<'a>(
+        &mut self,
+        input: &'a str,
+        _eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let substate = Substate::START_OF_COMMAND | Substate::START_OF_LINE;
+        self.state = (State::General, substate);
+        Ok(Some((input, Segment::EndCommand)))
+    }
+    /// Reports whether, after skipping spaces and comments, `input` starts
+    /// with a quote character (`'` or `"`) or a line feed.
+    fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
+        let next = take(skip_spaces_and_comments(input, eof)?, eof)?.0;
+        Ok(matches!(next, Some('\'' | '"' | '\n')))
+    }
+    /// Segments the part of a `FILE LABEL` command between `FILE` and the
+    /// label text.
+    ///
+    /// Input is segmented as in the general state using a copy of `self`.
+    /// Once the `LABEL` identifier has been seen: if what follows looks
+    /// quoted (see `quoted_file_label`), the segmenter adopts the copy's
+    /// state and so carries on with general parsing; otherwise it moves to
+    /// `FileLabel2` to read an unquoted label. For any other segment only the
+    /// substate is carried over.
+    fn parse_file_label_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let mut general = Segmenter {
+            state: (State::General, self.state.1),
+            ..*self
+        };
+        let (rest, segment) = general.push_rest(input, eof)?.unwrap();
+        match segment {
+            Segment::Identifier => {
+                let id = &input[..input.len() - rest.len()];
+                debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
+                if Self::quoted_file_label(rest, eof)? {
+                    *self = general;
+                } else {
+                    self.state.0 = State::FileLabel2;
+                }
+            }
+            _ => self.state.1 = general.state.1,
+        }
+        Ok(Some((rest, segment)))
+    }
+    /// Segments the spaces between `LABEL` and an unquoted file label as
+    /// [Segment::Spaces], then moves on to `FileLabel3`.
+    fn parse_file_label_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let after_spaces = skip_spaces(input, eof)?;
+        self.state = (State::FileLabel3, Substate::empty());
+        Ok(Some((after_spaces, Segment::Spaces)))
+    }
+    /// Segments an unquoted file label as [Segment::UnquotedString].
+    ///
+    /// The label runs to the end of the line or of the input. If the last
+    /// non-whitespace character on the line is `.`, the label stops just
+    /// before it. Afterward the segmenter returns to general parsing.
+    fn parse_file_label_3<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        // Position of a `.` that so far has only whitespace after it.
+        let mut trailing_period = None;
+        loop {
+            let (c, after) = take(input, eof)?;
+            if matches!(c, None | Some('\n' | '\r')) && is_end_of_line(input, eof)? {
+                self.state = (State::General, Substate::empty());
+                let end = trailing_period.unwrap_or(input);
+                return Ok(Some((end, Segment::UnquotedString)));
+            }
+            match c {
+                None => unreachable!(),
+                Some('.') => trailing_period = Some(input),
+                Some(other) if !other.is_whitespace() => trailing_period = None,
+                Some(_) => (),
+            }
+            input = after;
+        }
+    }
+    /// Segments `input` as if in the general state, for use by states that
+    /// wrap ordinary syntax.
+    ///
+    /// A temporary segmenter is built with the same syntax mode and the
+    /// current substate. Afterward only its substate is copied back, so the
+    /// caller's main state and nesting level are left for the caller to
+    /// manage.
+    fn subparse<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let mut general = Segmenter {
+            state: (State::General, self.state.1),
+            nest: 0,
+            syntax: self.syntax,
+        };
+        let outcome = general.push_rest(input, eof)?;
+        self.state.1 = general.state.1;
+        Ok(outcome)
+    }
+    /// Segments the head of a `DO REPEAT` command, that is, the syntax that
+    /// defines the stand-in variables, which comes before the lines of syntax
+    /// to be repeated (the body).
+    ///
+    /// The head is segmented as ordinary syntax. A command separator moves
+    /// the segmenter on: a blank line leads to `DoRepeat2`, while an end or
+    /// start of command leads directly to the body with nesting level 1.
+    fn parse_do_repeat_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let (rest, segment) = self.subparse(input, eof)?.unwrap();
+        match segment {
+            Segment::SeparateCommands => {
+                // We reached a blank line that separates the head from the body.
+                self.state.0 = State::DoRepeat2;
+            }
+            Segment::EndCommand | Segment::StartCommand => {
+                // We reached the body.
+                self.state.0 = State::DoRepeat3;
+                self.nest = 1;
+            }
+            _ => (),
+        }
+        Ok(Some((rest, segment)))
+    }
+    /// Segments the blank line that separates the head of a `DO REPEAT`
+    /// command from its body.
+    ///
+    /// Input is segmented as ordinary syntax until a new-line is seen, at
+    /// which point the body begins with nesting level 1.
+    fn parse_do_repeat_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let (rest, segment) = self.subparse(input, eof)?.unwrap();
+        if let Segment::Newline = segment {
+            // We reached the body.
+            self.state.0 = State::DoRepeat3;
+            self.nest = 1;
+        }
+        Ok(Some((rest, segment)))
+    }
+    /// If `input` starts with a new-line (`\n` or `\r\n`), returns the input
+    /// that follows it; otherwise returns `None`. A `\r` that is not followed
+    /// by `\n` does not count as a new-line.
+    fn parse_newline<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<&'a str>, Incomplete> {
+        let after_newline = match take(input, eof)? {
+            (Some('\n'), rest) => Some(rest),
+            (Some('\r'), rest) => match take(rest, eof)? {
+                (Some('\n'), rest) => Some(rest),
+                _ => None,
+            },
+            _ => None,
+        };
+        Ok(after_newline)
+    }
+
+    /// Skips forward to the end of the current line and returns the input
+    /// from that point, which starts at the line terminator or is the end of
+    /// the input.
+    ///
+    /// Returns [Incomplete] if more input is needed to find the end of the
+    /// line.
+    fn parse_full_line<'a>(
+        &mut self,
+        mut input: &'a str,
+        eof: bool,
+    ) -> Result<&'a str, Incomplete> {
+        while !is_end_of_line(input, eof)? {
+            // Propagate `Incomplete` rather than panicking: this function
+            // already reports that condition to its caller.
+            input = take(input, eof)?.1;
+        }
+        Ok(input)
+    }
+    /// Classifies a line in the body of `DO REPEAT` by its effect on the
+    /// nesting level: 1 if it starts with `DO REPEAT`, -1 if it starts with
+    /// `END REPEAT`, and 0 otherwise. A single leading `-` or `+` is ignored.
+    fn check_repeat_command(&mut self, input: &str, eof: bool) -> Result<isize, Incomplete> {
+        let line = input.strip_prefix(['-', '+']).unwrap_or(input);
+        let (id1, after_id1) = self.next_id_in_command(line, eof)?;
+
+        // Only look at the second identifier when the first one matched.
+        let repeat_follows = || -> Result<bool, Incomplete> {
+            let (id2, _) = self.next_id_in_command(after_id1, eof)?;
+            Ok(id_match("REPEAT", id2))
+        };
+
+        let delta = if id_match("DO", id1) && repeat_follows()? {
+            1
+        } else if id_match("END", id1) && repeat_follows()? {
+            -1
+        } else {
+            0
+        };
+        Ok(delta)
+    }
+    /// Segments the body of `DO REPEAT`, the lines of syntax that are to be
+    /// repeated. Each line is reported as a single
+    /// [Segment::DoRepeatCommand], with a [Segment::Newline] between lines.
+    ///
+    /// `DO REPEAT` can be nested, so `DO REPEAT...END REPEAT` blocks inside
+    /// the body are tracked in `self.nest`, which starts at 1. When the level
+    /// drops to 0, the final `END REPEAT` line is segmented as ordinary
+    /// syntax. If the level cannot be incremented any further, the next state
+    /// is `DoRepeat4`.
+    fn parse_do_repeat_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        if let Some(after_newline) = self.parse_newline(input, eof)? {
+            return Ok(Some((after_newline, Segment::Newline)));
+        }
+
+        let line_end = self.parse_full_line(input, eof)?;
+        let delta = self.check_repeat_command(input, eof)?;
+        if delta > 0 {
+            match self.nest.checked_add(1) {
+                Some(deeper) => self.nest = deeper,
+                None => self.state.0 = State::DoRepeat4,
+            }
+        } else if delta < 0 {
+            self.nest -= 1;
+            if self.nest == 0 {
+                // Nesting level dropped to 0, so we've finished reading the
+                // `DO REPEAT` body.
+                self.state = (
+                    State::General,
+                    Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+                );
+                return self.push_rest(input, eof);
+            }
+        }
+        Ok(Some((line_end, Segment::DoRepeatCommand)))
+    }
+    /// Emits a zero-width [Segment::DoRepeatOverflow] and goes back to
+    /// segmenting the `DO REPEAT` body. No input is consumed.
+    fn parse_do_repeat_4<'a>(
+        &mut self,
+        input: &'a str,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let overflow = (input, Segment::DoRepeatOverflow);
+        self.state.0 = State::DoRepeat3;
+        Ok(Some(overflow))
+    }
+    /// Segments the part of a `DEFINE` command that comes before the opening
+    /// `(`. A `DEFINE` command consists of:
+    ///
+    /// - The `DEFINE` keyword.
+    ///
+    /// - An identifier. This is reported as [Segment::MacroName] instead of
+    ///   [Segment::Identifier] because this identifier must never be
+    ///   macro-expanded.
+    ///
+    /// - Anything but `(`.
+    ///
+    /// - `(` followed by a sequence of tokens possibly including balanced
+    ///   parentheses up to a final `)`.
+    ///
+    /// - A sequence of any number of lines, one string per line, ending with
+    ///   `!ENDDEFINE`. The first line is usually blank (that is, a new-line
+    ///   follows the `(`). The last line usually just has `!ENDDEFINE.` on
+    ///   it, but it can start with other tokens. The whole
+    ///   DEFINE...!ENDDEFINE can even be on a single line.
+    ///
+    /// This function handles both `Define1` (macro name not yet seen) and
+    /// `Define2` (macro name seen, `(` not yet seen).
+    fn parse_define_1_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let (rest, segment) = self.subparse(input, eof)?.unwrap();
+        let reported = match segment {
+            Segment::Identifier if self.state.0 == State::Define1 => {
+                self.state.0 = State::Define2;
+                Segment::MacroName
+            }
+            Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
+                // The DEFINE command is malformed because we reached its end
+                // without ever hitting a `(` token. Transition back to general
+                // parsing.
+                self.state.0 = State::General;
+                segment
+            }
+            Segment::Punct if input.starts_with('(') => {
+                self.state.0 = State::Define3;
+                self.nest = 1;
+                segment
+            }
+            _ => segment,
+        };
+        Ok(Some((rest, reported)))
+    }
+    /// Segments the parenthesized argument list of a `DEFINE` command, after
+    /// its opening `(` has been seen.
+    ///
+    /// Input is segmented as ordinary syntax while `self.nest` tracks the
+    /// parenthesis depth. When the depth returns to 0 the macro body begins
+    /// (`Define4`). If the command ends first, the segmenter returns to
+    /// general parsing.
+    ///
+    /// The depth counter saturates at its maximum instead of overflowing, so
+    /// pathologically deep nesting is miscounted rather than causing a panic
+    /// or wraparound.
+    fn parse_define_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let (rest, segment) = self.subparse(input, eof)?.unwrap();
+        match segment {
+            Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
+                // The DEFINE command is malformed because we reached its end
+                // before the parentheses were balanced. Transition back to
+                // general parsing.
+                self.state.0 = State::General;
+            }
+            Segment::Punct if input.starts_with('(') => {
+                self.nest = self.nest.saturating_add(1);
+            }
+            Segment::Punct if input.starts_with(')') => {
+                // `nest` is at least 1 for as long as we are in this state.
+                self.nest -= 1;
+                if self.nest == 0 {
+                    self.state = (State::Define4, Substate::empty());
+                }
+            }
+            _ => (),
+        }
+        Ok(Some((rest, segment)))
+    }
+    /// Searches `input` for `!ENDDEFINE` (in any ASCII case) outside of
+    /// comments and quoted strings, returning the input starting at it.
+    ///
+    /// Returns `None` if the input ends first, including when a quote is
+    /// opened but never closed.
+    fn find_enddefine(mut input: &str) -> Option<&str> {
+        loop {
+            input = skip_spaces_and_comments(input, true).unwrap();
+            let (Some(c), after) = take(input, true).unwrap() else {
+                return None;
+            };
+            if c == '!' && strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() {
+                return Some(input);
+            }
+            input = if matches!(c, '\'' | '"') {
+                // Jump past the matching closing quote.
+                let closing = after.find(c)?;
+                &after[closing + 1..]
+            } else {
+                after
+            };
+        }
+    }
+
+    /// Segments one line of a macro body, looking for `!ENDDEFINE`.
+    ///
+    /// In `State::Define4`, this is the first line of the macro body (the same
+    /// line as the closing parenthesis of the argument definition). In
+    /// `State::Define5`, it is a later line.
+    ///
+    /// If the line contains `!ENDDEFINE`, the text before it is reported as
+    /// [Segment::MacroBody] (or [Segment::Spaces] if it is all whitespace) and
+    /// the segmenter returns to general parsing. Otherwise the whole line is
+    /// macro body.
+    fn parse_define_4_5<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let line_end = self.parse_full_line(input, eof)?;
+        let line = &input[..input.len() - line_end.len()];
+
+        let Some(enddefine) = Self::find_enddefine(line) else {
+            // No `!ENDDEFINE`. We have a full line of macro body.
+            //
+            // If the first line of the macro body is blank, we just report it
+            // as spaces, or not at all if there are no spaces, because it's
+            // not significant.
+            //
+            // However, if it's a later line, we need to report it because
+            // blank lines can have significance.
+            let blank_first_line = self.state.0 == State::Define4 && line.trim_start().is_empty();
+            if blank_first_line && line.is_empty() {
+                return self.parse_define_6(input, eof);
+            }
+            self.state.0 = State::Define6;
+            let segment = if blank_first_line {
+                Segment::Spaces
+            } else {
+                Segment::MacroBody
+            };
+            return Ok(Some((line_end, segment)));
+        };
+
+        // Macro ends at the `!ENDDEFINE` on this line.
+        self.state = (State::General, Substate::empty());
+        let (before, from_enddefine) = input.split_at(line.len() - enddefine.len());
+        if before.is_empty() {
+            // Line starts with `!ENDDEFINE`.
+            return self.push_rest(input, eof);
+        }
+        let segment = if before.trim_start().is_empty() {
+            // Line starts with spaces followed by `!ENDDEFINE`.
+            Segment::Spaces
+        } else {
+            // Line starts with some content followed by `!ENDDEFINE`.
+            Segment::MacroBody
+        };
+        Ok(Some((from_enddefine, segment)))
+    }
+    /// Segments the new-line at the end of a macro body line and moves on to
+    /// `Define5` for the next line.
+    ///
+    /// `input` must start with a new-line.
+    fn parse_define_6<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let next_line = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::Define5;
+        Ok(Some((next_line, Segment::Newline)))
+    }
+    /// Segments a `BEGIN DATA` command that spans more than one line, as
+    /// ordinary syntax, advancing to `BeginData2` once a new-line has been
+    /// seen.
+    fn parse_begin_data_1<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let (rest, segment) = self.subparse(input, eof)?.unwrap();
+        if let Segment::Newline = segment {
+            self.state.0 = State::BeginData2;
+        }
+        Ok(Some((rest, segment)))
+    }
+    /// Segments the rest of the line that completes `BEGIN DATA`, as ordinary
+    /// syntax. After its new-line, the inline data begins (`BeginData3`).
+    fn parse_begin_data_2<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let (rest, segment) = self.subparse(input, eof)?.unwrap();
+        if let Segment::Newline = segment {
+            self.state.0 = State::BeginData3;
+        }
+        Ok(Some((rest, segment)))
+    }
+    /// Reports whether `line` is an `END DATA` line: `END`, exactly one
+    /// whitespace character, `DATA` (both keywords in any ASCII case), and
+    /// then only whitespace with at most one `.`.
+    fn is_end_data(line: &str) -> bool {
+        let Some(after_end) = strip_prefix_ignore_ascii_case(line, "END") else {
+            return false;
+        };
+        let (Some(separator), after_separator) = take(after_end, true).unwrap() else {
+            return false;
+        };
+        if !separator.is_whitespace() {
+            return false;
+        }
+        let Some(tail) = strip_prefix_ignore_ascii_case(after_separator, "DATA") else {
+            return false;
+        };
+
+        let only_periods_and_spaces = tail.chars().all(|c| c == '.' || c.is_whitespace());
+        only_periods_and_spaces && tail.matches('.').count() <= 1
+    }
+    /// Segments one line between `BEGIN DATA` and `END DATA`.
+    ///
+    /// An `END DATA` line ends the inline data: the segmenter returns to
+    /// general parsing at the start of a line and of a command, and the line
+    /// is segmented as ordinary syntax. Any other line is reported whole as
+    /// [Segment::InlineData].
+    fn parse_begin_data_3<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let line_end = self.parse_full_line(input, eof)?;
+        let line = &input[..input.len() - line_end.len()];
+        if !Self::is_end_data(line) {
+            self.state.0 = State::BeginData4;
+            return Ok(Some((line_end, Segment::InlineData)));
+        }
+
+        self.state = (
+            State::General,
+            Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+        );
+        self.push_rest(input, eof)
+    }
+    /// Segments the new-line that ends a line of inline data and goes back to
+    /// `BeginData3` for the next line.
+    ///
+    /// `input` must start with a new-line.
+    fn parse_begin_data_4<'a>(
+        &mut self,
+        input: &'a str,
+        eof: bool,
+    ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
+        let next_line = self.parse_newline(input, eof)?.unwrap();
+        self.state.0 = State::BeginData3;
+        Ok(Some((next_line, Segment::Newline)))
+    }
+}
+
+/// If `line` starts with `pattern`, compared without regard to ASCII case,
+/// returns the rest of `line` after that prefix; otherwise returns `None`.
+///
+/// Also returns `None` if `line` is shorter than `pattern` or if
+/// `pattern.len()` does not fall on a character boundary in `line`.
+fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
+    let prefix = line.get(..pattern.len())?;
+    if prefix.eq_ignore_ascii_case(pattern) {
+        Some(&line[pattern.len()..])
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod test;
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-//! Low-level lexical analysis.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into [three
-//! phases](super). This module implements the low-level segmentation phase.
-//!
-//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
-//! (a segment type) for each byte or contiguous sequence of bytes in the input.
-//! It also, in a few corner cases, outputs zero-width segments that label the
-//! boundary between a pair of bytes in the input.
-//!
-//! Some segment types correspond directly to tokens; for example,
-//! [Segment::Identifier] becomes [Token::Id] later in lexical analysis. Other
-//! segments contribute to tokens but do not correspond directly; for example,
-//! multiple quoted string [Segment::QuotedString] separated by
-//! [Segment::Spaces] and "+" punctuators [Segment::Punct] may be combined to
-//! form a single string token [Token::String]. Still other segments are
-//! ignored (e.g. [Segment::Spaces]) or trigger special behavior such as error
-//! messages later in tokenization (e.g. [Segment::ExpectedQuote]).
-//!
-//! [Token::Id]: crate::lex::token::Token::Id
-//! [Token::String]: crate::lex::token::Token::String
-
-use std::cmp::Ordering;
-
-use crate::{
- identifier::{id_match, id_match_n, IdentifierChar},
- prompt::PromptStyle,
-};
-use bitflags::bitflags;
-
-use super::command_name::{command_match, COMMAND_NAMES};
-
-/// Syntax variant.
-///
-/// PSPP syntax is written in one of two syntax variant which are broadly
-/// defined as follows:
-///
-/// - In interactive syntax, commands end with a period at the end of the line
-/// or with a blank line.
-///
-/// - In batch syntax, the second and subsequent lines of a command are indented
-/// from the left margin.
-///
-/// The segmenter can also try to automatically detect the kind of syntax in
-/// use, using a heuristic that is usually correct.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
-pub enum Syntax {
- /// Try to interpret input correctly regardless of whether it is written
- /// for interactive or batch syntax.
- ///
- /// This is `Syntax::default()`.
- #[default]
- Auto,
-
- /// Interactive syntax.
- Interactive,
-
- /// Batch syntax.
- Batch,
-}
-
-/// The type of a segment.
-///
-/// A [Segment] is a label for a string slice and is normally paired with one.
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Segment {
- /// A number.
- Number,
-
- /// A quoted string (`'...'` or `"..."`)..
- QuotedString,
-
- /// A hexadecimal string (`X'...'` or `X"..."`).
- HexString,
-
- /// A Unicode string (`U'...'` or `U"..."`).
- UnicodeString,
-
- /// An unquoted string.
- ///
- /// Unquoted strings appear only in a few special-case constructs, such as
- /// the `FILE LABEL` command.
- UnquotedString,
-
- /// An identifier.
- Identifier,
-
- /// A punctuator or operator.
- Punct,
-
- /// `#!` at the beginning of a syntax file only.
- Shbang,
-
- /// Spaces.
- Spaces,
-
- /// A comment (`/* ... */`).
- Comment,
-
- /// New-line.
- Newline,
-
- /// A comment command (`* ...` or `COMMENT ...`).
- CommentCommand,
-
- /// In a `DO REPEAT` command, one of the lines to be repeated.
- DoRepeatCommand,
-
- /// Indicates `DO REPEAT` nested more deeply than supported.
- DoRepeatOverflow,
-
- /// A line of inline data inside `BEGIN DATA`...`END DATA`.
- InlineData,
-
- /// In `!DEFINE`, an identifier for the macro being defined.
- ///
- /// Distinguished from [Identifier](Self::Identifier) because a `MacroName`
- /// must never be macro-expanded.
- MacroName,
-
- /// Contents of `!DEFINE`...`!ENDDEFINE`.
- MacroBody,
-
- /// Represents the `DOCUMENT` beginning a `DOCUMENT` command.
- ///
- /// This token is not associated with any text: the actual `DOCUMENT`
- /// keyword is part of the following [Document](Self::Document) segment.
- /// This is because documents include the `DOCUMENT` keyword.
- StartDocument,
-
- /// One of the lines of documents in a `DOCUMENT` command.
- ///
- /// The first line of a document includes the `DOCUMENT` keyword itself.
- Document,
-
- /// A command separator.
- ///
- /// This segment is usually for `+`, `-`, or `.` at the beginning of a line.
- StartCommand,
-
- /// A command separator.
- ///
- /// This segment is usually for a blank line. It also appears at the end of
- /// a file.
- SeparateCommands,
-
- /// A command separator.
- ///
- /// This segment is for `.` at the end of a line.
- EndCommand,
-
- /// Missing quote at the end of a line.
- ///
- /// This segment contains a partial quoted string. It starts with a quote
- /// mark (`"` or `'`, possibly preceded by `X` or `U`) but goes to the end
- /// of the line without the matching end quote mark.
- ExpectedQuote,
-
- /// Missing exponent in number.
- ///
- /// This segment contains a number that ends with `E` or `E+` or `E-`
- /// without a following exponent.
- ExpectedExponent,
-
- /// Unexpected character.
- ///
- /// The segment is a single character that isn't valid in syntax.
- UnexpectedChar,
-}
-
-bitflags! {
- #[derive(Copy, Clone, Debug)]
- struct Substate: u8 {
- const START_OF_LINE = 1;
- const START_OF_COMMAND = 2;
- }
-}
-
-/// Used by [Segmenter] to indicate that more input is needed.
-#[derive(Copy, Clone, Debug)]
-pub struct Incomplete;
-
-/// Labels syntax input with [Segment]s.
-#[derive(Copy, Clone)]
-pub struct Segmenter {
- state: (State, Substate),
- nest: u8,
- syntax: Syntax,
-}
-
-impl Segmenter {
- /// Returns a segmenter with the given `syntax`.
- ///
- /// If `is_snippet` is false, then the segmenter will parse as if it's being
- /// given a whole file. This means, for example, that it will interpret `-`
- /// or `+` at the beginning of the syntax as a separator between commands
- /// (since `-` or `+` at the beginning of a line has this meaning).
- ///
- /// If `is_snippet` is true, then the segmenter will parse as if it's being
- /// given an isolated piece of syntax. This means that, for example, that
- /// it will interpret `-` or `+` at the beginning of the syntax as an
- /// operator token or (if followed by a digit) as part of a number.
- pub fn new(syntax: Syntax, is_snippet: bool) -> Self {
- Self {
- state: if is_snippet {
- (State::General, Substate::empty())
- } else {
- (State::Shbang, Substate::empty())
- },
- syntax,
- nest: 0,
- }
- }
-
- /// Returns the [Syntax] variant passed in to [new](Self::new).
- pub fn syntax(&self) -> Syntax {
- self.syntax
- }
-
- fn start_of_line(&self) -> bool {
- self.state.1.contains(Substate::START_OF_LINE)
- }
-
- fn start_of_command(&self) -> bool {
- self.state.1.contains(Substate::START_OF_COMMAND)
- }
-
- /// Returns the style of command prompt to display to an interactive user
- /// for input in the current state.. The return value is most accurate in
- /// with [Syntax::Interactive] syntax and at the beginning of a line (that
- /// is, if [Segmenter::push] consumed as much as possible of the input up to
- /// a new-line).
- pub fn prompt(&self) -> PromptStyle {
- match self.state.0 {
- State::Shbang => PromptStyle::First,
- State::General => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::Comment1 | State::Comment2 => PromptStyle::Comment,
- State::Document1 | State::Document2 => PromptStyle::Document,
- State::Document3 => PromptStyle::First,
- State::FileLabel1 => PromptStyle::Later,
- State::FileLabel2 | State::FileLabel3 => PromptStyle::First,
- State::DoRepeat1 | State::DoRepeat2 => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::DoRepeat3 => PromptStyle::DoRepeat,
- State::DoRepeat4 => PromptStyle::DoRepeat,
- State::Define1 | State::Define2 | State::Define3 => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
- State::BeginData1 => PromptStyle::First,
- State::BeginData2 => PromptStyle::Later,
- State::BeginData3 | State::BeginData4 => PromptStyle::Data,
- }
- }
-
- fn push_rest<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- if input.is_empty() {
- if eof {
- return Ok(None);
- } else {
- return Err(Incomplete);
- };
- }
-
- match self.state.0 {
- State::Shbang => self.parse_shbang(input, eof),
- State::General => {
- if self.start_of_line() {
- self.parse_start_of_line(input, eof)
- } else {
- self.parse_mid_line(input, eof)
- }
- }
- State::Comment1 => self.parse_comment_1(input, eof),
- State::Comment2 => self.parse_comment_2(input, eof),
- State::Document1 => self.parse_document_1(input, eof),
- State::Document2 => self.parse_document_2(input, eof),
- State::Document3 => self.parse_document_3(input, eof),
- State::FileLabel1 => self.parse_file_label_1(input, eof),
- State::FileLabel2 => self.parse_file_label_2(input, eof),
- State::FileLabel3 => self.parse_file_label_3(input, eof),
- State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
- State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
- State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
- State::DoRepeat4 => self.parse_do_repeat_4(input),
- State::Define1 => self.parse_define_1_2(input, eof),
- State::Define2 => self.parse_define_1_2(input, eof),
- State::Define3 => self.parse_define_3(input, eof),
- State::Define4 => self.parse_define_4_5(input, eof),
- State::Define5 => self.parse_define_4_5(input, eof),
- State::Define6 => self.parse_define_6(input, eof),
- State::BeginData1 => self.parse_begin_data_1(input, eof),
- State::BeginData2 => self.parse_begin_data_2(input, eof),
- State::BeginData3 => self.parse_begin_data_3(input, eof),
- State::BeginData4 => self.parse_begin_data_4(input, eof),
- }
- }
-
- /// Attempts to label a prefix of the remaining input with a segment type.
- /// The caller supplies a prefix of the remaining input as `input`. If
- /// `eof` is true, then `input` is the entire (remainder) of the input; if
- /// `eof` is false, then further input is potentially available.
- ///
- /// The input may contain `\n` or `\r\n` line ends in any combination.
- ///
- /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes
- /// in the segment at the beginning of `input` (a number in
- /// `0..=input.len()`) and the type of that segment. The next call should
- /// not include those bytes in `input`, because the segmenter has
- /// (figuratively) consumed them.
- ///
- /// Segments can have zero length, including segment types
- /// [Segment::SeparateCommands], [Segment::StartDocument],
- /// [Segment::InlineData], and [Segment::Spaces].
- ///
- /// Failure occurs only if the segment type of the bytes in `input` cannot
- /// yet be determined. In this case, this function returns
- /// `Err(Incomplete)`. If more input is available, the caller should obtain
- /// some more, then call again with a longer `input`. If this is still not
- /// enough, the process might need to repeat again and again. If input is
- /// exhausted, then the caller may call again setting `eof` to true. This
- /// function will never return `Err(Incomplete)` when `eof` is true.
- ///
- /// The caller must not, in a sequence of calls, supply contradictory input.
- /// That is, bytes provided as part of `input` in one call, but not
- /// consumed, must not be provided with *different* values on subsequent
- /// calls. This is because the function must often make decisions based on
- /// looking ahead beyond the bytes that it consumes.
- pub fn push(&mut self, input: &str, eof: bool) -> Result<Option<(usize, Segment)>, Incomplete> {
- Ok(self
- .push_rest(input, eof)?
- .map(|(rest, seg_type)| (input.len() - rest.len(), seg_type)))
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-enum State {
- Shbang,
- General,
- Comment1,
- Comment2,
- Document1,
- Document2,
- Document3,
- FileLabel1,
- FileLabel2,
- FileLabel3,
- DoRepeat1,
- DoRepeat2,
- DoRepeat3,
- DoRepeat4,
- Define1,
- Define2,
- Define3,
- Define4,
- Define5,
- Define6,
- BeginData1,
- BeginData2,
- BeginData3,
- BeginData4,
-}
-
-fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
- let mut iter = input.chars();
- match iter.next() {
- None if !eof => Err(Incomplete),
- c => Ok((c, iter.as_str())),
- }
-}
-
-fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input),
- '*' => {
- if let (Some('/'), rest) = take(rest, eof)? {
- return Ok(rest);
- }
- }
- _ => (),
- };
- input = rest;
- }
-}
-
-fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
-where
- F: Fn(char) -> bool,
-{
- let input = input.trim_start_matches(f);
- if input.is_empty() && !eof {
- Err(Incomplete)
- } else {
- Ok(input)
- }
-}
-
-fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
-where
- F: Fn(char) -> bool,
-{
- if let (Some(c), rest) = take(input, eof)? {
- if f(c) {
- return Ok(Some(rest));
- }
- }
- Ok(None)
-}
-
-fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
- c if c.is_whitespace() => (),
- _ => return Ok(input),
- }
- input = rest;
- }
-}
-
-fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
- skip_matching(|c| c.is_ascii_digit(), input, eof)
-}
-
-fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '/' => {
- let (c, rest2) = take(rest, eof)?;
- match c {
- Some('*') => input = skip_comment(rest2, eof)?,
- Some(_) | None => return Ok(rest),
- }
- }
- '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
- c if c.is_whitespace() => input = rest,
- _ => return Ok(input),
- };
- }
-}
-
-fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(false);
- };
- match c {
- 'x' | 'X' | 'u' | 'U' => {
- let (c, _rest) = take(rest, eof)?;
- Ok(c == Some('\'') || c == Some('"'))
- }
- '\'' | '"' => Ok(true),
- '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true),
- _ => Ok(false),
- }
-}
-
-fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(true);
- };
- Ok(match c {
- '\n' => true,
- '\r' => take(rest, eof)?.0 == Some('\n'),
- _ => false,
- })
-}
-
-fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
- is_end_of_line(skip_spaces_and_comments(input, eof)?, eof)
-}
-
-fn first(s: &str) -> char {
- s.chars().next().unwrap()
-}
-fn get_command_name_candidates(target: &str) -> &[&'static str] {
- if target.is_empty() {
- return &[];
- }
- let target_first = first(target).to_ascii_uppercase();
- let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first);
- let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first);
- &COMMAND_NAMES[low..high]
-}
-
-fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let command_name = input
- .split(|c: char| {
- !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-')
- })
- .next()
- .unwrap();
- if !eof && command_name.len() == input.len() {
- return Err(Incomplete);
- }
- let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.');
- for command in get_command_name_candidates(command_name) {
- if let Some(m) = command_match(command, command_name) {
- if m.missing_words <= 0 {
- return Ok(true);
- }
- }
- }
- Ok(false)
-}
-
-impl Segmenter {
- fn parse_shbang<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- if let (Some('#'), rest) = take(input, eof)? {
- if let (Some('!'), rest) = take(rest, eof)? {
- let rest = self.parse_full_line(rest, eof)?;
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok(Some((rest, Segment::Shbang)));
- }
- }
-
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- self.push_rest(input, eof)
- }
- fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
- match self.syntax {
- Syntax::Auto => detect_command_name(input, eof),
- Syntax::Interactive => Ok(false),
- Syntax::Batch => Ok(true),
- }
- }
- fn parse_start_of_line<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- debug_assert_eq!(self.state.0, State::General);
- debug_assert!(self.start_of_line());
- debug_assert!(!input.is_empty());
-
- let (Some(c), rest) = take(input, eof).unwrap() else {
- unreachable!()
- };
- match c {
- '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
- // This `+` is punctuation that may separate pieces of a string.
- self.state = (State::General, Substate::empty());
- return Ok(Some((rest, Segment::Punct)));
- }
- '+' | '-' | '.' => {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok(Some((rest, Segment::StartCommand)));
- }
- _ if c.is_whitespace() => {
- if at_end_of_line(input, eof)? {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok(Some((input, Segment::SeparateCommands)));
- }
- }
- _ => {
- if self.at_command_start(input, eof)?
- && !self.state.1.contains(Substate::START_OF_COMMAND)
- {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok(Some((input, Segment::StartCommand)));
- }
- }
- }
- self.state.1 = Substate::START_OF_COMMAND;
- self.parse_mid_line(input, eof)
- }
- fn parse_mid_line<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- debug_assert!(self.state.0 == State::General);
- debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
- let (Some(c), rest) = take(input, eof)? else {
- unreachable!()
- };
- match c {
- '\r' | '\n' if is_end_of_line(input, eof)? => {
- self.state.1 |= Substate::START_OF_LINE;
- Ok(Some((
- self.parse_newline(input, eof).unwrap().unwrap(),
- Segment::Newline,
- )))
- }
- '/' => {
- if let (Some('*'), rest) = take(rest, eof)? {
- let rest = skip_comment(rest, eof)?;
- Ok(Some((rest, Segment::Comment)))
- } else {
- self.state.1 = Substate::empty();
- Ok(Some((rest, Segment::Punct)))
- }
- }
- '-' => {
- let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?;
- match c {
- Some(c) if c.is_ascii_digit() => {
- return self.parse_number(rest, eof);
- }
- Some('.') => {
- if let (Some(c), _rest) = take(rest2, eof)? {
- if c.is_ascii_digit() {
- return self.parse_number(rest, eof);
- }
- }
- }
- None | Some(_) => (),
- }
- self.state.1 = Substate::empty();
- Ok(Some((rest, Segment::Punct)))
- }
- '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
- self.state.1 = Substate::empty();
- Ok(Some((rest, Segment::Punct)))
- }
- '*' => {
- if self.state.1.contains(Substate::START_OF_COMMAND) {
- self.state = (State::Comment1, Substate::empty());
- self.parse_comment_1(input, eof)
- } else {
- self.parse_digraph(&['*'], rest, eof)
- }
- }
- '<' => self.parse_digraph(&['=', '>'], rest, eof),
- '>' => self.parse_digraph(&['='], rest, eof),
- '~' => self.parse_digraph(&['='], rest, eof),
- '.' if at_end_of_line(rest, eof)? => {
- self.state.1 = Substate::START_OF_COMMAND;
- Ok(Some((rest, Segment::EndCommand)))
- }
- '.' => match take(rest, eof)? {
- (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
- _ => Ok(Some((rest, Segment::Punct))),
- },
- '0'..='9' => self.parse_number(input, eof),
- 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof),
- 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof),
- '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof),
- '!' => {
- let (c, rest2) = take(rest, eof)?;
- match c {
- Some('*') => Ok(Some((rest2, Segment::Punct))),
- Some(_) => self.parse_id(input, eof),
- None => Ok(Some((rest, Segment::Punct))),
- }
- }
- c if c.is_whitespace() => Ok(Some((skip_spaces(rest, eof)?, Segment::Spaces))),
- c if c.may_start_id() => self.parse_id(input, eof),
- '#'..='~' if c != '\\' && c != '^' => {
- self.state.1 = Substate::empty();
- Ok(Some((rest, Segment::Punct)))
- }
- _ => {
- self.state.1 = Substate::empty();
- Ok(Some((rest, Segment::UnexpectedChar)))
- }
- }
- }
- fn parse_string<'a>(
- &mut self,
- segment: Segment,
- quote: char,
- mut input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- while let (Some(c), rest) = take(input, eof)? {
- match c {
- _ if c == quote => {
- let (c, rest2) = take(rest, eof)?;
- if c != Some(quote) {
- self.state.1 = Substate::empty();
- return Ok(Some((rest, segment)));
- }
- input = rest2;
- }
- '\r' | '\n' if is_end_of_line(input, eof)? => break,
- _ => input = rest,
- }
- }
- self.state.1 = Substate::empty();
- Ok(Some((input, Segment::ExpectedQuote)))
- }
- fn maybe_parse_string<'a>(
- &mut self,
- segment: Segment,
- input: (&'a str, &'a str),
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- match take(input.1, eof)? {
- (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof),
- _ => self.parse_id(input.0, eof),
- }
- }
- fn next_id_in_command<'a>(
- &self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, &'a str), Incomplete> {
- let mut sub = Segmenter::new(self.syntax, true);
- loop {
- let Some((seg_len, seg_type)) = sub.push(input, eof)? else {
- return Ok((input, input));
- };
- let (segment, rest) = input.split_at(seg_len);
- match seg_type {
- Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (),
-
- Segment::Identifier => return Ok((segment, rest)),
-
- Segment::Number
- | Segment::QuotedString
- | Segment::HexString
- | Segment::UnicodeString
- | Segment::UnquotedString
- | Segment::Punct
- | Segment::CommentCommand
- | Segment::DoRepeatCommand
- | Segment::DoRepeatOverflow
- | Segment::InlineData
- | Segment::MacroName
- | Segment::MacroBody
- | Segment::StartDocument
- | Segment::Document
- | Segment::StartCommand
- | Segment::SeparateCommands
- | Segment::EndCommand
- | Segment::ExpectedQuote
- | Segment::ExpectedExponent
- | Segment::UnexpectedChar => return Ok(("", rest)),
- }
- input = rest;
- }
- }
- fn parse_id<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let (Some(_), mut end) = take(input, eof).unwrap() else {
- unreachable!()
- };
- while let (Some(c), rest) = take(end, eof)? {
- if !c.may_continue_id() {
- break;
- };
- end = rest;
- }
- let identifier = &input[..input.len() - end.len()];
- let identifier = match identifier.strip_suffix('.') {
- Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
- _ => identifier,
- };
- let rest = &input[identifier.len()..];
-
- if self.state.1.contains(Substate::START_OF_COMMAND) {
- if id_match_n("COMMENT", identifier, 4) {
- self.state = (State::Comment1, Substate::empty());
- return self.parse_comment_1(input, eof);
- } else if id_match("DOCUMENT", identifier) {
- self.state = (State::Document1, Substate::empty());
- return Ok(Some((input, Segment::StartDocument)));
- } else if id_match_n("DEFINE", identifier, 6) {
- self.state = (State::Define1, Substate::empty());
- } else if id_match("FILE", identifier) {
- if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
- self.state = (State::FileLabel1, Substate::empty());
- return Ok(Some((rest, Segment::Identifier)));
- }
- } else if id_match("DO", identifier) {
- if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
- self.state = (State::DoRepeat1, Substate::empty());
- return Ok(Some((rest, Segment::Identifier)));
- }
- } else if id_match("BEGIN", identifier) {
- let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
- if id_match("DATA", next_id) {
- let rest2 = skip_spaces_and_comments(rest2, eof)?;
- let rest2 = if let Some(s) = rest2.strip_prefix('.') {
- skip_spaces_and_comments(s, eof)?
- } else {
- rest2
- };
- if is_end_of_line(rest2, eof)? {
- let s = &input[..input.len() - rest2.len()];
- self.state = (
- if s.contains('\n') {
- State::BeginData1
- } else {
- State::BeginData2
- },
- Substate::empty(),
- );
- return Ok(Some((rest, Segment::Identifier)));
- }
- }
- }
- }
-
- self.state.1 = Substate::empty();
- Ok(Some((
- rest,
- if identifier != "!" {
- Segment::Identifier
- } else {
- Segment::Punct
- },
- )))
- }
- fn parse_digraph<'a>(
- &mut self,
- seconds: &[char],
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let (c, rest) = take(input, eof)?;
- self.state.1 = Substate::empty();
- Ok(Some((
- match c {
- Some(c) if seconds.contains(&c) => rest,
- _ => input,
- },
- Segment::Punct,
- )))
- }
- fn parse_number<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let mut input = skip_digits(input, eof)?;
- if let Some(rest) = match_char(|c| c == '.', input, eof)? {
- let rest2 = skip_digits(rest, eof)?;
- if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
- input = rest2;
- }
- };
- if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
- let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
- let rest2 = skip_digits(rest, eof)?;
- if rest2.len() == rest.len() {
- self.state.1 = Substate::empty();
- return Ok(Some((rest, Segment::ExpectedExponent)));
- }
- input = rest2;
- }
- self.state.1 = Substate::empty();
- Ok(Some((input, Segment::Number)))
- }
- fn parse_comment_1<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- enum CommentState<'a> {
- Blank,
- NotBlank,
- Period(&'a str),
- }
- let mut state = CommentState::Blank;
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- // End of file.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok(Some((input, Segment::SeparateCommands)));
- };
- match c {
- '.' => state = CommentState::Period(input),
- '\n' | '\r' if is_end_of_line(input, eof)? => {
- match state {
- CommentState::Blank => {
- // Blank line ends comment command.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok(Some((input, Segment::SeparateCommands)));
- }
- CommentState::Period(period) => {
- // '.' at end of line ends comment command.
- self.state = (State::General, Substate::empty());
- return Ok(Some((period, Segment::CommentCommand)));
- }
- CommentState::NotBlank => {
- // Comment continues onto next line.
- self.state = (State::Comment2, Substate::empty());
- return Ok(Some((input, Segment::CommentCommand)));
- }
- }
- }
- c if c.is_whitespace() => (),
- _ => state = CommentState::NotBlank,
- }
- input = rest;
- }
- }
- fn parse_comment_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
-
- let new_command = match take(rest, eof)?.0 {
- Some('+') | Some('-') | Some('.') => true,
- Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?,
- None | Some(_) => false,
- };
- if new_command {
- self.state = (
- State::General,
- Substate::START_OF_LINE | Substate::START_OF_COMMAND,
- );
- } else {
- self.state = (State::Comment1, Substate::empty());
- }
- Ok(Some((rest, Segment::Newline)))
- }
- fn parse_document_1<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let mut end_cmd = false;
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- self.state = (State::Document3, Substate::empty());
- return Ok(Some((input, Segment::Document)));
- };
- match c {
- '.' => end_cmd = true,
- '\n' | '\r' if is_end_of_line(input, eof)? => {
- self.state.0 = if end_cmd {
- State::Document3
- } else {
- State::Document2
- };
- return Ok(Some((input, Segment::Document)));
- }
- c if !c.is_whitespace() => end_cmd = false,
- _ => (),
- }
- input = rest;
- }
- }
- fn parse_document_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state = (State::Document1, Substate::empty());
- Ok(Some((rest, Segment::Newline)))
- }
- fn parse_document_3<'a>(
- &mut self,
- input: &'a str,
- _eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- Ok(Some((input, Segment::EndCommand)))
- }
- fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let input = skip_spaces_and_comments(input, eof)?;
- match take(input, eof)?.0 {
- Some('\'') | Some('"') | Some('\n') => Ok(true),
- _ => Ok(false),
- }
- }
- fn parse_file_label_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let mut sub = Segmenter {
- state: (State::General, self.state.1),
- ..*self
- };
- let (rest, segment) = sub.push_rest(input, eof)?.unwrap();
- if segment == Segment::Identifier {
- let id = &input[..input.len() - rest.len()];
- debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
- if Self::quoted_file_label(rest, eof)? {
- *self = sub;
- } else {
- self.state.0 = State::FileLabel2;
- }
- } else {
- self.state.1 = sub.state.1;
- }
- Ok(Some((rest, segment)))
- }
- fn parse_file_label_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let input = skip_spaces(input, eof)?;
- self.state = (State::FileLabel3, Substate::empty());
- Ok(Some((input, Segment::Spaces)))
- }
- fn parse_file_label_3<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let mut end_cmd = None;
- loop {
- let (c, rest) = take(input, eof)?;
- match c {
- None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => {
- self.state = (State::General, Substate::empty());
- return Ok(Some((end_cmd.unwrap_or(input), Segment::UnquotedString)));
- }
- None => unreachable!(),
- Some('.') => end_cmd = Some(input),
- Some(c) if !c.is_whitespace() => end_cmd = None,
- Some(_) => (),
- }
- input = rest;
- }
- }
- fn subparse<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let mut sub = Segmenter {
- syntax: self.syntax,
- state: (State::General, self.state.1),
- nest: 0,
- };
- let result = sub.push_rest(input, eof)?;
- self.state.1 = sub.state.1;
- Ok(result)
- }
- /// We are segmenting a `DO REPEAT` command, currently reading the syntax
- /// that defines the stand-in variables (the head) before the lines of
- /// syntax to be repeated (the body).
- fn parse_do_repeat_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?.unwrap();
- if segment == Segment::SeparateCommands {
- // We reached a blank line that separates the head from the body.
- self.state.0 = State::DoRepeat2;
- } else if segment == Segment::EndCommand || segment == Segment::StartCommand {
- // We reached the body.
- self.state.0 = State::DoRepeat3;
- self.nest = 1;
- }
- Ok(Some((rest, segment)))
- }
- /// We are segmenting a `DO REPEAT` command, currently reading a blank line
- /// that separates the head from the body.
- fn parse_do_repeat_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?.unwrap();
- if segment == Segment::Newline {
- // We reached the body.
- self.state.0 = State::DoRepeat3;
- self.nest = 1;
- }
- Ok(Some((rest, segment)))
- }
- fn parse_newline<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<&'a str>, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(None);
- };
- match c {
- '\n' => Ok(Some(rest)),
- '\r' => {
- if let (Some('\n'), rest) = take(rest, eof)? {
- Ok(Some(rest))
- } else {
- Ok(None)
- }
- }
- _ => Ok(None),
- }
- }
-
- fn parse_full_line<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<&'a str, Incomplete> {
- loop {
- if is_end_of_line(input, eof)? {
- return Ok(input);
- }
- input = take(input, eof).unwrap().1;
- }
- }
- fn check_repeat_command(&mut self, input: &str, eof: bool) -> Result<isize, Incomplete> {
- let input = input.strip_prefix(['-', '+']).unwrap_or(input);
- let (id1, input) = self.next_id_in_command(input, eof)?;
- if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) {
- Ok(1)
- } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0)
- {
- Ok(-1)
- } else {
- Ok(0)
- }
- }
- /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that
- /// are to be repeated. Report each line of syntax as a single
- /// [`Type::DoRepeatCommand`].
- ///
- /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT`
- /// blocks inside the lines we're segmenting. `self.nest` counts the
- /// nesting level, starting at 1.
- fn parse_do_repeat_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- if let Some(rest) = self.parse_newline(input, eof)? {
- return Ok(Some((rest, Segment::Newline)));
- }
- let rest = self.parse_full_line(input, eof)?;
- match self.check_repeat_command(input, eof)?.cmp(&0) {
- Ordering::Greater => {
- if let Some(nest) = self.nest.checked_add(1) {
- self.nest = nest;
- } else {
- self.state.0 = State::DoRepeat4;
- }
- }
- Ordering::Less => {
- self.nest -= 1;
- if self.nest == 0 {
- // Nesting level dropped to 0, so we've finished reading the `DO
- // REPEAT` body.
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- return self.push_rest(input, eof);
- }
- }
- Ordering::Equal => (),
- }
- Ok(Some((rest, Segment::DoRepeatCommand)))
- }
- fn parse_do_repeat_4<'a>(
- &mut self,
- input: &'a str,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- self.state.0 = State::DoRepeat3;
- Ok(Some((input, Segment::DoRepeatOverflow)))
- }
- /// We are segmenting a `DEFINE` command, which consists of:
- ///
- /// - The `DEFINE` keyword.
- ///
- /// - An identifier. We transform this into `Type::MacroName` instead of
- /// `Type::Identifier` because this identifier must never be macro-expanded.
- ///
- /// - Anything but `(`.
- ///
- /// - `(` followed by a sequence of tokens possibly including balanced
- /// parentheses up to a final `)`.
- ///
- /// - A sequence of any number of lines, one string per line, ending with
- /// `!ENDDEFINE`. The first line is usually blank (that is, a newline
- /// follows the `(`). The last line usually just has `!ENDDEFINE.` on
- /// it, but it can start with other tokens. The whole
- /// DEFINE...!ENDDEFINE can be on a single line, even.
- fn parse_define_1_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?.unwrap();
- match segment {
- Segment::Identifier if self.state.0 == State::Define1 => {
- self.state.0 = State::Define2;
- return Ok(Some((rest, Segment::MacroName)));
- }
- Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
- // The DEFINE command is malformed because we reached its end
- // without ever hitting a `(` token. Transition back to general
- // parsing.
- self.state.0 = State::General;
- }
- Segment::Punct if input.starts_with('(') => {
- self.state.0 = State::Define3;
- self.nest = 1;
- }
- _ => (),
- }
- Ok(Some((rest, segment)))
- }
- fn parse_define_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?.unwrap();
- match segment {
- Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
- // The DEFINE command is malformed because we reached its end
- // without ever hitting a `(` token. Transition back to general
- // parsing.
- self.state.0 = State::General;
- }
- Segment::Punct if input.starts_with('(') => {
- self.nest += 1;
- }
- Segment::Punct if input.starts_with(')') => {
- self.nest -= 1;
- if self.nest == 0 {
- self.state = (State::Define4, Substate::empty());
- }
- }
- _ => (),
- }
- Ok(Some((rest, segment)))
- }
- fn find_enddefine(mut input: &str) -> Option<&str> {
- loop {
- input = skip_spaces_and_comments(input, true).unwrap();
- let (Some(c), rest) = take(input, true).unwrap() else {
- return None;
- };
- match c {
- '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => {
- return Some(input)
- }
- '\'' | '"' => {
- let index = rest.find(c)?;
- input = &rest[index + 1..];
- }
- _ => input = rest,
- }
- }
- }
-
- /// We are in the body of a macro definition, looking for additional lines
- /// of the body or `!ENDDEFINE`.
- ///
- /// In `State::Define4`, we're parsing the first line of the macro body (the
- /// same line as the closing parenthesis in the argument definition). In
- /// `State::Define5`, we're on a later line.
- fn parse_define_4_5<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let rest = self.parse_full_line(input, eof)?;
- let line = &input[..input.len() - rest.len()];
- if let Some(end) = Self::find_enddefine(line) {
- // Macro ends at the !ENDDEFINE on this line.
- self.state = (State::General, Substate::empty());
- let (prefix, rest) = input.split_at(line.len() - end.len());
- if prefix.is_empty() {
- // Line starts with `!ENDDEFINE`.
- self.push_rest(input, eof)
- } else if prefix.trim_start().is_empty() {
- // Line starts with spaces followed by `!ENDDEFINE`.
- Ok(Some((rest, Segment::Spaces)))
- } else {
- // Line starts with some content followed by `!ENDDEFINE`.
- Ok(Some((rest, Segment::MacroBody)))
- }
- } else {
- // No `!ENDDEFINE`. We have a full line of macro body.
- //
- // If the first line of the macro body is blank, we just report it
- // as spaces, or not at all if there are no spaces, because it's not
- // significant.
- //
- // However, if it's a later line, we need to report it because blank
- // lines can have significance.
- let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() {
- if line.is_empty() {
- return self.parse_define_6(input, eof);
- }
- Segment::Spaces
- } else {
- Segment::MacroBody
- };
- self.state.0 = State::Define6;
- Ok(Some((rest, segment)))
- }
- }
- fn parse_define_6<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state.0 = State::Define5;
- Ok(Some((rest, Segment::Newline)))
- }
- fn parse_begin_data_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?.unwrap();
- if segment == Segment::Newline {
- self.state.0 = State::BeginData2;
- }
- Ok(Some((rest, segment)))
- }
- fn parse_begin_data_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?.unwrap();
- if segment == Segment::Newline {
- self.state.0 = State::BeginData3;
- }
- Ok(Some((rest, segment)))
- }
- fn is_end_data(line: &str) -> bool {
- let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else {
- return false;
- };
- let (Some(c), rest) = take(rest, true).unwrap() else {
- return false;
- };
- if !c.is_whitespace() {
- return false;
- };
- let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else {
- return false;
- };
-
- let mut endcmd = false;
- for c in rest.chars() {
- match c {
- '.' if endcmd => return false,
- '.' => endcmd = true,
- c if c.is_whitespace() => (),
- _ => return false,
- }
- }
- true
- }
- fn parse_begin_data_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let rest = self.parse_full_line(input, eof)?;
- let line = &input[..input.len() - rest.len()];
- if Self::is_end_data(line) {
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- self.push_rest(input, eof)
- } else {
- self.state.0 = State::BeginData4;
- Ok(Some((rest, Segment::InlineData)))
- }
- }
- fn parse_begin_data_4<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<(&'a str, Segment)>, Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state.0 = State::BeginData3;
- Ok(Some((rest, Segment::Newline)))
- }
-}
-
-fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
- line.get(..pattern.len()).and_then(|prefix| {
- prefix
- .eq_ignore_ascii_case(pattern)
- .then(|| &line[pattern.len()..])
- })
-}
-
-#[cfg(test)]
-mod test;
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+#![allow(dead_code)]
+use std::{
+ borrow::Cow,
+ sync::{Arc, OnceLock},
+};
+
+use enum_map::EnumMap;
+use pivot::PivotTable;
+use serde::Serialize;
+
+use crate::{
+ message::Diagnostic,
+ output::pivot::{Axis3, BorderStyle, Dimension, Group, Look},
+};
+
+use self::pivot::Value;
+
+pub mod cairo;
+pub mod csv;
+pub mod driver;
+pub mod html;
+pub mod json;
+pub mod page;
+pub mod pivot;
+pub mod render;
+pub mod spv;
+pub mod table;
+pub mod text;
+pub mod text_line;
+
+/// A single output item.
+///
+/// An item pairs metadata common to all kinds of output (label, originating
+/// command, visibility) with kind-specific [Details].
+#[derive(Serialize)]
+pub struct Item {
+ /// The localized label for the item that appears in the outline pane in the
+ /// output viewer and in PDF outlines. This is `None` if no label has been
+ /// explicitly set.
+ label: Option<String>,
+
+ /// A locale-invariant identifier for the command that produced the output,
+ /// which may be `None` if unknown or if a command did not produce this
+ /// output.
+ command_name: Option<String>,
+
+ /// For a group item, this is true if the group's subtree should
+ /// be expanded in an outline view, false otherwise.
+ ///
+ /// For other kinds of output items, this is true to show the item's
+ /// content, false to hide it. The item's label is always shown in an
+ /// outline view.
+ show: bool,
+
+ /// Item details: the kind of item and its kind-specific content.
+ details: Details,
+}
+
+impl Item {
+ /// Creates a new item from `details`.
+ ///
+ /// The new item has no explicit label, is shown, and takes its command
+ /// name from [Details::command_name].
+ pub fn new(details: impl Into<Details>) -> Self {
+ let details = details.into();
+ Self {
+ label: None,
+ command_name: details.command_name().cloned(),
+ show: true,
+ details,
+ }
+ }
+
+ /// Returns the item's label: the explicitly set label if there is one,
+ /// otherwise the default label computed by [Details::label].
+ pub fn label(&self) -> Cow<'static, str> {
+ match &self.label {
+ // The explicit label is cloned into an owned `Cow` because the
+ // return type is `'static` and cannot borrow from `self`.
+ Some(label) => Cow::from(label.clone()),
+ None => self.details.label(),
+ }
+ }
+}
+
+/// Anything that converts into [Details] also converts into an [Item], by way
+/// of [Item::new].
+impl<T> From<T> for Item
+where
+ T: Into<Details>,
+{
+ fn from(value: T) -> Self {
+ Self::new(value)
+ }
+}
+
+/// The kind of an output [Item], together with its kind-specific content.
+#[derive(Serialize)]
+pub enum Details {
+ /// A chart. This variant carries no content.
+ Chart,
+ /// An image. This variant carries no content.
+ Image,
+ /// A group of child items.
+ Group(Vec<Arc<Item>>),
+ /// A diagnostic message.
+ Message(Box<Diagnostic>),
+ /// A page break.
+ PageBreak,
+ /// A pivot table.
+ Table(Box<PivotTable>),
+ /// A text item.
+ Text(Box<Text>),
+}
+
+impl Details {
+ /// Returns the child items if this is a [Details::Group], otherwise
+ /// `None`.
+ pub fn as_group(&self) -> Option<&[Arc<Item>]> {
+ match self {
+ Self::Group(children) => Some(children.as_slice()),
+ _ => None,
+ }
+ }
+
+ /// Returns the command name associated with these details.
+ ///
+ /// Only [Details::Table] can supply one (from the pivot table's
+ /// `command_c` field); every other kind yields `None`.
+ pub fn command_name(&self) -> Option<&String> {
+ match self {
+ Details::Chart
+ | Details::Image
+ | Details::Group(_)
+ | Details::Message(_)
+ | Details::PageBreak
+ | Details::Text(_) => None,
+ Details::Table(pivot_table) => pivot_table.command_c.as_ref(),
+ }
+ }
+
+ /// Returns the default label for an item with these details, used when
+ /// the item has no explicitly set label.
+ ///
+ /// # Panics
+ ///
+ /// Panics for [Details::Chart] and [Details::Image], whose labels are not
+ /// implemented (`todo!()`).
+ pub fn label(&self) -> Cow<'static, str> {
+ match self {
+ Details::Chart => todo!(),
+ Details::Image => todo!(),
+ Details::Group(_) => Cow::from("Group"),
+ Details::Message(diagnostic) => Cow::from(diagnostic.severity.as_title_str()),
+ Details::PageBreak => Cow::from("Page Break"),
+ Details::Table(pivot_table) => Cow::from(pivot_table.label()),
+ Details::Text(text) => Cow::from(text.type_.as_str()),
+ }
+ }
+
+ /// Returns true if this is a [Details::PageBreak].
+ pub fn is_page_break(&self) -> bool {
+ matches!(self, Self::PageBreak)
+ }
+}
+
+/// Collecting an iterator of items (anything convertible into `Arc<Item>`)
+/// produces a [Details::Group] that contains them in iteration order.
+impl<A> FromIterator<A> for Details
+where
+ A: Into<Arc<Item>>,
+{
+ fn from_iter<T>(iter: T) -> Self
+ where
+ T: IntoIterator<Item = A>,
+ {
+ Self::Group(iter.into_iter().map(|value| value.into()).collect())
+ }
+}
+
+/// Boxes the diagnostic and wraps it as [Details::Message].
+impl From<Diagnostic> for Details {
+ fn from(value: Diagnostic) -> Self {
+ Self::Message(Box::new(value))
+ }
+}
+
+/// Wraps an already boxed diagnostic as [Details::Message] without
+/// reallocating.
+impl From<Box<Diagnostic>> for Details {
+ fn from(value: Box<Diagnostic>) -> Self {
+ Self::Message(value)
+ }
+}
+
+/// Boxes the pivot table and wraps it as [Details::Table].
+impl From<PivotTable> for Details {
+ fn from(value: PivotTable) -> Self {
+ Self::Table(Box::new(value))
+ }
+}
+
+/// Wraps an already boxed pivot table as [Details::Table] without
+/// reallocating.
+impl From<Box<PivotTable>> for Details {
+ fn from(value: Box<PivotTable>) -> Self {
+ Self::Table(value)
+ }
+}
+
+/// Boxes the text item and wraps it as [Details::Text].
+impl From<Text> for Details {
+ fn from(value: Text) -> Self {
+ Self::Text(Box::new(value))
+ }
+}
+
+/// Wraps an already boxed text item as [Details::Text] without reallocating.
+impl From<Box<Text>> for Details {
+ fn from(value: Box<Text>) -> Self {
+ Self::Text(value)
+ }
+}
+
+/// A text output item: a piece of content tagged with the kind of text it
+/// represents.
+#[derive(Clone, Debug, Serialize)]
+pub struct Text {
+ /// The kind of text. This also determines the item's default label (see
+ /// [TextType::as_str]).
+ type_: TextType,
+
+ /// The text itself.
+ content: Value,
+}
+
+impl Text {
+ /// Creates a new text item of type [TextType::Log] with `value` as its
+ /// content.
+ pub fn new_log(value: impl Into<Value>) -> Self {
+ Self {
+ type_: TextType::Log,
+ content: value.into(),
+ }
+ }
+}
+
+/// Returns the [Look] applied to pivot tables created from [Text] items.
+///
+/// The look is the default one with every border set to
+/// `BorderStyle::none()` and every area's cell margins set to zero. It is
+/// constructed once, on first use, and cached in a `static`; each call returns
+/// a clone of the shared `Arc`.
+fn text_item_table_look() -> Arc<Look> {
+ static LOOK: OnceLock<Arc<Look>> = OnceLock::new();
+ LOOK.get_or_init(|| {
+ Arc::new({
+ let mut look = Look::default().with_borders(EnumMap::from_fn(|_| BorderStyle::none()));
+ for style in look.areas.values_mut() {
+ style.cell_style.margins = EnumMap::from_fn(|_| [0, 0]);
+ }
+ look
+ })
+ })
+ .clone()
+}
+
+/// Converts a text item into a pivot table that holds the text in a single
+/// cell.
+///
+/// The table has one dimension, on the Y axis, whose group "Text" has a single
+/// child labeled "null" and whose labels are all hidden. The text content is
+/// stored at data index `[0]`, the table uses the look from
+/// [text_item_table_look], and its subtype is "Text".
+impl From<Text> for PivotTable {
+ fn from(value: Text) -> Self {
+ let dimension =
+ Dimension::new(Group::new(Value::new_text("Text")).with(Value::new_user_text("null")))
+ .with_all_labels_hidden();
+ PivotTable::new([(Axis3::Y, dimension)])
+ .with_look(text_item_table_look())
+ .with_data([(&[0], value.content)])
+ .with_subtype(Value::new_user_text("Text"))
+ }
+}
+
+/// Converts a diagnostic into a [TextType::Log] text item whose content is
+/// the diagnostic formatted with `to_string()`.
+impl From<&Diagnostic> for Text {
+ fn from(value: &Diagnostic) -> Self {
+ Self::new_log(value.to_string())
+ }
+}
+
+/// The kind of a [Text] item.
+///
+/// Serialized in `snake_case`.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum TextType {
+ /// `TITLE` and `SUBTITLE` commands.
+ PageTitle,
+
+ /// Title.
+ Title,
+
+ /// Syntax printback logging.
+ Syntax,
+
+ /// Other logging.
+ Log,
+}
+
+impl TextType {
+ /// Returns the human-readable name of this text type, which is used as
+ /// the default label for [Text] items.
+ ///
+ /// [TextType::Syntax] and [TextType::Log] both map to `"Log"`.
+ pub fn as_str(&self) -> &'static str {
+ match self {
+ TextType::PageTitle => "Page Title",
+ TextType::Title => "Title",
+ TextType::Syntax => "Log",
+ TextType::Log => "Log",
+ }
+ }
+
+ /// Returns the lowercase, hyphenated name of this text type.
+ ///
+ /// [TextType::Syntax] and [TextType::Log] both map to `"log"`.
+ ///
+ /// NOTE(review): presumably this is the identifier written to XML output;
+ /// confirm against the callers.
+ pub fn as_xml_str(&self) -> &'static str {
+ match self {
+ TextType::PageTitle => "page-title",
+ TextType::Title => "title",
+ TextType::Syntax | TextType::Log => "log",
+ }
+ }
+}
+
+/// A cursor that walks a tree of [Item]s in depth-first pre-order: each item
+/// is visited before the children of any group it contains.
+pub struct ItemCursor {
+ /// The item the cursor currently points to, or `None` once the traversal
+ /// is finished.
+ cur: Option<Arc<Item>>,
+
+ /// The groups that enclose `cur`, outermost first, each paired with the
+ /// index of its next child to visit.
+ stack: Vec<(Arc<Item>, usize)>,
+}
+
+impl ItemCursor {
+ /// Creates a cursor that initially points to `start`.
+ pub fn new(start: Arc<Item>) -> Self {
+ Self {
+ cur: Some(start),
+ stack: Vec::new(),
+ }
+ }
+
+ /// Returns the item the cursor currently points to, or `None` if the
+ /// traversal has finished.
+ pub fn cur(&self) -> Option<&Arc<Item>> {
+ self.cur.as_ref()
+ }
+
+ /// Advances the cursor to the next item in pre-order.
+ ///
+ /// If the current item is a non-empty group, the cursor descends to its
+ /// first child. Otherwise it moves to the next unvisited child of the
+ /// nearest enclosing group that still has one. When no such group exists,
+ /// the cursor becomes empty and further calls do nothing.
+ pub fn next(&mut self) {
+ let Some(cur) = self.cur.take() else {
+ return;
+ };
+ match cur.details {
+ Details::Group(ref children) if !children.is_empty() => {
+ // Descend: visit the first child now and remember that child 1
+ // is the next one to visit in this group.
+ self.cur = Some(children[0].clone());
+ self.stack.push((cur, 1));
+ }
+ _ => {
+ // Leaf or empty group: pop exhausted groups until one has a
+ // child left to visit. If the stack runs out, `self.cur`
+ // stays `None`.
+ while let Some((item, index)) = self.stack.pop() {
+ // Only groups are ever pushed onto the stack, so this
+ // cannot fail.
+ let children = item.details.as_group().unwrap();
+ if index < children.len() {
+ self.cur = Some(children[index].clone());
+ self.stack.push((item, index + 1));
+ return;
+ }
+ }
+ }
+ }
+ }
+}
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+use pango::SCALE;
+
+use crate::output::pivot::HorzAlign;
+
+mod driver;
+pub mod fsm;
+pub mod pager;
+
+pub use driver::{CairoConfig, CairoDriver};
+
+/// Conversion from 1/96" units ("pixels") to Cairo/Pango units.
+///
+/// The factor is `SCALE * 72 / 96`, computed in integer arithmetic.
+///
+/// NOTE(review): the `* 3 ... / 3` pair cancels out exactly (barring
+/// overflow), so it looks like a leftover from an earlier formulation; confirm
+/// before simplifying.
+fn px_to_xr(x: usize) -> usize {
+ x * 3 * (SCALE as usize * 72 / 96) / 3
+}
+
+/// Conversion from Cairo/Pango units to points, by dividing by [SCALE] in
+/// floating point.
+fn xr_to_pt(x: usize) -> f64 {
+ x as f64 / SCALE as f64
+}
+
+/// Maps a pivot table horizontal alignment to the corresponding Pango
+/// alignment.
+///
+/// Pango has no decimal alignment here, so [HorzAlign::Decimal] is mapped to
+/// right alignment, the same as [HorzAlign::Right].
+fn horz_align_to_pango(horz_align: HorzAlign) -> pango::Alignment {
+ match horz_align {
+ HorzAlign::Right | HorzAlign::Decimal { .. } => pango::Alignment::Right,
+ HorzAlign::Left => pango::Alignment::Left,
+ HorzAlign::Center => pango::Alignment::Center,
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use crate::output::cairo::{CairoConfig, CairoDriver};
+
+ /// Smoke test: constructing a [CairoDriver] from a [CairoConfig] created
+ /// with `"test.pdf"` must succeed.
+ ///
+ /// NOTE(review): presumably `"test.pdf"` is the output file name, in which
+ /// case this test may create that file in the working directory; confirm.
+ #[test]
+ fn create() {
+ CairoDriver::new(&CairoConfig::new("test.pdf")).unwrap();
+ }
+}
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-use pango::SCALE;
-
-use crate::output::pivot::HorzAlign;
-
-mod driver;
-pub mod fsm;
-pub mod pager;
-
-pub use driver::{CairoConfig, CairoDriver};
-
-/// Conversion from 1/96" units ("pixels") to Cairo/Pango units.
-fn px_to_xr(x: usize) -> usize {
- x * 3 * (SCALE as usize * 72 / 96) / 3
-}
-
-fn xr_to_pt(x: usize) -> f64 {
- x as f64 / SCALE as f64
-}
-
-fn horz_align_to_pango(horz_align: HorzAlign) -> pango::Alignment {
- match horz_align {
- HorzAlign::Right | HorzAlign::Decimal { .. } => pango::Alignment::Right,
- HorzAlign::Left => pango::Alignment::Left,
- HorzAlign::Center => pango::Alignment::Center,
- }
-}
-
-#[cfg(test)]
-mod test {
- use crate::output::cairo::{CairoConfig, CairoDriver};
-
- #[test]
- fn create() {
- CairoDriver::new(&CairoConfig::new("test.pdf")).unwrap();
- }
-}
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-#![allow(dead_code)]
-use std::{
- borrow::Cow,
- sync::{Arc, OnceLock},
-};
-
-use enum_map::EnumMap;
-use pivot::PivotTable;
-use serde::Serialize;
-
-use crate::{
- message::Diagnostic,
- output::pivot::{Axis3, BorderStyle, Dimension, Group, Look},
-};
-
-use self::pivot::Value;
-
-pub mod cairo;
-pub mod csv;
-pub mod driver;
-pub mod html;
-pub mod json;
-pub mod page;
-pub mod pivot;
-pub mod render;
-pub mod spv;
-pub mod table;
-pub mod text;
-pub mod text_line;
-
-/// A single output item.
-#[derive(Serialize)]
-pub struct Item {
- /// The localized label for the item that appears in the outline pane in the
- /// output viewer and in PDF outlines. This is `None` if no label has been
- /// explicitly set.
- label: Option<String>,
-
- /// A locale-invariant identifier for the command that produced the output,
- /// which may be `None` if unknown or if a command did not produce this
- /// output.
- command_name: Option<String>,
-
- /// For a group item, this is true if the group's subtree should
- /// be expanded in an outline view, false otherwise.
- ///
- /// For other kinds of output items, this is true to show the item's
- /// content, false to hide it. The item's label is always shown in an
- /// outline view.
- show: bool,
-
- /// Item details.
- details: Details,
-}
-
-impl Item {
- pub fn new(details: impl Into<Details>) -> Self {
- let details = details.into();
- Self {
- label: None,
- command_name: details.command_name().cloned(),
- show: true,
- details,
- }
- }
-
- pub fn label(&self) -> Cow<'static, str> {
- match &self.label {
- Some(label) => Cow::from(label.clone()),
- None => self.details.label(),
- }
- }
-}
-
-impl<T> From<T> for Item
-where
- T: Into<Details>,
-{
- fn from(value: T) -> Self {
- Self::new(value)
- }
-}
-
-#[derive(Serialize)]
-pub enum Details {
- Chart,
- Image,
- Group(Vec<Arc<Item>>),
- Message(Box<Diagnostic>),
- PageBreak,
- Table(Box<PivotTable>),
- Text(Box<Text>),
-}
-
-impl Details {
- pub fn as_group(&self) -> Option<&[Arc<Item>]> {
- match self {
- Self::Group(children) => Some(children.as_slice()),
- _ => None,
- }
- }
-
- pub fn command_name(&self) -> Option<&String> {
- match self {
- Details::Chart
- | Details::Image
- | Details::Group(_)
- | Details::Message(_)
- | Details::PageBreak
- | Details::Text(_) => None,
- Details::Table(pivot_table) => pivot_table.command_c.as_ref(),
- }
- }
-
- pub fn label(&self) -> Cow<'static, str> {
- match self {
- Details::Chart => todo!(),
- Details::Image => todo!(),
- Details::Group(_) => Cow::from("Group"),
- Details::Message(diagnostic) => Cow::from(diagnostic.severity.as_title_str()),
- Details::PageBreak => Cow::from("Page Break"),
- Details::Table(pivot_table) => Cow::from(pivot_table.label()),
- Details::Text(text) => Cow::from(text.type_.as_str()),
- }
- }
-
- pub fn is_page_break(&self) -> bool {
- matches!(self, Self::PageBreak)
- }
-}
-
-impl<A> FromIterator<A> for Details
-where
- A: Into<Arc<Item>>,
-{
- fn from_iter<T>(iter: T) -> Self
- where
- T: IntoIterator<Item = A>,
- {
- Self::Group(iter.into_iter().map(|value| value.into()).collect())
- }
-}
-
-impl From<Diagnostic> for Details {
- fn from(value: Diagnostic) -> Self {
- Self::Message(Box::new(value))
- }
-}
-
-impl From<Box<Diagnostic>> for Details {
- fn from(value: Box<Diagnostic>) -> Self {
- Self::Message(value)
- }
-}
-
-impl From<PivotTable> for Details {
- fn from(value: PivotTable) -> Self {
- Self::Table(Box::new(value))
- }
-}
-
-impl From<Box<PivotTable>> for Details {
- fn from(value: Box<PivotTable>) -> Self {
- Self::Table(value)
- }
-}
-
-impl From<Text> for Details {
- fn from(value: Text) -> Self {
- Self::Text(Box::new(value))
- }
-}
-
-impl From<Box<Text>> for Details {
- fn from(value: Box<Text>) -> Self {
- Self::Text(value)
- }
-}
-
-#[derive(Clone, Debug, Serialize)]
-pub struct Text {
- type_: TextType,
-
- content: Value,
-}
-
-impl Text {
- pub fn new_log(value: impl Into<Value>) -> Self {
- Self {
- type_: TextType::Log,
- content: value.into(),
- }
- }
-}
-
-fn text_item_table_look() -> Arc<Look> {
- static LOOK: OnceLock<Arc<Look>> = OnceLock::new();
- LOOK.get_or_init(|| {
- Arc::new({
- let mut look = Look::default().with_borders(EnumMap::from_fn(|_| BorderStyle::none()));
- for style in look.areas.values_mut() {
- style.cell_style.margins = EnumMap::from_fn(|_| [0, 0]);
- }
- look
- })
- })
- .clone()
-}
-
-impl From<Text> for PivotTable {
- fn from(value: Text) -> Self {
- let dimension =
- Dimension::new(Group::new(Value::new_text("Text")).with(Value::new_user_text("null")))
- .with_all_labels_hidden();
- PivotTable::new([(Axis3::Y, dimension)])
- .with_look(text_item_table_look())
- .with_data([(&[0], value.content)])
- .with_subtype(Value::new_user_text("Text"))
- }
-}
-
-impl From<&Diagnostic> for Text {
- fn from(value: &Diagnostic) -> Self {
- Self::new_log(value.to_string())
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub enum TextType {
- /// `TITLE` and `SUBTITLE` commands.
- PageTitle,
-
- /// Title,
- Title,
-
- /// Syntax printback logging.
- Syntax,
-
- /// Other logging.
- Log,
-}
-
-impl TextType {
- pub fn as_str(&self) -> &'static str {
- match self {
- TextType::PageTitle => "Page Title",
- TextType::Title => "Title",
- TextType::Syntax => "Log",
- TextType::Log => "Log",
- }
- }
-
- pub fn as_xml_str(&self) -> &'static str {
- match self {
- TextType::PageTitle => "page-title",
- TextType::Title => "title",
- TextType::Syntax | TextType::Log => "log",
- }
- }
-}
-
-pub struct ItemCursor {
- cur: Option<Arc<Item>>,
- stack: Vec<(Arc<Item>, usize)>,
-}
-
-impl ItemCursor {
- pub fn new(start: Arc<Item>) -> Self {
- Self {
- cur: Some(start),
- stack: Vec::new(),
- }
- }
-
- pub fn cur(&self) -> Option<&Arc<Item>> {
- self.cur.as_ref()
- }
-
- pub fn next(&mut self) {
- let Some(cur) = self.cur.take() else {
- return;
- };
- match cur.details {
- Details::Group(ref children) if !children.is_empty() => {
- self.cur = Some(children[0].clone());
- self.stack.push((cur, 1));
- }
- _ => {
- while let Some((item, index)) = self.stack.pop() {
- let children = item.details.as_group().unwrap();
- if index < children.len() {
- self.cur = Some(children[index].clone());
- self.stack.push((item, index + 1));
- return;
- }
- }
- }
- }
- }
-}
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Pivot tables.
+//!
+//! Pivot tables are PSPP's primary form of output. They are analogous to the
+//! pivot tables you might be familiar with from spreadsheets and databases.
+//! See <https://en.wikipedia.org/wiki/Pivot_table> for a brief introduction to
+//! the overall concept of a pivot table.
+//!
+//! In PSPP, the most important internal pieces of a pivot table are:
+//!
+//! - Title. Every pivot table has a title that is displayed above it. It also
+//! has an optional caption (displayed below it) and corner text (displayed in
+//! the upper left corner).
+//!
+//! - Dimensions. A dimension consists of zero or more categories. A category
+//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The
+//! categories are the leaves of a tree whose non-leaf nodes form groups of
+//! categories. The tree always has a root group whose label is the name of
+//! the dimension.
+//!
+//! - Axes. A table has three axes: column, row, and layer. Each dimension is
+//! assigned to an axis, and each axis has zero or more dimensions. When an
+//! axis has more than one dimension, they are ordered from innermost to
+//! outermost.
+//!
+//! - Data. A table's data consists of zero or more cells. Each cell maps from
+//! a category for each dimension to a value, which is commonly a number but
+//! could also be a variable name or an arbitrary text string.
+
+use std::{
+ collections::HashMap,
+ fmt::{Debug, Display, Write},
+ io::Read,
+ iter::{once, repeat, repeat_n, FusedIterator},
+ ops::{Index, IndexMut, Not, Range, RangeInclusive},
+ str::{from_utf8, FromStr, Utf8Error},
+ sync::{Arc, OnceLock},
+};
+
+use binrw::Error as BinError;
+use chrono::NaiveDateTime;
+pub use color::ParseError as ParseColorError;
+use color::{palette::css::TRANSPARENT, AlphaColor, Rgba8, Srgb};
+use enum_iterator::Sequence;
+use enum_map::{enum_map, Enum, EnumMap};
+use look_xml::TableProperties;
+use quick_xml::{de::from_str, DeError};
+use serde::{
+ de::Visitor,
+ ser::{SerializeMap, SerializeStruct},
+ Deserialize, Serialize, Serializer,
+};
+use smallstr::SmallString;
+use smallvec::SmallVec;
+use thiserror::Error as ThisError;
+use tlo::parse_tlo;
+
+use crate::{
+ data::{ByteString, Datum, EncodedString, RawString},
+ format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat},
+ settings::{Settings, Show},
+ util::ToSmallString,
+ variable::{VarType, Variable},
+};
+
+pub mod output;
+
+mod look_xml;
+#[cfg(test)]
+pub mod test;
+mod tlo;
+
+/// Areas of a pivot table for styling purposes.
+///
+/// The default area is [Area::Data].
+#[derive(Copy, Clone, Debug, Default, Enum, PartialEq, Eq)]
+pub enum Area {
+ /// Title.
+ Title,
+
+ /// Caption.
+ Caption,
+
+ /// Footnotes.
+ Footer,
+
+ /// Top-left corner.
+ Corner,
+
+ /// Labels for columns ([Axis2::X]) and rows ([Axis2::Y]).
+ Labels(Axis2),
+
+ /// Data cells.
+ #[default]
+ Data,
+
+ /// Layer indication.
+ Layers,
+}
+
+/// Formats the area as a lowercase name, e.g. `title` or `labels(AXIS)`,
+/// where `AXIS` is the `Display` form of the [Axis2].
+impl Display for Area {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Area::Title => write!(f, "title"),
+ Area::Caption => write!(f, "caption"),
+ Area::Footer => write!(f, "footer"),
+ Area::Corner => write!(f, "corner"),
+ Area::Labels(axis2) => write!(f, "labels({axis2})"),
+ Area::Data => write!(f, "data"),
+ Area::Layers => write!(f, "layers"),
+ }
+ }
+}
+
+/// Serializes the area as a string.
+///
+/// NOTE(review): `to_small_string` presumably renders the `Display` form into
+/// a small string with 16 bytes of inline capacity; confirm in
+/// `util::ToSmallString`.
+impl Serialize for Area {
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+ where
+ S: serde::Serializer,
+ {
+ serializer.serialize_str(&self.to_small_string::<16>())
+ }
+}
+
+impl Area {
+ /// Returns the default cell style for this area.
+ ///
+ /// Each area has its own default horizontal alignment (`None` for
+ /// [Area::Data]), vertical alignment, and pair of margins along each of
+ /// the X and Y axes.
+ fn default_cell_style(self) -> CellStyle {
+ use HorzAlign::*;
+ use VertAlign::*;
+ // Columns: horizontal alignment, vertical alignment, margins along
+ // the X axis, margins along the Y axis.
+ let (horz_align, vert_align, hmargins, vmargins) = match self {
+ Area::Title => (Some(Center), Middle, [8, 11], [1, 8]),
+ Area::Caption => (Some(Left), Top, [8, 11], [1, 1]),
+ Area::Footer => (Some(Left), Top, [11, 8], [2, 3]),
+ Area::Corner => (Some(Left), Bottom, [8, 11], [1, 1]),
+ Area::Labels(Axis2::X) => (Some(Center), Top, [8, 11], [1, 3]),
+ Area::Labels(Axis2::Y) => (Some(Left), Top, [8, 11], [1, 3]),
+ Area::Data => (None, Top, [8, 11], [1, 1]),
+ Area::Layers => (Some(Left), Bottom, [8, 11], [1, 3]),
+ };
+ CellStyle {
+ horz_align,
+ vert_align,
+ margins: enum_map! { Axis2::X => hmargins, Axis2::Y => vmargins },
+ }
+ }
+
+ /// Returns the default font style for this area: size-9 "Sans Serif",
+ /// black on white, with no italic, underline, or markup. Only
+ /// [Area::Title] is bold.
+ fn default_font_style(self) -> FontStyle {
+ FontStyle {
+ bold: self == Area::Title,
+ italic: false,
+ underline: false,
+ markup: false,
+ font: String::from("Sans Serif"),
+ fg: [Color::BLACK; 2],
+ bg: [Color::WHITE; 2],
+ size: 9,
+ }
+ }
+
+ /// Returns the default area style for this area, which combines
+ /// [Area::default_cell_style] and [Area::default_font_style].
+ fn default_area_style(self) -> AreaStyle {
+ AreaStyle {
+ cell_style: self.default_cell_style(),
+ font_style: self.default_font_style(),
+ }
+ }
+}
+
+/// Table borders for styling purposes.
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)]
+pub enum Border {
+ /// Title border.
+ Title,
+ /// One side of the outer frame.
+ OuterFrame(BoxBorder),
+ /// One side of the inner frame.
+ InnerFrame(BoxBorder),
+ /// A dimension border, for the given heading region and axis.
+ Dimension(RowColBorder),
+ /// A category border, for the given heading region and axis.
+ Category(RowColBorder),
+ /// Border at the left of the data area.
+ DataLeft,
+ /// Border at the top of the data area.
+ DataTop,
+}
+
+impl Border {
+ /// Returns the default stroke for this border.
+ ///
+ /// Inner frame and data borders are thick. Dimension borders in column
+ /// headings or along [Axis2::X], and category borders in column headings,
+ /// are solid. All other borders have no stroke.
+ pub fn default_stroke(self) -> Stroke {
+ match self {
+ Self::InnerFrame(_) | Self::DataLeft | Self::DataTop => Stroke::Thick,
+ Self::Dimension(
+ RowColBorder(HeadingRegion::Columns, _) | RowColBorder(_, Axis2::X),
+ )
+ | Self::Category(RowColBorder(HeadingRegion::Columns, _)) => Stroke::Solid,
+ _ => Stroke::None,
+ }
+ }
+ /// Returns the default border style for this border: the stroke from
+ /// [Border::default_stroke], in black.
+ pub fn default_border_style(self) -> BorderStyle {
+ BorderStyle {
+ stroke: self.default_stroke(),
+ color: Color::BLACK,
+ }
+ }
+
+ /// Returns the border to fall back to for this border.
+ ///
+ /// A dimension border falls back to the category border with the same
+ /// [RowColBorder]; every other border falls back to itself.
+ ///
+ /// NOTE(review): presumably used when a dimension border has no style of
+ /// its own; confirm against the callers.
+ fn fallback(self) -> Self {
+ match self {
+ Self::Title
+ | Self::OuterFrame(_)
+ | Self::InnerFrame(_)
+ | Self::DataLeft
+ | Self::DataTop
+ | Self::Category(_) => self,
+ Self::Dimension(row_col_border) => Self::Category(row_col_border),
+ }
+ }
+}
+
+/// Formats the border as a lowercase name, with the contained [BoxBorder] or
+/// [RowColBorder] (if any) in parentheses, e.g. `outer_frame(left)`.
+impl Display for Border {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Border::Title => write!(f, "title"),
+ Border::OuterFrame(box_border) => write!(f, "outer_frame({box_border})"),
+ Border::InnerFrame(box_border) => write!(f, "inner_frame({box_border})"),
+ Border::Dimension(row_col_border) => write!(f, "dimension({row_col_border})"),
+ Border::Category(row_col_border) => write!(f, "category({row_col_border})"),
+ Border::DataLeft => write!(f, "data(left)"),
+ Border::DataTop => write!(f, "data(top)"),
+ }
+ }
+}
+
+/// Serializes the border as a string.
+///
+/// NOTE(review): `to_small_string` presumably renders the `Display` form into
+/// a small string with 32 bytes of inline capacity; confirm in
+/// `util::ToSmallString`.
+impl Serialize for Border {
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+ where
+ S: serde::Serializer,
+ {
+ serializer.serialize_str(&self.to_small_string::<32>())
+ }
+}
+
+/// The borders on a box.
+///
+/// Serialized in `snake_case`.
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum BoxBorder {
+ /// Left side.
+ Left,
+ /// Top side.
+ Top,
+ /// Right side.
+ Right,
+ /// Bottom side.
+ Bottom,
+}
+
+impl BoxBorder {
+ /// Returns the lowercase name of this side: `"left"`, `"top"`, `"right"`,
+ /// or `"bottom"`.
+ fn as_str(&self) -> &'static str {
+ match self {
+ BoxBorder::Left => "left",
+ BoxBorder::Top => "top",
+ BoxBorder::Right => "right",
+ BoxBorder::Bottom => "bottom",
+ }
+ }
+}
+
+/// Formats the side as its lowercase name (see `BoxBorder::as_str`).
+impl Display for BoxBorder {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.write_str(self.as_str())
+ }
+}
+
+/// Borders between rows and columns.
+///
+/// A border is identified by the heading region it belongs to and by its
+/// orientation.
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub struct RowColBorder(
+ /// Row or column headings.
+ pub HeadingRegion,
+ /// Horizontal ([Axis2::X]) or vertical ([Axis2::Y]) borders.
+ pub Axis2,
+);
+
+/// Formats the border as `REGION:AXIS`, using the `Display` forms of the
+/// heading region and the axis.
+impl Display for RowColBorder {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{}:{}", self.0, self.1)
+ }
+}
+
+/// Sizing for rows or columns of a rendered table.
+///
+/// The comments below talk about columns and their widths but they apply
+/// equally to rows and their heights.
+///
+/// The default value has no widths, breaks, or keeps.
+#[derive(Default, Clone, Debug, Serialize)]
+pub struct Sizing {
+ /// Specific column widths, in 1/96" units.
+ widths: Vec<i32>,
+
+ /// Specific page breaks: 0-based columns after which a page break must
+ /// occur, e.g. a value of 1 requests a break after the second column.
+ breaks: Vec<usize>,
+
+ /// Keeps: columns to keep together on a page if possible.
+ ///
+ /// NOTE(review): each range is presumably a range of 0-based column
+ /// indexes; confirm against the renderer.
+ keeps: Vec<Range<usize>>,
+}
+
+/// One of the three axes of a pivot table.
+///
+/// Serialized in `snake_case`.
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Sequence, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Axis3 {
+ /// The column axis (corresponds to [Axis2::X]).
+ X,
+ /// The row axis (corresponds to [Axis2::Y]).
+ Y,
+ /// The third axis, which has no [Axis2] counterpart; presumably the layer
+ /// axis.
+ Z,
+}
+
+impl Axis3 {
+ /// Returns the axis that this one becomes when X and Y are exchanged:
+ /// X maps to Y and Y maps to X. Returns `None` for Z, which has no
+ /// transpose.
+ fn transpose(&self) -> Option<Self> {
+ match self {
+ Axis3::X => Some(Axis3::Y),
+ Axis3::Y => Some(Axis3::X),
+ Axis3::Z => None,
+ }
+ }
+}
+
+/// Converts a two-dimensional axis into the three-dimensional axis of the
+/// same name.
+impl From<Axis2> for Axis3 {
+ fn from(axis2: Axis2) -> Self {
+ match axis2 {
+ Axis2::X => Self::X,
+ Axis2::Y => Self::Y,
+ }
+ }
+}
+
+/// An axis within a pivot table.
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct Axis {
+ /// The dimensions assigned to this axis, presumably as indexes into the
+ /// pivot table's list of dimensions (TODO confirm).
+ ///
+ /// `dimensions[0]` is the innermost dimension.
+ pub dimensions: Vec<usize>,
+}
+
+/// An iterator over every combination of category indexes for the dimensions
+/// along one axis, with the first dimension's index varying fastest.
+pub struct AxisIterator {
+ /// The combination of indexes that the next call to `next` will yield.
+ indexes: SmallVec<[usize; 4]>,
+
+ /// The number of categories in each dimension, parallel to `indexes`.
+ lengths: SmallVec<[usize; 4]>,
+
+ /// True once every combination has been yielded.
+ done: bool,
+}
+
+// Once `done` is set, `next` keeps returning `None`, so the iterator is
+// fused.
+impl FusedIterator for AxisIterator {}
+impl Iterator for AxisIterator {
+ type Item = SmallVec<[usize; 4]>;
+
+ /// Yields the current combination of indexes and then advances to the
+ /// following one, like an odometer whose first digit turns fastest.
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.done {
+ None
+ } else {
+ let retval = self.indexes.clone();
+ for (index, len) in self.indexes.iter_mut().zip(self.lengths.iter().copied()) {
+ // Increment this position; if it does not overflow, we are
+ // done advancing. Otherwise reset it and carry into the next
+ // position.
+ *index += 1;
+ if *index < len {
+ return Some(retval);
+ };
+ *index = 0;
+ }
+ // Every position wrapped around (or there are no positions), so
+ // `retval` is the last combination.
+ self.done = true;
+ Some(retval)
+ }
+ }
+}
+
+impl PivotTable {
+ /// Returns this pivot table with its look replaced by `look`.
+ pub fn with_look(mut self, look: Arc<Look>) -> Self {
+ self.look = look;
+ self
+ }
+ /// Inserts `number` as a numeric value at `data_indexes`.
+ ///
+ /// The display format is chosen by `class`: [Class::Other] uses the
+ /// default format from the global settings, and each other class uses a
+ /// fixed format. Only [Class::Other] values have `honor_small` set.
+ pub fn insert_number(&mut self, data_indexes: &[usize], number: Option<f64>, class: Class) {
+ let format = match class {
+ Class::Other => Settings::global().default_format,
+ Class::Integer => Format::F40,
+ Class::Correlations => Format::F40_3,
+ Class::Significance => Format::F40_3,
+ Class::Percent => Format::PCT40_1,
+ Class::Residual => Format::F40_2,
+ Class::Count => Format::F40, // XXX
+ };
+ let value = Value::new(ValueInner::Number(NumberValue {
+ show: None,
+ format,
+ honor_small: class == Class::Other,
+ value: number,
+ variable: None,
+ value_label: None,
+ }));
+ self.insert(data_indexes, value);
+ }
+
+ /// Returns this pivot table with its footnotes replaced by `footnotes`.
+ ///
+ /// In debug builds, this asserts that the table did not already have any
+ /// footnotes.
+ pub fn with_footnotes(mut self, footnotes: Footnotes) -> Self {
+ debug_assert!(self.footnotes.is_empty());
+ self.footnotes = footnotes;
+ self
+ }
+ /// Returns an iterator over every combination of category indexes for
+ /// the dimensions on `axis`, starting from all zeros.
+ ///
+ /// The iterator yields nothing if any dimension on `axis` has no
+ /// categories. If `axis` has no dimensions, it yields a single empty
+ /// combination.
+ fn axis_values(&self, axis: Axis3) -> AxisIterator {
+ AxisIterator {
+ indexes: repeat_n(0, self.axes[axis].dimensions.len()).collect(),
+ lengths: self.axis_dimensions(axis).map(|d| d.len()).collect(),
+ done: self.axis_extent(axis) == 0,
+ }
+ }
+
+ /// Returns the product of the lengths of the dimensions on `axis`, which
+ /// is 1 if `axis` has no dimensions.
+ fn axis_extent(&self, axis: Axis3) -> usize {
+ self.axis_dimensions(axis).map(|d| d.len()).product()
+ }
+}
+
+/// Dimensions.
+///
+/// A [Dimension] identifies the categories associated with a single dimension
+/// within a multidimensional pivot table.
+///
+/// A dimension contains a collection of categories, which are the leaves in a
+/// tree of groups.
+///
+/// (A dimension or a group can contain zero categories, but this is unusual.
+/// If a dimension contains no categories, then its table cannot contain any
+/// data.)
+#[derive(Clone, Debug, Serialize)]
+pub struct Dimension {
+ /// Hierarchy of categories within the dimension. The groups and categories
+ /// are sorted in the order that should be used for display. This might be
+ /// different from the original order produced for output if the user
+ /// adjusted it.
+ ///
+ /// The root must always be a group, although it is allowed to have no
+ /// subcategories.
+ pub root: Group,
+
+ /// Ordering of leaves for presentation.
+ ///
+ /// This is a permutation of `0..n` where `n` is the number of leaves. It
+ /// maps from an index in presentation order to an index in data order.
+ pub presentation_order: Vec<usize>,
+
+ /// Display: whether to hide all of the dimension's labels. This is false
+ /// for a new dimension and set by [Dimension::with_all_labels_hidden].
+ pub hide_all_labels: bool,
+}
+
+/// A short, usually inline-allocated list of references to groups.
+pub type GroupVec<'a> = SmallVec<[&'a Group; 4]>;
+
+/// A leaf along with the groups that were descended through to reach it.
+///
+/// [Group::leaf_path] pushes each group onto `groups` before descending into
+/// the child that holds the leaf, so the outermost group comes first.
+pub struct Path<'a> {
+    groups: GroupVec<'a>,
+    leaf: &'a Leaf,
+}
+
+impl Dimension {
+    /// Creates a dimension whose category tree is `root`.
+    ///
+    /// The leaves are initially presented in data order and labels are not
+    /// hidden.
+    pub fn new(root: Group) -> Self {
+        let n_leaves = root.len();
+        Self {
+            root,
+            presentation_order: (0..n_leaves).collect(),
+            hide_all_labels: false,
+        }
+    }
+
+    /// Returns true if this dimension has no (leaf) categories.
+    pub fn is_empty(&self) -> bool {
+        self.root.is_empty()
+    }
+
+    /// Returns the number of (leaf) categories in this dimension.
+    pub fn len(&self) -> usize {
+        self.root.len()
+    }
+
+    /// Returns the leaf at position `index` in the category tree, or `None`
+    /// if there are not that many leaves.
+    pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> {
+        self.root.nth_leaf(index)
+    }
+
+    /// Returns the leaf at position `index` together with the groups that
+    /// enclose it, or `None` if there are not that many leaves.
+    pub fn leaf_path(&self, index: usize) -> Option<Path<'_>> {
+        let enclosing_groups = SmallVec::new();
+        self.root.leaf_path(index, enclosing_groups)
+    }
+
+    /// Returns this dimension with `hide_all_labels` turned on.
+    pub fn with_all_labels_hidden(mut self) -> Self {
+        self.hide_all_labels = true;
+        self
+    }
+}
+
+/// An interior node in a dimension's category tree.
+#[derive(Clone, Debug, Serialize)]
+pub struct Group {
+    /// Number of leaves below this group, counting those in nested groups.
+    /// [Group::push] keeps it up to date.  It is not serialized.
+    #[serde(skip)]
+    len: usize,
+
+    /// The group's name.
+    pub name: Box<Value>,
+
+    /// The child categories.
+    ///
+    /// Most groups have several children, but a group may have just one or
+    /// even (pathologically) none at all.
+    pub children: Vec<Category>,
+
+    /// Whether to show the group's label.
+    pub show_label: bool,
+}
+
+impl Group {
+    /// Creates an empty group named `name`, with its label not shown.
+    pub fn new(name: impl Into<Value>) -> Self {
+        Self::with_capacity(name, 0)
+    }
+
+    /// Creates an empty group named `name`, preallocating room for
+    /// `capacity` children.
+    pub fn with_capacity(name: impl Into<Value>, capacity: usize) -> Self {
+        let children = Vec::with_capacity(capacity);
+        Self {
+            len: 0,
+            name: Box::new(name.into()),
+            children,
+            show_label: false,
+        }
+    }
+
+    /// Appends `child` to this group.
+    ///
+    /// If `child` is itself a group, its label is switched on as it is
+    /// added.  The cached leaf count grows by the number of leaves in
+    /// `child`.
+    pub fn push(&mut self, child: impl Into<Category>) {
+        let child = match child.into() {
+            Category::Group(subgroup) => Category::Group(subgroup.with_label_shown()),
+            leaf @ Category::Leaf(_) => leaf,
+        };
+        self.len += child.len();
+        self.children.push(child);
+    }
+
+    /// Builder form of [Group::push].
+    pub fn with(mut self, child: impl Into<Category>) -> Self {
+        self.push(child);
+        self
+    }
+
+    /// Builder form of `extend`: appends each of `children` in order.
+    pub fn with_multiple<C>(mut self, children: impl IntoIterator<Item = C>) -> Self
+    where
+        C: Into<Category>,
+    {
+        self.extend(children);
+        self
+    }
+
+    /// Returns this group with its label shown.
+    pub fn with_label_shown(self) -> Self {
+        self.with_show_label(true)
+    }
+
+    /// Returns this group with `show_label` set as given.
+    pub fn with_show_label(self, show_label: bool) -> Self {
+        Self { show_label, ..self }
+    }
+
+    /// Returns the `index`th leaf under this group, counting leaves in child
+    /// order and descending into nested groups, or `None` if there are not
+    /// that many.
+    pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> {
+        let mut remaining = index;
+        for child in &self.children {
+            // `checked_sub` fails exactly when the wanted leaf is inside
+            // `child`; otherwise skip over all of `child`'s leaves.
+            match remaining.checked_sub(child.len()) {
+                Some(rest) => remaining = rest,
+                None => return child.nth_leaf(remaining),
+            }
+        }
+        None
+    }
+
+    /// Like [Group::nth_leaf], but also collects the enclosing groups.
+    ///
+    /// `groups` holds the groups already descended through; this group is
+    /// appended to it before descending into the child that holds the leaf.
+    pub fn leaf_path<'a>(&'a self, index: usize, mut groups: GroupVec<'a>) -> Option<Path<'a>> {
+        let mut remaining = index;
+        for child in &self.children {
+            let Some(rest) = remaining.checked_sub(child.len()) else {
+                groups.push(self);
+                return child.leaf_path(remaining, groups);
+            };
+            remaining = rest;
+        }
+        None
+    }
+
+    /// Returns the number of leaves under this group.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns true if there are no leaves under this group.
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Returns the group's name.
+    pub fn name(&self) -> &Value {
+        &self.name
+    }
+}
+
+/// Appends every item of an iterator to the group, as if by [Group::push].
+impl<C> Extend<C> for Group
+where
+    C: Into<Category>,
+{
+    fn extend<T: IntoIterator<Item = C>>(&mut self, children: T) {
+        let iter = children.into_iter();
+        // Reserve for at least as many children as the iterator promises.
+        let (lower_bound, _) = iter.size_hint();
+        self.children.reserve(lower_bound);
+        iter.for_each(|child| self.push(child));
+    }
+}
+
+/// A list of shared footnotes.
+///
+/// [Footnotes::push] gives each footnote an index equal to its position in
+/// the list.
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct Footnotes(pub Vec<Arc<Footnote>>);
+
+impl Footnotes {
+    /// Creates an empty list of footnotes.
+    pub fn new() -> Self {
+        Self(Vec::new())
+    }
+
+    /// Appends `footnote`, assigning it the next index, and returns a shared
+    /// handle to the stored footnote.
+    pub fn push(&mut self, footnote: Footnote) -> Arc<Footnote> {
+        let index = self.0.len();
+        let shared = Arc::new(footnote.with_index(index));
+        self.0.push(Arc::clone(&shared));
+        shared
+    }
+
+    /// Returns true if there are no footnotes.
+    pub fn is_empty(&self) -> bool {
+        let Self(footnotes) = self;
+        footnotes.is_empty()
+    }
+}
+
+/// A leaf category in a dimension's category tree.
+#[derive(Clone, Debug)]
+pub struct Leaf {
+    /// The category's name.
+    name: Box<Value>,
+}
+
+impl Leaf {
+    /// Creates a leaf category named `name`.
+    pub fn new(name: Value) -> Self {
+        let name = Box::new(name);
+        Self { name }
+    }
+
+    /// Returns the leaf's name.
+    pub fn name(&self) -> &Value {
+        self.name.as_ref()
+    }
+}
+
+/// A leaf serializes as just its name, without a wrapping struct.
+impl Serialize for Leaf {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        self.name().serialize(serializer)
+    }
+}
+
+/// Pivot result classes.
+///
+/// A class marks a [Leaf] category as holding a particular kind of data, which
+/// in turn selects the numeric format used for its values.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum Class {
+    Other,
+    Integer,
+    Correlations,
+    Significance,
+    Percent,
+    Residual,
+    Count,
+}
+
+/// A node in a dimension's category tree: either a group of further
+/// categories or a single leaf category.
+#[derive(Clone, Debug, Serialize)]
+pub enum Category {
+    /// A group, which contains child categories.
+    Group(Group),
+
+    /// A leaf, which is an actual category.
+    Leaf(Leaf),
+}
+
+impl Category {
+    /// Returns the name of the group or leaf.
+    pub fn name(&self) -> &Value {
+        match self {
+            Self::Group(group) => group.name(),
+            Self::Leaf(leaf) => leaf.name(),
+        }
+    }
+
+    /// Returns true if this category contains no leaves.  A leaf is never
+    /// empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns the number of leaves in this category: 1 for a leaf, or the
+    /// group's leaf count for a group.
+    pub fn len(&self) -> usize {
+        match self {
+            Self::Leaf(_) => 1,
+            Self::Group(group) => group.len(),
+        }
+    }
+
+    /// Returns the `index`th leaf in this category, if there is one.  For a
+    /// leaf, only index 0 exists.
+    pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> {
+        match self {
+            Self::Group(group) => group.nth_leaf(index),
+            Self::Leaf(leaf) => (index == 0).then_some(leaf),
+        }
+    }
+
+    /// Returns the `index`th leaf in this category along with its enclosing
+    /// groups, if there is one.  `groups` holds the groups already descended
+    /// through.
+    pub fn leaf_path<'a>(&'a self, index: usize, groups: GroupVec<'a>) -> Option<Path<'a>> {
+        match self {
+            Self::Group(group) => group.leaf_path(index, groups),
+            Self::Leaf(leaf) => (index == 0).then(|| Path { groups, leaf }),
+        }
+    }
+
+    /// Returns whether this category's label should be shown.  Leaves always
+    /// show their labels; groups follow their `show_label` setting.
+    pub fn show_label(&self) -> bool {
+        match self {
+            Self::Leaf(_) => true,
+            Self::Group(group) => group.show_label,
+        }
+    }
+}
+
+/// Wraps a group as a category.
+impl From<Group> for Category {
+    fn from(group: Group) -> Self {
+        Category::Group(group)
+    }
+}
+
+/// Wraps a leaf as a category.
+impl From<Leaf> for Category {
+    fn from(leaf: Leaf) -> Self {
+        Category::Leaf(leaf)
+    }
+}
+
+/// Makes a leaf category named by `name`.
+impl From<Value> for Category {
+    fn from(name: Value) -> Self {
+        Category::Leaf(Leaf::new(name))
+    }
+}
+
+/// Makes a leaf category named by the value built from a variable.
+impl From<&Variable> for Category {
+    fn from(variable: &Variable) -> Self {
+        Self::from(Value::new_variable(variable))
+    }
+}
+
+/// Makes a leaf category named by the text `name`.
+impl From<&str> for Category {
+    fn from(name: &str) -> Self {
+        Self::from(Value::new_text(name))
+    }
+}
+
+/// Makes a leaf category named by the text `name`.
+impl From<String> for Category {
+    fn from(name: String) -> Self {
+        Self::from(Value::new_text(name))
+    }
+}
+
+/// Makes a leaf category named by the text `name`.
+impl From<&String> for Category {
+    fn from(name: &String) -> Self {
+        Self::from(Value::new_text(name))
+    }
+}
+
+/// Styling for a pivot table.
+///
+/// Which style settings live here and which live in [PivotTable] is fairly
+/// arbitrary.  The split exists simply because SPSS documentation and file
+/// formats divide them this way.
+#[derive(Clone, Debug, Serialize)]
+pub struct Look {
+    /// Name of the look, if it has one.
+    pub name: Option<String>,
+
+    /// Whether to hide rows or columns whose cells are all empty.
+    pub hide_empty: bool,
+
+    /// Where to put row group labels.
+    pub row_label_position: LabelPosition,
+
+    /// Ranges of column widths in the two heading regions, in 1/96" units.
+    pub heading_widths: EnumMap<HeadingRegion, RangeInclusive<usize>>,
+
+    /// Kind of markers to use for footnotes.
+    pub footnote_marker_type: FootnoteMarkerType,
+
+    /// Where to put the footnote markers.
+    pub footnote_marker_position: FootnoteMarkerPosition,
+
+    /// Styles for areas of the pivot table.
+    pub areas: EnumMap<Area, AreaStyle>,
+
+    /// Styles for borders in the pivot table.
+    pub borders: EnumMap<Border, BorderStyle>,
+
+    /// Whether printing visits every layer rather than only the current one
+    /// (see [PivotTable::layers]).
+    pub print_all_layers: bool,
+
+    pub paginate_layers: bool,
+
+    pub shrink_to_fit: EnumMap<Axis2, bool>,
+
+    pub top_continuation: bool,
+
+    pub bottom_continuation: bool,
+
+    pub continuation: Option<String>,
+
+    pub n_orphan_lines: usize,
+}
+
+impl Look {
+    /// Returns this look with `hide_empty` set to `omit_empty`.
+    pub fn with_omit_empty(self, omit_empty: bool) -> Self {
+        Self {
+            hide_empty: omit_empty,
+            ..self
+        }
+    }
+
+    /// Returns this look with the given row label position.
+    pub fn with_row_label_position(self, row_label_position: LabelPosition) -> Self {
+        Self {
+            row_label_position,
+            ..self
+        }
+    }
+
+    /// Returns this look with its border styles replaced by `borders`.
+    pub fn with_borders(self, borders: EnumMap<Border, BorderStyle>) -> Self {
+        Self { borders, ..self }
+    }
+}
+
+/// The built-in look: unnamed, hiding empty rows and columns, with default
+/// label and footnote-marker settings and the default area and border styles.
+impl Default for Look {
+    fn default() -> Self {
+        // Allowed heading widths, in 1/96" units.
+        let heading_widths = EnumMap::from_fn(|region| match region {
+            HeadingRegion::Columns => 36..=120,
+            HeadingRegion::Rows => 36..=72,
+        });
+        Self {
+            name: None,
+            hide_empty: true,
+            row_label_position: Default::default(),
+            heading_widths,
+            footnote_marker_type: Default::default(),
+            footnote_marker_position: Default::default(),
+            areas: EnumMap::from_fn(Area::default_area_style),
+            borders: EnumMap::from_fn(Border::default_border_style),
+            print_all_layers: false,
+            paginate_layers: false,
+            shrink_to_fit: EnumMap::from_fn(|_axis| false),
+            top_continuation: false,
+            bottom_continuation: false,
+            continuation: None,
+            n_orphan_lines: 0,
+        }
+    }
+}
+
+#[derive(ThisError, Debug)]
+pub enum ParseLookError {
+ #[error(transparent)]
+ XmlError(#[from] DeError),
+
+ #[error(transparent)]
+ Utf8Error(#[from] Utf8Error),
+
+ #[error(transparent)]
+ BinError(#[from] BinError),
+
+ #[error(transparent)]
+ IoError(#[from] std::io::Error),
+}
+
+impl Look {
+    /// Returns a shared handle to a process-wide instance of the default
+    /// look, creating it on first use.
+    pub fn shared_default() -> Arc<Look> {
+        static LOOK: OnceLock<Arc<Look>> = OnceLock::new();
+        let shared = LOOK.get_or_init(|| Arc::new(Look::default()));
+        Arc::clone(shared)
+    }
+
+    /// Parses a look from its XML form.
+    pub fn from_xml(xml: &str) -> Result<Self, ParseLookError> {
+        let properties: TableProperties = from_str(xml)?;
+        Ok(properties.into())
+    }
+
+    /// Parses a look from its binary form.
+    pub fn from_binary(tlo: &[u8]) -> Result<Self, ParseLookError> {
+        Ok(parse_tlo(tlo)?)
+    }
+
+    /// Parses a look from `data`, choosing the format by inspection.
+    ///
+    /// Data that starts with the bytes `ff ff 00 00` is parsed as the binary
+    /// form.  Anything else must be UTF-8 and is parsed as XML.
+    pub fn from_data(data: &[u8]) -> Result<Self, ParseLookError> {
+        let is_binary = data.starts_with(b"\xff\xff\0\0");
+        if !is_binary {
+            let xml = from_utf8(data)?;
+            return Self::from_xml(xml);
+        }
+        Self::from_binary(data)
+    }
+
+    /// Reads all of `reader` and parses the result with [Look::from_data].
+    pub fn from_reader<R>(mut reader: R) -> Result<Self, ParseLookError>
+    where
+        R: Read,
+    {
+        let mut data = Vec::new();
+        reader.read_to_end(&mut data)?;
+        Self::from_data(&data)
+    }
+}
+
+/// Position for group labels.
+#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
+pub enum LabelPosition {
+    /// Hierarchically enclosing the categories.
+    ///
+    /// For column labels, group labels appear above the categories.  For row
+    /// labels, group labels appear to the left of the categories.
+    ///
+    /// ```text
+    /// ┌────┬──────────────┐   ┌─────────┬──────────┐
+    /// │    │    nested    │   │         │ columns  │
+    /// │    ├────┬────┬────┤   ├──────┬──┼──────────┤
+    /// │    │ a1 │ a2 │ a3 │   │      │a1│...data...│
+    /// ├────┼────┼────┼────┤   │nested│a2│...data...│
+    /// │    │data│data│data│   │      │a3│...data...│
+    /// │    │ .  │ .  │ .  │   └──────┴──┴──────────┘
+    /// │rows│ .  │ .  │ .  │
+    /// │    │ .  │ .  │ .  │
+    /// └────┴────┴────┴────┘
+    /// ```
+    #[serde(rename = "nested")]
+    Nested,
+
+    /// In the corner (row labels only).
+    ///
+    /// ```text
+    /// ┌──────┬──────────┐
+    /// │corner│ columns  │
+    /// ├──────┼──────────┤
+    /// │    a1│...data...│
+    /// │    a2│...data...│
+    /// │    a3│...data...│
+    /// └──────┴──────────┘
+    /// ```
+    #[default]
+    #[serde(rename = "inCorner")]
+    Corner,
+}
+
+/// One of the two heading regions of a rendered pivot table:
+///
+/// ```text
+/// ┌──────────────────┬─────────────────────────────────────────────────┐
+/// │                  │                 column headings                 │
+/// │                  ├─────────────────────────────────────────────────┤
+/// │      corner      │                                                 │
+/// │       and        │                                                 │
+/// │   row headings   │                      data                       │
+/// │                  │                                                 │
+/// │                  │                                                 │
+/// └──────────────────┴─────────────────────────────────────────────────┘
+/// ```
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum HeadingRegion {
+    /// The corner and row headings.
+    Rows,
+
+    /// The column headings.
+    Columns,
+}
+
+impl HeadingRegion {
+    /// Returns the lowercase name of the region, `"rows"` or `"columns"`.
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Self::Columns => "columns",
+            Self::Rows => "rows",
+        }
+    }
+}
+
+/// Formats the region as its [HeadingRegion::as_str] name.
+impl Display for HeadingRegion {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+/// Maps the X axis to the column headings and the Y axis to the row headings.
+impl From<Axis2> for HeadingRegion {
+    fn from(axis: Axis2) -> Self {
+        match axis {
+            Axis2::Y => Self::Rows,
+            Axis2::X => Self::Columns,
+        }
+    }
+}
+
+/// The style of one area of a pivot table: cell layout plus font.
+#[derive(Clone, Debug, Serialize)]
+pub struct AreaStyle {
+    /// Alignment and margins.
+    pub cell_style: CellStyle,
+
+    /// Font and colors.
+    pub font_style: FontStyle,
+}
+
+/// Alignment and margins for a cell.
+#[derive(Clone, Debug, Serialize)]
+pub struct CellStyle {
+    /// Horizontal alignment.  `None` means "mixed" alignment: strings are
+    /// aligned to the left and numbers to the right.
+    pub horz_align: Option<HorzAlign>,
+
+    /// Vertical alignment.
+    pub vert_align: VertAlign,
+
+    /// Margins in 1/96" units.
+    ///
+    /// - `margins[Axis2::X][0]` is the left margin.
+    /// - `margins[Axis2::X][1]` is the right margin.
+    /// - `margins[Axis2::Y][0]` is the top margin.
+    /// - `margins[Axis2::Y][1]` is the bottom margin.
+    pub margins: EnumMap<Axis2, [i32; 2]>,
+}
+
+/// Horizontal alignment of cell content.
+#[derive(Copy, Clone, Debug, PartialEq, Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum HorzAlign {
+    /// Aligned to the right.
+    Right,
+
+    /// Aligned to the left.
+    Left,
+
+    /// Centered.
+    Center,
+
+    /// Aligned so that the decimal point falls at the specified position.
+    Decimal {
+        /// Offset of the decimal point from the right side of the cell, in
+        /// 1/96" units.
+        offset: f64,
+
+        /// The decimal character.
+        decimal: Decimal,
+    },
+}
+
+impl HorzAlign {
+    /// Returns the alignment used for "mixed" alignment: left for strings,
+    /// right for numbers.
+    pub fn for_mixed(var_type: VarType) -> Self {
+        match var_type {
+            VarType::String => Self::Left,
+            VarType::Numeric => Self::Right,
+        }
+    }
+}
+
+/// Vertical alignment of cell content.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum VertAlign {
+    /// Aligned to the top.
+    Top,
+
+    /// Centered.
+    Middle,
+
+    /// Aligned to the bottom.
+    Bottom,
+}
+
+/// Font, emphasis, and colors for text in an area.
+#[derive(Clone, Debug, Serialize)]
+pub struct FontStyle {
+    pub bold: bool,
+    pub italic: bool,
+    pub underline: bool,
+    pub markup: bool,
+
+    /// Font name.
+    pub font: String,
+
+    /// Foreground colors.
+    ///
+    /// `fg[0]` is the usual foreground color.  `fg[1]` is used only in
+    /// [Area::Data], for odd-numbered rows.
+    pub fg: [Color; 2],
+
+    /// Background colors.
+    ///
+    /// `bg[0]` is the usual background color.  `bg[1]` is used only in
+    /// [Area::Data], for odd-numbered rows.
+    pub bg: [Color; 2],
+
+    /// Font size, in 1/72" units.
+    pub size: i32,
+}
+
+/// A color with 8-bit red, green, blue, and alpha components.
+///
+/// An `alpha` of 255 is fully opaque and 0 is fully transparent.
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub struct Color {
+    pub alpha: u8,
+    pub r: u8,
+    pub g: u8,
+    pub b: u8,
+}
+
+impl Color {
+    /// Opaque black.
+    pub const BLACK: Color = Color::new(0, 0, 0);
+    /// Opaque white.
+    pub const WHITE: Color = Color::new(255, 255, 255);
+    /// Opaque red.
+    pub const RED: Color = Color::new(255, 0, 0);
+    /// Opaque blue.
+    pub const BLUE: Color = Color::new(0, 0, 255);
+    /// Fully transparent black.
+    pub const TRANSPARENT: Color = Color::BLACK.with_alpha(0);
+
+    /// Creates a fully opaque color from its red, green, and blue
+    /// components.
+    pub const fn new(r: u8, g: u8, b: u8) -> Self {
+        Self {
+            alpha: u8::MAX,
+            r,
+            g,
+            b,
+        }
+    }
+
+    /// Returns this color with its alpha component replaced by `alpha`.
+    pub const fn with_alpha(self, alpha: u8) -> Self {
+        Self {
+            alpha,
+            r: self.r,
+            g: self.g,
+            b: self.b,
+        }
+    }
+
+    /// Returns this color made fully opaque.
+    pub const fn without_alpha(self) -> Self {
+        self.with_alpha(u8::MAX)
+    }
+
+    /// Returns a wrapper that displays this color in CSS syntax.
+    pub fn display_css(&self) -> DisplayCss {
+        let color = *self;
+        DisplayCss(color)
+    }
+}
+
+/// Debug output for a color is its CSS form, the same as
+/// [Color::display_css].
+impl Debug for Color {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let css = self.display_css();
+        write!(f, "{}", css)
+    }
+}
+
+/// Converts component-wise from an 8-bit RGBA color.
+impl From<Rgba8> for Color {
+    fn from(rgba: Rgba8) -> Self {
+        let Rgba8 { r, g, b, a } = rgba;
+        Self { alpha: a, r, g, b }
+    }
+}
+
+/// Parses a color from a string.
+///
+/// The string is first parsed with the color parser for `AlphaColor<Srgb>`.
+/// If that reports unknown syntax, two extra forms are accepted, in both
+/// cases ignoring surrounding whitespace:
+///
+/// - Six bare hexadecimal digits `rrggbb`, treated as `#rrggbb`.
+///
+/// - The word `transparent`, in any case.
+///
+/// Any other parse error is returned unchanged.
+impl FromStr for Color {
+    type Err = ParseColorError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        /// Returns true if `s` consists of exactly six hexadecimal digits.
+        fn is_bare_hex(s: &str) -> bool {
+            s.len() == 6 && s.bytes().all(|b| b.is_ascii_hexdigit())
+        }
+
+        // The fallbacks work on the trimmed text.  In particular, `#` must be
+        // prepended to the trimmed digits: with surrounding whitespace left
+        // in place, the retry would see `# rrggbb` and fail.
+        let trimmed = s.trim();
+        let color: AlphaColor<Srgb> = match s.parse() {
+            Err(ParseColorError::UnknownColorSyntax) if is_bare_hex(trimmed) => {
+                format!("#{trimmed}").parse()
+            }
+            Err(ParseColorError::UnknownColorSyntax)
+                if trimmed.eq_ignore_ascii_case("transparent") =>
+            {
+                Ok(TRANSPARENT)
+            }
+            other => other,
+        }?;
+        Ok(color.to_rgba8().into())
+    }
+}
+
+/// A color serializes as a string in CSS syntax (see [Color::display_css]).
+impl Serialize for Color {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        // Format into a small inline buffer rather than a heap `String`.
+        let css = self.display_css().to_small_string::<32>();
+        serializer.serialize_str(&css)
+    }
+}
+
+/// A color deserializes from a string, using the same syntax as its
+/// `FromStr` implementation.
+impl<'de> Deserialize<'de> for Color {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct ColorVisitor;
+
+        impl<'de> Visitor<'de> for ColorVisitor {
+            type Value = Color;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str("\"#rrggbb\" or \"rrggbb\" or web color name")
+            }
+
+            // Implement `visit_str`, not `visit_borrowed_str`: parsing needs
+            // no borrow from the input, and the default `visit_borrowed_str`
+            // and `visit_string` both forward here.  A visitor with only
+            // `visit_borrowed_str` rejects deserializers that supply a
+            // transient or owned string.
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                v.parse().map_err(E::custom)
+            }
+        }
+
+        deserializer.deserialize_str(ColorVisitor)
+    }
+}
+
+/// Wrapper that displays a [Color] in CSS syntax.
+///
+/// A fully opaque color is written as `#rrggbb`.  Any other color is written
+/// as `rgb(r, g, b, a)` with `a` in the range 0 to 1, to two decimal places.
+pub struct DisplayCss(Color);
+
+impl Display for DisplayCss {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self(Color { alpha, r, g, b }) = *self;
+        if alpha == 255 {
+            return write!(f, "#{r:02x}{g:02x}{b:02x}");
+        }
+        let opacity = alpha as f64 / 255.0;
+        write!(f, "rgb({r}, {g}, {b}, {:.2})", opacity)
+    }
+}
+
+/// The style of a border: its stroke and color.
+///
+/// When deserialized, the two fields come from the `borderStyleType` and
+/// `color` attributes.
+#[derive(Copy, Clone, Debug, Deserialize)]
+pub struct BorderStyle {
+    /// Kind of line.
+    #[serde(rename = "@borderStyleType")]
+    pub stroke: Stroke,
+
+    /// Color of the line.
+    #[serde(rename = "@color")]
+    pub color: Color,
+}
+
+/// Serializes as a struct with plain `stroke` and `color` fields, rather than
+/// under the attribute names used for deserialization.
+impl Serialize for BorderStyle {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let Self { stroke, color } = self;
+        let mut state = serializer.serialize_struct("BorderStyle", 2)?;
+        state.serialize_field("stroke", stroke)?;
+        state.serialize_field("color", color)?;
+        state.end()
+    }
+}
+
+impl BorderStyle {
+    /// Returns the style for the absence of a border: no stroke, in black.
+    pub const fn none() -> Self {
+        Self {
+            stroke: Stroke::None,
+            color: Color::BLACK,
+        }
+    }
+
+    /// Returns true if this style has no stroke.
+    pub fn is_none(&self) -> bool {
+        matches!(self.stroke, Stroke::None)
+    }
+
+    /// Returns a border style that "combines" the two arguments, that is, a
+    /// reasonable style for a rule that, for different reasons, should have
+    /// both styles.
+    ///
+    /// The strokes are combined with [Stroke::combine].  The color is always
+    /// taken from `self`.
+    pub fn combine(self, other: BorderStyle) -> Self {
+        let stroke = self.stroke.combine(other.stroke);
+        Self {
+            stroke,
+            color: self.color,
+        }
+    }
+}
+
+/// The kind of line used to draw a border.
+///
+/// The order of the variants is significant: [Stroke::combine] picks the
+/// variant that is declared later.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Enum, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub enum Stroke {
+    None,
+    Solid,
+    Dashed,
+    Thick,
+    Thin,
+    Double,
+}
+
+impl Stroke {
+    /// Returns true if this is [Stroke::None].
+    pub fn is_none(&self) -> bool {
+        matches!(self, Self::None)
+    }
+
+    /// Returns a stroke that "combines" the two arguments, that is, a
+    /// reasonable stroke for a rule that, for different reasons, should have
+    /// both styles.
+    ///
+    /// This is the greater of the two in declaration order.
+    pub fn combine(self, other: Stroke) -> Self {
+        Ord::max(self, other)
+    }
+}
+
+/// One of the two axes of a 2-dimensional table.
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Axis2 {
+    /// The horizontal axis.
+    X,
+
+    /// The vertical axis.
+    Y,
+}
+
+impl Axis2 {
+    /// Returns a map that holds `x` for [Axis2::X] and `y` for [Axis2::Y].
+    pub fn new_enum<T>(x: T, y: T) -> EnumMap<Axis2, T> {
+        // The array is in variant declaration order: X, then Y.
+        let values = [x, y];
+        EnumMap::from_array(values)
+    }
+
+    /// Returns the lowercase name of the axis, `"x"` or `"y"`.
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Self::Y => "y",
+            Self::X => "x",
+        }
+    }
+}
+
+/// Formats the axis as its [Axis2::as_str] name.
+impl Display for Axis2 {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+/// `!axis` is the other axis.
+impl Not for Axis2 {
+    type Output = Self;
+
+    fn not(self) -> Self::Output {
+        match self {
+            Axis2::Y => Axis2::X,
+            Axis2::X => Axis2::Y,
+        }
+    }
+}
+
+/// A 2-dimensional `(x,y)` pair, stored as one coordinate per [Axis2].
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)]
+pub struct Coord2(pub EnumMap<Axis2, usize>);
+
+impl Coord2 {
+    /// Creates the coordinate `(x, y)`.
+    pub fn new(x: usize, y: usize) -> Self {
+        Self(Axis2::new_enum(x, y))
+    }
+
+    /// Creates a coordinate whose position along axis `a` is `az` and whose
+    /// position along the other axis is `bz`.
+    pub fn for_axis((a, az): (Axis2, usize), bz: usize) -> Self {
+        Self::from_fn(|axis| if axis == a { az } else { bz })
+    }
+
+    /// Creates a coordinate by calling `f` for each axis.
+    pub fn from_fn<F>(f: F) -> Self
+    where
+        F: FnMut(Axis2) -> usize,
+    {
+        Self(EnumMap::from_fn(f))
+    }
+
+    /// Returns the X coordinate.
+    pub fn x(&self) -> usize {
+        self.get(Axis2::X)
+    }
+
+    /// Returns the Y coordinate.
+    pub fn y(&self) -> usize {
+        self.get(Axis2::Y)
+    }
+
+    /// Returns the coordinate along `axis`.
+    pub fn get(&self, axis: Axis2) -> usize {
+        let Self(coords) = self;
+        coords[axis]
+    }
+}
+
+/// Wraps a per-axis map of coordinates.
+impl From<EnumMap<Axis2, usize>> for Coord2 {
+    fn from(value: EnumMap<Axis2, usize>) -> Self {
+        Coord2(value)
+    }
+}
+
+/// `coord[axis]` is the coordinate along `axis`.
+impl Index<Axis2> for Coord2 {
+    type Output = usize;
+
+    fn index(&self, index: Axis2) -> &Self::Output {
+        let Self(coords) = self;
+        &coords[index]
+    }
+}
+
+/// `coord[axis] = value` sets the coordinate along `axis`.
+impl IndexMut<Axis2> for Coord2 {
+    fn index_mut(&mut self, index: Axis2) -> &mut Self::Output {
+        let Self(coords) = self;
+        &mut coords[index]
+    }
+}
+
+/// A 2-dimensional rectangle, stored as one half-open range per [Axis2].
+#[derive(Clone, Debug, Default)]
+pub struct Rect2(pub EnumMap<Axis2, Range<usize>>);
+
+impl Rect2 {
+    /// Creates a rectangle that spans `x_range` horizontally and `y_range`
+    /// vertically.
+    pub fn new(x_range: Range<usize>, y_range: Range<usize>) -> Self {
+        Self(Axis2::new_enum(x_range, y_range))
+    }
+
+    /// Creates the 1×1 rectangle that covers just `cell`.
+    pub fn for_cell(cell: Coord2) -> Self {
+        let (x, y) = (cell.x(), cell.y());
+        Self::new(x..x + 1, y..y + 1)
+    }
+
+    /// Creates a rectangle that spans `a_range` along axis `a` and `b_range`
+    /// along the other axis.
+    pub fn for_ranges((a, a_range): (Axis2, Range<usize>), b_range: Range<usize>) -> Self {
+        let mut rect = Self::default();
+        rect[a] = a_range;
+        rect[!a] = b_range;
+        rect
+    }
+
+    /// Returns the coordinate at the start of both ranges.
+    pub fn top_left(&self) -> Coord2 {
+        Coord2::new(self[Axis2::X].start, self[Axis2::Y].start)
+    }
+
+    /// Creates a rectangle by calling `f` for each axis.
+    pub fn from_fn<F>(f: F) -> Self
+    where
+        F: FnMut(Axis2) -> Range<usize>,
+    {
+        Self(EnumMap::from_fn(f))
+    }
+
+    /// Returns this rectangle moved by `offset` along each axis.
+    pub fn translate(self, offset: Coord2) -> Rect2 {
+        Self::from_fn(|axis| {
+            let delta = offset[axis];
+            let range = &self[axis];
+            range.start + delta..range.end + delta
+        })
+    }
+
+    /// Returns true if the range along either axis is empty.
+    pub fn is_empty(&self) -> bool {
+        let x_empty = self[Axis2::X].is_empty();
+        x_empty || self[Axis2::Y].is_empty()
+    }
+}
+
+/// Wraps a per-axis map of ranges.
+impl From<EnumMap<Axis2, Range<usize>>> for Rect2 {
+    fn from(value: EnumMap<Axis2, Range<usize>>) -> Self {
+        Rect2(value)
+    }
+}
+
+/// `rect[axis]` is the range that the rectangle spans along `axis`.
+impl Index<Axis2> for Rect2 {
+    type Output = Range<usize>;
+
+    fn index(&self, index: Axis2) -> &Self::Output {
+        let Self(ranges) = self;
+        &ranges[index]
+    }
+}
+
+/// `rect[axis] = range` sets the range along `axis`.
+impl IndexMut<Axis2> for Rect2 {
+    fn index_mut(&mut self, index: Axis2) -> &mut Self::Output {
+        let Self(ranges) = self;
+        &mut ranges[index]
+    }
+}
+
+/// The kind of marker used to refer to footnotes.
+#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub enum FootnoteMarkerType {
+    /// Letters: a, b, c, ...
+    #[default]
+    Alphabetic,
+
+    /// Numbers: 1, 2, 3, ...
+    Numeric,
+}
+
+/// Where footnote markers are placed relative to the text.
+#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub enum FootnoteMarkerPosition {
+    /// As subscripts.
+    #[default]
+    Subscript,
+
+    /// As superscripts.
+    Superscript,
+}
+
+/// Settings that affect how a value is displayed.
+///
+/// [PivotTable::value_options] fills these in from the corresponding
+/// settings of a table and its look.
+#[derive(Copy, Clone, Debug)]
+pub struct ValueOptions {
+    /// Corresponds to [PivotTable::show_values].
+    pub show_values: Option<Show>,
+
+    /// Corresponds to [PivotTable::show_variables].
+    pub show_variables: Option<Show>,
+
+    /// Corresponds to [PivotTable::small].
+    pub small: f64,
+
+    /// Kind of markers to use for footnotes.
+    pub footnote_marker_type: FootnoteMarkerType,
+}
+
+/// The defaults: no `show_values` or `show_variables` setting, a `small`
+/// threshold of 0.0001, and the default footnote marker type.
+impl Default for ValueOptions {
+    fn default() -> Self {
+        let footnote_marker_type = FootnoteMarkerType::default();
+        Self {
+            show_values: None,
+            show_variables: None,
+            small: 0.0001,
+            footnote_marker_type,
+        }
+    }
+}
+
+/// A type that can supply [ValueOptions].
+///
+/// This lets display functions accept either explicit options or something
+/// that carries them.
+pub trait IntoValueOptions {
+    /// Returns the value options that `self` stands for.
+    fn into_value_options(self) -> ValueOptions;
+}
+
+/// `()` stands for the default options.
+impl IntoValueOptions for () {
+    fn into_value_options(self) -> ValueOptions {
+        Default::default()
+    }
+}
+
+/// A pivot table stands for the options from [PivotTable::value_options].
+impl IntoValueOptions for &PivotTable {
+    fn into_value_options(self) -> ValueOptions {
+        PivotTable::value_options(self)
+    }
+}
+
+/// A reference to options stands for a copy of them.
+impl IntoValueOptions for &ValueOptions {
+    fn into_value_options(self) -> ValueOptions {
+        let options: ValueOptions = *self;
+        options
+    }
+}
+
+/// Options stand for themselves.
+impl IntoValueOptions for ValueOptions {
+    fn into_value_options(self) -> ValueOptions {
+        let options = self;
+        options
+    }
+}
+
+/// A pivot table: a set of dimensions arranged on three axes, plus cell
+/// values, footnotes, titles, and display settings.
+#[derive(Clone, Debug, Serialize)]
+pub struct PivotTable {
+    /// Styling.  Shared, so that changing it through
+    /// [PivotTable::look_mut] copies it first if necessary.
+    pub look: Arc<Look>,
+
+    pub rotate_inner_column_labels: bool,
+
+    pub rotate_outer_row_labels: bool,
+
+    pub show_grid_lines: bool,
+
+    /// Whether to show the title.
+    pub show_title: bool,
+
+    /// Whether to show the caption.
+    pub show_caption: bool,
+
+    pub show_values: Option<Show>,
+
+    pub show_variables: Option<Show>,
+
+    pub weight_format: Format,
+
+    /// Current layer indexes, with `axes[Axis3::Z].dimensions.len()` elements.
+    /// `current_layer[i]` is an offset into
+    /// `axes[Axis3::Z].dimensions[i].data_leaves[]`, except that a dimension
+    /// can have zero leaves, in which case `current_layer[i]` is zero and
+    /// there's no corresponding leaf.
+    pub current_layer: Vec<usize>,
+
+    /// Column and row sizing and page breaks.
+    pub sizing: EnumMap<Axis2, Option<Box<Sizing>>>,
+
+    /// Format settings.
+    pub settings: FormatSettings,
+
+    /// Numeric grouping character (usually `.` or `,`).
+    pub grouping: Option<char>,
+
+    pub small: f64,
+
+    pub command_local: Option<String>,
+    pub command_c: Option<String>,
+    pub language: Option<String>,
+    pub locale: Option<String>,
+    pub dataset: Option<String>,
+    pub datafile: Option<String>,
+    pub date: Option<NaiveDateTime>,
+
+    /// The table's footnotes.
+    pub footnotes: Footnotes,
+
+    /// The title, if any.
+    pub title: Option<Box<Value>>,
+
+    /// The subtype, if any.
+    pub subtype: Option<Box<Value>>,
+
+    /// Text for the corner, if any.
+    pub corner_text: Option<Box<Value>>,
+
+    /// The caption, if any.
+    pub caption: Option<Box<Value>>,
+
+    pub notes: Option<String>,
+
+    /// All of the table's dimensions.  `axes` refers to them by their
+    /// indexes here.
+    pub dimensions: Vec<Dimension>,
+
+    /// For each axis, the dimensions placed on it.
+    pub axes: EnumMap<Axis3, Axis>,
+
+    /// The cell values, keyed by a single index computed from the data
+    /// indexes of the cell within each dimension.
+    pub cells: HashMap<usize, Value>,
+}
+
+impl PivotTable {
+    /// Returns this table with `title` as its title, and with the title
+    /// shown.
+    pub fn with_title(self, title: impl Into<Value>) -> Self {
+        Self {
+            title: Some(Box::new(title.into())),
+            show_title: true,
+            ..self
+        }
+    }
+
+    /// Returns this table with `caption` as its caption, and with the
+    /// caption shown.
+    pub fn with_caption(self, caption: impl Into<Value>) -> Self {
+        Self {
+            caption: Some(Box::new(caption.into())),
+            show_caption: true,
+            ..self
+        }
+    }
+
+    /// Returns this table with `corner_text` as its corner text.
+    pub fn with_corner_text(self, corner_text: impl Into<Value>) -> Self {
+        Self {
+            corner_text: Some(Box::new(corner_text.into())),
+            ..self
+        }
+    }
+
+    /// Returns this table with `subtype` as its subtype.
+    pub fn with_subtype(mut self, subtype: impl Into<Value>) -> Self {
+        self.subtype = Some(Box::new(subtype.into()));
+        self
+    }
+
+    /// Returns this table with title display turned on or off.
+    pub fn with_show_title(self, show_title: bool) -> Self {
+        Self { show_title, ..self }
+    }
+
+    /// Returns this table with caption display turned on or off.
+    pub fn with_show_caption(self, show_caption: bool) -> Self {
+        Self {
+            show_caption,
+            ..self
+        }
+    }
+
+    /// Returns this table with `layer` as its current layer, and with
+    /// printing of all layers turned off.
+    ///
+    /// `layer` should have one element per dimension on the layer axis;
+    /// debug builds assert this.
+    pub fn with_layer(mut self, layer: &[usize]) -> Self {
+        debug_assert_eq!(layer.len(), self.current_layer.len());
+        // Only touch the look if it has to change, because `look_mut` may
+        // need to copy it.
+        let prints_all = self.look.print_all_layers;
+        if prints_all {
+            self.look_mut().print_all_layers = false;
+        }
+        self.current_layer.clear();
+        self.current_layer.extend(layer.iter().copied());
+        self
+    }
+
+    /// Returns this table with printing of all layers turned on.
+    pub fn with_all_layers(mut self) -> Self {
+        let prints_all = self.look.print_all_layers;
+        if !prints_all {
+            self.look_mut().print_all_layers = true;
+        }
+        self
+    }
+
+    /// Returns a mutable reference to the table's look, first making a
+    /// private copy of it if it is shared.
+    pub fn look_mut(&mut self) -> &mut Look {
+        let look = &mut self.look;
+        Arc::make_mut(look)
+    }
+
+    /// Returns this table with empty rows and columns shown.
+    pub fn with_show_empty(mut self) -> Self {
+        let hides_empty = self.look.hide_empty;
+        if hides_empty {
+            self.look_mut().hide_empty = false;
+        }
+        self
+    }
+
+    /// Returns this table with empty rows and columns hidden.
+    pub fn with_hide_empty(mut self) -> Self {
+        let hides_empty = self.look.hide_empty;
+        if !hides_empty {
+            self.look_mut().hide_empty = true;
+        }
+        self
+    }
+
+    /// Returns a label for the table: its title as displayed text, or
+    /// `Table` if it has no title.
+    pub fn label(&self) -> String {
+        self.title.as_ref().map_or_else(
+            || String::from("Table"),
+            |title| title.display(self).to_string(),
+        )
+    }
+
+    /// Returns the table's title, or an empty value if it has none.
+    pub fn title(&self) -> &Value {
+        static EMPTY: Value = Value::empty();
+        self.title.as_deref().unwrap_or(&EMPTY)
+    }
+
+    /// Returns the table's subtype, or an empty value if it has none.
+    pub fn subtype(&self) -> &Value {
+        static EMPTY: Value = Value::empty();
+        self.subtype.as_deref().unwrap_or(&EMPTY)
+    }
+}
+
+/// An empty table with no dimensions or cells, using the shared default
+/// look, with the title and caption both set to be shown.
+impl Default for PivotTable {
+    fn default() -> Self {
+        Self {
+            look: Look::shared_default(),
+            rotate_inner_column_labels: false,
+            rotate_outer_row_labels: false,
+            show_grid_lines: false,
+            show_title: true,
+            show_caption: true,
+            show_values: None,
+            show_variables: None,
+            weight_format: Format::F40,
+            current_layer: Vec::new(),
+            sizing: EnumMap::default(),
+            // XXX should come from settings.
+            settings: FormatSettings::default(),
+            grouping: None,
+            // XXX should come from settings.
+            small: 0.0001,
+            command_local: None,
+            // XXX should come from the current command name.
+            command_c: None,
+            language: None,
+            locale: None,
+            dataset: None,
+            datafile: None,
+            date: None,
+            footnotes: Footnotes::new(),
+            title: None,
+            subtype: None,
+            corner_text: None,
+            caption: None,
+            notes: None,
+            dimensions: Vec::new(),
+            axes: EnumMap::default(),
+            cells: HashMap::new(),
+        }
+    }
+}
+
+/// Flattens per-dimension `data_indexes` into a single cell index.
+///
+/// `dimensions` yields the length of each dimension, in the same order as
+/// `data_indexes`.  The result treats the indexes as the digits of a
+/// mixed-radix number, with the first dimension as the most significant
+/// digit.
+///
+/// Debug builds assert that there is one index per dimension and that each
+/// index is less than its dimension's length.
+fn cell_index<I>(data_indexes: &[usize], dimensions: I) -> usize
+where
+    I: ExactSizeIterator<Item = usize>,
+{
+    debug_assert_eq!(data_indexes.len(), dimensions.len());
+    dimensions
+        .zip(data_indexes)
+        .fold(0, |index, (dimension, &data_index)| {
+            debug_assert!(data_index < dimension);
+            dimension * index + data_index
+        })
+}
+
+impl PivotTable {
+    /// Creates a table from dimensions, each paired with the axis to put it
+    /// on.
+    ///
+    /// The dimensions are numbered in the order given.  The table starts
+    /// with the look from the global settings, the first layer as current
+    /// layer, and defaults for everything else.
+    pub fn new(axes_and_dimensions: impl IntoIterator<Item = (Axis3, Dimension)>) -> Self {
+        let mut axes = EnumMap::<Axis3, Axis>::default();
+        let dimensions: Vec<Dimension> = axes_and_dimensions
+            .into_iter()
+            .enumerate()
+            .map(|(dim_index, (axis, dimension))| {
+                axes[axis].dimensions.push(dim_index);
+                dimension
+            })
+            .collect();
+        let n_layer_dimensions = axes[Axis3::Z].dimensions.len();
+        Self {
+            look: Settings::global().look.clone(),
+            current_layer: vec![0; n_layer_dimensions],
+            axes,
+            dimensions,
+            ..Self::default()
+        }
+    }
+
+    /// Returns the key in `cells` for the cell at `data_indexes`, which
+    /// holds one data index per dimension.
+    fn cell_index(&self, data_indexes: &[usize]) -> usize {
+        let lengths = self.dimensions.iter().map(Dimension::len);
+        cell_index(data_indexes, lengths)
+    }
+
+    /// Sets the cell at `data_indexes` to `value`, replacing any value that
+    /// was already there.
+    pub fn insert(&mut self, data_indexes: &[usize], value: impl Into<Value>) {
+        let key = self.cell_index(data_indexes);
+        self.cells.insert(key, value.into());
+    }
+
+    /// Returns the value of the cell at `data_indexes`, if it has one.
+    pub fn get(&self, data_indexes: &[usize]) -> Option<&Value> {
+        let key = self.cell_index(data_indexes);
+        self.cells.get(&key)
+    }
+
+    /// Returns this table with each `(data_indexes, value)` pair from `iter`
+    /// inserted.
+    pub fn with_data<I>(mut self, iter: impl IntoIterator<Item = (I, Value)>) -> Self
+    where
+        I: AsRef<[usize]>,
+    {
+        self.extend(iter);
+        self
+    }
+
+    /// Converts per-axis presentation-order indexes in `presentation_indexes`,
+    /// into data indexes for each dimension.
+    fn convert_indexes_ptod(
+        &self,
+        presentation_indexes: EnumMap<Axis3, &[usize]>,
+    ) -> SmallVec<[usize; 4]> {
+        let mut data_indexes = SmallVec::from_elem(0, self.dimensions.len());
+        for (axis, axis_pindexes) in presentation_indexes {
+            let axis_dimensions = self.axes[axis].dimensions.iter();
+            for (&dim_index, &pindex) in axis_dimensions.zip(axis_pindexes) {
+                let dimension = &self.dimensions[dim_index];
+                data_indexes[dim_index] = dimension.presentation_order[pindex];
+            }
+        }
+        data_indexes
+    }
+
+    /// Returns an iterator for the layer axis:
+    ///
+    /// - If `print` is true and `self.look.print_all_layers`, then the iterator
+    ///   will visit all values of the layer axis.
+    ///
+    /// - Otherwise, the iterator will just visit `self.current_layer`.
+    pub fn layers(&self, print: bool) -> Box<dyn Iterator<Item = SmallVec<[usize; 4]>>> {
+        let all_layers = print && self.look.print_all_layers;
+        if !all_layers {
+            let current = SmallVec::from_slice(&self.current_layer);
+            return Box::new(once(current));
+        }
+        Box::new(self.axis_values(Axis3::Z))
+    }
+
+    /// Returns the value display options implied by this table and its
+    /// look.
+    pub fn value_options(&self) -> ValueOptions {
+        let footnote_marker_type = self.look.footnote_marker_type;
+        ValueOptions {
+            show_values: self.show_values,
+            show_variables: self.show_variables,
+            small: self.small,
+            footnote_marker_type,
+        }
+    }
+
+    /// Exchanges the dimensions on the X axis with those on the Y axis.
+    pub fn transpose(&mut self) {
+        self.axes.swap(Axis3::X, Axis3::Y);
+    }
+
+    /// Returns an iterator over the dimensions on `axis`, in the order that
+    /// the axis lists them.
+    pub fn axis_dimensions(
+        &self,
+        axis: Axis3,
+    ) -> impl DoubleEndedIterator<Item = &Dimension> + ExactSizeIterator {
+        self.axes[axis]
+            .dimensions
+            .iter()
+            .map(|&dim_index| &self.dimensions[dim_index])
+    }
+
+    /// Returns the axis that holds dimension `dim_index` and the dimension's
+    /// position on that axis, or `None` if no axis lists it.
+    fn find_dimension(&self, dim_index: usize) -> Option<(Axis3, usize)> {
+        debug_assert!(dim_index < self.dimensions.len());
+        enum_iterator::all::<Axis3>().find_map(|axis| {
+            self.axes[axis]
+                .dimensions
+                .iter()
+                .position(|&dimension| dimension == dim_index)
+                .map(|position| (axis, position))
+        })
+    }
+
+    /// Moves dimension `dim_index` to `new_position` on `new_axis`.
+    ///
+    /// `new_position` is the position on `new_axis` after the dimension has
+    /// been removed from its old place.  The current layer is adjusted to
+    /// match: a move within the layer axis keeps each dimension's current
+    /// layer index, a dimension that leaves the layer axis loses its index,
+    /// and a dimension that joins it starts at index 0.
+    ///
+    /// # Panics
+    ///
+    /// Panics if no axis holds dimension `dim_index`.
+    pub fn move_dimension(&mut self, dim_index: usize, new_axis: Axis3, new_position: usize) {
+        let (old_axis, old_position) = self.find_dimension(dim_index).unwrap();
+        let unmoved = old_axis == new_axis && old_position == new_position;
+        if unmoved {
+            return;
+        }
+
+        // Keep `current_layer` in step with the layer axis.
+        match (old_axis, new_axis) {
+            (Axis3::Z, Axis3::Z) => {
+                // Moving within the layer axis: carry the dimension's index
+                // along with it and shift the ones in between.
+                if old_position < new_position {
+                    self.current_layer[old_position..=new_position].rotate_left(1);
+                } else {
+                    self.current_layer[new_position..=old_position].rotate_right(1);
+                }
+            }
+            (Axis3::Z, _) => {
+                // A layer is becoming a row or column.
+                self.current_layer.remove(old_position);
+            }
+            (_, Axis3::Z) => {
+                // A row or column is becoming a layer.
+                self.current_layer.insert(new_position, 0);
+            }
+            _ => (),
+        }
+
+        self.axes[old_axis].dimensions.remove(old_position);
+        let target = &mut self.axes[new_axis].dimensions;
+        target.insert(new_position, dim_index);
+    }
+}
+
+/// Inserts each `(data_indexes, value)` pair into the table, as if by
+/// [PivotTable::insert].
+impl<I> Extend<(I, Value)> for PivotTable
+where
+    I: AsRef<[usize]>,
+{
+    fn extend<T: IntoIterator<Item = (I, Value)>>(&mut self, iter: T) {
+        iter.into_iter()
+            .for_each(|(data_indexes, value)| self.insert(data_indexes.as_ref(), value));
+    }
+}
+
+/// A footnote.
+#[derive(Clone, Debug, Serialize)]
+pub struct Footnote {
+    /// Zero-based index of the footnote, used to generate its marker when it
+    /// has no custom one.  Not serialized.
+    #[serde(skip)]
+    index: usize,
+
+    /// The text of the footnote.
+    pub content: Box<Value>,
+
+    /// Custom marker, if any, used in place of a generated one.
+    pub marker: Option<Box<Value>>,
+
+    /// Whether the footnote is shown.
+    pub show: bool,
+}
+
+impl Footnote {
+    /// Creates a shown footnote with the given `content`, index 0, and no
+    /// custom marker.
+    pub fn new(content: impl Into<Value>) -> Self {
+        let content = Box::new(content.into());
+        Self {
+            index: 0,
+            content,
+            marker: None,
+            show: true,
+        }
+    }
+
+    /// Returns this footnote with `marker` as its custom marker.
+    pub fn with_marker(self, marker: impl Into<Value>) -> Self {
+        Self {
+            marker: Some(Box::new(marker.into())),
+            ..self
+        }
+    }
+
+    /// Returns this footnote with `show` set as given.
+    pub fn with_show(self, show: bool) -> Self {
+        Self { show, ..self }
+    }
+
+    /// Returns this footnote with its index set to `index`.
+    pub fn with_index(self, index: usize) -> Self {
+        Self { index, ..self }
+    }
+
+    /// Returns a wrapper that displays the footnote's marker according to
+    /// `options`.
+    pub fn display_marker(&self, options: impl IntoValueOptions) -> DisplayMarker<'_> {
+        let options = options.into_value_options();
+        DisplayMarker {
+            footnote: self,
+            options,
+        }
+    }
+
+    /// Returns a wrapper that displays the footnote's content according to
+    /// `options`.
+    pub fn display_content(&self, options: impl IntoValueOptions) -> DisplayValue<'_> {
+        let content = &self.content;
+        content.display(options)
+    }
+
+    /// Returns the footnote's zero-based index.
+    pub fn index(&self) -> usize {
+        self.index
+    }
+}
+
+/// Formats the marker of a [Footnote]; created by
+/// [Footnote::display_marker].
+pub struct DisplayMarker<'a> {
+    /// The footnote whose marker is displayed.
+    footnote: &'a Footnote,
+
+    /// Options that select, among other things, the default marker style.
+    options: ValueOptions,
+}
+
+impl Display for DisplayMarker<'_> {
+    /// Writes the footnote's custom marker (without its own subscripts or
+    /// footnotes) if it has one; otherwise writes the one-based footnote
+    /// number, as letters or digits depending on the configured marker type.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match &self.footnote.marker {
+            Some(custom) => {
+                write!(f, "{}", custom.display(self.options).without_suffixes())
+            }
+            None => {
+                let ordinal = self.footnote.index + 1;
+                match self.options.footnote_marker_type {
+                    FootnoteMarkerType::Alphabetic => {
+                        write!(f, "{}", Display26Adic::new_lowercase(ordinal))
+                    }
+                    FootnoteMarkerType::Numeric => write!(f, "{ordinal}"),
+                }
+            }
+        }
+    }
+}
+
+/// Displays a number in 26adic (bijective base-26) notation.
+///
+/// Zero is displayed as the empty string, 1 through 26 as `a` through `z`, 27
+/// through 52 as `aa` through `az`, and so on.
+pub struct Display26Adic {
+    /// The number to display.
+    value: usize,
+
+    /// ASCII code of the letter that stands for digit 1 (`b'a'` or `b'A'`).
+    base: u8,
+}
+
+impl Display26Adic {
+    /// Constructs a `Display26Adic` for `value`, with letters in lowercase.
+    pub fn new_lowercase(value: usize) -> Self {
+        Self { value, base: b'a' }
+    }
+
+    /// Constructs a `Display26Adic` for `value`, with letters in uppercase.
+    pub fn new_uppercase(value: usize) -> Self {
+        Self { value, base: b'A' }
+    }
+}
+
+impl Display for Display26Adic {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Every letter encodes more than 4 bits, so this many letters is
+        // always enough for a `usize`.
+        const MAX_LETTERS: usize = (usize::BITS / 4 + 1) as usize;
+
+        // Letters are produced least-significant first, so fill the buffer
+        // from the back to end up with them in display order.
+        let mut letters = [0u8; MAX_LETTERS];
+        let mut start = MAX_LETTERS;
+        let mut remaining = self.value;
+        while remaining > 0 {
+            remaining -= 1;
+            start -= 1;
+            letters[start] = self.base + (remaining % 26) as u8;
+            remaining /= 26;
+        }
+        write!(f, "{}", from_utf8(&letters[start..]).unwrap())
+    }
+}
+
+/// The content of a single pivot table cell.
+///
+/// A [Value] is also a pivot table's title, caption, footnote marker and
+/// contents, and so on.
+///
+/// A given [Value] is one of:
+///
+/// 1. A number resulting from a calculation.
+///
+///    A number has an associated display format (usually [F] or [Pct]).  This
+///    format can be set directly (see [Value::new_number_with_format]), but
+///    that is not usually the easiest way.  Instead, it is usually true that
+///    all of the values in a single category should have the same format
+///    (e.g. all "Significance" values might use format `F40.3`), so PSPP makes
+///    it easy to set the default format for a category while creating the
+///    category.
+///
+///    [F]: crate::format::Type::F
+///    [Pct]: crate::format::Type::Pct
+///
+/// 2. A numeric or string value obtained from data ([ValueInner::Number] or
+///    [ValueInner::String]).  If such a value corresponds to a variable, then
+///    the variable's name can be attached to the value.  If the value has a
+///    value label, then that can also be attached.  When a label is present,
+///    the user can control whether to show the value or the label or both.
+///
+/// 3. A variable name ([ValueInner::Variable]).  The variable label, if any,
+///    can be attached too, and again the user can control whether to show the
+///    value or the label or both.
+///
+/// 4. A text string ([ValueInner::Text]).  The value can store the string in
+///    English and translated into the output language (localized).  In some
+///    cases, only an English or a localized version is available for one
+///    reason or another, although this is regrettable; see [Value::new_text]
+///    and [Value::new_user_text].
+///
+/// 5. A template.  PSPP doesn't create these itself yet, but it can read and
+///    interpret those created by SPSS.
+#[derive(Clone, Default)]
+pub struct Value {
+    /// What the value contains.
+    pub inner: ValueInner,
+
+    /// Optional styling, subscripts, and footnotes for the value.
+    pub styling: Option<Box<ValueStyle>>,
+}
+
+/// A [Value] serializes as its [ValueInner]; the styling is not serialized.
+impl Serialize for Value {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let Self { inner, .. } = self;
+        inner.serialize(serializer)
+    }
+}
+
+/// Wrapper for [Value] that uses [Value::serialize_bare] for serialization,
+/// so that only the underlying datum is emitted.
+#[derive(Serialize)]
+struct BareValue<'a>(#[serde(serialize_with = "Value::serialize_bare")] pub &'a Value);
+
+impl Value {
+    /// Serializes only the underlying datum of this value: the number for a
+    /// numeric value, the string for a string value, the variable name for a
+    /// variable, the localized string for text and templates, and "none" for
+    /// an empty value.  Formats, labels, and styling are omitted.
+    pub fn serialize_bare<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        match &self.inner {
+            ValueInner::Number(number_value) => number_value.serialize_bare(serializer),
+            ValueInner::String(string_value) => string_value.s.serialize(serializer),
+            ValueInner::Variable(variable_value) => variable_value.var_name.serialize(serializer),
+            ValueInner::Text(text_value) => text_value.localized.serialize(serializer),
+            ValueInner::Template(template_value) => template_value.localized.serialize(serializer),
+            ValueInner::Empty => serializer.serialize_none(),
+        }
+    }
+
+    /// Wraps `inner` in a [Value] without styling.
+    fn new(inner: ValueInner) -> Self {
+        Self {
+            inner,
+            styling: None,
+        }
+    }
+
+    /// Creates a numeric value (`None` meaning a missing value) displayed with
+    /// `format`, with no associated variable or value label.
+    pub fn new_number_with_format(x: Option<f64>, format: Format) -> Self {
+        Self::new(ValueInner::Number(NumberValue {
+            show: None,
+            format,
+            honor_small: false,
+            value: x,
+            variable: None,
+            value_label: None,
+        }))
+    }
+
+    /// Creates a value that names `variable`, carrying its label if it has
+    /// one.
+    pub fn new_variable(variable: &Variable) -> Self {
+        Self::new(ValueInner::Variable(VariableValue {
+            show: None,
+            var_name: String::from(variable.name.as_str()),
+            variable_label: variable.label.clone(),
+        }))
+    }
+
+    /// Creates a value from a datum that is not tied to a variable: numbers go
+    /// through [Value::new_number] and strings through
+    /// [Value::new_user_text].
+    pub fn new_datum<B>(value: &Datum<B>) -> Self
+    where
+        B: EncodedString,
+    {
+        match value {
+            Datum::Number(number) => Self::new_number(*number),
+            Datum::String(string) => Self::new_user_text(string.as_str()),
+        }
+    }
+
+    /// Creates a value for `value` as a value of `variable`, attaching the
+    /// variable's name, its print format, and the matching value label, if
+    /// any.
+    ///
+    /// # Panics
+    ///
+    /// In builds with debug assertions, panics if `value` is numeric but
+    /// `variable` has a string print format.  Other builds fall back to
+    /// `Format::F8_2` in that case.
+    pub fn new_variable_value(variable: &Variable, value: &Datum<ByteString>) -> Self {
+        let var_name = Some(variable.name.as_str().into());
+        let value_label = variable.value_labels.get(value).map(String::from);
+        match value {
+            Datum::Number(number) => Self::new(ValueInner::Number(NumberValue {
+                show: None,
+                format: match variable.print_format.var_type() {
+                    VarType::Numeric => variable.print_format,
+                    VarType::String => {
+                        #[cfg(debug_assertions)]
+                        panic!("cannot create numeric pivot value with string format");
+
+                        #[cfg(not(debug_assertions))]
+                        Format::F8_2
+                    }
+                },
+                honor_small: false,
+                value: *number,
+                variable: var_name,
+                value_label,
+            })),
+            Datum::String(string) => Self::new(ValueInner::String(StringValue {
+                show: None,
+                hex: variable.print_format.type_() == Type::AHex,
+                s: string
+                    .as_ref()
+                    .with_encoding(variable.encoding())
+                    .into_string(),
+                var_name,
+                value_label,
+            })),
+        }
+    }
+
+    /// Creates a numeric value displayed with `Format::F8_2`.
+    pub fn new_number(x: Option<f64>) -> Self {
+        Self::new_number_with_format(x, Format::F8_2)
+    }
+
+    /// Creates a numeric value displayed with `Format::F40`.
+    pub fn new_integer(x: Option<f64>) -> Self {
+        Self::new_number_with_format(x, Format::F40)
+    }
+
+    /// Creates a text value.  Currently equivalent to
+    /// [Value::new_user_text].
+    pub fn new_text(s: impl Into<String>) -> Self {
+        Self::new_user_text(s)
+    }
+
+    /// Creates a text value from user-provided text, which has no separate
+    /// English version or identifier.  An empty string yields an empty value.
+    pub fn new_user_text(s: impl Into<String>) -> Self {
+        let s: String = s.into();
+        if s.is_empty() {
+            Self::default()
+        } else {
+            Self::new(ValueInner::Text(TextValue {
+                user_provided: true,
+                // `s` is already owned, so move it rather than cloning.
+                localized: s,
+                c: None,
+                id: None,
+            }))
+        }
+    }
+
+    /// Returns this value with `footnote` added.
+    pub fn with_footnote(mut self, footnote: &Arc<Footnote>) -> Self {
+        self.add_footnote(footnote);
+        self
+    }
+
+    /// Adds `footnote` to this value, keeping its footnotes ordered by index.
+    pub fn add_footnote(&mut self, footnote: &Arc<Footnote>) {
+        let footnotes = &mut self.styling.get_or_insert_default().footnotes;
+        footnotes.push(footnote.clone());
+        footnotes.sort_by_key(|f| f.index);
+    }
+
+    /// Returns this value with its per-value "show" setting replaced, if it is
+    /// a numeric or string value.  Other kinds of values are returned
+    /// unchanged.
+    pub fn with_show_value_label(mut self, show: Option<Show>) -> Self {
+        let new_show = show;
+        match &mut self.inner {
+            ValueInner::Number(NumberValue { show, .. })
+            | ValueInner::String(StringValue { show, .. }) => {
+                *show = new_show;
+            }
+            _ => (),
+        }
+        self
+    }
+
+    /// Returns this value with its per-value "show" setting replaced, if it is
+    /// a variable.  Other kinds of values are returned unchanged.
+    pub fn with_show_variable_label(mut self, show: Option<Show>) -> Self {
+        if let ValueInner::Variable(variable_value) = &mut self.inner {
+            variable_value.show = show;
+        }
+        self
+    }
+
+    /// Returns this value with its value label replaced, if it is a numeric or
+    /// string value.  Other kinds of values are returned unchanged.
+    pub fn with_value_label(mut self, label: Option<String>) -> Self {
+        match &mut self.inner {
+            ValueInner::Number(NumberValue { value_label, .. })
+            | ValueInner::String(StringValue { value_label, .. }) => *value_label = label,
+            _ => (),
+        }
+        self
+    }
+
+    /// Returns an empty value without styling.
+    pub const fn empty() -> Self {
+        Value {
+            inner: ValueInner::Empty,
+            styling: None,
+        }
+    }
+
+    /// Returns true if this value has neither content nor styling.
+    pub const fn is_empty(&self) -> bool {
+        self.inner.is_empty() && self.styling.is_none()
+    }
+}
+
+/// Converts a string slice into a text value via [Value::new_text].
+impl From<&str> for Value {
+    fn from(value: &str) -> Self {
+        Self::new_text(value)
+    }
+}
+
+/// Converts an owned string into a text value via [Value::new_text].
+impl From<String> for Value {
+    fn from(value: String) -> Self {
+        Self::new_text(value)
+    }
+}
+
+/// Converts a variable into a value that names it, via
+/// [Value::new_variable].
+impl From<&Variable> for Value {
+    fn from(variable: &Variable) -> Self {
+        Self::new_variable(variable)
+    }
+}
+
+/// Formats a [Value] or [ValueInner] for display; created by
+/// [Value::display] or [ValueInner::display].
+pub struct DisplayValue<'a> {
+    /// The content to format.
+    inner: &'a ValueInner,
+
+    /// Markup flag copied from the font style, if one was supplied.
+    markup: bool,
+
+    /// Subscripts to append after the body.
+    subscripts: &'a [String],
+
+    /// Footnotes whose markers are appended after the subscripts.
+    footnotes: &'a [Arc<Footnote>],
+
+    /// Display options.
+    options: ValueOptions,
+
+    /// Whether to show the value itself.
+    show_value: bool,
+
+    /// The label to show, if a label should be shown.
+    show_label: Option<&'a str>,
+}
+
+impl<'a> DisplayValue<'a> {
+    /// Returns the subscripts that will be displayed, in order.
+    pub fn subscripts(&self) -> impl Iterator<Item = &str> {
+        self.subscripts.iter().map(String::as_str)
+    }
+
+    /// Returns true if there is at least one subscript.
+    pub fn has_subscripts(&self) -> bool {
+        !self.subscripts.is_empty()
+    }
+
+    /// Returns the markers of the footnotes whose `show` flag is set.
+    pub fn footnotes(&self) -> impl Iterator<Item = DisplayMarker<'_>> {
+        self.footnotes
+            .iter()
+            .filter(|f| f.show)
+            .map(|f| f.display_marker(self.options))
+    }
+
+    /// Returns true if at least one footnote has its `show` flag set.
+    pub fn has_footnotes(&self) -> bool {
+        self.footnotes().next().is_some()
+    }
+
+    /// Returns this display with subscripts and footnotes removed.
+    pub fn without_suffixes(self) -> Self {
+        Self {
+            subscripts: &[],
+            footnotes: &[],
+            ..self
+        }
+    }
+
+    /// Returns this display split into `(body, suffixes)` where `suffixes` is
+    /// subscripts and footnotes and `body` is everything else.
+    pub fn split_suffixes(self) -> (Self, Self) {
+        let suffixes = Self {
+            inner: &ValueInner::Empty,
+            ..self
+        };
+        (self.without_suffixes(), suffixes)
+    }
+
+    /// Returns this display with subscripts and footnotes taken from
+    /// `styling`, and with the markup flag taken from its font style if it has
+    /// an area style.
+    pub fn with_styling(mut self, styling: &'a ValueStyle) -> Self {
+        if let Some(area_style) = &styling.style {
+            self.markup = area_style.font_style.markup;
+        }
+        self.subscripts = styling.subscripts.as_slice();
+        self.footnotes = styling.footnotes.as_slice();
+        self
+    }
+
+    /// Returns this display with the markup flag taken from `font_style`.
+    pub fn with_font_style(self, font_style: &FontStyle) -> Self {
+        Self {
+            markup: font_style.markup,
+            ..self
+        }
+    }
+
+    /// Returns this display with its subscripts replaced.
+    pub fn with_subscripts(self, subscripts: &'a [String]) -> Self {
+        Self { subscripts, ..self }
+    }
+
+    /// Returns this display with its footnotes replaced.
+    pub fn with_footnotes(self, footnotes: &'a [Arc<Footnote>]) -> Self {
+        Self { footnotes, ..self }
+    }
+
+    /// Returns true if there is no content, no subscripts, and no footnotes
+    /// (whether shown or not).
+    pub fn is_empty(&self) -> bool {
+        self.inner.is_empty() && self.subscripts.is_empty() && self.footnotes.is_empty()
+    }
+
+    /// Returns the threshold below which "small" numbers may be displayed in
+    /// scientific notation.
+    fn small(&self) -> f64 {
+        self.options.small
+    }
+
+    /// Returns [VarType::Numeric] if this displays a number without a label,
+    /// otherwise [VarType::String].
+    pub fn var_type(&self) -> VarType {
+        match self.inner {
+            ValueInner::Number(NumberValue { .. }) if self.show_label.is_none() => VarType::Numeric,
+            _ => VarType::String,
+        }
+    }
+
+    /// Writes the character that follows a backslash in a template.
+    ///
+    /// `input` is the template text just past the backslash.  `\n` yields a
+    /// new-line, any other escaped character yields itself, and a backslash at
+    /// the very end of the template yields a backslash.  Returns the text that
+    /// follows the escaped character.
+    fn write_escaped<'b>(
+        f: &mut std::fmt::Formatter<'_>,
+        input: &'b [u8],
+    ) -> Result<&'b [u8], std::fmt::Error> {
+        let Some(&first) = input.first() else {
+            f.write_str("\\")?;
+            return Ok(input);
+        };
+        if first == b'n' {
+            f.write_str("\n")?;
+            return Ok(&input[1..]);
+        }
+
+        // Take the whole UTF-8 sequence (the lead byte plus any continuation
+        // bytes) so that an escaped non-ASCII character stays intact.
+        let len = 1 + input[1..]
+            .iter()
+            .take_while(|&&b| (b & 0xc0) == 0x80)
+            .count();
+        let (escaped, rest) = input.split_at(len);
+        f.write_str(&String::from_utf8_lossy(escaped))?;
+        Ok(rest)
+    }
+
+    /// Writes the run of literal text at the start of `input`, which extends
+    /// up to (but not including) the next byte for which `is_special` returns
+    /// true, and returns the remainder.
+    ///
+    /// The caller must have checked that the first byte of `input` is not
+    /// special; it is always treated as literal so that progress is
+    /// guaranteed.  All of the special bytes are ASCII, so a run never splits
+    /// a multibyte UTF-8 sequence.
+    fn write_literal<'b>(
+        f: &mut std::fmt::Formatter<'_>,
+        input: &'b [u8],
+        is_special: impl Fn(u8) -> bool,
+    ) -> Result<&'b [u8], std::fmt::Error> {
+        let end = input
+            .iter()
+            .skip(1)
+            .position(|&b| is_special(b))
+            .map_or(input.len(), |position| position + 1);
+        let (literal, rest) = input.split_at(end);
+        f.write_str(&String::from_utf8_lossy(literal))?;
+        Ok(rest)
+    }
+
+    /// Expands `template` with `args` and writes the result to `f`.
+    ///
+    /// In the template, `\` escapes the following character (`\n` is a
+    /// new-line), `^N` expands to the first value of argument `N` (counting
+    /// from 1), and `[A:B:]N` expands the inner templates `A` and `B` over the
+    /// values of argument `N`.  Anything else is copied literally.  References
+    /// to arguments that do not exist expand to nothing.
+    fn template(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        template: &str,
+        args: &[Vec<Value>],
+    ) -> std::fmt::Result {
+        let mut rest = template.as_bytes();
+        while let Some((&c, tail)) = rest.split_first() {
+            match c {
+                b'\\' => rest = Self::write_escaped(f, tail)?,
+                b'^' => {
+                    let (index, tail) = consume_int(tail);
+                    rest = tail;
+                    let first_value = args
+                        .get(index.wrapping_sub(1))
+                        .and_then(|arg| arg.first());
+                    if let Some(arg) = first_value {
+                        write!(f, "{}", arg.display(self.options))?;
+                    }
+                }
+                b'[' => {
+                    let (a, tail) = extract_inner_template(tail);
+                    let (b, tail) = extract_inner_template(tail);
+                    let tail = tail.strip_prefix(b"]").unwrap_or(tail);
+                    let (index, tail) = consume_int(tail);
+                    rest = tail;
+
+                    let Some(mut args) = args.get(index.wrapping_sub(1)).map(|vec| vec.as_slice())
+                    else {
+                        continue;
+                    };
+
+                    // If `a` is present, it is used (with `%` as its escape)
+                    // the first time through, and `b` (with `^`) thereafter.
+                    let (mut template, mut escape) =
+                        if !a.is_empty() { (a, b'%') } else { (b, b'^') };
+                    while !args.is_empty() {
+                        let n_consumed = self.inner_template(f, template, escape, args)?;
+                        if n_consumed == 0 {
+                            // Avoid looping forever on a template that uses
+                            // no arguments.
+                            break;
+                        }
+                        args = &args[n_consumed..];
+
+                        template = b;
+                        escape = b'^';
+                    }
+                }
+                _ => {
+                    rest = Self::write_literal(f, rest, |b| matches!(b, b'\\' | b'^' | b'['))?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Expands the inner `template` once and writes the result to `f`.
+    ///
+    /// `escape` followed by `N` expands to `args[N - 1]`, if it exists, and
+    /// `\` escapes the following character.  Returns the largest `N` that was
+    /// expanded, that is, the number of leading `args` that the expansion used
+    /// up.
+    fn inner_template(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        template: &[u8],
+        escape: u8,
+        args: &[Value],
+    ) -> Result<usize, std::fmt::Error> {
+        let mut rest = template;
+        let mut args_consumed = 0;
+        while let Some((&c, tail)) = rest.split_first() {
+            if c == b'\\' {
+                rest = Self::write_escaped(f, tail)?;
+            } else if c == escape {
+                let (index, tail) = consume_int(tail);
+                rest = tail;
+                let Some(arg) = args.get(index.wrapping_sub(1)) else {
+                    continue;
+                };
+                args_consumed = args_consumed.max(index);
+                write!(f, "{}", arg.display(self.options))?;
+            } else {
+                rest = Self::write_literal(f, rest, |b| b == b'\\' || b == escape)?;
+            }
+        }
+        Ok(args_consumed)
+    }
+}
+
+/// Parses the run of ASCII decimal digits at the start of `input`.
+///
+/// Returns the parsed number and the remainder of `input` after the digits.
+/// If `input` does not start with a digit, the number is 0 and the remainder
+/// is all of `input`.  A number too large for `usize` saturates at
+/// `usize::MAX` instead of overflowing.
+fn consume_int(input: &[u8]) -> (usize, &[u8]) {
+    let n_digits = input.iter().take_while(|c| c.is_ascii_digit()).count();
+    let (digits, rest) = input.split_at(n_digits);
+    let n = digits.iter().fold(0usize, |n, &c| {
+        n.saturating_mul(10).saturating_add(usize::from(c - b'0'))
+    });
+    (n, rest)
+}
+
+/// Splits `input` at its first unescaped `:`.
+///
+/// Returns the text before the colon and the text after it; the colon itself
+/// is dropped, so that the remainder can be passed straight back in to extract
+/// the next inner template.  A backslash escapes the byte that follows it
+/// (escapes are left in place in the returned text).  If there is no unescaped
+/// colon, returns all of `input` and an empty remainder.
+fn extract_inner_template(input: &[u8]) -> (&[u8], &[u8]) {
+    let mut index = 0;
+    while let Some(&c) = input.get(index) {
+        match c {
+            // Skip the escaped byte too, so that `\:` is not a delimiter but
+            // the colon in `\\:` is.
+            b'\\' => index += 2,
+            b':' => return (&input[..index], &input[index + 1..]),
+            _ => index += 1,
+        }
+    }
+    (input, &[])
+}
+
+/// Decides whether to show a value, its label, or both.
+///
+/// The most specific setting wins: `value_show` if it is set, otherwise
+/// `table_show`, otherwise the result of calling `global_show`.  Returns
+/// whether to show the value and, if the label is to be shown, `label`.
+fn interpret_show(
+    global_show: impl Fn() -> Show,
+    table_show: Option<Show>,
+    value_show: Option<Show>,
+    label: &str,
+) -> (bool, Option<&str>) {
+    let show = value_show.or(table_show).unwrap_or_else(global_show);
+    let show_value = matches!(show, Show::Value | Show::Both);
+    let show_label = matches!(show, Show::Label | Show::Both).then_some(label);
+    (show_value, show_label)
+}
+
+/// Writes the body of the value, followed by its subscripts (introduced by
+/// `_` and separated by `,`) and its footnote markers (each in square
+/// brackets).
+impl Display for DisplayValue<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self.inner {
+            ValueInner::Number(NumberValue {
+                format,
+                honor_small,
+                value,
+                ..
+            }) => {
+                if self.show_value {
+                    // With `honor_small`, a nonzero F-format value whose
+                    // magnitude is below the "small" threshold is shown in E
+                    // format with the same number of decimals.
+                    let format = if format.type_() == Type::F
+                        && *honor_small
+                        && value.is_some_and(|value| value != 0.0 && value.abs() < self.small())
+                    {
+                        UncheckedFormat::new(Type::E, 40, format.d() as u8).fix()
+                    } else {
+                        *format
+                    };
+                    let mut buf = SmallString::<[u8; 40]>::new();
+                    write!(
+                        &mut buf,
+                        "{}",
+                        Datum::<&str>::Number(*value).display(format)
+                    )
+                    .unwrap();
+                    // Drop the leading spaces of the formatted number.
+                    write!(f, "{}", buf.trim_start_matches(' '))?;
+                }
+                if let Some(label) = self.show_label {
+                    if self.show_value {
+                        write!(f, " ")?;
+                    }
+                    f.write_str(label)?;
+                }
+                Ok(())
+            }
+
+            ValueInner::String(StringValue { s, .. })
+            | ValueInner::Variable(VariableValue { var_name: s, .. }) => {
+                match (self.show_value, self.show_label) {
+                    (true, None) => write!(f, "{s}"),
+                    (false, Some(label)) => write!(f, "{label}"),
+                    (true, Some(label)) => write!(f, "{s} {label}"),
+                    // `ValueInner::display` always enables at least one of
+                    // the value and the label.
+                    (false, None) => unreachable!(),
+                }
+            }
+
+            ValueInner::Text(TextValue {
+                localized: local, ..
+            }) => {
+                // TODO: markup is not interpreted here; the localized text is
+                // written verbatim.
+                f.write_str(local)
+            }
+
+            ValueInner::Template(TemplateValue {
+                args,
+                localized: local,
+                ..
+            }) => self.template(f, local, args),
+
+            ValueInner::Empty => Ok(()),
+        }?;
+
+        // The first subscript is introduced by `_`, later ones by `,`.
+        for (subscript, delimiter) in self.subscripts.iter().zip(once('_').chain(repeat(','))) {
+            write!(f, "{delimiter}{subscript}")?;
+        }
+
+        // NOTE(review): unlike `DisplayValue::footnotes`, this writes every
+        // footnote's marker regardless of its `show` flag — confirm that this
+        // is intended.
+        for footnote in self.footnotes {
+            write!(f, "[{}]", footnote.display_marker(self.options))?;
+        }
+
+        Ok(())
+    }
+}
+
+impl Value {
+    /// Returns an object that will format this value, including subscripts
+    /// and footnotes taken from the value's styling, if it has any.
+    /// `options` controls whether variable and value labels are included.
+    pub fn display(&self, options: impl IntoValueOptions) -> DisplayValue<'_> {
+        let display = self.inner.display(options.into_value_options());
+        match &self.styling {
+            Some(styling) => display.with_styling(styling),
+            None => display,
+        }
+    }
+}
+
+/// Debug output for a [Value] is its displayed form (with default options),
+/// formatted as a quoted string.
+impl Debug for Value {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.display(()).to_string())
+    }
+}
+
+/// A numeric [Value].
+#[derive(Clone, Debug)]
+pub struct NumberValue {
+    /// The numerical value, or `None` if it is a missing value.
+    pub value: Option<f64>,
+
+    /// The format used to display the value.
+    pub format: Format,
+
+    /// Per-value setting for whether to show the value, its label, or both.
+    /// If `None`, table or global settings apply.
+    pub show: Option<Show>,
+
+    /// If true and `format` is an F format, nonzero values whose magnitude is
+    /// below the "small" threshold in the display options are shown in E
+    /// format.
+    pub honor_small: bool,
+
+    /// Name of the variable that the value came from, if any.
+    pub variable: Option<String>,
+
+    /// The value's label, if any.
+    pub value_label: Option<String>,
+}
+
+/// A plain number (F format, no variable, no value label) serializes as just
+/// the number.  Anything else serializes as a map that always has `value` and
+/// `format` and includes the other fields only when they are set.
+impl Serialize for NumberValue {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if self.format.type_() == Type::F && self.variable.is_none() && self.value_label.is_none() {
+            self.value.serialize(serializer)
+        } else {
+            // The number of entries depends on which optional fields are set,
+            // so no length hint is given.
+            let mut s = serializer.serialize_map(None)?;
+            s.serialize_entry("value", &self.value)?;
+            s.serialize_entry("format", &self.format)?;
+            if let Some(show) = self.show {
+                s.serialize_entry("show", &show)?;
+            }
+            if self.honor_small {
+                s.serialize_entry("honor_small", &self.honor_small)?;
+            }
+            if let Some(variable) = &self.variable {
+                s.serialize_entry("variable", variable)?;
+            }
+            if let Some(value_label) = &self.value_label {
+                s.serialize_entry("value_label", value_label)?;
+            }
+            s.end()
+        }
+    }
+}
+
+impl NumberValue {
+    /// Serializes just the number, without its format or labels.
+    ///
+    /// A value that is a whole number in the range -2^53..=2^53, in which
+    /// every integer is exactly representable as an `f64`, is serialized as an
+    /// integer.  Any other value (including a missing value) is serialized as
+    /// an optional floating-point number.
+    pub fn serialize_bare<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        if let Some(number) = self.value
+            && number.trunc() == number
+            && number >= -(1i64 << 53) as f64
+            && number <= (1i64 << 53) as f64
+        {
+            // The range includes negative numbers, so the integer type must
+            // be signed: casting a negative float to `u64` saturates to 0.
+            (number as i64).serialize(serializer)
+        } else {
+            self.value.serialize(serializer)
+        }
+    }
+}
+
+/// Wrapper for [NumberValue] that uses [NumberValue::serialize_bare] for
+/// serialization.
+#[derive(Serialize)]
+pub struct BareNumberValue<'a>(
+    #[serde(serialize_with = "NumberValue::serialize_bare")] pub &'a NumberValue,
+);
+
+/// A string [Value] obtained from data.
+#[derive(Clone, Debug, Serialize)]
+pub struct StringValue {
+    /// The string value.
+    ///
+    /// If `hex` is true, this should contain hex digits, not raw binary data
+    /// (otherwise it would be impossible to encode non-UTF-8 data).
+    pub s: String,
+
+    /// True if `s` is hex digits.
+    pub hex: bool,
+
+    /// Per-value setting for whether to show the value, its label, or both.
+    /// If `None`, table or global settings apply.
+    pub show: Option<Show>,
+
+    /// Name of the variable that the value came from, if any.
+    pub var_name: Option<String>,
+
+    /// The value's label, if any.
+    pub value_label: Option<String>,
+}
+
+/// A [Value] that names a variable.
+#[derive(Clone, Debug, Serialize)]
+pub struct VariableValue {
+    /// Per-value setting for whether to show the variable's name, its label,
+    /// or both.  If `None`, table or global settings apply.
+    pub show: Option<Show>,
+
+    /// The variable's name.
+    pub var_name: String,
+
+    /// The variable's label, if any.
+    pub variable_label: Option<String>,
+}
+
+/// A text string [Value].
+#[derive(Clone, Debug)]
+pub struct TextValue {
+    /// Whether the text was provided by the user.
+    pub user_provided: bool,
+    /// Localized.  This is the version that is displayed.
+    pub localized: String,
+    /// English.  If `None`, [TextValue::c] falls back to `localized`.
+    pub c: Option<String>,
+    /// Identifier.  If `None`, [TextValue::id] falls back to `localized`.
+    pub id: Option<String>,
+}
+
+/// User-provided text with no English version or identifier serializes as
+/// just the localized string.  Anything else serializes as a struct that
+/// includes `c` and `id` only when they are set.
+impl Serialize for TextValue {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if self.user_provided && self.c.is_none() && self.id.is_none() {
+            serializer.serialize_str(&self.localized)
+        } else {
+            // Two fields are always present; `c` and `id` are optional.
+            let mut s = serializer.serialize_struct(
+                "TextValue",
+                2 + self.c.is_some() as usize + self.id.is_some() as usize,
+            )?;
+            s.serialize_field("user_provided", &self.user_provided)?;
+            s.serialize_field("localized", &self.localized)?;
+            if let Some(c) = &self.c {
+                s.serialize_field("c", &c)?;
+            }
+            if let Some(id) = &self.id {
+                s.serialize_field("id", &id)?;
+            }
+            s.end()
+        }
+    }
+}
+
+impl TextValue {
+    /// Returns the localized text.
+    pub fn localized(&self) -> &str {
+        &self.localized
+    }
+
+    /// Returns the English text, or the localized text if there is no
+    /// separate English version.
+    pub fn c(&self) -> &str {
+        self.c.as_deref().unwrap_or(self.localized.as_str())
+    }
+
+    /// Returns the identifier, or the localized text if there is no separate
+    /// identifier.
+    pub fn id(&self) -> &str {
+        self.id.as_deref().unwrap_or(self.localized.as_str())
+    }
+}
+
+/// A template [Value]: a template string plus the arguments to substitute
+/// into it.
+#[derive(Clone, Debug, Serialize)]
+pub struct TemplateValue {
+    /// The arguments.  Each argument is a list of values.
+    pub args: Vec<Vec<Value>>,
+
+    /// The template string that is expanded for display.
+    pub localized: String,
+
+    /// Identifier.  NOTE(review): presumably analogous to [TextValue::id] —
+    /// confirm.
+    pub id: String,
+}
+
+/// The content of a [Value], without styling.
+#[derive(Clone, Debug, Default, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ValueInner {
+    /// A number.
+    Number(NumberValue),
+    /// A string obtained from data.
+    String(StringValue),
+    /// A variable name.
+    Variable(VariableValue),
+    /// A text string.
+    Text(TextValue),
+    /// A template with arguments.
+    Template(TemplateValue),
+
+    /// No content.
+    #[default]
+    Empty,
+}
+
+impl ValueInner {
+    /// Returns true if this is [ValueInner::Empty].
+    pub const fn is_empty(&self) -> bool {
+        matches!(self, Self::Empty)
+    }
+
+    /// Returns the per-value "show" setting, for the kinds of values that
+    /// have one.
+    fn show(&self) -> Option<Show> {
+        match self {
+            Self::Number(NumberValue { show, .. })
+            | Self::String(StringValue { show, .. })
+            | Self::Variable(VariableValue { show, .. }) => *show,
+            _ => None,
+        }
+    }
+
+    /// Returns the value label if there is one, otherwise the variable label
+    /// if there is one.
+    fn label(&self) -> Option<&str> {
+        match self.value_label() {
+            Some(value_label) => Some(value_label),
+            None => self.variable_label(),
+        }
+    }
+
+    /// Returns the value label of a numeric or string value, if it has one.
+    fn value_label(&self) -> Option<&str> {
+        match self {
+            Self::Number(NumberValue { value_label, .. })
+            | Self::String(StringValue { value_label, .. }) => value_label.as_deref(),
+            _ => None,
+        }
+    }
+
+    /// Returns the variable label of a variable, if it has one.
+    fn variable_label(&self) -> Option<&str> {
+        match self {
+            Self::Variable(VariableValue { variable_label, .. }) => variable_label.as_deref(),
+            _ => None,
+        }
+    }
+}
+
+/// Styling, subscripts, and footnotes attached to a [Value].
+#[derive(Clone, Debug, Default)]
+pub struct ValueStyle {
+    /// Area style override, if any.
+    pub style: Option<AreaStyle>,
+
+    /// Subscripts displayed after the value.
+    pub subscripts: Vec<String>,
+
+    /// Footnotes that the value refers to.
+    pub footnotes: Vec<Arc<Footnote>>,
+}
+
+impl ValueStyle {
+    /// Returns true if there is no area style, no subscripts, and no
+    /// footnotes.
+    pub fn is_empty(&self) -> bool {
+        let Self {
+            style,
+            subscripts,
+            footnotes,
+        } = self;
+        style.is_none() && subscripts.is_empty() && footnotes.is_empty()
+    }
+}
+
+impl ValueInner {
+    /// Returns an object that will format this value, without subscripts or
+    /// footnotes.  Settings on `options` control whether variable and value
+    /// labels are included.
+    pub fn display(&self, options: impl IntoValueOptions) -> DisplayValue<'_> {
+        let options = options.into_value_options();
+
+        // A value label takes precedence over a variable label.  Without any
+        // label, only the value itself can be shown.
+        let (show_value, show_label) = self
+            .value_label()
+            .map(|value_label| {
+                interpret_show(
+                    || Settings::global().show_values,
+                    options.show_values,
+                    self.show(),
+                    value_label,
+                )
+            })
+            .or_else(|| {
+                self.variable_label().map(|variable_label| {
+                    interpret_show(
+                        || Settings::global().show_variables,
+                        options.show_variables,
+                        self.show(),
+                        variable_label,
+                    )
+                })
+            })
+            .unwrap_or((true, None));
+
+        DisplayValue {
+            inner: self,
+            markup: false,
+            subscripts: &[],
+            footnotes: &[],
+            options,
+            show_value,
+            show_label,
+        }
+    }
+}
+
+/// A named node in a tree of metadata.
+pub struct MetadataEntry {
+    /// The node's name.
+    pub name: Value,
+
+    /// The node's value: either a single value or a group of child entries.
+    pub value: MetadataValue,
+}
+
+/// The value of a [MetadataEntry].
+pub enum MetadataValue {
+    /// A single value.
+    Leaf(Value),
+
+    /// A group of child entries.
+    Group(Vec<MetadataEntry>),
+}
+
+impl MetadataEntry {
+    /// Converts this metadata tree into a pivot table with a single dimension
+    /// on [Axis3::Y], whose categories mirror the tree and whose data are the
+    /// leaf values.  Empty leaf values get no data cell.
+    ///
+    /// If this entry is itself a leaf, it is wrapped in a group labeled
+    /// "Metadata".
+    pub fn into_pivot_table(self) -> PivotTable {
+        let mut data = Vec::new();
+        let group = match self.visit(&mut data) {
+            Category::Group(group) => group,
+            Category::Leaf(leaf) => Group::new("Metadata").with(leaf).with_label_shown(),
+        };
+        PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data(
+            data.into_iter()
+                .enumerate()
+                // The row index is assigned before filtering so that it still
+                // matches the position of the leaf in the tree.
+                .filter(|(_row, value)| !value.is_empty())
+                .map(|(row, value)| ([row], value)),
+        )
+    }
+
+    /// Converts this entry into a [Category], appending the values of its
+    /// leaves to `data` in the order that they are visited.
+    fn visit(self, data: &mut Vec<Value>) -> Category {
+        match self.value {
+            MetadataValue::Leaf(value) => {
+                data.push(value);
+                Leaf::new(self.name).into()
+            }
+            MetadataValue::Group(items) => Group::with_capacity(self.name, items.len())
+                .with_multiple(items.into_iter().map(|item| item.visit(data)))
+                .into(),
+        }
+    }
+}
+
+/// A leaf serializes as its bare value (see [Value::serialize_bare]).  A group
+/// serializes as a map from each child's displayed name to the child's value.
+impl Serialize for MetadataValue {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        match self {
+            MetadataValue::Leaf(value) => value.serialize_bare(serializer),
+            MetadataValue::Group(items) => {
+                let mut map = serializer.serialize_map(Some(items.len()))?;
+                for item in items {
+                    // Keys are the names as displayed with default options.
+                    let name = item.name.display(()).to_string();
+                    map.serialize_entry(&name, &item.value)?;
+                }
+                map.end()
+            }
+        }
+    }
+}
+/// A leaf entry serializes as a one-entry map from its displayed name to its
+/// bare value.  A group entry serializes as the map of its children; the
+/// group's own name is not included.
+impl Serialize for MetadataEntry {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        match &self.value {
+            MetadataValue::Leaf(value) => {
+                let mut map = serializer.serialize_map(Some(1))?;
+                map.serialize_entry(&self.name.display(()).to_string(), &BareValue(value))?;
+                map.end()
+            }
+            // `MetadataValue` already serializes a group as the map of its
+            // children.
+            group @ MetadataValue::Group(_) => group.serialize(serializer),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::output::pivot::{Display26Adic, MetadataEntry, MetadataValue, Value};
+
+    /// Checks 26adic output in both letter cases, including zero (the empty
+    /// string) and the boundaries where the number of letters changes.
+    #[test]
+    fn display_26adic() {
+        for (number, lowercase, uppercase) in [
+            (0, "", ""),
+            (1, "a", "A"),
+            (2, "b", "B"),
+            (26, "z", "Z"),
+            (27, "aa", "AA"),
+            (28, "ab", "AB"),
+            (29, "ac", "AC"),
+            (18278, "zzz", "ZZZ"),
+            (18279, "aaaa", "AAAA"),
+            (19010, "abcd", "ABCD"),
+        ] {
+            assert_eq!(Display26Adic::new_lowercase(number).to_string(), lowercase);
+            assert_eq!(Display26Adic::new_uppercase(number).to_string(), uppercase);
+        }
+    }
+
+    /// Checks both renderings of a nested metadata tree: JSON serialization
+    /// and conversion to a pivot table.
+    #[test]
+    fn metadata_entry() {
+        let tree = MetadataEntry {
+            name: Value::from("Group"),
+            value: MetadataValue::Group(vec![
+                MetadataEntry {
+                    name: Value::from("Name 1"),
+                    value: MetadataValue::Leaf(Value::from("Value 1")),
+                },
+                MetadataEntry {
+                    name: Value::from("Subgroup 1"),
+                    value: MetadataValue::Group(vec![
+                        MetadataEntry {
+                            name: Value::from("Subname 1"),
+                            value: MetadataValue::Leaf(Value::from("Subvalue 1")),
+                        },
+                        MetadataEntry {
+                            name: Value::from("Subname 2"),
+                            value: MetadataValue::Leaf(Value::from("Subvalue 2")),
+                        },
+                        MetadataEntry {
+                            name: Value::from("Subname 3"),
+                            value: MetadataValue::Leaf(Value::new_integer(Some(3.0))),
+                        },
+                    ]),
+                },
+                MetadataEntry {
+                    name: Value::from("Name 2"),
+                    value: MetadataValue::Leaf(Value::from("Value 2")),
+                },
+            ]),
+        };
+        // The root group's own name does not appear in the JSON output.
+        assert_eq!(
+            serde_json::to_string_pretty(&tree).unwrap(),
+            r#"{
+ "Name 1": "Value 1",
+ "Subgroup 1": {
+ "Subname 1": "Subvalue 1",
+ "Subname 2": "Subvalue 2",
+ "Subname 3": 3
+ },
+ "Name 2": "Value 2"
+}"#
+        );
+
+        assert_eq!(
+            tree.into_pivot_table().to_string(),
+            r#"╭────────────────────┬──────────╮
+│ Name 1 │Value 1 │
+├────────────────────┼──────────┤
+│Subgroup 1 Subname 1│Subvalue 1│
+│ Subname 2│Subvalue 2│
+│ Subname 3│ 3│
+├────────────────────┼──────────┤
+│ Name 2 │Value 2 │
+╰────────────────────┴──────────╯
+"#
+        );
+    }
+}
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-//! Pivot tables.
-//!
-//! Pivot tables are PSPP's primary form of output. They are analogous to the
-//! pivot tables you might be familiar with from spreadsheets and databases.
-//! See <https://en.wikipedia.org/wiki/Pivot_table> for a brief introduction to
-//! the overall concept of a pivot table.
-//!
-//! In PSPP, the most important internal pieces of a pivot table are:
-//!
-//! - Title. Every pivot table has a title that is displayed above it. It also
-//! has an optional caption (displayed below it) and corner text (displayed in
-//! the upper left corner).
-//!
-//! - Dimensions. A dimension consists of zero or more categories. A category
-//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The
-//! categories are the leaves of a tree whose non-leaf nodes form groups of
-//! categories. The tree always has a root group whose label is the name of
-//! the dimension.
-//!
-//! - Axes. A table has three axes: column, row, and layer. Each dimension is
-//! assigned to an axis, and each axis has zero or more dimensions. When an
-//! axis has more than one dimension, they are ordered from innermost to
-//! outermost.
-//!
-//! - Data. A table's data consists of zero or more cells. Each cell maps from
-//! a category for each dimension to a value, which is commonly a number but
-//! could also be a variable name or an arbitrary text string.
-
-use std::{
- collections::HashMap,
- fmt::{Debug, Display, Write},
- io::Read,
- iter::{once, repeat, repeat_n, FusedIterator},
- ops::{Index, IndexMut, Not, Range, RangeInclusive},
- str::{from_utf8, FromStr, Utf8Error},
- sync::{Arc, OnceLock},
-};
-
-use binrw::Error as BinError;
-use chrono::NaiveDateTime;
-pub use color::ParseError as ParseColorError;
-use color::{palette::css::TRANSPARENT, AlphaColor, Rgba8, Srgb};
-use enum_iterator::Sequence;
-use enum_map::{enum_map, Enum, EnumMap};
-use look_xml::TableProperties;
-use quick_xml::{de::from_str, DeError};
-use serde::{
- de::Visitor,
- ser::{SerializeMap, SerializeStruct},
- Deserialize, Serialize, Serializer,
-};
-use smallstr::SmallString;
-use smallvec::SmallVec;
-use thiserror::Error as ThisError;
-use tlo::parse_tlo;
-
-use crate::{
- data::{ByteString, Datum, EncodedString, RawString},
- format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat},
- settings::{Settings, Show},
- util::ToSmallString,
- variable::{VarType, Variable},
-};
-
-pub mod output;
-
-mod look_xml;
-#[cfg(test)]
-pub mod test;
-mod tlo;
-
-/// Areas of a pivot table for styling purposes.
-#[derive(Copy, Clone, Debug, Default, Enum, PartialEq, Eq)]
-pub enum Area {
- Title,
- Caption,
-
- /// Footnotes,
- Footer,
-
- // Top-left corner.
- Corner,
-
- /// Labels for columns ([Axis2::X]) and rows ([Axis2::Y]).
- Labels(Axis2),
-
- #[default]
- Data,
-
- /// Layer indication.
- Layers,
-}
-
-impl Display for Area {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self {
- Area::Title => write!(f, "title"),
- Area::Caption => write!(f, "caption"),
- Area::Footer => write!(f, "footer"),
- Area::Corner => write!(f, "corner"),
- Area::Labels(axis2) => write!(f, "labels({axis2})"),
- Area::Data => write!(f, "data"),
- Area::Layers => write!(f, "layers"),
- }
- }
-}
-
-impl Serialize for Area {
- fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: serde::Serializer,
- {
- serializer.serialize_str(&self.to_small_string::<16>())
- }
-}
-
-impl Area {
- fn default_cell_style(self) -> CellStyle {
- use HorzAlign::*;
- use VertAlign::*;
- let (horz_align, vert_align, hmargins, vmargins) = match self {
- Area::Title => (Some(Center), Middle, [8, 11], [1, 8]),
- Area::Caption => (Some(Left), Top, [8, 11], [1, 1]),
- Area::Footer => (Some(Left), Top, [11, 8], [2, 3]),
- Area::Corner => (Some(Left), Bottom, [8, 11], [1, 1]),
- Area::Labels(Axis2::X) => (Some(Center), Top, [8, 11], [1, 3]),
- Area::Labels(Axis2::Y) => (Some(Left), Top, [8, 11], [1, 3]),
- Area::Data => (None, Top, [8, 11], [1, 1]),
- Area::Layers => (Some(Left), Bottom, [8, 11], [1, 3]),
- };
- CellStyle {
- horz_align,
- vert_align,
- margins: enum_map! { Axis2::X => hmargins, Axis2::Y => vmargins },
- }
- }
-
- fn default_font_style(self) -> FontStyle {
- FontStyle {
- bold: self == Area::Title,
- italic: false,
- underline: false,
- markup: false,
- font: String::from("Sans Serif"),
- fg: [Color::BLACK; 2],
- bg: [Color::WHITE; 2],
- size: 9,
- }
- }
-
- fn default_area_style(self) -> AreaStyle {
- AreaStyle {
- cell_style: self.default_cell_style(),
- font_style: self.default_font_style(),
- }
- }
-}
-
-/// Table borders for styling purposes.
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)]
-pub enum Border {
- Title,
- OuterFrame(BoxBorder),
- InnerFrame(BoxBorder),
- Dimension(RowColBorder),
- Category(RowColBorder),
- DataLeft,
- DataTop,
-}
-
-impl Border {
-    /// Returns the stroke that the default [Look] uses for this border.
-    ///
-    /// Inner-frame and data borders are thick.  Dimension borders that are in
-    /// the column headings or along [Axis2::X], and category borders in the
-    /// column headings, are solid.  Every other border has no stroke.
-    pub fn default_stroke(self) -> Stroke {
-        match self {
-            Self::Dimension(RowColBorder(HeadingRegion::Columns, _))
-            | Self::Dimension(RowColBorder(_, Axis2::X))
-            | Self::Category(RowColBorder(HeadingRegion::Columns, _)) => Stroke::Solid,
-            Self::InnerFrame(_) | Self::DataLeft | Self::DataTop => Stroke::Thick,
-            _ => Stroke::None,
-        }
-    }
-
-    /// Returns the default style for this border: the default stroke, in
-    /// black.
-    pub fn default_border_style(self) -> BorderStyle {
-        let stroke = self.default_stroke();
-        BorderStyle {
-            stroke,
-            color: Color::BLACK,
-        }
-    }
-
-    /// Returns the border to fall back to for this one: a dimension border
-    /// maps to the category border with the same region and axis, and every
-    /// other border maps to itself.
-    fn fallback(self) -> Self {
-        match self {
-            Self::Dimension(row_col_border) => Self::Category(row_col_border),
-            Self::Category(_)
-            | Self::Title
-            | Self::OuterFrame(_)
-            | Self::InnerFrame(_)
-            | Self::DataLeft
-            | Self::DataTop => self,
-        }
-    }
-}
-
-/// Formats a border as its snake-case name, with the box side or
-/// `region:axis` pair in parentheses where the variant carries one.
-impl Display for Border {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match *self {
-            Self::Title => f.write_str("title"),
-            Self::OuterFrame(side) => write!(f, "outer_frame({side})"),
-            Self::InnerFrame(side) => write!(f, "inner_frame({side})"),
-            Self::Dimension(rule) => write!(f, "dimension({rule})"),
-            Self::Category(rule) => write!(f, "category({rule})"),
-            Self::DataLeft => f.write_str("data(left)"),
-            Self::DataTop => f.write_str("data(top)"),
-        }
-    }
-}
-
-/// Serializes a border as a string, using the same text as its `Display`
-/// implementation.
-impl Serialize for Border {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        let text = self.to_small_string::<32>();
-        serializer.serialize_str(&text)
-    }
-}
-
-/// The borders on a box.
-///
-/// Used to pick a side of the outer or inner frame in [Border].
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub enum BoxBorder {
-    /// Left side.
-    Left,
-    /// Top side.
-    Top,
-    /// Right side.
-    Right,
-    /// Bottom side.
-    Bottom,
-}
-
-impl BoxBorder {
-    /// Returns the lowercase name of this side, which is also what `Display`
-    /// prints.
-    fn as_str(&self) -> &'static str {
-        match *self {
-            Self::Left => "left",
-            Self::Top => "top",
-            Self::Right => "right",
-            Self::Bottom => "bottom",
-        }
-    }
-}
-
-/// Formats a box side as its lowercase name.
-impl Display for BoxBorder {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let name = self.as_str();
-        f.write_str(name)
-    }
-}
-
-/// Borders between rows and columns.
-///
-/// Identifies a border by the heading region it is in and the direction in
-/// which it runs.  Its `Display` form is `REGION:AXIS`, e.g. `columns:x`.
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub struct RowColBorder(
-    /// Row or column headings.
-    pub HeadingRegion,
-    /// Horizontal ([Axis2::X]) or vertical ([Axis2::Y]) borders.
-    pub Axis2,
-);
-
-/// Formats the border as `REGION:AXIS`.
-impl Display for RowColBorder {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let Self(region, axis) = self;
-        write!(f, "{region}:{axis}")
-    }
-}
-
-/// Sizing for rows or columns of a rendered table.
-///
-/// The comments below talk about columns and their widths but they apply
-/// equally to rows and their heights.  A [PivotTable] optionally holds one
-/// `Sizing` for each [Axis2].
-#[derive(Default, Clone, Debug, Serialize)]
-pub struct Sizing {
-    /// Specific column widths, in 1/96" units.
-    widths: Vec<i32>,
-
-    /// Specific page breaks: 0-based columns after which a page break must
-    /// occur, e.g. a value of 1 requests a break after the second column.
-    breaks: Vec<usize>,
-
-    /// Keeps: columns to keep together on a page if possible.  Each element
-    /// is a range of column indexes.
-    keeps: Vec<Range<usize>>,
-}
-
-/// An axis of a pivot table.
-///
-/// [Axis3::X] and [Axis3::Y] correspond to the two [Axis2] axes.  The
-/// dimensions on [Axis3::Z] are the ones indexed by
-/// `PivotTable::current_layer`.
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Sequence, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub enum Axis3 {
-    /// Counterpart of [Axis2::X].
-    X,
-    /// Counterpart of [Axis2::Y].
-    Y,
-    /// Layer axis; it has no [Axis2] counterpart.
-    Z,
-}
-
-impl Axis3 {
-    /// Returns the axis that this one becomes when X and Y are exchanged, or
-    /// `None` for [Axis3::Z], which has no transposed counterpart.
-    fn transpose(&self) -> Option<Self> {
-        match *self {
-            Self::X => Some(Self::Y),
-            Self::Y => Some(Self::X),
-            Self::Z => None,
-        }
-    }
-}
-
-/// Converts a 2-d axis into the 3-d axis with the same name.
-impl From<Axis2> for Axis3 {
-    fn from(axis: Axis2) -> Self {
-        match axis {
-            Axis2::X => Axis3::X,
-            Axis2::Y => Axis3::Y,
-        }
-    }
-}
-
-/// An axis within a pivot table.
-#[derive(Clone, Debug, Default, Serialize)]
-pub struct Axis {
-    /// Indexes into `PivotTable::dimensions` of the dimensions placed on this
-    /// axis.
-    ///
-    /// `dimensions[0]` is the innermost dimension.
-    pub dimensions: Vec<usize>,
-}
-
-/// Iterator over every combination of category indexes for the dimensions on
-/// one axis of a pivot table.
-///
-/// Each item holds one index per dimension.  The first index varies fastest.
-pub struct AxisIterator {
-    /// The combination that the next call to `next` will yield.
-    indexes: SmallVec<[usize; 4]>,
-    /// Number of categories in each dimension, parallel to `indexes`.
-    lengths: SmallVec<[usize; 4]>,
-    /// True once every combination has been yielded (or if there are none).
-    done: bool,
-}
-
-impl FusedIterator for AxisIterator {}
-impl Iterator for AxisIterator {
-    type Item = SmallVec<[usize; 4]>;
-
-    /// Yields the current combination of indexes, then advances to the next
-    /// one like an odometer whose first digit turns fastest.
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.done {
-            return None;
-        }
-
-        let current = self.indexes.clone();
-        let limits = self.lengths.iter().copied();
-        for (slot, limit) in self.indexes.iter_mut().zip(limits) {
-            *slot += 1;
-            if *slot < limit {
-                return Some(current);
-            }
-            // This position wrapped around, so carry into the next one.
-            *slot = 0;
-        }
-
-        // Every position wrapped: `current` was the last combination.
-        self.done = true;
-        Some(current)
-    }
-}
-
-impl PivotTable {
-    /// Returns this table with its look replaced by `look`.
-    pub fn with_look(self, look: Arc<Look>) -> Self {
-        Self { look, ..self }
-    }
-
-    /// Inserts `number` as the cell value at `data_indexes`, choosing the
-    /// numeric format from `class`.
-    ///
-    /// Only [Class::Other] values honor the table's "small" threshold.
-    pub fn insert_number(&mut self, data_indexes: &[usize], number: Option<f64>, class: Class) {
-        let honor_small = class == Class::Other;
-        let format = match class {
-            Class::Other => Settings::global().default_format,
-            Class::Integer => Format::F40,
-            Class::Correlations | Class::Significance => Format::F40_3,
-            Class::Percent => Format::PCT40_1,
-            Class::Residual => Format::F40_2,
-            Class::Count => Format::F40, // XXX
-        };
-        let number_value = NumberValue {
-            show: None,
-            format,
-            honor_small,
-            value: number,
-            variable: None,
-            value_label: None,
-        };
-        self.insert(data_indexes, Value::new(ValueInner::Number(number_value)));
-    }
-
-    /// Returns this table with `footnotes` as its footnotes.
-    ///
-    /// In debug builds, asserts that the table has no footnotes yet.
-    pub fn with_footnotes(self, footnotes: Footnotes) -> Self {
-        debug_assert!(self.footnotes.is_empty());
-        Self { footnotes, ..self }
-    }
-
-    /// Returns an iterator over all the combinations of category indexes for
-    /// the dimensions on `axis`.
-    fn axis_values(&self, axis: Axis3) -> AxisIterator {
-        let n_dimensions = self.axes[axis].dimensions.len();
-        let is_empty = self.axis_extent(axis) == 0;
-        AxisIterator {
-            indexes: repeat_n(0, n_dimensions).collect(),
-            lengths: self.axis_dimensions(axis).map(|d| d.len()).collect(),
-            done: is_empty,
-        }
-    }
-
-    /// Returns the product of the numbers of categories in the dimensions on
-    /// `axis` (1 if the axis has no dimensions).
-    fn axis_extent(&self, axis: Axis3) -> usize {
-        let lengths = self.axis_dimensions(axis).map(|d| d.len());
-        lengths.product()
-    }
-}
-
-/// Dimensions.
-///
-/// A [Dimension] identifies the categories associated with a single dimension
-/// within a multidimensional pivot table.
-///
-/// A dimension contains a collection of categories, which are the leaves in a
-/// tree of groups.
-///
-/// (A dimension or a group can contain zero categories, but this is unusual.
-/// If a dimension contains no categories, then its table cannot contain any
-/// data.)
-#[derive(Clone, Debug, Serialize)]
-pub struct Dimension {
-    /// Hierarchy of categories within the dimension.  The groups and categories
-    /// are sorted in the order that should be used for display.  This might be
-    /// different from the original order produced for output if the user
-    /// adjusted it.
-    ///
-    /// The root must always be a group, although it is allowed to have no
-    /// subcategories.
-    pub root: Group,
-
-    /// Ordering of leaves for presentation.
-    ///
-    /// This is a permutation of `0..n` where `n` is the number of leaves.  It
-    /// maps from an index in presentation order to an index in data order.
-    pub presentation_order: Vec<usize>,
-
-    /// Whether all of the labels in this dimension are hidden.  Defaults to
-    /// false; see [Dimension::with_all_labels_hidden].
-    pub hide_all_labels: bool,
-}
-
-/// A list of borrowed groups, as collected along the way to a leaf.
-pub type GroupVec<'a> = SmallVec<[&'a Group; 4]>;
-
-/// The location of a leaf within a dimension's category tree.
-pub struct Path<'a> {
-    /// The groups that enclose the leaf, outermost first.
-    groups: GroupVec<'a>,
-    /// The leaf itself.
-    leaf: &'a Leaf,
-}
-
-impl Dimension {
-    /// Creates a dimension whose categories are the leaves under `root`,
-    /// presented in data order, with labels shown.
-    pub fn new(root: Group) -> Self {
-        let n_leaves = root.len();
-        Self {
-            root,
-            presentation_order: (0..n_leaves).collect(),
-            hide_all_labels: false,
-        }
-    }
-
-    /// Returns true if this dimension has no (leaf) categories.
-    pub fn is_empty(&self) -> bool {
-        self.root.is_empty()
-    }
-
-    /// Returns the number of (leaf) categories in this dimension.
-    pub fn len(&self) -> usize {
-        self.root.len()
-    }
-
-    /// Returns the leaf with the given 0-based `index`, counting leaves in
-    /// tree order, or `None` if `index` is out of range.
-    pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> {
-        self.root.nth_leaf(index)
-    }
-
-    /// Returns the path from the root group to the leaf with the given 0-based
-    /// `index`, or `None` if `index` is out of range.
-    pub fn leaf_path(&self, index: usize) -> Option<Path<'_>> {
-        let enclosing = SmallVec::new();
-        self.root.leaf_path(index, enclosing)
-    }
-
-    /// Returns this dimension with `hide_all_labels` turned on.
-    pub fn with_all_labels_hidden(mut self) -> Self {
-        self.hide_all_labels = true;
-        self
-    }
-}
-
-/// A named group of categories within a [Dimension].
-///
-/// A group is an interior node in the dimension's category tree.
-#[derive(Clone, Debug, Serialize)]
-pub struct Group {
-    /// Total number of leaves beneath this group, at any depth.  Maintained
-    /// by [Group::push] and not serialized.
-    #[serde(skip)]
-    len: usize,
-
-    /// The group's name.
-    pub name: Box<Value>,
-
-    /// The child categories.
-    ///
-    /// A group usually has multiple children, but it is allowed to have
-    /// only one or even (pathologically) none.
-    pub children: Vec<Category>,
-
-    /// Whether to show the group's label.
-    pub show_label: bool,
-}
-
-impl Group {
-    /// Creates an empty group named `name`, with its label hidden.
-    pub fn new(name: impl Into<Value>) -> Self {
-        Self::with_capacity(name, 0)
-    }
-
-    /// Like [Group::new], but reserves space for `capacity` direct children.
-    pub fn with_capacity(name: impl Into<Value>, capacity: usize) -> Self {
-        let name = Box::new(name.into());
-        Self {
-            len: 0,
-            name,
-            children: Vec::with_capacity(capacity),
-            show_label: false,
-        }
-    }
-
-    /// Appends `child` to this group.
-    ///
-    /// A child that is itself a group has its label turned on.
-    pub fn push(&mut self, child: impl Into<Category>) {
-        let child = match child.into() {
-            Category::Group(subgroup) => Category::Group(subgroup.with_show_label(true)),
-            leaf @ Category::Leaf(_) => leaf,
-        };
-        self.len += child.len();
-        self.children.push(child);
-    }
-
-    /// Returns this group with `child` appended, as with [Group::push].
-    pub fn with(mut self, child: impl Into<Category>) -> Self {
-        self.push(child);
-        self
-    }
-
-    /// Returns this group with all of `children` appended, in order.
-    pub fn with_multiple<C>(mut self, children: impl IntoIterator<Item = C>) -> Self
-    where
-        C: Into<Category>,
-    {
-        self.extend(children);
-        self
-    }
-
-    /// Returns this group with its label shown.
-    pub fn with_label_shown(self) -> Self {
-        Self {
-            show_label: true,
-            ..self
-        }
-    }
-
-    /// Returns this group with its label shown or hidden according to
-    /// `show_label`.
-    pub fn with_show_label(self, show_label: bool) -> Self {
-        Self { show_label, ..self }
-    }
-
-    /// Returns the leaf with the given 0-based `index` among the leaves
-    /// beneath this group, or `None` if `index` is out of range.
-    pub fn nth_leaf(&self, mut index: usize) -> Option<&Leaf> {
-        for child in &self.children {
-            match index.checked_sub(child.len()) {
-                // The leaf lies beyond this child: skip over its leaves.
-                Some(remaining) => index = remaining,
-                // The leaf lies within this child.
-                None => return child.nth_leaf(index),
-            }
-        }
-        None
-    }
-
-    /// Returns the path to the leaf with the given 0-based `index` among the
-    /// leaves beneath this group, or `None` if `index` is out of range.
-    ///
-    /// `groups` holds the groups already traversed; this group is appended to
-    /// it before descending.
-    pub fn leaf_path<'a>(&'a self, mut index: usize, mut groups: GroupVec<'a>) -> Option<Path<'a>> {
-        for child in &self.children {
-            let Some(remaining) = index.checked_sub(child.len()) else {
-                groups.push(self);
-                return child.leaf_path(index, groups);
-            };
-            index = remaining;
-        }
-        None
-    }
-
-    /// Returns the number of leaves beneath this group.
-    pub fn len(&self) -> usize {
-        self.len
-    }
-
-    /// Returns true if there are no leaves beneath this group.
-    pub fn is_empty(&self) -> bool {
-        self.len == 0
-    }
-
-    /// Returns the group's name.
-    pub fn name(&self) -> &Value {
-        self.name.as_ref()
-    }
-}
-
-/// Appends each item to the group as with [Group::push].
-impl<C> Extend<C> for Group
-where
-    C: Into<Category>,
-{
-    fn extend<T: IntoIterator<Item = C>>(&mut self, children: T) {
-        let iter = children.into_iter();
-        // Reserve for at least as many children as the iterator promises.
-        let (lower_bound, _) = iter.size_hint();
-        self.children.reserve(lower_bound);
-        iter.for_each(|child| self.push(child));
-    }
-}
-
-/// An ordered collection of shared footnotes.
-///
-/// [Footnotes::push] assigns each footnote its position in the collection as
-/// its index.
-#[derive(Clone, Debug, Default, Serialize)]
-pub struct Footnotes(pub Vec<Arc<Footnote>>);
-
-impl Footnotes {
-    /// Creates an empty collection of footnotes.
-    pub fn new() -> Self {
-        Self(Vec::new())
-    }
-
-    /// Appends `footnote`, after setting its index to its position in the
-    /// collection, and returns a shared handle to it.
-    pub fn push(&mut self, footnote: Footnote) -> Arc<Footnote> {
-        let index = self.0.len();
-        let shared = Arc::new(footnote.with_index(index));
-        self.0.push(Arc::clone(&shared));
-        shared
-    }
-
-    /// Returns true if there are no footnotes.
-    pub fn is_empty(&self) -> bool {
-        let Self(footnotes) = self;
-        footnotes.is_empty()
-    }
-}
-
-/// A leaf category within a [Dimension].
-///
-/// It serializes as just its name.
-#[derive(Clone, Debug)]
-pub struct Leaf {
-    /// The category's name.
-    name: Box<Value>,
-}
-
-impl Leaf {
-    /// Creates a leaf category named `name`.
-    pub fn new(name: Value) -> Self {
-        let name = Box::new(name);
-        Self { name }
-    }
-
-    /// Returns the leaf's name.
-    pub fn name(&self) -> &Value {
-        self.name.as_ref()
-    }
-}
-
-/// Serializes a leaf as its name alone, without a wrapping structure.
-impl Serialize for Leaf {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        let Self { name } = self;
-        name.serialize(serializer)
-    }
-}
-
-/// Pivot result classes.
-///
-/// These are used to mark [Leaf] categories as having particular types of data,
-/// to set their numeric formats.
-///
-/// `PivotTable::insert_number` maps each class to a format as noted on each
-/// variant below.
-#[derive(Clone, Debug, PartialEq, Eq)]
-pub enum Class {
-    /// Anything else: uses the default format from the global settings.  This
-    /// is the only class that honors the "small" threshold.
-    Other,
-    /// Integers: `F40`.
-    Integer,
-    /// Correlations: `F40_3`.
-    Correlations,
-    /// Significance levels: `F40_3`.
-    Significance,
-    /// Percentages: `PCT40_1`.
-    Percent,
-    /// Residuals: `F40_2`.
-    Residual,
-    /// Counts: currently `F40`.
-    Count,
-}
-
-/// A category within a [Dimension]: either a leaf or a group of further
-/// categories.
-#[derive(Clone, Debug, Serialize)]
-pub enum Category {
-    /// A group, which may contain leaves and other groups.
-    Group(Group),
-    /// A single leaf category.
-    Leaf(Leaf),
-}
-
-impl Category {
-    /// Returns the name of the group or leaf.
-    pub fn name(&self) -> &Value {
-        match self {
-            Self::Leaf(leaf) => leaf.name(),
-            Self::Group(group) => group.name(),
-        }
-    }
-
-    /// Returns true if this category contains no leaves, which can only be
-    /// the case for a group.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Returns the number of leaves in this category: 1 for a leaf, or the
-    /// number of leaves beneath a group.
-    pub fn len(&self) -> usize {
-        match self {
-            Self::Leaf(_) => 1,
-            Self::Group(group) => group.len(),
-        }
-    }
-
-    /// Returns the leaf with the given 0-based `index` within this category,
-    /// or `None` if `index` is out of range.
-    pub fn nth_leaf(&self, index: usize) -> Option<&Leaf> {
-        match self {
-            Self::Leaf(leaf) => (index == 0).then_some(leaf),
-            Self::Group(group) => group.nth_leaf(index),
-        }
-    }
-
-    /// Returns the path to the leaf with the given 0-based `index` within
-    /// this category, or `None` if `index` is out of range.
-    ///
-    /// `groups` holds the groups traversed so far.
-    pub fn leaf_path<'a>(&'a self, index: usize, groups: GroupVec<'a>) -> Option<Path<'a>> {
-        match self {
-            Self::Leaf(leaf) => (index == 0).then(|| Path { groups, leaf }),
-            Self::Group(group) => group.leaf_path(index, groups),
-        }
-    }
-
-    /// Returns whether this category's label should be shown.  A leaf's label
-    /// is always shown.
-    pub fn show_label(&self) -> bool {
-        match self {
-            Self::Leaf(_) => true,
-            Self::Group(group) => group.show_label,
-        }
-    }
-}
-
-/// Wraps a group as a category.
-impl From<Group> for Category {
-    fn from(group: Group) -> Self {
-        Category::Group(group)
-    }
-}
-
-/// Wraps a leaf as a category.
-impl From<Leaf> for Category {
-    fn from(leaf: Leaf) -> Self {
-        Category::Leaf(leaf)
-    }
-}
-
-/// Creates a leaf category named `name`.
-impl From<Value> for Category {
-    fn from(name: Value) -> Self {
-        Self::Leaf(Leaf::new(name))
-    }
-}
-
-/// Creates a leaf category whose name is the value built from `variable`.
-impl From<&Variable> for Category {
-    fn from(variable: &Variable) -> Self {
-        let name = Value::new_variable(variable);
-        Self::from(name)
-    }
-}
-
-/// Creates a leaf category whose name is the text `name`.
-impl From<&str> for Category {
-    fn from(name: &str) -> Self {
-        let leaf = Leaf::new(Value::new_text(name));
-        Self::Leaf(leaf)
-    }
-}
-
-/// Creates a leaf category whose name is the text `name`.
-impl From<String> for Category {
-    fn from(name: String) -> Self {
-        let leaf = Leaf::new(Value::new_text(name));
-        Self::Leaf(leaf)
-    }
-}
-
-/// Creates a leaf category whose name is the text `name`.
-impl From<&String> for Category {
-    fn from(name: &String) -> Self {
-        let leaf = Leaf::new(Value::new_text(name));
-        Self::Leaf(leaf)
-    }
-}
-
-/// Styling for a pivot table.
-///
-/// The division between this and the style information in [PivotTable] seems
-/// fairly arbitrary.  The ultimate reason for the division is simply because
-/// that's how SPSS documentation and file formats do it.
-#[derive(Clone, Debug, Serialize)]
-pub struct Look {
-    /// Name of the look, if it has one.
-    pub name: Option<String>,
-
-    /// Whether to hide rows or columns whose cells are all empty.
-    pub hide_empty: bool,
-
-    /// Where to put the labels for row groups.
-    pub row_label_position: LabelPosition,
-
-    /// Ranges of column widths in the two heading regions, in 1/96" units.
-    pub heading_widths: EnumMap<HeadingRegion, RangeInclusive<usize>>,
-
-    /// Kind of markers to use for footnotes.
-    pub footnote_marker_type: FootnoteMarkerType,
-
-    /// Where to put the footnote markers.
-    pub footnote_marker_position: FootnoteMarkerPosition,
-
-    /// Styles for areas of the pivot table.
-    pub areas: EnumMap<Area, AreaStyle>,
-
-    /// Styles for borders in the pivot table.
-    pub borders: EnumMap<Border, BorderStyle>,
-
-    /// Presumably whether to print every layer rather than just the current
-    /// one; `PivotTable::with_layer` turns it off and
-    /// `PivotTable::with_all_layers` turns it on.
-    pub print_all_layers: bool,
-
-    /// NOTE(review): presumably whether each layer starts a new page when
-    /// printing -- confirm against the renderer.
-    pub paginate_layers: bool,
-
-    /// NOTE(review): presumably whether to shrink the table to fit the page
-    /// along each axis -- confirm against the renderer.
-    pub shrink_to_fit: EnumMap<Axis2, bool>,
-
-    /// NOTE(review): presumably whether to show continuation text at the top
-    /// of a table continued from a previous page -- confirm.
-    pub top_continuation: bool,
-
-    /// NOTE(review): presumably whether to show continuation text at the
-    /// bottom of a table continued on a following page -- confirm.
-    pub bottom_continuation: bool,
-
-    /// Continuation text, if any.
-    pub continuation: Option<String>,
-
-    /// NOTE(review): presumably the minimum number of lines to keep together
-    /// across a page break -- confirm against the renderer.
-    pub n_orphan_lines: usize,
-}
-
-impl Look {
-    /// Returns this look with `hide_empty` set to `omit_empty`.
-    pub fn with_omit_empty(self, omit_empty: bool) -> Self {
-        Self {
-            hide_empty: omit_empty,
-            ..self
-        }
-    }
-
-    /// Returns this look with the given position for row labels.
-    pub fn with_row_label_position(self, row_label_position: LabelPosition) -> Self {
-        Self {
-            row_label_position,
-            ..self
-        }
-    }
-
-    /// Returns this look with all of its border styles replaced by `borders`.
-    pub fn with_borders(self, borders: EnumMap<Border, BorderStyle>) -> Self {
-        Self { borders, ..self }
-    }
-}
-
-/// The built-in look: unnamed, hiding empty rows and columns, with the default
-/// style for every area and border.
-impl Default for Look {
-    fn default() -> Self {
-        Self {
-            name: None,
-            hide_empty: true,
-            row_label_position: LabelPosition::default(),
-            // Heading widths are in 1/96" units.
-            heading_widths: EnumMap::from_fn(|region| match region {
-                HeadingRegion::Rows => 36..=72,
-                HeadingRegion::Columns => 36..=120,
-            }),
-            footnote_marker_type: FootnoteMarkerType::default(),
-            footnote_marker_position: FootnoteMarkerPosition::default(),
-            areas: EnumMap::from_fn(Area::default_area_style),
-            borders: EnumMap::from_fn(Border::default_border_style),
-            print_all_layers: false,
-            paginate_layers: false,
-            shrink_to_fit: EnumMap::from_fn(|_| false),
-            top_continuation: false,
-            bottom_continuation: false,
-            continuation: None,
-            n_orphan_lines: 0,
-        }
-    }
-}
-
-/// An error encountered while reading a [Look].
-///
-/// Every variant wraps an underlying error and reports it transparently.
-#[derive(ThisError, Debug)]
-pub enum ParseLookError {
-    /// Error deserializing a look in XML format.
-    #[error(transparent)]
-    XmlError(#[from] DeError),
-
-    /// Data that should be in XML format is not valid UTF-8.
-    #[error(transparent)]
-    Utf8Error(#[from] Utf8Error),
-
-    /// Error parsing a look in binary format.
-    #[error(transparent)]
-    BinError(#[from] BinError),
-
-    /// I/O error reading the data.
-    #[error(transparent)]
-    IoError(#[from] std::io::Error),
-}
-
-impl Look {
-    /// Returns a handle to a single process-wide instance of the default
-    /// look, which is created on first use.
-    pub fn shared_default() -> Arc<Look> {
-        static LOOK: OnceLock<Arc<Look>> = OnceLock::new();
-        let shared = LOOK.get_or_init(|| Arc::new(Look::default()));
-        Arc::clone(shared)
-    }
-
-    /// Parses a look from `xml`, which must be in the XML format.
-    pub fn from_xml(xml: &str) -> Result<Self, ParseLookError> {
-        let properties: TableProperties = from_str(xml)?;
-        Ok(properties.into())
-    }
-
-    /// Parses a look from `tlo`, which must be in the binary format.
-    pub fn from_binary(tlo: &[u8]) -> Result<Self, ParseLookError> {
-        Ok(parse_tlo(tlo)?)
-    }
-
-    /// Parses a look from `data`, treating it as the binary format if it
-    /// starts with the bytes `ff ff 00 00` and as UTF-8 encoded XML otherwise.
-    pub fn from_data(data: &[u8]) -> Result<Self, ParseLookError> {
-        const BINARY_MAGIC: &[u8] = b"\xff\xff\0\0";
-        if data.starts_with(BINARY_MAGIC) {
-            return Self::from_binary(data);
-        }
-        let xml = from_utf8(data)?;
-        Self::from_xml(xml)
-    }
-
-    /// Reads all of `reader` and parses the result with [Look::from_data].
-    pub fn from_reader<R>(mut reader: R) -> Result<Self, ParseLookError>
-    where
-        R: Read,
-    {
-        let mut contents = Vec::new();
-        reader.read_to_end(&mut contents)?;
-        Self::from_data(&contents)
-    }
-}
-
-/// Position for group labels.
-#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
-pub enum LabelPosition {
-    /// Hierarchically enclosing the categories.
-    ///
-    /// For column labels, group labels appear above the categories.  For row
-    /// labels, group labels appear to the left of the categories.
-    ///
-    /// ```text
-    /// ┌────┬──────────────┐ ┌─────────┬──────────┐
-    /// │ │ nested │ │ │ columns │
-    /// │ ├────┬────┬────┤ ├──────┬──┼──────────┤
-    /// │ │ a1 │ a2 │ a3 │ │ │a1│...data...│
-    /// ├────┼────┼────┼────┤ │nested│a2│...data...│
-    /// │ │data│data│data│ │ │a3│...data...│
-    /// │ │ . │ . │ . │ └──────┴──┴──────────┘
-    /// │rows│ . │ . │ . │
-    /// │ │ . │ . │ . │
-    /// └────┴────┴────┴────┘
-    /// ```
-    #[serde(rename = "nested")]
-    Nested,
-
-    /// In the corner (row labels only).  This is the default.
-    ///
-    /// ```text
-    /// ┌──────┬──────────┐
-    /// │corner│ columns │
-    /// ├──────┼──────────┤
-    /// │ a1│...data...│
-    /// │ a2│...data...│
-    /// │ a3│...data...│
-    /// └──────┴──────────┘
-    /// ```
-    #[default]
-    #[serde(rename = "inCorner")]
-    Corner,
-}
-
-/// The heading region of a rendered pivot table:
-///
-/// ```text
-/// ┌──────────────────┬─────────────────────────────────────────────────┐
-/// │ │ column headings │
-/// │ ├─────────────────────────────────────────────────┤
-/// │ corner │ │
-/// │ and │ │
-/// │ row headings │ data │
-/// │ │ │
-/// │ │ │
-/// └──────────────────┴─────────────────────────────────────────────────┘
-/// ```
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub enum HeadingRegion {
-    /// The row headings.
-    Rows,
-    /// The column headings.
-    Columns,
-}
-
-impl HeadingRegion {
-    /// Returns the lowercase name of this region, which is also what
-    /// `Display` prints.
-    pub fn as_str(&self) -> &'static str {
-        match *self {
-            Self::Rows => "rows",
-            Self::Columns => "columns",
-        }
-    }
-}
-
-/// Formats a heading region as its lowercase name.
-impl Display for HeadingRegion {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let name = self.as_str();
-        write!(f, "{name}")
-    }
-}
-
-/// Maps [Axis2::X] to the column headings and [Axis2::Y] to the row headings.
-impl From<Axis2> for HeadingRegion {
-    fn from(axis: Axis2) -> Self {
-        match axis {
-            Axis2::Y => Self::Rows,
-            Axis2::X => Self::Columns,
-        }
-    }
-}
-
-/// The style for one area of a pivot table: how its cells are laid out and
-/// which font they use.
-#[derive(Clone, Debug, Serialize)]
-pub struct AreaStyle {
-    /// Alignment and margins.
-    pub cell_style: CellStyle,
-    /// Typeface, size, and colors.
-    pub font_style: FontStyle,
-}
-
-/// Alignment and margins for a cell.
-#[derive(Clone, Debug, Serialize)]
-pub struct CellStyle {
-    /// `None` means "mixed" alignment: align strings to the left, numbers to
-    /// the right.
-    pub horz_align: Option<HorzAlign>,
-
-    /// Vertical alignment.
-    pub vert_align: VertAlign,
-
-    /// Margins in 1/96" units.
-    ///
-    /// `margins[Axis2::X][0]` is the left margin.
-    /// `margins[Axis2::X][1]` is the right margin.
-    /// `margins[Axis2::Y][0]` is the top margin.
-    /// `margins[Axis2::Y][1]` is the bottom margin.
-    pub margins: EnumMap<Axis2, [i32; 2]>,
-}
-
-/// Horizontal alignment of the content of a cell.
-#[derive(Copy, Clone, Debug, PartialEq, Deserialize, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub enum HorzAlign {
-    /// Right aligned.
-    Right,
-
-    /// Left aligned.
-    Left,
-
-    /// Centered.
-    Center,
-
-    /// Align the decimal point at the specified position.
-    Decimal {
-        /// Decimal offset from the right side of the cell, in 1/96" units.
-        offset: f64,
-
-        /// Decimal character.
-        decimal: Decimal,
-    },
-}
-
-impl HorzAlign {
-    /// Resolves "mixed" alignment for a value of the given type: strings are
-    /// left-aligned and numbers are right-aligned.
-    pub fn for_mixed(var_type: VarType) -> Self {
-        match var_type {
-            VarType::String => HorzAlign::Left,
-            VarType::Numeric => HorzAlign::Right,
-        }
-    }
-}
-
-/// Vertical alignment of the content of a cell.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub enum VertAlign {
-    /// Top alignment.
-    Top,
-
-    /// Centered.
-    Middle,
-
-    /// Bottom alignment.
-    Bottom,
-}
-
-/// Typeface, size, and colors for text.
-#[derive(Clone, Debug, Serialize)]
-pub struct FontStyle {
-    /// Whether the text is bold.
-    pub bold: bool,
-    /// Whether the text is italic.
-    pub italic: bool,
-    /// Whether the text is underlined.
-    pub underline: bool,
-    /// NOTE(review): presumably whether the text is to be interpreted as
-    /// markup rather than plain text -- confirm against the renderers.
-    pub markup: bool,
-    /// Name of the typeface.
-    pub font: String,
-
-    /// `fg[0]` is the usual foreground color.
-    ///
-    /// `fg[1]` is used only in [Area::Data] for odd-numbered rows.
-    pub fg: [Color; 2],
-
-    /// `bg[0]` is the usual background color.
-    ///
-    /// `bg[1]` is used only in [Area::Data] for odd-numbered rows.
-    pub bg: [Color; 2],
-
-    /// In 1/72" units.
-    pub size: i32,
-}
-
-/// A color with 8-bit red, green, blue, and alpha components.
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub struct Color {
-    /// Opacity, from 0 (as in [Color::TRANSPARENT]) to 255 (the value that
-    /// [Color::new] assigns).
-    pub alpha: u8,
-    /// Red component.
-    pub r: u8,
-    /// Green component.
-    pub g: u8,
-    /// Blue component.
-    pub b: u8,
-}
-
-impl Color {
-    /// Black, with alpha 255.
-    pub const BLACK: Color = Color::new(0, 0, 0);
-    /// White, with alpha 255.
-    pub const WHITE: Color = Color::new(255, 255, 255);
-    /// Red, with alpha 255.
-    pub const RED: Color = Color::new(255, 0, 0);
-    /// Blue, with alpha 255.
-    pub const BLUE: Color = Color::new(0, 0, 255);
-    /// Black with alpha 0.
-    pub const TRANSPARENT: Color = Color::new(0, 0, 0).with_alpha(0);
-
-    /// Creates a color from its red, green, and blue components, with alpha
-    /// set to 255.
-    pub const fn new(r: u8, g: u8, b: u8) -> Self {
-        Self {
-            r,
-            g,
-            b,
-            alpha: 255,
-        }
-    }
-
-    /// Returns this color with its alpha component replaced by `alpha`.
-    pub const fn with_alpha(self, alpha: u8) -> Self {
-        Self {
-            alpha,
-            r: self.r,
-            g: self.g,
-            b: self.b,
-        }
-    }
-
-    /// Returns this color with its alpha component set to 255.
-    pub const fn without_alpha(self) -> Self {
-        self.with_alpha(255)
-    }
-
-    /// Returns a wrapper that formats this color in CSS syntax.
-    pub fn display_css(&self) -> DisplayCss {
-        let color = *self;
-        DisplayCss(color)
-    }
-}
-
-/// Formats a color for debugging using its CSS syntax.
-impl Debug for Color {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let css = self.display_css();
-        write!(f, "{css}")
-    }
-}
-
-/// Converts from an [Rgba8], carrying over all four components.
-impl From<Rgba8> for Color {
-    fn from(rgba: Rgba8) -> Self {
-        let Rgba8 { r, g, b, a } = rgba;
-        Self {
-            alpha: a,
-            r,
-            g,
-            b,
-        }
-    }
-}
-
-/// Parses a color from a string.
-///
-/// In addition to whatever the underlying color parser accepts, this accepts
-/// two forms for which that parser reports unknown syntax:
-///
-/// - Six hexadecimal digits without a leading `#`, e.g. `ff8000`.
-///
-/// - The word `transparent`, in any case.
-///
-/// Whitespace surrounding either form is ignored.
-impl FromStr for Color {
-    type Err = ParseColorError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Both fallbacks are matched against the trimmed text, and the
-        // bare-hex retry also parses the trimmed text.  Prefixing `#` to the
-        // untrimmed input would leave whitespace between `#` and the digits,
-        // so that padded input would pass the check and then fail to parse.
-        let trimmed = s.trim();
-        let is_bare_hex =
-            trimmed.len() == 6 && trimmed.bytes().all(|byte| byte.is_ascii_hexdigit());
-        let color: AlphaColor<Srgb> = match s.parse() {
-            Err(ParseColorError::UnknownColorSyntax) if is_bare_hex => {
-                format!("#{trimmed}").parse()
-            }
-            Err(ParseColorError::UnknownColorSyntax)
-                if trimmed.eq_ignore_ascii_case("transparent") =>
-            {
-                Ok(TRANSPARENT)
-            }
-            other => other,
-        }?;
-        Ok(color.to_rgba8().into())
-    }
-}
-
-/// Serializes a color as a string in CSS syntax.
-impl Serialize for Color {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        let css = self.display_css().to_small_string::<32>();
-        serializer.serialize_str(&css)
-    }
-}
-
-/// Deserializes a color from a string, using the same syntax as its `FromStr`
-/// implementation.
-impl<'de> Deserialize<'de> for Color {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct ColorVisitor;
-
-        impl<'de> Visitor<'de> for ColorVisitor {
-            type Value = Color;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                formatter.write_str("\"#rrggbb\" or \"rrggbb\" or web color name")
-            }
-
-            // Implement `visit_str` rather than `visit_borrowed_str`: the
-            // default `visit_borrowed_str` and `visit_string` both forward
-            // here, so this accepts strings however the deserializer hands
-            // them over.  A visitor with only `visit_borrowed_str` rejects
-            // transient and owned strings, which deserializers produce when,
-            // for example, the input contains escapes.
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                v.parse().map_err(E::custom)
-            }
-        }
-
-        deserializer.deserialize_str(ColorVisitor)
-    }
-}
-
-/// Wrapper around a [Color] whose `Display` implementation writes it in CSS
-/// syntax: `#rrggbb` if alpha is 255, otherwise `rgb(r, g, b, a)` where `a` is
-/// alpha as a fraction of 255 with two decimal places.
-pub struct DisplayCss(Color);
-
-impl Display for DisplayCss {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let Color { alpha, r, g, b } = self.0;
-        if alpha == 255 {
-            write!(f, "#{r:02x}{g:02x}{b:02x}")
-        } else {
-            let opacity = f64::from(alpha) / 255.0;
-            write!(f, "rgb({r}, {g}, {b}, {opacity:.2})")
-        }
-    }
-}
-
-/// The style of a border: its stroke and color.
-///
-/// Deserialization reads the fields under the names `@borderStyleType` and
-/// `@color`, whereas serialization writes them as `stroke` and `color`.
-#[derive(Copy, Clone, Debug, Deserialize)]
-pub struct BorderStyle {
-    /// Kind of line.
-    #[serde(rename = "@borderStyleType")]
-    pub stroke: Stroke,
-
-    /// Color of the line.
-    #[serde(rename = "@color")]
-    pub color: Color,
-}
-
-/// Serializes a border style as a struct with fields `stroke` and `color`.
-impl Serialize for BorderStyle {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        let Self { stroke, color } = self;
-        let mut state = serializer.serialize_struct("BorderStyle", 2)?;
-        state.serialize_field("stroke", stroke)?;
-        state.serialize_field("color", color)?;
-        state.end()
-    }
-}
-
-impl BorderStyle {
-    /// Returns a black border style with no stroke.
-    pub const fn none() -> Self {
-        Self {
-            color: Color::BLACK,
-            stroke: Stroke::None,
-        }
-    }
-
-    /// Returns true if this border style has no stroke.
-    pub fn is_none(&self) -> bool {
-        let Self { stroke, .. } = self;
-        stroke.is_none()
-    }
-
-    /// Returns a border style that "combines" the two arguments, that is, a
-    /// reasonable choice for a rule that, for different reasons, should have
-    /// both styles.
-    ///
-    /// The strokes are combined with [Stroke::combine].  The color is always
-    /// taken from `self`.
-    pub fn combine(self, other: BorderStyle) -> Self {
-        let stroke = self.stroke.combine(other.stroke);
-        Self {
-            stroke,
-            color: self.color,
-        }
-    }
-}
-
-/// The kind of line used to draw a border.
-///
-/// The order of the variants is significant: the derived ordering is what
-/// [Stroke::combine] uses to pick between two strokes, with later variants
-/// winning over earlier ones.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Enum, Deserialize, Serialize)]
-#[serde(rename_all = "camelCase")]
-pub enum Stroke {
-    /// No line.
-    None,
-    /// Solid line.
-    Solid,
-    /// Dashed line.
-    Dashed,
-    /// Thick line.
-    Thick,
-    /// Thin line.
-    Thin,
-    /// Double line.
-    Double,
-}
-
-impl Stroke {
-    /// Returns true if this is [Stroke::None].
-    pub fn is_none(&self) -> bool {
-        matches!(self, Self::None)
-    }
-
-    /// Returns a stroke that "combines" the two arguments, that is, a
-    /// reasonable choice for a rule that, for different reasons, should have
-    /// both strokes.
-    ///
-    /// This is whichever of the two comes later in declaration order.
-    pub fn combine(self, other: Stroke) -> Self {
-        Ord::max(self, other)
-    }
-}
-
-/// An axis of a 2-dimensional table.
-///
-/// The `!` operator yields the other axis.
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(rename_all = "snake_case")]
-pub enum Axis2 {
-    /// The horizontal axis.
-    X,
-    /// The vertical axis.
-    Y,
-}
-
-impl Axis2 {
-    /// Returns a map that holds `x` for [Axis2::X] and `y` for [Axis2::Y].
-    pub fn new_enum<T>(x: T, y: T) -> EnumMap<Axis2, T> {
-        let values = [x, y];
-        EnumMap::from_array(values)
-    }
-
-    /// Returns the lowercase name of this axis, which is also what `Display`
-    /// prints.
-    pub fn as_str(&self) -> &'static str {
-        match *self {
-            Self::X => "x",
-            Self::Y => "y",
-        }
-    }
-}
-
-/// Formats an axis as its lowercase name.
-impl Display for Axis2 {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let name = self.as_str();
-        write!(f, "{name}")
-    }
-}
-
-/// `!axis` is the other axis.
-impl Not for Axis2 {
-    type Output = Self;
-
-    fn not(self) -> Self::Output {
-        match self {
-            Axis2::Y => Axis2::X,
-            Axis2::X => Axis2::Y,
-        }
-    }
-}
-
-/// A 2-dimensional `(x,y)` pair.
-///
-/// It can be indexed by [Axis2] to read or modify one coordinate.
-#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)]
-pub struct Coord2(pub EnumMap<Axis2, usize>);
-
-impl Coord2 {
-    /// Creates a coordinate pair from its `x` and `y` values.
-    pub fn new(x: usize, y: usize) -> Self {
-        Self(Axis2::new_enum(x, y))
-    }
-
-    /// Creates a coordinate pair whose value along axis `a` is `az` and whose
-    /// value along the other axis is `bz`.
-    pub fn for_axis((a, az): (Axis2, usize), bz: usize) -> Self {
-        Self::from_fn(|axis| if axis == a { az } else { bz })
-    }
-
-    /// Creates a coordinate pair by calling `f` to obtain the value for each
-    /// axis.
-    pub fn from_fn<F>(f: F) -> Self
-    where
-        F: FnMut(Axis2) -> usize,
-    {
-        let map = EnumMap::from_fn(f);
-        Self(map)
-    }
-
-    /// Returns the X coordinate.
-    pub fn x(&self) -> usize {
-        self.get(Axis2::X)
-    }
-
-    /// Returns the Y coordinate.
-    pub fn y(&self) -> usize {
-        self.get(Axis2::Y)
-    }
-
-    /// Returns the coordinate along `axis`.
-    pub fn get(&self, axis: Axis2) -> usize {
-        let Self(map) = self;
-        map[axis]
-    }
-}
-
-/// Wraps a per-axis map of coordinates.
-impl From<EnumMap<Axis2, usize>> for Coord2 {
-    fn from(map: EnumMap<Axis2, usize>) -> Self {
-        Coord2(map)
-    }
-}
-
-/// `coord[axis]` is the coordinate along `axis`.
-impl Index<Axis2> for Coord2 {
-    type Output = usize;
-
-    fn index(&self, index: Axis2) -> &Self::Output {
-        let Self(map) = self;
-        &map[index]
-    }
-}
-
-/// `coord[axis] = value` sets the coordinate along `axis`.
-impl IndexMut<Axis2> for Coord2 {
-    fn index_mut(&mut self, index: Axis2) -> &mut Self::Output {
-        let Self(map) = self;
-        &mut map[index]
-    }
-}
-
-/// A 2-dimensional rectangle, represented as a half-open range along each
-/// [Axis2].
-///
-/// It can be indexed by [Axis2] to read or modify the range along one axis.
-#[derive(Clone, Debug, Default)]
-pub struct Rect2(pub EnumMap<Axis2, Range<usize>>);
-
-impl Rect2 {
-    /// Creates a rectangle that spans `x_range` horizontally and `y_range`
-    /// vertically.
-    pub fn new(x_range: Range<usize>, y_range: Range<usize>) -> Self {
-        Self(Axis2::new_enum(x_range, y_range))
-    }
-
-    /// Creates the 1-by-1 rectangle that contains just `cell`.
-    pub fn for_cell(cell: Coord2) -> Self {
-        let (x, y) = (cell.x(), cell.y());
-        Self::new(x..x + 1, y..y + 1)
-    }
-
-    /// Creates a rectangle that spans `a_range` along axis `a` and `b_range`
-    /// along the other axis.
-    pub fn for_ranges((a, a_range): (Axis2, Range<usize>), b_range: Range<usize>) -> Self {
-        match a {
-            Axis2::X => Self::new(a_range, b_range),
-            Axis2::Y => Self::new(b_range, a_range),
-        }
-    }
-
-    /// Returns the coordinates at which both ranges start.
-    pub fn top_left(&self) -> Coord2 {
-        Coord2::from_fn(|axis| self[axis].start)
-    }
-
-    /// Creates a rectangle by calling `f` to obtain the range for each axis.
-    pub fn from_fn<F>(f: F) -> Self
-    where
-        F: FnMut(Axis2) -> Range<usize>,
-    {
-        let ranges = EnumMap::from_fn(f);
-        Self(ranges)
-    }
-
-    /// Returns this rectangle moved by `offset` along each axis.
-    pub fn translate(self, offset: Coord2) -> Rect2 {
-        Self::from_fn(|axis| {
-            let shift = offset[axis];
-            let range = &self[axis];
-            range.start + shift..range.end + shift
-        })
-    }
-
-    /// Returns true if the rectangle's range along either axis is empty.
-    pub fn is_empty(&self) -> bool {
-        [Axis2::X, Axis2::Y]
-            .into_iter()
-            .any(|axis| self[axis].is_empty())
-    }
-}
-
-/// Wraps a per-axis map of ranges.
-impl From<EnumMap<Axis2, Range<usize>>> for Rect2 {
-    fn from(ranges: EnumMap<Axis2, Range<usize>>) -> Self {
-        Rect2(ranges)
-    }
-}
-
-/// `rect[axis]` is the range that the rectangle spans along `axis`.
-impl Index<Axis2> for Rect2 {
-    type Output = Range<usize>;
-
-    fn index(&self, index: Axis2) -> &Self::Output {
-        let Self(ranges) = self;
-        &ranges[index]
-    }
-}
-
-/// `rect[axis] = range` sets the range that the rectangle spans along `axis`.
-impl IndexMut<Axis2> for Rect2 {
-    fn index_mut(&mut self, index: Axis2) -> &mut Self::Output {
-        let Self(ranges) = self;
-        &mut ranges[index]
-    }
-}
-
-/// The kind of markers used to refer to footnotes.
-#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
-#[serde(rename_all = "camelCase")]
-pub enum FootnoteMarkerType {
-    /// a, b, c, ...  This is the default.
-    #[default]
-    Alphabetic,
-
-    /// 1, 2, 3, ...
-    Numeric,
-}
-
-/// Where footnote markers are placed relative to the text they annotate.
-#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
-#[serde(rename_all = "camelCase")]
-pub enum FootnoteMarkerPosition {
-    /// Subscripts.  This is the default.
-    #[default]
-    Subscript,
-
-    /// Superscripts.
-    Superscript,
-}
-
-/// Options that control how values are displayed.
-///
-/// Anything that implements [IntoValueOptions] can supply these.
-#[derive(Copy, Clone, Debug)]
-pub struct ValueOptions {
-    /// How to show values, or `None` if unspecified.
-    pub show_values: Option<Show>,
-
-    /// How to show variables, or `None` if unspecified.
-    pub show_variables: Option<Show>,
-
-    /// The "small" threshold for numbers.  The default is 0.0001.
-    pub small: f64,
-
-    /// Kind of markers to use for footnotes.
-    pub footnote_marker_type: FootnoteMarkerType,
-}
-
-/// The default options: nothing specified for showing values or variables, a
-/// "small" threshold of 0.0001, and the default kind of footnote marker.
-impl Default for ValueOptions {
-    fn default() -> Self {
-        Self {
-            show_values: None,
-            show_variables: None,
-            small: 0.0001,
-            footnote_marker_type: FootnoteMarkerType::default(),
-        }
-    }
-}
-
-/// Conversion into [ValueOptions].
-///
-/// This is implemented for `()` (the default options), for `&PivotTable` (the
-/// table's own options), and for `ValueOptions` itself, by value or by
-/// reference.
-pub trait IntoValueOptions {
-    /// Returns the [ValueOptions] that `self` stands for.
-    fn into_value_options(self) -> ValueOptions;
-}
-
-/// `()` stands for the default options.
-impl IntoValueOptions for () {
-    fn into_value_options(self) -> ValueOptions {
-        Default::default()
-    }
-}
-
-/// A pivot table stands for the options returned by its `value_options`
-/// method.
-impl IntoValueOptions for &PivotTable {
-    fn into_value_options(self) -> ValueOptions {
-        PivotTable::value_options(self)
-    }
-}
-
-/// A reference to options stands for a copy of them.
-impl IntoValueOptions for &ValueOptions {
-    fn into_value_options(self) -> ValueOptions {
-        *self
-    }
-}
-
-/// Options stand for themselves.
-impl IntoValueOptions for ValueOptions {
-    fn into_value_options(self) -> ValueOptions {
-        self
-    }
-}
-
-/// A pivot table: a collection of dimensions arranged along three axes,
-/// together with the cell values, titles, footnotes, and styling.
-#[derive(Clone, Debug, Serialize)]
-pub struct PivotTable {
-    /// Styling.  Modify it through `look_mut`, which obtains a mutable
-    /// reference via `Arc::make_mut`.
-    pub look: Arc<Look>,
-
-    pub rotate_inner_column_labels: bool,
-
-    pub rotate_outer_row_labels: bool,
-
-    pub show_grid_lines: bool,
-
-    /// Whether to show the title.  `with_title` turns this on.
-    pub show_title: bool,
-
-    /// Whether to show the caption.  `with_caption` turns this on.
-    pub show_caption: bool,
-
-    pub show_values: Option<Show>,
-
-    pub show_variables: Option<Show>,
-
-    pub weight_format: Format,
-
-    /// Current layer indexes, with `axes[Axis3::Z].dimensions.len()` elements.
-    /// `current_layer[i]` is an offset into
-    /// `axes[Axis3::Z].dimensions[i].data_leaves[]`, except that a dimension
-    /// can have zero leaves, in which case `current_layer[i]` is zero and
-    /// there's no corresponding leaf.
-    pub current_layer: Vec<usize>,
-
-    /// Column and row sizing and page breaks.
-    pub sizing: EnumMap<Axis2, Option<Box<Sizing>>>,
-
-    /// Format settings.
-    pub settings: FormatSettings,
-
-    /// Numeric grouping character (usually `.` or `,`).
-    pub grouping: Option<char>,
-
-    /// The "small" threshold for numbers.  The default is 0.0001.
-    pub small: f64,
-
-    pub command_local: Option<String>,
-    pub command_c: Option<String>,
-    pub language: Option<String>,
-    pub locale: Option<String>,
-    pub dataset: Option<String>,
-    pub datafile: Option<String>,
-    pub date: Option<NaiveDateTime>,
-    /// The table's footnotes.
-    pub footnotes: Footnotes,
-    /// The title, if any.
-    pub title: Option<Box<Value>>,
-    /// The subtype, if any.
-    pub subtype: Option<Box<Value>>,
-    /// The corner text, if any.
-    pub corner_text: Option<Box<Value>>,
-    /// The caption, if any.
-    pub caption: Option<Box<Value>>,
-    pub notes: Option<String>,
-    /// All of the table's dimensions, in the order they were passed to
-    /// `PivotTable::new`.
-    pub dimensions: Vec<Dimension>,
-    /// For each axis, the indexes into `dimensions` of the dimensions on it.
-    pub axes: EnumMap<Axis3, Axis>,
-    /// The cell values, keyed by the index that the private `cell_index`
-    /// helper computes from a cell's per-dimension data indexes.
-    pub cells: HashMap<usize, Value>,
-}
-
-impl PivotTable {
-    /// Returns this table with `title` as its title, and with the title
-    /// shown.
-    pub fn with_title(self, title: impl Into<Value>) -> Self {
-        Self {
-            title: Some(Box::new(title.into())),
-            show_title: true,
-            ..self
-        }
-    }
-
-    /// Returns this table with `caption` as its caption, and with the caption
-    /// shown.
-    pub fn with_caption(self, caption: impl Into<Value>) -> Self {
-        Self {
-            caption: Some(Box::new(caption.into())),
-            show_caption: true,
-            ..self
-        }
-    }
-
-    /// Returns this table with `corner_text` as its corner text.
-    pub fn with_corner_text(self, corner_text: impl Into<Value>) -> Self {
-        Self {
-            corner_text: Some(Box::new(corner_text.into())),
-            ..self
-        }
-    }
-
-    /// Returns this table with `subtype` as its subtype.
-    pub fn with_subtype(mut self, subtype: impl Into<Value>) -> Self {
-        self.subtype = Some(Box::new(subtype.into()));
-        self
-    }
-
-    /// Returns this table with its title shown or hidden.
-    pub fn with_show_title(self, show_title: bool) -> Self {
-        Self { show_title, ..self }
-    }
-
-    /// Returns this table with its caption shown or hidden.
-    pub fn with_show_caption(self, show_caption: bool) -> Self {
-        Self {
-            show_caption,
-            ..self
-        }
-    }
-
-    /// Returns this table with `layer` as its current layer and with
-    /// `print_all_layers` turned off in its look.
-    ///
-    /// In debug builds, asserts that `layer` has as many elements as the
-    /// existing current layer.
-    pub fn with_layer(mut self, layer: &[usize]) -> Self {
-        debug_assert_eq!(layer.len(), self.current_layer.len());
-        // Check first so that the look is only touched when it must change.
-        if self.look.print_all_layers {
-            self.look_mut().print_all_layers = false;
-        }
-        self.current_layer.clear();
-        self.current_layer.extend(layer.iter().copied());
-        self
-    }
-
-    /// Returns this table with `print_all_layers` turned on in its look.
-    pub fn with_all_layers(mut self) -> Self {
-        // Check first so that the look is only touched when it must change.
-        let already_set = self.look.print_all_layers;
-        if !already_set {
-            self.look_mut().print_all_layers = true;
-        }
-        self
-    }
-
-    /// Returns a mutable reference to this table's look, obtained with
-    /// `Arc::make_mut`.
-    pub fn look_mut(&mut self) -> &mut Look {
-        let look = &mut self.look;
-        Arc::make_mut(look)
-    }
-
-    /// Returns this table with `hide_empty` turned off in its look.
-    pub fn with_show_empty(mut self) -> Self {
-        // Check first so that the look is only touched when it must change.
-        let hidden = self.look.hide_empty;
-        if hidden {
-            self.look_mut().hide_empty = false;
-        }
-        self
-    }
-
-    /// Returns this table with `hide_empty` turned on in its look.
-    pub fn with_hide_empty(mut self) -> Self {
-        // Check first so that the look is only touched when it must change.
-        let hidden = self.look.hide_empty;
-        if !hidden {
-            self.look_mut().hide_empty = true;
-        }
-        self
-    }
-
-    /// Returns a label for the table: its title formatted as a string, or
-    /// `Table` if it has no title.
-    pub fn label(&self) -> String {
-        self.title.as_ref().map_or_else(
-            || String::from("Table"),
-            |title| title.display(self).to_string(),
-        )
-    }
-
-    /// Returns the table's title, or an empty value if it has none.
-    pub fn title(&self) -> &Value {
-        static EMPTY: Value = Value::empty();
-        self.title.as_deref().unwrap_or(&EMPTY)
-    }
-
-    /// Returns the table's subtype, or an empty value if it has none.
-    pub fn subtype(&self) -> &Value {
-        static EMPTY: Value = Value::empty();
-        self.subtype.as_deref().unwrap_or(&EMPTY)
-    }
-}
-
-/// An empty pivot table with no dimensions or cells, using the shared default
-/// look, with the title and caption shown.
-impl Default for PivotTable {
-    fn default() -> Self {
-        Self {
-            look: Look::shared_default(),
-            rotate_inner_column_labels: false,
-            rotate_outer_row_labels: false,
-            show_grid_lines: false,
-            show_title: true,
-            show_caption: true,
-            show_values: None,
-            show_variables: None,
-            weight_format: Format::F40,
-            current_layer: Vec::new(),
-            sizing: EnumMap::default(),
-            settings: FormatSettings::default(), // XXX from settings
-            grouping: None,
-            small: 0.0001, // XXX from settings.
-            command_local: None,
-            command_c: None, // XXX from current command name.
-            language: None,
-            locale: None,
-            dataset: None,
-            datafile: None,
-            date: None,
-            footnotes: Footnotes::new(),
-            subtype: None,
-            title: None,
-            corner_text: None,
-            caption: None,
-            notes: None,
-            dimensions: Vec::new(),
-            axes: EnumMap::default(),
-            cells: HashMap::new(),
-        }
-    }
-}
-
-fn cell_index<I>(data_indexes: &[usize], dimensions: I) -> usize
-where
- I: ExactSizeIterator<Item = usize>,
-{
- debug_assert_eq!(data_indexes.len(), dimensions.len());
- let mut index = 0;
- for (dimension, data_index) in dimensions.zip(data_indexes.iter()) {
- debug_assert!(*data_index < dimension);
- index = dimension * index + data_index;
- }
- index
-}
-
-impl PivotTable {
- pub fn new(axes_and_dimensions: impl IntoIterator<Item = (Axis3, Dimension)>) -> Self {
- let mut dimensions = Vec::new();
- let mut axes = EnumMap::<Axis3, Axis>::default();
- for (axis, dimension) in axes_and_dimensions {
- axes[axis].dimensions.push(dimensions.len());
- dimensions.push(dimension);
- }
- Self {
- look: Settings::global().look.clone(),
- current_layer: repeat_n(0, axes[Axis3::Z].dimensions.len()).collect(),
- axes,
- dimensions,
- ..Self::default()
- }
- }
- fn cell_index(&self, data_indexes: &[usize]) -> usize {
- cell_index(data_indexes, self.dimensions.iter().map(|d| d.len()))
- }
-
- pub fn insert(&mut self, data_indexes: &[usize], value: impl Into<Value>) {
- self.cells
- .insert(self.cell_index(data_indexes), value.into());
- }
-
- pub fn get(&self, data_indexes: &[usize]) -> Option<&Value> {
- self.cells.get(&self.cell_index(data_indexes))
- }
-
- pub fn with_data<I>(mut self, iter: impl IntoIterator<Item = (I, Value)>) -> Self
- where
- I: AsRef<[usize]>,
- {
- self.extend(iter);
- self
- }
-
- /// Converts per-axis presentation-order indexes in `presentation_indexes`,
- /// into data indexes for each dimension.
- fn convert_indexes_ptod(
- &self,
- presentation_indexes: EnumMap<Axis3, &[usize]>,
- ) -> SmallVec<[usize; 4]> {
- let mut data_indexes = SmallVec::from_elem(0, self.dimensions.len());
- for (axis, presentation_indexes) in presentation_indexes {
- for (&dim_index, &pindex) in self.axes[axis]
- .dimensions
- .iter()
- .zip(presentation_indexes.iter())
- {
- data_indexes[dim_index] = self.dimensions[dim_index].presentation_order[pindex];
- }
- }
- data_indexes
- }
-
- /// Returns an iterator for the layer axis:
- ///
- /// - If `print` is true and `self.look.print_all_layers`, then the iterator
- /// will visit all values of the layer axis.
- ///
- /// - Otherwise, the iterator will just visit `self.current_layer`.
- pub fn layers(&self, print: bool) -> Box<dyn Iterator<Item = SmallVec<[usize; 4]>>> {
- if print && self.look.print_all_layers {
- Box::new(self.axis_values(Axis3::Z))
- } else {
- Box::new(once(SmallVec::from_slice(&self.current_layer)))
- }
- }
-
- pub fn value_options(&self) -> ValueOptions {
- ValueOptions {
- show_values: self.show_values,
- show_variables: self.show_variables,
- small: self.small,
- footnote_marker_type: self.look.footnote_marker_type,
- }
- }
-
- pub fn transpose(&mut self) {
- self.axes.swap(Axis3::X, Axis3::Y);
- }
-
- pub fn axis_dimensions(
- &self,
- axis: Axis3,
- ) -> impl DoubleEndedIterator<Item = &Dimension> + ExactSizeIterator {
- self.axes[axis]
- .dimensions
- .iter()
- .copied()
- .map(|index| &self.dimensions[index])
- }
-
- fn find_dimension(&self, dim_index: usize) -> Option<(Axis3, usize)> {
- debug_assert!(dim_index < self.dimensions.len());
- for axis in enum_iterator::all::<Axis3>() {
- for (position, dimension) in self.axes[axis].dimensions.iter().copied().enumerate() {
- if dimension == dim_index {
- return Some((axis, position));
- }
- }
- }
- None
- }
- pub fn move_dimension(&mut self, dim_index: usize, new_axis: Axis3, new_position: usize) {
- let (old_axis, old_position) = self.find_dimension(dim_index).unwrap();
- if old_axis == new_axis && old_position == new_position {
- return;
- }
-
- // Update the current layer, if necessary. If we're moving within the
- // layer axis, preserve the current layer.
- match (old_axis, new_axis) {
- (Axis3::Z, Axis3::Z) => {
- // Rearrange the layer axis.
- if old_position < new_position {
- self.current_layer[old_position..=new_position].rotate_left(1);
- } else {
- self.current_layer[new_position..=old_position].rotate_right(1);
- }
- }
- (Axis3::Z, _) => {
- // A layer is becoming a row or column.
- self.current_layer.remove(old_position);
- }
- (_, Axis3::Z) => {
- // A row or column is becoming a layer.
- self.current_layer.insert(new_position, 0);
- }
- _ => (),
- }
-
- self.axes[old_axis].dimensions.remove(old_position);
- self.axes[new_axis]
- .dimensions
- .insert(new_position, dim_index);
- }
-}
-
-impl<I> Extend<(I, Value)> for PivotTable
-where
- I: AsRef<[usize]>,
-{
- fn extend<T: IntoIterator<Item = (I, Value)>>(&mut self, iter: T) {
- for (data_indexes, value) in iter {
- self.insert(data_indexes.as_ref(), value);
- }
- }
-}
-
-#[derive(Clone, Debug, Serialize)]
-pub struct Footnote {
- #[serde(skip)]
- index: usize,
- pub content: Box<Value>,
- pub marker: Option<Box<Value>>,
- pub show: bool,
-}
-
-impl Footnote {
- pub fn new(content: impl Into<Value>) -> Self {
- Self {
- index: 0,
- content: Box::new(content.into()),
- marker: None,
- show: true,
- }
- }
- pub fn with_marker(mut self, marker: impl Into<Value>) -> Self {
- self.marker = Some(Box::new(marker.into()));
- self
- }
-
- pub fn with_show(mut self, show: bool) -> Self {
- self.show = show;
- self
- }
-
- pub fn with_index(mut self, index: usize) -> Self {
- self.index = index;
- self
- }
-
- pub fn display_marker(&self, options: impl IntoValueOptions) -> DisplayMarker<'_> {
- DisplayMarker {
- footnote: self,
- options: options.into_value_options(),
- }
- }
-
- pub fn display_content(&self, options: impl IntoValueOptions) -> DisplayValue<'_> {
- self.content.display(options)
- }
-
- pub fn index(&self) -> usize {
- self.index
- }
-}
-
-pub struct DisplayMarker<'a> {
- footnote: &'a Footnote,
- options: ValueOptions,
-}
-
-impl Display for DisplayMarker<'_> {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- if let Some(marker) = &self.footnote.marker {
- write!(f, "{}", marker.display(self.options).without_suffixes())
- } else {
- let i = self.footnote.index + 1;
- match self.options.footnote_marker_type {
- FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic::new_lowercase(i)),
- FootnoteMarkerType::Numeric => write!(f, "{i}"),
- }
- }
- }
-}
-
-/// Displays a number in 26adic notation.
-///
-/// Zero is displayed as the empty string, 1 through 26 as `a` through `z`, 27
-/// through 52 as `aa` through `az`, and so on.
-pub struct Display26Adic {
- value: usize,
- base: u8,
-}
-
-impl Display26Adic {
- /// Constructs a `Display26Adic` for `value`, with letters in lowercase.
- pub fn new_lowercase(value: usize) -> Self {
- Self { value, base: b'a' }
- }
-
- /// Constructs a `Display26Adic` for `value`, with letters in uppercase.
- pub fn new_uppercase(value: usize) -> Self {
- Self { value, base: b'A' }
- }
-}
-
-impl Display for Display26Adic {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- let mut output = SmallVec::<[u8; 16]>::new();
- let mut number = self.value;
- while number > 0 {
- number -= 1;
- let digit = (number % 26) as u8;
- output.push(digit + self.base);
- number /= 26;
- }
- output.reverse();
- write!(f, "{}", from_utf8(&output).unwrap())
- }
-}
-
-/// The content of a single pivot table cell.
-///
-/// A [Value] is also a pivot table's title, caption, footnote marker and
-/// contents, and so on.
-///
-/// A given [Value] is one of:
-///
-/// 1. A number resulting from a calculation.
-///
-/// A number has an associated display format (usually [F] or [Pct]). This
-/// format can be set directly, but that is not usually the easiest way.
-/// Instead, it is usually true that all of the values in a single category
-/// should have the same format (e.g. all "Significance" values might use
-/// format `F40.3`), so PSPP makes it easy to set the default format for a
-/// category while creating the category. See pivot_dimension_create() for
-/// more details.
-///
-/// [F]: crate::format::Type::F
-/// [Pct]: crate::format::Type::Pct
-///
-/// 2. A numeric or string value obtained from data ([ValueInner::Number] or
-/// [ValueInner::String]). If such a value corresponds to a variable, then the
-/// variable's name can be attached to the pivot_value. If the value has a
-/// value label, then that can also be attached. When a label is present,
-/// the user can control whether to show the value or the label or both.
-///
-/// 3. A variable name ([ValueInner::Variable]). The variable label, if any, can
-/// be attached too, and again the user can control whether to show the value
-/// or the label or both.
-///
-/// 4. A text string ([ValueInner::Text). The value stores the string in English
-/// and translated into the output language (localized). Use
-/// pivot_value_new_text() or pivot_value_new_text_format() for those cases.
-/// In some cases, only an English or a localized version is available for
-/// one reason or another, although this is regrettable; in those cases, use
-/// pivot_value_new_user_text() or pivot_value_new_user_text_nocopy().
-///
-/// 5. A template. PSPP doesn't create these itself yet, but it can read and
-/// interpret those created by SPSS.
-#[derive(Clone, Default)]
-pub struct Value {
- pub inner: ValueInner,
- pub styling: Option<Box<ValueStyle>>,
-}
-
-impl Serialize for Value {
- fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: serde::Serializer,
- {
- self.inner.serialize(serializer)
- }
-}
-
-/// Wrapper for [Value] that uses [Value::serialize_bare] for serialization.
-#[derive(Serialize)]
-struct BareValue<'a>(#[serde(serialize_with = "Value::serialize_bare")] pub &'a Value);
-
-impl Value {
- pub fn serialize_bare<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: Serializer,
- {
- match &self.inner {
- ValueInner::Number(number_value) => number_value.serialize_bare(serializer),
- ValueInner::String(string_value) => string_value.s.serialize(serializer),
- ValueInner::Variable(variable_value) => variable_value.var_name.serialize(serializer),
- ValueInner::Text(text_value) => text_value.localized.serialize(serializer),
- ValueInner::Template(template_value) => template_value.localized.serialize(serializer),
- ValueInner::Empty => serializer.serialize_none(),
- }
- }
-
- fn new(inner: ValueInner) -> Self {
- Self {
- inner,
- styling: None,
- }
- }
- pub fn new_number_with_format(x: Option<f64>, format: Format) -> Self {
- Self::new(ValueInner::Number(NumberValue {
- show: None,
- format,
- honor_small: false,
- value: x,
- variable: None,
- value_label: None,
- }))
- }
- pub fn new_variable(variable: &Variable) -> Self {
- Self::new(ValueInner::Variable(VariableValue {
- show: None,
- var_name: String::from(variable.name.as_str()),
- variable_label: variable.label.clone(),
- }))
- }
- pub fn new_datum<B>(value: &Datum<B>) -> Self
- where
- B: EncodedString,
- {
- match value {
- Datum::Number(number) => Self::new_number(*number),
- Datum::String(string) => Self::new_user_text(string.as_str()),
- }
- }
- pub fn new_variable_value(variable: &Variable, value: &Datum<ByteString>) -> Self {
- let var_name = Some(variable.name.as_str().into());
- let value_label = variable.value_labels.get(value).map(String::from);
- match value {
- Datum::Number(number) => Self::new(ValueInner::Number(NumberValue {
- show: None,
- format: match variable.print_format.var_type() {
- VarType::Numeric => variable.print_format,
- VarType::String => {
- #[cfg(debug_assertions)]
- panic!("cannot create numeric pivot value with string format");
-
- #[cfg(not(debug_assertions))]
- Format::F8_2
- }
- },
- honor_small: false,
- value: *number,
- variable: var_name,
- value_label,
- })),
- Datum::String(string) => Self::new(ValueInner::String(StringValue {
- show: None,
- hex: variable.print_format.type_() == Type::AHex,
- s: string
- .as_ref()
- .with_encoding(variable.encoding())
- .into_string(),
- var_name,
- value_label,
- })),
- }
- }
- pub fn new_number(x: Option<f64>) -> Self {
- Self::new_number_with_format(x, Format::F8_2)
- }
- pub fn new_integer(x: Option<f64>) -> Self {
- Self::new_number_with_format(x, Format::F40)
- }
- pub fn new_text(s: impl Into<String>) -> Self {
- Self::new_user_text(s)
- }
- pub fn new_user_text(s: impl Into<String>) -> Self {
- let s: String = s.into();
- if s.is_empty() {
- Self::default()
- } else {
- Self::new(ValueInner::Text(TextValue {
- user_provided: true,
- localized: s.clone(),
- c: None,
- id: None,
- }))
- }
- }
- pub fn with_footnote(mut self, footnote: &Arc<Footnote>) -> Self {
- self.add_footnote(footnote);
- self
- }
- pub fn add_footnote(&mut self, footnote: &Arc<Footnote>) {
- let footnotes = &mut self.styling.get_or_insert_default().footnotes;
- footnotes.push(footnote.clone());
- footnotes.sort_by_key(|f| f.index);
- }
- pub fn with_show_value_label(mut self, show: Option<Show>) -> Self {
- let new_show = show;
- match &mut self.inner {
- ValueInner::Number(NumberValue { show, .. })
- | ValueInner::String(StringValue { show, .. }) => {
- *show = new_show;
- }
- _ => (),
- }
- self
- }
- pub fn with_show_variable_label(mut self, show: Option<Show>) -> Self {
- if let ValueInner::Variable(variable_value) = &mut self.inner {
- variable_value.show = show;
- }
- self
- }
- pub fn with_value_label(mut self, label: Option<String>) -> Self {
- match &mut self.inner {
- ValueInner::Number(NumberValue { value_label, .. })
- | ValueInner::String(StringValue { value_label, .. }) => *value_label = label.clone(),
- _ => (),
- }
- self
- }
- pub const fn empty() -> Self {
- Value {
- inner: ValueInner::Empty,
- styling: None,
- }
- }
- pub const fn is_empty(&self) -> bool {
- self.inner.is_empty() && self.styling.is_none()
- }
-}
-
-impl From<&str> for Value {
- fn from(value: &str) -> Self {
- Self::new_text(value)
- }
-}
-
-impl From<String> for Value {
- fn from(value: String) -> Self {
- Self::new_text(value)
- }
-}
-
-impl From<&Variable> for Value {
- fn from(variable: &Variable) -> Self {
- Self::new_variable(variable)
- }
-}
-
-pub struct DisplayValue<'a> {
- inner: &'a ValueInner,
- markup: bool,
- subscripts: &'a [String],
- footnotes: &'a [Arc<Footnote>],
- options: ValueOptions,
- show_value: bool,
- show_label: Option<&'a str>,
-}
-
-impl<'a> DisplayValue<'a> {
- pub fn subscripts(&self) -> impl Iterator<Item = &str> {
- self.subscripts.iter().map(String::as_str)
- }
-
- pub fn has_subscripts(&self) -> bool {
- !self.subscripts.is_empty()
- }
-
- pub fn footnotes(&self) -> impl Iterator<Item = DisplayMarker<'_>> {
- self.footnotes
- .iter()
- .filter(|f| f.show)
- .map(|f| f.display_marker(self.options))
- }
-
- pub fn has_footnotes(&self) -> bool {
- self.footnotes().next().is_some()
- }
-
- pub fn without_suffixes(self) -> Self {
- Self {
- subscripts: &[],
- footnotes: &[],
- ..self
- }
- }
-
- /// Returns this display split into `(body, suffixes)` where `suffixes` is
- /// subscripts and footnotes and `body` is everything else.
- pub fn split_suffixes(self) -> (Self, Self) {
- let suffixes = Self {
- inner: &ValueInner::Empty,
- ..self
- };
- (self.without_suffixes(), suffixes)
- }
-
- pub fn with_styling(mut self, styling: &'a ValueStyle) -> Self {
- if let Some(area_style) = &styling.style {
- self.markup = area_style.font_style.markup;
- }
- self.subscripts = styling.subscripts.as_slice();
- self.footnotes = styling.footnotes.as_slice();
- self
- }
-
- pub fn with_font_style(self, font_style: &FontStyle) -> Self {
- Self {
- markup: font_style.markup,
- ..self
- }
- }
-
- pub fn with_subscripts(self, subscripts: &'a [String]) -> Self {
- Self { subscripts, ..self }
- }
-
- pub fn with_footnotes(self, footnotes: &'a [Arc<Footnote>]) -> Self {
- Self { footnotes, ..self }
- }
-
- pub fn is_empty(&self) -> bool {
- self.inner.is_empty() && self.subscripts.is_empty() && self.footnotes.is_empty()
- }
-
- fn small(&self) -> f64 {
- self.options.small
- }
-
- pub fn var_type(&self) -> VarType {
- match self.inner {
- ValueInner::Number(NumberValue { .. }) if self.show_label.is_none() => VarType::Numeric,
- _ => VarType::String,
- }
- }
-
- fn template(
- &self,
- f: &mut std::fmt::Formatter<'_>,
- template: &str,
- args: &[Vec<Value>],
- ) -> std::fmt::Result {
- let mut iter = template.as_bytes().iter();
- while let Some(c) = iter.next() {
- match c {
- b'\\' => {
- let c = *iter.next().unwrap_or(&b'\\') as char;
- let c = if c == 'n' { '\n' } else { c };
- write!(f, "{c}")?;
- }
- b'^' => {
- let (index, rest) = consume_int(iter.as_slice());
- iter = rest.iter();
- let Some(arg) = args.get(index.wrapping_sub(1)) else {
- continue;
- };
- if let Some(arg) = arg.first() {
- write!(f, "{}", arg.display(self.options))?;
- }
- }
- b'[' => {
- let (a, rest) = extract_inner_template(iter.as_slice());
- let (b, rest) = extract_inner_template(rest);
- let rest = rest.strip_prefix(b"]").unwrap_or(rest);
- let (index, rest) = consume_int(rest);
- iter = rest.iter();
-
- let Some(mut args) = args.get(index.wrapping_sub(1)).map(|vec| vec.as_slice())
- else {
- continue;
- };
- let (mut template, mut escape) =
- if !a.is_empty() { (a, b'%') } else { (b, b'^') };
- while !args.is_empty() {
- let n_consumed = self.inner_template(f, template, escape, args)?;
- if n_consumed == 0 {
- break;
- }
- args = &args[n_consumed..];
-
- template = b;
- escape = b'^';
- }
- }
- c => write!(f, "{c}")?,
- }
- }
- Ok(())
- }
-
- fn inner_template(
- &self,
- f: &mut std::fmt::Formatter<'_>,
- template: &[u8],
- escape: u8,
- args: &[Value],
- ) -> Result<usize, std::fmt::Error> {
- let mut iter = template.iter();
- let mut args_consumed = 0;
- while let Some(c) = iter.next() {
- match c {
- b'\\' => {
- let c = *iter.next().unwrap_or(&b'\\') as char;
- let c = if c == 'n' { '\n' } else { c };
- write!(f, "{c}")?;
- }
- c if *c == escape => {
- let (index, rest) = consume_int(iter.as_slice());
- iter = rest.iter();
- let Some(arg) = args.get(index.wrapping_sub(1)) else {
- continue;
- };
- args_consumed = args_consumed.max(index);
- write!(f, "{}", arg.display(self.options))?;
- }
- c => write!(f, "{c}")?,
- }
- }
- Ok(args_consumed)
- }
-}
-
-fn consume_int(input: &[u8]) -> (usize, &[u8]) {
- let mut n = 0;
- for (index, c) in input.iter().enumerate() {
- if !c.is_ascii_digit() {
- return (n, &input[index..]);
- }
- n = n * 10 + (c - b'0') as usize;
- }
- (n, &[])
-}
-
-fn extract_inner_template(input: &[u8]) -> (&[u8], &[u8]) {
- for (index, c) in input.iter().copied().enumerate() {
- if c == b':' && (index == 0 || input[index - 1] != b'\\') {
- return input.split_at(index);
- }
- }
- (input, &[])
-}
-
-fn interpret_show(
- global_show: impl Fn() -> Show,
- table_show: Option<Show>,
- value_show: Option<Show>,
- label: &str,
-) -> (bool, Option<&str>) {
- match value_show.or(table_show).unwrap_or_else(global_show) {
- Show::Value => (true, None),
- Show::Label => (false, Some(label)),
- Show::Both => (true, Some(label)),
- }
-}
-
-impl Display for DisplayValue<'_> {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self.inner {
- ValueInner::Number(NumberValue {
- format,
- honor_small,
- value,
- ..
- }) => {
- if self.show_value {
- let format = if format.type_() == Type::F
- && *honor_small
- && value.is_some_and(|value| value != 0.0 && value.abs() < self.small())
- {
- UncheckedFormat::new(Type::E, 40, format.d() as u8).fix()
- } else {
- *format
- };
- let mut buf = SmallString::<[u8; 40]>::new();
- write!(
- &mut buf,
- "{}",
- Datum::<&str>::Number(*value).display(format)
- )
- .unwrap();
- write!(f, "{}", buf.trim_start_matches(' '))?;
- }
- if let Some(label) = self.show_label {
- if self.show_value {
- write!(f, " ")?;
- }
- f.write_str(label)?;
- }
- Ok(())
- }
-
- ValueInner::String(StringValue { s, .. })
- | ValueInner::Variable(VariableValue { var_name: s, .. }) => {
- match (self.show_value, self.show_label) {
- (true, None) => write!(f, "{s}"),
- (false, Some(label)) => write!(f, "{label}"),
- (true, Some(label)) => write!(f, "{s} {label}"),
- (false, None) => unreachable!(),
- }
- }
-
- ValueInner::Text(TextValue {
- localized: local, ..
- }) => {
- /*
- if self
- .inner
- .styling
- .as_ref()
- .is_some_and(|styling| styling.style.font_style.markup)
- {
- todo!();
- }*/
- f.write_str(local)
- }
-
- ValueInner::Template(TemplateValue {
- args,
- localized: local,
- ..
- }) => self.template(f, local, args),
-
- ValueInner::Empty => Ok(()),
- }?;
-
- for (subscript, delimiter) in self.subscripts.iter().zip(once('_').chain(repeat(','))) {
- write!(f, "{delimiter}{subscript}")?;
- }
-
- for footnote in self.footnotes {
- write!(f, "[{}]", footnote.display_marker(self.options))?;
- }
-
- Ok(())
- }
-}
-
-impl Value {
- // Returns an object that will format this value, including subscripts and
- // superscripts and footnotes. `options` controls whether variable and
- // value labels are included.
- pub fn display(&self, options: impl IntoValueOptions) -> DisplayValue<'_> {
- let display = self.inner.display(options.into_value_options());
- match &self.styling {
- Some(styling) => display.with_styling(styling),
- None => display,
- }
- }
-}
-
-impl Debug for Value {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(f, "{:?}", self.display(()).to_string())
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct NumberValue {
- /// The numerical value, or `None` if it is a missing value.
- pub value: Option<f64>,
- pub format: Format,
- pub show: Option<Show>,
- pub honor_small: bool,
- pub variable: Option<String>,
- pub value_label: Option<String>,
-}
-
-impl Serialize for NumberValue {
- fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: serde::Serializer,
- {
- if self.format.type_() == Type::F && self.variable.is_none() && self.value_label.is_none() {
- self.value.serialize(serializer)
- } else {
- let mut s = serializer.serialize_map(None)?;
- s.serialize_entry("value", &self.value)?;
- s.serialize_entry("format", &self.format)?;
- if let Some(show) = self.show {
- s.serialize_entry("show", &show)?;
- }
- if self.honor_small {
- s.serialize_entry("honor_small", &self.honor_small)?;
- }
- if let Some(variable) = &self.variable {
- s.serialize_entry("variable", variable)?;
- }
- if let Some(value_label) = &self.value_label {
- s.serialize_entry("value_label", value_label)?;
- }
- s.end()
- }
- }
-}
-
-impl NumberValue {
- pub fn serialize_bare<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: Serializer,
- {
- if let Some(number) = self.value
- && number.trunc() == number
- && number >= -(1i64 << 53) as f64
- && number <= (1i64 << 53) as f64
- {
- (number as u64).serialize(serializer)
- } else {
- self.value.serialize(serializer)
- }
- }
-}
-
-#[derive(Serialize)]
-pub struct BareNumberValue<'a>(
- #[serde(serialize_with = "NumberValue::serialize_bare")] pub &'a NumberValue,
-);
-
-#[derive(Clone, Debug, Serialize)]
-pub struct StringValue {
- /// The string value.
- ///
- /// If `hex` is true, this should contain hex digits, not raw binary data
- /// (otherwise it would be impossible to encode non-UTF-8 data).
- pub s: String,
-
- /// True if `s` is hex digits.
- pub hex: bool,
-
- pub show: Option<Show>,
-
- pub var_name: Option<String>,
- pub value_label: Option<String>,
-}
-
-#[derive(Clone, Debug, Serialize)]
-pub struct VariableValue {
- pub show: Option<Show>,
- pub var_name: String,
- pub variable_label: Option<String>,
-}
-
-#[derive(Clone, Debug)]
-pub struct TextValue {
- pub user_provided: bool,
- /// Localized.
- pub localized: String,
- /// English.
- pub c: Option<String>,
- /// Identifier.
- pub id: Option<String>,
-}
-
-impl Serialize for TextValue {
- fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: serde::Serializer,
- {
- if self.user_provided && self.c.is_none() && self.id.is_none() {
- serializer.serialize_str(&self.localized)
- } else {
- let mut s = serializer.serialize_struct(
- "TextValue",
- 2 + self.c.is_some() as usize + self.id.is_some() as usize,
- )?;
- s.serialize_field("user_provided", &self.user_provided)?;
- s.serialize_field("localized", &self.localized)?;
- if let Some(c) = &self.c {
- s.serialize_field("c", &c)?;
- }
- if let Some(id) = &self.id {
- s.serialize_field("id", &id)?;
- }
- s.end()
- }
- }
-}
-
-impl TextValue {
- pub fn localized(&self) -> &str {
- self.localized.as_str()
- }
- pub fn c(&self) -> &str {
- self.c.as_ref().unwrap_or(&self.localized).as_str()
- }
- pub fn id(&self) -> &str {
- self.id.as_ref().unwrap_or(&self.localized).as_str()
- }
-}
-
-#[derive(Clone, Debug, Serialize)]
-pub struct TemplateValue {
- pub args: Vec<Vec<Value>>,
- pub localized: String,
- pub id: String,
-}
-
-#[derive(Clone, Debug, Default, Serialize)]
-#[serde(rename_all = "snake_case")]
-pub enum ValueInner {
- Number(NumberValue),
- String(StringValue),
- Variable(VariableValue),
- Text(TextValue),
- Template(TemplateValue),
-
- #[default]
- Empty,
-}
-
-impl ValueInner {
- pub const fn is_empty(&self) -> bool {
- matches!(self, Self::Empty)
- }
- fn show(&self) -> Option<Show> {
- match self {
- ValueInner::Number(NumberValue { show, .. })
- | ValueInner::String(StringValue { show, .. })
- | ValueInner::Variable(VariableValue { show, .. }) => *show,
- _ => None,
- }
- }
-
- fn label(&self) -> Option<&str> {
- self.value_label().or_else(|| self.variable_label())
- }
-
- fn value_label(&self) -> Option<&str> {
- match self {
- ValueInner::Number(NumberValue { value_label, .. })
- | ValueInner::String(StringValue { value_label, .. }) => {
- value_label.as_ref().map(String::as_str)
- }
- _ => None,
- }
- }
-
- fn variable_label(&self) -> Option<&str> {
- match self {
- ValueInner::Variable(VariableValue { variable_label, .. }) => {
- variable_label.as_ref().map(String::as_str)
- }
- _ => None,
- }
- }
-}
-
-#[derive(Clone, Debug, Default)]
-pub struct ValueStyle {
- pub style: Option<AreaStyle>,
- pub subscripts: Vec<String>,
- pub footnotes: Vec<Arc<Footnote>>,
-}
-
-impl ValueStyle {
- pub fn is_empty(&self) -> bool {
- self.style.is_none() && self.subscripts.is_empty() && self.footnotes.is_empty()
- }
-}
-
-impl ValueInner {
- // Returns an object that will format this value. Settings on `options`
- // control whether variable and value labels are included.
- pub fn display(&self, options: impl IntoValueOptions) -> DisplayValue<'_> {
- let options = options.into_value_options();
- let (show_value, show_label) = if let Some(value_label) = self.value_label() {
- interpret_show(
- || Settings::global().show_values,
- options.show_values,
- self.show(),
- value_label,
- )
- } else if let Some(variable_label) = self.variable_label() {
- interpret_show(
- || Settings::global().show_variables,
- options.show_variables,
- self.show(),
- variable_label,
- )
- } else {
- (true, None)
- };
- DisplayValue {
- inner: self,
- markup: false,
- subscripts: &[],
- footnotes: &[],
- options,
- show_value,
- show_label,
- }
- }
-}
-
-pub struct MetadataEntry {
- pub name: Value,
- pub value: MetadataValue,
-}
-
-pub enum MetadataValue {
- Leaf(Value),
- Group(Vec<MetadataEntry>),
-}
-
-impl MetadataEntry {
- pub fn into_pivot_table(self) -> PivotTable {
- let mut data = Vec::new();
- let group = match self.visit(&mut data) {
- Category::Group(group) => group,
- Category::Leaf(leaf) => Group::new("Metadata").with(leaf).with_label_shown(),
- };
- PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data(
- data.into_iter()
- .enumerate()
- .filter(|(_row, value)| !value.is_empty())
- .map(|(row, value)| ([row], value)),
- )
- }
- fn visit(self, data: &mut Vec<Value>) -> Category {
- match self.value {
- MetadataValue::Leaf(value) => {
- data.push(value);
- Leaf::new(self.name).into()
- }
- MetadataValue::Group(items) => Group::with_capacity(self.name, items.len())
- .with_multiple(items.into_iter().map(|item| item.visit(data)))
- .into(),
- }
- }
-}
-
-impl Serialize for MetadataValue {
- fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: serde::Serializer,
- {
- match self {
- MetadataValue::Leaf(value) => value.serialize_bare(serializer),
- MetadataValue::Group(items) => {
- let mut map = serializer.serialize_map(Some(items.len()))?;
- for item in items {
- let name = item.name.display(()).to_string();
- map.serialize_entry(&name, &item.value)?;
- }
- map.end()
- }
- }
- }
-}
-impl Serialize for MetadataEntry {
- fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: serde::Serializer,
- {
- match &self.value {
- MetadataValue::Leaf(value) => {
- let mut map = serializer.serialize_map(Some(1))?;
- let name = self.name.display(()).to_string();
- map.serialize_entry(&name, &BareValue(value))?;
- map.end()
- }
- MetadataValue::Group(items) => {
- let mut map = serializer.serialize_map(Some(items.len()))?;
- for item in items {
- let name = item.name.display(()).to_string();
- map.serialize_entry(&name, &item.value)?;
- }
- map.end()
- }
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use crate::output::pivot::{Display26Adic, MetadataEntry, MetadataValue, Value};
-
- #[test]
- fn display_26adic() {
- for (number, lowercase, uppercase) in [
- (0, "", ""),
- (1, "a", "A"),
- (2, "b", "B"),
- (26, "z", "Z"),
- (27, "aa", "AA"),
- (28, "ab", "AB"),
- (29, "ac", "AC"),
- (18278, "zzz", "ZZZ"),
- (18279, "aaaa", "AAAA"),
- (19010, "abcd", "ABCD"),
- ] {
- assert_eq!(Display26Adic::new_lowercase(number).to_string(), lowercase);
- assert_eq!(Display26Adic::new_uppercase(number).to_string(), uppercase);
- }
- }
-
- #[test]
- fn metadata_entry() {
- let tree = MetadataEntry {
- name: Value::from("Group"),
- value: MetadataValue::Group(vec![
- MetadataEntry {
- name: Value::from("Name 1"),
- value: MetadataValue::Leaf(Value::from("Value 1")),
- },
- MetadataEntry {
- name: Value::from("Subgroup 1"),
- value: MetadataValue::Group(vec![
- MetadataEntry {
- name: Value::from("Subname 1"),
- value: MetadataValue::Leaf(Value::from("Subvalue 1")),
- },
- MetadataEntry {
- name: Value::from("Subname 2"),
- value: MetadataValue::Leaf(Value::from("Subvalue 2")),
- },
- MetadataEntry {
- name: Value::from("Subname 3"),
- value: MetadataValue::Leaf(Value::new_integer(Some(3.0))),
- },
- ]),
- },
- MetadataEntry {
- name: Value::from("Name 2"),
- value: MetadataValue::Leaf(Value::from("Value 2")),
- },
- ]),
- };
- assert_eq!(
- serde_json::to_string_pretty(&tree).unwrap(),
- r#"{
- "Name 1": "Value 1",
- "Subgroup 1": {
- "Subname 1": "Subvalue 1",
- "Subname 2": "Subvalue 2",
- "Subname 3": 3
- },
- "Name 2": "Value 2"
-}"#
- );
-
- assert_eq!(
- tree.into_pivot_table().to_string(),
- r#"╭────────────────────┬──────────╮
-│ Name 1 │Value 1 │
-├────────────────────┼──────────┤
-│Subgroup 1 Subname 1│Subvalue 1│
-│ Subname 2│Subvalue 2│
-│ Subname 3│ 3│
-├────────────────────┼──────────┤
-│ Name 2 │Value 2 │
-╰────────────────────┴──────────╯
-"#
- );
- }
-}
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Reading and writing system files.
+//!
+//! This module enables reading and writing "system files", the binary format
+//! for SPSS data files. The system file format dates back 40+ years and has
+//! evolved greatly over that time to support new features, but in a way to
+//! facilitate interchange between even the oldest and newest versions of
+//! software.
+//!
+//! Use [ReadOptions] to read a system file in the simplest way.
+//! Use [WriteOptions] to write a system file.
+
+// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
+#![cfg_attr(not(test), warn(missing_docs))]
+
+mod cooked;
+use binrw::Endian;
+pub use cooked::*;
+pub mod encoding;
+pub mod raw;
+
+#[cfg(test)]
+pub mod sack;
+
+mod write;
+use serde::Serializer;
+pub use write::{SystemFileVersion, WriteOptions, Writer};
+
+#[cfg(test)]
+mod test;
+
+/// Serializes a [binrw] [Endian] as a unit variant of an enum named `Endian`.
+///
+/// [Endian::Big] is emitted as variant index 0 with the name `Big`, and
+/// [Endian::Little] as variant index 1 with the name `Little`.
+///
+/// The signature has the shape serde expects for a
+/// `#[serde(serialize_with = "...")]` helper; presumably that is how it is
+/// used by the submodules (confirm against the callers).
+fn serialize_endian<S>(endian: &Endian, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    // Choose the variant's index and name first so that the serializer is
+    // invoked from a single place.
+    let (variant_index, variant_name) = match endian {
+        Endian::Big => (0, "Big"),
+        Endian::Little => (1, "Little"),
+    };
+    serializer.serialize_unit_variant("Endian", variant_index, variant_name)
+}
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-//! Reading and writing system files.
-//!
-//! This module enables reading and writing "system files", the binary format
-//! for SPSS data files. The system file format dates back 40+ years and has
-//! evolved greatly over that time to support new features, but in a way to
-//! facilitate interchange between even the oldest and newest versions of
-//! software.
-//!
-//! Use [ReadOptions] to read a system file in the simplest way.
-//! Use [WriteOptions] to write a system file.
-
-// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
-#![cfg_attr(not(test), warn(missing_docs))]
-
-mod cooked;
-use binrw::Endian;
-pub use cooked::*;
-pub mod encoding;
-pub mod raw;
-
-#[cfg(test)]
-pub mod sack;
-
-mod write;
-use serde::Serializer;
-pub use write::{SystemFileVersion, WriteOptions, Writer};
-
-#[cfg(test)]
-mod test;
-
-fn serialize_endian<S>(endian: &Endian, serializer: S) -> Result<S::Ok, S::Error>
-where
- S: Serializer,
-{
- match endian {
- Endian::Big => serializer.serialize_unit_variant("Endian", 0, "Big"),
- Endian::Little => serializer.serialize_unit_variant("Endian", 1, "Little"),
- }
-}