From: Ben Pfaff Date: Tue, 16 Jul 2024 19:40:16 +0000 (-0700) Subject: work X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fa6ad1020122a25d1d0602162b6464a4c55bbb66;p=pspp work --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 3e97e404df..96468531c2 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -400,9 +400,9 @@ impl Decoder { fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier { loop { self.n_generated_names += 1; - let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding) + let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding) .unwrap(); - if !dictionary.variables.contains(&name) { + if !dictionary.variables.contains(&name.0) { return name; } assert!(self.n_generated_names < usize::MAX); @@ -447,9 +447,9 @@ pub fn decode( let mut var_index_map = HashMap::new(); while let Some((value_index, input)) = header_vars.next() { let name = trim_end_spaces(input.name.to_string()); - let name = match Identifier::new(&name, encoding) { + let name = match Identifier::from_encoding(&name, encoding) { Ok(name) => { - if !dictionary.variables.contains(&name) { + if !dictionary.variables.contains(&name.0) { name } else { let new_name = decoder.generate_name(&dictionary); diff --git a/rust/src/dictionary.rs b/rust/src/dictionary.rs index 8d28ff329b..2c1707b257 100644 --- a/rust/src/dictionary.rs +++ b/rust/src/dictionary.rs @@ -9,6 +9,7 @@ use encoding_rs::Encoding; use indexmap::IndexSet; use num::integer::div_ceil; use ordered_float::OrderedFloat; +use unicase::UniCase; use crate::{ format::Spec, @@ -379,8 +380,8 @@ impl Variable { } impl HasIdentifier for Variable { - fn identifier(&self) -> &Identifier { - &self.name + fn identifier(&self) -> &UniCase { + &self.name.0 } } @@ -401,8 +402,8 @@ impl Vector { } impl HasIdentifier for Vector { - fn identifier(&self) -> &Identifier { - &self.name + fn identifier(&self) -> &UniCase { + &self.name.0 } } @@ -413,8 +414,8 @@ pub struct Attribute { } impl HasIdentifier for Attribute { - fn identifier(&self) -> &Identifier { - &self.name + fn identifier(&self) -> &UniCase { + &self.name.0 } } @@ -437,8 +438,8 @@ impl MultipleResponseSet { } impl HasIdentifier for MultipleResponseSet { - fn identifier(&self) -> &Identifier { - &self.name + fn identifier(&self) -> &UniCase { + &self.name.0 } } @@ -468,8 +469,8 @@ impl VariableSet { } impl HasIdentifier for VariableSet { - fn identifier(&self) -> &Identifier { - &self.name + fn identifier(&self) -> &UniCase { + &self.name.0 } } @@ -477,6 +478,8 @@ impl HasIdentifier for VariableSet { mod test { use std::collections::HashSet; + use unicase::UniCase; + use crate::identifier::Identifier; use super::{ByIdentifier, HasIdentifier}; @@ -488,15 +491,15 @@ mod test { } impl HasIdentifier for Variable { - fn identifier(&self) -> &Identifier { - &self.name + fn identifier(&self) -> &UniCase { + &self.name.0 } } #[test] fn test() { // Variables should not be the same if their values differ. - let abcd = Identifier::new_utf8("abcd").unwrap(); + let abcd = Identifier::new("abcd").unwrap(); let abcd1 = Variable { name: abcd.clone(), value: 1, @@ -517,7 +520,7 @@ mod test { assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); assert_eq!( - vars.get(&Identifier::new_utf8("abcd").unwrap()) + vars.get(&UniCase::new(String::from("abcd"))) .unwrap() .0 .value, diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index 9977aa969d..3be08083ca 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -109,10 +109,10 @@ impl Identifier { /// encoding used by the dictionary, not in UTF-8. pub const MAX_LEN: usize = 64; - pub fn new_utf8(s: &str) -> Result { - Self::new(s, UTF_8) + pub fn new(s: &str) -> Result { + Self::from_encoding(s, UTF_8) } - pub fn new(s: &str, encoding: &'static Encoding) -> Result { + pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result { Self::is_plausible(s)?; let identifier = Identifier(s.into()); identifier.check_encoding(encoding)?; @@ -221,7 +221,7 @@ impl Display for Identifier { } pub trait HasIdentifier { - fn identifier(&self) -> &Identifier; + fn identifier(&self) -> &UniCase; } pub struct ByIdentifier(pub T) @@ -275,11 +275,11 @@ where } } -impl Borrow for ByIdentifier +impl Borrow> for ByIdentifier where T: HasIdentifier, { - fn borrow(&self) -> &Identifier { + fn borrow(&self) -> &UniCase { self.0.identifier() } } diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs index 5503a5bcc0..cbeac4095d 100644 --- a/rust/src/lex/scan/mod.rs +++ b/rust/src/lex/scan/mod.rs @@ -35,6 +35,14 @@ pub enum ScanError { #[error("Invalid hex digit {0:?}.")] BadHexDigit(char), + /// Incomplete UTF-8 sequence. + #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] + IncompleteUtf8 { substring: String, offset: usize }, + + /// Bad UTF-8 sequence. + #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] + BadUtf8 { substring: String, offset: usize }, + /// Invalid length Unicode string. #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")] BadLengthUnicodeString(usize), @@ -93,13 +101,32 @@ impl ScanToken { if s.len() % 2 != 0 { return Some(Self::Error(ScanError::OddLengthHexString(s.len()))); } - let mut out = String::with_capacity(s.len()); - for pair in s.as_bytes().chunks_exact(2) { - let hi = char::from(pair[0]).to_digit(16).unwrap() as u8; - let lo = char::from(pair[1]).to_digit(16).unwrap() as u8; - out.push(char::from(hi * 16 + lo)); + let bytes = s + .as_bytes() + .chunks_exact(2) + .map(|pair| { + let hi = char::from(pair[0]).to_digit(16).unwrap() as u8; + let lo = char::from(pair[1]).to_digit(16).unwrap() as u8; + hi * 16 + lo + }) + .collect::>(); + match String::from_utf8(bytes) { + Ok(string) => Some(Self::Token(Token::String(string))), + Err(error) => { + let details = error.utf8_error(); + let offset = details.valid_up_to() * 2; + let end = details + .error_len() + .map(|len| offset + len * 2) + .unwrap_or(s.len()); + let substring = String::from(&s[offset..end]); + Some(Self::Error(if details.error_len().is_some() { + ScanError::BadUtf8 { substring, offset } + } else { + ScanError::IncompleteUtf8 { substring, offset } + })) + } } - Some(Self::Token(Token::String(out))) } Segment::UnicodeString => { // Strip `U"` prefix and `"` suffix (or variations). diff --git a/rust/src/lex/token.rs b/rust/src/lex/token.rs index 016b282838..868a79dac9 100644 --- a/rust/src/lex/token.rs +++ b/rust/src/lex/token.rs @@ -1,3 +1,5 @@ +use std::fmt::{Display, Formatter, Result as FmtResult}; + #[derive(Clone, Debug, PartialEq)] pub enum Token { /// End of input. @@ -25,7 +27,81 @@ pub enum Token { MacroToken(MacroToken), } -#[derive(Clone, Debug, PartialEq, Eq)] +fn is_printable(c: char) -> bool { + !c.is_control() || ['\t', '\r', '\n'].contains(&c) +} + +fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{quote}")?; + for section in s.split_inclusive(quote) { + if let Some(rest) = section.strip_suffix(quote) { + write!(f, "{rest}{quote}{quote}")?; + } else { + write!(f, "{section}")?; + } + } + write!(f, "{quote}") +} + +impl Display for Token { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + Token::End => Ok(()), + Token::Id(s) => write!(f, "{s}"), + Token::Number(number) => { + if number.is_sign_negative() { + write!(f, "-{}", number.abs()) + } else { + write!(f, "{number}") + } + } + Token::String(s) => { + if s.chars().all(|c| is_printable(c)) { + if s.contains('"') { + string_representation(s, '\'', f) + } else { + string_representation(s, '"', f) + } + } else { + write!(f, "X\"")?; + for byte in s.bytes() { + let c1 = char::from_digit((byte >> 4) as u32, 16) + .unwrap() + .to_ascii_uppercase(); + let c2 = char::from_digit((byte & 0xf) as u32, 16) + .unwrap() + .to_ascii_uppercase() + .to_ascii_lowercase(); + write!(f, "{c1}{c2}")?; + } + write!(f, "\"") + } + } + Token::EndCommand => write!(f, "."), + Token::Punct(punct) => punct.fmt(f), + Token::MacroToken(mt) => mt.fmt(f), + } + } +} + +/// Check that all negative numbers, even -0, get formatted with a leading `-`. +#[cfg(test)] +mod test { + use crate::lex::token::Token; + + #[test] + fn test_string() { + assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\""); + assert_eq!(Token::String(String::from("\u{0080}")).to_string(), "X\"C280\""); + } + + #[test] + fn test_neg0() { + assert_eq!(Token::Number(-0.0).to_string(), "-0"); + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum Punct { /// `+`. Plus, @@ -112,6 +188,46 @@ pub enum Punct { Exp, } +impl Punct { + pub fn as_str(&self) -> &'static str { + match self { + Self::Plus => "+", + Self::Dash => "-", + Self::Asterisk => "*", + Self::Slash => "/", + Self::Equals => "=", + Self::LParen => "(", + Self::RParen => ")", + Self::LSquare => "[", + Self::RSquare => "]", + Self::LCurly => "{", + Self::RCurly => "}", + Self::Comma => ",", + Self::Semicolon => ";", + Self::Colon => ":", + Self::And => "AND", + Self::Or => "OR", + Self::Not => "NOT", + Self::Eq => "EQ", + Self::Ge => ">=", + Self::Gt => ">", + Self::Le => "<=", + Self::Lt => "<", + Self::Ne => "~=", + Self::All => "ALL", + Self::By => "BY", + Self::To => "TO", + Self::With => "WITH", + Self::Exp => "**", + } + } +} +impl Display for Punct { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{}", self.as_str()) + } +} + /// Tokens that only appear in macros. #[derive(Clone, Debug, PartialEq, Eq)] pub enum MacroToken { @@ -139,3 +255,26 @@ pub enum MacroToken { /// first character, so this represents an underscore found on its own. Underscore, } + +impl MacroToken { + pub fn as_str(&self) -> &str { + match self { + MacroToken::MacroId(id) => &id, + MacroToken::Bang => "!", + MacroToken::Percent => "%", + MacroToken::Question => "?", + MacroToken::Backtick => "`", + MacroToken::Dot => ".", + MacroToken::Underscore => "_", + } + } +} + +impl Display for MacroToken { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + MacroToken::MacroId(id) => write!(f, "{id}"), + _ => write!(f, "{}", self.as_str()), + } + } +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 46fe08622a..9fb03e9cf6 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -12,4 +12,4 @@ pub mod sack; pub mod lex; pub mod prompt; pub mod message; -pub mod macros; +//pub mod macros; diff --git a/rust/src/macros.rs b/rust/src/macros.rs index bf7e830722..2f9188003a 100644 --- a/rust/src/macros.rs +++ b/rust/src/macros.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ @@ -7,6 +8,27 @@ use crate::{ message::Location, }; +#[derive(Clone, Debug, ThisError)] +pub enum MacroError { + /// Expected more tokens. + #[error( + "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}." + )] + ExpectedMoreTokens { + n: usize, + arg: String, + macro_: String, + }, + + /// Expected a particular token. + #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")] + ExpectedToken { + token: String, + arg: String, + macro_: String, + }, +} + /// A PSPP macro as defined with `!DEFINE`. pub struct Macro { /// The macro's name. This is an ordinary identifier except that it is @@ -23,6 +45,20 @@ pub struct Macro { body: Vec, } +impl Macro { + fn initial_state(&self) -> MacroCallState { + if self.parameters.is_empty() { + MacroCallState::Finished + } else if self.parameters[0].is_positional() { + MacroCallState::Keyword + } else if let ValueType::Enclose(_, _) = self.parameters[0].arg { + MacroCallState::Enclose + } else { + MacroCallState::NewArg + } + } +} + struct Parameter { /// `!name` or `!1`. name: String, @@ -32,11 +68,11 @@ struct Parameter { /// The tokens don't include white space, etc. between them. default: Vec, - /// Macro-expand the argument? - expand_arg: bool, + /// Macro-expand the value? + expand_value: bool, /// How the argument is specified. - arg: Arg, + arg: ValueType, } impl Parameter { @@ -47,7 +83,7 @@ impl Parameter { } } -enum Arg { +enum ValueType { /// Argument consists of `.0` tokens. NTokens(usize), @@ -74,26 +110,92 @@ struct BodyToken { type MacroSet = HashMap, Macro>; pub enum MacroCallState { - Arg, + /// Starting a new argument. + NewArg, + + /// Accumulating tokens toward the end of any type of argument. + ContinueArg, + + /// Expecting the opening delimiter of an ARG_ENCLOSE argument. Enclose, + + /// Expecting a keyword for a keyword argument. Keyword, + + /// Expecting an equal sign for a keyword argument. Equals, + + /// Macro fully parsed and ready for expansion. Finished, } -pub struct MacroCallBuilder<'a> { +/// Macro call parser FSM. +pub struct MacroCall<'a> { macros: &'a MacroSet, + macro_: &'a Macro, + state: MacroCallState, + args: Vec>, + + /// Length of macro call so far. + n_tokens: usize, } -impl<'a> MacroCallBuilder<'a> { - fn new(macro_set: &'a MacroSet, token: &Token) -> Option { +impl<'a> MacroCall<'a> { + pub fn new(macros: &'a MacroSet, token: &Token) -> Option { + if macros.is_empty() { + return None; + } let macro_name = match token { Token::Id(s) => s, Token::MacroToken(MacroToken::MacroId(s)) => s, _ => return None, - }.clone(); - let Some(macro_) = macro_set.get(&UniCase::new(macro_name)) else { + } + .clone(); + // XXX Unicase::new() is very expensive. We probably need to define our + // own Unicase-alike that has a proper Borrow<> implementation. + let Some(macro_) = macros.get(&UniCase::new(macro_name)) else { return None; }; + Some(Self { + macros, + macro_, + state: macro_.initial_state(), + args: Vec::with_capacity(macro_.parameters.len()), + n_tokens: 1, + }) + } + + fn push_continue_arg(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) { + if let Token::EndCommand | Token::End = token { + let param = &self.macro_.parameters[self.args.len() - 1]; + let arg = self.args.last().unwrap(); + match param.arg { + ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens { + n: n - arg.len(), + arg: param.name.clone(), + macro_: self.macro_.name.clone(), + }), + ValueType::CharEnd(end) | ValueType::Enclose(_, end) => todo!(), + ValueType::CmdEnd => todo!(), + } + } + } + fn push_new_arg(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) { + if let Token::EndCommand | Token::End = token { + return self.mc_finished(); + } + self.args.push(Vec::new()); + self.state = MacroCallState::ContinueArg; + self.push_continue_arg(token, syntax, error); + } + pub fn push(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) -> ! { + match self.state { + MacroCallState::NewArg => self.push_new_arg(token, syntax, error), + MacroCallState::ContinueArg => self.push_continue_arg(token, syntax, error), + MacroCallState::Enclose => todo!(), + MacroCallState::Keyword => todo!(), + MacroCallState::Equals => todo!(), + MacroCallState::Finished => todo!(), + } } } diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 14231b18a6..c9b04773ff 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -593,7 +593,7 @@ impl Decoder { } pub fn new_identifier(&self, name: &str) -> Result { - Identifier::new(name, self.encoding) + Identifier::from_encoding(name, self.encoding) } } @@ -2817,7 +2817,7 @@ impl LongStringValueLabels { decoder: &Decoder, ) -> Result, Warning> { let var_name = decoder.decode(&self.var_name); - let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) + let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding) .map_err(Warning::InvalidLongStringValueLabelName)?; let mut labels = Vec::with_capacity(self.labels.len());