fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
loop {
self.n_generated_names += 1;
- let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
+ let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding)
.unwrap();
- if !dictionary.variables.contains(&name) {
+ if !dictionary.variables.contains(&name.0) {
return name;
}
assert!(self.n_generated_names < usize::MAX);
let mut var_index_map = HashMap::new();
while let Some((value_index, input)) = header_vars.next() {
let name = trim_end_spaces(input.name.to_string());
- let name = match Identifier::new(&name, encoding) {
+ let name = match Identifier::from_encoding(&name, encoding) {
Ok(name) => {
- if !dictionary.variables.contains(&name) {
+ if !dictionary.variables.contains(&name.0) {
name
} else {
let new_name = decoder.generate_name(&dictionary);
use indexmap::IndexSet;
use num::integer::div_ceil;
use ordered_float::OrderedFloat;
+use unicase::UniCase;
use crate::{
format::Spec,
}
impl HasIdentifier for Variable {
- fn identifier(&self) -> &Identifier {
- &self.name
+ fn identifier(&self) -> &UniCase<String> {
+ &self.name.0
}
}
}
impl HasIdentifier for Vector {
- fn identifier(&self) -> &Identifier {
- &self.name
+ fn identifier(&self) -> &UniCase<String> {
+ &self.name.0
}
}
}
impl HasIdentifier for Attribute {
- fn identifier(&self) -> &Identifier {
- &self.name
+ fn identifier(&self) -> &UniCase<String> {
+ &self.name.0
}
}
}
impl HasIdentifier for MultipleResponseSet {
- fn identifier(&self) -> &Identifier {
- &self.name
+ fn identifier(&self) -> &UniCase<String> {
+ &self.name.0
}
}
}
impl HasIdentifier for VariableSet {
- fn identifier(&self) -> &Identifier {
- &self.name
+ fn identifier(&self) -> &UniCase<String> {
+ &self.name.0
}
}
mod test {
use std::collections::HashSet;
+ use unicase::UniCase;
+
use crate::identifier::Identifier;
use super::{ByIdentifier, HasIdentifier};
}
impl HasIdentifier for Variable {
- fn identifier(&self) -> &Identifier {
- &self.name
+ fn identifier(&self) -> &UniCase<String> {
+ &self.name.0
}
}
#[test]
fn test() {
// Variables should not be the same if their values differ.
- let abcd = Identifier::new_utf8("abcd").unwrap();
+ let abcd = Identifier::new("abcd").unwrap();
let abcd1 = Variable {
name: abcd.clone(),
value: 1,
assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
assert_eq!(
- vars.get(&Identifier::new_utf8("abcd").unwrap())
+ vars.get(&UniCase::new(String::from("abcd")))
.unwrap()
.0
.value,
/// encoding used by the dictionary, not in UTF-8.
pub const MAX_LEN: usize = 64;
- pub fn new_utf8(s: &str) -> Result<Identifier, Error> {
- Self::new(s, UTF_8)
+ pub fn new(s: &str) -> Result<Identifier, Error> {
+ Self::from_encoding(s, UTF_8)
}
- pub fn new(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
+ pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
Self::is_plausible(s)?;
let identifier = Identifier(s.into());
identifier.check_encoding(encoding)?;
}
pub trait HasIdentifier {
- fn identifier(&self) -> &Identifier;
+ fn identifier(&self) -> &UniCase<String>;
}
pub struct ByIdentifier<T>(pub T)
}
}
-impl<T> Borrow<Identifier> for ByIdentifier<T>
+impl<T> Borrow<UniCase<String>> for ByIdentifier<T>
where
T: HasIdentifier,
{
- fn borrow(&self) -> &Identifier {
+ fn borrow(&self) -> &UniCase<String> {
self.0.identifier()
}
}
#[error("Invalid hex digit {0:?}.")]
BadHexDigit(char),
+ /// Incomplete UTF-8 sequence.
+ #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
+ IncompleteUtf8 { substring: String, offset: usize },
+
+ /// Bad UTF-8 sequence.
+ #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
+ BadUtf8 { substring: String, offset: usize },
+
/// Invalid length Unicode string.
#[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
BadLengthUnicodeString(usize),
if s.len() % 2 != 0 {
return Some(Self::Error(ScanError::OddLengthHexString(s.len())));
}
- let mut out = String::with_capacity(s.len());
- for pair in s.as_bytes().chunks_exact(2) {
- let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
- let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
- out.push(char::from(hi * 16 + lo));
+ let bytes = s
+ .as_bytes()
+ .chunks_exact(2)
+ .map(|pair| {
+ let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
+ let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
+ hi * 16 + lo
+ })
+ .collect::<Vec<_>>();
+ match String::from_utf8(bytes) {
+ Ok(string) => Some(Self::Token(Token::String(string))),
+ Err(error) => {
+ let details = error.utf8_error();
+ let offset = details.valid_up_to() * 2;
+ let end = details
+ .error_len()
+ .map(|len| offset + len * 2)
+ .unwrap_or(s.len());
+ let substring = String::from(&s[offset..end]);
+ Some(Self::Error(if details.error_len().is_some() {
+ ScanError::BadUtf8 { substring, offset }
+ } else {
+ ScanError::IncompleteUtf8 { substring, offset }
+ }))
+ }
}
- Some(Self::Token(Token::String(out)))
}
Segment::UnicodeString => {
// Strip `U"` prefix and `"` suffix (or variations).
+use std::fmt::{Display, Formatter, Result as FmtResult};
+
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
/// End of input.
MacroToken(MacroToken),
}
-#[derive(Clone, Debug, PartialEq, Eq)]
+/// Reports whether `c` can be written literally inside a quoted string.
+///
+/// Every non-control character qualifies, and so do the three whitespace
+/// control characters tab, carriage return, and newline.
+fn is_printable(c: char) -> bool {
+    matches!(c, '\t' | '\r' | '\n') || !c.is_control()
+}
+
+/// Writes `s` to `f` enclosed in `quote` characters.
+///
+/// Each occurrence of `quote` inside `s` is written twice, so that the
+/// delimiter is escaped by doubling.
+fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResult {
+    write!(f, "{quote}")?;
+    // Splitting on the quote character removes every embedded quote; each
+    // boundary between two pieces is where one stood, so a doubled quote is
+    // written there.
+    for (index, piece) in s.split(quote).enumerate() {
+        if index > 0 {
+            write!(f, "{quote}{quote}")?;
+        }
+        f.write_str(piece)?;
+    }
+    write!(f, "{quote}")
+}
+
+impl Display for Token {
+    /// Writes the token's textual form to `f`.
+    ///
+    /// * `End` writes nothing.
+    /// * `Id` writes the identifier as is.
+    /// * `Number` writes the value, with a leading `-` whenever its sign bit
+    ///   is set.
+    /// * `String` is written as a quoted string when every character is
+    ///   printable, and otherwise as a hex string `X"..."` holding the
+    ///   string's UTF-8 bytes as uppercase hexadecimal digits.
+    /// * `EndCommand` writes `.`.
+    /// * `Punct` and `MacroToken` delegate to their own `Display`
+    ///   implementations.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        match self {
+            Token::End => Ok(()),
+            Token::Id(s) => write!(f, "{s}"),
+            Token::Number(number) => {
+                // `is_sign_negative` is also true for -0.0, so negative zero
+                // takes this branch and gets an explicit leading `-`.
+                if number.is_sign_negative() {
+                    write!(f, "-{}", number.abs())
+                } else {
+                    write!(f, "{number}")
+                }
+            }
+            Token::String(s) => {
+                if s.chars().all(is_printable) {
+                    // Use double quotes unless the string contains one, in
+                    // which case single quotes are used instead.
+                    let quote = if s.contains('"') { '\'' } else { '"' };
+                    string_representation(s, quote, f)
+                } else {
+                    // Both digits of every byte are uppercase, so the output
+                    // is consistently cased (e.g. 0x1F is written as `1F`).
+                    write!(f, "X\"")?;
+                    for byte in s.bytes() {
+                        write!(f, "{byte:02X}")?;
+                    }
+                    write!(f, "\"")
+                }
+            }
+            Token::EndCommand => write!(f, "."),
+            Token::Punct(punct) => punct.fmt(f),
+            Token::MacroToken(mt) => mt.fmt(f),
+        }
+    }
+}
+
+/// Tests for the `Display` implementation of [`Token`].
+#[cfg(test)]
+mod test {
+    use crate::lex::token::Token;
+
+    #[test]
+    fn test_string() {
+        assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\"");
+        assert_eq!(Token::String(String::from("\u{0080}")).to_string(), "X\"C280\"");
+        // A byte whose low nibble is above 9 must still use uppercase digits.
+        assert_eq!(Token::String(String::from("\u{001f}")).to_string(), "X\"1F\"");
+        // A string containing a double quote is written in single quotes.
+        assert_eq!(Token::String(String::from("a\"b")).to_string(), "'a\"b'");
+        assert_eq!(Token::String(String::from("it's")).to_string(), "\"it's\"");
+    }
+
+    /// Check that all negative numbers, even -0, get formatted with a leading `-`.
+    #[test]
+    fn test_neg0() {
+        assert_eq!(Token::Number(-0.0).to_string(), "-0");
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Punct {
/// `+`.
Plus,
Exp,
}
+impl Punct {
+    /// Returns the fixed spelling of this punctuator or reserved word.
+    ///
+    /// Reserved words are returned in uppercase.
+    pub fn as_str(&self) -> &'static str {
+        match *self {
+            // Arithmetic operators and assignment.
+            Punct::Plus => "+",
+            Punct::Dash => "-",
+            Punct::Asterisk => "*",
+            Punct::Slash => "/",
+            Punct::Equals => "=",
+
+            // Brackets.
+            Punct::LParen => "(",
+            Punct::RParen => ")",
+            Punct::LSquare => "[",
+            Punct::RSquare => "]",
+            Punct::LCurly => "{",
+            Punct::RCurly => "}",
+
+            // Separators.
+            Punct::Comma => ",",
+            Punct::Semicolon => ";",
+            Punct::Colon => ":",
+
+            // Logical operators.
+            Punct::And => "AND",
+            Punct::Or => "OR",
+            Punct::Not => "NOT",
+
+            // Relational operators.
+            Punct::Eq => "EQ",
+            Punct::Ge => ">=",
+            Punct::Gt => ">",
+            Punct::Le => "<=",
+            Punct::Lt => "<",
+            Punct::Ne => "~=",
+
+            // Other reserved words and exponentiation.
+            Punct::All => "ALL",
+            Punct::By => "BY",
+            Punct::To => "TO",
+            Punct::With => "WITH",
+            Punct::Exp => "**",
+        }
+    }
+}
+impl Display for Punct {
+    /// Writes the spelling returned by [`Punct::as_str`].
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        f.write_str(self.as_str())
+    }
+}
+
/// Tokens that only appear in macros.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum MacroToken {
/// first character, so this represents an underscore found on its own.
Underscore,
}
+
+impl MacroToken {
+    /// Returns the text of this macro token.
+    ///
+    /// For `MacroId` this is the contained identifier; every other variant
+    /// maps to its fixed one-character spelling.
+    pub fn as_str(&self) -> &str {
+        match self {
+            Self::MacroId(name) => name,
+            Self::Bang => "!",
+            Self::Percent => "%",
+            Self::Question => "?",
+            Self::Backtick => "`",
+            Self::Dot => ".",
+            Self::Underscore => "_",
+        }
+    }
+}
+
+impl Display for MacroToken {
+    /// Writes the macro token's text: the identifier itself for `MacroId`,
+    /// and the spelling from [`MacroToken::as_str`] for every other variant.
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        if let MacroToken::MacroId(name) = self {
+            write!(f, "{name}")
+        } else {
+            f.write_str(self.as_str())
+        }
+    }
+}
pub mod lex;
pub mod prompt;
pub mod message;
-pub mod macros;
+//pub mod macros;
use std::collections::HashMap;
+use thiserror::Error as ThisError;
use unicase::UniCase;
use crate::{
message::Location,
};
+/// Errors reported while parsing a macro call.
+///
+/// Both variants describe a command that ended before the argument being
+/// parsed was complete.
+#[derive(Debug, Clone, ThisError)]
+pub enum MacroError {
+    /// The command ended while an argument still needed more tokens.
+    #[error(
+        "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}."
+    )]
+    ExpectedMoreTokens {
+        /// Number of tokens that were still expected.
+        n: usize,
+        /// Name of the argument being parsed.
+        arg: String,
+        /// Name of the macro being called.
+        macro_: String,
+    },
+
+    /// The command ended while a specific token was still expected.
+    #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")]
+    ExpectedToken {
+        /// Text of the token that was expected.
+        token: String,
+        /// Name of the argument being parsed.
+        arg: String,
+        /// Name of the macro being called.
+        macro_: String,
+    },
+}
+
/// A PSPP macro as defined with `!DEFINE`.
pub struct Macro {
/// The macro's name. This is an ordinary identifier except that it is
body: Vec<BodyToken>,
}
+impl Macro {
+    /// Returns the state in which parsing a call to this macro begins.
+    ///
+    /// * No parameters: there is nothing to parse, so the call is already
+    ///   `Finished`.
+    /// * First parameter is a keyword (non-positional) parameter: its keyword
+    ///   must come first, so parsing starts in `Keyword`.
+    /// * First parameter is positional with an enclosed value: parsing starts
+    ///   in `Enclose`, expecting the opening delimiter.
+    /// * Any other positional first parameter: parsing starts in `NewArg`.
+    ///
+    /// NOTE(review): `Parameter::is_positional` is not visible from here; this
+    /// assumes it returns true for parameters that are matched by position
+    /// rather than by keyword -- confirm against its definition.
+    fn initial_state(&self) -> MacroCallState {
+        if self.parameters.is_empty() {
+            MacroCallState::Finished
+        } else if !self.parameters[0].is_positional() {
+            // Only keyword parameters are introduced by a keyword; a
+            // positional parameter goes straight to its value.
+            MacroCallState::Keyword
+        } else if let ValueType::Enclose(_, _) = self.parameters[0].arg {
+            MacroCallState::Enclose
+        } else {
+            MacroCallState::NewArg
+        }
+    }
+}
+
struct Parameter {
/// `!name` or `!1`.
name: String,
/// The tokens don't include white space, etc. between them.
default: Vec<BodyToken>,
- /// Macro-expand the argument?
- expand_arg: bool,
+ /// Macro-expand the value?
+ expand_value: bool,
/// How the argument is specified.
- arg: Arg,
+ arg: ValueType,
}
impl Parameter {
}
}
-enum Arg {
+enum ValueType {
/// Argument consists of `.0` tokens.
NTokens(usize),
type MacroSet = HashMap<UniCase<String>, Macro>;
pub enum MacroCallState {
- Arg,
+ /// Starting a new argument.
+ NewArg,
+
+ /// Accumulating tokens toward the end of any type of argument.
+ ContinueArg,
+
+ /// Expecting the opening delimiter of an ARG_ENCLOSE argument.
Enclose,
+
+ /// Expecting a keyword for a keyword argument.
Keyword,
+
+ /// Expecting an equal sign for a keyword argument.
Equals,
+
+ /// Macro fully parsed and ready for expansion.
Finished,
}
-pub struct MacroCallBuilder<'a> {
+/// Macro call parser FSM.
+pub struct MacroCall<'a> {
macros: &'a MacroSet,
+ macro_: &'a Macro,
+ state: MacroCallState,
+ args: Vec<Vec<BodyToken>>,
+
+ /// Length of macro call so far.
+ n_tokens: usize,
}
-impl<'a> MacroCallBuilder<'a> {
- fn new(macro_set: &'a MacroSet, token: &Token) -> Option<Self> {
+impl<'a> MacroCall<'a> {
+ pub fn new(macros: &'a MacroSet, token: &Token) -> Option<Self> {
+ if macros.is_empty() {
+ return None;
+ }
let macro_name = match token {
Token::Id(s) => s,
Token::MacroToken(MacroToken::MacroId(s)) => s,
_ => return None,
- }.clone();
- let Some(macro_) = macro_set.get(&UniCase::new(macro_name)) else {
+ }
+ .clone();
+ // XXX Unicase::new() is very expensive. We probably need to define our
+ // own Unicase-alike that has a proper Borrow<> implementation.
+ let Some(macro_) = macros.get(&UniCase::new(macro_name)) else {
return None;
};
+ Some(Self {
+ macros,
+ macro_,
+ state: macro_.initial_state(),
+ args: Vec::with_capacity(macro_.parameters.len()),
+ n_tokens: 1,
+ })
+ }
+
+    /// Handles `token` while the most recently started argument is still
+    /// being accumulated.
+    ///
+    /// Only end-of-input handling exists so far: when `token` ends the
+    /// command or the input and the current parameter takes a fixed number of
+    /// tokens, `error` is called with `MacroError::ExpectedMoreTokens`. The
+    /// other value types are not implemented yet, and any other token is
+    /// currently ignored. `syntax` is not used yet.
+    ///
+    /// Panics if no argument has been started.
+    fn push_continue_arg(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) {
+        if !matches!(token, Token::EndCommand | Token::End) {
+            return;
+        }
+
+        // The argument under construction is the last one in `args`, and it
+        // belongs to the parameter at the same index.
+        let param = &self.macro_.parameters[self.args.len() - 1];
+        let arg = self.args.last().unwrap();
+        match param.arg {
+            ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens {
+                n: n - arg.len(),
+                arg: param.name.clone(),
+                macro_: self.macro_.name.clone(),
+            }),
+            ValueType::CharEnd(_) | ValueType::Enclose(_, _) => todo!(),
+            ValueType::CmdEnd => todo!(),
+        }
+    }
+    /// Handles `token` when a new argument is about to start.
+    ///
+    /// A token that ends the command or the input finishes the macro call.
+    /// Any other token opens a fresh, empty argument, switches the parser to
+    /// `ContinueArg`, and is then processed as the first token of that
+    /// argument.
+    fn push_new_arg(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) {
+        match token {
+            Token::EndCommand | Token::End => self.mc_finished(),
+            _ => {
+                self.args.push(Vec::new());
+                self.state = MacroCallState::ContinueArg;
+                self.push_continue_arg(token, syntax, error);
+            }
+        }
+    }
+    /// Feeds the next `token` of a macro call to the parser, dispatching on
+    /// the current state.
+    ///
+    /// Only the `NewArg` and `ContinueArg` states are handled so far; every
+    /// other state is still unimplemented.
+    ///
+    /// NOTE(review): the `-> !` return type looks like a placeholder, since
+    /// the implemented arms return normally -- confirm the intended type.
+    pub fn push(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) -> ! {
+        match self.state {
+            MacroCallState::NewArg => self.push_new_arg(token, syntax, error),
+            MacroCallState::ContinueArg => self.push_continue_arg(token, syntax, error),
+            MacroCallState::Enclose
+            | MacroCallState::Keyword
+            | MacroCallState::Equals
+            | MacroCallState::Finished => todo!(),
+        }
    }
}
}
pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
- Identifier::new(name, self.encoding)
+ Identifier::from_encoding(name, self.encoding)
}
}
decoder: &Decoder,
) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
let var_name = decoder.decode(&self.var_name);
- let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
+ let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
.map_err(Warning::InvalidLongStringValueLabelName)?;
let mut labels = Vec::with_capacity(self.labels.len());