collections::{hash_map::Entry, HashMap},
error::Error as StdError,
fmt::{Display, Formatter, Result as FmtResult},
- iter::{repeat, Peekable},
- str::Chars,
+ iter::repeat,
};
use crate::endian::{Endian, ToBytes};
/// Error produced while processing input, with optional details about where
/// it occurred.
pub struct Error {
/// Name of the input file, if one was supplied.
pub file_name: Option<String>,
/// Line number the error is attributed to, if known.
pub line_number: Option<usize>,
/// Source text of the token involved, if any; `Display` prints it as
/// `at '<token>': ` ahead of the message.
+ pub token: Option<String>,
/// Description of what went wrong.
pub message: String,
}
impl Error {
/// Builds an `Error`, copying the borrowed file name and token text into
/// owned `String`s; `line_number` and `message` are stored as given.
- fn new(file_name: Option<&str>, line_number: Option<usize>, message: String) -> Error {
+ fn new(
+ file_name: Option<&str>,
+ line_number: Option<usize>,
+ token: Option<&str>,
+ message: String,
+ ) -> Error {
Error {
file_name: file_name.map(String::from),
line_number,
+ token: token.map(String::from),
message,
}
}
impl Display for Error {
/// Writes the error as an optional location prefix, an optional
/// `at '<token>': ` part, and finally the message text.
fn fmt(&self, f: &mut Formatter) -> FmtResult {
- if let Some(ref file_name) = self.file_name {
- write!(f, "{file_name}:")?;
- if self.line_number.is_none() {
- write!(f, " ")?;
- }
// Location prefix: `file:line: `, `file: `, `line N: `, or nothing at all,
// depending on which of the two pieces are present.
+ match (self.file_name.as_ref(), self.line_number) {
+ (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?,
+ (Some(ref file_name), None) => write!(f, "{file_name}: ")?,
+ (None, Some(line_number)) => write!(f, "line {line_number}: ")?,
+ (None, None) => (),
}
- if let Some(line_number) = self.line_number {
- write!(f, "{line_number}: ")?;
// Quote the offending token text when one was recorded.
+ if let Some(ref token) = self.token {
+ write!(f, "at '{token}': ")?;
}
write!(f, "{}", self.message)
}
let mut symbol_table = HashMap::new();
let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
let output = if !symbol_table.is_empty() {
+ for (k, v) in symbol_table.iter() {
+ println!("{k} => {v:?}");
+ }
for (k, v) in symbol_table.iter() {
if v.is_none() {
Err(Error::new(
input_file_name,
None,
+ None,
format!("label {k} used but never defined"),
))?
}
let initial_len = output.len();
match lexer.take()? {
- Token::Integer(integer) => output.extend_from_slice(&lexer.endian.to_bytes(integer)),
+ Token::Integer(integer) => {
+ let Ok(integer): Result<i32, _> = integer.try_into() else {
+ Err(lexer.error(format!(
+ "{integer} is not in the valid range [{},{}]",
+ u32::min_value(),
+ u32::max_value()
+ )))?
+ };
+ output.extend_from_slice(&lexer.endian.to_bytes(integer))},
Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)),
Token::PcSysmis => {
output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff])
Token::I64 => put_integers::<i64, 8>(lexer, "i64", output)?,
Token::String(string) => output.extend_from_slice(string.as_bytes()),
Token::S(size) => {
- let Some(Token::String(ref string)) = lexer.token else {
+ let Some((Token::String(ref string), _)) = lexer.token else {
Err(lexer.error(format!("string expected after 's{size}'")))?
};
let len = string.len();
lexer.get()?;
}
Token::LParen => {
- while lexer.token != Some(Token::RParen) {
+ while !matches!(lexer.token, Some((Token::RParen, _))) {
parse_data_item(lexer, output, symbol_table)?;
}
lexer.get()?;
Token::Count => put_counted_items::<u32, 4>(lexer, "COUNT", output, symbol_table)?,
Token::Count8 => put_counted_items::<u8, 1>(lexer, "COUNT8", output, symbol_table)?,
Token::Hex => {
- let Some(Token::String(ref string)) = lexer.token else {
+ let Some((Token::String(ref string), _)) = lexer.token else {
Err(lexer.error(String::from("string expected after 'hex'")))?
};
let mut i = string.chars();
}
}
Token::Label(name) => {
+ println!("define {name}");
let value = output.len() as u32;
- match symbol_table.entry(name) {
+ match symbol_table.entry(name.clone()) {
Entry::Vacant(v) => {
v.insert(Some(value));
}
- Entry::Occupied(o) => {
- if let Some(v) = o.get() {
- if *v != value {
- Err(lexer.error(String::from("syntax error")))?
+ Entry::Occupied(mut o) => {
+ match o.get() {
+ Some(v) => {
+ if *v != value {
+ Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))?
+ }
}
+ None => drop(o.insert(Some(value))),
}
}
};
+ return Ok(true);
}
Token::At(name) => {
- let mut value = symbol_table.entry(name).or_insert(None).unwrap_or(0);
- lexer.get()?;
+ let mut value = symbol_table
+ .entry(name.clone())
+ .or_insert(None)
+ .unwrap_or(0);
+ println!("{name} has value {value}");
loop {
let plus = match lexer.token {
- Some(Token::Plus) => true,
- Some(Token::Minus) => false,
+ Some((Token::Plus, _)) => true,
+ Some((Token::Minus, _)) => false,
_ => break,
};
lexer.get()?;
let operand = match lexer.token {
- Some(Token::At(ref name)) => if let Some(value) = symbol_table.get(name) {
+ Some((Token::At(ref name), _)) => if let Some(value) = symbol_table.get(name) {
*value
} else {
symbol_table.insert(name.clone(), None);
None
}
.unwrap_or(0),
- Some(Token::Integer(integer)) => integer
+ Some((Token::Integer(integer), _)) => integer
.try_into()
.map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?,
_ => Err(lexer.error(String::from("expecting @label or integer literal")))?,
}
_ => (),
};
- if lexer.token == Some(Token::Asterisk) {
+ if let Some((Token::Asterisk, _)) = lexer.token {
lexer.get()?;
let Token::Integer(count) = lexer.take()? else {
Err(lexer.error(String::from("positive integer expected after '*'")))?
}
}
match lexer.token {
- Some(Token::Semicolon) => {
+ Some((Token::Semicolon, _)) => {
lexer.get()?;
}
- Some(Token::RParen) => (),
+ Some((Token::RParen, _)) => (),
_ => Err(lexer.error(String::from("';' expected")))?,
}
Ok(true)
{
let old_size = output.len();
output.extend_from_slice(&lexer.endian.to_bytes(T::zero()));
- if lexer.token != Some(Token::LParen) {
+ let start = output.len();
+ if !matches!(lexer.token, Some((Token::LParen, _))) {
Err(lexer.error(format!("'(' expected after '{name}'")))?
}
lexer.get()?;
- while lexer.token != Some(Token::RParen) {
+ while !matches!(lexer.token, Some((Token::RParen, _))) {
parse_data_item(lexer, output, symbol_table)?;
}
lexer.get()?;
- let delta = output.len() - old_size;
+ let delta = output.len() - start;
let Ok(delta): Result<T, _> = delta.try_into() else {
Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))?
};
T: Bounded + Display + TryFrom<i64> + Copy,
Endian: ToBytes<T, N>,
{
+ println!("put_integers {:?}", lexer.token);
let mut n = 0;
while let Some(integer) = lexer.take_if(|t| match t {
Token::Integer(integer) => Some(*integer),
_ => None,
})? {
+ println!("got integer {integer}");
let Ok(integer) = integer.try_into() else {
Err(lexer.error(format!(
"{integer} is not in the valid range [{},{}]",
output.extend_from_slice(&lexer.endian.to_bytes(integer));
n += 1;
}
+ println!("put_integers {:?} {n}", lexer.token);
if n == 0 {
Err(lexer.error(format!("integer expected after '{name}'")))?
}
}
/// Tokenizer state: the unread input plus one token of lookahead.
struct Lexer<'a> {
- iter: Peekable<Chars<'a>>,
- token: Option<Token>,
/// Input text that has not been tokenized yet.
+ input: &'a str,
/// Current token paired with the source text it was lexed from, or `None`
/// when there is no current token.
+ token: Option<(Token, &'a str)>,
/// File name passed along to error values, if any.
input_file_name: Option<&'a str>,
/// Line counter, advanced by the number of newlines skipped between tokens.
line_number: usize,
/// Byte order used when encoding values; also read by the `ENDIAN` keyword.
endian: Endian,
}
/// Skips leading blanks, `<`/`>` characters, newlines and `#` comments.
///
/// A comment runs from `#` to the end of its line. Returns the remaining
/// input, starting at the first character that could begin a token, together
/// with the number of `\n` characters that were skipped so the caller can keep
/// its line counter up to date. If a comment runs to the end of the input
/// without a terminating newline, the remaining input is empty.
fn skip_comments(mut s: &str) -> (&str, usize) {
    let mut n_newlines = 0;
    loop {
        // Skip every whitespace character, not just space/tab/CR, so that form
        // feeds, vertical tabs and Unicode spaces are ignored between tokens.
        // '\n' is excluded here because it has to be counted below.
        s = s.trim_start_matches(|c: char| {
            c != '\n' && (c.is_whitespace() || matches!(c, '<' | '>'))
        });

        if let Some(comment) = s.strip_prefix('#') {
            match comment.split_once('\n') {
                Some((_, rest)) => {
                    s = rest;
                    n_newlines += 1;
                }
                // Unterminated comment: it swallows the rest of the input.
                None => return ("", n_newlines),
            }
        } else if let Some(rest) = s.strip_prefix('\n') {
            s = rest;
            n_newlines += 1;
        } else {
            return (s, n_newlines);
        }
    }
}
+
impl<'a> Lexer<'a> {
fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result<Lexer<'a>> {
let mut lexer = Lexer {
- iter: input.chars().peekable(),
+ input,
token: None,
input_file_name,
line_number: 1,
Ok(lexer)
}
/// Creates an `Error` tagged with the lexer's file name and current line
/// number, plus the source text of the current token when there is one.
fn error(&self, message: String) -> Error {
- Error::new(self.input_file_name, Some(self.line_number), message)
// `repr` is the source slice stored next to the current token, if any.
+ let repr = self.token.as_ref().map(|(_, repr)| *repr);
+ Error::new(self.input_file_name, Some(self.line_number), repr, message)
}
/// Returns the current token and advances the lexer to the following one.
///
/// Fails with "unexpected end of input" when there is no current token, and
/// propagates any error raised while lexing the following token.
fn take(&mut self) -> Result<Token> {
let Some(token) = self.token.take() else {
Err(self.error(String::from("unexpected end of input")))?
};
self.token = self.next()?;
- Ok(token)
// Drop the source-text half of the pair; callers only receive the `Token`.
+ Ok(token.0)
}
fn take_if<F, T>(&mut self, condition: F) -> Result<Option<T>>
where
let Some(ref token) = self.token else {
return Ok(None);
};
- match condition(token) {
+ match condition(&token.0) {
Some(value) => {
self.token = self.next()?;
Ok(Some(value))
Err(self.error(String::from("unexpected end of input")))?
} else {
self.token = self.next()?;
- Ok((&self.token).into())
+ match self.token {
+ Some((ref token, _)) => Ok(Some(token)),
+ None => Ok(None),
+ }
}
}
/// Lexes one token from the front of `self.input`.
///
/// Returns the token together with the slice of source text it was read
/// from, or `None` once only blanks and comments remain. On success
/// `self.input` is advanced past the token, and `self.line_number` grows by
/// the number of newlines skipped before it. Malformed numbers, unterminated
/// strings, unknown keywords and unexpected characters are reported through
/// `self.error`.
- fn next(&mut self) -> Result<Option<Token>> {
+ fn next(&mut self) -> Result<Option<(Token, &'a str)>> {
// Get the first character of the token, skipping past white space and
// comments.
- let c = loop {
- let Some(c) = self.iter.next() else {
- return Ok(None);
- };
- let c = if c == '#' {
- loop {
- match self.iter.next() {
- None => return Ok(None),
- Some('\n') => break,
- _ => (),
- }
- }
- '\n'
- } else {
- c
- };
- if c == '\n' {
- self.line_number += 1
- } else if !c.is_whitespace() && c != '<' && c != '>' {
- break c;
- }
- };
+ let (s, n_newlines) = skip_comments(self.input);
+ self.line_number += n_newlines;
+ self.input = s;
- let token =
- match c {
- c if c.is_ascii_digit() || c == '-' => {
- let mut s = String::from(c);
- while let Some(c) = self
- .iter
- .next_if(|&c| c.is_ascii_digit() || c.is_alphabetic() || c == '.')
- {
- s.push(c);
- }
-
- if s == "-" {
- Token::Minus
- } else if !s.contains('.') {
- Token::Integer(s.parse().map_err(|msg| {
- self.error(format!("bad integer literal '{s}' ({msg})"))
- })?)
- } else {
- Token::Float(s.parse().map_err(|msg| {
- self.error(format!("bad float literal '{s}' ({msg})"))
- })?)
- }
- }
- '"' => {
- let mut s = String::new();
- loop {
- match self.iter.next() {
- None => Err(self.error(String::from("end-of-file inside string")))?,
- Some('\n') => Err(self.error(String::from("new-line inside string")))?,
- Some('"') => break,
- Some(c) => s.push(c),
- }
- }
- Token::String(s)
- }
- ';' => Token::Semicolon,
- '*' => Token::Asterisk,
- '+' => Token::Plus,
- '(' => Token::LParen,
- ')' => Token::RParen,
- c if c.is_alphabetic() || c == '@' || c == '_' => {
- let mut s = String::from(c);
- while let Some(c) = self.iter.next_if(|&c| {
- c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '_'
- }) {
- s.push(c);
- }
- if self.iter.next_if_eq(&':').is_some() {
- Token::Label(s)
- } else if s.starts_with('@') {
- Token::At(s)
- } else if let Some(count) = s.strip_prefix('s') {
// Remember where the token starts so its source text can be sliced out once
// the end is known.
+ let start = s;
+ let mut iter = s.chars();
+ let Some(c) = iter.next() else {
+ return Ok(None);
+ };
// Each arm yields the token plus the input that follows it.
+ let (token, rest) = match c {
// A run of digits, letters, '.' and '-': a lone "-" is the minus operator, a
// "0x" prefix selects hexadecimal, a '.' anywhere selects a float, and
// anything else is parsed as a decimal integer.
+ c if c.is_ascii_digit() || c == '-' => {
+ let len = s
+ .find(|c: char| {
+ !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-')
+ })
+ .unwrap_or_else(|| s.len());
+ let (number, rest) = s.split_at(len);
+ let token = if number == "-" {
+ Token::Minus
+ } else if let Some(digits) = number.strip_prefix("0x") {
+ Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| {
+ self.error(format!("bad integer literal '{number}' ({msg})"))
+ })?)
+ } else if !number.contains('.') {
+ Token::Integer(number.parse().map_err(|msg| {
+ self.error(format!("bad integer literal '{number}' ({msg})"))
+ })?)
+ } else {
+ Token::Float(number.parse().map_err(|msg| {
+ self.error(format!("bad float literal '{number}' ({msg})"))
+ })?)
+ };
+ (token, rest)
+ }
// Quoted string: runs to the next '"'; there is no escape handling here and
// a newline before the closing quote is an error.
+ '"' => {
+ let s = iter.as_str();
+ let Some(len) = s.find(['\n', '"']) else {
+ Err(self.error(String::from("end-of-file inside string")))?
+ };
+ let (string, rest) = s.split_at(len);
// NOTE(review): when this fails, `rest` starts at the newline and runs to the
// end of the input, so the message below embeds all remaining input. The
// removed code reported plain "new-line inside string" — confirm intent.
+ let Some(rest) = rest.strip_prefix('"') else {
+ Err(self.error(format!("new-line inside string ({string}...{rest})")))?
+ };
+ (Token::String(string.into()), rest)
+ }
+ ';' => (Token::Semicolon, iter.as_str()),
+ '*' => (Token::Asterisk, iter.as_str()),
+ '+' => (Token::Plus, iter.as_str()),
+ '(' => (Token::LParen, iter.as_str()),
+ ')' => (Token::RParen, iter.as_str()),
// Identifier-like word. In order of precedence: a trailing ':' makes it a
// label definition, a leading '@' a label reference, a leading 's' a counted
// string size, and otherwise it must be one of the fixed keywords.
+ c if c.is_alphabetic() || c == '@' || c == '_' => {
+ let len = s
+ .find(|c: char| {
+ !(c.is_ascii_digit()
+ || c.is_alphabetic()
+ || c == '@'
+ || c == '.'
+ || c == '_')
+ })
+ .unwrap_or_else(|| s.len());
+ let (s, rest) = s.split_at(len);
+ if let Some(rest) = rest.strip_prefix(':') {
+ (Token::Label(s.into()), rest)
+ } else if let Some(name) = s.strip_prefix('@') {
+ (Token::At(name.into()), rest)
+ } else if let Some(count) = s.strip_prefix('s') {
+ let token =
Token::S(count.parse().map_err(|msg| {
self.error(format!("bad counted string '{s}' ({msg})"))
- })?)
- } else {
- match &s[..] {
- "i8" => Token::I8,
- "i16" => Token::I16,
- "i64" => Token::I64,
- "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
- "PCSYSMIS" => Token::PcSysmis,
- "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
- "HIGHEST" => Token::Float(f64::MAX.into()),
- "ENDIAN" => {
- Token::Integer(if self.endian == Endian::Big { 1 } else { 2 })
- }
- "COUNT" => Token::Count,
- "COUNT8" => Token::Count8,
- "hex" => Token::Hex,
- _ => Err(self.error(format!("invalid token '{s}'")))?,
- }
- }
+ })?);
+ (token, rest)
+ } else {
+ let token = match &s[..] {
+ "i8" => Token::I8,
+ "i16" => Token::I16,
+ "i64" => Token::I64,
+ "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
+ "PCSYSMIS" => Token::PcSysmis,
+ "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
+ "HIGHEST" => Token::Float(f64::MAX.into()),
+ "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
+ "COUNT" => Token::Count,
+ "COUNT8" => Token::Count8,
+ "hex" => Token::Hex,
+ _ => Err(self.error(format!("invalid token '{s}'")))?,
+ };
+ (token, rest)
}
- _ => Err(self.error(format!("invalid input byte '{c}'")))?,
- };
- Ok(Some(token))
+ }
+ _ => Err(self.error(format!("invalid input byte '{c}'")))?,
+ };
+ self.input = rest;
// The token's source text is whatever lies between `start` and `rest`.
+ let repr = &start[..start.len() - rest.len()];
// NOTE(review): this prints every token to stdout; it looks like leftover
// debugging output — confirm before merging.
+ println!("{token:?} {repr}");
+ Ok(Some((token, repr)))
}
}
"01 Jan 11"; "20:53:52";
"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
i8 0 *3;
+"#;
+ let output = sack(input, None, Endian::Big)?;
+ HexView::new(&output).print()?;
+ Ok(())
+ }
+
+ #[test]
+ fn pcp_sack() -> Result<()> {
+ let input = r#"
+# File header.
+2; 0;
+@MAIN; @MAIN_END - @MAIN;
+@VARS; @VARS_END - @VARS;
+@LABELS; @LABELS_END - @LABELS;
+@DATA; @DATA_END - @DATA;
+(0; 0) * 11;
+i8 0 * 128;
+
+MAIN:
+ i16 1; # Fixed.
+ s62 "PCSPSS PSPP synthetic test product";
+ PCSYSMIS;
+ 0; 0; i16 1; # Fixed.
+ i16 0;
+ i16 15;
+ 1;
+ i16 0; # Fixed.
+ 1;
+ s8 "11/28/14";
+ s8 "15:11:00";
+ s64 "PSPP synthetic test file";
+MAIN_END:
+
+VARS:
+ 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
+ 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
+ 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;
+
+ # Numeric variable, no label or missing values.
+ 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;
+
+ # Numeric variable, variable label.
+ 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;
+
+ # Numeric variable with missing value.
+ 0; 0; 0; 0x050800; s8 "NUM3"; 1.0;
+
+ # Numeric variable, variable label and missing value.
+ 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;
+
+ # String variable, no label or missing values.
+ 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;
+
+ # String variable, variable label.
+ 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;
+
+ # String variable with missing value.
+ 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";
+
+ # String variable, variable label and missing value.
+ 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";
+
+ # Long string variable
+ 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
+ 0 * 8;
+
+ # Long string variable with variable label
+ 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
+ 0 * 8;
+VARS_END:
+
+LABELS:
+ 3; i8 0 0 0; LABELS_OFS: i8 0;
+ NUM2_LABEL: COUNT8("Numeric variable 2's label");
+ NUM4_LABEL: COUNT8("Another numeric variable label");
+ STR2_LABEL: COUNT8("STR2's variable label");
+ STR4_LABEL: COUNT8("STR4's variable label");
+ STR6_LABEL: COUNT8("Another string variable's label");
+LABELS_END:
+
+DATA:
+ 0.0; "11/28/14"; 1.0;
+ 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
+ s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
+DATA_END:
"#;
let output = sack(input, None, Endian::Big)?;
HexView::new(&output).print()?;