1 use float_next_after::NextAfter;
2 use num::{Bounded, Zero};
3 use ordered_float::OrderedFloat;
5 collections::{hash_map::Entry, HashMap},
6 error::Error as StdError,
7 fmt::{Display, Formatter, Result as FmtResult},
11 use crate::endian::{Endian, ToBytes};
/// Result alias whose error type parameter `F` defaults to [`Error`], so
/// signatures can be written as `Result<T>` while a different error type can
/// still be supplied explicitly as `Result<T, Other>`.
13 pub type Result<T, F = Error> = std::result::Result<T, F>;
17 pub file_name: Option<String>,
18 pub line_number: Option<usize>,
19 pub token: Option<String>,
25 file_name: Option<&str>,
26 line_number: Option<usize>,
31 file_name: file_name.map(String::from),
33 token: token.map(String::from),
// Marks `Error` as an error type via the `StdError` trait. The body is empty,
// so every trait method uses its provided default implementation.
39 impl StdError for Error {}
41 impl Display for Error {
42 fn fmt(&self, f: &mut Formatter) -> FmtResult {
43 match (self.file_name.as_ref(), self.line_number) {
44 (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?,
45 (Some(ref file_name), None) => write!(f, "{file_name}: ")?,
46 (None, Some(line_number)) => write!(f, "line {line_number}: ")?,
49 if let Some(ref token) = self.token {
50 write!(f, "at '{token}': ")?;
52 write!(f, "{}", self.message)
56 pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result<Vec<u8>> {
57 let mut symbol_table = HashMap::new();
58 let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
59 let output = if !symbol_table.is_empty() {
60 for (k, v) in symbol_table.iter() {
61 println!("{k} => {v:?}");
63 for (k, v) in symbol_table.iter() {
69 format!("label {k} used but never defined"),
73 _sack(input, input_file_name, endian, &mut symbol_table)?
82 input_file_name: Option<&str>,
84 symbol_table: &mut HashMap<String, Option<u32>>,
85 ) -> Result<Vec<u8>> {
86 let mut lexer = Lexer::new(input, input_file_name, endian)?;
87 let mut output = Vec::new();
88 while parse_data_item(&mut lexer, &mut output, symbol_table)? {}
95 symbol_table: &mut HashMap<String, Option<u32>>,
97 if lexer.token.is_none() {
101 let initial_len = output.len();
102 match lexer.take()? {
103 Token::Integer(integer) => {
104 if let Ok(integer) = TryInto::<i32>::try_into(integer) {
105 output.extend_from_slice(&lexer.endian.to_bytes(integer));
106 } else if let Ok(integer) = TryInto::<u32>::try_into(integer) {
107 output.extend_from_slice(&lexer.endian.to_bytes(integer));
109 Err(lexer.error(format!(
110 "{integer} is not in the valid range [{},{}]",
116 Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)),
118 output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff])
120 Token::I8 => put_integers::<u8, 1>(lexer, "i8", output)?,
121 Token::I16 => put_integers::<u16, 2>(lexer, "i16", output)?,
122 Token::I64 => put_integers::<i64, 8>(lexer, "i64", output)?,
123 Token::String(string) => output.extend_from_slice(string.as_bytes()),
125 let Some((Token::String(ref string), _)) = lexer.token else {
126 Err(lexer.error(format!("string expected after 's{size}'")))?
128 let len = string.len();
130 Err(lexer.error(format!(
131 "{len}-byte string is longer than pad length {size}"
134 output.extend_from_slice(string.as_bytes());
135 output.extend(repeat(b' ').take(size - len));
139 while !matches!(lexer.token, Some((Token::RParen, _))) {
140 parse_data_item(lexer, output, symbol_table)?;
144 Token::Count => put_counted_items::<u32, 4>(lexer, "COUNT", output, symbol_table)?,
145 Token::Count8 => put_counted_items::<u8, 1>(lexer, "COUNT8", output, symbol_table)?,
147 let Some((Token::String(ref string), _)) = lexer.token else {
148 Err(lexer.error(String::from("string expected after 'hex'")))?
150 let mut string = &string[..];
152 string = string.trim_start();
153 if string.is_empty() {
157 let mut i = string.chars();
158 let Some(c0) = i.next() else { return Ok(true) };
159 let Some(c1) = i.next() else {
160 Err(lexer.error(String::from("hex string has odd number of characters")))?
163 let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else {
164 Err(lexer.error(String::from("invalid digit in hex string")))?
166 let byte = digit0 * 16 + digit1;
167 output.push(byte as u8);
173 Token::Label(name) => {
174 println!("define {name}");
175 let value = output.len() as u32;
176 match symbol_table.entry(name.clone()) {
177 Entry::Vacant(v) => {
178 v.insert(Some(value));
180 Entry::Occupied(mut o) => {
184 Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))?
187 None => drop(o.insert(Some(value))),
194 let mut value = *symbol_table.entry(name.clone()).or_insert(None);
196 let plus = match lexer.token {
197 Some((Token::Plus, _)) => true,
198 Some((Token::Minus, _)) => false,
203 let operand = match lexer.token {
204 Some((Token::At(ref name), _)) => {
205 *symbol_table.entry(name.clone()).or_insert(None)
207 Some((Token::Integer(integer), _)) => Some(
210 .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?,
212 _ => Err(lexer.error(String::from("expecting @label or integer literal")))?,
216 value = match (value, operand) {
217 (Some(a), Some(b)) => Some(
224 lexer.error(String::from("overflow in offset arithmetic"))
230 let value = value.unwrap_or(0);
231 output.extend_from_slice(&lexer.endian.to_bytes(value));
235 if let Some((Token::Asterisk, _)) = lexer.token {
237 let Token::Integer(count) = lexer.take()? else {
238 Err(lexer.error(String::from("positive integer expected after '*'")))?
241 Err(lexer.error(String::from("positive integer expected after '*'")))?
243 let final_len = output.len();
245 output.extend_from_within(initial_len..final_len);
249 Some((Token::Semicolon, _)) => {
252 Some((Token::RParen, _)) => (),
253 _ => Err(lexer.error(String::from("';' expected")))?,
258 fn put_counted_items<T, const N: usize>(
261 output: &mut Vec<u8>,
262 symbol_table: &mut HashMap<String, Option<u32>>,
265 T: Zero + TryFrom<usize>,
266 Endian: ToBytes<T, N>,
268 let old_size = output.len();
269 output.extend_from_slice(&lexer.endian.to_bytes(T::zero()));
270 let start = output.len();
271 if !matches!(lexer.token, Some((Token::LParen, _))) {
272 Err(lexer.error(format!("'(' expected after '{name}'")))?
275 while !matches!(lexer.token, Some((Token::RParen, _))) {
276 parse_data_item(lexer, output, symbol_table)?;
279 let delta = output.len() - start;
280 let Ok(delta): Result<T, _> = delta.try_into() else {
281 Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))?
283 let dest = &mut output[old_size..old_size + N];
284 dest.copy_from_slice(&lexer.endian.to_bytes(delta));
288 fn put_integers<T, const N: usize>(
291 output: &mut Vec<u8>,
294 T: Bounded + Display + TryFrom<i64> + Copy,
295 Endian: ToBytes<T, N>,
297 println!("put_integers {:?}", lexer.token);
299 while let Some(integer) = lexer.take_if(|t| match t {
300 Token::Integer(integer) => Some(*integer),
303 println!("got integer {integer}");
304 let Ok(integer) = integer.try_into() else {
305 Err(lexer.error(format!(
306 "{integer} is not in the valid range [{},{}]",
311 output.extend_from_slice(&lexer.endian.to_bytes(integer));
314 println!("put_integers {:?} {n}", lexer.token);
316 Err(lexer.error(format!("integer expected after '{name}'")))?
321 #[derive(PartialEq, Eq, Clone, Debug)]
324 Float(OrderedFloat<f64>),
346 token: Option<(Token, &'a str)>,
347 input_file_name: Option<&'a str>,
352 fn skip_comments(mut s: &str) -> (&str, usize) {
353 let mut n_newlines = 0;
355 s = s.trim_start_matches([' ', '\t', '\r', '<', '>']);
356 if let Some(remainder) = s.strip_prefix('#') {
357 let Some((_, remainder)) = remainder.split_once('\n') else {
362 } else if let Some(remainder) = s.strip_prefix('\n') {
373 fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result<Lexer<'a>> {
374 let mut lexer = Lexer {
381 lexer.token = lexer.next()?;
384 fn error(&self, message: String) -> Error {
385 let repr = self.token.as_ref().map(|(_, repr)| *repr);
386 Error::new(self.input_file_name, Some(self.line_number), repr, message)
388 fn take(&mut self) -> Result<Token> {
389 let Some(token) = self.token.take() else {
390 Err(self.error(String::from("unexpected end of input")))?
392 self.token = self.next()?;
395 fn take_if<F, T>(&mut self, condition: F) -> Result<Option<T>>
397 F: FnOnce(&Token) -> Option<T>,
399 let Some(ref token) = self.token else {
402 match condition(&token.0) {
404 self.token = self.next()?;
410 fn get(&mut self) -> Result<Option<&Token>> {
411 if self.token.is_none() {
412 Err(self.error(String::from("unexpected end of input")))?
414 self.token = self.next()?;
416 Some((ref token, _)) => Ok(Some(token)),
422 fn next(&mut self) -> Result<Option<(Token, &'a str)>> {
423 // Get the first character of the token, skipping past white space and
425 let (s, n_newlines) = skip_comments(self.input);
426 self.line_number += n_newlines;
430 let mut iter = s.chars();
431 let Some(c) = iter.next() else {
434 let (token, rest) = match c {
435 c if c.is_ascii_digit() || c == '-' => {
438 !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-')
441 let (number, rest) = s.split_at(len);
442 let token = if number == "-" {
444 } else if let Some(digits) = number.strip_prefix("0x") {
445 Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| {
446 self.error(format!("bad integer literal '{number}' ({msg})"))
448 } else if !number.contains('.') {
449 Token::Integer(number.parse().map_err(|msg| {
450 self.error(format!("bad integer literal '{number}' ({msg})"))
453 Token::Float(number.parse().map_err(|msg| {
454 self.error(format!("bad float literal '{number}' ({msg})"))
460 let s = iter.as_str();
461 let Some(len) = s.find(['\n', '"']) else {
462 Err(self.error(String::from("end-of-file inside string")))?
464 let (string, rest) = s.split_at(len);
465 let Some(rest) = rest.strip_prefix('"') else {
466 Err(self.error(format!("new-line inside string ({string}...{rest})")))?
468 (Token::String(string.into()), rest)
470 ';' => (Token::Semicolon, iter.as_str()),
471 '*' => (Token::Asterisk, iter.as_str()),
472 '+' => (Token::Plus, iter.as_str()),
473 '(' => (Token::LParen, iter.as_str()),
474 ')' => (Token::RParen, iter.as_str()),
475 c if c.is_alphabetic() || c == '@' || c == '_' => {
485 let (s, rest) = s.split_at(len);
486 if let Some(rest) = rest.strip_prefix(':') {
487 (Token::Label(s.into()), rest)
488 } else if let Some(name) = s.strip_prefix('@') {
489 (Token::At(name.into()), rest)
490 } else if let Some(count) = s.strip_prefix('s') {
492 Token::S(count.parse().map_err(|msg| {
493 self.error(format!("bad counted string '{s}' ({msg})"))
497 let token = match s {
501 "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
502 "PCSYSMIS" => Token::PcSysmis,
503 "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
504 "HIGHEST" => Token::Float(f64::MAX.into()),
505 "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
506 "COUNT" => Token::Count,
507 "COUNT8" => Token::Count8,
509 _ => Err(self.error(format!("invalid token '{s}'")))?,
514 _ => Err(self.error(format!("invalid input byte '{c}'")))?,
517 let repr = &start[..start.len() - rest.len()];
518 println!("{token:?} {repr}");
519 Ok(Some((token, repr)))
525 use crate::endian::Endian;
526 use crate::sack::sack;
528 use hexplay::HexView;
531 fn basic_sack() -> Result<()> {
533 "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
535 28; # Nominal case size
540 "01 Jan 11"; "20:53:52";
541 "PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
544 let output = sack(input, None, Endian::Big)?;
545 HexView::new(&output).print()?;
550 fn pcp_sack() -> Result<()> {
554 @MAIN; @MAIN_END - @MAIN;
555 @VARS; @VARS_END - @VARS;
556 @LABELS; @LABELS_END - @LABELS;
557 @DATA; @DATA_END - @DATA;
563 s62 "PCSPSS PSPP synthetic test product";
565 0; 0; i16 1; # Fixed.
573 s64 "PSPP synthetic test file";
577 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
578 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
579 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;
581 # Numeric variable, no label or missing values.
582 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;
584 # Numeric variable, variable label.
585 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;
587 # Numeric variable with missing value.
588 0; 0; 0; 0x050800; s8 "NUM3"; 1.0;
590 # Numeric variable, variable label and missing value.
591 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;
593 # String variable, no label or missing values.
594 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;
596 # String variable, variable label.
597 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;
599 # String variable with missing value.
600 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";
602 # String variable, variable label and missing value.
603 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";
605 # Long string variable
606 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
609 # Long string variable with variable label
610 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
615 3; i8 0 0 0; LABELS_OFS: i8 0;
616 NUM2_LABEL: COUNT8("Numeric variable 2's label");
617 NUM4_LABEL: COUNT8("Another numeric variable label");
618 STR2_LABEL: COUNT8("STR2's variable label");
619 STR4_LABEL: COUNT8("STR4's variable label");
620 STR6_LABEL: COUNT8("Another string variable's label");
624 0.0; "11/28/14"; 1.0;
625 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
626 s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
629 let output = sack(input, None, Endian::Big)?;
630 HexView::new(&output).print()?;