scan code passes all the tests
author: Ben Pfaff <blp@cs.stanford.edu>
Sun, 14 Jul 2024 19:38:40 +0000 (12:38 -0700)
committer: Ben Pfaff <blp@cs.stanford.edu>
Sun, 14 Jul 2024 19:38:40 +0000 (12:38 -0700)
rust/src/lex/scan/mod.rs
rust/src/lex/scan/test.rs
rust/src/lex/segment/mod.rs
rust/src/lex/segment/test.rs
rust/src/lex/token.rs

index 2e5f993311eb45c60be85c632f79c70dc973dcdf..5503a5bcc058469c29ee6ed0abef842e6cc58abe 100644 (file)
 
 use super::{
     segment::{Mode, Segment, Segmenter},
-    token::{Punct, Token, TokenError},
+    token::{MacroToken, Punct, Token},
 };
 use std::collections::VecDeque;
+use thiserror::Error as ThisError;
 
-/// Attempts to merge a sequence of tokens together into a single token. The
-/// tokens are taken from the beginning of `input`. If successful, removes one
-/// or more token from the beginning of `input` and returnss the merged
-/// token. More input tokens might be needed; if so, leaves `input` alone and
-/// returns `None`. In the latter case, the caller should add more tokens to the
-/// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient).
-///
-/// This performs two different kinds of token merging:
-///
-///   - String concatenation, where syntax like `"a" + "b"` is converted into a
-///   single string token.  This is definitely needed because the parser relies
-///   on it.
-///
-///   - Negative number merging, where syntax like `-5` is converted from a pair
-///     of tokens (a dash and a positive number) into a single token (a negative
-///     number).  This might not be needed anymore because the segmenter
-///     directly treats a dash followed by a number, with optional intervening
-///     white space, as a negative number.  It's only needed if we want
-///     intervening comments to be allowed or for part of the negative number
-///     token to be produced by macro expansion.
-pub fn merge_tokens(input: &mut VecDeque<Token>) -> Option<Token> {
-    match input.get(0)? {
-        Token::Punct(Punct::Dash) => match input.get(1)? {
-            Token::Number(number) if number.is_sign_positive() => {
-                let number = *number;
-                input.pop_front().unwrap();
-                input.pop_front().unwrap();
-                return Some(Token::Number(-number));
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
+pub enum ScanError {
+    /// Unterminated string constant.
+    #[error("Unterminated string constant.")]
+    ExpectedQuote,
+
+    /// Missing exponent.
+    #[error("Missing exponent following `{0}`")]
+    ExpectedExponent(String),
+
+    /// Odd length hex string.
+    #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
+    OddLengthHexString(usize),
+
+    /// Invalid hex digit.
+    #[error("Invalid hex digit {0:?}.")]
+    BadHexDigit(char),
+
+    /// Invalid length Unicode string.
+    #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
+    BadLengthUnicodeString(usize),
+
+    /// Invalid code point.
+    #[error("U+{0:04X} is not a valid Unicode code point.")]
+    BadCodePoint(u32),
+
+    /// Expected hexadecimal Unicode code point
+    #[error("Expected hexadecimal Unicode code point.")]
+    ExpectedCodePoint,
+
+    /// `DO REPEAT` nested too deeply.
+    #[error("`DO REPEAT` nested too deeply.")]
+    DoRepeatOverflow,
+
+    /// Unexpected character.
+    #[error("Unexpected character {0:?} in input.")]
+    UnexpectedChar(char),
+}
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum ScanToken {
+    Token(Token),
+    Error(ScanError),
+}
+
+impl ScanToken {
+    pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
+        match segment {
+            Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))),
+            Segment::QuotedString => {
+                // Trim quote mark from front and back.
+                let mut chars = s.chars();
+                let quote = chars.next().unwrap();
+                let s = chars.as_str().strip_suffix(quote).unwrap();
+
+                // Replace doubled quotes by single ones.
+                let (single_quote, double_quote) = match quote {
+                    '\'' => ("'", "''"),
+                    '"' => ("\"", "\"\""),
+                    _ => unreachable!(),
+                };
+                Some(Self::Token(Token::String(
+                    s.replace(double_quote, single_quote),
+                )))
             }
-            _ => Some(input.pop_front().unwrap()),
-        },
-        Token::String(_) => {
-            let mut i = 0;
-            while matches!(input.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
-                && matches!(input.get(i * 2 + 2)?, Token::String(_))
-            {
-                i += 1;
+            Segment::HexString => {
+                // Strip `X"` prefix and `"` suffix (or variations).
+                let s = &s[2..s.len() - 1];
+                for c in s.chars() {
+                    if !c.is_ascii_hexdigit() {
+                        return Some(Self::Error(ScanError::BadHexDigit(c)));
+                    }
+                }
+                if s.len() % 2 != 0 {
+                    return Some(Self::Error(ScanError::OddLengthHexString(s.len())));
+                }
+                let mut out = String::with_capacity(s.len());
+                for pair in s.as_bytes().chunks_exact(2) {
+                    let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
+                    let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
+                    out.push(char::from(hi * 16 + lo));
+                }
+                Some(Self::Token(Token::String(out)))
+            }
+            Segment::UnicodeString => {
+                // Strip `U"` prefix and `"` suffix (or variations).
+                let s = &s[2..s.len() - 1];
+                if !(1..=8).contains(&s.len()) {
+                    return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len())));
+                }
+                let Ok(code_point) = u32::from_str_radix(s, 16) else {
+                    return Some(Self::Error(ScanError::ExpectedCodePoint));
+                };
+                let Some(c) = char::from_u32(code_point) else {
+                    return Some(Self::Error(ScanError::BadCodePoint(code_point)));
+                };
+                Some(Self::Token(Token::String(String::from(c))))
             }
-            if i == 0 {
-                Some(input.pop_front().unwrap())
-            } else {
-                let mut output = String::new();
-                for i in 0..=i {
-                    let Token::String(s) = &input[i * 2] else {
-                        unreachable!()
-                    };
-                    output.push_str(&s);
+
+            Segment::UnquotedString
+            | Segment::DoRepeatCommand
+            | Segment::InlineData
+            | Segment::Document
+            | Segment::MacroBody
+            | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))),
+
+            Segment::ReservedWord => {
+                let c0 = s.as_bytes()[0].to_ascii_uppercase();
+                let c1 = s.as_bytes()[1].to_ascii_uppercase();
+                match (c0, c1) {
+                    (b'B', _) => Some(Self::Token(Token::Punct(Punct::By))),
+                    (b'E', _) => Some(Self::Token(Token::Punct(Punct::Eq))),
+                    (b'G', b'T') => Some(Self::Token(Token::Punct(Punct::Gt))),
+                    (b'G', _) => Some(Self::Token(Token::Punct(Punct::Ge))),
+                    (b'L', b'T') => Some(Self::Token(Token::Punct(Punct::Lt))),
+                    (b'L', _) => Some(Self::Token(Token::Punct(Punct::Le))),
+                    (b'N', b'E') => Some(Self::Token(Token::Punct(Punct::Ne))),
+                    (b'N', _) => Some(Self::Token(Token::Punct(Punct::Not))),
+                    (b'O', _) => Some(Self::Token(Token::Punct(Punct::Or))),
+                    (b'T', _) => Some(Self::Token(Token::Punct(Punct::To))),
+                    (b'A', b'L') => Some(Self::Token(Token::Punct(Punct::All))),
+                    (b'A', _) => Some(Self::Token(Token::Punct(Punct::And))),
+                    (b'W', _) => Some(Self::Token(Token::Punct(Punct::With))),
+                    _ => unreachable!(),
                 }
-                for _ in 0..i * 2 + 1 {
+            }
+            Segment::Identifier => Some(Self::Token(Token::Id(String::from(s)))),
+            Segment::Punct => match s {
+                "(" => Some(Self::Token(Token::Punct(Punct::LParen))),
+                ")" => Some(Self::Token(Token::Punct(Punct::RParen))),
+                "[" => Some(Self::Token(Token::Punct(Punct::LSquare))),
+                "]" => Some(Self::Token(Token::Punct(Punct::RSquare))),
+                "{" => Some(Self::Token(Token::Punct(Punct::LCurly))),
+                "}" => Some(Self::Token(Token::Punct(Punct::RCurly))),
+                "," => Some(Self::Token(Token::Punct(Punct::Comma))),
+                "=" => Some(Self::Token(Token::Punct(Punct::Equals))),
+                "-" => Some(Self::Token(Token::Punct(Punct::Dash))),
+                "&" => Some(Self::Token(Token::Punct(Punct::And))),
+                "|" => Some(Self::Token(Token::Punct(Punct::Or))),
+                "+" => Some(Self::Token(Token::Punct(Punct::Plus))),
+                "/" => Some(Self::Token(Token::Punct(Punct::Slash))),
+                "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))),
+                "<" => Some(Self::Token(Token::Punct(Punct::Lt))),
+                ">" => Some(Self::Token(Token::Punct(Punct::Gt))),
+                "~" => Some(Self::Token(Token::Punct(Punct::Not))),
+                ":" => Some(Self::Token(Token::Punct(Punct::Colon))),
+                ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))),
+                "**" => Some(Self::Token(Token::Punct(Punct::Exp))),
+                "<=" => Some(Self::Token(Token::Punct(Punct::Le))),
+                "<>" => Some(Self::Token(Token::Punct(Punct::Ne))),
+                "~=" => Some(Self::Token(Token::Punct(Punct::Ne))),
+                ">=" => Some(Self::Token(Token::Punct(Punct::Ge))),
+                "!" => Some(Self::Token(Token::MacroToken(MacroToken::Bang))),
+                "%" => Some(Self::Token(Token::MacroToken(MacroToken::Percent))),
+                "?" => Some(Self::Token(Token::MacroToken(MacroToken::Question))),
+                "`" => Some(Self::Token(Token::MacroToken(MacroToken::Backtick))),
+                "_" => Some(Self::Token(Token::MacroToken(MacroToken::Underscore))),
+                "." => Some(Self::Token(Token::MacroToken(MacroToken::Dot))),
+                _ => unreachable!("bad punctuator {s:?}"),
+            },
+            Segment::Shbang
+            | Segment::Spaces
+            | Segment::Comment
+            | Segment::Newline
+            | Segment::CommentCommand => None,
+            Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
+            Segment::MacroId => Some(Self::Token(Token::MacroToken(MacroToken::MacroId(
+                String::from(s),
+            )))),
+            Segment::StartDocument => Some(Self::Token(Token::Id(String::from("DOCUMENT")))),
+            Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
+                Some(Self::Token(Token::EndCommand))
+            }
+            Segment::End => Some(Self::Token(Token::End)),
+            Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)),
+            Segment::ExpectedExponent => {
+                Some(Self::Error(ScanError::ExpectedExponent(String::from(s))))
+            }
+            Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar(
+                s.chars().next().unwrap(),
+            ))),
+        }
+    }
+
+    /// Attempts to merge a sequence of tokens together into a single token. The
+    /// tokens are taken from the beginning of `input`. If successful, removes one
+    /// or more tokens from the beginning of `input` and returns the merged
+    /// token. More input tokens might be needed; if so, leaves `input` alone and
+    /// returns `None`. In the latter case, the caller should add more tokens to the
+    /// input ([Token::End] or [Token::EndCommand] is always sufficient).
+    ///
+    /// This performs two different kinds of token merging:
+    ///
+    ///   - String concatenation, where syntax like `"a" + "b"` is converted into a
+    ///     single string token.  This is definitely needed because the parser relies
+    ///     on it.
+    ///
+    ///   - Negative number merging, where syntax like `-5` is converted from a pair
+    ///     of tokens (a dash and a positive number) into a single token (a negative
+    ///     number).  This might not be needed anymore because the segmenter
+    ///     directly treats a dash followed by a number, with optional intervening
+    ///     white space, as a negative number.  It's only needed if we want
+    ///     intervening comments to be allowed or for part of the negative number
+    ///     token to be produced by macro expansion.
+    pub fn merge(input: &mut VecDeque<ScanToken>) -> Option<ScanToken> {
+        match input.get(0)? {
+            ScanToken::Token(Token::Punct(Punct::Dash)) => match input.get(1)? {
+                ScanToken::Token(Token::Number(number)) if number.is_sign_positive() => {
+                    let number = *number;
+                    input.pop_front().unwrap();
                     input.pop_front().unwrap();
+                    return Some(ScanToken::Token(Token::Number(-number)));
+                }
+                _ => Some(input.pop_front().unwrap()),
+            },
+            ScanToken::Token(Token::String(_)) => {
+                let mut i = 0;
+                while matches!(
+                    input.get(i * 2 + 1)?,
+                    ScanToken::Token(Token::Punct(Punct::Plus))
+                ) && matches!(input.get(i * 2 + 2)?, ScanToken::Token(Token::String(_)))
+                {
+                    i += 1;
+                }
+                if i == 0 {
+                    Some(input.pop_front().unwrap())
+                } else {
+                    let mut output = String::new();
+                    for i in 0..=i {
+                        let ScanToken::Token(Token::String(s)) = &input[i * 2] else {
+                            unreachable!()
+                        };
+                        output.push_str(&s);
+                    }
+                    for _ in 0..i * 2 + 1 {
+                        input.pop_front().unwrap();
+                    }
+                    Some(ScanToken::Token(Token::String(output)))
                 }
-                Some(Token::String(output))
             }
+            _ => Some(input.pop_front().unwrap()),
         }
-        _ => Some(input.pop_front().unwrap()),
     }
 }
 
 pub struct StringLexer<'a> {
     input: &'a str,
     segmenter: Segmenter,
-    tokens: VecDeque<Token>,
+    tokens: VecDeque<ScanToken>,
 }
 
 impl<'a> StringLexer<'a> {
@@ -91,11 +279,11 @@ impl<'a> StringLexer<'a> {
 }
 
 impl<'a> Iterator for StringLexer<'a> {
-    type Item = Result<Token, TokenError>;
+    type Item = ScanToken;
 
     fn next(&mut self) -> Option<Self::Item> {
-        if let Some(token) = merge_tokens(&mut self.tokens) {
-            return Some(Ok(token));
+        if let Some(token) = ScanToken::merge(&mut self.tokens) {
+            return Some(token);
         }
         loop {
             let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
@@ -104,19 +292,13 @@ impl<'a> Iterator for StringLexer<'a> {
             }
             let s = &self.input[..self.input.len() - rest.len()];
             self.input = rest;
-            match Token::try_from_segment(s, segment) {
-                Err(error) => {
-                    println!("{:?}", &self.tokens);
-                    return Some(Err(error));
-                }
-                Ok(Some(token)) => {
-                    self.tokens.push_back(token);
-                    if let Some(token) = merge_tokens(&mut self.tokens) {
-                        return Some(Ok(token));
-                    }
+
+            if let Some(token) = ScanToken::from_segment(s, segment) {
+                self.tokens.push_back(token);
+                if let Some(token) = ScanToken::merge(&mut self.tokens) {
+                    return Some(token);
                 }
-                Ok(None) => (),
-            };
+            }
         }
     }
 }
index 1b84e5ffbe4786e3cd24ccdce1b3ae32efaa29fc..d009131f63c2741f0b7ec0c974afb66d8aead8e1 100644 (file)
@@ -1,9 +1,9 @@
 use crate::lex::{
     segment::Mode,
-    token::{MacroToken, Punct, Token, TokenError},
+    token::{MacroToken, Punct, Token},
 };
 
-use super::StringLexer;
+use super::{ScanError, ScanToken, StringLexer};
 
 fn print_token(token: &Token) {
     match token {
@@ -13,22 +13,25 @@ fn print_token(token: &Token) {
         Token::String(s) => print!("Token::String(String::from({s:?}))"),
         Token::EndCommand => print!("Token::EndCommand"),
         Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
+        Token::MacroToken(MacroToken::MacroId(id)) => {
+            print!("Token::MacroToken(MacroToken::MacroId(String::from({id:?})))")
+        }
         Token::MacroToken(m) => print!("Token::MacroToken(MacroToken::{m:?})"),
     }
 }
 
-fn check_scan(input: &str, expected: &[Result<Token, TokenError>]) {
-    let tokens = StringLexer::new(input, Mode::Auto, false).collect::<Vec<_>>();
+fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) {
+    let tokens = StringLexer::new(input, mode, false).collect::<Vec<_>>();
 
     if &tokens != expected {
         for token in &tokens {
             match token {
-                Ok(token) => {
-                    print!("Ok(");
+                ScanToken::Token(token) => {
+                    print!("ScanToken::Token(");
                     print_token(token);
                     print!(")");
                 }
-                Err(error) => print!("Err(TokenError::{error:?})"),
+                ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"),
             }
             println!(",");
         }
@@ -56,36 +59,37 @@ QrStUv./* end of line comment */
 WXYZ. /* unterminated end of line comment
 �. /* U+FFFD is not valid in an identifier
 "#,
+        Mode::Auto,
         &[
-            Ok(Token::Id(String::from("a"))),
-            Ok(Token::Id(String::from("aB"))),
-            Ok(Token::Id(String::from("i5"))),
-            Ok(Token::Id(String::from("$x"))),
-            Ok(Token::Id(String::from("@efg"))),
-            Ok(Token::Id(String::from("@@."))),
-            Ok(Token::MacroToken(MacroToken::MacroId(String::from(
+            ScanToken::Token(Token::Id(String::from("a"))),
+            ScanToken::Token(Token::Id(String::from("aB"))),
+            ScanToken::Token(Token::Id(String::from("i5"))),
+            ScanToken::Token(Token::Id(String::from("$x"))),
+            ScanToken::Token(Token::Id(String::from("@efg"))),
+            ScanToken::Token(Token::Id(String::from("@@."))),
+            ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
                 "!abcd",
             )))),
-            Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
-            Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
-            Ok(Token::Id(String::from("a"))),
-            Ok(Token::Id(String::from("#.#"))),
-            Ok(Token::MacroToken(MacroToken::Dot)),
-            Ok(Token::Id(String::from("x"))),
-            Ok(Token::MacroToken(MacroToken::Underscore)),
-            Ok(Token::Id(String::from("z"))),
-            Ok(Token::EndCommand),
-            Ok(Token::Id(String::from("abcd."))),
-            Ok(Token::Id(String::from("abcd"))),
-            Ok(Token::EndCommand),
-            Ok(Token::Id(String::from("QRSTUV"))),
-            Ok(Token::EndCommand),
-            Ok(Token::Id(String::from("QrStUv"))),
-            Ok(Token::EndCommand),
-            Ok(Token::Id(String::from("WXYZ"))),
-            Ok(Token::EndCommand),
-            Err(TokenError::UnexpectedChar('�')),
-            Ok(Token::EndCommand),
+            ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
+            ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
+            ScanToken::Token(Token::Id(String::from("a"))),
+            ScanToken::Token(Token::Id(String::from("#.#"))),
+            ScanToken::Token(Token::MacroToken(MacroToken::Dot)),
+            ScanToken::Token(Token::Id(String::from("x"))),
+            ScanToken::Token(Token::MacroToken(MacroToken::Underscore)),
+            ScanToken::Token(Token::Id(String::from("z"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("abcd."))),
+            ScanToken::Token(Token::Id(String::from("abcd"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("QRSTUV"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("QrStUv"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("WXYZ"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Error(ScanError::UnexpectedChar('�')),
+            ScanToken::Token(Token::EndCommand),
         ],
     );
 }
@@ -98,49 +102,50 @@ AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
 andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
 and. with.
 "#,
+        Mode::Auto,
         &[
-            Ok(Token::Punct(Punct::And)),
-            Ok(Token::Punct(Punct::Or)),
-            Ok(Token::Punct(Punct::Not)),
-            Ok(Token::Punct(Punct::Eq)),
-            Ok(Token::Punct(Punct::Ge)),
-            Ok(Token::Punct(Punct::Gt)),
-            Ok(Token::Punct(Punct::Le)),
-            Ok(Token::Punct(Punct::Lt)),
-            Ok(Token::Punct(Punct::Ne)),
-            Ok(Token::Punct(Punct::All)),
-            Ok(Token::Punct(Punct::By)),
-            Ok(Token::Punct(Punct::To)),
-            Ok(Token::Punct(Punct::With)),
-            Ok(Token::Punct(Punct::And)),
-            Ok(Token::Punct(Punct::Or)),
-            Ok(Token::Punct(Punct::Not)),
-            Ok(Token::Punct(Punct::Eq)),
-            Ok(Token::Punct(Punct::Ge)),
-            Ok(Token::Punct(Punct::Gt)),
-            Ok(Token::Punct(Punct::Le)),
-            Ok(Token::Punct(Punct::Lt)),
-            Ok(Token::Punct(Punct::Ne)),
-            Ok(Token::Punct(Punct::All)),
-            Ok(Token::Punct(Punct::By)),
-            Ok(Token::Punct(Punct::To)),
-            Ok(Token::Punct(Punct::With)),
-            Ok(Token::Id(String::from("andx"))),
-            Ok(Token::Id(String::from("orx"))),
-            Ok(Token::Id(String::from("notx"))),
-            Ok(Token::Id(String::from("eqx"))),
-            Ok(Token::Id(String::from("gex"))),
-            Ok(Token::Id(String::from("gtx"))),
-            Ok(Token::Id(String::from("lex"))),
-            Ok(Token::Id(String::from("ltx"))),
-            Ok(Token::Id(String::from("nex"))),
-            Ok(Token::Id(String::from("allx"))),
-            Ok(Token::Id(String::from("byx"))),
-            Ok(Token::Id(String::from("tox"))),
-            Ok(Token::Id(String::from("withx"))),
-            Ok(Token::Id(String::from("and."))),
-            Ok(Token::Punct(Punct::With)),
-            Ok(Token::EndCommand),
+            ScanToken::Token(Token::Punct(Punct::And)),
+            ScanToken::Token(Token::Punct(Punct::Or)),
+            ScanToken::Token(Token::Punct(Punct::Not)),
+            ScanToken::Token(Token::Punct(Punct::Eq)),
+            ScanToken::Token(Token::Punct(Punct::Ge)),
+            ScanToken::Token(Token::Punct(Punct::Gt)),
+            ScanToken::Token(Token::Punct(Punct::Le)),
+            ScanToken::Token(Token::Punct(Punct::Lt)),
+            ScanToken::Token(Token::Punct(Punct::Ne)),
+            ScanToken::Token(Token::Punct(Punct::All)),
+            ScanToken::Token(Token::Punct(Punct::By)),
+            ScanToken::Token(Token::Punct(Punct::To)),
+            ScanToken::Token(Token::Punct(Punct::With)),
+            ScanToken::Token(Token::Punct(Punct::And)),
+            ScanToken::Token(Token::Punct(Punct::Or)),
+            ScanToken::Token(Token::Punct(Punct::Not)),
+            ScanToken::Token(Token::Punct(Punct::Eq)),
+            ScanToken::Token(Token::Punct(Punct::Ge)),
+            ScanToken::Token(Token::Punct(Punct::Gt)),
+            ScanToken::Token(Token::Punct(Punct::Le)),
+            ScanToken::Token(Token::Punct(Punct::Lt)),
+            ScanToken::Token(Token::Punct(Punct::Ne)),
+            ScanToken::Token(Token::Punct(Punct::All)),
+            ScanToken::Token(Token::Punct(Punct::By)),
+            ScanToken::Token(Token::Punct(Punct::To)),
+            ScanToken::Token(Token::Punct(Punct::With)),
+            ScanToken::Token(Token::Id(String::from("andx"))),
+            ScanToken::Token(Token::Id(String::from("orx"))),
+            ScanToken::Token(Token::Id(String::from("notx"))),
+            ScanToken::Token(Token::Id(String::from("eqx"))),
+            ScanToken::Token(Token::Id(String::from("gex"))),
+            ScanToken::Token(Token::Id(String::from("gtx"))),
+            ScanToken::Token(Token::Id(String::from("lex"))),
+            ScanToken::Token(Token::Id(String::from("ltx"))),
+            ScanToken::Token(Token::Id(String::from("nex"))),
+            ScanToken::Token(Token::Id(String::from("allx"))),
+            ScanToken::Token(Token::Id(String::from("byx"))),
+            ScanToken::Token(Token::Id(String::from("tox"))),
+            ScanToken::Token(Token::Id(String::from("withx"))),
+            ScanToken::Token(Token::Id(String::from("and."))),
+            ScanToken::Token(Token::Punct(Punct::With)),
+            ScanToken::Token(Token::EndCommand),
         ],
     );
 }
@@ -152,56 +157,57 @@ fn test_punctuation() {
 ~&|=>=><=<~=<>(),-+*/[]**
 % : ; ? _ ` { } ~
 "#,
+        Mode::Auto,
         &[
-            Ok(Token::Punct(Punct::Not)),
-            Ok(Token::Punct(Punct::And)),
-            Ok(Token::Punct(Punct::Or)),
-            Ok(Token::Punct(Punct::Equals)),
-            Ok(Token::Punct(Punct::Ge)),
-            Ok(Token::Punct(Punct::Gt)),
-            Ok(Token::Punct(Punct::Le)),
-            Ok(Token::Punct(Punct::Lt)),
-            Ok(Token::Punct(Punct::Ne)),
-            Ok(Token::Punct(Punct::Ne)),
-            Ok(Token::Punct(Punct::LParen)),
-            Ok(Token::Punct(Punct::RParen)),
-            Ok(Token::Punct(Punct::Comma)),
-            Ok(Token::Punct(Punct::Dash)),
-            Ok(Token::Punct(Punct::Plus)),
-            Ok(Token::Punct(Punct::Asterisk)),
-            Ok(Token::Punct(Punct::Slash)),
-            Ok(Token::Punct(Punct::LSquare)),
-            Ok(Token::Punct(Punct::RSquare)),
-            Ok(Token::Punct(Punct::Exp)),
-            Ok(Token::Punct(Punct::Not)),
-            Ok(Token::Punct(Punct::And)),
-            Ok(Token::Punct(Punct::Or)),
-            Ok(Token::Punct(Punct::Equals)),
-            Ok(Token::Punct(Punct::Ge)),
-            Ok(Token::Punct(Punct::Gt)),
-            Ok(Token::Punct(Punct::Le)),
-            Ok(Token::Punct(Punct::Lt)),
-            Ok(Token::Punct(Punct::Ne)),
-            Ok(Token::Punct(Punct::Ne)),
-            Ok(Token::Punct(Punct::LParen)),
-            Ok(Token::Punct(Punct::RParen)),
-            Ok(Token::Punct(Punct::Comma)),
-            Ok(Token::Punct(Punct::Dash)),
-            Ok(Token::Punct(Punct::Plus)),
-            Ok(Token::Punct(Punct::Asterisk)),
-            Ok(Token::Punct(Punct::Slash)),
-            Ok(Token::Punct(Punct::LSquare)),
-            Ok(Token::Punct(Punct::RSquare)),
-            Ok(Token::Punct(Punct::Exp)),
-            Ok(Token::MacroToken(MacroToken::Percent)),
-            Ok(Token::Punct(Punct::Colon)),
-            Ok(Token::Punct(Punct::Semicolon)),
-            Ok(Token::MacroToken(MacroToken::Question)),
-            Ok(Token::MacroToken(MacroToken::Underscore)),
-            Ok(Token::MacroToken(MacroToken::Backtick)),
-            Ok(Token::Punct(Punct::LCurly)),
-            Ok(Token::Punct(Punct::RCurly)),
-            Ok(Token::Punct(Punct::Not)),
+            ScanToken::Token(Token::Punct(Punct::Not)),
+            ScanToken::Token(Token::Punct(Punct::And)),
+            ScanToken::Token(Token::Punct(Punct::Or)),
+            ScanToken::Token(Token::Punct(Punct::Equals)),
+            ScanToken::Token(Token::Punct(Punct::Ge)),
+            ScanToken::Token(Token::Punct(Punct::Gt)),
+            ScanToken::Token(Token::Punct(Punct::Le)),
+            ScanToken::Token(Token::Punct(Punct::Lt)),
+            ScanToken::Token(Token::Punct(Punct::Ne)),
+            ScanToken::Token(Token::Punct(Punct::Ne)),
+            ScanToken::Token(Token::Punct(Punct::LParen)),
+            ScanToken::Token(Token::Punct(Punct::RParen)),
+            ScanToken::Token(Token::Punct(Punct::Comma)),
+            ScanToken::Token(Token::Punct(Punct::Dash)),
+            ScanToken::Token(Token::Punct(Punct::Plus)),
+            ScanToken::Token(Token::Punct(Punct::Asterisk)),
+            ScanToken::Token(Token::Punct(Punct::Slash)),
+            ScanToken::Token(Token::Punct(Punct::LSquare)),
+            ScanToken::Token(Token::Punct(Punct::RSquare)),
+            ScanToken::Token(Token::Punct(Punct::Exp)),
+            ScanToken::Token(Token::Punct(Punct::Not)),
+            ScanToken::Token(Token::Punct(Punct::And)),
+            ScanToken::Token(Token::Punct(Punct::Or)),
+            ScanToken::Token(Token::Punct(Punct::Equals)),
+            ScanToken::Token(Token::Punct(Punct::Ge)),
+            ScanToken::Token(Token::Punct(Punct::Gt)),
+            ScanToken::Token(Token::Punct(Punct::Le)),
+            ScanToken::Token(Token::Punct(Punct::Lt)),
+            ScanToken::Token(Token::Punct(Punct::Ne)),
+            ScanToken::Token(Token::Punct(Punct::Ne)),
+            ScanToken::Token(Token::Punct(Punct::LParen)),
+            ScanToken::Token(Token::Punct(Punct::RParen)),
+            ScanToken::Token(Token::Punct(Punct::Comma)),
+            ScanToken::Token(Token::Punct(Punct::Dash)),
+            ScanToken::Token(Token::Punct(Punct::Plus)),
+            ScanToken::Token(Token::Punct(Punct::Asterisk)),
+            ScanToken::Token(Token::Punct(Punct::Slash)),
+            ScanToken::Token(Token::Punct(Punct::LSquare)),
+            ScanToken::Token(Token::Punct(Punct::RSquare)),
+            ScanToken::Token(Token::Punct(Punct::Exp)),
+            ScanToken::Token(Token::MacroToken(MacroToken::Percent)),
+            ScanToken::Token(Token::Punct(Punct::Colon)),
+            ScanToken::Token(Token::Punct(Punct::Semicolon)),
+            ScanToken::Token(Token::MacroToken(MacroToken::Question)),
+            ScanToken::Token(Token::MacroToken(MacroToken::Underscore)),
+            ScanToken::Token(Token::MacroToken(MacroToken::Backtick)),
+            ScanToken::Token(Token::Punct(Punct::LCurly)),
+            ScanToken::Token(Token::Punct(Punct::RCurly)),
+            ScanToken::Token(Token::Punct(Punct::Not)),
         ],
     );
 }
@@ -217,41 +223,42 @@ fn test_positive_numbers() {
 1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
 . 1e e1 1e+ 1e-
 "#,
+        Mode::Auto,
         &[
-            Ok(Token::Number(0.0)),
-            Ok(Token::Number(1.0)),
-            Ok(Token::Number(1.0)),
-            Ok(Token::Number(1.0)),
-            Ok(Token::Number(1.0)),
-            Ok(Token::EndCommand),
-            Ok(Token::Number(123.0)),
-            Ok(Token::EndCommand),
-            Ok(Token::EndCommand),
-            Ok(Token::Number(1.0)),
-            Ok(Token::Number(0.1)),
-            Ok(Token::Number(0.1)),
-            Ok(Token::Number(0.1)),
-            Ok(Token::Number(50.0)),
-            Ok(Token::Number(0.6)),
-            Ok(Token::Number(70.0)),
-            Ok(Token::Number(60.0)),
-            Ok(Token::Number(0.006)),
-            Ok(Token::EndCommand),
-            Ok(Token::Number(30.0)),
-            Ok(Token::Number(0.04)),
-            Ok(Token::Number(5.0)),
-            Ok(Token::Number(6.0)),
-            Ok(Token::Number(0.0007)),
-            Ok(Token::Number(12.3)),
-            Ok(Token::Number(4.56)),
-            Ok(Token::Number(789.0)),
-            Ok(Token::Number(999.0)),
-            Ok(Token::Number(0.0112)),
-            Ok(Token::EndCommand),
-            Err(TokenError::ExpectedExponent(String::from("1e"))),
-            Ok(Token::Id(String::from("e1"))),
-            Err(TokenError::ExpectedExponent(String::from("1e+"))),
-            Err(TokenError::ExpectedExponent(String::from("1e-"))),
+            ScanToken::Token(Token::Number(0.0)),
+            ScanToken::Token(Token::Number(1.0)),
+            ScanToken::Token(Token::Number(1.0)),
+            ScanToken::Token(Token::Number(1.0)),
+            ScanToken::Token(Token::Number(1.0)),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Number(123.0)),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Number(1.0)),
+            ScanToken::Token(Token::Number(0.1)),
+            ScanToken::Token(Token::Number(0.1)),
+            ScanToken::Token(Token::Number(0.1)),
+            ScanToken::Token(Token::Number(50.0)),
+            ScanToken::Token(Token::Number(0.6)),
+            ScanToken::Token(Token::Number(70.0)),
+            ScanToken::Token(Token::Number(60.0)),
+            ScanToken::Token(Token::Number(0.006)),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Number(30.0)),
+            ScanToken::Token(Token::Number(0.04)),
+            ScanToken::Token(Token::Number(5.0)),
+            ScanToken::Token(Token::Number(6.0)),
+            ScanToken::Token(Token::Number(0.0007)),
+            ScanToken::Token(Token::Number(12.3)),
+            ScanToken::Token(Token::Number(4.56)),
+            ScanToken::Token(Token::Number(789.0)),
+            ScanToken::Token(Token::Number(999.0)),
+            ScanToken::Token(Token::Number(0.0112)),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))),
+            ScanToken::Token(Token::Id(String::from("e1"))),
+            ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))),
+            ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))),
         ],
     );
 }
@@ -268,52 +275,53 @@ fn test_negative_numbers() {
  -/**/1
  -. -1e -e1 -1e+ -1e- -1.
 "#,
+        Mode::Auto,
         &[
-            Ok(Token::Number(-0.0)),
-            Ok(Token::Number(-1.0)),
-            Ok(Token::Number(-1.0)),
-            Ok(Token::Number(-1.0)),
-            Ok(Token::Number(-1.0)),
-            Ok(Token::EndCommand),
-            Ok(Token::Number(-123.0)),
-            Ok(Token::EndCommand),
-            Ok(Token::Number(-0.1)),
-            Ok(Token::Number(-0.1)),
-            Ok(Token::Number(-0.1)),
-            Ok(Token::Number(-0.1)),
-            Ok(Token::Number(-50.0)),
-            Ok(Token::Number(-0.6)),
-            Ok(Token::Number(-70.0)),
-            Ok(Token::Number(-60.0)),
-            Ok(Token::Number(-0.006)),
-            Ok(Token::Number(-3.0)),
-            Ok(Token::Number(-0.04)),
-            Ok(Token::Number(-5.0)),
-            Ok(Token::Number(-6.0)),
-            Ok(Token::Number(-0.0007)),
-            Ok(Token::Number(-12.3)),
-            Ok(Token::Number(-4.56)),
-            Ok(Token::Number(-789.0)),
-            Ok(Token::Number(-999.0)),
-            Ok(Token::Number(-0.0112)),
-            Ok(Token::Number(-1.0)),
-            Ok(Token::Punct(Punct::Dash)),
-            Ok(Token::MacroToken(MacroToken::Dot)),
-            Err(TokenError::ExpectedExponent(String::from("-1e"))),
-            Ok(Token::Punct(Punct::Dash)),
-            Ok(Token::Id(String::from("e1"))),
-            Err(TokenError::ExpectedExponent(String::from("-1e+"))),
-            Err(TokenError::ExpectedExponent(String::from("-1e-"))),
-            Ok(Token::Number(-1.0)),
-            Ok(Token::EndCommand),
+            ScanToken::Token(Token::Number(-0.0)),
+            ScanToken::Token(Token::Number(-1.0)),
+            ScanToken::Token(Token::Number(-1.0)),
+            ScanToken::Token(Token::Number(-1.0)),
+            ScanToken::Token(Token::Number(-1.0)),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Number(-123.0)),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Number(-0.1)),
+            ScanToken::Token(Token::Number(-0.1)),
+            ScanToken::Token(Token::Number(-0.1)),
+            ScanToken::Token(Token::Number(-0.1)),
+            ScanToken::Token(Token::Number(-50.0)),
+            ScanToken::Token(Token::Number(-0.6)),
+            ScanToken::Token(Token::Number(-70.0)),
+            ScanToken::Token(Token::Number(-60.0)),
+            ScanToken::Token(Token::Number(-0.006)),
+            ScanToken::Token(Token::Number(-3.0)),
+            ScanToken::Token(Token::Number(-0.04)),
+            ScanToken::Token(Token::Number(-5.0)),
+            ScanToken::Token(Token::Number(-6.0)),
+            ScanToken::Token(Token::Number(-0.0007)),
+            ScanToken::Token(Token::Number(-12.3)),
+            ScanToken::Token(Token::Number(-4.56)),
+            ScanToken::Token(Token::Number(-789.0)),
+            ScanToken::Token(Token::Number(-999.0)),
+            ScanToken::Token(Token::Number(-0.0112)),
+            ScanToken::Token(Token::Number(-1.0)),
+            ScanToken::Token(Token::Punct(Punct::Dash)),
+            ScanToken::Token(Token::MacroToken(MacroToken::Dot)),
+            ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))),
+            ScanToken::Token(Token::Punct(Punct::Dash)),
+            ScanToken::Token(Token::Id(String::from("e1"))),
+            ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))),
+            ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))),
+            ScanToken::Token(Token::Number(-1.0)),
+            ScanToken::Token(Token::EndCommand),
         ],
     );
 }
 
-
 #[test]
 fn test_strings() {
-    check_scan(r#"'x' "y" 'abc'
+    check_scan(
+        r#"'x' "y" 'abc'
 'Don''t' "Can't" 'Won''t'
 """quoted""" '"quoted"'
 '' "" '''' """"
@@ -342,12 +350,692 @@ x"4142"
 +u'304a'
 "�あいうえお"
 "abc"+U"FFFD"+u'3048'+"xyz"
-"#, &[]);
+"#,
+        Mode::Auto,
+        &[
+            ScanToken::Token(Token::String(String::from("x"))),
+            ScanToken::Token(Token::String(String::from("y"))),
+            ScanToken::Token(Token::String(String::from("abc"))),
+            ScanToken::Token(Token::String(String::from("Don't"))),
+            ScanToken::Token(Token::String(String::from("Can't"))),
+            ScanToken::Token(Token::String(String::from("Won't"))),
+            ScanToken::Token(Token::String(String::from("\"quoted\""))),
+            ScanToken::Token(Token::String(String::from("\"quoted\""))),
+            ScanToken::Token(Token::String(String::from(""))),
+            ScanToken::Token(Token::String(String::from(""))),
+            ScanToken::Token(Token::String(String::from("'"))),
+            ScanToken::Token(Token::String(String::from("\""))),
+            ScanToken::Error(ScanError::ExpectedQuote),
+            ScanToken::Error(ScanError::ExpectedQuote),
+            ScanToken::Token(Token::String(String::from("xyzabcde"))),
+            ScanToken::Token(Token::String(String::from("foobar"))),
+            ScanToken::Token(Token::String(String::from("foobar"))),
+            ScanToken::Token(Token::String(String::from("foo"))),
+            ScanToken::Token(Token::Punct(Punct::Plus)),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::String(String::from("bar"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Punct(Punct::Plus)),
+            ScanToken::Token(Token::String(String::from("AB5152"))),
+            ScanToken::Token(Token::String(String::from("4142QR"))),
+            ScanToken::Token(Token::String(String::from("ABお"))),
+            ScanToken::Token(Token::String(String::from("�あいうえお"))),
+            ScanToken::Token(Token::String(String::from("abc�えxyz"))),
+            ScanToken::Token(Token::End),
+        ],
+    );
+}
+
+#[test]
+fn test_shbang() {
+    check_scan(
+        r#"#! /usr/bin/pspp
+#! /usr/bin/pspp
+"#,
+        Mode::Auto,
+        &[
+            ScanToken::Token(Token::Id(String::from("#"))),
+            ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!")))),
+            ScanToken::Token(Token::Punct(Punct::Slash)),
+            ScanToken::Token(Token::Id(String::from("usr"))),
+            ScanToken::Token(Token::Punct(Punct::Slash)),
+            ScanToken::Token(Token::Id(String::from("bin"))),
+            ScanToken::Token(Token::Punct(Punct::Slash)),
+            ScanToken::Token(Token::Id(String::from("pspp"))),
+        ],
+    );
+}
+
+#[test]
+fn test_comments() {
+    check_scan(
+        r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+   * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+        Mode::Auto,
+        &[
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("com"))),
+            ScanToken::Token(Token::Id(String::from("is"))),
+            ScanToken::Token(Token::Id(String::from("ambiguous"))),
+            ScanToken::Token(Token::Punct(Punct::With)),
+            ScanToken::Token(Token::Id(String::from("COMPUTE"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("next"))),
+            ScanToken::Token(Token::Id(String::from("command"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+        ],
+    );
+}
+
+#[test]
+fn test_document() {
+    check_scan(
+        r#"DOCUMENT one line.
+DOC more
+    than
+        one
+            line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+"#,
+        Mode::Auto,
+        &[
+            ScanToken::Token(Token::Id(String::from("DOCUMENT"))),
+            ScanToken::Token(Token::String(String::from("DOCUMENT one line."))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("DOCUMENT"))),
+            ScanToken::Token(Token::String(String::from("DOC more"))),
+            ScanToken::Token(Token::String(String::from("    than"))),
+            ScanToken::Token(Token::String(String::from("        one"))),
+            ScanToken::Token(Token::String(String::from("            line."))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("DOCUMENT"))),
+            ScanToken::Token(Token::String(String::from("docu"))),
+            ScanToken::Token(Token::String(String::from("first.paragraph"))),
+            ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))),
+            ScanToken::Token(Token::String(String::from(""))),
+            ScanToken::Token(Token::String(String::from("second paragraph."))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+        ],
+    );
+}
+
+#[test]
+fn test_file_label() {
+    check_scan(
+        r#"FIL label isn't quoted.
+FILE
+  lab 'is quoted'.
+FILE /*
+/**/  lab not quoted here either
+
+"#,
+        Mode::Auto,
+        &[
+            ScanToken::Token(Token::Id(String::from("FIL"))),
+            ScanToken::Token(Token::Id(String::from("label"))),
+            ScanToken::Token(Token::String(String::from("isn't quoted"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("FILE"))),
+            ScanToken::Token(Token::Id(String::from("lab"))),
+            ScanToken::Token(Token::String(String::from("is quoted"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("FILE"))),
+            ScanToken::Token(Token::Id(String::from("lab"))),
+            ScanToken::Token(Token::String(String::from("not quoted here either"))),
+            ScanToken::Token(Token::EndCommand),
+        ],
+    );
+}
+
+#[test]
+fn test_begin_data() {
+    check_scan(
+        r#"begin data.
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end  data
+end data
+.
+"#,
+        Mode::Auto,
+        &[
+            ScanToken::Token(Token::Id(String::from("begin"))),
+            ScanToken::Token(Token::Id(String::from("data"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::String(String::from("123"))),
+            ScanToken::Token(Token::String(String::from("xxx"))),
+            ScanToken::Token(Token::Id(String::from("end"))),
+            ScanToken::Token(Token::Id(String::from("data"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("BEG"))),
+            ScanToken::Token(Token::Id(String::from("DAT"))),
+            ScanToken::Token(Token::String(String::from("5 6 7 /* x"))),
+            ScanToken::Token(Token::String(String::from(""))),
+            ScanToken::Token(Token::String(String::from("end  data"))),
+            ScanToken::Token(Token::Id(String::from("end"))),
+            ScanToken::Token(Token::Id(String::from("data"))),
+            ScanToken::Token(Token::EndCommand),
+        ],
+    );
+}
+
+#[test]
+fn test_do_repeat() {
+    check_scan(
+        r#"do repeat x=a b c
+          y=d e f.
+  do repeat a=1 thru 5.
+another command.
+second command
++ third command.
+end /* x */ /* y */ repeat print.
+end
+ repeat.
+"#,
+        Mode::Auto,
+        &[
+            ScanToken::Token(Token::Id(String::from("do"))),
+            ScanToken::Token(Token::Id(String::from("repeat"))),
+            ScanToken::Token(Token::Id(String::from("x"))),
+            ScanToken::Token(Token::Punct(Punct::Equals)),
+            ScanToken::Token(Token::Id(String::from("a"))),
+            ScanToken::Token(Token::Id(String::from("b"))),
+            ScanToken::Token(Token::Id(String::from("c"))),
+            ScanToken::Token(Token::Id(String::from("y"))),
+            ScanToken::Token(Token::Punct(Punct::Equals)),
+            ScanToken::Token(Token::Id(String::from("d"))),
+            ScanToken::Token(Token::Id(String::from("e"))),
+            ScanToken::Token(Token::Id(String::from("f"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::String(String::from("  do repeat a=1 thru 5."))),
+            ScanToken::Token(Token::String(String::from("another command."))),
+            ScanToken::Token(Token::String(String::from("second command"))),
+            ScanToken::Token(Token::String(String::from("+ third command."))),
+            ScanToken::Token(Token::String(String::from(
+                "end /* x */ /* y */ repeat print.",
+            ))),
+            ScanToken::Token(Token::Id(String::from("end"))),
+            ScanToken::Token(Token::Id(String::from("repeat"))),
+            ScanToken::Token(Token::EndCommand),
+        ],
+    );
+}
+
+#[test]
+fn test_do_repeat_batch() {
+    check_scan(
+        r#"do repeat x=a b c
+          y=d e f
+do repeat a=1 thru 5
+another command
+second command
++ third command
+end /* x */ /* y */ repeat print
+end
+ repeat
+do
+  repeat #a=1
+
+  inner command
+end repeat
+"#,
+        Mode::Batch,
+        &[
+            ScanToken::Token(Token::Id(String::from("do"))),
+            ScanToken::Token(Token::Id(String::from("repeat"))),
+            ScanToken::Token(Token::Id(String::from("x"))),
+            ScanToken::Token(Token::Punct(Punct::Equals)),
+            ScanToken::Token(Token::Id(String::from("a"))),
+            ScanToken::Token(Token::Id(String::from("b"))),
+            ScanToken::Token(Token::Id(String::from("c"))),
+            ScanToken::Token(Token::Id(String::from("y"))),
+            ScanToken::Token(Token::Punct(Punct::Equals)),
+            ScanToken::Token(Token::Id(String::from("d"))),
+            ScanToken::Token(Token::Id(String::from("e"))),
+            ScanToken::Token(Token::Id(String::from("f"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))),
+            ScanToken::Token(Token::String(String::from("another command"))),
+            ScanToken::Token(Token::String(String::from("second command"))),
+            ScanToken::Token(Token::String(String::from("+ third command"))),
+            ScanToken::Token(Token::String(String::from(
+                "end /* x */ /* y */ repeat print",
+            ))),
+            ScanToken::Token(Token::Id(String::from("end"))),
+            ScanToken::Token(Token::Id(String::from("repeat"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("do"))),
+            ScanToken::Token(Token::Id(String::from("repeat"))),
+            ScanToken::Token(Token::Id(String::from("#a"))),
+            ScanToken::Token(Token::Punct(Punct::Equals)),
+            ScanToken::Token(Token::Number(1.0)),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::String(String::from("  inner command"))),
+            ScanToken::Token(Token::Id(String::from("end"))),
+            ScanToken::Token(Token::Id(String::from("repeat"))),
+        ],
+    );
 }
+
 #[test]
-fn test_strings2() {
-    check_scan(r#"""""
-'error
-'b'
-"#, &[]);
+fn test_batch_mode() {
+    check_scan(
+        r#"first command
+     another line of first command
++  second command
+third command
+
+fourth command.
+   fifth command.
+"#,
+        Mode::Batch,
+        &[
+            ScanToken::Token(Token::Id(String::from("first"))),
+            ScanToken::Token(Token::Id(String::from("command"))),
+            ScanToken::Token(Token::Id(String::from("another"))),
+            ScanToken::Token(Token::Id(String::from("line"))),
+            ScanToken::Token(Token::Id(String::from("of"))),
+            ScanToken::Token(Token::Id(String::from("first"))),
+            ScanToken::Token(Token::Id(String::from("command"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("second"))),
+            ScanToken::Token(Token::Id(String::from("command"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("third"))),
+            ScanToken::Token(Token::Id(String::from("command"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("fourth"))),
+            ScanToken::Token(Token::Id(String::from("command"))),
+            ScanToken::Token(Token::EndCommand),
+            ScanToken::Token(Token::Id(String::from("fifth"))),
+            ScanToken::Token(Token::Id(String::from("command"))),
+            ScanToken::Token(Token::EndCommand),
+        ],
+    );
+}
+
+mod define {
+    use crate::lex::{
+        scan::ScanToken,
+        segment::Mode,
+        token::{MacroToken, Punct, Token},
+    };
+
+    use super::check_scan;
+
+    #[test]
+    fn test_simple() {
+        check_scan(
+            r#"define !macro1()
+var1 var2 var3
+!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_no_newline_after_parentheses() {
+        check_scan(
+            r#"define !macro1() var1 var2 var3
+!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::String(String::from(" var1 var2 var3"))),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_no_newline_before_enddefine() {
+        check_scan(
+            r#"define !macro1()
+var1 var2 var3!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_all_on_one_line() {
+        check_scan(
+            r#"define !macro1()var1 var2 var3!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_empty() {
+        check_scan(
+            r#"define !macro1()
+!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_blank_lines() {
+        check_scan(
+            r#"define !macro1()
+
+
+!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::String(String::from(""))),
+                ScanToken::Token(Token::String(String::from(""))),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_arguments() {
+        check_scan(
+            r#"define !macro1(a(), b(), c())
+!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Id(String::from("a"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::Punct(Punct::Comma)),
+                ScanToken::Token(Token::Id(String::from("b"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::Punct(Punct::Comma)),
+                ScanToken::Token(Token::Id(String::from("c"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_multiline_arguments() {
+        check_scan(
+            r#"define !macro1(
+  a(), b(
+  ),
+  c()
+)
+!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Id(String::from("a"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::Punct(Punct::Comma)),
+                ScanToken::Token(Token::Id(String::from("b"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::Punct(Punct::Comma)),
+                ScanToken::Token(Token::Id(String::from("c"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_arguments_start_on_second_line() {
+        check_scan(
+            r#"define !macro1
+(x,y,z
+)
+content 1
+content 2
+!enddefine.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Id(String::from("x"))),
+                ScanToken::Token(Token::Punct(Punct::Comma)),
+                ScanToken::Token(Token::Id(String::from("y"))),
+                ScanToken::Token(Token::Punct(Punct::Comma)),
+                ScanToken::Token(Token::Id(String::from("z"))),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::String(String::from("content 1"))),
+                ScanToken::Token(Token::String(String::from("content 2"))),
+                ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
+                    "!enddefine",
+                )))),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_early_end_of_command_1() {
+        check_scan(
+            r#"define !macro1.
+data list /x 1.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::EndCommand),
+                ScanToken::Token(Token::Id(String::from("data"))),
+                ScanToken::Token(Token::Id(String::from("list"))),
+                ScanToken::Token(Token::Punct(Punct::Slash)),
+                ScanToken::Token(Token::Id(String::from("x"))),
+                ScanToken::Token(Token::Number(1.0)),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_early_end_of_command_2() {
+        check_scan(
+            r#"define !macro1
+x.
+data list /x 1.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Id(String::from("x"))),
+                ScanToken::Token(Token::EndCommand),
+                ScanToken::Token(Token::Id(String::from("data"))),
+                ScanToken::Token(Token::Id(String::from("list"))),
+                ScanToken::Token(Token::Punct(Punct::Slash)),
+                ScanToken::Token(Token::Id(String::from("x"))),
+                ScanToken::Token(Token::Number(1.0)),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_early_end_of_command_3() {
+        check_scan(
+            r#"define !macro1(.
+x.
+data list /x 1.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::EndCommand),
+                ScanToken::Token(Token::Id(String::from("x"))),
+                ScanToken::Token(Token::EndCommand),
+                ScanToken::Token(Token::Id(String::from("data"))),
+                ScanToken::Token(Token::Id(String::from("list"))),
+                ScanToken::Token(Token::Punct(Punct::Slash)),
+                ScanToken::Token(Token::Id(String::from("x"))),
+                ScanToken::Token(Token::Number(1.0)),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_early_end_of_command_4() {
+        // Notice the command terminator at the end of the DEFINE command, which
+        // should not be there and ends it early. NOTE: duplicates test 1's input.
+        check_scan(
+            r#"define !macro1.
+data list /x 1.
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::EndCommand),
+                ScanToken::Token(Token::Id(String::from("data"))),
+                ScanToken::Token(Token::Id(String::from("list"))),
+                ScanToken::Token(Token::Punct(Punct::Slash)),
+                ScanToken::Token(Token::Id(String::from("x"))),
+                ScanToken::Token(Token::Number(1.0)),
+                ScanToken::Token(Token::EndCommand),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_missing_enddefine() {
+        check_scan(
+            r#"define !macro1()
+content line 1
+content line 2
+"#,
+            Mode::Auto,
+            &[
+                ScanToken::Token(Token::Id(String::from("define"))),
+                ScanToken::Token(Token::String(String::from("!macro1"))),
+                ScanToken::Token(Token::Punct(Punct::LParen)),
+                ScanToken::Token(Token::Punct(Punct::RParen)),
+                ScanToken::Token(Token::String(String::from("content line 1"))),
+                ScanToken::Token(Token::String(String::from("content line 2"))),
+                ScanToken::Token(Token::End),
+            ],
+        );
+    }
 }
index ca7dfd0686a6ad30d0d704c9d8d5e356d483cac2..eae0b4810cf49fe56a8026ba8da5262ac49191b3 100644 (file)
@@ -381,14 +381,14 @@ fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomple
 }
 
 fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
-    let (Some(c), _rest) = take(input, eof)? else {
+    let (Some(c), rest) = take(input, eof)? else {
         return Ok(false);
     };
     match c {
-        'x' | 'X' | 'u' | 'U' => Ok({
-            let (c, _rest) = take(input, eof)?;
-            c == Some('\'') || c == Some('"')
-        }),
+        'x' | 'X' | 'u' | 'U' => {
+            let (c, _rest) = take(rest, eof)?;
+            Ok(c == Some('\'') || c == Some('"'))
+        },
         '\'' | '"' => Ok(true),
         '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true),
         _ => Ok(false),
index 056ac7b031e98f9ac7192f638e2b09e5c7e35e56..dd2d50cb62a960ad34ff3259e9bf7317dcdce92f 100644 (file)
@@ -111,7 +111,7 @@ fn check_segmentation(
 
 #[allow(dead_code)]
 fn print_segmentation(mut input: &str) {
-    let mut segmenter = Segmenter::new(Mode::Auto, false);
+    let mut segmenter = Segmenter::new(Mode::Interactive, false);
     loop {
         let (rest, segment) = segmenter.push(input, true).unwrap();
         let len = input.len() - rest.len();
@@ -2157,37 +2157,3 @@ fourth command.
         ],
     );
 }
-
-#[test]
-fn test_strings2() {
-    print_segmentation(r#"'x' "y" 'abc'
-'Don''t' "Can't" 'Won''t'
-"""quoted""" '"quoted"'
-'' "" '''' """"
-'missing end quote
-"missing double quote
-'x' + "y"
-+ 'z' +
-'a' /* abc */ + "b" /*
-+ 'c' +/* */"d"/* */+'e'
-'foo'
-+          /* special case: + in column 0 would ordinarily start a new command
-'bar'
-'foo'
- +
-'bar'
-'foo'
-+
-
-'bar'
-
-+
-x"4142"+'5152'
-"4142"+
-x'5152'
-x"4142"
-+u'304a'
-"�あいうえお"
-"abc"+U"FFFD"+u'3048'+"xyz"
-"#);
-}
index 2c3489c55400439cf2f0b32b7cc296fccfdfaec8..016b2828386072f5273f7e96efd413c0f03d2cb7 100644 (file)
@@ -1,7 +1,3 @@
-use thiserror::Error as ThisError;
-
-use super::segment::Segment;
-
 #[derive(Clone, Debug, PartialEq)]
 pub enum Token {
     /// End of input.
@@ -143,174 +139,3 @@ pub enum MacroToken {
     /// first character, so this represents an underscore found on its own.
     Underscore,
 }
-
-#[derive(ThisError, Debug, PartialEq, Eq)]
-pub enum TokenError {
-    /// Unterminated string constant.
-    #[error("Unterminated string constant.")]
-    ExpectedQuote,
-
-    /// Missing exponent.
-    #[error("Missing exponent following `{0}`")]
-    ExpectedExponent(String),
-
-    /// Odd length hex string.
-    #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
-    OddLengthHexString(usize),
-
-    /// Invalid hex digit.
-    #[error("Invalid hex digit {0:?}.")]
-    BadHexDigit(char),
-
-    /// Invalid length Unicode string.
-    #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
-    BadLengthUnicodeString(usize),
-
-    /// Invalid code point.
-    #[error("U+{0:04X} is not a valid Unicode code point.")]
-    BadCodePoint(u32),
-
-    /// Expected hexadecimal Unicode code point
-    #[error("Expected hexadecimal Unicode code point.")]
-    ExpectedCodePoint,
-
-    /// `DO REPEAT` nested too deeply.
-    #[error("`DO REPEAT` nested too deeply.")]
-    DoRepeatOverflow,
-
-    /// Unexpected character.
-    #[error("Unexpected character {0:?} in input.")]
-    UnexpectedChar(char),
-}
-
-impl Token {
-    pub fn try_from_segment(s: &str, segment: Segment) -> Result<Option<Self>, TokenError> {
-        match segment {
-            Segment::Number => Ok(Some(Self::Number(s.parse().unwrap()))),
-            Segment::QuotedString => {
-                // Trim quote mark from front and back.
-                let mut chars = s.chars();
-                let quote = chars.next().unwrap();
-                let s = chars.as_str().strip_suffix(quote).unwrap();
-
-                // Replace doubled quotes by single ones.
-                let (single_quote, double_quote) = match quote {
-                    '\'' => ("'", "''"),
-                    '"' => ("\"", "\"\""),
-                    _ => unreachable!(),
-                };
-                Ok(Some(Self::String(s.replace(double_quote, single_quote))))
-            }
-            Segment::HexString => {
-                // Strip `X"` prefix and `"` suffix (or variations).
-                let s = &s[2..s.len() - 1];
-                for c in s.chars() {
-                    if !c.is_ascii_hexdigit() {
-                        return Err(TokenError::BadHexDigit(c))
-                    }
-                }
-                if s.len() % 2 != 0 {
-                    return Err(TokenError::OddLengthHexString(s.len()))
-                }
-                let mut out = String::with_capacity(s.len());
-                for pair in s.as_bytes().chunks_exact(2) {
-                    let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
-                    let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
-                    out.push(char::from(hi * 16 + lo));
-                }
-                Ok(Some(Self::String(out)))
-            }
-            Segment::UnicodeString => {
-                // Strip `U"` prefix and `"` suffix (or variations).
-                let s = &s[2..s.len() - 1];
-                if !(1..=8).contains(&s.len()) {
-                    return Err(TokenError::BadLengthUnicodeString(s.len()));
-                }
-                let Ok(code_point) = u32::from_str_radix(s, 16) else {
-                    return Err(TokenError::ExpectedCodePoint);
-                };
-                let Some(c) = char::from_u32(code_point) else {
-                    return Err(TokenError::BadCodePoint(code_point));
-                };
-                Ok(Some(Self::String(String::from(c))))
-            }
-
-            Segment::UnquotedString
-            | Segment::DoRepeatCommand
-            | Segment::InlineData
-            | Segment::Document
-            | Segment::MacroBody
-            | Segment::MacroName => Ok(Some(Self::String(String::from(s)))),
-
-            Segment::ReservedWord => {
-                let c0 = s.as_bytes()[0].to_ascii_uppercase();
-                let c1 = s.as_bytes()[1].to_ascii_uppercase();
-                match (c0, c1) {
-                    (b'B', _) => Ok(Some(Self::Punct(Punct::By))),
-                    (b'E', _) => Ok(Some(Self::Punct(Punct::Eq))),
-                    (b'G', b'T') => Ok(Some(Self::Punct(Punct::Gt))),
-                    (b'G', _) => Ok(Some(Self::Punct(Punct::Ge))),
-                    (b'L', b'T') => Ok(Some(Self::Punct(Punct::Lt))),
-                    (b'L', _) => Ok(Some(Self::Punct(Punct::Le))),
-                    (b'N', b'E') => Ok(Some(Self::Punct(Punct::Ne))),
-                    (b'N', _) => Ok(Some(Self::Punct(Punct::Not))),
-                    (b'O', _) => Ok(Some(Self::Punct(Punct::Or))),
-                    (b'T', _) => Ok(Some(Self::Punct(Punct::To))),
-                    (b'A', b'L') => Ok(Some(Self::Punct(Punct::All))),
-                    (b'A', _) => Ok(Some(Self::Punct(Punct::And))),
-                    (b'W', _) => Ok(Some(Self::Punct(Punct::With))),
-                    _ => unreachable!(),
-                }
-            }
-            Segment::Identifier => Ok(Some(Self::Id(String::from(s)))),
-            Segment::Punct => match s {
-                "(" => Ok(Some(Self::Punct(Punct::LParen))),
-                ")" => Ok(Some(Self::Punct(Punct::RParen))),
-                "[" => Ok(Some(Self::Punct(Punct::LSquare))),
-                "]" => Ok(Some(Self::Punct(Punct::RSquare))),
-                "{" => Ok(Some(Self::Punct(Punct::LCurly))),
-                "}" => Ok(Some(Self::Punct(Punct::RCurly))),
-                "," => Ok(Some(Self::Punct(Punct::Comma))),
-                "=" => Ok(Some(Self::Punct(Punct::Equals))),
-                "-" => Ok(Some(Self::Punct(Punct::Dash))),
-                "&" => Ok(Some(Self::Punct(Punct::And))),
-                "|" => Ok(Some(Self::Punct(Punct::Or))),
-                "+" => Ok(Some(Self::Punct(Punct::Plus))),
-                "/" => Ok(Some(Self::Punct(Punct::Slash))),
-                "*" => Ok(Some(Self::Punct(Punct::Asterisk))),
-                "<" => Ok(Some(Self::Punct(Punct::Lt))),
-                ">" => Ok(Some(Self::Punct(Punct::Gt))),
-                "~" => Ok(Some(Self::Punct(Punct::Not))),
-                ":" => Ok(Some(Self::Punct(Punct::Colon))),
-                ";" => Ok(Some(Self::Punct(Punct::Semicolon))),
-                "**" => Ok(Some(Self::Punct(Punct::Exp))),
-                "<=" => Ok(Some(Self::Punct(Punct::Le))),
-                "<>" => Ok(Some(Self::Punct(Punct::Ne))),
-                "~=" => Ok(Some(Self::Punct(Punct::Ne))),
-                ">=" => Ok(Some(Self::Punct(Punct::Ge))),
-                "!" => Ok(Some(Self::MacroToken(MacroToken::Bang))),
-                "%" => Ok(Some(Self::MacroToken(MacroToken::Percent))),
-                "?" => Ok(Some(Self::MacroToken(MacroToken::Question))),
-                "`" => Ok(Some(Self::MacroToken(MacroToken::Backtick))),
-                "_" => Ok(Some(Self::MacroToken(MacroToken::Underscore))),
-                "." => Ok(Some(Self::MacroToken(MacroToken::Dot))),
-                _ => unreachable!("bad punctuator {s:?}"),
-            },
-            Segment::Shbang
-            | Segment::Spaces
-            | Segment::Comment
-            | Segment::Newline
-            | Segment::CommentCommand => Ok(None),
-            Segment::DoRepeatOverflow => Err(TokenError::DoRepeatOverflow),
-            Segment::MacroId => Ok(Some(Self::MacroToken(MacroToken::MacroId(String::from(s))))),
-            Segment::StartDocument => Ok(Some(Self::Id(String::from("DOCUMENT")))),
-            Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
-                Ok(Some(Self::EndCommand))
-            }
-            Segment::End => Ok(Some(Self::End)),
-            Segment::ExpectedQuote => Err(TokenError::ExpectedQuote),
-            Segment::ExpectedExponent => Err(TokenError::ExpectedExponent(String::from(s))),
-            Segment::UnexpectedChar => Err(TokenError::UnexpectedChar(s.chars().next().unwrap())),
-        }
-    }
-}