From 0a1d87ae85656e3bf7401814f933a0d032b88652 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Tue, 4 Jun 2024 22:20:29 -0700
Subject: [PATCH] work on lexer

---
 rust/src/lex/mod.rs     |  1 +
 rust/src/lex/segment.rs | 83 +++++++++++++++++++++++++++++++++++++++++
 rust/src/lib.rs         |  3 ++
 3 files changed, 87 insertions(+)
 create mode 100644 rust/src/lex/mod.rs
 create mode 100644 rust/src/lex/segment.rs

diff --git a/rust/src/lex/mod.rs b/rust/src/lex/mod.rs
new file mode 100644
index 0000000000..c5780e0db5
--- /dev/null
+++ b/rust/src/lex/mod.rs
@@ -0,0 +1 @@
+pub mod segment;
diff --git a/rust/src/lex/segment.rs b/rust/src/lex/segment.rs
new file mode 100644
index 0000000000..597e5766c5
--- /dev/null
+++ b/rust/src/lex/segment.rs
@@ -0,0 +1,83 @@
+//! Syntax segmentation.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning". This module implements the segmentation phase.
+//! [`super::scan`] contains declarations for the scanning phase.
+//!
+//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
+//! (a segment type) for each byte or contiguous sequence of bytes in the input.
+//! It also, in a few corner cases, outputs zero-width segments that label the
+//! boundary between a pair of bytes in the input.
+//!
+//! Some segment types correspond directly to tokens; for example, an
+//! "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID)
+//! later in lexical analysis. Other segments contribute to tokens but do not
+//! correspond directly; for example, multiple quoted string segments
+//! (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators
+//! (SEG_PUNCT) may be combined to form a single string token (T_STRING). Still
+//! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior
+//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE).
+
+/// Segmentation mode.
+///
+/// PSPP syntax is written in one of two modes which are broadly defined as
+/// follows:
+///
+/// - In interactive mode, commands end with a period at the end of the line
+///   or with a blank line.
+///
+/// - In batch mode, the second and subsequent lines of a command are indented
+///   from the left margin.
+///
+/// The segmenter can also try to automatically detect the mode in use, using a
+/// heuristic that is usually correct.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+pub enum Mode {
+    /// Try to interpret input correctly regardless of whether it is written
+    /// for interactive or batch mode.
+    #[default]
+    Auto,
+
+    /// Interactive syntax mode.
+    Interactive,
+
+    /// Batch syntax mode.
+    Batch,
+}
+
+/// The type of a segment.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)] // No `Default`: that derive requires a `#[default]` variant.
+pub enum Type {
+    Number,
+    QuotedString,
+    HexString,
+    UnicodeString,
+    UnquotedString,
+    ReservedWord,
+    Identifier,
+    Punct,
+    Shbang,
+    Spaces,
+    Comment,
+    Newline,
+    CommentCommand,
+    DoRepeatCommand,
+    InlineData,
+    MacroId,
+    MacroName,
+    MacroBody,
+    StartDocument,
+    Document,
+    StartCommand,
+    SeparateCommands,
+    EndCommand,
+    End,
+    ExpectedQuote,
+    ExpectedExponent,
+    UnexpectedChar,
+}
+
+pub struct Segmenter {
+    state: State, // NOTE(review): `State` is not declared anywhere in this patch.
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index f8e880c14e..933c74ad09 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -1,3 +1,4 @@
+#[allow(unused_variables, unused_mut, dead_code)]
 pub mod cooked;
 pub mod dictionary;
 pub mod encoding;
@@ -5,5 +6,7 @@ pub mod endian;
 pub mod format;
 pub mod identifier;
 pub mod locale_charset;
+#[allow(unused_variables, unused_mut, dead_code)]
 pub mod raw;
 pub mod sack;
+pub mod lex;
-- 
2.30.2