From 0a1d87ae85656e3bf7401814f933a0d032b88652 Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Tue, 4 Jun 2024 22:20:29 -0700
Subject: [PATCH] work on lexer

---
 rust/src/lex/mod.rs     |  1 +
 rust/src/lex/segment.rs | 83 +++++++++++++++++++++++++++++++++++++++++
 rust/src/lib.rs         |  3 ++
 3 files changed, 87 insertions(+)
 create mode 100644 rust/src/lex/mod.rs
 create mode 100644 rust/src/lex/segment.rs

diff --git a/rust/src/lex/mod.rs b/rust/src/lex/mod.rs
new file mode 100644
index 0000000000..c5780e0db5
--- /dev/null
+++ b/rust/src/lex/mod.rs
@@ -0,0 +1 @@
+pub mod segment;
diff --git a/rust/src/lex/segment.rs b/rust/src/lex/segment.rs
new file mode 100644
index 0000000000..597e5766c5
--- /dev/null
+++ b/rust/src/lex/segment.rs
@@ -0,0 +1,83 @@
+//! Syntax segmentation.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning".  This module implements the segmentation phase.
+//! [`super::scan`] contains declarations for the scanning phase.
+//!
+//! Segmentation accepts a stream of UTF-8 bytes as input.  It outputs a label
+//! (a segment type) for each byte or contiguous sequence of bytes in the input.
+//! It also, in a few corner cases, outputs zero-width segments that label the
+//! boundary between a pair of bytes in the input.
+//!
+//! Some segment types correspond directly to tokens; for example, an "identifier"
+//! segment ([`Type::Identifier`]) becomes an identifier token (T_ID) later in
+//! lexical analysis.  Other segments contribute to tokens but do not correspond
+//! directly; for example, multiple quoted string segments ([`Type::QuotedString`])
+//! separated by spaces ([`Type::Spaces`]) and "+" punctuators ([`Type::Punct`]) may
+//! be combined to form a single string token (T_STRING).  Still other segments are
+//! ignored (e.g. [`Type::Spaces`]) or trigger special behavior such as error
+//! messages later in tokenization (e.g. [`Type::ExpectedQuote`]).
+
+/// The syntax mode that the segmenter assumes for its input.
+///
+/// PSPP syntax comes in two flavors, which can broadly be told apart as
+/// follows:
+///
+/// - Interactive mode: a command is terminated by a period at the end of a
+///   line, or by a blank line.
+///
+/// - Batch mode: every line of a command after the first is indented from
+///   the left margin.
+///
+/// A third setting asks the segmenter to guess which mode is in use, via a
+/// heuristic that is usually correct.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub enum Mode {
+    /// Accept input written for either interactive or batch mode, working
+    /// out which one applies automatically.  This is the default.
+    #[default]
+    Auto,
+
+    /// Treat input as interactive syntax.
+    Interactive,
+
+    /// Treat input as batch syntax.
+    Batch,
+}
+
+/// The type of a segment: the label that segmentation assigns to a span of input.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)] // no `Default`: no variant is marked `#[default]`
+pub enum Type {
+    Number,
+    QuotedString, // several of these may later be combined into one string token
+    HexString,
+    UnicodeString,
+    UnquotedString,
+    ReservedWord,
+    Identifier, // becomes an identifier token later in lexical analysis
+    Punct, // e.g. the "+" that joins quoted string segments
+    Shbang,
+    Spaces, // ignored later in lexical analysis
+    Comment,
+    Newline,
+    CommentCommand,
+    DoRepeatCommand,
+    InlineData,
+    MacroId,
+    MacroName,
+    MacroBody,
+    StartDocument,
+    Document,
+    StartCommand,
+    SeparateCommands,
+    EndCommand,
+    End,
+    ExpectedQuote, // triggers an error message later in tokenization
+    ExpectedExponent,
+    UnexpectedChar,
+}
+
+pub struct Segmenter { // wraps a single private `state` field; no methods are defined in this file
+    state: State // NOTE(review): `State` is not declared or imported in this file - confirm its definition
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index f8e880c14e..933c74ad09 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -1,3 +1,4 @@
+#[allow(unused_variables, unused_mut, dead_code)]
 pub mod cooked;
 pub mod dictionary;
 pub mod encoding;
@@ -5,5 +6,7 @@ pub mod endian;
 pub mod format;
 pub mod identifier;
+pub mod lex;
 pub mod locale_charset;
+#[allow(unused_variables, unused_mut, dead_code)]
 pub mod raw;
 pub mod sack;
-- 
2.30.2