// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
+//! High-level lexical analysis.
+
use std::{
borrow::{Borrow, Cow},
collections::VecDeque,
use chardetng::EncodingDetector;
use encoding_rs::{Encoding, UTF_8};
-use thiserror::Error as ThisError;
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
use crate::{
Stop,
}
-pub struct SourceFile {
+/// A syntax file and its contents.
+///
+/// This holds the entire contents of a syntax file, which are always read
+/// into memory and recoded into UTF-8 if necessary. It also includes the file
+/// name (if any) and an index that makes finding lines by line number more
+/// efficient.
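+///
+/// A minimal construction sketch (fenced as `ignore` because imports and
+/// crate paths are omitted here):
+///
+/// ```ignore
+/// // A Rust `String` is already UTF-8, so no recoding takes place.
+/// let file = SyntaxFile::for_string(String::from("DATA LIST FREE /x.\nLIST.\n"));
+/// ```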
+pub struct SyntaxFile {
-    /// `None` if this reader is not associated with a file.
+    /// `None` if this syntax file is not associated with a named file.
file_name: Option<Arc<String>>,
- /// Encoding.
+ /// Original encoding.
#[allow(dead_code)]
encoding: &'static Encoding,
/// Source file contents.
- buffer: String,
+ contents: String,
-    /// Byte offsets into `buffer` of starts of lines. The first element is 0.
+    /// Byte offsets into `contents` of starts of lines. The first element is 0.
lines: Vec<usize>,
}
-impl SourceFile {
- fn new(buffer: String, encoding: &'static Encoding, file_name: Option<String>) -> Self {
- let lines = once(0)
- .chain(buffer.match_indices('\n').map(|(index, _s)| index + 1))
- .filter(|index| *index < buffer.len())
- .collect::<Vec<_>>();
- Self {
- file_name: file_name.map(Arc::new),
- encoding,
- buffer,
- lines,
- }
- }
-
+impl SyntaxFile {
+    /// Returns a `SyntaxFile` by reading `path` and recoding its contents from
+    /// `encoding`, or from an automatically detected encoding if `encoding` is
+    /// `None`.
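+    ///
+    /// A usage sketch (the file name is illustrative; fenced as `ignore`
+    /// because it reads from the filesystem):
+    ///
+    /// ```ignore
+    /// // Pass `None` to detect the encoding from the file's contents.
+    /// let file = SyntaxFile::for_file("analysis.sps", None)?;
+    /// ```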
pub fn for_file<P>(path: P, encoding: Option<&'static Encoding>) -> IoResult<Self>
where
P: AsRef<Path>,
let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes);
Ok(Self::new(
contents.to_string(),
- encoding,
Some(path.as_ref().to_string_lossy().to_string()),
+ encoding,
))
}
- pub fn for_file_contents(
- contents: String,
- file_name: Option<String>,
- encoding: &'static Encoding,
- ) -> Self {
- Self::new(contents, encoding, file_name)
+ /// Creates a new `SyntaxFile` for `contents`, recording that `contents` was
+ /// originally encoded in `encoding` and that it was read from `file_name`.
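+    ///
+    /// For example (mirroring this crate's own tests):
+    ///
+    /// ```ignore
+    /// use encoding_rs::UTF_8;
+    ///
+    /// let file = SyntaxFile::new(
+    ///     String::from("LIST.\n"),
+    ///     Some(String::from("list.sps")),
+    ///     UTF_8,
+    /// );
+    /// ```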
+ pub fn new(contents: String, file_name: Option<String>, encoding: &'static Encoding) -> Self {
+ let lines = once(0)
+ .chain(contents.match_indices('\n').map(|(index, _s)| index + 1))
+ .filter(|index| *index < contents.len())
+ .collect::<Vec<_>>();
+ Self {
+ file_name: file_name.map(Arc::new),
+ encoding,
+ contents,
+ lines,
+ }
}
- pub fn for_string(contents: String, encoding: &'static Encoding) -> Self {
- Self::new(contents, encoding, None)
+    /// Returns a `SyntaxFile` for `contents`, recording UTF-8 as its original
+    /// encoding and no associated file name.
+ pub fn for_string(contents: String) -> Self {
+ Self::new(contents, None, UTF_8)
}
fn offset_to_point(&self, offset: usize) -> Point {
Point {
line: line as i32,
column: Some(
- self.buffer
+ self.contents
.get(self.lines[line - 1]..offset)
.unwrap_or_default()
.width() as i32
let line_number = line_number as usize;
let start = self.lines[line_number - 1];
let end = self.lines.get(line_number).copied().unwrap_or(
- self.buffer[start..]
+ self.contents[start..]
.find('\n')
.map(|ofs| ofs + start)
- .unwrap_or(self.buffer.len()),
+ .unwrap_or(self.contents.len()),
);
- self.buffer[start..end].strip_newline()
+ self.contents[start..end].strip_newline()
} else {
""
}
}
}
-impl Default for SourceFile {
+impl Default for SyntaxFile {
fn default() -> Self {
- Self::new(String::new(), UTF_8, None)
+ Self::new(String::new(), None, UTF_8)
}
}
}
/// A token in a [`Source`].
+///
+/// This relates a token back to where it was read, which allows for better
+/// error reporting.
pub struct LexToken {
- /// The regular token.
+ /// The token.
pub token: Token,
- pub file: Arc<SourceFile>,
+ /// The source file that the token was read from.
+ pub file: Arc<SyntaxFile>,
-    /// For a token obtained through the lexer in an ordinary way, this is the
-    /// location of the token in the [`Source`]'s buffer.
+    /// For a token obtained through the lexer in an ordinary way, this is the
+    /// location of the token in the [`SyntaxFile`]'s contents.
impl LexToken {
fn representation(&self) -> &str {
- &self.file.buffer[self.pos.clone()]
+ &self.file.contents[self.pos.clone()]
}
}
pos: RangeInclusive<usize>,
}
-#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
-pub enum Error {
- /// Error forming tokens from the input.
- #[error("{0}")]
- TokenError(#[from] ScanError),
-}
-
+/// A sequence of tokens.
pub struct Tokens {
tokens: Vec<LexToken>,
}
}
}
+/// An iterator over the tokens in a [TokenSlice].
pub struct TokenSliceIter<'a> {
slice: &'a TokenSlice,
rest: Range<usize>,
}
impl<'a> TokenSliceIter<'a> {
+ /// Creates a new iterator for `slice`.
pub fn new(slice: &'a TokenSlice) -> Self {
Self {
slice,
rest: slice.range.clone(),
}
}
+
+ /// Returns the tokens not yet visited by the iterator.
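+    ///
+    /// A sketch of splitting off the unvisited tail of a slice (`slice` is
+    /// assumed to be a [TokenSlice] already in hand):
+    ///
+    /// ```ignore
+    /// let mut iter = TokenSliceIter::new(&slice);
+    /// iter.next(); // Visit the first token...
+    /// let rest = iter.remainder(); // ...and take the remaining tokens as a slice.
+    /// ```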
pub fn remainder(&self) -> TokenSlice {
TokenSlice {
backing: self.slice.backing.clone(),
}
}
+/// A subrange of tokens inside [Tokens].
#[derive(Clone)]
pub struct TokenSlice {
backing: Rc<Tokens>,
}
}
+#[allow(missing_docs)]
impl TokenSlice {
+    /// Creates a new slice spanning all of `backing` (excluding the trailing
+    /// end token).
pub fn new(backing: Rc<Tokens>) -> Self {
let range = 0..backing.tokens.len() - 1;
Self { backing, range }
fn tokens(&self) -> &[LexToken] {
&self.backing.tokens[self.range.clone()]
}
+ /// Returns the token with the given `index`, or `None` if `index` is out of
+ /// range.
pub fn get_token(&self, index: usize) -> Option<&Token> {
self.get(index).map(|token| &token.token)
}
+ /// Returns the [LexToken] with the given `index`, or `None` if `index` is
+ /// out of range.
pub fn get(&self, index: usize) -> Option<&LexToken> {
self.tokens().get(index)
}
+ /// Returns an error with the given `text`, citing these tokens.
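+    ///
+    /// A sketch of reporting a parse problem (`tokens` is assumed to be a
+    /// [TokenSlice] covering the offending syntax; what the caller does with
+    /// the [Diagnostic] afterward is up to it):
+    ///
+    /// ```ignore
+    /// let diagnostic = tokens.error("Syntax error expecting subcommand name.");
+    /// ```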
pub fn error<S>(&self, text: S) -> Diagnostic
where
S: ToString,
self.diagnostic(Severity::Error, text.to_string())
}
+ /// Returns a warning with the given `text`, citing these tokens.
pub fn warning<S>(&self, text: S) -> Diagnostic
where
S: ToString,
self.subslice(self.len()..self.len())
}
- fn file(&self) -> Option<&Arc<SourceFile>> {
+ fn file(&self) -> Option<&Arc<SyntaxFile>> {
let first = self.first();
let last = self.last();
if Arc::ptr_eq(&first.file, &last.file) {
let start = token0.pos.start;
let end = token1.pos.end;
if start < end {
- return Some(&file.buffer[start..end]);
+ return Some(&file.contents[start..end]);
}
}
}
}
}
+/// A source of tokens read from a [SyntaxFile].
pub struct Source {
- file: Arc<SourceFile>,
+ file: Arc<SyntaxFile>,
segmenter: Segmenter,
seg_pos: usize,
lookahead: VecDeque<LexToken>,
}
impl Source {
- pub fn new_default(file: &Arc<SourceFile>) -> Self {
+ /// Creates a new `Source` reading from `file`, using the default [Syntax].
+ pub fn new_default(file: &Arc<SyntaxFile>) -> Self {
Self::new(file, Syntax::default())
}
- pub fn new(file: &Arc<SourceFile>, syntax: Syntax) -> Self {
+ /// Creates a new `Source` reading from `file` using `syntax`.
+ pub fn new(file: &Arc<SyntaxFile>, syntax: Syntax) -> Self {
Self {
file: file.clone(),
segmenter: Segmenter::new(syntax, false),
}
}
+ /// Reads and returns a whole command from this source, expanding the given
+ /// `macros` as it reads.
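+    ///
+    /// A sketch of draining a source command by command (assuming `MacroSet`
+    /// has an empty constructor):
+    ///
+    /// ```ignore
+    /// let file = Arc::new(SyntaxFile::for_string(String::from("LIST.\n")));
+    /// let mut source = Source::new_default(&file);
+    /// let macros = MacroSet::new(); // Assumed constructor for an empty set.
+    /// while let Some(tokens) = source.read_command(&macros) {
+    ///     // Parse and execute `tokens` here.
+    /// }
+    /// ```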
pub fn read_command(&mut self, macros: &MacroSet) -> Option<Tokens> {
loop {
if let Some(end) = self
if self.lookahead.is_empty() {
return None;
}
- let len = self.file.buffer.len();
+ let len = self.file.contents.len();
self.lookahead.push_back(LexToken {
token: Token::End,
file: self.file.clone(),
}
}
- pub fn read_lookahead(&mut self, macros: &MacroSet) -> bool {
+ fn read_lookahead(&mut self, macros: &MacroSet) -> bool {
let mut errors = Vec::new();
let mut pp = VecDeque::new();
while let Some((seg_len, seg_type)) = self
.segmenter
- .push(&self.file.buffer[self.seg_pos..], true)
+ .push(&self.file.contents[self.seg_pos..], true)
.unwrap()
{
let pos = self.seg_pos..self.seg_pos + seg_len;
self.seg_pos += seg_len;
- match seg_type.to_token(&self.file.buffer[pos.clone()]) {
+ match seg_type.to_token(&self.file.contents[pos.clone()]) {
None => (),
Some(Ok(token)) => {
let end = token == Token::End;
return;
};
for token in src.range(1..) {
- if parser.push(&token.token, &self.file.buffer[token.pos.clone()], &|e| {
+ if parser.push(&token.token, &self.file.contents[token.pos.clone()], &|e| {
println!("{e:?}")
}) == ParseStatus::Complete
{
use crate::macros::MacroSet;
- use super::{Source, SourceFile};
+ use super::{Source, SyntaxFile};
#[test]
fn test() {
CROSSTABS VARIABLES X (1,7) Y (1,7) /TABLES X BY Y.
"#;
- let file = Arc::new(SourceFile::for_file_contents(
+ let file = Arc::new(SyntaxFile::new(
String::from(code),
Some(String::from("crosstabs.sps")),
UTF_8,
use crate::identifier::Identifier;
+/// A PSPP syntax token.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
/// Identifier.
- Id(Identifier),
+ Id(
+ /// The identifier.
+ Identifier,
+ ),
/// Number.
- Number(f64),
+ Number(
+ /// Numeric value.
+ f64,
+ ),
/// Quoted string.
String(String),
/// Command terminator or separator.
///
- /// Usually this is `.`, but a blank line also separates commands, and in
- /// batch mode any line that begins with a non-blank starts a new command.
+ /// The most common command terminator is `.`. A blank line also separates
+ /// commands. In [Batch](crate::lex::segment::Syntax::Batch) mode, any line
+ /// that begins with a non-blank starts a new command. Other special cases
+ /// exist, too.
End,
/// Operators, punctuators, and reserved words.
- Punct(Punct),
+ Punct(
+ /// The punctuator.
+ Punct,
+ ),
}
impl Token {
+ /// Returns the [Identifier] within this token, or `None` if this is not an
+ /// identifier token.
pub fn id(&self) -> Option<&Identifier> {
match self {
Self::Id(identifier) => Some(identifier),
}
}
+ /// Returns true if this token contains an [Identifier] that matches
+ /// `keyword` as decided by [Identifier::matches_keyword], false otherwise.
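+    ///
+    /// For example (`Identifier::new` is assumed to be the fallible
+    /// constructor for [Identifier]):
+    ///
+    /// ```ignore
+    /// let token = Token::Id(Identifier::new("VARIABLES").unwrap());
+    /// assert!(token.matches_keyword("variables"));
+    /// ```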
pub fn matches_keyword(&self, keyword: &str) -> bool {
self.id().is_some_and(|id| id.matches_keyword(keyword))
}
+ /// Returns the number within this token, or `None` if this is not a number
+ /// token.
pub fn as_number(&self) -> Option<f64> {
if let Self::Number(number) = self {
Some(*number)
}
}
+ /// Returns the integer within this token, or `None` if this is not a number
+ /// token with an integer value.
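+    ///
+    /// For example:
+    ///
+    /// ```ignore
+    /// assert_eq!(Token::Number(5.0).as_integer(), Some(5));
+    /// assert_eq!(Token::Number(5.5).as_integer(), None);
+    /// ```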
pub fn as_integer(&self) -> Option<i64> {
match self {
Self::Number(number)
}
}
- pub fn as_id(&self) -> Option<&Identifier> {
- match self {
- Self::Id(id) => Some(id),
- _ => None,
- }
- }
-
+ /// Returns the quoted string within this token, or `None` if this is not a
+ /// [Token::String] token.
pub fn as_string(&self) -> Option<&str> {
match self {
Self::String(string) => Some(string.as_str()),
}
}
+/// An operator, punctuator, or reserved word.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Punct {
/// `+`.
}
impl Punct {
+ /// Returns a syntax representation of this punctuator.
+ ///
+ /// Some punctuators have more than one valid syntax representation (for
+ /// example, [Punct::And] can be written as `AND` or `&`). This returns one
+ /// of the valid representations.
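+    ///
+    /// For example:
+    ///
+    /// ```ignore
+    /// assert_eq!(Punct::Plus.as_str(), "+");
+    /// ```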
pub fn as_str(&self) -> &'static str {
match self {
Self::Plus => "+",