scan: New library for high-level PSPP syntax lexical analysis.

author Ben Pfaff <blp@cs.stanford.edu>

Sat, 19 Mar 2011 23:32:16 +0000 (16:32 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 20 Mar 2011 16:43:44 +0000 (09:43 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sat, 19 Mar 2011 23:32:16 +0000 (16:32 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 20 Mar 2011 16:43:44 +0000 (09:43 -0700)
diff --git a/Smake b/Smake

index 14c2a75f28fc1a482eed7e6e7d1c2f5e2027d789..683a8e382871f190c1a824d66f263a7a65a602c1 100644 (file)
--- a/Smake
+++ b/Smake
@@ -71,6 +71,7 @@ GNULIB_MODULES = \
         sys_stat \
         tempname \
         trunc \
         sys_stat \
         tempname \
         trunc \
+       unictype/ctype-print \
         unictype/property-id-continue \
         unictype/property-id-start \
         unigbrk/uc-is-grapheme-break \
         unictype/property-id-continue \
         unictype/property-id-start \
         unigbrk/uc-is-grapheme-break \
diff --git a/src/language/lexer/automake.mk b/src/language/lexer/automake.mk

index b3d06fece79dd319f9a4306519fef1fde0eb74e4..be48873e8d3e8fdb701109b31f9603112bf73205 100644 (file)
--- a/src/language/lexer/automake.mk
+++ b/src/language/lexer/automake.mk
@@ -10,8 +10,12 @@ language_lexer_sources = \
         src/language/lexer/subcommand-list.h \
         src/language/lexer/format-parser.c \
         src/language/lexer/format-parser.h \
         src/language/lexer/subcommand-list.h \
         src/language/lexer/format-parser.c \
         src/language/lexer/format-parser.h \
+       src/language/lexer/scan.c \
+       src/language/lexer/scan.h \
         src/language/lexer/segment.c \
         src/language/lexer/segment.h \
         src/language/lexer/segment.c \
         src/language/lexer/segment.h \
+       src/language/lexer/token.c \
+       src/language/lexer/token.h \
         src/language/lexer/value-parser.c \
         src/language/lexer/value-parser.h \
         src/language/lexer/variable-parser.c \
         src/language/lexer/value-parser.c \
         src/language/lexer/value-parser.h \
         src/language/lexer/variable-parser.c \
diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c

new file mode 100644 (file)

index 0000000..caf294a
--- /dev/null
+++ b/src/language/lexer/scan.c
@@ -0,0 +1,596 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "language/lexer/scan.h"
+
+#include <limits.h>
+#include <unistr.h>
+
+#include "data/identifier.h"
+#include "language/lexer/token.h"
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+
+#include "gl/c-ctype.h"
+#include "gl/xmemdup0.h"
+
+/* Scanner states, stored in the 'state' member of struct scanner. */
+enum
+  {
+    S_START,                    /* Initial state, set by scanner_init(). */
+    S_DASH,                     /* A lone "-" was seen; a number may follow. */
+    S_STRING                    /* A string segment was scanned; "+" and
+                                   another string segment may follow. */
+  };
+
+/* Bits in the 'substate' member of struct scanner, used only in state
+   S_STRING.  Each bit records something seen since the most recent string
+   segment; add_bit() refuses to set a bit a second time. */
+#define SS_NL_BEFORE_PLUS (1u << 0)     /* New-line seen before any "+". */
+#define SS_PLUS           (1u << 1)     /* "+" seen. */
+#define SS_NL_AFTER_PLUS  (1u << 2)     /* New-line seen after the "+". */
+
+/* Returns the value, in the range 0...15, of hexadecimal digit C, which may
+   be in either case.  Returns INT_MAX if C is not a hexadecimal digit. */
+static int
+digit_value (int c)
+{
+  static const char lower[] = "0123456789abcdef";
+  static const char upper[] = "0123456789ABCDEF";
+  int value;
+
+  /* Only the 16 digit positions are examined, never the null terminators. */
+  for (value = 0; value < 16; value++)
+    if (c == lower[value] || c == upper[value])
+      return value;
+
+  return INT_MAX;
+}
+
+/* Appends the contents of quoted string segment S to TOKEN's string.  S
+   includes its opening and closing quote characters, which are dropped.
+   Within the string, each quote character is copied once and the byte after
+   it is skipped, which turns a doubled quote into a single one.  (This
+   assumes that the quote character only ever appears doubled inside S.)
+
+   Always returns true. */
+static bool
+scan_quoted_string__ (struct substring s, struct token *token)
+{
+  /* The closing delimiter tells us which quote character is in use. */
+  int quote = s.string[s.length - 1];
+  size_t pos;
+
+  /* Drop the delimiters at both ends. */
+  s.string++;
+  s.length -= 2;
+
+  /* The output is no longer than the input, plus room for a terminator. */
+  ss_realloc (&token->string, token->string.length + s.length + 1);
+
+  while ((pos = ss_find_byte (s, quote)) != SIZE_MAX)
+    {
+      /* Copy through the first quote of the pair, then skip the second. */
+      memcpy (ss_end (token->string), s.string, pos + 1);
+      token->string.length += pos + 1;
+      ss_advance (&s, pos + 2);
+    }
+
+  /* Copy whatever follows the last embedded quote. */
+  memcpy (ss_end (token->string), s.string, ss_length (s));
+  token->string.length += ss_length (s);
+
+  return true;
+}
+
+/* Decodes hex string segment S, which has the form X'...', and appends the
+   decoded bytes to TOKEN's string.
+
+   Returns true if successful.  Otherwise returns false after setting
+   TOKEN's type to SCAN_BAD_HEX_LENGTH (with the number of hex digits as its
+   number) or SCAN_BAD_HEX_DIGIT (with the offending character as its
+   number). */
+static bool
+scan_hex_string__ (struct substring s, struct token *token)
+{
+  size_t n_bytes;
+  uint8_t *out;
+  size_t ofs;
+
+  /* Trim X' from front and ' from back. */
+  s.string += 2;
+  s.length -= 3;
+
+  /* Every output byte needs exactly two hex digits. */
+  if (s.length % 2 != 0)
+    {
+      token->type = SCAN_BAD_HEX_LENGTH;
+      token->number = s.length;
+      return false;
+    }
+
+  n_bytes = s.length / 2;
+  ss_realloc (&token->string, token->string.length + n_bytes + 1);
+  out = CHAR_CAST (uint8_t *, ss_end (token->string));
+  token->string.length += n_bytes;
+
+  for (ofs = 0; ofs < s.length; ofs += 2)
+    {
+      int hi = digit_value (s.string[ofs]);
+      int lo = digit_value (s.string[ofs + 1]);
+
+      /* Report the first bad digit of the pair. */
+      if (hi >= 16)
+        {
+          token->type = SCAN_BAD_HEX_DIGIT;
+          token->number = s.string[ofs];
+          return false;
+        }
+      if (lo >= 16)
+        {
+          token->type = SCAN_BAD_HEX_DIGIT;
+          token->number = s.string[ofs + 1];
+          return false;
+        }
+
+      *out++ = hi * 16 + lo;
+    }
+
+  return true;
+}
+
+/* Decodes Unicode string segment S, which has the form U'...' where the
+   contents are 1 to 8 hex digits naming a single code point, and appends the
+   UTF-8 encoding of that code point to TOKEN's string.
+
+   Returns true if successful.  Otherwise returns false after setting
+   TOKEN's type and number to describe the problem:
+
+     - SCAN_BAD_UNICODE_LENGTH: number is the count of hex digits.
+
+     - SCAN_BAD_UNICODE_DIGIT: number is the offending character.
+
+     - SCAN_BAD_UNICODE_CODE_POINT: number is the code point, which is a
+       surrogate or greater than 0x10ffff. */
+static bool
+scan_unicode_string__ (struct substring s, struct token *token)
+{
+  uint8_t *dst;
+  ucs4_t uc;
+  size_t i;
+
+  /* Trim U' from front and ' from back. */
+  s.string += 2;
+  s.length -= 3;
+
+  if (s.length < 1 || s.length > 8)
+    {
+      token->type = SCAN_BAD_UNICODE_LENGTH;
+      token->number = s.length;
+      return false;
+    }
+
+  /* Room for the longest encoding (4 bytes) plus a terminator. */
+  ss_realloc (&token->string, token->string.length + 4 + 1);
+
+  uc = 0;
+  for (i = 0; i < s.length; i++)
+    {
+      int digit = digit_value (s.string[i]);
+      if (digit >= 16)
+        {
+          token->type = SCAN_BAD_UNICODE_DIGIT;
+          token->number = s.string[i];
+          return false;
+        }
+      uc = uc * 16 + digit;
+    }
+
+  /* Reject surrogates and values beyond the last Unicode code point. */
+  if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
+    {
+      token->type = SCAN_BAD_UNICODE_CODE_POINT;
+      token->number = uc;
+      return false;
+    }
+
+  dst = CHAR_CAST (uint8_t *, ss_end (token->string));
+  token->string.length += u8_uctomb (dst, uc, 4);
+
+  return true;
+}
+
+/* Scans string segment S, whose type TYPE must be SEG_QUOTED_STRING,
+   SEG_HEX_STRING, or SEG_UNICODE_STRING, appending its value to TOKEN's
+   string.
+
+   On success, makes TOKEN a null-terminated T_STRING, puts SCANNER into state
+   S_STRING with a cleared substate, and returns SCAN_SAVE.  On failure,
+   frees TOKEN's string and returns SCAN_DONE, with TOKEN's type and number
+   describing the error. */
+static enum scan_result
+scan_string_segment__ (struct scanner *scanner, enum segment_type type,
+                       struct substring s, struct token *token)
+{
+  bool ok;
+
+  switch (type)
+    {
+    case SEG_QUOTED_STRING:
+      ok = scan_quoted_string__ (s, token);
+      break;
+
+    case SEG_HEX_STRING:
+      ok = scan_hex_string__ (s, token);
+      break;
+
+    case SEG_UNICODE_STRING:
+      ok = scan_unicode_string__ (s, token);
+      break;
+
+    default:
+      NOT_REACHED ();
+    }
+
+  if (!ok)
+    {
+      /* The function we called above should have filled in token->type and
+         token->number properly to describe the error. */
+      ss_dealloc (&token->string);
+      token->string = ss_empty ();
+      return SCAN_DONE;
+    }
+
+  token->type = T_STRING;
+  token->string.string[token->string.length] = '\0';
+  scanner->state = S_STRING;
+  scanner->substate = 0;
+  return SCAN_SAVE;
+}
+
+/* If BIT is already set in SCANNER's substate, returns SCAN_BACK.  Otherwise
+   sets BIT and returns SCAN_MORE. */
+static enum scan_result
+add_bit (struct scanner *scanner, unsigned int bit)
+{
+  if (scanner->substate & bit)
+    return SCAN_BACK;
+
+  scanner->substate |= bit;
+  return SCAN_MORE;
+}
+
+/* Handles the segment with type TYPE and text S when SCANNER is in state
+   S_STRING, that is, after a string segment.  Spaces and comments are passed
+   over.  A new-line or a "+" is accepted once (a new-line once on each side
+   of the "+").  A further string segment is appended to TOKEN only if a "+"
+   has been seen.  Anything else returns SCAN_BACK. */
+static enum scan_result
+scan_string__ (struct scanner *scanner, enum segment_type type,
+               struct substring s, struct token *token)
+{
+  switch (type)
+    {
+    case SEG_SPACES:
+    case SEG_COMMENT:
+      return SCAN_MORE;
+
+    case SEG_NEWLINE:
+      return add_bit (scanner, (scanner->substate & SS_PLUS
+                                ? SS_NL_AFTER_PLUS
+                                : SS_NL_BEFORE_PLUS));
+
+    case SEG_PUNCT:
+      if (s.length != 1 || s.string[0] != '+')
+        return SCAN_BACK;
+      return add_bit (scanner, SS_PLUS);
+
+    case SEG_QUOTED_STRING:
+    case SEG_HEX_STRING:
+    case SEG_UNICODE_STRING:
+      if (!(scanner->substate & SS_PLUS))
+        return SCAN_BACK;
+      return scan_string_segment__ (scanner, type, s, token);
+
+    default:
+      return SCAN_BACK;
+    }
+}
+
+/* Returns the token type for reserved word WORD, which must be one of the
+   words handled below.  Only as many characters of WORD as are needed to
+   tell the reserved words apart are examined, without regard to case. */
+static enum token_type
+scan_reserved_word__ (struct substring word)
+{
+  int c0 = c_toupper (word.string[0]);
+
+  if (c0 == 'A')
+    return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
+  else if (c0 == 'B')
+    return T_BY;
+  else if (c0 == 'E')
+    return T_EQ;
+  else if (c0 == 'G')
+    return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
+  else if (c0 == 'L')
+    return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
+  else if (c0 == 'N')
+    return word.length == 2 ? T_NE : T_NOT;
+  else if (c0 == 'O')
+    return T_OR;
+  else if (c0 == 'T')
+    return T_TO;
+  else if (c0 == 'W')
+    return T_WITH;
+
+  NOT_REACHED ();
+}
+
+/* Returns the token type for the single-character punctuator C0, which must
+   be one of the characters in the table below. */
+static enum token_type
+scan_punct1__ (char c0)
+{
+  static const struct
+    {
+      char c;
+      enum token_type type;
+    }
+  table[] =
+    {
+      { '(', T_LPAREN },
+      { ')', T_RPAREN },
+      { ',', T_COMMA },
+      { '=', T_EQUALS },
+      { '-', T_DASH },
+      { '[', T_LBRACK },
+      { ']', T_RBRACK },
+      { '&', T_AND },
+      { '|', T_OR },
+      { '+', T_PLUS },
+      { '/', T_SLASH },
+      { '*', T_ASTERISK },
+      { '<', T_LT },
+      { '>', T_GT },
+      { '~', T_NOT },
+    };
+  size_t i;
+
+  for (i = 0; i < sizeof table / sizeof table[0]; i++)
+    if (table[i].c == c0)
+      return table[i].type;
+
+  NOT_REACHED ();
+}
+
+/* Returns the token type for the two-character punctuator whose characters
+   are C0 and C1.  C1 is only examined when C0 is '<', to distinguish "<="
+   from the other two-character punctuator that starts with '<'. */
+static enum token_type
+scan_punct2__ (char c0, char c1)
+{
+  if (c0 == '*')
+    return T_EXP;
+  else if (c0 == '<')
+    {
+      if (c1 == '=')
+        return T_LE;
+      return T_NE;
+    }
+  else if (c0 == '>')
+    return T_GE;
+  else if (c0 == '~')
+    return T_NE;
+  else if (c0 == '&')
+    return T_AND;
+  else if (c0 == '|')
+    return T_OR;
+
+  NOT_REACHED ();
+}
+
+/* Returns the token type for punctuator S.  A length of 1 selects the
+   single-character table; any other length is treated as two characters. */
+static enum token_type
+scan_punct__ (struct substring s)
+{
+  if (s.length == 1)
+    return scan_punct1__ (s.string[0]);
+  else
+    return scan_punct2__ (s.string[0], s.string[1]);
+}
+
+/* Returns the value of the number in S, as converted by strtod().  S need
+   not be null-terminated: it is copied into a null-terminated buffer first,
+   on the stack if it fits and on the heap otherwise. */
+static double
+scan_number__ (struct substring s)
+{
+  char local[128];
+  char *heap = NULL;
+  const char *text;
+  double number;
+
+  if (s.length < sizeof local)
+    {
+      memcpy (local, s.string, s.length);
+      local[s.length] = '\0';
+      text = local;
+    }
+  else
+    text = heap = xmemdup0 (s.string, s.length);
+
+  number = strtod (text, NULL);
+
+  /* HEAP is null, and this is a no-op, when the stack buffer was used. */
+  free (heap);
+
+  return number;
+}
+
+/* Makes TOKEN a SCAN_UNEXPECTED_CHAR token whose number is the first
+   character decoded from the UTF-8 text in S.  Returns SCAN_DONE. */
+static enum scan_result
+scan_unexpected_char (const struct substring *s, struct token *token)
+{
+  ucs4_t first;
+
+  u8_mbtouc (&first, CHAR_CAST (const uint8_t *, s->string), s->length);
+
+  token->type = SCAN_UNEXPECTED_CHAR;
+  token->number = first;
+
+  return SCAN_DONE;
+}
+
+/* Returns a name for TYPE.  For the SCAN_* types listed in SCAN_TYPES, this
+   is the type's name without the SCAN_ prefix, e.g. "SKIP" for SCAN_SKIP.
+   Any other value is passed along to token_type_to_name(). */
+const char *
+scan_type_to_string (enum scan_type type)
+{
+  switch (type)
+    {
+      /* Expands to one "case" per entry in SCAN_TYPES. */
+#define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
+      SCAN_TYPES
+#undef SCAN_TYPE
+
+    default:
+      return token_type_to_name (type);
+    }
+}
+
+/* Returns true if TYPE lies strictly between the SCAN_FIRST and SCAN_LAST
+   sentinels, that is, if it is one of the SCAN_* types listed in SCAN_TYPES,
+   false otherwise. */
+bool
+is_scan_type (enum scan_type type)
+{
+  if (type <= SCAN_FIRST)
+    return false;
+  return type < SCAN_LAST;
+}
+
+/* Handles the segment with type TYPE and text S when SCANNER is in its
+   initial state, S_START.
+
+   Most segment types map directly to a complete token: TOKEN is filled in
+   and SCAN_DONE is returned.  A string segment or a lone "-" may instead
+   yield SCAN_SAVE, because segments that follow can extend the token (string
+   concatenation with "+", or a negative number).
+
+   The switch has no "default" case; every segment type is listed
+   explicitly. */
+static enum scan_result
+scan_start__ (struct scanner *scanner, enum segment_type type,
+              struct substring s, struct token *token)
+{
+  switch (type)
+    {
+    case SEG_NUMBER:
+      token->type = T_POS_NUM;
+      token->number = scan_number__ (s);
+      return SCAN_DONE;
+
+    case SEG_QUOTED_STRING:
+    case SEG_HEX_STRING:
+    case SEG_UNICODE_STRING:
+      return scan_string_segment__ (scanner, type, s, token);
+
+    /* These segments become string tokens whose value is the segment text,
+       copied verbatim. */
+    case SEG_UNQUOTED_STRING:
+    case SEG_DO_REPEAT_COMMAND:
+    case SEG_INLINE_DATA:
+    case SEG_DOCUMENT:
+      token->type = T_STRING;
+      ss_alloc_substring (&token->string, s);
+      return SCAN_DONE;
+
+    case SEG_RESERVED_WORD:
+      token->type = scan_reserved_word__ (s);
+      return SCAN_DONE;
+
+    case SEG_IDENTIFIER:
+      token->type = T_ID;
+      ss_alloc_substring (&token->string, s);
+      return SCAN_DONE;
+
+    case SEG_PUNCT:
+      if (s.length == 1 && s.string[0] == '-')
+        {
+          /* Wait to see whether a number follows; see scan_dash__(). */
+          scanner->state = S_DASH;
+          return SCAN_SAVE;
+        }
+      else
+        {
+          token->type = scan_punct__ (s);
+          return SCAN_DONE;
+        }
+
+    /* These segments produce no parser token. */
+    case SEG_SHBANG:
+    case SEG_SPACES:
+    case SEG_COMMENT:
+    case SEG_NEWLINE:
+    case SEG_COMMENT_COMMAND:
+      token->type = SCAN_SKIP;
+      return SCAN_DONE;
+
+    /* Yields an identifier token with the fixed text "DOCUMENT", not the
+       segment's own text. */
+    case SEG_START_DOCUMENT:
+      token->type = T_ID;
+      ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
+      return SCAN_DONE;
+
+    case SEG_START_COMMAND:
+    case SEG_SEPARATE_COMMANDS:
+    case SEG_END_COMMAND:
+      token->type = T_ENDCMD;
+      return SCAN_DONE;
+
+    case SEG_END:
+      token->type = T_STOP;
+      return SCAN_DONE;
+
+    /* The remaining segment types report errors as SCAN_* token types. */
+    case SEG_EXPECTED_QUOTE:
+      token->type = SCAN_EXPECTED_QUOTE;
+      return SCAN_DONE;
+
+    case SEG_EXPECTED_EXPONENT:
+      token->type = SCAN_EXPECTED_EXPONENT;
+      ss_alloc_substring (&token->string, s);
+      return SCAN_DONE;
+
+    case SEG_UNEXPECTED_DOT:
+      token->type = SCAN_UNEXPECTED_DOT;
+      return SCAN_DONE;
+
+    case SEG_UNEXPECTED_CHAR:
+      return scan_unexpected_char (&s, token);
+
+    case SEG_N_TYPES:
+      NOT_REACHED ();
+    }
+
+  NOT_REACHED ();
+}
+
+/* Handles the segment with type TYPE and text S in state S_DASH, that is,
+   after a lone "-".  Spaces and comments are passed over.  A number turns
+   the pair into a T_NEG_NUM token.  Anything else makes the "-" a T_DASH
+   token by itself and returns SCAN_BACK. */
+static enum scan_result
+scan_dash__ (enum segment_type type, struct substring s, struct token *token)
+{
+  if (type == SEG_SPACES || type == SEG_COMMENT)
+    return SCAN_MORE;
+
+  if (type == SEG_NUMBER)
+    {
+      token->type = T_NEG_NUM;
+      token->number = -scan_number__ (s);
+      return SCAN_DONE;
+    }
+
+  token->type = T_DASH;
+  return SCAN_BACK;
+}
+
+/* Initializes SCANNER for scanning a token from a sequence of segments.
+   Initializes TOKEN as the output token.  (The client retains ownership of
+   TOKEN, but it must be preserved across subsequent calls to scanner_push()
+   for SCANNER.)
+
+   A scanner only produces a single token.  To obtain the next token,
+   re-initialize it by calling this function again.
+
+   A scanner does not contain any external references, so nothing needs to be
+   done to destroy one.  For the same reason, scanners may be copied with plain
+   struct assignment (or memcpy). */
+void
+scanner_init (struct scanner *scanner, struct token *token)
+{
+  scanner->state = S_START;
+
+  /* The substate is only consulted in state S_STRING, which sets it on
+     entry, but initialize it anyway so that a freshly initialized scanner
+     has no indeterminate member when it is copied or compared. */
+  scanner->substate = 0;
+
+  token_init (token);
+}
+
+/* Adds the segment with type TYPE and UTF-8 text S to SCANNER.  TOKEN must be
+   the same token passed to scanner_init() for SCANNER, or a copy of it.
+   scanner_push() may modify TOKEN.  The client retains ownership of TOKEN.
+
+   The possible return values are:
+
+     - SCAN_DONE: All of the segments that have been passed to scanner_push()
+       form the token now stored in TOKEN.  SCANNER is now "used up" and must
+       be reinitialized with scanner_init() if it is to be used again.
+
+       Most tokens only consist of a single segment, so this is the most common
+       return value.
+
+     - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
+       token.  The caller should call scanner_push() again with the next
+       segment.  (This won't happen if TYPE is SEG_END indicating the end of
+       input.)
+
+     - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
+       needs to "save its place" in the stream of segments for a possible
+       future SCAN_BACK return.  This value can be returned more than once in a
+       sequence of scanner_push() calls for SCANNER, but the caller only needs
+       to keep track of the most recent position.
+
+     - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
+       the segments up to and including the segment for which SCAN_SAVE was
+       most recently returned.  Segments following that one should be passed to
+       the next scanner to be initialized.
+*/
+enum scan_result
+scanner_push (struct scanner *scanner, enum segment_type type,
+              struct substring s, struct token *token)
+{
+  /* Dispatch on the state that the previous segments left SCANNER in. */
+  switch (scanner->state)
+    {
+    case S_START:
+      return scan_start__ (scanner, type, s, token);
+
+    case S_DASH:
+      return scan_dash__ (type, s, token);
+
+    case S_STRING:
+      return scan_string__ (scanner, type, s, token);
+    }
+
+  NOT_REACHED ();
+}
diff --git a/src/language/lexer/scan.h b/src/language/lexer/scan.h

new file mode 100644 (file)

index 0000000..fdb5080
--- /dev/null
+++ b/src/language/lexer/scan.h
@@ -0,0 +1,93 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef SCAN_H
+#define SCAN_H 1
+
+#include "language/lexer/segment.h"
+#include "libpspp/str.h"
+
+struct token;
+
+/* PSPP syntax scanning.
+
+   PSPP divides traditional "lexical analysis" or "tokenization" into two
+   phases: a lower-level phase called "segmentation" and a higher-level phase
+   called "scanning".  segment.h provides declarations for the segmentation
+   phase.  This header file contains declarations for the scanning phase.
+
+   Scanning accepts as input a stream of segments, which are UTF-8 strings each
+   labeled with a segment type.  It outputs a stream of "scan tokens", which
+   are the same as the tokens used by the PSPP parser with a few additional
+   types.
+*/
+
+/* List of the scan token types that are not also parser token types.  To use
+   it, define SCAN_TYPE(NAME) to expand to whatever is wanted for each entry,
+   expand SCAN_TYPES, and then undefine SCAN_TYPE again. */
+#define SCAN_TYPES                              \
+    SCAN_TYPE(BAD_HEX_LENGTH)                   \
+    SCAN_TYPE(BAD_HEX_DIGIT)                    \
+                                                \
+    SCAN_TYPE(BAD_UNICODE_LENGTH)               \
+    SCAN_TYPE(BAD_UNICODE_DIGIT)                \
+    SCAN_TYPE(BAD_UNICODE_CODE_POINT)           \
+                                                \
+    SCAN_TYPE(EXPECTED_QUOTE)                   \
+    SCAN_TYPE(EXPECTED_EXPONENT)                \
+    SCAN_TYPE(UNEXPECTED_DOT)                   \
+    SCAN_TYPE(UNEXPECTED_CHAR)                  \
+                                                \
+    SCAN_TYPE(SKIP)
+
+/* Types of scan tokens.
+
+   Scan token types are a superset of enum token_type.  Only the additional
+   scan token types are defined here, so see the definition of enum token_type
+   for the others.
+
+   SCAN_FIRST and SCAN_LAST are not scan token types themselves: they bracket
+   the real ones, so that the first SCAN_* type has value 256.  (Presumably
+   this keeps them clear of the enum token_type values; confirm against that
+   enum's definition.) */
+enum scan_type
+  {
+#define SCAN_TYPE(TYPE) SCAN_##TYPE,
+    SCAN_FIRST = 255,           /* Sentinel below the first scan type. */
+    SCAN_TYPES
+    SCAN_LAST                   /* Sentinel above the last scan type. */
+#undef SCAN_TYPE
+  };
+
+const char *scan_type_to_string (enum scan_type);
+bool is_scan_type (enum scan_type);
+
+/* A scanner.  Opaque.
+
+   Clients should use only scanner_init() and scanner_push(); the members are
+   interpreted by scan.c. */
+struct scanner
+  {
+    unsigned char state;        /* One of the S_* states in scan.c. */
+    unsigned char substate;     /* SS_* bits in scan.c, for string state. */
+  };
+
+/* scanner_push() return type.
+
+   SCAN_DONE and SCAN_BACK both mean that a token is complete.  SCAN_MORE and
+   SCAN_SAVE both mean that more segments are needed.  See the comment on
+   scanner_push() for details. */
+enum scan_result
+  {
+    SCAN_DONE,                  /* Token successfully scanned. */
+    SCAN_MORE,                  /* More segments needed to scan token. */
+
+    SCAN_BACK,                  /* Done, but go back to saved position too. */
+    SCAN_SAVE                   /* Need more segments, and save position. */
+  };
+
+void scanner_init (struct scanner *, struct token *);
+enum scan_result scanner_push (struct scanner *, enum segment_type,
+                               struct substring, struct token *);
+
+#endif /* scan.h */
diff --git a/src/language/lexer/token.c b/src/language/lexer/token.c

new file mode 100644 (file)

index 0000000..89a5cf0
--- /dev/null
+++ b/src/language/lexer/token.c
@@ -0,0 +1,173 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "language/lexer/token.h"
+
+#include <math.h>
+#include <unictype.h>
+#include <unistr.h>
+
+#include "data/identifier.h"
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+
+#include "gl/ftoastr.h"
+#include "gl/xalloc.h"
+
+/* Puts TOKEN into its initial state: type 0, number 0, and an empty
+   string. */
+void
+token_init (struct token *token)
+{
+  token->string = ss_empty ();
+  token->number = 0.0;
+  token->type = 0;
+}
+
+/* Frees the string that TOKEN contains.  Does nothing if TOKEN is a null
+   pointer. */
+void
+token_destroy (struct token *token)
+{
+  if (token == NULL)
+    return;
+
+  ss_dealloc (&token->string);
+}
+
+/* Returns a newly allocated string that represents the number in TOKEN.  The
+   magnitude is formatted with dtoastr(), and a "-" is prepended unless
+   TOKEN's type is T_POS_NUM. */
+static char *
+number_token_to_string (const struct token *token)
+{
+  char magnitude[DBL_BUFSIZE_BOUND];
+
+  dtoastr (magnitude, sizeof magnitude, 0, 0, fabs (token->number));
+
+  if (token->type != T_POS_NUM)
+    return xasprintf ("-%s", magnitude);
+  return xstrdup (magnitude);
+}
+
+/* Returns a newly allocated string that contains SS enclosed in single
+   quotes, with each single quote inside SS doubled.  N_QUOTES must be the
+   number of single quotes in SS; it is used to size the result. */
+static char *
+quoted_string_representation (struct substring ss, size_t n_quotes)
+{
+  /* Opening quote, contents, extra quotes, closing quote, terminator. */
+  char *rep = xmalloc (1 + ss.length + n_quotes + 1 + 1);
+  char *out = rep;
+  size_t ofs;
+
+  *out++ = '\'';
+  for (ofs = 0; ofs < ss.length; ofs++)
+    {
+      uint8_t c = ss.string[ofs];
+
+      *out++ = c;
+      if (c == '\'')
+        *out++ = c;
+    }
+  *out++ = '\'';
+  *out = '\0';
+
+  return rep;
+}
+
+/* Returns a newly allocated string of the form X'...', in which each byte of
+   SS is written as two lowercase hexadecimal digits. */
+static char *
+hex_string_representation (struct substring ss)
+{
+  static const char hex_digits[] = "0123456789abcdef";
+
+  /* "X'", two digits per byte, closing quote, terminator. */
+  char *rep = xmalloc (2 + 2 * ss.length + 1 + 1);
+  char *out = rep;
+  size_t ofs;
+
+  *out++ = 'X';
+  *out++ = '\'';
+  for (ofs = 0; ofs < ss.length; ofs++)
+    {
+      uint8_t byte = ss.string[ofs];
+
+      *out++ = hex_digits[byte / 16];
+      *out++ = hex_digits[byte % 16];
+    }
+  *out++ = '\'';
+  *out = '\0';
+
+  return rep;
+}
+
+/* Returns a newly allocated syntax representation of string SS.  If SS
+   decodes as UTF-8 and every character in it is printable, the result is a
+   single-quoted string; otherwise it is a hex string. */
+static char *
+string_representation (struct substring ss)
+{
+  size_t n_quotes = 0;
+  size_t ofs = 0;
+
+  while (ofs < ss.length)
+    {
+      ucs4_t uc;
+      int mblen = u8_mbtoucr (&uc,
+                              CHAR_CAST (const uint8_t *, ss.string + ofs),
+                              ss.length - ofs);
+
+      /* A negative length means that decoding failed. */
+      if (mblen < 0 || !uc_is_print (uc))
+        return hex_string_representation (ss);
+
+      if (uc == '\'')
+        n_quotes++;
+      ofs += mblen;
+    }
+
+  return quoted_string_representation (ss, n_quotes);
+}
+
+/* Returns a UTF-8 string that would yield TOKEN if it appeared in a syntax
+   file.  The caller should free the returned string, with free(), when it is
+   no longer needed.
+
+   Numbers, identifiers, and strings are rendered from TOKEN's value.  Any
+   other type is rendered as whatever token_type_to_name() returns for it,
+   or as NULL if that function returns NULL.
+
+   The T_STOP token has no representation, so this function returns NULL. */
+char *
+token_to_string (const struct token *token)
+{
+  int type = token->type;
+
+  if (type == T_POS_NUM || type == T_NEG_NUM)
+    return number_token_to_string (token);
+  else if (type == T_ID)
+    return ss_xstrdup (token->string);
+  else if (type == T_STRING)
+    return string_representation (token->string);
+  else
+    {
+      const char *name = token_type_to_name (type);
+
+      if (name == NULL)
+        return NULL;
+      return xstrdup (name);
+    }
+}
+
+/* Prints TOKEN on STREAM, for debugging, as a single line: the name of
+   TOKEN's type, then a tab and the number if TOKEN is a number token or its
+   number is nonzero, then a tab and the double-quoted string if TOKEN is an
+   identifier or string token or its string is nonempty.
+
+   If token_type_to_name() has no name for TOKEN's type, "(unknown)" is
+   printed in place of the name. */
+void
+token_print (const struct token *token, FILE *stream)
+{
+  /* token_to_string() treats a null return from token_type_to_name() as
+     possible, so do not pass the result straight to fputs(), for which a
+     null pointer is undefined behavior. */
+  const char *name = token_type_to_name (token->type);
+
+  fputs (name != NULL ? name : "(unknown)", stream);
+  if (token->type == T_POS_NUM || token->type == T_NEG_NUM
+      || token->number != 0.0)
+    {
+      char s[DBL_BUFSIZE_BOUND];
+
+      dtoastr (s, sizeof s, 0, 0, token->number);
+      fprintf (stream, "\t%s", s);
+    }
+  if (token->type == T_ID || token->type == T_STRING || token->string.length)
+    fprintf (stream, "\t\"%.*s\"",
+             (int) token->string.length, token->string.string);
+  putc ('\n', stream);
+}
diff --git a/src/language/lexer/token.h b/src/language/lexer/token.h

new file mode 100644 (file)

index 0000000..8feaf81
--- /dev/null
+++ b/src/language/lexer/token.h
@@ -0,0 +1,45 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef TOKEN_H
+#define TOKEN_H 1
+
+#include <stdio.h>
+#include "libpspp/str.h"
+#include "data/identifier.h"
+
+/* A PSPP syntax token.
+
+   The 'type' member is used by the scanner (see scan.h) for SCAN_* values as
+   well, which is why it is not declared as type "enum token_type".
+
+   Initialize a token with token_init() and release its string with
+   token_destroy(). */
+struct token
+  {
+    int type;                   /* Usually a "enum token_type" value. */
+    double number;              /* Value of a number token; 0 if unused. */
+    struct substring string;    /* Text of an ID or string; else empty. */
+  };
+
+/* Static initializer for a struct token with the given TYPE, NUMBER, and
+   STRING.  STRING is handed to SS_LITERAL_INITIALIZER, so presumably it must
+   be a string literal; confirm against libpspp/str.h. */
+#define TOKEN_INITIALIZER(TYPE, NUMBER, STRING) \
+        { TYPE, NUMBER, SS_LITERAL_INITIALIZER (STRING) }
+
+void token_init (struct token *);
+void token_destroy (struct token *);
+
+char *token_to_string (const struct token *);
+
+void token_print (const struct token *, FILE *);
+
+#endif /* token.h */
diff --git a/tests/automake.mk b/tests/automake.mk

index 4d49e5badbc4812177ad96f7af67e0a5d66adf8d..484ef0e11c67647e08b5bab59edf4dccdad667b1 100644 (file)
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -5,6 +5,7 @@ check_PROGRAMS += \
         tests/data/sack \
         tests/data/inexactify \
         tests/language/lexer/command-name-test \
         tests/data/sack \
         tests/data/inexactify \
         tests/language/lexer/command-name-test \
+       tests/language/lexer/scan-test \
         tests/language/lexer/segment-test \
         tests/libpspp/abt-test \
         tests/libpspp/bt-test \
         tests/language/lexer/segment-test \
         tests/libpspp/abt-test \
         tests/libpspp/bt-test \
@@ -211,6 +212,20 @@ tests_language_lexer_command_name_test_LDADD = \
         $(LIBINTL) 
  tests_language_lexer_command_name_test_CFLAGS = $(AM_CFLAGS)
  
         $(LIBINTL) 
  tests_language_lexer_command_name_test_CFLAGS = $(AM_CFLAGS)
  
+check_PROGRAMS += tests/language/lexer/scan-test
+tests_language_lexer_scan_test_SOURCES = \
+       src/data/identifier.c \
+       src/language/lexer/command-name.c \
+       src/language/lexer/scan.c \
+       src/language/lexer/segment.c \
+       src/language/lexer/token.c \
+       src/libpspp/pool.c \
+       src/libpspp/prompt.c \
+       src/libpspp/str.c \
+       src/libpspp/temp-file.c \
+       tests/language/lexer/scan-test.c
+tests_language_lexer_scan_test_LDADD = gl/libgl.la $(LIBINTL)
+tests_language_lexer_scan_test_CFLAGS = $(AM_CFLAGS)
  
  check_PROGRAMS += tests/language/lexer/segment-test
  tests_language_lexer_segment_test_SOURCES = \
  
  check_PROGRAMS += tests/language/lexer/segment-test
  tests_language_lexer_segment_test_SOURCES = \
@@ -306,6 +321,7 @@ TESTSUITE_AT = \
         tests/language/lexer/command-name.at \
         tests/language/lexer/lexer.at \
         tests/language/lexer/q2c.at \
         tests/language/lexer/command-name.at \
         tests/language/lexer/lexer.at \
         tests/language/lexer/q2c.at \
+       tests/language/lexer/scan.at \
         tests/language/lexer/segment.at \
         tests/language/lexer/variable-parser.at \
         tests/language/stats/aggregate.at \
         tests/language/lexer/segment.at \
         tests/language/lexer/variable-parser.at \
         tests/language/stats/aggregate.at \
diff --git a/tests/language/lexer/scan-test.c b/tests/language/lexer/scan-test.c

new file mode 100644 (file)

index 0000000..a56dfd7
--- /dev/null
+++ b/tests/language/lexer/scan-test.c
@@ -0,0 +1,217 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/compiler.h"
+#include "libpspp/misc.h"
+#include "language/lexer/scan.h"
+#include "language/lexer/token.h"
+
+#include "gl/error.h"
+#include "gl/ftoastr.h"
+#include "gl/progname.h"
+#include "gl/read-file.h"
+#include "gl/xalloc.h"
+
+/* -a/--auto (default), -b/--batch, -i/--interactive: syntax mode. */
+static enum segmenter_mode mode = SEG_MODE_AUTO;
+
+static const char *parse_options (int argc, char **argv);
+static void usage (void) NO_RETURN;
+
+/* Test driver: segments the input named on the command line (or stdin for
+   "-"), feeds the segments to the scanner, and prints one line per scanner
+   result: the token type, then the token's number if it is nonzero, then the
+   token's string in double quotes if it has one. */
+int
+main (int argc, char *argv[])
+{
+  struct segment
+    {
+      enum segment_type type;
+      struct substring string;
+    };
+
+  size_t offset;
+  const char *file_name;
+  char *input;
+  struct segmenter s;
+  struct segment *segs;
+  size_t n_segs, allocated_segs;
+  size_t length;
+  size_t i;
+  int n;
+
+  set_program_name (argv[0]);
+  file_name = parse_options (argc, argv);
+
+  /* Read FILE_NAME ("-" means stdin) into 'input'.  Ensure that 'input' ends
+     in a new-line followed by a null byte. */
+  input = (!strcmp (file_name, "-")
+           ? fread_file (stdin, &length)
+           : read_file (file_name, &length));
+  if (input == NULL)
+    error (EXIT_FAILURE, errno, "reading %s failed", file_name);
+  input = xrealloc (input, length + 3);
+  if (length == 0 || input[length - 1] != '\n')
+    input[length++] = '\n';
+  input[length++] = '\0';
+
+  segs = NULL;
+  n_segs = allocated_segs = 0;
+
+  /* Break the whole input into segments, recording each segment's type and
+     its extent within 'input'. */
+  segmenter_init (&s, mode);
+  for (offset = 0; offset < length; offset += n)
+    {
+      enum segment_type type;
+
+      n = segmenter_push (&s, input + offset, length - offset, &type);
+      assert (n >= 0);
+      assert (offset + n <= length);
+
+      if (n_segs >= allocated_segs)
+        segs = x2nrealloc (segs, &allocated_segs, sizeof *segs);
+
+      segs[n_segs].type = type;
+      segs[n_segs].string.string = input + offset;
+      segs[n_segs].string.length = n;
+      n_segs++;
+    }
+
+  /* Feed the segments to the scanner, one scanner per result. */
+  for (i = 0; i < n_segs; )
+    {
+      enum scan_result result;
+      struct scanner scanner;
+      struct token token;
+      int saved = -1;
+
+      /* Push segments until the scanner stops asking for more.  On SCAN_SAVE,
+         remember the index of the segment after the one just pushed. */
+      scanner_init (&scanner, &token);
+      do
+        {
+          struct segment *seg;
+
+          assert (i < n_segs);
+
+          seg = &segs[i++];
+          result = scanner_push (&scanner, seg->type, seg->string, &token);
+          if (result == SCAN_SAVE)
+            saved = i;
+        }
+      while (result == SCAN_MORE || result == SCAN_SAVE);
+
+      /* SCAN_BACK: resume from the position recorded at the last SCAN_SAVE. */
+      if (result == SCAN_BACK)
+        {
+          assert (saved >= 0);
+          i = saved;
+        }
+
+      printf ("%s", scan_type_to_string (token.type));
+      if (token.number != 0.0)
+        {
+          /* Named 'buf' so that it does not shadow the segmenter 's'. */
+          char buf[DBL_BUFSIZE_BOUND];
+
+          dtoastr (buf, sizeof buf, 0, 0, token.number);
+          printf (" %s", buf);
+        }
+      if (token.string.string != NULL || token.string.length > 0)
+        printf (" \"%.*s\"", (int) token.string.length, token.string.string);
+      printf ("\n");
+
+      token_destroy (&token);
+    }
+
+  /* The segment strings only point into 'input'; the array itself is the
+     only separate allocation to release. */
+  free (segs);
+  free (input);
+
+  return 0;
+}
+
+/* Parses the command line in ARGV (ARGC elements), setting 'mode' for
+   -a/-b/-i and printing the help message for -h.  Exits with an error unless
+   exactly one non-option argument remains; returns that argument. */
+static const char *
+parse_options (int argc, char **argv)
+{
+  for (;;)
+    {
+      static const struct option options[] =
+        {
+          {"auto", no_argument, NULL, 'a'},
+          {"batch", no_argument, NULL, 'b'},
+          {"interactive", no_argument, NULL, 'i'},
+          {"help", no_argument, NULL, 'h'},
+          {NULL, 0, NULL, 0},
+        };
+
+      int c = getopt_long (argc, argv, "abih", options, NULL);
+      if (c == -1)
+        break;
+
+      switch (c)
+        {
+        case 'a':
+          mode = SEG_MODE_AUTO;
+          break;
+
+        case 'b':
+          mode = SEG_MODE_BATCH;
+          break;
+
+        case 'i':
+          mode = SEG_MODE_INTERACTIVE;
+          break;
+
+        case 'h':
+          /* usage() is declared NO_RETURN, so this cannot fall through. */
+          usage ();
+
+        case 0:
+          break;
+
+        case '?':
+          exit (EXIT_FAILURE);
+          break;
+
+        default:
+          NOT_REACHED ();
+        }
+
+    }
+
+  /* Use EXIT_FAILURE, like the other failure exits in this file, rather than
+     a bare 1. */
+  if (optind + 1 != argc)
+    error (EXIT_FAILURE, 0, "exactly one non-option argument required; "
+           "use --help for help");
+  return argv[optind];
+}
+
+/* Prints a help message to stdout and exits successfully.  Only the options
+   that parse_options() accepts are listed, and "auto" is marked as the
+   default because 'mode' is initialized to SEG_MODE_AUTO. */
+static void
+usage (void)
+{
+  printf ("\
+%s, to test breaking PSPP syntax into tokens\n\
+usage: %s [OPTIONS] INPUT\n\
+\n\
+Options:\n\
+  -a, --auto          use \"auto\" syntax mode (default)\n\
+  -b, --batch         use \"batch\" syntax mode\n\
+  -i, --interactive   use \"interactive\" syntax mode\n\
+  -h, --help          print this help message\n",
+          program_name, program_name);
+  exit (EXIT_SUCCESS);
+}
diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at

new file mode 100644 (file)

index 0000000..50ee123
--- /dev/null
+++ b/tests/language/lexer/scan.at
@@ -0,0 +1,818 @@
+AT_BANNER([syntax scanning])
+m4_define([PSPP_CHECK_SCAN],
+  [AT_CHECK([scan-test $1 input], [0], [expout])])
+\f
+AT_SETUP([identifiers])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+a aB i5 $x @efg @@. #.# .x _z.
+abcd. abcd.
+QRSTUV./* end of line comment */
+QrStUv./* end of line comment */ @&t@
+WXYZ. /* unterminated end of line comment
+�. /* U+FFFD is not valid in an identifier
+])
+AT_DATA([expout], [dnl
+ID "a"
+SKIP
+ID "aB"
+SKIP
+ID "i5"
+SKIP
+ID "$x"
+SKIP
+ID "@efg"
+SKIP
+ID "@@."
+SKIP
+ID "#.#"
+SKIP
+UNEXPECTED_DOT
+ID "x"
+SKIP
+UNEXPECTED_CHAR 95
+ID "z"
+ENDCMD
+SKIP
+ID "abcd."
+SKIP
+ID "abcd"
+ENDCMD
+SKIP
+ID "QRSTUV"
+ENDCMD
+SKIP
+SKIP
+ID "QrStUv"
+ENDCMD
+SKIP
+SKIP
+SKIP
+ID "WXYZ"
+ENDCMD
+SKIP
+SKIP
+SKIP
+UNEXPECTED_CHAR 65533
+ENDCMD
+SKIP
+SKIP
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([reserved words])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+])
+AT_DATA([expout], [dnl
+AND
+SKIP
+OR
+SKIP
+NOT
+SKIP
+EQ
+SKIP
+GE
+SKIP
+GT
+SKIP
+LE
+SKIP
+LT
+SKIP
+NE
+SKIP
+ALL
+SKIP
+BY
+SKIP
+TO
+SKIP
+WITH
+SKIP
+AND
+SKIP
+OR
+SKIP
+NOT
+SKIP
+EQ
+SKIP
+GE
+SKIP
+GT
+SKIP
+LE
+SKIP
+LT
+SKIP
+NE
+SKIP
+ALL
+SKIP
+BY
+SKIP
+TO
+SKIP
+WITH
+SKIP
+ID "andx"
+SKIP
+ID "orx"
+SKIP
+ID "notx"
+SKIP
+ID "eqx"
+SKIP
+ID "gex"
+SKIP
+ID "gtx"
+SKIP
+ID "lex"
+SKIP
+ID "ltx"
+SKIP
+ID "nex"
+SKIP
+ID "allx"
+SKIP
+ID "byx"
+SKIP
+ID "tox"
+SKIP
+ID "withx"
+SKIP
+ID "and."
+SKIP
+WITH
+ENDCMD
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([punctuation])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
+~&|=>=><=<~=<>(),-+*/[[]]**
+])
+AT_DATA([expout], [dnl
+NOT
+SKIP
+AND
+SKIP
+OR
+SKIP
+EQUALS
+SKIP
+GE
+SKIP
+GT
+SKIP
+LE
+SKIP
+LT
+SKIP
+NE
+SKIP
+NE
+SKIP
+LPAREN
+SKIP
+RPAREN
+SKIP
+COMMA
+SKIP
+DASH
+SKIP
+PLUS
+SKIP
+ASTERISK
+SKIP
+SLASH
+SKIP
+LBRACK
+SKIP
+RBRACK
+SKIP
+EXP
+SKIP
+NOT
+AND
+OR
+EQUALS
+GE
+GT
+LE
+LT
+NE
+NE
+LPAREN
+RPAREN
+COMMA
+DASH
+PLUS
+ASTERISK
+SLASH
+LBRACK
+RBRACK
+EXP
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([numbers])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e-
+])
+AT_DATA([expout], [dnl
+POS_NUM
+SKIP
+POS_NUM 1
+SKIP
+POS_NUM 1
+SKIP
+POS_NUM 1
+SKIP
+POS_NUM 1
+ENDCMD
+SKIP
+POS_NUM 123
+ENDCMD
+SKIP
+SKIP
+SKIP
+SKIP
+SKIP
+ENDCMD
+POS_NUM 1
+SKIP
+POS_NUM 0.1
+SKIP
+POS_NUM 0.1
+SKIP
+POS_NUM 0.1
+SKIP
+POS_NUM 50
+SKIP
+POS_NUM 0.6
+SKIP
+POS_NUM 70
+SKIP
+POS_NUM 60
+SKIP
+POS_NUM 0.006
+SKIP
+ENDCMD
+POS_NUM 30
+SKIP
+POS_NUM 0.04
+SKIP
+POS_NUM 5
+SKIP
+POS_NUM 6
+SKIP
+POS_NUM 0.0007
+SKIP
+POS_NUM 12.3
+SKIP
+POS_NUM 4.56
+SKIP
+POS_NUM 789
+SKIP
+POS_NUM 999
+SKIP
+POS_NUM 0.0112
+SKIP
+ENDCMD
+SKIP
+EXPECTED_EXPONENT "1e"
+SKIP
+ID "e1"
+SKIP
+EXPECTED_EXPONENT "1e+"
+SKIP
+EXPECTED_EXPONENT "1e-"
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([strings])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' "" '''' """"
+'missing end quote
+"missing double quote
+'x' + "y"
++ 'z' +
+'a' /* abc */ + "b" /*
++ 'c' +/* */"d"/* */+'e'
+'foo'
++          /* special case: + in column 0 would ordinarily start a new command
+'bar'
+'foo'
+ +
+'bar'
+'foo'
++
+
+'bar'
+
++
+x"4142"+'5152'
+"4142"+
+x'5152'
+x"4142"
++u'304a'
+"�あいうえお"
+"abc"+U"FFFD"+u'3048'+"xyz"
+])
+AT_DATA([expout], [dnl
+STRING "x"
+SKIP
+STRING "y"
+SKIP
+STRING "abc"
+SKIP
+STRING "Don't"
+SKIP
+STRING "Can't"
+SKIP
+STRING "Won't"
+SKIP
+STRING ""quoted""
+SKIP
+STRING ""quoted""
+SKIP
+STRING ""
+SKIP
+STRING ""
+SKIP
+STRING "'"
+SKIP
+STRING """
+SKIP
+EXPECTED_QUOTE
+SKIP
+EXPECTED_QUOTE
+SKIP
+STRING "xyzabcde"
+SKIP
+STRING "foobar"
+SKIP
+STRING "foobar"
+SKIP
+STRING "foo"
+SKIP
+PLUS
+SKIP
+ENDCMD
+SKIP
+STRING "bar"
+SKIP
+ENDCMD
+SKIP
+PLUS
+SKIP
+STRING "AB5152"
+SKIP
+STRING "4142QR"
+SKIP
+STRING "ABお"
+SKIP
+STRING "�あいうえお"
+SKIP
+STRING "abc�えxyz"
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([@%:@! construct])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+#! /usr/bin/pspp
+#! /usr/bin/pspp
+])
+AT_DATA([expout], [dnl
+SKIP
+SKIP
+ID "#"
+UNEXPECTED_CHAR 33
+SKIP
+SLASH
+ID "usr"
+SLASH
+ID "bin"
+SLASH
+ID "pspp"
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([* and COMMENT commands])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+   * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+])
+AT_DATA([expout], [dnl
+SKIP
+SKIP
+SKIP
+ENDCMD
+SKIP
+ENDCMD
+SKIP
+SKIP
+ENDCMD
+SKIP
+SKIP
+ENDCMD
+SKIP
+ENDCMD
+SKIP
+SKIP
+ENDCMD
+SKIP
+SKIP
+ENDCMD
+SKIP
+ID "com"
+SKIP
+ID "is"
+SKIP
+ID "ambiguous"
+SKIP
+WITH
+SKIP
+ID "COMPUTE"
+ENDCMD
+SKIP
+ENDCMD
+SKIP
+SKIP
+SKIP
+ENDCMD
+SKIP
+ENDCMD
+SKIP
+SKIP
+SKIP
+ENDCMD
+SKIP
+ID "next"
+SKIP
+ID "command"
+ENDCMD
+SKIP
+ENDCMD
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([DOCUMENT command])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+DOCUMENT one line.
+DOC more
+    than
+        one
+            line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+])
+AT_DATA([expout], [dnl
+ID "DOCUMENT"
+STRING "DOCUMENT one line."
+ENDCMD
+ENDCMD
+SKIP
+ID "DOCUMENT"
+STRING "DOC more"
+SKIP
+STRING "    than"
+SKIP
+STRING "        one"
+SKIP
+STRING "            line."
+ENDCMD
+ENDCMD
+SKIP
+ID "DOCUMENT"
+STRING "docu"
+SKIP
+STRING "first.paragraph"
+SKIP
+STRING "isn't parsed as tokens"
+SKIP
+STRING ""
+SKIP
+STRING "second paragraph."
+ENDCMD
+ENDCMD
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([TITLE, SUBTITLE, FILE LABEL commands])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+title/**/'Quoted string title'.
+tit /*
+"Quoted string on second line".
+sub "Quoted string subtitle"
+ .
+
+TITL /* Not a */ quoted string title.
+SUBT Not a quoted string /* subtitle
+
+FIL label isn't quoted.
+FILE
+  lab 'is quoted'.
+FILE /*
+/**/  lab not quoted here either
+
+])
+AT_DATA([expout], [dnl
+ID "title"
+SKIP
+STRING "Quoted string title"
+ENDCMD
+SKIP
+ID "tit"
+SKIP
+SKIP
+SKIP
+STRING "Quoted string on second line"
+ENDCMD
+SKIP
+ID "sub"
+SKIP
+STRING "Quoted string subtitle"
+SKIP
+SKIP
+ENDCMD
+SKIP
+ENDCMD
+SKIP
+ID "TITL"
+SKIP
+STRING "/* Not a */ quoted string title"
+ENDCMD
+SKIP
+ID "SUBT"
+SKIP
+STRING "Not a quoted string /* subtitle"
+SKIP
+ENDCMD
+SKIP
+ID "FIL"
+SKIP
+ID "label"
+SKIP
+STRING "isn't quoted"
+ENDCMD
+SKIP
+ID "FILE"
+SKIP
+SKIP
+ID "lab"
+SKIP
+STRING "is quoted"
+ENDCMD
+SKIP
+ID "FILE"
+SKIP
+SKIP
+SKIP
+SKIP
+SKIP
+ID "lab"
+SKIP
+STRING "not quoted here either"
+SKIP
+ENDCMD
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([BEGIN DATA command])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+begin data.
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end  data
+end data
+.
+])
+AT_DATA([expout], [dnl
+ID "begin"
+SKIP
+ID "data"
+ENDCMD
+SKIP
+STRING "123"
+SKIP
+STRING "xxx"
+SKIP
+ID "end"
+SKIP
+ID "data"
+ENDCMD
+SKIP
+ENDCMD
+SKIP
+ID "BEG"
+SKIP
+SKIP
+SKIP
+ID "DAT"
+SKIP
+SKIP
+SKIP
+STRING "5 6 7 /* x"
+SKIP
+STRING ""
+SKIP
+STRING "end  data"
+SKIP
+ID "end"
+SKIP
+ID "data"
+SKIP
+ENDCMD
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([DO REPEAT command])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+do repeat x=a b c
+          y=d e f.
+  do repeat a=1 thru 5.
+another command.
+second command
++ third command.
+end /* x */ /* y */ repeat print.
+end
+ repeat.
+])
+AT_DATA([expout], [dnl
+ID "do"
+SKIP
+ID "repeat"
+SKIP
+ID "x"
+EQUALS
+ID "a"
+SKIP
+ID "b"
+SKIP
+ID "c"
+SKIP
+SKIP
+ID "y"
+EQUALS
+ID "d"
+SKIP
+ID "e"
+SKIP
+ID "f"
+ENDCMD
+SKIP
+STRING "  do repeat a=1 thru 5."
+SKIP
+STRING "another command."
+SKIP
+STRING "second command"
+SKIP
+STRING "+ third command."
+SKIP
+STRING "end /* x */ /* y */ repeat print."
+SKIP
+ID "end"
+SKIP
+SKIP
+ID "repeat"
+ENDCMD
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-i])
+AT_CLEANUP
+\f
+AT_SETUP([batch mode])
+AT_KEYWORDS([scan])
+AT_DATA([input], [dnl
+first command
+     another line of first command
++  second command
+third command
+
+fourth command.
+   fifth command.
+])
+AT_DATA([expout], [dnl
+ID "first"
+SKIP
+ID "command"
+SKIP
+SKIP
+ID "another"
+SKIP
+ID "line"
+SKIP
+ID "of"
+SKIP
+ID "first"
+SKIP
+ID "command"
+SKIP
+ENDCMD
+SKIP
+ID "second"
+SKIP
+ID "command"
+SKIP
+ENDCMD
+ID "third"
+SKIP
+ID "command"
+SKIP
+ENDCMD
+SKIP
+ID "fourth"
+SKIP
+ID "command"
+ENDCMD
+SKIP
+SKIP
+ID "fifth"
+SKIP
+ID "command"
+ENDCMD
+SKIP
+STOP
+])
+PSPP_CHECK_SCAN([-b])
+AT_CLEANUP
author	Ben Pfaff <blp@cs.stanford.edu>
	Sat, 19 Mar 2011 23:32:16 +0000 (16:32 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 20 Mar 2011 16:43:44 +0000 (09:43 -0700)
Smake		patch \| blob \| history
src/language/lexer/automake.mk		patch \| blob \| history
src/language/lexer/scan.c	[new file with mode: 0644]	patch \| blob
src/language/lexer/scan.h	[new file with mode: 0644]	patch \| blob
src/language/lexer/token.c	[new file with mode: 0644]	patch \| blob
src/language/lexer/token.h	[new file with mode: 0644]	patch \| blob
tests/automake.mk		patch \| blob \| history
tests/language/lexer/scan-test.c	[new file with mode: 0644]	patch \| blob
tests/language/lexer/scan.at	[new file with mode: 0644]	patch \| blob