pintos-os.org Git - pspp/blob - src/language/lexer/scan.h

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #ifndef SCAN_H
  18 #define SCAN_H 1
  19
  20 #include "language/lexer/segment.h"
  21 #include "libpspp/str.h"
  22
  23 struct token;
  24
  25 /* PSPP syntax scanning.
  26
  27    PSPP divides traditional "lexical analysis" or "tokenization" into two
  28    phases: a lower-level phase called "segmentation" and a higher-level phase
  29    called "scanning".  segment.h provides declarations for the segmentation
  30    phase.  This header file contains declarations for the scanning phase.
  31
  32    Scanning accepts as input a stream of segments, which are UTF-8 strings each
  33    labeled with a segment type.  It outputs a stream of "scan tokens", which
  34    are the same as the tokens used by the PSPP parser with a few additional
  35    types.
  36 */
  37 /* X-macro list of the scan-only types: one SCAN_TYPE(NAME) per entry. */
  38 #define SCAN_TYPES                              \
  39     SCAN_TYPE(BAD_HEX_LENGTH)                   \
  40     SCAN_TYPE(BAD_HEX_DIGIT)                    \
  41                                                 \
  42     SCAN_TYPE(BAD_UNICODE_LENGTH)               \
  43     SCAN_TYPE(BAD_UNICODE_DIGIT)                \
  44     SCAN_TYPE(BAD_UNICODE_CODE_POINT)           \
  45                                                 \
  46     SCAN_TYPE(EXPECTED_QUOTE)                   \
  47     SCAN_TYPE(EXPECTED_EXPONENT)                \
  48     SCAN_TYPE(UNEXPECTED_DOT)                   \
  49     SCAN_TYPE(UNEXPECTED_CHAR)                  \
  50                                                 \
  51     SCAN_TYPE(SKIP)
  52
  53 /* Types of scan tokens.
  54
  55    Scan token types are a superset of enum token_type.  Only the additional
  56    scan token types are defined here (one SCAN_<NAME> enumerator per SCAN_TYPES
  57    entry, numbered from 256); see enum token_type for the others. */
  58 enum scan_type
  59   {
  60 #define SCAN_TYPE(TYPE) SCAN_##TYPE,
  61     SCAN_FIRST = 255,           /* Sentinel just below the first scan type. */
  62     SCAN_TYPES                  /* Expands to SCAN_BAD_HEX_LENGTH ... SCAN_SKIP. */
  63     SCAN_LAST                   /* Sentinel just above the last scan type. */
  64 #undef SCAN_TYPE
  65   };
  66 /* NOTE(review): presumably name lookup and range check for the SCAN_* types. */
  67 const char *scan_type_to_string (enum scan_type);
  68 bool is_scan_type (enum scan_type);
  69
  70 /* A scanner.  Opaque: clients should not access the members directly. */
  71 struct scanner
  72   {
  73     unsigned char state;        /* Internal to the scanner. */
  74     unsigned char substate;     /* Internal to the scanner. */
  75   };
  76
  77 /* scanner_push() return type. */
  78 enum scan_result
  79   {
  80     /* Results that do not involve a saved position. */
  81     SCAN_DONE,                  /* Token successfully scanned. */
  82     SCAN_MORE,                  /* More segments needed to scan token. */
  83
  84     /* Results that involve a saved position in the segment stream. */
  85     SCAN_BACK,                  /* Done, but go back to saved position too. */
  86     SCAN_SAVE                   /* Need more segments, and save position. */
  87   };
  88 /* scanner_push() takes one segment (type and text) and returns a scan_result. */
  89 void scanner_init (struct scanner *, struct token *);
  90 enum scan_result scanner_push (struct scanner *, enum segment_type,
  91                                struct substring, struct token *);
  92 \f
  93 /* A simplified lexer for handling syntax in a string. */
  94
  95 struct string_lexer
  96   {
  97     const char *input;          /* Syntax string being lexed. */
  98     size_t length;              /* NOTE(review): presumably length of 'input'. */
  99     size_t offset;              /* NOTE(review): presumably current position in 'input'. */
 100     struct segmenter segmenter; /* Segmentation-phase state (see segment.h). */
 101   };
 102 /* NOTE(review): string_lexer_next() presumably returns false at end of input. */
 103 void string_lexer_init (struct string_lexer *, const char *input,
 104                         enum segmenter_mode);
 105 bool string_lexer_next (struct string_lexer *, struct token *);
 105 bool string_lexer_next (struct string_lexer *, struct token *);
 106
 107 #endif /* scan.h */