1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
22 #include "libpspp/prompt.h"
24 /* PSPP syntax segmentation.
26 PSPP divides traditional "lexical analysis" or "tokenization" into two
27 phases: a lower-level phase called "segmentation" and a higher-level phase
28 called "scanning". This header file provides declarations for the
29 segmentation phase. scan.h contains declarations for the scanning phase.
31 Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
32 (a segment type) for each byte or contiguous sequence of bytes in the input.
33 It also, in a few corner cases, outputs zero-width segments that label the
34 boundary between a pair of bytes in the input.
36 Some segment types correspond directly to tokens; for example, an
37 "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID)
38 later in lexical analysis. Other segments contribute to tokens but do not
39 correspond directly; for example, multiple quoted string segments
40 (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators
41 (SEG_PUNCT) may be combined to form a single string token (T_STRING).
42 Still other segments are ignored (e.g. SEG_SPACES) or trigger special
43 behavior such as error messages later in tokenization
44 (e.g. SEG_EXPECTED_QUOTE).
49 This corresponds to the syntax mode for which a syntax file is intended.
50 This is the only configuration setting for a segmenter. */
53 /* Try to interpret input correctly regardless of whether it is written
54 for interactive or batch mode. */
57 /* Interactive or batch syntax mode. */
64 SEG_TYPE(QUOTED_STRING) \
65 SEG_TYPE(HEX_STRING) \
66 SEG_TYPE(UNICODE_STRING) \
67 SEG_TYPE(UNQUOTED_STRING) \
68 SEG_TYPE(RESERVED_WORD) \
69 SEG_TYPE(IDENTIFIER) \
77 SEG_TYPE(COMMENT_COMMAND) \
78 SEG_TYPE(DO_REPEAT_COMMAND) \
79 SEG_TYPE(INLINE_DATA) \
81 SEG_TYPE(START_DOCUMENT) \
84 SEG_TYPE(START_COMMAND) \
85 SEG_TYPE(SEPARATE_COMMANDS) \
86 SEG_TYPE(END_COMMAND) \
89 SEG_TYPE(EXPECTED_QUOTE) \
90 SEG_TYPE(EXPECTED_EXPONENT) \
91 SEG_TYPE(UNEXPECTED_DOT) \
92 SEG_TYPE(UNEXPECTED_CHAR)
94 /* Types of segments. */
97 #define SEG_TYPE(NAME) SEG_##NAME,
102 /* Number of segment types.

   SEG_TYPE is defined here so that every SEG_TYPE(NAME) use expands to
   "+ 1".  Using SEG_TYPES as the enumerator's initializer therefore adds 1
   for each SEG_TYPE(...) entry it contains.

   NOTE(review): the #define of SEG_TYPES is not visible in this view;
   presumably it is the backslash-continued SEG_TYPE(...) list above --
   confirm. */
103 #define SEG_TYPE(NAME) + 1
104 enum { SEG_N_TYPES = SEG_TYPES };
107 const char *segment_type_to_string (enum segment_type);
109 /* A segmenter. Opaque. */
113 unsigned char substate;
117 void segmenter_init (struct segmenter *, enum segmenter_mode);
119 enum segmenter_mode segmenter_get_mode (const struct segmenter *);
/* Pushes input into the segmenter.  Per the overview comment at the top of
   this file, segmentation takes a stream of UTF-8 bytes and labels each byte
   or contiguous run of bytes with a segment type.

   NOTE(review): the exact contract is not visible in this header.
   Presumably INPUT points to N bytes, EOF indicates that no further input
   follows, the segment type is stored through the final argument, and the
   int result describes how much of INPUT the segment covers -- confirm
   against the definition before relying on any of this. */
121 int segmenter_push (struct segmenter *, const char *input, size_t n, bool eof,
122 enum segment_type *);
124 enum prompt_style segmenter_get_prompt (const struct segmenter *);
126 #endif /* segment.h */