1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
31 #include "gl/memchr2.h"
/* Bit flags kept in the segmenter's substate field; the parsers below set
   and test them through s->substate. */
54 #define SS_START_OF_LINE (1u << 0)
55 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration; the function is defined further down in this
   file. */
57 static int segmenter_detect_command_name__ (const char *input,
/* Decodes the first character of the N bytes at INPUT_ into *PUC.  The
   bytes are first handed to u8_mbtoucr; a nonnegative result is returned
   as is, and the last visible alternative falls back to u8_mbtouc.
   NOTE(review): this listing omits the local declarations and whatever
   sits between the two visible arms of the conditional expression --
   confirm against the full source. */
61 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n)
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
68 mblen = u8_mbtoucr (puc, input, n);
69 return (mblen >= 0 ? mblen
71 : u8_mbtouc (puc, input, n));
/* Called from segmenter_push.  Visible logic: when the second input byte
   is an exclamation mark, scan from offset 2 for the new-line or null byte
   that ends the line; on the final path, switch to S_GENERAL at the start
   of a line and of a command and hand the same input back to
   segmenter_push.
   NOTE(review): the leading test and the return statements of the first
   branches are not visible in this listing. */
75 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
76 enum segment_type *type)
82 else if (input[1] == '!')
86 for (ofs = 2; ofs < n; ofs++)
87 if (input[ofs] == '\n' || input[ofs] == '\0')
/* Looks back one byte for the carriage return of a CR-LF pair. */
89 if (input[ofs] == '\n' && input[ofs - 1] == '\r')
93 s->substate = SS_START_OF_COMMAND;
/* Not handled above: continue as ordinary syntax. */
102 s->state = S_GENERAL;
103 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
104 return segmenter_push (s, input, n, type);
/* Handles a punctuator that may be one or two bytes long.  SECONDS lists
   the bytes that may follow the first one.  The visible return yields 2
   when the second input byte is non-null and occurs in SECONDS, and 1
   otherwise.  Asserts that S is in S_GENERAL.
   NOTE(review): any statements between the assertion and the return are
   not visible in this listing. */
108 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
109 const char *input, size_t n,
110 enum segment_type *type)
112 assert (s->state == S_GENERAL);
119 return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1;
/* Scans the N bytes of INPUT forward from OFS.  The visible tests stop at
   a new-line or null byte and look for an asterisk followed by a slash.
   NOTE(review): the values returned on each path are not visible in this
   listing. */
123 skip_comment (const char *input, size_t n, size_t ofs)
125 for (; ofs < n; ofs++)
127 if (input[ofs] == '\n' || input[ofs] == '\0')
129 else if (input[ofs] == '*')
133 else if (input[ofs + 1] == '/')
/* Starting at OFS, moves past white space other than new-line and past
   comments, using skip_comment for the latter.
   NOTE(review): the loop structure, the test that precedes the check of
   input[ofs + 1], and the return values are not visible in this
   listing. */
141 skip_spaces_and_comments (const char *input, size_t n, int ofs)
148 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
156 else if (input[ofs + 1] != '*')
/* The two bytes that open the comment are skipped before scanning. */
159 ofs = skip_comment (input, n, ofs + 2);
163 else if (lex_uc_is_space (uc) && uc != '\n')
/* Examines the byte at OFS for a line ending: a new-line or null byte, or
   a carriage return, in which case the visible return reports whether a
   new-line follows it.
   NOTE(review): the results of the other branches, and any bounds check
   made before reading input[ofs + 1], are not visible in this listing. */
173 is_end_of_line (const char *input, size_t n, int ofs)
175 if (input[ofs] == '\n' || input[ofs] == '\0')
177 else if (input[ofs] == '\r')
181 return input[ofs + 1] == '\n';
/* Skips spaces and comments starting at OFS, then returns what
   is_end_of_line reports at the resulting offset.
   NOTE(review): any check made between the two calls is not visible in
   this listing. */
188 at_end_of_line (const char *input, size_t n, int ofs)
190 ofs = skip_spaces_and_comments (input, n, ofs);
194 return is_end_of_line (input, n, ofs);
/* Handles a line ending at the very start of INPUT.  The first byte is
   either a new-line or, as the assertions require, a carriage return
   immediately followed by a new-line.
   NOTE(review): the segment type stored in *TYPE and the lengths returned
   are not visible in this listing. */
198 segmenter_parse_newline__ (const char *input, size_t n,
199 enum segment_type *type)
203 if (input[0] == '\n')
210 assert (input[0] == '\r');
211 assert (input[1] == '\n');
/* Starting at OFS, decodes characters one at a time and stops at the first
   one that is not white space according to lex_uc_is_space, or that is a
   new-line or a null.
   NOTE(review): the loop structure and return values are not visible in
   this listing. */
220 skip_spaces (const char *input, size_t n, size_t ofs)
227 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc) || uc == '\n' || uc == '\0')
/* Advances OFS through INPUT, up to N, testing each byte with c_isdigit;
   the loop body reacts to the first byte that is not a digit.
   NOTE(review): the return statements are not visible in this listing. */
241 skip_digits (const char *input, size_t n, int ofs)
243 for (; ofs < n; ofs++)
244 if (!c_isdigit (input[ofs]))
/* Scans a number at the start of INPUT.  Asserts that S is in S_GENERAL.
   The visible steps run in this order: integer digits, an optional period
   followed by more digits, and an optional exponent introduced by e or E
   with an optional sign.
   NOTE(review): the checks for running out of input, the final segment
   type, and the return statements are not visible in this listing. */
250 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
251 enum segment_type *type)
255 assert (s->state == S_GENERAL);
257 ofs = skip_digits (input, n, 0);
261 if (input[ofs] == '.')
263 ofs = skip_digits (input, n, ofs + 1);
270 if (input[ofs] == 'e' || input[ofs] == 'E')
276 if (input[ofs] == '+' || input[ofs] == '-')
/* An exponent marker must be followed by at least one digit. */
283 if (!c_isdigit (input[ofs]))
285 *type = SEG_EXPECTED_EXPONENT;
290 ofs = skip_digits (input, n, ofs);
/* A trailing period gets special treatment depending on whether the line
   ends after it. */
295 if (input[ofs - 1] == '.')
297 int eol = at_end_of_line (input, n, ofs);
/* Tests S against a fixed set of words.  Each byte is upper-cased with
   c_toupper before comparison, so the match ignores ASCII case.  The
   visible comparisons cover BY, EQ, GE, GT, LE, LT, NE, OR and TO, then
   ALL, AND and NOT, then WITH.
   NOTE(review): the dispatch on N that selects among the three groups is
   not visible in this listing. */
310 is_reserved_word (const char *s, int n)
314 s0 = c_toupper (s[0]);
/* Two-letter words. */
318 s1 = c_toupper (s[1]);
319 return ((s0 == 'B' && s1 == 'Y')
320 || (s0 == 'E' && s1 == 'Q')
321 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
322 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
323 || (s0 == 'N' && s1 == 'E')
324 || (s0 == 'O' && s1 == 'R')
325 || (s0 == 'T' && s1 == 'O'));
/* Three-letter words. */
328 s1 = c_toupper (s[1]);
329 s2 = c_toupper (s[2]);
330 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
331 || (s1 == 'N' && s2 == 'D')))
332 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
/* Four-letter word. */
335 s1 = c_toupper (s[1]);
336 s2 = c_toupper (s[2]);
337 s3 = c_toupper (s[3]);
338 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
/* Scans one line of a comment command.  Three outcomes are visible: a
   blank line yields SEG_SEPARATE_COMMANDS and returns to S_GENERAL at the
   start of a command; a period at the end of the line yields
   SEG_COMMENT_COMMAND and returns to S_GENERAL; otherwise the segment is
   SEG_COMMENT_COMMAND and the state becomes S_COMMENT_2.
   NOTE(review): how endcmd is maintained and what is returned are not
   visible in this listing. */
346 segmenter_parse_comment_1__ (struct segmenter *s,
347 const char *input, size_t n,
348 enum segment_type *type)
360 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
/* Looks back one byte for the carriage return of a CR-LF pair. */
371 if (ofs > 1 && input[ofs - 1] == '\r')
375 if (endcmd == -2 || uc == '\0')
377 /* Blank line ends comment command. */
378 s->state = S_GENERAL;
379 s->substate = SS_START_OF_COMMAND;
380 *type = SEG_SEPARATE_COMMANDS;
383 else if (endcmd >= 0)
385 /* '.' at end of line ends comment command. */
386 s->state = S_GENERAL;
388 *type = SEG_COMMENT_COMMAND;
393 /* Comment continues onto next line. */
394 *type = SEG_COMMENT_COMMAND;
395 s->state = S_COMMENT_2;
401 if (!lex_uc_is_space (uc))
/* Consumes the line ending that follows a line of a comment command, then
   inspects the start of the next line to choose the next state: S_GENERAL
   at the start of a line and command, or S_COMMENT_1.
   NOTE(review): the condition that chooses between the two states, the
   switch that contains the SEG_MODE_INTERACTIVE case, and the return
   statements are not visible in this listing. */
412 segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n,
413 enum segment_type *type)
420 ofs = segmenter_parse_newline__ (input, n, type);
421 if (ofs < 0 || ofs >= n)
/* Decodes the first character of the following line. */
424 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
428 if (uc == '+' || uc == '-' || uc == '.')
430 else if (!lex_uc_is_space (uc))
433 case SEG_MODE_INTERACTIVE:
442 new_cmd = segmenter_detect_command_name__ (input, n, ofs);
455 s->state = S_GENERAL;
456 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
459 s->state = S_COMMENT_1;
/* Scans one line of document text and reports it as SEG_DOCUMENT.  On the
   first visible path the next state is S_DOCUMENT_3 when end_cmd is set
   and S_DOCUMENT_2 otherwise; on the second it is always S_DOCUMENT_3.
   NOTE(review): how end_cmd is computed, what separates the two paths, and
   the return statements are not visible in this listing. */
464 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
465 enum segment_type *type)
477 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
/* Looks back one byte for the carriage return of a CR-LF pair. */
488 if (ofs > 1 && input[ofs - 1] == '\r')
491 *type = SEG_DOCUMENT;
492 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
496 *type = SEG_DOCUMENT;
497 s->state = S_DOCUMENT_3;
501 if (!lex_uc_is_space (uc))
/* Consumes the line ending after a line of document text by delegating to
   segmenter_parse_newline__, then sets the state back to S_DOCUMENT_1.
   NOTE(review): the handling of a negative result and the return statement
   are not visible in this listing. */
512 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
513 enum segment_type *type)
517 ofs = segmenter_parse_newline__ (input, n, type);
521 s->state = S_DOCUMENT_1;
/* Reports SEG_END_COMMAND and puts S back into S_GENERAL at the start of a
   command and of a line.  Takes no input buffer.
   NOTE(review): the return statement is not visible in this listing. */
526 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
528 *type = SEG_END_COMMAND;
529 s->state = S_GENERAL;
530 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
/* Skips spaces and comments starting at OFS, then reports whether C is
   something other than a single quote, a double quote, a new-line, or a
   null.
   NOTE(review): the assignment to C (presumably the byte at the new
   offset) and the handling of a failed skip are not visible in this
   listing. */
535 segmenter_unquoted (const char *input, size_t n, int ofs)
540 ofs = skip_spaces_and_comments (input, n, ofs);
545 return c != '\'' && c != '"' && c != '\n' && c != '\0';
/* Runs a scratch segmenter SUB, started in S_GENERAL, over INPUT beginning
   at OFS by calling segmenter_push on it.  On one path a segment of length
   RETVAL is copied into ID, but only when RETVAL is smaller than ID_SIZE.
   ID_SIZE must be positive.
   NOTE(review): the loop, the remaining initialization of SUB, the case
   labels that are missing from the switch, and every return value are not
   visible in this listing. */
549 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
550 int ofs, char id[], size_t id_size)
552 struct segmenter sub;
554 assert (id_size > 0);
557 sub.state = S_GENERAL;
561 enum segment_type type;
564 retval = segmenter_push (&sub, input + ofs, n - ofs, &type);
/* Copy only when the segment leaves room in ID. */
580 if (retval < id_size)
582 memcpy (id, input + ofs, retval);
/* Segment types grouped by how the switch treats them. */
589 case SEG_QUOTED_STRING:
591 case SEG_UNICODE_STRING:
592 case SEG_UNQUOTED_STRING:
593 case SEG_RESERVED_WORD:
595 case SEG_COMMENT_COMMAND:
596 case SEG_DO_REPEAT_COMMAND:
597 case SEG_INLINE_DATA:
598 case SEG_START_DOCUMENT:
600 case SEG_START_COMMAND:
601 case SEG_SEPARATE_COMMANDS:
602 case SEG_END_COMMAND:
604 case SEG_EXPECTED_QUOTE:
605 case SEG_EXPECTED_EXPONENT:
606 case SEG_UNEXPECTED_DOT:
607 case SEG_UNEXPECTED_CHAR:
/* Scans an identifier at the start of INPUT and classifies it as
   SEG_RESERVED_WORD or SEG_IDENTIFIER.  Asserts that S is in S_GENERAL.
   When the identifier is the first thing in a command, certain command
   names switch S into a dedicated state: COMMENT, DOCUMENT, TITLE and
   SUBTITLE, FILE LABEL, DO REPEAT, and BEGIN DATA.
   NOTE(review): the loop around the decoding step, the checks for running
   out of input, and the return statements are not visible in this
   listing. */
616 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
617 enum segment_type *type)
622 assert (s->state == S_GENERAL);
/* Decode the first character, then the ones that follow it. */
624 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
632 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
635 else if (!lex_uc_is_idn (uc))
/* An identifier that ends in a period needs a look at the rest of the
   line. */
641 if (input[ofs - 1] == '.')
643 int eol = at_end_of_line (input, n, ofs);
650 if (is_reserved_word (input, ofs))
651 *type = SEG_RESERVED_WORD;
653 *type = SEG_IDENTIFIER;
655 if (s->substate & SS_START_OF_COMMAND)
657 struct substring word = ss_buffer (input, ofs);
/* NOTE(review): the 4 presumably is the shortest accepted abbreviation of
   COMMENT -- confirm against lex_id_match_n. */
659 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
/* Re-scan the same input as a comment command. */
661 s->state = S_COMMENT_1;
662 return segmenter_parse_comment_1__ (s, input, n, type);
664 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
666 s->state = S_DOCUMENT_1;
667 *type = SEG_START_DOCUMENT;
670 else if (lex_id_match (ss_cstr ("TITLE"), word)
671 || lex_id_match (ss_cstr ("SUBTITLE"), word))
673 int result = segmenter_unquoted (input, n, ofs);
678 s->state = S_TITLE_1;
/* FILE matters only when the next identifier is LABEL. */
682 else if (lex_id_match (ss_cstr ("FILE"), word))
686 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
688 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
690 s->state = S_FILE_LABEL;
/* DO matters only when the next identifier is REPEAT. */
695 else if (lex_id_match (ss_cstr ("DO"), word))
699 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
701 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
703 s->state = S_DO_REPEAT_1;
/* BEGIN matters only when the next identifier is DATA; the code then
   skips an optional period and examines the end of the line. */
708 else if (lex_id_match (ss_cstr ("BEGIN"), word))
713 ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id);
716 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
720 ofs2 = skip_spaces_and_comments (input, n, ofs2);
724 if (input[ofs2] == '.')
726 ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1);
731 eol = is_end_of_line (input, n, ofs2);
/* The state depends on whether a new-line occurs within the first OFS2
   bytes of the input. */
736 if (memchr (input, '\n', ofs2))
737 s->state = S_BEGIN_DATA_1;
739 s->state = S_BEGIN_DATA_2;
/* Scans a quoted string whose opening quote is the byte at OFS; the same
   byte value is then looked for as the closing quote.  Reaching a new-line
   or null byte first yields SEG_EXPECTED_QUOTE.
   NOTE(review): the loop, the reason the quote is tested twice, the use of
   STRING_TYPE, and the return statements are not visible in this
   listing. */
752 segmenter_parse_string__ (enum segment_type string_type,
753 int ofs, struct segmenter *s,
754 const char *input, size_t n, enum segment_type *type)
756 int quote = input[ofs];
760 if (input[ofs] == quote)
765 else if (input[ofs] == quote)
774 else if (input[ofs] == '\n' || input[ofs] == '\0')
776 *type = SEG_EXPECTED_QUOTE;
/* Chooses between a prefixed string and an identifier.  When the second
   input byte is a single or double quote, the input is scanned as a string
   of STRING_TYPE whose quote is at offset 1; on the last visible path it
   is scanned as an identifier.
   NOTE(review): the first condition of the chain is not visible in this
   listing. */
787 segmenter_maybe_parse_string__ (enum segment_type string_type,
789 const char *input, size_t n,
790 enum segment_type *type)
794 else if (input[1] == '\'' || input[1] == '"')
795 return segmenter_parse_string__ (string_type, 1, s, input, n, type);
797 return segmenter_parse_id__ (s, input, n, type);
/* Scans one segment when S is in S_GENERAL and not at the start of a line
   (both are asserted).  The first character is decoded and the visible
   branches hand it to the matching scanner: comments, punctuators of one
   or two bytes, numbers, strings, white space, and identifiers.  Anything
   else is reported as SEG_UNEXPECTED_CHAR.
   NOTE(review): most case labels, the segment types assigned for
   punctuation and white space, and many return statements are not visible
   in this listing. */
801 segmenter_parse_mid_command__ (struct segmenter *s,
802 const char *input, size_t n,
803 enum segment_type *type)
809 assert (s->state == S_GENERAL);
810 assert (!(s->substate & SS_START_OF_LINE));
812 mblen = segmenter_u8_to_uc__ (&uc, input, n);
819 s->substate |= SS_START_OF_LINE;
/* A second byte that is an asterisk leads to skip_comment, which starts
   after the two opening bytes. */
826 else if (input[1] == '*')
828 ofs = skip_comment (input, n, 2);
842 case '(': case ')': case ',': case '=': case '-':
843 case '[': case ']': case '&': case '|': case '+':
849 if (s->substate & SS_START_OF_COMMAND)
851 /* '*' at the beginning of a command begins a comment. */
852 s->state = S_COMMENT_1;
853 return segmenter_parse_comment_1__ (s, input, n, type);
/* Punctuators that may take a second byte from the given set. */
856 return segmenter_parse_digraph__ ("*", s, input, n, type);
859 return segmenter_parse_digraph__ ("=>", s, input, n, type);
862 return segmenter_parse_digraph__ ("=", s, input, n, type);
865 return segmenter_parse_digraph__ ("=", s, input, n, type);
/* A digit in second position means the input is scanned as a number. */
870 else if (c_isdigit (input[1]))
871 return segmenter_parse_number__ (s, input, n, type);
/* Otherwise the outcome depends on whether the line ends after the first
   byte: SEG_END_COMMAND or SEG_UNEXPECTED_DOT. */
874 int eol = at_end_of_line (input, n, 1);
880 *type = SEG_END_COMMAND;
881 s->substate = SS_START_OF_COMMAND;
884 *type = SEG_UNEXPECTED_DOT;
889 case '0': case '1': case '2': case '3': case '4':
890 case '5': case '6': case '7': case '8': case '9':
891 return segmenter_parse_number__ (s, input, n, type);
894 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
898 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
902 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
906 if (lex_uc_is_space (uc))
908 ofs = skip_spaces (input, n, mblen);
/* Checks whether the run of spaces stopped between the two bytes of a
   CR-LF pair. */
912 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
916 s->substate |= SS_START_OF_LINE;
926 else if (lex_uc_is_id1 (uc))
927 return segmenter_parse_id__ (s, input, n, type);
930 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback passed to qsort for the table of command names.
   A_ and B_ point to elements of an array of string pointers; ordering is
   by c_strcasecmp.
   NOTE(review): the assignments to a and b (presumably *ap and *bp) are
   not visible in this listing. */
938 compare_commands (const void *a_, const void *b_)
940 const char *const *ap = a_;
941 const char *const *bp = b_;
945 return c_strcasecmp (a, b);
/* Returns a pointer into a sorted table of command names, positioned at
   the first name whose initial byte, upper-cased, equals FIRST
   upper-cased.  When no name starts with that byte, the result points at
   element n_commands of the table.
   NOTE(review): the guard that makes the sort and index construction run
   only once, and the contents of the last table element, are not visible
   in this listing. */
949 segmenter_get_command_name_candidates (unsigned char first)
/* Both macros reduce an entry of command.def to its name, so including
   the file fills the array with names only. */
951 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
952 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
953 static const char *commands[] =
955 #include "language/command.def"
/* One element fewer than the array holds, so commands[n_commands] is still
   a valid element. */
958 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
/* Indexed by byte value; each entry points at a position in commands. */
964 static const char **cindex[UCHAR_MAX + 1];
972 qsort (commands, n_commands, sizeof *commands, compare_commands);
/* Record the first sorted name for each initial byte. */
973 for (i = 0; i < n_commands; i++)
975 unsigned char c = c_toupper (commands[i][0]);
976 if (cindex[c] == NULL)
977 cindex[c] = &commands[i];
/* Bytes that start no command name point at the element past the sorted
   names. */
979 for (i = 0; i <= UCHAR_MAX; i++)
980 if (cindex[i] == NULL)
981 cindex[i] = &commands[n_commands];
984 return cindex[c_toupper (first)];
/* Looks for a command name in INPUT.  Characters are decoded starting at
   OFS until a new-line, a null, or a character that is neither white
   space, an identifier character, nor a hyphen.  The first OFS bytes of
   INPUT are then matched with command_match against each candidate name
   that shares the first byte of INPUT, ignoring ASCII case.
   NOTE(review): the loop bounds, the handling of a trailing period, and
   the values returned are not visible in this listing. */
988 segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
990 const char **commands;
1003 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1007 if (uc == '\n' || uc == '\0'
1008 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1013 if (input[ofs - 1] == '.')
/* Walk the sorted candidates while their first byte still matches. */
1016 for (commands = segmenter_get_command_name_candidates (input[0]);
1017 c_toupper (input[0]) == c_toupper ((*commands)[0]);
/* Accept a match only when command_match reports no missing words. */
1023 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1024 &exact, &missing_words)
1025 && missing_words <= 0)
/* Tests what begins at OFS.  When C is x, X, u or U, the result is whether
   the next byte is a single or double quote; otherwise the result is
   whether C itself is a quote or a new-line.
   NOTE(review): the assignment to C (presumably the byte at OFS) and any
   bounds check before reading input[ofs + 1] are not visible in this
   listing. */
1033 is_start_of_string__ (const char *input, size_t n, int ofs)
1038 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1043 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1046 return c == '\'' || c == '"' || c == '\n';
/* Scans one segment when S is in S_GENERAL at the start of a line (both
   are asserted).  Visible outcomes are SEG_START_COMMAND,
   SEG_SEPARATE_COMMANDS for a line that holds nothing but spaces and
   comments, and a hand-off to segmenter_parse_mid_command__ with the
   substate set to SS_START_OF_COMMAND.
   NOTE(review): the case labels, the tests on the results of the helper
   calls, and most return statements are not visible in this listing. */
1050 segmenter_parse_start_of_line__ (struct segmenter *s,
1051 const char *input, size_t n,
1052 enum segment_type *type)
1058 assert (s->state == S_GENERAL);
1059 assert (s->substate & SS_START_OF_LINE);
1061 mblen = segmenter_u8_to_uc__ (&uc, input, n);
/* Look past the first byte at what follows it. */
1068 ofs = skip_spaces_and_comments (input, n, 1);
1073 int is_string = is_start_of_string__ (input, n, ofs);
1078 /* This is punctuation that may separate pieces of a string. */
1088 *type = SEG_START_COMMAND;
1089 s->substate = SS_START_OF_COMMAND;
1093 if (lex_uc_is_space (uc))
1095 int eol = at_end_of_line (input, n, 0);
1100 s->substate = SS_START_OF_COMMAND;
1101 *type = SEG_SEPARATE_COMMANDS;
/* The syntax mode decides what an unindented line means. */
1107 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1109 else if (s->mode == SEG_MODE_AUTO)
1111 int cmd = segmenter_detect_command_name__ (input, n, 0);
1118 assert (s->mode == SEG_MODE_BATCH);
1120 s->substate = SS_START_OF_COMMAND;
1121 *type = SEG_START_COMMAND;
/* Clearing SS_START_OF_LINE satisfies the assertion made by the
   mid-command scanner. */
1125 s->substate = SS_START_OF_COMMAND;
1126 return segmenter_parse_mid_command__ (s, input, n, type);
/* Scans one segment with a scratch segmenter SUB that is in S_GENERAL.
   When the segment is an identifier, it is asserted to match LABEL, and
   segmenter_unquoted is consulted about what follows; one path then sets
   the state to S_TITLE_1.  The substate of SUB is copied back into S.
   NOTE(review): the rest of the initialization of SUB, the test applied to
   result, and the return statements are not visible in this listing. */
1130 segmenter_parse_file_label__ (struct segmenter *s,
1131 const char *input, size_t n,
1132 enum segment_type *type)
1134 struct segmenter sub;
1138 sub.state = S_GENERAL;
1139 ofs = segmenter_push (&sub, input, n, type);
1143 else if (*type == SEG_IDENTIFIER)
1147 assert (lex_id_match (ss_cstr ("LABEL"),
1148 ss_buffer ((char *) input, ofs)));
1149 result = segmenter_unquoted (input, n, ofs);
1155 s->state = S_TITLE_1;
1163 s->substate = sub.substate;
/* Scans one segment as if S were in S_GENERAL without changing the state
   of S: a scratch segmenter SUB receives the substate of S, is pushed the
   input, and its resulting substate is copied back into S.
   NOTE(review): the rest of the initialization of SUB and the return
   statement (presumably the result of segmenter_push) are not visible in
   this listing. */
1169 segmenter_subparse (struct segmenter *s,
1170 const char *input, size_t n, enum segment_type *type)
1172 struct segmenter sub;
1176 sub.state = S_GENERAL;
1177 sub.substate = s->substate;
1178 ofs = segmenter_push (&sub, input, n, type);
1179 s->substate = sub.substate;
/* Scans one segment through segmenter_subparse and picks the next state
   from its type: SEG_START_COMMAND or SEG_SEPARATE_COMMANDS leads to
   S_DO_REPEAT_2, and SEG_END_COMMAND leads to S_DO_REPEAT_3.
   NOTE(review): the handling of a negative result, any other statement in
   the SEG_END_COMMAND branch, and the return statement are not visible in
   this listing. */
1184 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1185 const char *input, size_t n,
1186 enum segment_type *type)
1188 int ofs = segmenter_subparse (s, input, n, type);
1192 if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
1193 s->state = S_DO_REPEAT_2;
1194 else if (*type == SEG_END_COMMAND)
1196 s->state = S_DO_REPEAT_3;
/* Scans one segment through segmenter_subparse; a SEG_NEWLINE segment
   moves S to S_DO_REPEAT_3.
   NOTE(review): the handling of a negative result, any other statement in
   the SEG_NEWLINE branch, and the return statement are not visible in this
   listing. */
1204 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1205 const char *input, size_t n,
1206 enum segment_type *type)
1208 int ofs = segmenter_subparse (s, input, n, type);
1212 if (*type == SEG_NEWLINE)
1214 s->state = S_DO_REPEAT_3;
/* Examines a line for a DO REPEAT or END REPEAT command.  A plus or minus
   sign at the current offset is tested for first.  The first identifier
   must match DO or END; when the identifier after it matches REPEAT, the
   substate of S is adjusted by direction.
   NOTE(review): the initial value of ofs, the values direction takes for
   DO and for END, and the return values are not visible in this
   listing. */
1222 check_repeat_command (struct segmenter *s,
1223 const char *input, size_t n)
1230 if (input[ofs] == '+' || input[ofs] == '-')
1233 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1236 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1238 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1243 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1247 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
/* Here the substate is used as a number rather than as bit flags. */
1248 s->substate += direction;
/* Finds the end of the current line by searching the N bytes of INPUT for
   the first new-line or null byte.  An empty line, with or without a
   carriage return before the new-line, is reported as SEG_NEWLINE.  On the
   last visible path the result is the offset of the new-line, less one
   when a carriage return precedes it.
   NOTE(review): the results for a failed search, for a null byte, and for
   the empty-line case are not visible in this listing. */
1253 segmenter_parse_full_line__ (const char *input, size_t n,
1254 enum segment_type *type)
1256 const char *newline = memchr2 (input, '\n', '\0', n);
1258 if (newline == NULL)
1262 int ofs = newline - input;
1263 if (*newline == '\0')
1268 else if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1270 *type = SEG_NEWLINE;
/* Leave a carriage return before the new-line out of the count. */
1274 return ofs - (input[ofs - 1] == '\r');
/* Handles one whole line, found with segmenter_parse_full_line__, and
   passes it to check_repeat_command.  When the substate of S is then
   zero, S returns to S_GENERAL at the start of a command and line and the
   same input is scanned again through segmenter_push.  Otherwise the line
   is reported as SEG_DO_REPEAT_COMMAND.
   NOTE(review): what the first two branches do and the final return
   statement are not visible in this listing. */
1279 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1280 const char *input, size_t n,
1281 enum segment_type *type)
1285 ofs = segmenter_parse_full_line__ (input, n, type);
1286 if (ofs < 0 || input[ofs - 1] == '\n')
1288 else if (!check_repeat_command (s, input, n))
1290 else if (s->substate == 0)
1292 s->state = S_GENERAL;
1293 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1294 return segmenter_push (s, input, n, type);
1298 *type = SEG_DO_REPEAT_COMMAND;
/* Scans one segment through segmenter_subparse; a SEG_NEWLINE segment
   moves S to S_BEGIN_DATA_2.
   NOTE(review): the handling of a negative result and the return statement
   are not visible in this listing. */
1304 segmenter_parse_begin_data_1__ (struct segmenter *s,
1305 const char *input, size_t n,
1306 enum segment_type *type)
1308 int ofs = segmenter_subparse (s, input, n, type);
1312 if (*type == SEG_NEWLINE)
1313 s->state = S_BEGIN_DATA_2;
/* Scans one segment through segmenter_subparse; a SEG_NEWLINE segment
   moves S to S_BEGIN_DATA_3.
   NOTE(review): the handling of a negative result and the return statement
   are not visible in this listing. */
1319 segmenter_parse_begin_data_2__ (struct segmenter *s,
1320 const char *input, size_t n,
1321 enum segment_type *type)
1323 int ofs = segmenter_subparse (s, input, n, type);
1327 if (*type == SEG_NEWLINE)
1328 s->state = S_BEGIN_DATA_3;
/* Tests the N bytes at INPUT for the words END and DATA, each compared
   without regard to ASCII case.  The character decoded after END is
   tested with lex_uc_is_space, and characters after DATA are decoded and
   tested the same way.
   NOTE(review): the loops, the test that precedes the last space check,
   and every return value are not visible in this listing. */
1334 is_end_data (const char *input, size_t n)
1336 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1342 if (n < 3 || c_strncasecmp (input, "END", 3))
1346 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1347 if (!lex_uc_is_space (uc))
1351 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1358 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1365 else if (!lex_uc_is_space (uc))
/* Handles one whole line, found with segmenter_parse_full_line__.  When
   is_end_data accepts the line, S returns to S_GENERAL at the start of a
   command and line and the same input is scanned again through
   segmenter_push.  Otherwise the line is reported as SEG_INLINE_DATA and
   the state becomes S_BEGIN_DATA_4.
   NOTE(review): the first branch of the chain is not visible in this
   listing. */
1374 segmenter_parse_begin_data_3__ (struct segmenter *s,
1375 const char *input, size_t n,
1376 enum segment_type *type)
1380 ofs = segmenter_parse_full_line__ (input, n, type);
1383 else if (is_end_data (input, ofs))
1385 s->state = S_GENERAL;
1386 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1387 return segmenter_push (s, input, n, type);
1391 *type = SEG_INLINE_DATA;
1392 s->state = S_BEGIN_DATA_4;
/* The segment length is 0 when the byte before OFS is a new-line, and OFS
   otherwise. */
1393 return input[ofs - 1] == '\n' ? 0 : ofs;
/* Consumes the line ending after a line of inline data by delegating to
   segmenter_parse_newline__, then sets the state back to S_BEGIN_DATA_3.
   NOTE(review): the handling of a negative result and the return statement
   are not visible in this listing. */
1398 segmenter_parse_begin_data_4__ (struct segmenter *s,
1399 const char *input, size_t n,
1400 enum segment_type *type)
1404 ofs = segmenter_parse_newline__ (input, n, type);
1408 s->state = S_BEGIN_DATA_3;
/* Skips white space from the start of INPUT with skip_spaces, then sets
   the state to S_TITLE_2.
   NOTE(review): the handling of a failed skip, the segment type stored in
   *TYPE, and the return statement are not visible in this listing. */
1413 segmenter_parse_title_1__ (struct segmenter *s,
1414 const char *input, size_t n,
1415 enum segment_type *type)
1419 ofs = skip_spaces (input, n, 0);
1422 s->state = S_TITLE_2;
/* Scans text that is reported as SEG_UNQUOTED_STRING and returns S to
   S_GENERAL.  The length returned is endcmd when that is nonnegative, and
   ofs otherwise.
   NOTE(review): the loop, the condition that ends the scan, and how endcmd
   is maintained are not visible in this listing. */
1428 segmenter_parse_title_2__ (struct segmenter *s,
1429 const char *input, size_t n,
1430 enum segment_type *type)
1442 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1450 s->state = S_GENERAL;
1452 *type = SEG_UNQUOTED_STRING;
1453 return endcmd >= 0 ? endcmd : ofs;
1460 if (!lex_uc_is_space (uc))
1471 /* Returns the name of segment TYPE as a string. The caller must not modify
1472 or free the returned string.
1474 This is useful only for debugging and testing. */
1476 segment_type_to_string (enum segment_type type)
/* SEG_TYPE turns a segment type name into a case that returns that name
   as a string literal.
   NOTE(review): the switch and the place where the macro is expanded are
   not visible in this listing. */
1480 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
/* Result when none of the generated cases returned. */
1484 return "unknown segment type";
1488 /* Initializes S as a segmenter with the given syntax MODE.
1490 A segmenter does not contain any external references, so nothing needs to be
1491 done to destroy one. For the same reason, segmenters may be copied with
1492 plain struct assignment (or memcpy). */
1494 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
/* Every segmenter starts in S_SHBANG.
   NOTE(review): the assignments to the other members of S are not visible
   in this listing. */
1496 s->state = S_SHBANG;
1501 /* Returns the mode passed to segmenter_init() for S. */
/* NOTE(review): only the signature is visible in this listing; the body
   presumably returns s->mode, the member that other functions in this
   file read -- confirm against the full source. */
1503 segmenter_get_mode (const struct segmenter *s)
1508 /* Attempts to label a prefix of S's remaining input with a segment type. The
1509 caller supplies the first N bytes of the remaining input as INPUT, which
1510 must be a UTF-8 encoded string. The end of the input stream must be
1511 indicated by a null byte at the beginning of a line, that is, immediately
1512 following a new-line (or as the first byte of the input stream).
1514 The input may contain '\n' or '\r\n' line ends in any combination.
1516 If successful, returns the number of bytes in the segment at the beginning
1517 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1518 into *TYPE. The next call to segmenter_push() should not include those
1519 bytes as part of INPUT, because they have (figuratively) been consumed by the segmenter.
1522 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1523 be determined. In this case segmenter_push() returns -1. The caller should
1524 obtain more input and then call segmenter_push() again with a larger N and
1525 repeat until the input is exhausted (which must be indicated as described
1526 above) or until a valid segment is returned. segmenter_push() will never
1527 return -1 when the end of input is visible within INPUT.
1529 The caller must not, in a sequence of calls, supply contradictory input.
1530 That is, bytes provided as part of INPUT in one call, but not consumed, must
1531 not be provided with *different* values on subsequent calls. This is
1532 because segmenter_push() must often make decisions based on looking ahead
1533 beyond the bytes that it consumes. */
1535 segmenter_push (struct segmenter *s, const char *input, size_t n,
1536 enum segment_type *type)
/* A null byte at the start of INPUT is tested for before any dispatch.
   NOTE(review): what that branch does is not visible in this listing. */
1541 if (input[0] == '\0')
/* Dispatch to a state-specific parser.
   NOTE(review): most case labels of the switch are not visible in this
   listing. */
1550 return segmenter_parse_shbang__ (s, input, n, type);
/* Two parsers exist for general syntax, chosen by whether the input is at
   the start of a line. */
1553 return (s->substate & SS_START_OF_LINE
1554 ? segmenter_parse_start_of_line__ (s, input, n, type)
1555 : segmenter_parse_mid_command__ (s, input, n, type));
1558 return segmenter_parse_comment_1__ (s, input, n, type);
1560 return segmenter_parse_comment_2__ (s, input, n, type);
1563 return segmenter_parse_document_1__ (s, input, n, type);
1565 return segmenter_parse_document_2__ (s, input, n, type);
/* This parser takes no input buffer. */
1567 return segmenter_parse_document_3__ (s, type);
1570 return segmenter_parse_file_label__ (s, input, n, type);
1573 return segmenter_parse_do_repeat_1__ (s, input, n, type);
1575 return segmenter_parse_do_repeat_2__ (s, input, n, type);
1577 return segmenter_parse_do_repeat_3__ (s, input, n, type);
1579 case S_BEGIN_DATA_1:
1580 return segmenter_parse_begin_data_1__ (s, input, n, type);
1581 case S_BEGIN_DATA_2:
1582 return segmenter_parse_begin_data_2__ (s, input, n, type);
1583 case S_BEGIN_DATA_3:
1584 return segmenter_parse_begin_data_3__ (s, input, n, type);
1585 case S_BEGIN_DATA_4:
1586 return segmenter_parse_begin_data_4__ (s, input, n, type);
1589 return segmenter_parse_title_1__ (s, input, n, type);
1591 return segmenter_parse_title_2__ (s, input, n, type);
1597 /* Returns the style of command prompt to display to an interactive user for
1598 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1599 and at the beginning of a line (that is, if segmenter_push() consumed as
1600 much as possible of the input up to a new-line). */
1602 segmenter_get_prompt (const struct segmenter *s)
/* NOTE(review): most case labels of the switch are not visible in this
   listing, so only the S_BEGIN_DATA cases below can be tied to a
   state. */
1607 return PROMPT_FIRST;
/* In this state the prompt depends on whether a command is starting. */
1610 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1614 return PROMPT_COMMENT;
1618 return PROMPT_DOCUMENT;
1620 return PROMPT_FIRST;
1623 return PROMPT_LATER;
1627 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1629 return PROMPT_DO_REPEAT;
1631 case S_BEGIN_DATA_1:
1632 return PROMPT_FIRST;
1633 case S_BEGIN_DATA_2:
1634 return PROMPT_LATER;
1635 case S_BEGIN_DATA_3:
1636 case S_BEGIN_DATA_4:
/* NOTE(review): the return for the two cases above is not visible; the
   return below presumably belongs to a later part of the function. */
1641 return PROMPT_FIRST;