1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
/* Bit flags kept in a segmenter's `substate' member.  Elsewhere in this file
   they are combined with `|' when assigned and tested with `&'. */
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration: segmenter_parse_comment_2__() calls this function
   before its definition appears. */
56 static int segmenter_detect_command_name__ (const char *input,
/* Decodes the UTF-8 character at the start of INPUT_, of which N bytes are
   available, into *PUC.  u8_mbtoucr() is tried first and its result is
   returned whenever it is nonnegative; u8_mbtouc() serves on the fallback
   path. */
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n)
62 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
67 mblen = u8_mbtoucr (puc, input, n);
68 return (mblen >= 0 ? mblen
70 : u8_mbtouc (puc, input, n));
/* Handles state S_SHBANG, the state in which segmenter_init() starts a
   segmenter.  When INPUT[1] is '!', the line is scanned from offset 2 for a
   new-line (allowing for a '\r' just before it) and the substate becomes
   SS_START_OF_COMMAND.  The fallback path moves S to S_GENERAL at the start
   of a line and of a command, then hands INPUT back to segmenter_push(). */
74 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
75 enum segment_type *type)
81 else if (input[1] == '!')
85 for (ofs = 2; ofs < n; ofs++)
86 if (input[ofs] == '\n')
88 if (input[ofs - 1] == '\r')
92 s->substate = SS_START_OF_COMMAND;
101 s->state = S_GENERAL;
102 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
103 return segmenter_push (s, input, n, type);
/* Measures a one- or two-byte punctuation segment at INPUT.  The result is 2
   when INPUT[1] is a non-null byte that occurs in the string SECONDS, and 1
   otherwise.  S must be in state S_GENERAL (asserted). */
107 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
108 const char *input, size_t n,
109 enum segment_type *type)
111 assert (s->state == S_GENERAL);
118 return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1;
/* Scans the N bytes of INPUT forward from OFS, looking at each byte for a
   new-line or for a '*' byte followed by a '/' byte.  Callers in this file
   pass the offset two bytes past the point where they saw a '*' in second
   position, and store the result back into their own offset. */
122 skip_comment (const char *input, size_t n, size_t ofs)
124 for (; ofs < n; ofs++)
126 if (input[ofs] == '\n')
128 else if (input[ofs] == '*')
132 else if (input[ofs + 1] == '/')
/* Starting at OFS in the N bytes of INPUT, decodes one character at a time.
   Comments are passed over with skip_comment(), and white-space characters
   other than new-line are recognized with lex_uc_is_space().  Callers store
   the result back into their offset variable. */
140 skip_spaces_and_comments (const char *input, size_t n, int ofs)
147 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
155 else if (input[ofs + 1] != '*')
158 ofs = skip_comment (input, n, ofs + 2);
162 else if (lex_uc_is_space (uc) && uc != '\n')
/* Checks for a line end at offset OFS in INPUT: either a '\n' byte, or a
   '\r' byte whose successor is '\n'. */
172 is_end_of_line (const char *input, size_t n, int ofs)
174 if (input[ofs] == '\n')
176 else if (input[ofs] == '\r')
180 return input[ofs + 1] == '\n';
/* Like is_end_of_line(), but first advances OFS over spaces and comments with
   skip_spaces_and_comments(). */
187 at_end_of_line (const char *input, size_t n, int ofs)
189 ofs = skip_spaces_and_comments (input, n, ofs);
193 return is_end_of_line (input, n, ofs);
/* Parses the line end at the start of INPUT.  The line end must be '\n' or
   '\r' followed by '\n'; the two-byte form is enforced with assertions.
   Callers use the result as a byte offset and test it for being negative. */
198 segmenter_parse_newline__ (const char *input, size_t n,
199 enum segment_type *type)
203 if (input[0] == '\n')
210 assert (input[0] == '\r');
211 assert (input[1] == '\n');
/* Starting at OFS in the N bytes of INPUT, decodes characters and tests each
   one: a character that is not white space, or that is a new-line, is
   treated differently from other white space.  Callers store the result back
   into their offset. */
220 skip_spaces (const char *input, size_t n, size_t ofs)
227 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc) || uc == '\n')
/* Walks forward from OFS in the N bytes of INPUT, testing each byte with
   c_isdigit().  Callers store the result back into their offset. */
241 skip_digits (const char *input, size_t n, int ofs)
243 for (; ofs < n; ofs++)
244 if (!c_isdigit (input[ofs]))
/* Parses a number at the start of INPUT; S must be in state S_GENERAL
   (asserted).  The pieces are recognized in order: a run of digits, an
   optional '.' with more digits, and an optional exponent introduced by 'e'
   or 'E' with an optional '+' or '-' sign.  When no digit follows the
   exponent marker and sign, *TYPE becomes SEG_EXPECTED_EXPONENT.  A '.' as
   the last byte scanned is checked against the end of the line with
   at_end_of_line(). */
250 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
251 enum segment_type *type)
255 assert (s->state == S_GENERAL);
257 ofs = skip_digits (input, n, 0);
261 if (input[ofs] == '.')
263 ofs = skip_digits (input, n, ofs + 1);
270 if (input[ofs] == 'e' || input[ofs] == 'E')
276 if (input[ofs] == '+' || input[ofs] == '-')
283 if (!c_isdigit (input[ofs]))
285 *type = SEG_EXPECTED_EXPONENT;
290 ofs = skip_digits (input, n, ofs);
295 if (input[ofs - 1] == '.')
297 int eol = at_end_of_line (input, n, ofs);
/* Compares the leading bytes of S, upper-cased with c_toupper(), against the
   reserved words.  The two-letter words are BY, EQ, GE, GT, LE, LT, NE, OR
   and TO; the three-letter words are ALL, AND and NOT; the four-letter word
   is WITH.
   NOTE(review): N is presumably the length of S in bytes and selects which
   group is compared -- confirm against the full function. */
310 is_reserved_word (const char *s, int n)
314 s0 = c_toupper (s[0]);
318 s1 = c_toupper (s[1]);
319 return ((s0 == 'B' && s1 == 'Y')
320 || (s0 == 'E' && s1 == 'Q')
321 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
322 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
323 || (s0 == 'N' && s1 == 'E')
324 || (s0 == 'O' && s1 == 'R')
325 || (s0 == 'T' && s1 == 'O'));
328 s1 = c_toupper (s[1]);
329 s2 = c_toupper (s[2]);
330 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
331 || (s1 == 'N' && s2 == 'D')))
332 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
335 s1 = c_toupper (s[1]);
336 s2 = c_toupper (s[2]);
337 s3 = c_toupper (s[3]);
338 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
/* Handles state S_COMMENT_1, the text of a comment command on one line.
   Characters are decoded one at a time.  At the line end (with any '\r'
   before the new-line accounted for) one of three outcomes is chosen, as the
   inline comments below describe: the command is ended by a blank line, ended
   by a '.' at the end of the line, or continued in state S_COMMENT_2. */
346 segmenter_parse_comment_1__ (struct segmenter *s,
347 const char *input, size_t n,
348 enum segment_type *type)
360 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
371 if (ofs > 1 && input[ofs - 1] == '\r')
376 /* Blank line ends comment command. */
377 s->state = S_GENERAL;
378 s->substate = SS_START_OF_COMMAND;
379 *type = SEG_SEPARATE_COMMANDS;
382 else if (endcmd >= 0)
384 /* '.' at end of line ends comment command. */
385 s->state = S_GENERAL;
387 *type = SEG_COMMENT_COMMAND;
392 /* Comment continues onto next line. */
393 *type = SEG_COMMENT_COMMAND;
394 s->state = S_COMMENT_2;
400 if (!lex_uc_is_space (uc))
/* Handles state S_COMMENT_2, entered from S_COMMENT_1 when a comment command
   continues onto another line.  The line end is parsed with
   segmenter_parse_newline__().  The first character of the following line
   (with '+', '-' and '.' singled out), the syntax mode, and possibly
   segmenter_detect_command_name__() are then consulted.  S ends up either in
   S_GENERAL at the start of a line and of a command, or back in
   S_COMMENT_1. */
411 segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n,
412 enum segment_type *type)
419 ofs = segmenter_parse_newline__ (input, n, type);
420 if (ofs < 0 || ofs >= n)
423 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
427 if (uc == '+' || uc == '-' || uc == '.')
429 else if (!lex_uc_is_space (uc))
432 case SEG_MODE_INTERACTIVE:
441 new_cmd = segmenter_detect_command_name__ (input, n, ofs);
454 s->state = S_GENERAL;
455 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
458 s->state = S_COMMENT_1;
/* Handles state S_DOCUMENT_1, one line of document text.  The line is
   reported as SEG_DOCUMENT, with any '\r' before the new-line accounted for.
   The next state is S_DOCUMENT_3 when END_CMD is set and S_DOCUMENT_2
   otherwise. */
463 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
464 enum segment_type *type)
476 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
487 if (ofs > 1 && input[ofs - 1] == '\r')
490 *type = SEG_DOCUMENT;
491 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
495 if (!lex_uc_is_space (uc))
/* Handles state S_DOCUMENT_2: parses the line end that follows a document
   line with segmenter_parse_newline__(), then returns S to S_DOCUMENT_1 for
   the next line. */
506 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
507 enum segment_type *type)
511 ofs = segmenter_parse_newline__ (input, n, type);
515 s->state = S_DOCUMENT_1;
/* Handles state S_DOCUMENT_3: reports SEG_END_COMMAND and returns S to
   S_GENERAL at the start of a line and of a command.  No input is examined;
   the function takes neither INPUT nor N. */
520 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
522 *type = SEG_END_COMMAND;
523 s->state = S_GENERAL;
524 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
/* Skips spaces and comments from OFS, then yields nonzero when the character
   C examined is none of a single quote, a double quote, a new-line, or a null
   byte.
   NOTE(review): C is presumably the byte at the resulting offset -- confirm
   against the full function. */
529 segmenter_unquoted (const char *input, size_t n, int ofs)
534 ofs = skip_spaces_and_comments (input, n, ofs);
539 return c != '\'' && c != '"' && c != '\n' && c != '\0';
/* Segments INPUT from OFS onward with a scratch segmenter SUB that starts in
   state S_GENERAL, so S itself is not modified (it is const-qualified).  Each
   segment's length RETVAL comes from segmenter_push().  A segment shorter
   than ID_SIZE is copied into ID with memcpy().  ID_SIZE must be positive
   (asserted).  Callers test the result for being negative and otherwise use
   it as an offset.
   NOTE(review): which of the segment types listed below stop the scan, and
   which are skipped, should be confirmed against the full switch
   statement. */
543 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
544 int ofs, char id[], size_t id_size)
546 struct segmenter sub;
548 assert (id_size > 0);
551 sub.state = S_GENERAL;
555 enum segment_type type;
558 retval = segmenter_push (&sub, input + ofs, n - ofs, &type);
574 if (retval < id_size)
576 memcpy (id, input + ofs, retval);
583 case SEG_QUOTED_STRING:
585 case SEG_UNICODE_STRING:
586 case SEG_UNQUOTED_STRING:
587 case SEG_RESERVED_WORD:
589 case SEG_COMMENT_COMMAND:
590 case SEG_DO_REPEAT_COMMAND:
591 case SEG_INLINE_DATA:
592 case SEG_START_DOCUMENT:
594 case SEG_START_COMMAND:
595 case SEG_SEPARATE_COMMANDS:
596 case SEG_END_COMMAND:
598 case SEG_EXPECTED_QUOTE:
599 case SEG_EXPECTED_EXPONENT:
600 case SEG_UNEXPECTED_DOT:
601 case SEG_UNEXPECTED_CHAR:
/* Parses an identifier or reserved word at the start of INPUT; S must be in
   state S_GENERAL (asserted).  *TYPE becomes SEG_RESERVED_WORD or
   SEG_IDENTIFIER according to is_reserved_word().  A trailing '.' is checked
   against the end of the line with at_end_of_line().

   When S is at the start of a command, the word may also move S into a
   command-specific state:

     COMMENT (matched with lex_id_match_n() and 4)   S_COMMENT_1
     DOCUMENT                                        S_DOCUMENT_1
     TITLE, SUBTITLE                                 S_TITLE_1
     FILE followed by LABEL                          S_FILE_LABEL
     DO followed by REPEAT                           S_DO_REPEAT_1
     BEGIN followed by DATA                          S_BEGIN_DATA_1 or _2

   The second word of a two-word command is fetched with
   next_id_in_command(). */
610 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
611 enum segment_type *type)
616 assert (s->state == S_GENERAL);
618 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
626 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
629 else if (!lex_uc_is_idn (uc))
635 if (input[ofs - 1] == '.')
637 int eol = at_end_of_line (input, n, ofs);
644 if (is_reserved_word (input, ofs))
645 *type = SEG_RESERVED_WORD;
647 *type = SEG_IDENTIFIER;
649 if (s->substate & SS_START_OF_COMMAND)
651 struct substring word = ss_buffer (input, ofs);
653 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
655 s->state = S_COMMENT_1;
656 return segmenter_parse_comment_1__ (s, input, n, type);
658 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
660 s->state = S_DOCUMENT_1;
661 *type = SEG_START_DOCUMENT;
664 else if (lex_id_match (ss_cstr ("TITLE"), word)
665 || lex_id_match (ss_cstr ("SUBTITLE"), word))
667 int result = segmenter_unquoted (input, n, ofs);
672 s->state = S_TITLE_1;
676 else if (lex_id_match (ss_cstr ("FILE"), word))
680 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
682 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
684 s->state = S_FILE_LABEL;
689 else if (lex_id_match (ss_cstr ("DO"), word))
693 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
695 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
697 s->state = S_DO_REPEAT_1;
702 else if (lex_id_match (ss_cstr ("BEGIN"), word))
707 ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id);
710 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
714 ofs2 = skip_spaces_and_comments (input, n, ofs2);
718 if (input[ofs2] == '.')
720 ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1);
725 eol = is_end_of_line (input, n, ofs2);
/* A new-line within the first OFS2 bytes selects S_BEGIN_DATA_1; otherwise
   S_BEGIN_DATA_2 is used. */
730 if (memchr (input, '\n', ofs2))
731 s->state = S_BEGIN_DATA_1;
733 s->state = S_BEGIN_DATA_2;
/* Parses a quoted string in INPUT whose opening quote character is
   INPUT[OFS].  Later bytes are compared against that same quote character.
   Reaching a new-line or a null byte sets *TYPE to SEG_EXPECTED_QUOTE.
   NOTE(review): STRING_TYPE is presumably the type stored into *TYPE when the
   string is properly terminated -- confirm against the full function. */
746 segmenter_parse_string__ (enum segment_type string_type,
747 int ofs, struct segmenter *s,
748 const char *input, size_t n, enum segment_type *type)
750 int quote = input[ofs];
754 if (input[ofs] == quote)
759 else if (input[ofs] == quote)
768 else if (input[ofs] == '\n' || input[ofs] == '\0')
770 *type = SEG_EXPECTED_QUOTE;
/* Decides between a prefixed string and an identifier.  When INPUT[1] is a
   single or double quote, the text is parsed as a string of STRING_TYPE whose
   quote is at offset 1; on the final path it is parsed as an identifier with
   segmenter_parse_id__(). */
781 segmenter_maybe_parse_string__ (enum segment_type string_type,
783 const char *input, size_t n,
784 enum segment_type *type)
788 else if (input[1] == '\'' || input[1] == '"')
789 return segmenter_parse_string__ (string_type, 1, s, input, n, type);
791 return segmenter_parse_id__ (s, input, n, type);
/* Parses one segment in state S_GENERAL when S is not at the start of a line
   (both conditions are asserted).  The first character of INPUT selects the
   handler:

     - A '*' in second position leads to skip_comment() from offset 2.
     - '*' at the start of a command begins a comment command (state
       S_COMMENT_1); otherwise it goes to segmenter_parse_digraph__().
     - Other operators that can form two-byte tokens also go to
       segmenter_parse_digraph__(), with "=>" or "=" as the permitted second
       bytes.
     - Digits go to segmenter_parse_number__(), as does a first character
       followed by a digit in the branch that otherwise reports
       SEG_END_COMMAND or SEG_UNEXPECTED_DOT after consulting
       at_end_of_line().
     - String prefixes and quotes go to segmenter_maybe_parse_string__() and
       segmenter_parse_string__().
     - White space is passed over with skip_spaces(); a "\r\n" pair is
       checked for, and SS_START_OF_LINE may be set.
     - Characters accepted by lex_uc_is_id1() go to segmenter_parse_id__().
     - On the last path *TYPE becomes SEG_UNEXPECTED_CHAR. */
795 segmenter_parse_mid_command__ (struct segmenter *s,
796 const char *input, size_t n,
797 enum segment_type *type)
803 assert (s->state == S_GENERAL);
804 assert (!(s->substate & SS_START_OF_LINE));
806 mblen = segmenter_u8_to_uc__ (&uc, input, n);
813 s->substate |= SS_START_OF_LINE;
820 else if (input[1] == '*')
822 ofs = skip_comment (input, n, 2);
836 case '(': case ')': case ',': case '=': case '-':
837 case '[': case ']': case '&': case '|': case '+':
843 if (s->substate & SS_START_OF_COMMAND)
845 /* '*' at the beginning of a command begins a comment. */
846 s->state = S_COMMENT_1;
847 return segmenter_parse_comment_1__ (s, input, n, type);
850 return segmenter_parse_digraph__ ("*", s, input, n, type);
853 return segmenter_parse_digraph__ ("=>", s, input, n, type);
856 return segmenter_parse_digraph__ ("=", s, input, n, type);
859 return segmenter_parse_digraph__ ("=", s, input, n, type);
864 else if (c_isdigit (input[1]))
865 return segmenter_parse_number__ (s, input, n, type);
868 int eol = at_end_of_line (input, n, 1);
874 *type = SEG_END_COMMAND;
875 s->substate = SS_START_OF_COMMAND;
878 *type = SEG_UNEXPECTED_DOT;
883 case '0': case '1': case '2': case '3': case '4':
884 case '5': case '6': case '7': case '8': case '9':
885 return segmenter_parse_number__ (s, input, n, type);
888 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
892 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
896 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
900 if (lex_uc_is_space (uc))
902 ofs = skip_spaces (input, n, mblen);
906 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
910 s->substate |= SS_START_OF_LINE;
920 else if (lex_uc_is_id1 (uc))
921 return segmenter_parse_id__ (s, input, n, type);
924 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback handed to qsort() for the array of command names.  A_
   and B_ point to `const char *' elements, and the ordering is that of
   c_strcasecmp(). */
932 compare_commands (const void *a_, const void *b_)
934 const char *const *ap = a_;
935 const char *const *bp = b_;
939 return c_strcasecmp (a, b);
/* Returns a pointer into a table of command names, positioned at the first
   name whose initial letter is FIRST (compared after c_toupper()).

   The table is built by including "language/command.def" with DEF_CMD and
   UNIMPL_CMD each defined to expand to just the NAME argument.  N_COMMANDS is
   one less than the array's element count, so commands[N_COMMANDS] is the
   array's last element and is left out of the sort.  The names are sorted
   with compare_commands().  CINDEX is indexed by byte value: each entry
   points at the first name starting with that letter, and entries with no
   such name point at &commands[N_COMMANDS].
   NOTE(review): the code that makes the sort and indexing run only once is
   not shown with these lines -- confirm before relying on it. */
943 segmenter_get_command_name_candidates (unsigned char first)
945 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
946 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
947 static const char *commands[] =
949 #include "language/command.def"
952 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
958 static const char **cindex[UCHAR_MAX + 1];
966 qsort (commands, n_commands, sizeof *commands, compare_commands);
967 for (i = 0; i < n_commands; i++)
969 unsigned char c = c_toupper (commands[i][0]);
970 if (cindex[c] == NULL)
971 cindex[c] = &commands[i];
973 for (i = 0; i <= UCHAR_MAX; i++)
974 if (cindex[i] == NULL)
975 cindex[i] = &commands[n_commands];
978 return cindex[c_toupper (first)];
/* Looks for a command name in INPUT.  Characters are scanned while they are
   white space, identifier characters (lex_uc_is_idn()), or '-', and a '.' as
   the last scanned byte is singled out.  The scanned text is then compared,
   with command_match(), against each candidate command that shares its first
   letter; a match counts only when MISSING_WORDS is not positive.
   NOTE(review): callers store the result in an int; the exact return values
   should be confirmed against the full function. */
982 segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
984 const char **commands;
997 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1002 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1007 if (input[ofs - 1] == '.')
1010 for (commands = segmenter_get_command_name_candidates (input[0]);
1011 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1017 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1018 &exact, &missing_words)
1019 && missing_words <= 0)
/* Classifies the character C examined at OFS.  When C is one of x, X, u or U,
   the result is whether the next byte is a single or double quote.  On the
   final path the result is whether C itself is a single quote, a double
   quote, or a new-line. */
1027 is_start_of_string__ (const char *input, size_t n, int ofs)
1032 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1037 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1040 return c == '\'' || c == '"' || c == '\n';
/* Parses one segment in state S_GENERAL when S is at the start of a line
   (both conditions are asserted).  The first character and the syntax mode
   decide whether a command boundary is reported:

     - For punctuation that may separate pieces of a string, what follows is
       checked with is_start_of_string__(); SEG_START_COMMAND may be
       reported.
     - For white space, at_end_of_line() from offset 0 is consulted;
       SEG_SEPARATE_COMMANDS may be reported.
     - Otherwise the mode matters.  One branch covers SEG_MODE_INTERACTIVE or
       a substate that already has SS_START_OF_COMMAND.  SEG_MODE_AUTO
       consults segmenter_detect_command_name__().  The remaining mode is
       asserted to be SEG_MODE_BATCH, and SEG_START_COMMAND may be reported.

   On the final path the substate is set to SS_START_OF_COMMAND alone, which
   clears SS_START_OF_LINE, and parsing continues in
   segmenter_parse_mid_command__(). */
1044 segmenter_parse_start_of_line__ (struct segmenter *s,
1045 const char *input, size_t n,
1046 enum segment_type *type)
1052 assert (s->state == S_GENERAL);
1053 assert (s->substate & SS_START_OF_LINE);
1055 mblen = segmenter_u8_to_uc__ (&uc, input, n);
1062 ofs = skip_spaces_and_comments (input, n, 1);
1067 int is_string = is_start_of_string__ (input, n, ofs);
1072 /* This is punctuation that may separate pieces of a string. */
1082 *type = SEG_START_COMMAND;
1083 s->substate = SS_START_OF_COMMAND;
1087 if (lex_uc_is_space (uc))
1089 int eol = at_end_of_line (input, n, 0);
1094 s->substate = SS_START_OF_COMMAND;
1095 *type = SEG_SEPARATE_COMMANDS;
1101 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1103 else if (s->mode == SEG_MODE_AUTO)
1105 int cmd = segmenter_detect_command_name__ (input, n, 0);
1112 assert (s->mode == SEG_MODE_BATCH);
1114 s->substate = SS_START_OF_COMMAND;
1115 *type = SEG_START_COMMAND;
1119 s->substate = SS_START_OF_COMMAND;
1120 return segmenter_parse_mid_command__ (s, input, n, type);
/* Handles state S_FILE_LABEL.  The next segment is obtained by running
   segmenter_push() on a scratch segmenter SUB in state S_GENERAL.  When that
   segment is an identifier, it is asserted to match LABEL.  What follows is
   then tested with segmenter_unquoted(), and S may move to S_TITLE_1.  SUB's
   substate is copied back into S. */
1124 segmenter_parse_file_label__ (struct segmenter *s,
1125 const char *input, size_t n,
1126 enum segment_type *type)
1128 struct segmenter sub;
1132 sub.state = S_GENERAL;
1133 ofs = segmenter_push (&sub, input, n, type);
1137 else if (*type == SEG_IDENTIFIER)
1141 assert (lex_id_match (ss_cstr ("LABEL"),
1142 ss_buffer ((char *) input, ofs)));
1143 result = segmenter_unquoted (input, n, ofs);
1149 s->state = S_TITLE_1;
1157 s->substate = sub.substate;
/* Segments INPUT as if S were in state S_GENERAL, without changing S's
   state.  A scratch segmenter SUB is given state S_GENERAL and S's substate,
   segmenter_push() is run on it, and SUB's resulting substate is copied back
   into S. */
1163 segmenter_subparse (struct segmenter *s,
1164 const char *input, size_t n, enum segment_type *type)
1166 struct segmenter sub;
1170 sub.state = S_GENERAL;
1171 sub.substate = s->substate;
1172 ofs = segmenter_push (&sub, input, n, type);
1173 s->substate = sub.substate;
/* Handles state S_DO_REPEAT_1.  The segment is obtained with
   segmenter_subparse().  SEG_START_COMMAND or SEG_SEPARATE_COMMANDS moves S
   to S_DO_REPEAT_2; SEG_END_COMMAND moves S to S_DO_REPEAT_3. */
1178 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1179 const char *input, size_t n,
1180 enum segment_type *type)
1182 int ofs = segmenter_subparse (s, input, n, type);
1186 if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
1187 s->state = S_DO_REPEAT_2;
1188 else if (*type == SEG_END_COMMAND)
1190 s->state = S_DO_REPEAT_3;
/* Handles state S_DO_REPEAT_2.  The segment is obtained with
   segmenter_subparse(); a SEG_NEWLINE segment moves S to S_DO_REPEAT_3. */
1198 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1199 const char *input, size_t n,
1200 enum segment_type *type)
1202 int ofs = segmenter_subparse (s, input, n, type);
1206 if (*type == SEG_NEWLINE)
1208 s->state = S_DO_REPEAT_3;
/* Examines a line for a DO REPEAT or END REPEAT command.  A leading '+' or
   '-' is recognized, and the first word, fetched with next_id_in_command(),
   is matched against DO and END.  When the word after it matches REPEAT,
   DIRECTION is added to S->substate.  In the DO REPEAT states the substate
   therefore acts as a counter, which segmenter_parse_do_repeat_3__()
   compares with 0.
   NOTE(review): the values DIRECTION takes for DO and for END should be
   confirmed against the full function. */
1216 check_repeat_command (struct segmenter *s,
1217 const char *input, size_t n)
1224 if (input[ofs] == '+' || input[ofs] == '-')
1227 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1230 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1232 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1237 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1241 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1242 s->substate += direction;
/* Measures the line at the start of INPUT by locating the first '\n' within
   N bytes with memchr().  A line consisting only of its line end ("\n" or
   "\r\n") sets *TYPE to SEG_NEWLINE.  For any other line the result is the
   offset of the new-line, reduced by one when a '\r' precedes it, so the
   line end is left out.  The case where no new-line is found is handled
   separately. */
1247 segmenter_parse_full_line__ (const char *input, size_t n,
1248 enum segment_type *type)
1250 const char *newline = memchr (input, '\n', n);
1252 if (newline == NULL)
1256 int ofs = newline - input;
1257 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1259 *type = SEG_NEWLINE;
1263 return ofs - (input[ofs - 1] == '\r');
/* Handles state S_DO_REPEAT_3, one full line inside DO REPEAT.  The line is
   measured with segmenter_parse_full_line__() and examined with
   check_repeat_command().  When S->substate has reached 0, S returns to
   S_GENERAL at the start of a line and of a command and the same INPUT is
   handed back to segmenter_push().  On the final path the line is reported
   as SEG_DO_REPEAT_COMMAND. */
1268 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1269 const char *input, size_t n,
1270 enum segment_type *type)
1274 ofs = segmenter_parse_full_line__ (input, n, type);
1275 if (ofs < 0 || input[ofs - 1] == '\n')
1277 else if (!check_repeat_command (s, input, n))
1279 else if (s->substate == 0)
1281 s->state = S_GENERAL;
1282 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1283 return segmenter_push (s, input, n, type);
1287 *type = SEG_DO_REPEAT_COMMAND;
/* Handles state S_BEGIN_DATA_1.  The segment is obtained with
   segmenter_subparse(); a SEG_NEWLINE segment moves S to S_BEGIN_DATA_2. */
1293 segmenter_parse_begin_data_1__ (struct segmenter *s,
1294 const char *input, size_t n,
1295 enum segment_type *type)
1297 int ofs = segmenter_subparse (s, input, n, type);
1301 if (*type == SEG_NEWLINE)
1302 s->state = S_BEGIN_DATA_2;
/* Handles state S_BEGIN_DATA_2.  The segment is obtained with
   segmenter_subparse(); a SEG_NEWLINE segment moves S to S_BEGIN_DATA_3. */
1308 segmenter_parse_begin_data_2__ (struct segmenter *s,
1309 const char *input, size_t n,
1310 enum segment_type *type)
1312 int ofs = segmenter_subparse (s, input, n, type);
1316 if (*type == SEG_NEWLINE)
1317 s->state = S_BEGIN_DATA_3;
/* Tests whether the N bytes at INPUT spell END DATA.  The text must begin
   with "END" (compared with c_strncasecmp(), so case is ignored), and the
   character examined next must be white space.  "DATA" is then required,
   again ignoring case, and characters after it are also tested for being
   white space. */
1323 is_end_data (const char *input, size_t n)
1325 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1331 if (n < 3 || c_strncasecmp (input, "END", 3))
1335 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1336 if (!lex_uc_is_space (uc))
1340 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1347 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1354 else if (!lex_uc_is_space (uc))
/* Handles state S_BEGIN_DATA_3, one full line of inline data.  The line is
   measured with segmenter_parse_full_line__().  When is_end_data() accepts
   it, S returns to S_GENERAL at the start of a line and of a command and the
   same INPUT is handed back to segmenter_push().  On the final path the line
   is reported as SEG_INLINE_DATA, S moves to S_BEGIN_DATA_4, and the result
   is 0 when INPUT[OFS - 1] is a new-line and OFS otherwise. */
1363 segmenter_parse_begin_data_3__ (struct segmenter *s,
1364 const char *input, size_t n,
1365 enum segment_type *type)
1369 ofs = segmenter_parse_full_line__ (input, n, type);
1372 else if (is_end_data (input, ofs))
1374 s->state = S_GENERAL;
1375 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1376 return segmenter_push (s, input, n, type);
1380 *type = SEG_INLINE_DATA;
1381 s->state = S_BEGIN_DATA_4;
1382 return input[ofs - 1] == '\n' ? 0 : ofs;
/* Handles state S_BEGIN_DATA_4: parses the line end after a line of inline
   data with segmenter_parse_newline__(), then returns S to S_BEGIN_DATA_3 for
   the next line. */
1387 segmenter_parse_begin_data_4__ (struct segmenter *s,
1388 const char *input, size_t n,
1389 enum segment_type *type)
1393 ofs = segmenter_parse_newline__ (input, n, type);
1397 s->state = S_BEGIN_DATA_3;
/* Handles state S_TITLE_1: passes over white space at the start of INPUT
   with skip_spaces(), then moves S to S_TITLE_2. */
1402 segmenter_parse_title_1__ (struct segmenter *s,
1403 const char *input, size_t n,
1404 enum segment_type *type)
1408 ofs = skip_spaces (input, n, 0);
1411 s->state = S_TITLE_2;
/* Handles state S_TITLE_2, the unquoted text of a title.  Characters are
   decoded one at a time, with non-space characters noted.  When the text
   ends, S returns to S_GENERAL and *TYPE becomes SEG_UNQUOTED_STRING.  The
   segment length is ENDCMD when that is nonnegative and OFS otherwise. */
1417 segmenter_parse_title_2__ (struct segmenter *s,
1418 const char *input, size_t n,
1419 enum segment_type *type)
1431 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1438 s->state = S_GENERAL;
1440 *type = SEG_UNQUOTED_STRING;
1441 return endcmd >= 0 ? endcmd : ofs;
1448 if (!lex_uc_is_space (uc))
1459 /* Returns the name of segment TYPE as a string. The caller must not modify
1460 or free the returned string.
1462 This is useful only for debugging and testing. */
1464 segment_type_to_string (enum segment_type type)
/* Each SEG_TYPE(NAME) use expands to a `case' for SEG_NAME that returns NAME
   spelled as a string literal, without the SEG_ prefix. */
1468 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
/* Fallback for a TYPE that no case matched. */
1472 return "unknown segment type";
1476 /* Initializes S as a segmenter with the given syntax MODE.
1478 A segmenter does not contain any external references, so nothing needs to be
1479 done to destroy one. For the same reason, segmenters may be copied with
1480 plain struct assignment (or memcpy). */
1482 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
/* A new segmenter starts in S_SHBANG, which segmenter_push() dispatches to
   segmenter_parse_shbang__(). */
1484 s->state = S_SHBANG;
1489 /* Returns the mode passed to segmenter_init() for S. */
/* S is const-qualified, so it is not modified.  Other code in this file
   reads the mode as S->mode. */
1491 segmenter_get_mode (const struct segmenter *s)
1496 /* Attempts to label a prefix of S's remaining input with a segment type. The
1497 caller supplies the first N bytes of the remaining input as INPUT, which
1498 must be a UTF-8 encoded string. The end of the input stream must be
1499 indicated by a null byte at the beginning of a line, that is, immediately
1500 following a new-line (or as the first byte of the input stream).
1502 The input may contain '\n' or '\r\n' line ends in any combination.
1504 If successful, returns the number of bytes in the segment at the beginning
1505 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1506 into *TYPE. The next call to segmenter_push() should not include those
1507 bytes as part of INPUT, because they have (figuratively) been consumed by
the segmenter.
1510 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1511 be determined. In this case segmenter_push() returns -1. The caller should
1512 obtain more input and then call segmenter_push() again with a larger N and
1513 repeat until the input is exhausted (which must be indicated as described
1514 above) or until a valid segment is returned. segmenter_push() will never
1515 return -1 when the end of input is visible within INPUT.
1517 The caller must not, in a sequence of calls, supply contradictory input.
1518 That is, bytes provided as part of INPUT in one call, but not consumed, must
1519 not be provided with *different* values on subsequent calls. This is
1520 because segmenter_push() must often make decisions based on looking ahead
1521 beyond the bytes that it consumes. */
1523 segmenter_push (struct segmenter *s, const char *input, size_t n,
1524 enum segment_type *type)
/* A null byte at the start of INPUT is the end-of-input marker described
   above and is checked before anything else. */
1529 if (input[0] == '\0')
/* The rest of the function dispatches to the parser for S's current state. */
1538 return segmenter_parse_shbang__ (s, input, n, type);
/* This dispatch uses the start-of-line parser when SS_START_OF_LINE is set in
   the substate, and the mid-command parser otherwise. */
1541 return (s->substate & SS_START_OF_LINE
1542 ? segmenter_parse_start_of_line__ (s, input, n, type)
1543 : segmenter_parse_mid_command__ (s, input, n, type));
1546 return segmenter_parse_comment_1__ (s, input, n, type);
1548 return segmenter_parse_comment_2__ (s, input, n, type);
1551 return segmenter_parse_document_1__ (s, input, n, type);
1553 return segmenter_parse_document_2__ (s, input, n, type);
1555 return segmenter_parse_document_3__ (s, type);
1558 return segmenter_parse_file_label__ (s, input, n, type);
1561 return segmenter_parse_do_repeat_1__ (s, input, n, type);
1563 return segmenter_parse_do_repeat_2__ (s, input, n, type);
1565 return segmenter_parse_do_repeat_3__ (s, input, n, type);
1567 case S_BEGIN_DATA_1:
1568 return segmenter_parse_begin_data_1__ (s, input, n, type);
1569 case S_BEGIN_DATA_2:
1570 return segmenter_parse_begin_data_2__ (s, input, n, type);
1571 case S_BEGIN_DATA_3:
1572 return segmenter_parse_begin_data_3__ (s, input, n, type);
1573 case S_BEGIN_DATA_4:
1574 return segmenter_parse_begin_data_4__ (s, input, n, type);
1577 return segmenter_parse_title_1__ (s, input, n, type);
1579 return segmenter_parse_title_2__ (s, input, n, type);
1585 /* Returns the style of command prompt to display to an interactive user for
1586 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1587 and at the beginning of a line (that is, if segmenter_push() consumed as
1588 much as possible of the input up to a new-line). */
/* S is only read (it is const-qualified).  Most branches return a fixed
   prompt.  Two branches instead return PROMPT_FIRST when SS_START_OF_COMMAND
   is set in the substate and PROMPT_LATER otherwise. */
1590 segmenter_get_prompt (const struct segmenter *s)
1595 return PROMPT_FIRST;
1598 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1602 return PROMPT_COMMENT;
1606 return PROMPT_DOCUMENT;
1608 return PROMPT_FIRST;
1611 return PROMPT_LATER;
1615 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1617 return PROMPT_DO_REPEAT;
1619 case S_BEGIN_DATA_1:
1620 return PROMPT_FIRST;
1621 case S_BEGIN_DATA_2:
1622 return PROMPT_LATER;
1623 case S_BEGIN_DATA_3:
1624 case S_BEGIN_DATA_4:
1629 return PROMPT_FIRST;