1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
/* Bit flags kept in a segmenter's substate.  Code in this file tests and
   sets them with & and | while the segmenter is in state S_GENERAL:
   SS_START_OF_LINE is checked to choose the start-of-line parser, and
   SS_START_OF_COMMAND to recognize constructs that are special only at the
   beginning of a command. */
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration; the function is defined further down in this file. */
56 static int segmenter_detect_command_name__ (const char *input,
/* Decodes the first UTF-8 character in the N bytes at INPUT_ into *PUC.
   u8_mbtoucr() is tried first and its result is returned when it is
   non-negative; the other visible arm of the conditional falls back to
   u8_mbtouc().
   NOTE(review): this listing elides lines, including one between the two
   visible arms of the conditional, so confirm the full return contract
   against the complete file. */
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n)
62 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
67 mblen = u8_mbtoucr (puc, input, n);
68 return (mblen >= 0 ? mblen
70 : u8_mbtouc (puc, input, n));
/* Handles state S_SHBANG, the state installed by segmenter_init().  When the
   second byte of INPUT is '!', scans from offset 2 for a '\n' (checking for a
   preceding '\r') and on that path sets the substate to SS_START_OF_COMMAND.
   The final visible statements instead switch to S_GENERAL at the start of a
   line and command and hand the same input back to segmenter_push().
   NOTE(review): the tests that choose between these paths, the stores to
   *TYPE, and the values returned on the '!' path are on lines elided from
   this listing. */
74 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
75 enum segment_type *type)
81 else if (input[1] == '!')
85 for (ofs = 2; ofs < n; ofs++)
86 if (input[ofs] == '\n')
88 if (input[ofs - 1] == '\r')
92 s->substate = SS_START_OF_COMMAND;
/* Not handled above: re-dispatch the same bytes as ordinary syntax. */
101 s->state = S_GENERAL;
102 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
103 return segmenter_push (s, input, n, type);
/* Sizes a punctuator that may be one or two bytes long.  SECONDS lists the
   bytes that may follow input[0] to form a two-byte token: the visible return
   yields 2 when input[1] is a non-NUL byte found in SECONDS, and 1 otherwise.
   Must only be called in state S_GENERAL (asserted).
   NOTE(review): lines between the assertion and the return are elided here;
   they may store *TYPE or return early. */
107 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
108 const char *input, size_t n,
109 enum segment_type *type)
111 assert (s->state == S_GENERAL);
118 return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1;
/* Scans INPUT forward from OFS, staying below N, and examines each byte for a
   new-line or for a '*' whose following byte is '/'.
   NOTE(review): the values returned for each case are on lines elided from
   this listing; confirm them against the complete file. */
122 skip_comment (const char *input, size_t n, size_t ofs)
124 for (; ofs < n; ofs++)
126 if (input[ofs] == '\n')
128 else if (input[ofs] == '*')
132 else if (input[ofs + 1] == '/')
/* Starting at OFS, decodes characters of INPUT with segmenter_u8_to_uc__().
   The visible branches handle a comment (input[ofs + 1] is tested for '*' and
   skip_comment() is then called at OFS + 2) and white space other than
   new-line.
   NOTE(review): OFS is an int while N is a size_t.  The loop structure, the
   test on the comment's first byte, and the return values are elided from
   this listing. */
140 skip_spaces_and_comments (const char *input, size_t n, int ofs)
147 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
155 else if (input[ofs + 1] != '*')
158 ofs = skip_comment (input, n, ofs + 2);
162 else if (lex_uc_is_space (uc) && uc != '\n')
/* Checks whether a line end starts at OFS in INPUT.  Tests for '\n', and for
   '\r', in which case the visible return reports whether the next byte is
   '\n'.
   NOTE(review): the results for the '\n' case and for any other byte, and any
   bounds checks against N, are on elided lines. */
172 is_end_of_line (const char *input, size_t n, int ofs)
174 if (input[ofs] == '\n')
176 else if (input[ofs] == '\r')
180 return input[ofs + 1] == '\n';
/* Like is_end_of_line(), but first advances OFS past spaces and comments with
   skip_spaces_and_comments().
   NOTE(review): lines between the two calls are elided; presumably they
   handle a failure result from the skip -- confirm. */
187 at_end_of_line (const char *input, size_t n, int ofs)
189 ofs = skip_spaces_and_comments (input, n, ofs);
193 return is_end_of_line (input, n, ofs);
/* Handles input that begins with a line end.  The first byte is either '\n'
   or, as the assertions require, '\r' immediately followed by '\n'.
   NOTE(review): the stores to *TYPE and the returned lengths are on elided
   lines. */
198 segmenter_parse_newline__ (const char *input, size_t n,
199 enum segment_type *type)
203 if (input[0] == '\n')
/* Callers must only get here with a "\r\n" pair. */
210 assert (input[0] == '\r');
211 assert (input[1] == '\n');
/* Examines characters of INPUT from OFS onward, decoding each with
   segmenter_u8_to_uc__(); the visible test singles out the first character
   that is not white space or that is a new-line.
   NOTE(review): the loop, the offset advance, and the return values are
   elided from this listing. */
220 skip_spaces (const char *input, size_t n, size_t ofs)
227 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc) || uc == '\n')
/* Walks OFS forward through INPUT, testing each byte below offset N with
   c_isdigit().
   NOTE(review): OFS is an int but is compared against the size_t N, so the
   comparison is performed unsigned.  The return statements are elided from
   this listing. */
241 skip_digits (const char *input, size_t n, int ofs)
243 for (; ofs < n; ofs++)
244 if (!c_isdigit (input[ofs]))
/* Segments a number in state S_GENERAL (asserted).  The visible steps, in
   order: leading digits; a '.' followed by more digits, if present; an 'e' or
   'E' exponent, if present, with an optional '+' or '-' sign, after which a
   non-digit makes *TYPE SEG_EXPECTED_EXPONENT; and finally, when the last
   byte consumed is '.', a check of whether that '.' is at the end of the
   line.
   NOTE(review): bounds checks against N, the normal store to *TYPE, and the
   return values are on elided lines. */
250 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
251 enum segment_type *type)
255 assert (s->state == S_GENERAL);
257 ofs = skip_digits (input, n, 0);
261 if (input[ofs] == '.')
263 ofs = skip_digits (input, n, ofs + 1);
270 if (input[ofs] == 'e' || input[ofs] == 'E')
276 if (input[ofs] == '+' || input[ofs] == '-')
283 if (!c_isdigit (input[ofs]))
285 *type = SEG_EXPECTED_EXPONENT;
290 ofs = skip_digits (input, n, ofs);
/* Number ends in '.': find out whether that '.' is the last thing on the
   line. */
295 if (input[ofs - 1] == '.')
297 int eol = at_end_of_line (input, n, ofs);
/* Tests whether S spells a reserved word, comparing letters case
   insensitively by way of c_toupper().  The visible groups recognize:
     - two letters: BY, EQ, GE, GT, LE, LT, NE, OR, TO;
     - three letters: ALL, AND, NOT;
     - four letters: WITH.
   NOTE(review): the dispatch that selects a group (presumably on N, the
   length of S) and the result for other lengths are elided from this
   listing. */
310 is_reserved_word (const char *s, int n)
314 s0 = c_toupper (s[0]);
/* Two-letter words. */
318 s1 = c_toupper (s[1]);
319 return ((s0 == 'B' && s1 == 'Y')
320 || (s0 == 'E' && s1 == 'Q')
321 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
322 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
323 || (s0 == 'N' && s1 == 'E')
324 || (s0 == 'O' && s1 == 'R')
325 || (s0 == 'T' && s1 == 'O'));
/* Three-letter words. */
328 s1 = c_toupper (s[1]);
329 s2 = c_toupper (s[2]);
330 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
331 || (s1 == 'N' && s2 == 'D')))
332 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
/* Four-letter words. */
335 s1 = c_toupper (s[1]);
336 s2 = c_toupper (s[2]);
337 s3 = c_toupper (s[3]);
338 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
/* Handles state S_COMMENT_1, which this file enters for the COMMENT command
   and for '*' at the start of a command.  Decodes INPUT one character at a
   time; the visible outcomes at a line end are:
     - blank line: back to S_GENERAL at the start of a command, reporting
       SEG_SEPARATE_COMMANDS;
     - endcmd >= 0: back to S_GENERAL, reporting SEG_COMMENT_COMMAND;
     - otherwise: SEG_COMMENT_COMMAND, continuing in S_COMMENT_2.
   NOTE(review): how endcmd is maintained, the substate stored on the second
   path, and the return values are on lines elided from this listing. */
346 segmenter_parse_comment_1__ (struct segmenter *s,
347 const char *input, size_t n,
348 enum segment_type *type)
360 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
371 if (ofs > 1 && input[ofs - 1] == '\r')
376 /* Blank line ends comment command. */
377 s->state = S_GENERAL;
378 s->substate = SS_START_OF_COMMAND;
379 *type = SEG_SEPARATE_COMMANDS;
382 else if (endcmd >= 0)
384 /* '.' at end of line ends comment command. */
385 s->state = S_GENERAL;
387 *type = SEG_COMMENT_COMMAND;
392 /* Comment continues onto next line. */
393 *type = SEG_COMMENT_COMMAND;
394 s->state = S_COMMENT_2;
400 if (!lex_uc_is_space (uc))
/* Handles state S_COMMENT_2, which segmenter_parse_comment_1__() selects when
   a comment continues onto the next line.  Consumes the line end with
   segmenter_parse_newline__(), then inspects the first character of the next
   line ('+', '-', '.' and non-space characters are tested, and
   segmenter_detect_command_name__() is consulted) before taking one of the
   two visible exits: S_GENERAL at the start of a line and command, or back
   to S_COMMENT_1.
   NOTE(review): the switch over the segmenter mode and the condition that
   separates the two exits are elided from this listing. */
411 segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n,
412 enum segment_type *type)
419 ofs = segmenter_parse_newline__ (input, n, type);
420 if (ofs < 0 || ofs >= n)
423 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
427 if (uc == '+' || uc == '-' || uc == '.')
429 else if (!lex_uc_is_space (uc))
432 case SEG_MODE_INTERACTIVE:
441 new_cmd = segmenter_detect_command_name__ (input, n, ofs);
454 s->state = S_GENERAL;
455 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
458 s->state = S_COMMENT_1;
/* Handles state S_DOCUMENT_1: scans one line of a DOCUMENT command, decoding
   INPUT character by character.  At the line end (a preceding '\r' is checked
   for) it reports SEG_DOCUMENT and moves to S_DOCUMENT_3 if end_cmd is set,
   otherwise to S_DOCUMENT_2.
   NOTE(review): the maintenance of end_cmd and the return values are on
   elided lines. */
463 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
464 enum segment_type *type)
476 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
487 if (ofs > 1 && input[ofs - 1] == '\r')
490 *type = SEG_DOCUMENT;
491 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
495 if (!lex_uc_is_space (uc))
/* Handles state S_DOCUMENT_2: consumes the line end after a document line
   with segmenter_parse_newline__() and returns to S_DOCUMENT_1 for the next
   line.
   NOTE(review): the handling of a negative result and the return statement
   are elided from this listing. */
506 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
507 enum segment_type *type)
511 ofs = segmenter_parse_newline__ (input, n, type);
515 s->state = S_DOCUMENT_1;
/* Handles state S_DOCUMENT_3, reached from S_DOCUMENT_1 when end_cmd was set.
   Takes no input: reports SEG_END_COMMAND and resets the segmenter to
   S_GENERAL at the start of a command and line.
   NOTE(review): the return statement is elided from this listing. */
520 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
522 *type = SEG_END_COMMAND;
523 s->state = S_GENERAL;
524 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
/* Skips spaces and comments from OFS, then reports whether the character C
   found there is something other than a single quote, a double quote, a
   new-line, or a NUL.  Callers in this file use the result to decide whether
   to enter S_TITLE_1.
   NOTE(review): the assignment of C (presumably input[ofs]) and the handling
   of a failed skip are on elided lines. */
529 segmenter_unquoted (const char *input, size_t n, int ofs)
534 ofs = skip_spaces_and_comments (input, n, ofs);
539 return c != '\'' && c != '"' && c != '\n' && c != '\0';
/* Looks ahead in INPUT from OFS by running a local sub-segmenter (SUB, placed
   in S_GENERAL) over it; S is const here, so the caller's segmenter is not
   modified.  The visible copy stores the RETVAL bytes of the current segment
   into ID when RETVAL < ID_SIZE.  ID_SIZE must be positive (asserted).
   Callers in this file treat a negative return value as failure.
   NOTE(review): the loop, the rest of SUB's initialization, the action taken
   for each group of case labels below, and the return values are elided from
   this listing. */
543 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
544 int ofs, char id[], size_t id_size)
546 struct segmenter sub;
548 assert (id_size > 0);
551 sub.state = S_GENERAL;
555 enum segment_type type;
558 retval = segmenter_push (&sub, input + ofs, n - ofs, &type);
574 if (retval < id_size)
576 memcpy (id, input + ofs, retval);
583 case SEG_QUOTED_STRING:
585 case SEG_UNICODE_STRING:
586 case SEG_UNQUOTED_STRING:
587 case SEG_RESERVED_WORD:
589 case SEG_COMMENT_COMMAND:
590 case SEG_DO_REPEAT_COMMAND:
591 case SEG_INLINE_DATA:
592 case SEG_START_DOCUMENT:
594 case SEG_START_COMMAND:
595 case SEG_SEPARATE_COMMANDS:
596 case SEG_END_COMMAND:
598 case SEG_EXPECTED_QUOTE:
599 case SEG_EXPECTED_EXPONENT:
600 case SEG_UNEXPECTED_DOT:
601 case SEG_UNEXPECTED_CHAR:
/* Segments an identifier or reserved word in state S_GENERAL (asserted) and,
   when SS_START_OF_COMMAND is set, recognizes command names that need special
   segmentation.  Visible transitions at the start of a command:
     COMMENT (lex_id_match_n(), 4)  -> S_COMMENT_1, continue in
                                       segmenter_parse_comment_1__()
     DOCUMENT                       -> S_DOCUMENT_1, SEG_START_DOCUMENT
     TITLE, SUBTITLE                -> S_TITLE_1, after segmenter_unquoted()
     FILE followed by LABEL         -> S_FILE_LABEL
     DO followed by REPEAT          -> S_DO_REPEAT_1
     BEGIN followed by DATA         -> S_BEGIN_DATA_1 or S_BEGIN_DATA_2
   The word after FILE, DO, and BEGIN is obtained with next_id_in_command().
   NOTE(review): bounds checks, the conditions guarding several of these
   assignments, and all return values other than the COMMENT path are on
   lines elided from this listing. */
613 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
614 enum segment_type *type)
619 assert (s->state == S_GENERAL);
621 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
629 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
632 else if (!lex_uc_is_idn (uc))
/* Identifier ends in '.': check whether that '.' is the last thing on the
   line. */
638 if (input[ofs - 1] == '.')
640 int eol = at_end_of_line (input, n, ofs);
647 if (is_reserved_word (input, ofs))
648 *type = SEG_RESERVED_WORD;
650 *type = SEG_IDENTIFIER;
652 if (s->substate & SS_START_OF_COMMAND)
654 struct substring word = ss_buffer (input, ofs);
656 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
658 s->state = S_COMMENT_1;
659 return segmenter_parse_comment_1__ (s, input, n, type);
661 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
663 s->state = S_DOCUMENT_1;
664 *type = SEG_START_DOCUMENT;
667 else if (lex_id_match (ss_cstr ("TITLE"), word)
668 || lex_id_match (ss_cstr ("SUBTITLE"), word))
670 int result = segmenter_unquoted (input, n, ofs);
675 s->state = S_TITLE_1;
679 else if (lex_id_match (ss_cstr ("FILE"), word))
683 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
685 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
687 s->state = S_FILE_LABEL;
692 else if (lex_id_match (ss_cstr ("DO"), word))
696 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
698 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
700 s->state = S_DO_REPEAT_1;
705 else if (lex_id_match (ss_cstr ("BEGIN"), word))
710 ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id);
713 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
/* After DATA: step over spaces and comments, a '.' if present, and more
   spaces and comments, then test for the end of the line. */
717 ofs2 = skip_spaces_and_comments (input, n, ofs2);
721 if (input[ofs2] == '.')
723 ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1);
728 eol = is_end_of_line (input, n, ofs2);
/* S_BEGIN_DATA_1 when a new-line occurs within the first OFS2 bytes;
   the other assignment is presumably the else branch (elided). */
733 if (memchr (input, '\n', ofs2))
734 s->state = S_BEGIN_DATA_1;
736 s->state = S_BEGIN_DATA_2;
/* Segments a quoted string whose opening quote is at input[OFS].  The byte at
   that position is remembered as the quote character and later bytes are
   compared against it (there are two visible tests; presumably they tell a
   doubled quote from the closing quote -- confirm).  A new-line or NUL
   reached while scanning yields SEG_EXPECTED_QUOTE.
   NOTE(review): the scanning loop, the use of STRING_TYPE, and the return
   values are elided from this listing. */
749 segmenter_parse_string__ (enum segment_type string_type,
750 int ofs, struct segmenter *s,
751 const char *input, size_t n, enum segment_type *type)
753 int quote = input[ofs];
757 if (input[ofs] == quote)
762 else if (input[ofs] == quote)
771 else if (input[ofs] == '\n' || input[ofs] == '\0')
773 *type = SEG_EXPECTED_QUOTE;
/* Decides whether a leading letter introduces a prefixed string or an
   identifier: if input[1] is a single or double quote, the input is parsed
   as a string of STRING_TYPE whose quote is at offset 1; otherwise it is
   parsed as an identifier.
   NOTE(review): the first branch of the conditional (before the visible
   "else if") and one parameter line are elided from this listing. */
784 segmenter_maybe_parse_string__ (enum segment_type string_type,
786 const char *input, size_t n,
787 enum segment_type *type)
791 else if (input[1] == '\'' || input[1] == '"')
792 return segmenter_parse_string__ (string_type, 1, s, input, n, type);
794 return segmenter_parse_id__ (s, input, n, type);
/* Segments the next token in state S_GENERAL when not at the start of a line
   (both asserted).  Decodes the first character of INPUT and dispatches on
   it.  The visible handlers cover: comments (skip_comment() from offset 2
   when input[1] is '*'); single-byte punctuation; '*' at the start of a
   command, which begins a comment command; two-byte operators via
   segmenter_parse_digraph__(); '.', which may start a number, end the
   command, or be reported as SEG_UNEXPECTED_DOT; digits; prefixed and plain
   quoted strings; white space; identifiers; and, last, SEG_UNEXPECTED_CHAR.
   NOTE(review): most case labels, bounds checks, and return values are on
   lines elided from this listing, so which character selects each handler
   is not fully visible here. */
798 segmenter_parse_mid_command__ (struct segmenter *s,
799 const char *input, size_t n,
800 enum segment_type *type)
806 assert (s->state == S_GENERAL);
807 assert (!(s->substate & SS_START_OF_LINE));
809 mblen = segmenter_u8_to_uc__ (&uc, input, n);
816 s->substate |= SS_START_OF_LINE;
823 else if (input[1] == '*')
825 ofs = skip_comment (input, n, 2);
839 case '(': case ')': case ',': case '=': case '-':
840 case '[': case ']': case '&': case '|': case '+':
846 if (s->substate & SS_START_OF_COMMAND)
848 /* '*' at the beginning of a command begins a comment. */
849 s->state = S_COMMENT_1;
850 return segmenter_parse_comment_1__ (s, input, n, type);
/* Each call below names the bytes that may follow input[0] to form a
   two-byte token. */
853 return segmenter_parse_digraph__ ("*", s, input, n, type);
856 return segmenter_parse_digraph__ ("=>", s, input, n, type);
859 return segmenter_parse_digraph__ ("=", s, input, n, type);
862 return segmenter_parse_digraph__ ("=", s, input, n, type);
867 else if (c_isdigit (input[1]))
868 return segmenter_parse_number__ (s, input, n, type);
871 int eol = at_end_of_line (input, n, 1);
877 *type = SEG_END_COMMAND;
878 s->substate = SS_START_OF_COMMAND;
881 *type = SEG_UNEXPECTED_DOT;
886 case '0': case '1': case '2': case '3': case '4':
887 case '5': case '6': case '7': case '8': case '9':
888 return segmenter_parse_number__ (s, input, n, type);
891 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
895 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
899 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
903 if (lex_uc_is_space (uc))
905 ofs = skip_spaces (input, n, mblen);
909 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
913 s->substate |= SS_START_OF_LINE;
923 else if (lex_uc_is_id1 (uc))
924 return segmenter_parse_id__ (s, input, n, type);
927 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback passed to qsort() by
   segmenter_get_command_name_candidates().  A_ and B_ each point to an array
   element of type "const char *"; the result is that of c_strcasecmp() on A
   and B.
   NOTE(review): the assignments of A and B (presumably *AP and *BP) are on
   elided lines. */
935 compare_commands (const void *a_, const void *b_)
937 const char *const *ap = a_;
938 const char *const *bp = b_;
942 return c_strcasecmp (a, b);
/* Returns a pointer into a sorted table of command names, positioned at the
   first name whose initial letter, upper-cased, equals FIRST upper-cased.
   The table is built from "language/command.def", with DEF_CMD and UNIMPL_CMD
   each expanding to the command's NAME.  n_commands excludes the array's
   last element, and every index slot with no matching command points at
   that last element.
   NOTE(review): the table and index are function-local statics.  The guard
   that makes the sort and index construction run only once, and the content
   of the array's last element, are on lines elided from this listing. */
946 segmenter_get_command_name_candidates (unsigned char first)
948 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
949 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
950 static const char *commands[] =
952 #include "language/command.def"
955 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
/* cindex[c] locates the first command whose upper-cased initial is C. */
961 static const char **cindex[UCHAR_MAX + 1];
969 qsort (commands, n_commands, sizeof *commands, compare_commands);
970 for (i = 0; i < n_commands; i++)
972 unsigned char c = c_toupper (commands[i][0]);
973 if (cindex[c] == NULL)
974 cindex[c] = &commands[i];
/* Initials with no command point one past the sorted names. */
976 for (i = 0; i <= UCHAR_MAX; i++)
977 if (cindex[i] == NULL)
978 cindex[i] = &commands[n_commands];
981 return cindex[c_toupper (first)];
/* Tries to recognize a command name in INPUT.  Scans characters from OFS
   (white space, identifier characters, and '-' are the ones tested for),
   checks for a trailing '.', then walks the candidates returned by
   segmenter_get_command_name_candidates() while their initial letter matches
   input[0] case-insensitively, testing each with command_match() and
   requiring missing_words <= 0.
   NOTE(review): the candidate lookup and the text handed to command_match()
   both start at input[0], not at input[OFS]; confirm this is intended for
   callers that pass a nonzero OFS.  The loop bodies and return values are
   elided from this listing. */
985 segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
987 const char **commands;
1000 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1005 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1010 if (input[ofs - 1] == '.')
1013 for (commands = segmenter_get_command_name_candidates (input[0]);
1014 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1020 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1021 &exact, &missing_words)
1022 && missing_words <= 0)
/* Reports whether the text at OFS can begin a string.  For a character C of
   'x', 'X', 'u', or 'U', the visible return tests whether the following byte
   is a quote; otherwise the result is whether C is a single quote, a double
   quote, or a new-line.
   NOTE(review): the assignment of C (presumably input[ofs]) and any bounds
   checks against N are on elided lines. */
1030 is_start_of_string__ (const char *input, size_t n, int ofs)
1035 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1040 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1043 return c == '\'' || c == '"' || c == '\n';
/* Segments the beginning of a line in state S_GENERAL with SS_START_OF_LINE
   set (both asserted).  The visible paths are:
     - a leading character followed by spaces/comments, which is checked with
       is_start_of_string__() and otherwise reported as SEG_START_COMMAND;
     - a line of only white space, reported as SEG_SEPARATE_COMMANDS;
     - mode-dependent detection of a new command, reported as
       SEG_START_COMMAND (SEG_MODE_AUTO consults
       segmenter_detect_command_name__(); the remaining mode is asserted to
       be SEG_MODE_BATCH);
     - otherwise, the substate is set to SS_START_OF_COMMAND and the input is
       handed to segmenter_parse_mid_command__().
   NOTE(review): the case labels, the conditions between these paths, and
   most return values are on lines elided from this listing. */
1047 segmenter_parse_start_of_line__ (struct segmenter *s,
1048 const char *input, size_t n,
1049 enum segment_type *type)
1055 assert (s->state == S_GENERAL);
1056 assert (s->substate & SS_START_OF_LINE);
1058 mblen = segmenter_u8_to_uc__ (&uc, input, n);
1065 ofs = skip_spaces_and_comments (input, n, 1);
1070 int is_string = is_start_of_string__ (input, n, ofs);
1075 /* This is punctuation that may separate pieces of a string. */
1085 *type = SEG_START_COMMAND;
1086 s->substate = SS_START_OF_COMMAND;
1090 if (lex_uc_is_space (uc))
1092 int eol = at_end_of_line (input, n, 0);
1097 s->substate = SS_START_OF_COMMAND;
1098 *type = SEG_SEPARATE_COMMANDS;
1104 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1106 else if (s->mode == SEG_MODE_AUTO)
1108 int cmd = segmenter_detect_command_name__ (input, n, 0);
1115 assert (s->mode == SEG_MODE_BATCH);
1117 s->substate = SS_START_OF_COMMAND;
1118 *type = SEG_START_COMMAND;
/* Nothing special about this line start: replacing the substate clears
   SS_START_OF_LINE, as the mid-command parser asserts. */
1122 s->substate = SS_START_OF_COMMAND;
1123 return segmenter_parse_mid_command__ (s, input, n, type);
/* Handles state S_FILE_LABEL, entered after FILE when the next word is LABEL.
   Segments the next piece of INPUT with a temporary segmenter in S_GENERAL.
   When that piece is an identifier it is asserted to match LABEL, and
   segmenter_unquoted() is then consulted before the visible switch to
   S_TITLE_1.  The temporary segmenter's substate is copied back into S.
   NOTE(review): the rest of SUB's initialization, the handling of a negative
   result, and the return values are elided from this listing. */
1127 segmenter_parse_file_label__ (struct segmenter *s,
1128 const char *input, size_t n,
1129 enum segment_type *type)
1131 struct segmenter sub;
1135 sub.state = S_GENERAL;
1136 ofs = segmenter_push (&sub, input, n, type);
1140 else if (*type == SEG_IDENTIFIER)
1144 assert (lex_id_match (ss_cstr ("LABEL"),
1145 ss_buffer ((char *) input, ofs)));
1146 result = segmenter_unquoted (input, n, ofs);
1152 s->state = S_TITLE_1;
1160 s->substate = sub.substate;
/* Runs one segmenter_push() step on INPUT using a temporary segmenter that is
   in S_GENERAL and carries S's substate, then copies the resulting substate
   back into S.  The visible lines never assign S's state, so callers stay in
   their own state while ordinary syntax is segmented.
   NOTE(review): the rest of SUB's initialization and the return statement
   (presumably OFS) are on elided lines. */
1166 segmenter_subparse (struct segmenter *s,
1167 const char *input, size_t n, enum segment_type *type)
1169 struct segmenter sub;
1173 sub.state = S_GENERAL;
1174 sub.substate = s->substate;
1175 ofs = segmenter_push (&sub, input, n, type);
1176 s->substate = sub.substate;
/* Handles state S_DO_REPEAT_1, entered when DO REPEAT is recognized at the
   start of a command.  Segments INPUT as ordinary syntax through
   segmenter_subparse() and then moves on according to the segment found:
   SEG_START_COMMAND or SEG_SEPARATE_COMMANDS leads to S_DO_REPEAT_2, and
   SEG_END_COMMAND leads to S_DO_REPEAT_3.
   NOTE(review): the handling of a negative result, a statement between the
   SEG_END_COMMAND test and the state change, and the return are elided. */
1181 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1182 const char *input, size_t n,
1183 enum segment_type *type)
1185 int ofs = segmenter_subparse (s, input, n, type);
1189 if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
1190 s->state = S_DO_REPEAT_2;
1191 else if (*type == SEG_END_COMMAND)
1193 s->state = S_DO_REPEAT_3;
/* Handles state S_DO_REPEAT_2: keeps segmenting ordinary syntax through
   segmenter_subparse() and switches to S_DO_REPEAT_3 once a SEG_NEWLINE
   segment is found.
   NOTE(review): the handling of a negative result, a statement between the
   SEG_NEWLINE test and the state change, and the return are elided. */
1201 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1202 const char *input, size_t n,
1203 enum segment_type *type)
1205 int ofs = segmenter_subparse (s, input, n, type);
1209 if (*type == SEG_NEWLINE)
1211 s->state = S_DO_REPEAT_3;
/* Inspects one line of INPUT for a DO REPEAT or END REPEAT command.  A
   leading '+' or '-' is tested for; the first identifier is compared with DO
   and END, and when the next identifier matches REPEAT, DIRECTION is added to
   S's substate.  segmenter_parse_do_repeat_3__() leaves its state once that
   substate reaches 0, so in this state the substate presumably counts
   DO REPEAT nesting -- confirm.
   NOTE(review): the values given to DIRECTION, the initial OFS, and the
   return values are elided from this listing. */
1219 check_repeat_command (struct segmenter *s,
1220 const char *input, size_t n)
1227 if (input[ofs] == '+' || input[ofs] == '-')
1230 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1233 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1235 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1240 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1244 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1245 s->substate += direction;
/* Locates the first '\n' in the N bytes at INPUT with memchr().  When the
   line is empty (the new-line is at offset 0, or at offset 1 after a '\r'),
   *TYPE is set to SEG_NEWLINE.  The visible final return yields the offset
   of the new-line, less one if the byte before it is '\r'.
   NOTE(review): the result when no new-line is present and the value
   returned for an empty line are on elided lines. */
1250 segmenter_parse_full_line__ (const char *input, size_t n,
1251 enum segment_type *type)
1253 const char *newline = memchr (input, '\n', n);
1255 if (newline == NULL)
1259 int ofs = newline - input;
1260 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1262 *type = SEG_NEWLINE;
1266 return ofs - (input[ofs - 1] == '\r');
/* Handles state S_DO_REPEAT_3: processes whole lines.  Each line is measured
   with segmenter_parse_full_line__() and examined by check_repeat_command().
   If S's substate is then 0, the segmenter returns to S_GENERAL at the start
   of a command and line and re-dispatches the same input through
   segmenter_push(); otherwise the line is reported as
   SEG_DO_REPEAT_COMMAND.
   NOTE(review): input[ofs - 1] is read in the same condition that tests
   ofs < 0, so it is only evaluated for ofs >= 0; what happens for ofs == 0
   depends on elided lines.  The return values of the other branches are
   elided from this listing. */
1271 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1272 const char *input, size_t n,
1273 enum segment_type *type)
1277 ofs = segmenter_parse_full_line__ (input, n, type);
1278 if (ofs < 0 || input[ofs - 1] == '\n')
1280 else if (!check_repeat_command (s, input, n))
1282 else if (s->substate == 0)
1284 s->state = S_GENERAL;
1285 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1286 return segmenter_push (s, input, n, type);
1290 *type = SEG_DO_REPEAT_COMMAND;
/* Handles state S_BEGIN_DATA_1: segments ordinary syntax through
   segmenter_subparse() and advances to S_BEGIN_DATA_2 when a SEG_NEWLINE
   segment is found.
   NOTE(review): the handling of a negative result and the return statement
   are on elided lines. */
1296 segmenter_parse_begin_data_1__ (struct segmenter *s,
1297 const char *input, size_t n,
1298 enum segment_type *type)
1300 int ofs = segmenter_subparse (s, input, n, type);
1304 if (*type == SEG_NEWLINE)
1305 s->state = S_BEGIN_DATA_2;
/* Handles state S_BEGIN_DATA_2: segments ordinary syntax through
   segmenter_subparse() and advances to S_BEGIN_DATA_3 when a SEG_NEWLINE
   segment is found.
   NOTE(review): the handling of a negative result and the return statement
   are on elided lines. */
1311 segmenter_parse_begin_data_2__ (struct segmenter *s,
1312 const char *input, size_t n,
1313 enum segment_type *type)
1315 int ofs = segmenter_subparse (s, input, n, type);
1319 if (*type == SEG_NEWLINE)
1320 s->state = S_BEGIN_DATA_3;
/* Tests whether the N bytes at INPUT form an END DATA line.  The visible
   checks require the text to start with "END" (compared case-insensitively),
   examine the following characters for white space, require "DATA" (also
   case-insensitive) with at least 4 bytes remaining, and then examine the
   characters after it.
   NOTE(review): characters here are decoded with u8_mbtouc() directly rather
   than segmenter_u8_to_uc__().  The loops, the test preceding the final
   "else if", and the return values are elided from this listing. */
1326 is_end_data (const char *input, size_t n)
1328 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1334 if (n < 3 || c_strncasecmp (input, "END", 3))
1338 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1339 if (!lex_uc_is_space (uc))
1343 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1350 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1357 else if (!lex_uc_is_space (uc))
/* Handles state S_BEGIN_DATA_3: processes one whole line at a time.  The
   line is measured with segmenter_parse_full_line__().  If is_end_data()
   accepts it, the segmenter returns to S_GENERAL at the start of a command
   and line and re-dispatches the same input through segmenter_push().
   Otherwise the line is reported as SEG_INLINE_DATA, the state becomes
   S_BEGIN_DATA_4, and the return value is 0 when input[ofs - 1] is '\n' and
   OFS otherwise.
   NOTE(review): the first branch of the conditional, which precedes the
   visible "else if", is elided from this listing. */
1366 segmenter_parse_begin_data_3__ (struct segmenter *s,
1367 const char *input, size_t n,
1368 enum segment_type *type)
1372 ofs = segmenter_parse_full_line__ (input, n, type);
1375 else if (is_end_data (input, ofs))
1377 s->state = S_GENERAL;
1378 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1379 return segmenter_push (s, input, n, type);
1383 *type = SEG_INLINE_DATA;
1384 s->state = S_BEGIN_DATA_4;
1385 return input[ofs - 1] == '\n' ? 0 : ofs;
/* Handles state S_BEGIN_DATA_4: consumes the line end that follows a line of
   inline data with segmenter_parse_newline__() and goes back to
   S_BEGIN_DATA_3 for the next line.
   NOTE(review): the handling of a negative result and the return statement
   are on elided lines. */
1390 segmenter_parse_begin_data_4__ (struct segmenter *s,
1391 const char *input, size_t n,
1392 enum segment_type *type)
1396 ofs = segmenter_parse_newline__ (input, n, type);
1400 s->state = S_BEGIN_DATA_3;
/* Handles state S_TITLE_1: skips white space at the start of INPUT with
   skip_spaces() and moves to S_TITLE_2.
   NOTE(review): the handling of a negative result, the store to *TYPE, and
   the return statement are on elided lines. */
1405 segmenter_parse_title_1__ (struct segmenter *s,
1406 const char *input, size_t n,
1407 enum segment_type *type)
1411 ofs = skip_spaces (input, n, 0);
1414 s->state = S_TITLE_2;
/* Handles state S_TITLE_2: scans the rest of the line, decoding INPUT
   character by character, and reports it as SEG_UNQUOTED_STRING while
   returning to S_GENERAL.  The returned length is ENDCMD when ENDCMD >= 0 and
   the scan offset otherwise.
   NOTE(review): how ENDCMD is maintained, the substate stored on exit, and
   the condition that ends the scan are elided from this listing. */
1420 segmenter_parse_title_2__ (struct segmenter *s,
1421 const char *input, size_t n,
1422 enum segment_type *type)
1434 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1441 s->state = S_GENERAL;
1443 *type = SEG_UNQUOTED_STRING;
1444 return endcmd >= 0 ? endcmd : ofs;
1451 if (!lex_uc_is_space (uc))
1462 /* Returns the name of segment TYPE as a string. The caller must not modify
1463 or free the returned string.
1465 This is useful only for debugging and testing. */
1467 segment_type_to_string (enum segment_type type)
/* Each SEG_TYPE (NAME) expands to a case for SEG_NAME that returns the
   spelling of NAME without the SEG_ prefix. */
1471 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
/* Presumably reached only for values with no matching case -- the enclosing
   switch is elided from this listing. */
1475 return "unknown segment type";
1479 /* Initializes S as a segmenter with the given syntax MODE.
1481 A segmenter does not contain any external references, so nothing needs to be
1482 done to destroy one. For the same reason, segmenters may be copied with
1483 plain struct assignment (or memcpy). */
1485 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
/* Segmentation begins in S_SHBANG, the state handled by
   segmenter_parse_shbang__().  NOTE(review): the remaining member
   initializations are on elided lines. */
1487 s->state = S_SHBANG;
1492 /* Returns the mode passed to segmenter_init() for S.
   S is const and is not modified.  NOTE(review): the function body is
   elided from this listing. */
1494 segmenter_get_mode (const struct segmenter *s)
1499 /* Attempts to label a prefix of S's remaining input with a segment type. The
1500 caller supplies the first N bytes of the remaining input as INPUT, which
1501 must be a UTF-8 encoded string. The end of the input stream must be
1502 indicated by a null byte at the beginning of a line, that is, immediately
1503 following a new-line (or as the first byte of the input stream).
1505 The input may contain '\n' or '\r\n' line ends in any combination.
1507 If successful, returns the number of bytes in the segment at the beginning
1508 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1509 into *TYPE. The next call to segmenter_push() should not include those
1510 bytes as part of INPUT, because they have (figuratively) been consumed by
the segmenter.
1513 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1514 be determined. In this case segmenter_push() returns -1. The caller should
1515 obtain more input and then call segmenter_push() again with a larger N and
1516 repeat until the input is exhausted (which must be indicated as described
1517 above) or until a valid segment is returned. segmenter_push() will never
1518 return -1 when the end of input is visible within INPUT.
1520 The caller must not, in a sequence of calls, supply contradictory input.
1521 That is, bytes provided as part of INPUT in one call, but not consumed, must
1522 not be provided with *different* values on subsequent calls. This is
1523 because segmenter_push() must often make decisions based on looking ahead
1524 beyond the bytes that it consumes. */
1526 segmenter_push (struct segmenter *s, const char *input, size_t n,
1527 enum segment_type *type)
/* A null byte as the first byte of INPUT is handled before the per-state
   dispatch below.  NOTE(review): the handling itself, and most of the case
   labels of the dispatch, are on lines elided from this listing. */
1532 if (input[0] == '\0')
1541 return segmenter_parse_shbang__ (s, input, n, type);
/* In ordinary syntax, SS_START_OF_LINE selects the start-of-line parser. */
1544 return (s->substate & SS_START_OF_LINE
1545 ? segmenter_parse_start_of_line__ (s, input, n, type)
1546 : segmenter_parse_mid_command__ (s, input, n, type));
1549 return segmenter_parse_comment_1__ (s, input, n, type);
1551 return segmenter_parse_comment_2__ (s, input, n, type);
1554 return segmenter_parse_document_1__ (s, input, n, type);
1556 return segmenter_parse_document_2__ (s, input, n, type);
1558 return segmenter_parse_document_3__ (s, type);
1561 return segmenter_parse_file_label__ (s, input, n, type);
1564 return segmenter_parse_do_repeat_1__ (s, input, n, type);
1566 return segmenter_parse_do_repeat_2__ (s, input, n, type);
1568 return segmenter_parse_do_repeat_3__ (s, input, n, type);
1570 case S_BEGIN_DATA_1:
1571 return segmenter_parse_begin_data_1__ (s, input, n, type);
1572 case S_BEGIN_DATA_2:
1573 return segmenter_parse_begin_data_2__ (s, input, n, type);
1574 case S_BEGIN_DATA_3:
1575 return segmenter_parse_begin_data_3__ (s, input, n, type);
1576 case S_BEGIN_DATA_4:
1577 return segmenter_parse_begin_data_4__ (s, input, n, type);
1580 return segmenter_parse_title_1__ (s, input, n, type);
1582 return segmenter_parse_title_2__ (s, input, n, type);
1588 /* Returns the style of command prompt to display to an interactive user for
1589 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1590 and at the beginning of a line (that is, if segmenter_push() consumed as
1591 much as possible of the input up to a new-line). */
1593 segmenter_get_prompt (const struct segmenter *s)
/* NOTE(review): most case labels of the per-state switch are on lines elided
   from this listing, so the state that selects each return below is only
   partly visible.  Two of the returns choose between PROMPT_FIRST and
   PROMPT_LATER according to SS_START_OF_COMMAND in the substate. */
1598 return PROMPT_FIRST;
1601 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1605 return PROMPT_COMMENT;
1609 return PROMPT_DOCUMENT;
1611 return PROMPT_FIRST;
1614 return PROMPT_LATER;
1618 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1620 return PROMPT_DO_REPEAT;
1622 case S_BEGIN_DATA_1:
1623 return PROMPT_FIRST;
1624 case S_BEGIN_DATA_2:
1625 return PROMPT_LATER;
1626 case S_BEGIN_DATA_3:
1627 case S_BEGIN_DATA_4:
1632 return PROMPT_FIRST;