1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
/* Bit flags that this file ORs into, and tests against, a segmenter's
   substate field. */
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration of a function that is called before its definition. */
56 static int segmenter_detect_command_name__ (const char *input,
/* Decodes the first UTF-8 character in the N bytes at INPUT_ into *PUC.
   u8_mbtoucr() is tried first and its result is returned whenever it is
   nonnegative; u8_mbtouc() is the final alternative of the conditional. */
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n)
/* The libunistring decoders take uint8_t pointers, hence the cast. */
62 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
67 mblen = u8_mbtoucr (puc, input, n);
68 return (mblen >= 0 ? mblen
70 : u8_mbtouc (puc, input, n));
/* Handles S_SHBANG, the state that segmenter_init() puts a segmenter in.
   When the second byte of INPUT is '!', the line is scanned from offset 2 up
   to its '\n' (with a check for a preceding '\r').  The last path visible
   below switches to S_GENERAL, marks the start of a line and of a command,
   and hands the same input back to segmenter_push(). */
74 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
75 enum segment_type *type)
81 else if (input[1] == '!')
/* Look for the end of the line; offsets 0 and 1 were already examined. */
85 for (ofs = 2; ofs < n; ofs++)
86 if (input[ofs] == '\n')
88 if (input[ofs - 1] == '\r')
92 s->substate = SS_START_OF_COMMAND;
/* Re-dispatch the unchanged input under the general state. */
101 s->state = S_GENERAL;
102 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
103 return segmenter_push (s, input, n, type);
/* Measures a punctuation token that may be one or two characters long.
   Returns 2 if INPUT[1] is one of the characters in SECONDS, otherwise 1.
   Only valid in S_GENERAL, as the assertion checks. */
107 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
108 const char *input, size_t n,
109 enum segment_type *type)
111 assert (s->state == S_GENERAL);
/* The explicit '\0' test is needed because strchr() would otherwise match the
   null terminator of SECONDS. */
118 return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1;
/* Scans the N bytes of INPUT forward from OFS.  The visible tests look for a
   new-line and for a '*' whose following byte is '/', which is the pair that
   closes a comment. */
122 skip_comment (const char *input, size_t n, size_t ofs)
124 for (; ofs < n; ofs++)
126 if (input[ofs] == '\n')
128 else if (input[ofs] == '*')
132 else if (input[ofs + 1] == '/')
/* Starting at OFS, steps over white space other than new-line and over
   comments.  A comment is recognized by a second byte of '*' and its body is
   skipped with skip_comment(). */
140 skip_spaces_and_comments (const char *input, size_t n, int ofs)
147 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
155 else if (input[ofs + 1] != '*')
/* Resume scanning after the two-byte comment opener. */
158 ofs = skip_comment (input, n, ofs + 2);
/* A new-line is white space too, but it is never skipped here. */
162 else if (lex_uc_is_space (uc) && uc != '\n')
/* Examines the byte at offset OFS of INPUT for a line end: either '\n', or a
   '\r' that counts only when the byte after it is '\n'. */
172 is_end_of_line (const char *input, size_t n, int ofs)
174 if (input[ofs] == '\n')
176 else if (input[ofs] == '\r')
180 return input[ofs + 1] == '\n';
/* Like is_end_of_line(), except that spaces and comments starting at OFS are
   skipped before the test is made. */
187 at_end_of_line (const char *input, size_t n, int ofs)
189 ofs = skip_spaces_and_comments (input, n, ofs);
193 return is_end_of_line (input, n, ofs);
/* Handles a line end at the very start of INPUT.  A bare '\n' is one form;
   the only other form accepted is '\r' followed by '\n', which the assertions
   enforce. */
198 segmenter_parse_newline__ (const char *input, size_t n,
199 enum segment_type *type)
203 if (input[0] == '\n')
210 assert (input[0] == '\r');
211 assert (input[1] == '\n');
/* Walks INPUT from OFS one decoded character at a time.  The visible test
   singles out the first character that is not white space, or that is a
   new-line. */
220 skip_spaces (const char *input, size_t n, size_t ofs)
227 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc) || uc == '\n')
/* Walks INPUT from OFS looking for the first byte that is not an ASCII digit
   (per c_isdigit()), without going past N bytes.

   NOTE(review): OFS is an int while N is a size_t, so "ofs < n" compares
   after converting OFS to an unsigned type; confirm that callers never pass a
   negative OFS. */
241 skip_digits (const char *input, size_t n, int ofs)
243 for (; ofs < n; ofs++)
244 if (!c_isdigit (input[ofs]))
/* Parses a number at the start of INPUT while in S_GENERAL.  The pieces are
   handled in order: leading digits, an optional '.' with more digits, and an
   optional exponent introduced by 'e' or 'E' with an optional sign. */
250 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
251 enum segment_type *type)
255 assert (s->state == S_GENERAL);
/* Integer part. */
257 ofs = skip_digits (input, n, 0);
/* Fractional part. */
261 if (input[ofs] == '.')
263 ofs = skip_digits (input, n, ofs + 1);
/* Exponent. */
270 if (input[ofs] == 'e' || input[ofs] == 'E')
276 if (input[ofs] == '+' || input[ofs] == '-')
/* An exponent marker with no digit after it is flagged with its own segment
   type. */
283 if (!c_isdigit (input[ofs]))
285 *type = SEG_EXPECTED_EXPONENT;
290 ofs = skip_digits (input, n, ofs);
/* If the number ends in '.', check whether that '.' is also the last thing on
   the line apart from spaces and comments. */
295 if (input[ofs - 1] == '.')
297 int eol = at_end_of_line (input, n, ofs);
/* Tests the name at S, of length N, against the reserved words, ignoring
   case (each character is upper-cased with c_toupper() before comparing).
   The three visible groups compare two, three, and four leading characters
   respectively. */
310 is_reserved_word (const char *s, int n)
314 s0 = c_toupper (s[0]);
/* Two-letter words: BY, EQ, GE, GT, LE, LT, NE, OR, TO. */
318 s1 = c_toupper (s[1]);
319 return ((s0 == 'B' && s1 == 'Y')
320 || (s0 == 'E' && s1 == 'Q')
321 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
322 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
323 || (s0 == 'N' && s1 == 'E')
324 || (s0 == 'O' && s1 == 'R')
325 || (s0 == 'T' && s1 == 'O'));
/* Three-letter words: ALL, AND, NOT. */
328 s1 = c_toupper (s[1]);
329 s2 = c_toupper (s[2]);
330 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
331 || (s1 == 'N' && s2 == 'D')))
332 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
/* Four-letter word: WITH. */
335 s1 = c_toupper (s[1]);
336 s2 = c_toupper (s[2]);
337 s3 = c_toupper (s[3]);
338 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
/* Handles S_COMMENT_1: scans one line of a comment command, decoding it one
   character at a time.  As the comments below spell out, there are three
   outcomes: a blank line ends the comment (SEG_SEPARATE_COMMANDS), a '.' at
   the end of the line ends it (SEG_COMMENT_COMMAND, back to S_GENERAL), or
   the comment carries on to the next line (SEG_COMMENT_COMMAND, then
   S_COMMENT_2). */
346 segmenter_parse_comment_1__ (struct segmenter *s,
347 const char *input, size_t n,
348 enum segment_type *type)
360 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
/* Account for a '\r' that immediately precedes the current offset. */
371 if (ofs > 1 && input[ofs - 1] == '\r')
376 /* Blank line ends comment command. */
377 s->state = S_GENERAL;
378 s->substate = SS_START_OF_COMMAND;
379 *type = SEG_SEPARATE_COMMANDS;
382 else if (endcmd >= 0)
384 /* '.' at end of line ends comment command. */
385 s->state = S_GENERAL;
387 *type = SEG_COMMENT_COMMAND;
392 /* Comment continues onto next line. */
393 *type = SEG_COMMENT_COMMAND;
394 s->state = S_COMMENT_2;
400 if (!lex_uc_is_space (uc))
/* Handles S_COMMENT_2: consumes the new-line after a comment line and decides
   what the next line is.  The segmenter ends up either in S_GENERAL at the
   start of a line and command, or back in S_COMMENT_1.  The decision looks at
   the first character after the new-line and, on one path, asks
   segmenter_detect_command_name__() whether a command name starts there. */
411 segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n,
412 enum segment_type *type)
419 ofs = segmenter_parse_newline__ (input, n, type);
/* The character after the new-line is needed, so more input may be
   required. */
420 if (ofs < 0 || ofs >= n)
423 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
427 if (uc == '+' || uc == '-' || uc == '.')
429 else if (!lex_uc_is_space (uc))
432 case SEG_MODE_INTERACTIVE:
441 new_cmd = segmenter_detect_command_name__ (input, n, ofs);
449 s->state = S_GENERAL;
450 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
453 s->state = S_COMMENT_1;
/* Handles S_DOCUMENT_1: scans one line of a DOCUMENT command and labels it
   SEG_DOCUMENT.  The next state is S_DOCUMENT_3 if END_CMD was set for this
   line and S_DOCUMENT_2 otherwise. */
458 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
459 enum segment_type *type)
471 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
/* Account for a '\r' that immediately precedes the current offset. */
482 if (ofs > 1 && input[ofs - 1] == '\r')
485 *type = SEG_DOCUMENT;
486 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
490 if (!lex_uc_is_space (uc))
/* Handles S_DOCUMENT_2: parses the new-line between two document lines with
   segmenter_parse_newline__(), then goes back to S_DOCUMENT_1. */
501 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
502 enum segment_type *type)
506 ofs = segmenter_parse_newline__ (input, n, type);
510 s->state = S_DOCUMENT_1;
/* Handles S_DOCUMENT_3: emits SEG_END_COMMAND and returns the segmenter to
   S_GENERAL at the start of a command and of a line.  This function takes no
   input buffer. */
515 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
517 *type = SEG_END_COMMAND;
518 s->state = S_GENERAL;
519 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
/* After skipping spaces and comments from OFS, reports whether the byte C
   examined there (presumably INPUT[OFS]; verify) is something other than a
   quote character, a new-line, or a null byte. */
524 segmenter_unquoted (const char *input, size_t n, int ofs)
529 ofs = skip_spaces_and_comments (input, n, ofs);
534 return c != '\'' && c != '"' && c != '\n' && c != '\0';
/* Looks ahead in INPUT from OFS using a private segmenter, SUB, so that S
   itself is not modified (S is const).  SUB is run in S_GENERAL through
   segmenter_push(), and each segment's type selects one of the case groups
   below.  On one path the segment's text is copied into ID, provided that it
   fits in the ID_SIZE bytes available. */
538 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
539 int ofs, char id[], size_t id_size)
541 struct segmenter sub;
/* ID must have room for at least a null terminator. */
543 assert (id_size > 0);
546 sub.state = S_GENERAL;
550 enum segment_type type;
553 retval = segmenter_push (&sub, input + ofs, n - ofs, &type);
/* Strictly less than ID_SIZE leaves room for one extra byte after the copy. */
569 if (retval < id_size)
571 memcpy (id, input + ofs, retval);
578 case SEG_QUOTED_STRING:
580 case SEG_UNICODE_STRING:
581 case SEG_UNQUOTED_STRING:
582 case SEG_RESERVED_WORD:
584 case SEG_COMMENT_COMMAND:
585 case SEG_DO_REPEAT_COMMAND:
586 case SEG_INLINE_DATA:
587 case SEG_START_DOCUMENT:
589 case SEG_START_COMMAND:
590 case SEG_SEPARATE_COMMANDS:
591 case SEG_END_COMMAND:
593 case SEG_EXPECTED_QUOTE:
594 case SEG_EXPECTED_EXPONENT:
595 case SEG_UNEXPECTED_DOT:
596 case SEG_UNEXPECTED_CHAR:
/* Parses an identifier at the start of INPUT while in S_GENERAL and labels it
   SEG_RESERVED_WORD or SEG_IDENTIFIER.  When the segmenter is at the start of
   a command, the word is also matched against the commands that need special
   segmenting (COMMENT, DOCUMENT, TITLE, SUBTITLE, FILE LABEL, DO REPEAT, and
   BEGIN DATA), and S->state is changed to the matching state. */
608 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
609 enum segment_type *type)
614 assert (s->state == S_GENERAL);
616 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
624 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
627 else if (!lex_uc_is_idn (uc))
/* A trailing '.' requires a look at what follows it on the line. */
633 if (input[ofs - 1] == '.')
635 int eol = at_end_of_line (input, n, ofs);
642 if (is_reserved_word (input, ofs))
643 *type = SEG_RESERVED_WORD;
645 *type = SEG_IDENTIFIER;
647 if (s->substate & SS_START_OF_COMMAND)
649 struct substring word = ss_buffer (input, ofs);
/* NOTE(review): the 4 is presumably the minimum abbreviation length accepted
   for COMMENT; confirm against lex_id_match_n(). */
651 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
653 s->state = S_COMMENT_1;
654 return segmenter_parse_comment_1__ (s, input, n, type);
656 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
658 s->state = S_DOCUMENT_1;
659 *type = SEG_START_DOCUMENT;
662 else if (lex_id_match (ss_cstr ("TITLE"), word)
663 || lex_id_match (ss_cstr ("SUBTITLE"), word))
665 int result = segmenter_unquoted (input, n, ofs);
670 s->state = S_TITLE_1;
/* The remaining commands are two words long, so the next identifier in the
   command is fetched for comparison. */
674 else if (lex_id_match (ss_cstr ("FILE"), word))
678 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
680 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
682 s->state = S_FILE_LABEL;
687 else if (lex_id_match (ss_cstr ("DO"), word))
691 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
693 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
695 s->state = S_DO_REPEAT_1;
700 else if (lex_id_match (ss_cstr ("BEGIN"), word))
705 ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id);
708 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
/* After DATA, spaces and comments are skipped, then a '.' if there is one,
   and the line-end test is made at the resulting offset. */
712 ofs2 = skip_spaces_and_comments (input, n, ofs2);
716 if (input[ofs2] == '.')
718 ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1);
723 eol = is_end_of_line (input, n, ofs2);
/* Which BEGIN DATA state to enter depends on whether a new-line occurs within
   the first OFS2 bytes. */
728 if (memchr (input, '\n', ofs2))
729 s->state = S_BEGIN_DATA_1;
731 s->state = S_BEGIN_DATA_2;
/* Parses a quoted string whose opening quote character is at INPUT[OFS].
   That same character is then searched for as the closing quote.  Reaching a
   new-line or a null byte first yields SEG_EXPECTED_QUOTE. */
744 segmenter_parse_string__ (enum segment_type string_type,
745 int ofs, struct segmenter *s,
746 const char *input, size_t n, enum segment_type *type)
/* Remember which of the two quote characters opened the string. */
748 int quote = input[ofs];
752 if (input[ofs] == quote)
757 else if (input[ofs] == quote)
766 else if (input[ofs] == '\n' || input[ofs] == '\0')
768 *type = SEG_EXPECTED_QUOTE;
/* Handles a letter that may be a string prefix.  If a quote character
   follows it, the input is parsed as a string of type STRING_TYPE whose
   opening quote is at offset 1.  The other path visible below parses the
   input as an identifier. */
779 segmenter_maybe_parse_string__ (enum segment_type string_type,
781 const char *input, size_t n,
782 enum segment_type *type)
786 else if (input[1] == '\'' || input[1] == '"')
787 return segmenter_parse_string__ (string_type, 1, s, input, n, type);
789 return segmenter_parse_id__ (s, input, n, type);
/* Parses one segment in S_GENERAL when the segmenter is not at the start of a
   line (both conditions are asserted).  The first character decides what is
   parsed: comments, punctuation, one- or two-character operators via
   segmenter_parse_digraph__(), numbers, strings, white space, or identifiers.
   SEG_UNEXPECTED_CHAR is the last segment type assigned below. */
793 segmenter_parse_mid_command__ (struct segmenter *s,
794 const char *input, size_t n,
795 enum segment_type *type)
801 assert (s->state == S_GENERAL);
802 assert (!(s->substate & SS_START_OF_LINE));
804 mblen = segmenter_u8_to_uc__ (&uc, input, n);
811 s->substate |= SS_START_OF_LINE;
818 else if (input[1] == '*')
/* Skip the comment body, starting after the two-byte opener. */
820 ofs = skip_comment (input, n, 2);
834 case '(': case ')': case ',': case '=': case '-':
835 case '[': case ']': case '&': case '|': case '+':
841 if (s->substate & SS_START_OF_COMMAND)
843 /* '*' at the beginning of a command begins a comment. */
844 s->state = S_COMMENT_1;
845 return segmenter_parse_comment_1__ (s, input, n, type);
/* Each call names the characters that may form the second half of a
   two-character operator. */
848 return segmenter_parse_digraph__ ("*", s, input, n, type);
851 return segmenter_parse_digraph__ ("=>", s, input, n, type);
854 return segmenter_parse_digraph__ ("=", s, input, n, type);
857 return segmenter_parse_digraph__ ("=", s, input, n, type);
/* If a digit follows, the input is parsed as a number.  On the other path,
   at_end_of_line() is consulted, and the segment types assigned are
   SEG_END_COMMAND and SEG_UNEXPECTED_DOT. */
862 else if (c_isdigit (input[1]))
863 return segmenter_parse_number__ (s, input, n, type);
866 int eol = at_end_of_line (input, n, 1);
872 *type = SEG_END_COMMAND;
873 s->substate = SS_START_OF_COMMAND;
876 *type = SEG_UNEXPECTED_DOT;
881 case '0': case '1': case '2': case '3': case '4':
882 case '5': case '6': case '7': case '8': case '9':
883 return segmenter_parse_number__ (s, input, n, type);
886 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
890 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
894 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
898 if (lex_uc_is_space (uc))
900 ofs = skip_spaces (input, n, mblen);
/* When the white space stopped between a '\r' and a '\n', the '\r' is part of
   the line end that follows. */
904 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
908 s->substate |= SS_START_OF_LINE;
918 else if (lex_uc_is_id1 (uc))
919 return segmenter_parse_id__ (s, input, n, type);
922 *type = SEG_UNEXPECTED_CHAR;
/* Comparison function for qsort() over an array of string pointers: A_ and B_
   each point to a "const char *" element, and the strings are ordered without
   regard to case by c_strcasecmp(). */
930 compare_commands (const void *a_, const void *b_)
932 const char *const *ap = a_;
933 const char *const *bp = b_;
937 return c_strcasecmp (a, b);
/* Returns a pointer into the table of command names, positioned at the first
   name whose initial character, upper-cased, equals FIRST upper-cased.  An
   initial character that no command starts with maps to the entry at index
   n_commands.

   The table is generated from language/command.def: both DEF_CMD and
   UNIMPL_CMD expand to just the command's name.  It is sorted without regard
   to case with qsort(), and CINDEX records, for each possible byte value,
   where names starting with that byte begin. */
941 segmenter_get_command_name_candidates (unsigned char first)
943 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
944 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
945 static const char *commands[] =
947 #include "language/command.def"
/* NOTE(review): n_commands leaves out the final array element, presumably a
   sentinel entry; confirm against the initializer. */
950 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
956 static const char **cindex[UCHAR_MAX + 1];
964 qsort (commands, n_commands, sizeof *commands, compare_commands);
/* Record the first name for each initial character; later names with the same
   initial do not overwrite it. */
965 for (i = 0; i < n_commands; i++)
967 unsigned char c = c_toupper (commands[i][0]);
968 if (cindex[c] == NULL)
969 cindex[c] = &commands[i];
/* Point every unused slot at the entry just past the sorted names so that no
   slot is left null. */
971 for (i = 0; i <= UCHAR_MAX; i++)
972 if (cindex[i] == NULL)
973 cindex[i] = &commands[n_commands];
976 return cindex[c_toupper (first)];
/* Tries to recognize a command name in INPUT.  The scan from OFS covers
   characters that are white space, identifier characters, or '-'.  The
   scanned text is then compared, with command_match(), against each candidate
   command name that shares INPUT[0]'s first letter without regard to case.  A
   candidate is accepted only if command_match() succeeds with
   missing_words <= 0. */
980 segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
982 const char **commands;
995 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1000 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
/* A '.' immediately before the stopping point gets separate handling. */
1005 if (input[ofs - 1] == '.')
/* Candidates are visited in table order for as long as their first letter
   still matches INPUT[0]. */
1008 for (commands = segmenter_get_command_name_candidates (input[0]);
1009 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1015 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1016 &exact, &missing_words)
1017 && missing_words <= 0)
/* Tests whether a string could begin at offset OFS of INPUT.  When the byte C
   examined there is 'x', 'X', 'u', or 'U', the answer is whether a quote
   character follows it.  Otherwise C itself must be a quote character or a
   new-line. */
1025 is_start_of_string__ (const char *input, size_t n, int ofs)
1030 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1035 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1038 return c == '\'' || c == '"' || c == '\n';
/* Parses one segment in S_GENERAL when the segmenter is at the start of a
   line (both conditions are asserted).  It decides whether the line starts a
   new command, which is reported as SEG_START_COMMAND or
   SEG_SEPARATE_COMMANDS.  The final path visible below keeps only
   SS_START_OF_COMMAND in S->substate and defers to
   segmenter_parse_mid_command__(). */
1042 segmenter_parse_start_of_line__ (struct segmenter *s,
1043 const char *input, size_t n,
1044 enum segment_type *type)
1050 assert (s->state == S_GENERAL);
1051 assert (s->substate & SS_START_OF_LINE);
1053 mblen = segmenter_u8_to_uc__ (&uc, input, n);
/* Look beyond the first byte, past any spaces and comments, to see whether a
   string comes next. */
1060 ofs = skip_spaces_and_comments (input, n, 1);
1065 int is_string = is_start_of_string__ (input, n, ofs);
1070 /* This is punctuation that may separate pieces of a string. */
1080 *type = SEG_START_COMMAND;
1081 s->substate = SS_START_OF_COMMAND;
/* A line that starts with white space is checked for being blank apart from
   spaces and comments. */
1085 if (lex_uc_is_space (uc))
1087 int eol = at_end_of_line (input, n, 0);
1092 s->substate = SS_START_OF_COMMAND;
1093 *type = SEG_SEPARATE_COMMANDS;
/* The treatment of a line that starts in the first column depends on the
   syntax mode.  Only SEG_MODE_AUTO tries to detect a command name. */
1099 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1101 else if (s->mode == SEG_MODE_AUTO)
1103 int cmd = segmenter_detect_command_name__ (input, n, 0);
1110 assert (s->mode == SEG_MODE_BATCH);
1112 s->substate = SS_START_OF_COMMAND;
1113 *type = SEG_START_COMMAND;
1117 s->substate = SS_START_OF_COMMAND;
1118 return segmenter_parse_mid_command__ (s, input, n, type);
/* Handles S_FILE_LABEL.  The next segment is parsed with a private segmenter,
   SUB, running in S_GENERAL.  When that segment is an identifier it is
   asserted to be LABEL; segmenter_unquoted() is then consulted and S_TITLE_1
   is one resulting state.  SUB's substate is copied back into S. */
1122 segmenter_parse_file_label__ (struct segmenter *s,
1123 const char *input, size_t n,
1124 enum segment_type *type)
1126 struct segmenter sub;
1130 sub.state = S_GENERAL;
1131 ofs = segmenter_push (&sub, input, n, type);
1135 else if (*type == SEG_IDENTIFIER)
/* S_FILE_LABEL is entered only after FILE LABEL has been seen, so the
   identifier here has to be LABEL. */
1139 assert (lex_id_match (ss_cstr ("LABEL"),
1140 ss_buffer ((char *) input, ofs)));
1141 result = segmenter_unquoted (input, n, ofs);
1147 s->state = S_TITLE_1;
1155 s->substate = sub.substate;
/* Parses one segment as if S were in S_GENERAL, without changing S->state.
   A private copy, SUB, gets state S_GENERAL and S's substate, is passed to
   segmenter_push(), and its resulting substate is copied back into S. */
1161 segmenter_subparse (struct segmenter *s,
1162 const char *input, size_t n, enum segment_type *type)
1164 struct segmenter sub;
1168 sub.state = S_GENERAL;
1169 sub.substate = s->substate;
1170 ofs = segmenter_push (&sub, input, n, type);
1171 s->substate = sub.substate;
/* Handles S_DO_REPEAT_1 by parsing ordinary syntax with segmenter_subparse().
   A segment that starts or separates commands moves to S_DO_REPEAT_2.  A
   SEG_END_COMMAND segment leads to S_DO_REPEAT_3. */
1176 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1177 const char *input, size_t n,
1178 enum segment_type *type)
1180 int ofs = segmenter_subparse (s, input, n, type);
1184 if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
1185 s->state = S_DO_REPEAT_2;
1186 else if (*type == SEG_END_COMMAND)
1188 s->state = S_DO_REPEAT_3;
/* Handles S_DO_REPEAT_2 by parsing ordinary syntax with segmenter_subparse()
   until a SEG_NEWLINE segment is seen, after which the state becomes
   S_DO_REPEAT_3. */
1196 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1197 const char *input, size_t n,
1198 enum segment_type *type)
1200 int ofs = segmenter_subparse (s, input, n, type);
1204 if (*type == SEG_NEWLINE)
1206 s->state = S_DO_REPEAT_3;
/* Examines a line inside a DO REPEAT body.  After an optional leading '+' or
   '-', the first identifier is compared with DO and END, and the identifier
   after it with REPEAT.  On a match, S->substate is adjusted by DIRECTION, so
   in this state the substate is used as a counter and not as SS_* flags. */
1214 check_repeat_command (struct segmenter *s,
1215 const char *input, size_t n)
1222 if (input[ofs] == '+' || input[ofs] == '-')
1225 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1228 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1230 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1235 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1239 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1240 s->substate += direction;
/* Finds the first '\n' within the N bytes of INPUT.  If the line holds
   nothing but its line end ('\n' or '\r' '\n'), *TYPE is set to SEG_NEWLINE.
   Otherwise the result is the line's length with any '\r' just before the
   '\n' left out. */
1245 segmenter_parse_full_line__ (const char *input, size_t n,
1246 enum segment_type *type)
1248 const char *newline = memchr (input, '\n', n);
/* With no new-line in view, the extent of the line is not yet known. */
1250 if (newline == NULL)
1254 int ofs = newline - input;
1255 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1257 *type = SEG_NEWLINE;
1261 return ofs - (input[ofs - 1] == '\r');
/* Handles S_DO_REPEAT_3, a whole line at a time.  Each line is obtained with
   segmenter_parse_full_line__() and examined by check_repeat_command().  When
   S->substate reaches 0, the segmenter returns to S_GENERAL at the start of a
   command and line and passes the same line back to segmenter_push().  The
   other segment type assigned below is SEG_DO_REPEAT_COMMAND. */
1266 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1267 const char *input, size_t n,
1268 enum segment_type *type)
1272 ofs = segmenter_parse_full_line__ (input, n, type);
1273 if (ofs < 0 || input[ofs - 1] == '\n')
1275 else if (!check_repeat_command (s, input, n))
1277 else if (s->substate == 0)
1279 s->state = S_GENERAL;
1280 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1281 return segmenter_push (s, input, n, type);
1285 *type = SEG_DO_REPEAT_COMMAND;
/* Handles S_BEGIN_DATA_1 by parsing ordinary syntax with segmenter_subparse()
   and moving to S_BEGIN_DATA_2 once a SEG_NEWLINE segment is seen. */
1291 segmenter_parse_begin_data_1__ (struct segmenter *s,
1292 const char *input, size_t n,
1293 enum segment_type *type)
1295 int ofs = segmenter_subparse (s, input, n, type);
1299 if (*type == SEG_NEWLINE)
1300 s->state = S_BEGIN_DATA_2;
/* Handles S_BEGIN_DATA_2 by parsing ordinary syntax with segmenter_subparse()
   and moving to S_BEGIN_DATA_3 once a SEG_NEWLINE segment is seen. */
1306 segmenter_parse_begin_data_2__ (struct segmenter *s,
1307 const char *input, size_t n,
1308 enum segment_type *type)
1310 int ofs = segmenter_subparse (s, input, n, type);
1314 if (*type == SEG_NEWLINE)
1315 s->state = S_BEGIN_DATA_3;
/* Checks whether the N bytes at INPUT form an END DATA line: "END" in any
   case, then white space, then "DATA" in any case.  What follows "DATA" is
   examined one decoded character at a time, with a test for white space. */
1321 is_end_data (const char *input, size_t n)
1323 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
/* c_strncasecmp() returns nonzero on a mismatch. */
1329 if (n < 3 || c_strncasecmp (input, "END", 3))
1333 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1334 if (!lex_uc_is_space (uc))
1338 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1345 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1352 else if (!lex_uc_is_space (uc))
/* Handles S_BEGIN_DATA_3, a whole line at a time.  A line that
   is_end_data() recognizes puts the segmenter back in S_GENERAL at the start
   of a command and line, and the line is passed back to segmenter_push().
   Any other line is labeled SEG_INLINE_DATA and is followed by
   S_BEGIN_DATA_4. */
1361 segmenter_parse_begin_data_3__ (struct segmenter *s,
1362 const char *input, size_t n,
1363 enum segment_type *type)
1367 ofs = segmenter_parse_full_line__ (input, n, type);
1370 else if (is_end_data (input, ofs))
1372 s->state = S_GENERAL;
1373 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1374 return segmenter_push (s, input, n, type);
1378 *type = SEG_INLINE_DATA;
1379 s->state = S_BEGIN_DATA_4;
/* The data segment is 0 bytes long when the byte before OFS is already the
   '\n'. */
1380 return input[ofs - 1] == '\n' ? 0 : ofs;
/* Handles S_BEGIN_DATA_4: parses the new-line that ends a line of inline data
   with segmenter_parse_newline__(), then goes back to S_BEGIN_DATA_3. */
1385 segmenter_parse_begin_data_4__ (struct segmenter *s,
1386 const char *input, size_t n,
1387 enum segment_type *type)
1391 ofs = segmenter_parse_newline__ (input, n, type);
1395 s->state = S_BEGIN_DATA_3;
/* Handles S_TITLE_1: measures the white space at the start of INPUT with
   skip_spaces(), then moves to S_TITLE_2. */
1400 segmenter_parse_title_1__ (struct segmenter *s,
1401 const char *input, size_t n,
1402 enum segment_type *type)
1406 ofs = skip_spaces (input, n, 0);
1409 s->state = S_TITLE_2;
/* Handles S_TITLE_2: scans the rest of the line, one decoded character at a
   time, as a SEG_UNQUOTED_STRING and returns to S_GENERAL.  If ENDCMD is
   nonnegative, the segment ends at ENDCMD and not at the scan position. */
1415 segmenter_parse_title_2__ (struct segmenter *s,
1416 const char *input, size_t n,
1417 enum segment_type *type)
1429 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1436 s->state = S_GENERAL;
1438 *type = SEG_UNQUOTED_STRING;
1439 return endcmd >= 0 ? endcmd : ofs;
1446 if (!lex_uc_is_space (uc))
1457 /* Returns the name of segment TYPE as a string. The caller must not modify
1458 or free the returned string.
1460 This is useful only for debugging and testing. */
1462 segment_type_to_string (enum segment_type type)
/* Each SEG_TYPE entry expands to a case label that returns the entry's name
   as a string literal, via the preprocessor's # operator. */
1466 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
/* Result for a value that matches none of the generated cases. */
1470 return "unknown segment type";
1474 /* Initializes S as a segmenter with the given syntax MODE.
1476 A segmenter does not contain any external references, so nothing needs to be
1477 done to destroy one. For the same reason, segmenters may be copied with
1478 plain struct assignment (or memcpy). */
1480 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
/* Every segmenter starts in S_SHBANG, which segmenter_push() dispatches to
   segmenter_parse_shbang__(). */
1482 s->state = S_SHBANG;
1487 /* Returns the syntax mode that was passed to segmenter_init() for S. */
1489 segmenter_get_mode (const struct segmenter *s)
1494 /* Attempts to label a prefix of S's remaining input with a segment type. The
1495 caller supplies the first N bytes of the remaining input as INPUT, which
1496 must be a UTF-8 encoded string. The end of the input stream must be
1497 indicated by a null byte at the beginning of a line, that is, immediately
1498 following a new-line (or as the first byte of the input stream).
1500 The input may contain '\n' or '\r\n' line ends in any combination.
1502 If successful, returns the number of bytes in the segment at the beginning
1503 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1504 into *TYPE. The next call to segmenter_push() should not include those
1505 bytes as part of INPUT, because they have (figuratively) been consumed by
1508 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1509 be determined. In this case segmenter_push() returns -1. The caller should
1510 obtain more input and then call segmenter_push() again with a larger N and
1511 repeat until the input is exhausted (which must be indicated as described
1512 above) or until a valid segment is returned. segmenter_push() will never
1513 return -1 when the end of input is visible within INPUT.
1515 The caller must not, in a sequence of calls, supply contradictory input.
1516 That is, bytes provided as part of INPUT in one call, but not consumed, must
1517 not be provided with *different* values on subsequent calls. This is
1518 because segmenter_push() must often make decisions based on looking ahead
1519 beyond the bytes that it consumes. */
1521 segmenter_push (struct segmenter *s, const char *input, size_t n,
1522 enum segment_type *type)
/* A null byte at the very start of INPUT is tested before the state dispatch.
   NOTE(review): presumably this is the end-of-input marker described above;
   confirm. */
1527 if (input[0] == '\0')
/* Each state has its own parsing function. */
1536 return segmenter_parse_shbang__ (s, input, n, type);
/* This case further depends on whether the segmenter is at the start of a
   line. */
1539 return (s->substate & SS_START_OF_LINE
1540 ? segmenter_parse_start_of_line__ (s, input, n, type)
1541 : segmenter_parse_mid_command__ (s, input, n, type));
1544 return segmenter_parse_comment_1__ (s, input, n, type);
1546 return segmenter_parse_comment_2__ (s, input, n, type);
1549 return segmenter_parse_document_1__ (s, input, n, type);
1551 return segmenter_parse_document_2__ (s, input, n, type);
/* The third document state needs no input. */
1553 return segmenter_parse_document_3__ (s, type);
1556 return segmenter_parse_file_label__ (s, input, n, type);
1559 return segmenter_parse_do_repeat_1__ (s, input, n, type);
1561 return segmenter_parse_do_repeat_2__ (s, input, n, type);
1563 return segmenter_parse_do_repeat_3__ (s, input, n, type);
1565 case S_BEGIN_DATA_1:
1566 return segmenter_parse_begin_data_1__ (s, input, n, type);
1567 case S_BEGIN_DATA_2:
1568 return segmenter_parse_begin_data_2__ (s, input, n, type);
1569 case S_BEGIN_DATA_3:
1570 return segmenter_parse_begin_data_3__ (s, input, n, type);
1571 case S_BEGIN_DATA_4:
1572 return segmenter_parse_begin_data_4__ (s, input, n, type);
1575 return segmenter_parse_title_1__ (s, input, n, type);
1577 return segmenter_parse_title_2__ (s, input, n, type);
1583 /* Returns the style of command prompt to display to an interactive user for
1584 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1585 and at the beginning of a line (that is, if segmenter_push() consumed as
1586 much as possible of the input up to a new-line). */
1588 segmenter_get_prompt (const struct segmenter *s)
1593 return PROMPT_FIRST;
/* Where the prompt depends on the substate, PROMPT_FIRST is used at the start
   of a command and PROMPT_LATER otherwise. */
1596 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1600 return PROMPT_COMMENT;
1604 return PROMPT_DOCUMENT;
1606 return PROMPT_FIRST;
1609 return PROMPT_LATER;
1613 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1615 return PROMPT_DO_REPEAT;
/* S_BEGIN_DATA_1 and S_BEGIN_DATA_2 have fixed prompt styles, and
   S_BEGIN_DATA_3 and S_BEGIN_DATA_4 share a single case. */
1617 case S_BEGIN_DATA_1:
1618 return PROMPT_FIRST;
1619 case S_BEGIN_DATA_2:
1620 return PROMPT_LATER;
1621 case S_BEGIN_DATA_3:
1622 case S_BEGIN_DATA_4:
1627 return PROMPT_FIRST;