1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
31 #include "gl/memchr2.h"
/* Bit flags.  Elsewhere in this file they are OR-ed into, and tested
   against, the segmenter's `substate' member. */
54 #define SS_START_OF_LINE (1u << 0)
55 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration: the function is defined further down but is called
   before its definition. */
57 static int segmenter_detect_command_name__ (const char *input,
/* Decodes the first character of the N bytes at INPUT_ into *PUC.

   INPUT_ is reinterpreted as uint8_t bytes for the u8_* decoders.
   u8_mbtoucr is tried first and its result is returned when it is
   non-negative; for a negative result, one of the possible outcomes is to
   decode again with u8_mbtouc and return that function's result. */
61 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n)
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
68 mblen = u8_mbtoucr (puc, input, n);
69 return (mblen >= 0 ? mblen
71 : u8_mbtouc (puc, input, n));
/* Handles the very start of the input, where a "#!" line may appear
   (S_SHBANG is the state that segmenter_init installs).

   When INPUT[1] is '!', the first line is scanned from offset 2 up to its
   terminating new-line or null byte.  On the fall-back path the segmenter
   switches to S_GENERAL at the start of a line and of a command, and the same
   input is handed to segmenter_push for classification. */
75 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
76 enum segment_type *type)
82 else if (input[1] == '!')
/* Look for the end of the first line. */
86 for (ofs = 2; ofs < n; ofs++)
87 if (input[ofs] == '\n' || input[ofs] == '\0')
/* Detect a carriage return directly in front of the new-line. */
89 if (input[ofs] == '\n' && input[ofs - 1] == '\r')
93 s->substate = SS_START_OF_COMMAND;
/* Fall back to general parsing and classify this input again. */
102 s->state = S_GENERAL;
103 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
104 return segmenter_push (s, input, n, type);
/* Decides between a one-character and a two-character punctuator at the
   start of INPUT.

   SECONDS is a string listing the characters that may follow INPUT[0] to form
   a two-character token.  Returns 2 if INPUT[1] is not a null byte and occurs
   in SECONDS, otherwise 1.  Must be called only in state S_GENERAL
   (asserted). */
108 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
109 const char *input, size_t n,
110 enum segment_type *type)
112 assert (s->state == S_GENERAL);
119 return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1;
/* Scans INPUT from offset OFS (bounded by N) for the end of a comment.
   Callers in this file pass the offset just past a slash-star opener.

   A new-line or null byte is treated as a distinct stopping condition; a '*'
   whose following byte is '/' is the terminator being searched for.
   NOTE(review): the values returned on each path are not visible here --
   confirm before relying on them. */
123 skip_comment (const char *input, size_t n, size_t ofs)
125 for (; ofs < n; ofs++)
127 if (input[ofs] == '\n' || input[ofs] == '\0')
129 else if (input[ofs] == '*')
133 else if (input[ofs + 1] == '/')
/* Advances OFS within INPUT (N bytes) over white space other than new-line
   and over slash-star comments.

   Characters are decoded one at a time with segmenter_u8_to_uc__.  A '/' that
   is followed by '*' opens a comment, which is skipped with skip_comment
   starting two bytes further on.  Callers use the result as the updated
   offset. */
141 skip_spaces_and_comments (const char *input, size_t n, int ofs)
148 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
156 else if (input[ofs + 1] != '*')
159 ofs = skip_comment (input, n, ofs + 2);
163 else if (lex_uc_is_space (uc) && uc != '\n')
/* Tests whether a line end starts at INPUT[OFS].

   A new-line or null byte is recognized directly (presumably yielding a true
   result).  A carriage return counts only when the byte after it is a
   new-line. */
173 is_end_of_line (const char *input, size_t n, int ofs)
175 if (input[ofs] == '\n' || input[ofs] == '\0')
177 else if (input[ofs] == '\r')
181 return input[ofs + 1] == '\n';
/* Skips white space and comments in INPUT starting at OFS, then returns the
   result of is_end_of_line at the position reached. */
188 at_end_of_line (const char *input, size_t n, int ofs)
190 ofs = skip_spaces_and_comments (input, n, ofs);
194 return is_end_of_line (input, n, ofs);
/* Parses the line terminator at the start of INPUT.

   Two forms are handled: a lone new-line, or a carriage return immediately
   followed by a new-line.  The second form is asserted rather than tested, so
   callers must only invoke this function when INPUT begins with one of these
   two forms. */
198 segmenter_parse_newline__ (const char *input, size_t n,
199 enum segment_type *type)
203 if (input[0] == '\n')
210 assert (input[0] == '\r');
211 assert (input[1] == '\n');
/* Advances OFS within INPUT (N bytes) over white space.

   Each character is decoded with segmenter_u8_to_uc__.  Scanning stops at the
   first character that is not white space, or that is a new-line or a null
   byte. */
220 skip_spaces (const char *input, size_t n, size_t ofs)
227 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc) || uc == '\n' || uc == '\0')
/* Advances OFS within INPUT while the byte at OFS is a digit according to
   c_isdigit, never going beyond N. */
241 skip_digits (const char *input, size_t n, int ofs)
243 for (; ofs < n; ofs++)
244 if (!c_isdigit (input[ofs]))
/* Parses a numeric literal at the start of INPUT.  Must be called only in
   state S_GENERAL (asserted).

   The visible grammar is: digits, an optional '.' followed by more digits,
   and an optional exponent introduced by 'e' or 'E' with an optional '+' or
   '-' sign.  If no digit follows the exponent marker and sign, *TYPE is set
   to SEG_EXPECTED_EXPONENT.  When the scanned text ends in '.', the rest of
   the line is examined with at_end_of_line. */
250 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
251 enum segment_type *type)
255 assert (s->state == S_GENERAL);
/* Integer part. */
257 ofs = skip_digits (input, n, 0);
/* Optional fraction. */
261 if (input[ofs] == '.')
263 ofs = skip_digits (input, n, ofs + 1);
/* Optional exponent. */
270 if (input[ofs] == 'e' || input[ofs] == 'E')
276 if (input[ofs] == '+' || input[ofs] == '-')
283 if (!c_isdigit (input[ofs]))
285 *type = SEG_EXPECTED_EXPONENT;
290 ofs = skip_digits (input, n, ofs);
/* A trailing '.' needs a look at what follows on the line. */
295 if (input[ofs - 1] == '.')
297 int eol = at_end_of_line (input, n, ofs);
/* Tests, without regard to case, whether the word at S is a reserved word.

   The comparisons are grouped by word length (presumably selected by N):
   two letters:   BY EQ GE GT LE LT NE OR TO
   three letters: ALL AND NOT
   four letters:  WITH
   Letters are upper-cased with c_toupper before comparison. */
310 is_reserved_word (const char *s, int n)
314 s0 = c_toupper (s[0]);
/* Two-letter words. */
318 s1 = c_toupper (s[1]);
319 return ((s0 == 'B' && s1 == 'Y')
320 || (s0 == 'E' && s1 == 'Q')
321 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
322 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
323 || (s0 == 'N' && s1 == 'E')
324 || (s0 == 'O' && s1 == 'R')
325 || (s0 == 'T' && s1 == 'O'));
/* Three-letter words. */
328 s1 = c_toupper (s[1]);
329 s2 = c_toupper (s[2]);
330 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
331 || (s1 == 'N' && s2 == 'D')))
332 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
/* Four-letter word. */
335 s1 = c_toupper (s[1]);
336 s2 = c_toupper (s[2]);
337 s3 = c_toupper (s[3]);
338 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
/* Scans one line of a comment command (state S_COMMENT_1).

   Characters are decoded one at a time.  At the end of the line one of three
   outcomes is chosen, as annotated below: a blank line ends the comment and
   separates commands, a '.' at end of line ends the comment command, or the
   comment continues on the next line (state S_COMMENT_2). */
346 segmenter_parse_comment_1__ (struct segmenter *s,
347 const char *input, size_t n,
348 enum segment_type *type)
360 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
/* Account for a carriage return in front of the line end. */
371 if (ofs > 1 && input[ofs - 1] == '\r')
/* endcmd == -2 or a null byte is treated as the blank-line case. */
375 if (endcmd == -2 || uc == '\0')
377 /* Blank line ends comment command. */
378 s->state = S_GENERAL;
379 s->substate = SS_START_OF_COMMAND;
380 *type = SEG_SEPARATE_COMMANDS;
383 else if (endcmd >= 0)
385 /* '.' at end of line ends comment command. */
386 s->state = S_GENERAL;
388 *type = SEG_COMMENT_COMMAND;
393 /* Comment continues onto next line. */
394 *type = SEG_COMMENT_COMMAND;
395 s->state = S_COMMENT_2;
401 if (!lex_uc_is_space (uc))
/* Handles the line break inside a comment command (state S_COMMENT_2).

   The line terminator is parsed first.  The first character of the following
   line is then inspected ('+', '-', '.', or any non-space character are
   singled out), and in one mode segmenter_detect_command_name__ is consulted.
   Depending on the outcome the segmenter either returns to S_GENERAL at the
   start of a line and command, or goes back to S_COMMENT_1 to continue the
   comment. */
412 segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n,
413 enum segment_type *type)
420 ofs = segmenter_parse_newline__ (input, n, type);
/* Need at least one byte of the next line to decide. */
421 if (ofs < 0 || ofs >= n)
424 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
428 if (uc == '+' || uc == '-' || uc == '.')
430 else if (!lex_uc_is_space (uc))
433 case SEG_MODE_INTERACTIVE:
442 new_cmd = segmenter_detect_command_name__ (input, n, ofs);
455 s->state = S_GENERAL;
456 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
459 s->state = S_COMMENT_1;
/* Scans one line of a DOCUMENT command (state S_DOCUMENT_1) and labels it
   SEG_DOCUMENT.

   On one path the next state is S_DOCUMENT_3 when end_cmd is set and
   S_DOCUMENT_2 otherwise; on another path the next state is S_DOCUMENT_3
   unconditionally. */
464 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
465 enum segment_type *type)
477 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
/* Account for a carriage return in front of the line end. */
488 if (ofs > 1 && input[ofs - 1] == '\r')
491 *type = SEG_DOCUMENT;
492 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
496 *type = SEG_DOCUMENT;
497 s->state = S_DOCUMENT_3;
501 if (!lex_uc_is_space (uc))
/* Parses the line terminator that follows a document line and then returns
   the segmenter to S_DOCUMENT_1 for the next line. */
512 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
513 enum segment_type *type)
517 ofs = segmenter_parse_newline__ (input, n, type);
521 s->state = S_DOCUMENT_1;
/* Finishes a DOCUMENT command: sets *TYPE to SEG_END_COMMAND and puts the
   segmenter back into S_GENERAL at the start of a line and of a command.
   Takes no input arguments. */
526 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
528 *type = SEG_END_COMMAND;
529 s->state = S_GENERAL;
530 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
/* Skips white space and comments in INPUT from OFS, then reports whether the
   character `c' found there (presumably INPUT[OFS] -- confirm) is anything
   other than a single quote, a double quote, a new-line, or a null byte. */
535 segmenter_unquoted (const char *input, size_t n, int ofs)
540 ofs = skip_spaces_and_comments (input, n, ofs);
545 return c != '\'' && c != '"' && c != '\n' && c != '\0';
/* Looks ahead in INPUT, starting at OFS, for an identifier within the
   current command, copying it into ID when it fits.

   A scratch segmenter SUB in state S_GENERAL is stepped over the input with
   segmenter_push so that S itself is not pushed.  ID_SIZE is the capacity of
   ID and must be positive (asserted); the copy is made only when the segment
   length is less than ID_SIZE.  Callers in this file treat a negative result
   as failure.
   NOTE(review): the actions taken for each group of segment types below are
   not visible here. */
549 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
550 int ofs, char id[], size_t id_size)
552 struct segmenter sub;
554 assert (id_size > 0);
557 sub.state = S_GENERAL;
561 enum segment_type type;
564 retval = segmenter_push (&sub, input + ofs, n - ofs, &type);
/* Copy only if the segment fits in ID. */
580 if (retval < id_size)
582 memcpy (id, input + ofs, retval);
589 case SEG_QUOTED_STRING:
591 case SEG_UNICODE_STRING:
592 case SEG_UNQUOTED_STRING:
593 case SEG_RESERVED_WORD:
595 case SEG_COMMENT_COMMAND:
596 case SEG_DO_REPEAT_COMMAND:
597 case SEG_INLINE_DATA:
598 case SEG_START_DOCUMENT:
600 case SEG_START_COMMAND:
601 case SEG_SEPARATE_COMMANDS:
602 case SEG_END_COMMAND:
604 case SEG_EXPECTED_QUOTE:
605 case SEG_EXPECTED_EXPONENT:
606 case SEG_UNEXPECTED_DOT:
607 case SEG_UNEXPECTED_CHAR:
/* Parses an identifier or reserved word at the start of INPUT.  Must be
   called only in state S_GENERAL (asserted).

   After the identifier characters are scanned, a trailing '.' is checked
   against the end of the line.  *TYPE becomes SEG_RESERVED_WORD or
   SEG_IDENTIFIER according to is_reserved_word.

   When the identifier is the first word of a command, some command names
   switch the segmenter into a dedicated state:
   COMMENT           -> S_COMMENT_1 (parsed at once as a comment)
   DOCUMENT          -> S_DOCUMENT_1, with *TYPE set to SEG_START_DOCUMENT
   TITLE, SUBTITLE   -> S_TITLE_1, subject to segmenter_unquoted
   FILE LABEL        -> S_FILE_LABEL
   DO REPEAT         -> S_DO_REPEAT_1
   BEGIN DATA        -> S_BEGIN_DATA_1 or S_BEGIN_DATA_2 */
616 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
617 enum segment_type *type)
622 assert (s->state == S_GENERAL);
624 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
632 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
635 else if (!lex_uc_is_idn (uc))
/* A trailing '.' needs a look at what follows on the line. */
641 if (input[ofs - 1] == '.')
643 int eol = at_end_of_line (input, n, ofs);
650 if (is_reserved_word (input, ofs))
651 *type = SEG_RESERVED_WORD;
653 *type = SEG_IDENTIFIER;
/* Command names that need special segmentation. */
655 if (s->substate & SS_START_OF_COMMAND)
657 struct substring word = ss_buffer (input, ofs);
659 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
661 s->state = S_COMMENT_1;
662 return segmenter_parse_comment_1__ (s, input, n, type);
664 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
666 s->state = S_DOCUMENT_1;
667 *type = SEG_START_DOCUMENT;
670 else if (lex_id_match (ss_cstr ("TITLE"), word)
671 || lex_id_match (ss_cstr ("SUBTITLE"), word))
673 int result = segmenter_unquoted (input, n, ofs);
678 s->state = S_TITLE_1;
682 else if (lex_id_match (ss_cstr ("FILE"), word))
686 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
688 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
690 s->state = S_FILE_LABEL;
695 else if (lex_id_match (ss_cstr ("DO"), word))
699 if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
701 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
703 s->state = S_DO_REPEAT_1;
708 else if (lex_id_match (ss_cstr ("BEGIN"), word))
713 ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id);
716 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
/* Look past DATA, an optional '.', and any spaces or comments for the end
   of the line. */
720 ofs2 = skip_spaces_and_comments (input, n, ofs2);
724 if (input[ofs2] == '.')
726 ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1);
731 eol = is_end_of_line (input, n, ofs2);
/* A new-line among the first ofs2 bytes selects S_BEGIN_DATA_1; the other
   state is S_BEGIN_DATA_2. */
736 if (memchr (input, '\n', ofs2))
737 s->state = S_BEGIN_DATA_1;
739 s->state = S_BEGIN_DATA_2;
/* Parses a quoted string whose opening quote character is at INPUT[OFS].

   The opening character is remembered in `quote' and later bytes are compared
   against it (two such comparisons are visible; presumably one of them
   handles a doubled quote inside the string -- confirm).  Reaching a new-line
   or a null byte first sets *TYPE to SEG_EXPECTED_QUOTE. */
752 segmenter_parse_string__ (enum segment_type string_type,
753 int ofs, struct segmenter *s,
754 const char *input, size_t n, enum segment_type *type)
756 int quote = input[ofs];
760 if (input[ofs] == quote)
765 else if (input[ofs] == quote)
774 else if (input[ofs] == '\n' || input[ofs] == '\0')
776 *type = SEG_EXPECTED_QUOTE;
/* Chooses between a prefixed string and an identifier.

   If INPUT[1] is a single or double quote, INPUT is parsed as a string of
   type STRING_TYPE whose opening quote is at offset 1 (so INPUT[0] is a
   prefix character).  Otherwise INPUT is parsed as an identifier. */
787 segmenter_maybe_parse_string__ (enum segment_type string_type,
789 const char *input, size_t n,
790 enum segment_type *type)
794 else if (input[1] == '\'' || input[1] == '"')
795 return segmenter_parse_string__ (string_type, 1, s, input, n, type);
797 return segmenter_parse_id__ (s, input, n, type);
/* Classifies the segment at the start of INPUT in state S_GENERAL when the
   segmenter is not at the start of a line (both conditions are asserted).

   The first character is decoded and used to dispatch: slash-star comments,
   single-character punctuation, two-character operators (via
   segmenter_parse_digraph__), '.', digits, prefixed and plain quoted strings,
   white space, and identifiers.  A character that fits none of these is
   labeled SEG_UNEXPECTED_CHAR. */
801 segmenter_parse_mid_command__ (struct segmenter *s,
802 const char *input, size_t n,
803 enum segment_type *type)
809 assert (s->state == S_GENERAL);
810 assert (!(s->substate & SS_START_OF_LINE));
812 mblen = segmenter_u8_to_uc__ (&uc, input, n);
819 s->substate |= SS_START_OF_LINE;
826 else if (input[1] == '*')
828 ofs = skip_comment (input, n, 2);
842 case '(': case ')': case ',': case '=': case '-':
843 case '[': case ']': case '&': case '|': case '+':
849 if (s->substate & SS_START_OF_COMMAND)
851 /* '*' at the beginning of a command begins a comment. */
852 s->state = S_COMMENT_1;
853 return segmenter_parse_comment_1__ (s, input, n, type);
/* Operators that may be one or two characters long; the string argument
   lists the permitted second characters. */
856 return segmenter_parse_digraph__ ("*", s, input, n, type);
859 return segmenter_parse_digraph__ ("=>", s, input, n, type);
862 return segmenter_parse_digraph__ ("=", s, input, n, type);
865 return segmenter_parse_digraph__ ("=", s, input, n, type);
/* A digit in INPUT[1] makes this the start of a number. */
870 else if (c_isdigit (input[1]))
871 return segmenter_parse_number__ (s, input, n, type);
/* The rest of the line is examined from offset 1; the two possible labels
   are SEG_END_COMMAND and SEG_UNEXPECTED_DOT. */
874 int eol = at_end_of_line (input, n, 1);
880 *type = SEG_END_COMMAND;
881 s->substate = SS_START_OF_COMMAND;
884 *type = SEG_UNEXPECTED_DOT;
889 case '0': case '1': case '2': case '3': case '4':
890 case '5': case '6': case '7': case '8': case '9':
891 return segmenter_parse_number__ (s, input, n, type);
894 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
898 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
902 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
906 if (lex_uc_is_space (uc))
908 ofs = skip_spaces (input, n, mblen);
/* Detect a carriage return and new-line pair at the end of the spaces. */
912 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
916 s->substate |= SS_START_OF_LINE;
926 else if (lex_uc_is_id1 (uc))
927 return segmenter_parse_id__ (s, input, n, type);
930 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback for qsort over an array of C strings.  A_ and B_ point
   to array elements (that is, to `const char *' values); the strings are
   ordered without regard to case using c_strcasecmp. */
938 compare_commands (const void *a_, const void *b_)
940 const char *const *ap = a_;
941 const char *const *bp = b_;
945 return c_strcasecmp (a, b);
/* Returns a pointer into a sorted table of command names, positioned at the
   first name whose upper-cased first byte equals the upper-cased FIRST.

   The table is generated from "language/command.def" by defining DEF_CMD and
   UNIMPL_CMD to expand to the command's name.  n_commands is one less than
   the array's element count, so commands[n_commands] is a final extra slot
   (presumably a terminator entry -- confirm); first bytes with no command
   are mapped to that slot.  The table is sorted with compare_commands and
   indexed through the static `cindex' array. */
949 segmenter_get_command_name_candidates (unsigned char first)
951 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
952 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
953 static const char *commands[] =
955 #include "language/command.def"
958 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
/* cindex[c] points at the first command whose first byte upper-cases to c. */
964 static const char **cindex[UCHAR_MAX + 1];
972 qsort (commands, n_commands, sizeof *commands, compare_commands);
973 for (i = 0; i < n_commands; i++)
975 unsigned char c = c_toupper (commands[i][0]);
/* Record only the first command for each initial byte. */
976 if (cindex[c] == NULL)
977 cindex[c] = &commands[i];
/* Initial bytes that begin no command point at the final slot. */
979 for (i = 0; i <= UCHAR_MAX; i++)
980 if (cindex[i] == NULL)
981 cindex[i] = &commands[n_commands];
984 return cindex[c_toupper (first)];
/* Checks whether the text in INPUT looks like the beginning of a command
   name.

   Characters are scanned from OFS for as long as they are white space,
   identifier characters, or '-'; a new-line, a null byte, or any other
   character stops the scan.  A '.' just before the stopping point gets
   special treatment.  Each candidate command whose first letter matches
   INPUT[0] (ignoring case) is then tried with command_match, and a match
   with no missing words is singled out.
   NOTE(review): the exact return values are not visible here. */
988 segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
990 const char **commands;
1003 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1007 if (uc == '\n' || uc == '\0'
1008 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1016 if (input[ofs - 1] == '.')
/* Walk the candidates that share INPUT[0]'s initial letter. */
1019 for (commands = segmenter_get_command_name_candidates (input[0]);
1020 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1026 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1027 &exact, &missing_words)
1028 && missing_words <= 0)
/* Reports whether a string literal appears to start at offset OFS of INPUT,
   based on a character `c' (presumably INPUT[OFS] -- confirm).

   If c is one of 'x', 'X', 'u' or 'U', the answer is whether the byte after
   it is a single or double quote.  Otherwise the answer is whether c is a
   single quote, a double quote, or a new-line. */
1036 is_start_of_string__ (const char *input, size_t n, int ofs)
1041 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1046 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1049 return c == '\'' || c == '"' || c == '\n';
/* Classifies the segment at the start of INPUT in state S_GENERAL when the
   segmenter is at the start of a line (both conditions are asserted).

   Visible cases: punctuation that may join pieces of a string, for which the
   text after it is examined with is_start_of_string__; a marker that yields
   SEG_START_COMMAND; a line made of white space only, which yields
   SEG_SEPARATE_COMMANDS; and a mode-dependent decision (interactive, auto
   with command-name detection, or batch) on whether the line starts a new
   command.  On the fall-through path the flags are reduced to
   SS_START_OF_COMMAND and the input is classified by
   segmenter_parse_mid_command__. */
1053 segmenter_parse_start_of_line__ (struct segmenter *s,
1054 const char *input, size_t n,
1055 enum segment_type *type)
1061 assert (s->state == S_GENERAL);
1062 assert (s->substate & SS_START_OF_LINE);
1064 mblen = segmenter_u8_to_uc__ (&uc, input, n);
1071 ofs = skip_spaces_and_comments (input, n, 1);
1076 int is_string = is_start_of_string__ (input, n, ofs);
1081 /* This is punctuation that may separate pieces of a string. */
1091 *type = SEG_START_COMMAND;
1092 s->substate = SS_START_OF_COMMAND;
1096 if (lex_uc_is_space (uc))
/* Check whether the whole line is blank. */
1098 int eol = at_end_of_line (input, n, 0);
1103 s->substate = SS_START_OF_COMMAND;
1104 *type = SEG_SEPARATE_COMMANDS;
1110 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1112 else if (s->mode == SEG_MODE_AUTO)
1114 int cmd = segmenter_detect_command_name__ (input, n, 0);
1121 assert (s->mode == SEG_MODE_BATCH);
1123 s->substate = SS_START_OF_COMMAND;
1124 *type = SEG_START_COMMAND;
/* No longer at the start of a line: continue as mid-command text. */
1128 s->substate = SS_START_OF_COMMAND;
1129 return segmenter_parse_mid_command__ (s, input, n, type);
/* Handles the text after FILE in a FILE LABEL command (state S_FILE_LABEL).

   The next segment is obtained by pushing INPUT through a scratch segmenter
   in state S_GENERAL.  If that segment is an identifier it is asserted to
   match LABEL; segmenter_unquoted is then consulted, and the segmenter may
   move on to S_TITLE_1.  The scratch segmenter's substate is copied back into
   S. */
1133 segmenter_parse_file_label__ (struct segmenter *s,
1134 const char *input, size_t n,
1135 enum segment_type *type)
1137 struct segmenter sub;
1141 sub.state = S_GENERAL;
1142 ofs = segmenter_push (&sub, input, n, type);
1146 else if (*type == SEG_IDENTIFIER)
1150 assert (lex_id_match (ss_cstr ("LABEL"),
1151 ss_buffer ((char *) input, ofs)));
1152 result = segmenter_unquoted (input, n, ofs);
1158 s->state = S_TITLE_1;
1166 s->substate = sub.substate;
/* Classifies INPUT as though S were in state S_GENERAL.

   A scratch segmenter is set to S_GENERAL with S's substate and pushed;
   afterwards the scratch substate is copied back into S.  On the visible
   lines S's own state is left untouched. */
1172 segmenter_subparse (struct segmenter *s,
1173 const char *input, size_t n, enum segment_type *type)
1175 struct segmenter sub;
1179 sub.state = S_GENERAL;
1180 sub.substate = s->substate;
1181 ofs = segmenter_push (&sub, input, n, type);
1182 s->substate = sub.substate;
/* Segments the DO REPEAT command itself (state S_DO_REPEAT_1) using general
   parsing via segmenter_subparse.

   A SEG_START_COMMAND or SEG_SEPARATE_COMMANDS segment moves the segmenter to
   S_DO_REPEAT_2; a SEG_END_COMMAND segment moves it to S_DO_REPEAT_3. */
1187 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1188 const char *input, size_t n,
1189 enum segment_type *type)
1191 int ofs = segmenter_subparse (s, input, n, type);
1195 if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
1196 s->state = S_DO_REPEAT_2;
1197 else if (*type == SEG_END_COMMAND)
1199 s->state = S_DO_REPEAT_3;
/* Continues general parsing in state S_DO_REPEAT_2 via segmenter_subparse;
   a SEG_NEWLINE segment moves the segmenter to S_DO_REPEAT_3. */
1207 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1208 const char *input, size_t n,
1209 enum segment_type *type)
1211 int ofs = segmenter_subparse (s, input, n, type);
1215 if (*type == SEG_NEWLINE)
1217 s->state = S_DO_REPEAT_3;
/* Examines a line inside a DO REPEAT body for a nested DO REPEAT or an
   END REPEAT.

   A leading '+' or '-' is recognized first.  The first identifier is compared
   with DO and END, and if the identifier after it is REPEAT, S's substate is
   adjusted by `direction' (presumably positive for DO and negative for END --
   confirm).  In this state the substate acts as a counter; the caller
   compares it with 0. */
1225 check_repeat_command (struct segmenter *s,
1226 const char *input, size_t n)
1233 if (input[ofs] == '+' || input[ofs] == '-')
1236 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1239 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1241 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1246 ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
1250 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1251 s->substate += direction;
/* Finds the extent of the line at the start of INPUT.

   memchr2 locates the first new-line or null byte within the N bytes; if
   neither is present the line end is not yet known.  A null byte is handled
   separately.  A line that is empty apart from an optional carriage return
   sets *TYPE to SEG_NEWLINE.  Otherwise the result is the offset of the
   new-line, reduced by one when a carriage return precedes it. */
1256 segmenter_parse_full_line__ (const char *input, size_t n,
1257 enum segment_type *type)
1259 const char *newline = memchr2 (input, '\n', '\0', n);
1261 if (newline == NULL)
1265 int ofs = newline - input;
1266 if (*newline == '\0')
1271 else if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1273 *type = SEG_NEWLINE;
/* Exclude a carriage return from the line's length. */
1277 return ofs - (input[ofs - 1] == '\r');
/* Segments one line of the body of DO REPEAT (state S_DO_REPEAT_3).

   The line is delimited with segmenter_parse_full_line__ and inspected with
   check_repeat_command.  When the substate counter has dropped to 0, the
   segmenter returns to S_GENERAL at the start of a line and command and the
   same input is classified again by segmenter_push.  Otherwise the line is
   labeled SEG_DO_REPEAT_COMMAND. */
1282 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1283 const char *input, size_t n,
1284 enum segment_type *type)
1288 ofs = segmenter_parse_full_line__ (input, n, type);
1289 if (ofs < 0 || input[ofs - 1] == '\n')
1291 else if (!check_repeat_command (s, input, n))
1293 else if (s->substate == 0)
1295 s->state = S_GENERAL;
1296 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1297 return segmenter_push (s, input, n, type);
1301 *type = SEG_DO_REPEAT_COMMAND;
/* Continues general parsing in state S_BEGIN_DATA_1 via segmenter_subparse;
   a SEG_NEWLINE segment moves the segmenter to S_BEGIN_DATA_2. */
1307 segmenter_parse_begin_data_1__ (struct segmenter *s,
1308 const char *input, size_t n,
1309 enum segment_type *type)
1311 int ofs = segmenter_subparse (s, input, n, type);
1315 if (*type == SEG_NEWLINE)
1316 s->state = S_BEGIN_DATA_2;
/* Continues general parsing in state S_BEGIN_DATA_2 via segmenter_subparse;
   a SEG_NEWLINE segment moves the segmenter to S_BEGIN_DATA_3, where inline
   data lines are handled. */
1322 segmenter_parse_begin_data_2__ (struct segmenter *s,
1323 const char *input, size_t n,
1324 enum segment_type *type)
1326 int ofs = segmenter_subparse (s, input, n, type);
1330 if (*type == SEG_NEWLINE)
1331 s->state = S_BEGIN_DATA_3;
/* Tests whether the N bytes at INPUT form an END DATA line.

   Visible checks, in order: the text starts with END (any case); the
   following character is white space; the text then continues with DATA (any
   case); and the characters after that are checked for white space.
   NOTE(review): the loops that skip white space and the handling of a
   trailing '.' are not visible here. */
1337 is_end_data (const char *input, size_t n)
1339 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1345 if (n < 3 || c_strncasecmp (input, "END", 3))
1349 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1350 if (!lex_uc_is_space (uc))
1354 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1361 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1368 else if (!lex_uc_is_space (uc))
/* Segments one line of inline data (state S_BEGIN_DATA_3).

   The line is delimited with segmenter_parse_full_line__.  If it is an
   END DATA line, the segmenter returns to S_GENERAL at the start of a line
   and command and the same input is classified again by segmenter_push.
   Otherwise the line is labeled SEG_INLINE_DATA and the next state is
   S_BEGIN_DATA_4; the result is 0 when the byte before the line end is a
   new-line, and the line's length otherwise. */
1377 segmenter_parse_begin_data_3__ (struct segmenter *s,
1378 const char *input, size_t n,
1379 enum segment_type *type)
1383 ofs = segmenter_parse_full_line__ (input, n, type);
1386 else if (is_end_data (input, ofs))
1388 s->state = S_GENERAL;
1389 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1390 return segmenter_push (s, input, n, type);
1394 *type = SEG_INLINE_DATA;
1395 s->state = S_BEGIN_DATA_4;
1396 return input[ofs - 1] == '\n' ? 0 : ofs;
/* Parses the line terminator that follows a line of inline data and then
   returns the segmenter to S_BEGIN_DATA_3 for the next line. */
1401 segmenter_parse_begin_data_4__ (struct segmenter *s,
1402 const char *input, size_t n,
1403 enum segment_type *type)
1407 ofs = segmenter_parse_newline__ (input, n, type);
1411 s->state = S_BEGIN_DATA_3;
/* First step of an unquoted title (state S_TITLE_1): skips the white space
   at the start of INPUT and moves the segmenter to S_TITLE_2. */
1416 segmenter_parse_title_1__ (struct segmenter *s,
1417 const char *input, size_t n,
1418 enum segment_type *type)
1422 ofs = skip_spaces (input, n, 0);
1425 s->state = S_TITLE_2;
/* Second step of an unquoted title (state S_TITLE_2): scans the text of the
   title and labels it SEG_UNQUOTED_STRING, returning the segmenter to
   S_GENERAL.

   The result is `endcmd' when it is non-negative and the scan offset
   otherwise (presumably so that a command-ending '.' is left out of the
   string -- confirm).  Non-space characters are tracked during the scan. */
1431 segmenter_parse_title_2__ (struct segmenter *s,
1432 const char *input, size_t n,
1433 enum segment_type *type)
1445 mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
1453 s->state = S_GENERAL;
1455 *type = SEG_UNQUOTED_STRING;
1456 return endcmd >= 0 ? endcmd : ofs;
1463 if (!lex_uc_is_space (uc))
1474 /* Returns the name of segment TYPE as a string. The caller must not modify
1475 or free the returned string.  A TYPE that is not a known segment type
   yields "unknown segment type".
1477 This is useful only for debugging and testing. */
1479 segment_type_to_string (enum segment_type type)
/* Each SEG_TYPE (NAME) expands to a case that returns NAME's spelling. */
1483 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1487 return "unknown segment type";
1491 /* Initializes S as a segmenter with the given syntax MODE.
1493 A segmenter does not contain any external references, so nothing needs to be
1494 done to destroy one. For the same reason, segmenters may be copied with
1495 plain struct assignment (or memcpy).
   The initial state is S_SHBANG, so the first push looks for a "#!" line. */
1497 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1499 s->state = S_SHBANG;
1504 /* Returns the mode passed to segmenter_init() for S.  S is taken through a
   pointer to const and so is not modified. */
1506 segmenter_get_mode (const struct segmenter *s)
1511 /* Attempts to label a prefix of S's remaining input with a segment type. The
1512 caller supplies the first N bytes of the remaining input as INPUT, which
1513 must be a UTF-8 encoded string. The end of the input stream must be
1514 indicated by a null byte at the beginning of a line, that is, immediately
1515 following a new-line (or as the first byte of the input stream).
1517 The input may contain '\n' or '\r\n' line ends in any combination.
1519 If successful, returns the number of bytes in the segment at the beginning
1520 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1521 into *TYPE. The next call to segmenter_push() should not include those
1522 bytes as part of INPUT, because they have (figuratively) been consumed by
   the segmenter.
1525 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1526 be determined. In this case segmenter_push() returns -1. The caller should
1527 obtain more input and then call segmenter_push() again with a larger N and
1528 repeat until the input is exhausted (which must be indicated as described
1529 above) or until a valid segment is returned. segmenter_push() will never
1530 return -1 when the end of input is visible within INPUT.
1532 The caller must not, in a sequence of calls, supply contradictory input.
1533 That is, bytes provided as part of INPUT in one call, but not consumed, must
1534 not be provided with *different* values on subsequent calls. This is
1535 because segmenter_push() must often make decisions based on looking ahead
1536 beyond the bytes that it consumes. */
1538 segmenter_push (struct segmenter *s, const char *input, size_t n,
1539 enum segment_type *type)
/* A null byte at the start of INPUT is the end-of-input marker. */
1544 if (input[0] == '\0')
/* Otherwise hand the input to the parser for the current state. */
1553 return segmenter_parse_shbang__ (s, input, n, type);
/* General state: the parser depends on whether a line is just starting. */
1556 return (s->substate & SS_START_OF_LINE
1557 ? segmenter_parse_start_of_line__ (s, input, n, type)
1558 : segmenter_parse_mid_command__ (s, input, n, type));
1561 return segmenter_parse_comment_1__ (s, input, n, type);
1563 return segmenter_parse_comment_2__ (s, input, n, type);
1566 return segmenter_parse_document_1__ (s, input, n, type);
1568 return segmenter_parse_document_2__ (s, input, n, type);
1570 return segmenter_parse_document_3__ (s, type);
1573 return segmenter_parse_file_label__ (s, input, n, type);
1576 return segmenter_parse_do_repeat_1__ (s, input, n, type);
1578 return segmenter_parse_do_repeat_2__ (s, input, n, type);
1580 return segmenter_parse_do_repeat_3__ (s, input, n, type);
1582 case S_BEGIN_DATA_1:
1583 return segmenter_parse_begin_data_1__ (s, input, n, type);
1584 case S_BEGIN_DATA_2:
1585 return segmenter_parse_begin_data_2__ (s, input, n, type);
1586 case S_BEGIN_DATA_3:
1587 return segmenter_parse_begin_data_3__ (s, input, n, type);
1588 case S_BEGIN_DATA_4:
1589 return segmenter_parse_begin_data_4__ (s, input, n, type);
1592 return segmenter_parse_title_1__ (s, input, n, type);
1594 return segmenter_parse_title_2__ (s, input, n, type);
1600 /* Returns the style of command prompt to display to an interactive user for
1601 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1602 and at the beginning of a line (that is, if segmenter_push() consumed as
1603 much as possible of the input up to a new-line).
   S is not modified.  The prompt styles returned are PROMPT_FIRST,
   PROMPT_LATER, PROMPT_COMMENT, PROMPT_DOCUMENT and PROMPT_DO_REPEAT (plus
   whatever the S_BEGIN_DATA_3 and S_BEGIN_DATA_4 cases return). */
1605 segmenter_get_prompt (const struct segmenter *s)
1610 return PROMPT_FIRST;
/* The prompt depends on whether a new command is about to start. */
1613 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1617 return PROMPT_COMMENT;
1621 return PROMPT_DOCUMENT;
1623 return PROMPT_FIRST;
1626 return PROMPT_LATER;
1630 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1632 return PROMPT_DO_REPEAT;
1634 case S_BEGIN_DATA_1:
1635 return PROMPT_FIRST;
1636 case S_BEGIN_DATA_2:
1637 return PROMPT_LATER;
1638 case S_BEGIN_DATA_3:
1639 case S_BEGIN_DATA_4:
1644 return PROMPT_FIRST;