1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
56 static int segmenter_detect_command_name__ (const char *input,
57 size_t n, bool eof, int ofs);
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
71 mblen = u8_mbtoucr (puc, input, n);
75 return u8_mbtouc (puc, input, n);
86 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
87 bool eof, enum segment_type *type)
95 for (int ofs = 2; ; ofs++)
102 else if (input[ofs] == '\n')
104 if (input[ofs - 1] == '\r')
110 s->state = S_GENERAL;
111 s->substate = SS_START_OF_COMMAND;
121 s->state = S_GENERAL;
122 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
123 return segmenter_push (s, input, n, eof, type);
127 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
128 const char *input, size_t n, bool eof,
129 enum segment_type *type)
131 assert (s->state == S_GENERAL);
137 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
141 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
143 for (; ofs < n; ofs++)
145 if (input[ofs] == '\n')
147 else if (input[ofs] == '*')
150 return eof ? ofs + 1 : -1;
151 else if (input[ofs + 1] == '/')
155 return eof ? ofs : -1;
159 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
166 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
173 return eof ? ofs : -1;
174 else if (input[ofs + 1] != '*')
177 ofs = skip_comment (input, n, eof, ofs + 2);
181 else if (lex_uc_is_space (uc) && uc != '\n')
187 return eof ? ofs : -1;
191 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
195 else if (input[ofs] == '\n')
197 else if (input[ofs] == '\r')
201 return input[ofs + 1] == '\n';
208 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
210 ofs = skip_spaces_and_comments (input, n, eof, ofs);
214 return is_end_of_line (input, n, eof, ofs);
218 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
219 enum segment_type *type)
223 if (input[0] == '\n')
233 assert (input[0] == '\r');
234 assert (input[1] == '\n');
243 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
250 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
254 if (!lex_uc_is_space (uc) || uc == '\n')
260 return eof ? ofs : -1;
264 skip_digits (const char *input, size_t n, bool eof, int ofs)
266 for (; ofs < n; ofs++)
267 if (!c_isdigit (input[ofs]))
269 return eof ? ofs : -1;
273 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
274 bool eof, enum segment_type *type)
278 assert (s->state == S_GENERAL);
280 ofs = skip_digits (input, n, eof, 0);
290 if (input[ofs] == '.')
292 ofs = skip_digits (input, n, eof, ofs + 1);
303 if (input[ofs] == 'e' || input[ofs] == 'E')
310 goto expected_exponent;
313 if (input[ofs] == '+' || input[ofs] == '-')
320 goto expected_exponent;
324 if (!c_isdigit (input[ofs]))
325 goto expected_exponent;
327 ofs = skip_digits (input, n, eof, ofs);
332 if (input[ofs - 1] == '.')
334 int eol = at_end_of_line (input, n, eof, ofs);
347 *type = SEG_EXPECTED_EXPONENT;
353 is_reserved_word (const char *s, int n)
357 s0 = c_toupper (s[0]);
361 s1 = c_toupper (s[1]);
362 return ((s0 == 'B' && s1 == 'Y')
363 || (s0 == 'E' && s1 == 'Q')
364 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
365 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
366 || (s0 == 'N' && s1 == 'E')
367 || (s0 == 'O' && s1 == 'R')
368 || (s0 == 'T' && s1 == 'O'));
371 s1 = c_toupper (s[1]);
372 s2 = c_toupper (s[2]);
373 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
374 || (s1 == 'N' && s2 == 'D')))
375 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
378 s1 = c_toupper (s[1]);
379 s2 = c_toupper (s[2]);
380 s3 = c_toupper (s[3]);
381 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
389 segmenter_parse_comment_1__ (struct segmenter *s,
390 const char *input, size_t n, bool eof,
391 enum segment_type *type)
403 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
414 if (ofs > 1 && input[ofs - 1] == '\r')
418 /* Blank line ends comment command. */
419 s->state = S_GENERAL;
420 s->substate = SS_START_OF_COMMAND;
421 *type = SEG_SEPARATE_COMMANDS;
424 else if (endcmd >= 0)
426 /* '.' at end of line ends comment command. */
427 s->state = S_GENERAL;
429 *type = SEG_COMMENT_COMMAND;
434 /* Comment continues onto next line. */
435 *type = SEG_COMMENT_COMMAND;
436 s->state = S_COMMENT_2;
442 if (!lex_uc_is_space (uc))
453 s->state = S_GENERAL;
454 s->substate = SS_START_OF_COMMAND;
455 *type = SEG_SEPARATE_COMMANDS;
463 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
464 size_t n, bool eof, enum segment_type *type)
466 int ofs = segmenter_parse_newline__ (input, n, eof, type);
480 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
484 if (uc == '+' || uc == '-' || uc == '.')
486 else if (!lex_uc_is_space (uc))
489 case SEG_MODE_INTERACTIVE:
498 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
512 s->state = S_GENERAL;
513 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
516 s->state = S_COMMENT_1;
521 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
522 bool eof, enum segment_type *type)
534 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
545 if (ofs > 1 && input[ofs - 1] == '\r')
548 *type = SEG_DOCUMENT;
549 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
553 if (!lex_uc_is_space (uc))
562 *type = SEG_DOCUMENT;
563 s->state = S_DOCUMENT_3;
570 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
571 bool eof, enum segment_type *type)
575 ofs = segmenter_parse_newline__ (input, n, eof, type);
579 s->state = S_DOCUMENT_1;
584 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
586 *type = SEG_END_COMMAND;
587 s->state = S_GENERAL;
588 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
593 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
596 ofs = skip_spaces_and_comments (input, n, eof, ofs);
602 return c != '\'' && c != '"' && c != '\n';
612 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
613 bool eof, int ofs, char id[], size_t id_size)
615 struct segmenter sub;
617 assert (id_size > 0);
620 sub.state = S_GENERAL;
624 enum segment_type type;
627 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
643 if (retval < id_size)
645 memcpy (id, input + ofs, retval);
652 case SEG_QUOTED_STRING:
654 case SEG_UNICODE_STRING:
655 case SEG_UNQUOTED_STRING:
656 case SEG_RESERVED_WORD:
658 case SEG_COMMENT_COMMAND:
659 case SEG_DO_REPEAT_COMMAND:
660 case SEG_INLINE_DATA:
661 case SEG_START_DOCUMENT:
663 case SEG_START_COMMAND:
664 case SEG_SEPARATE_COMMANDS:
665 case SEG_END_COMMAND:
667 case SEG_EXPECTED_QUOTE:
668 case SEG_EXPECTED_EXPONENT:
669 case SEG_UNEXPECTED_DOT:
670 case SEG_UNEXPECTED_CHAR:
678 /* Called when INPUT begins with a character that can start off an ID token. */
680 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
681 bool eof, enum segment_type *type)
687 assert (s->state == S_GENERAL);
689 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
701 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
704 else if (!lex_uc_is_idn (uc))
710 if (input[ofs - 1] == '.')
712 int eol = at_end_of_line (input, n, eof, ofs);
719 if (is_reserved_word (input, ofs))
720 *type = SEG_RESERVED_WORD;
722 *type = SEG_IDENTIFIER;
724 if (s->substate & SS_START_OF_COMMAND)
726 struct substring word = ss_buffer (input, ofs);
728 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
730 s->state = S_COMMENT_1;
731 return segmenter_parse_comment_1__ (s, input, n, eof, type);
733 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
735 s->state = S_DOCUMENT_1;
736 *type = SEG_START_DOCUMENT;
739 else if (lex_id_match (ss_cstr ("TITLE"), word)
740 || lex_id_match (ss_cstr ("SUBTITLE"), word))
742 int result = segmenter_unquoted (input, n, eof, ofs);
747 s->state = S_TITLE_1;
751 else if (lex_id_match (ss_cstr ("FILE"), word))
755 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
757 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
759 s->state = S_FILE_LABEL;
764 else if (lex_id_match (ss_cstr ("DO"), word))
768 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
770 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
772 s->state = S_DO_REPEAT_1;
777 else if (lex_id_match (ss_cstr ("BEGIN"), word))
782 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
785 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
789 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
795 else if (input[ofs2] == '.')
797 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
802 eol = is_end_of_line (input, n, eof, ofs2);
807 if (memchr (input, '\n', ofs2))
808 s->state = S_BEGIN_DATA_1;
810 s->state = S_BEGIN_DATA_2;
823 segmenter_parse_string__ (enum segment_type string_type,
824 int ofs, struct segmenter *s,
825 const char *input, size_t n, bool eof,
826 enum segment_type *type)
828 int quote = input[ofs];
832 if (input[ofs] == quote)
837 if (input[ofs] == quote)
850 else if (input[ofs] == '\n')
861 *type = SEG_EXPECTED_QUOTE;
867 segmenter_maybe_parse_string__ (enum segment_type string_type,
869 const char *input, size_t n, bool eof,
870 enum segment_type *type)
877 else if (input[1] == '\'' || input[1] == '"')
878 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
880 return segmenter_parse_id__ (s, input, n, eof, type);
884 segmenter_parse_mid_command__ (struct segmenter *s,
885 const char *input, size_t n, bool eof,
886 enum segment_type *type)
892 assert (s->state == S_GENERAL);
893 assert (!(s->substate & SS_START_OF_LINE));
895 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
902 s->substate |= SS_START_OF_LINE;
912 else if (input[1] == '*')
914 ofs = skip_comment (input, n, eof, 2);
926 case '(': case ')': case ',': case '=': case '-':
927 case '[': case ']': case '&': case '|': case '+':
933 if (s->substate & SS_START_OF_COMMAND)
935 /* '*' at the beginning of a command begins a comment. */
936 s->state = S_COMMENT_1;
937 return segmenter_parse_comment_1__ (s, input, n, eof, type);
940 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
943 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
946 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
949 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
957 else if (c_isdigit (input[1]))
958 return segmenter_parse_number__ (s, input, n, eof, type);
960 int eol = at_end_of_line (input, n, eof, 1);
966 *type = SEG_END_COMMAND;
967 s->substate = SS_START_OF_COMMAND;
970 *type = SEG_UNEXPECTED_DOT;
973 case '0': case '1': case '2': case '3': case '4':
974 case '5': case '6': case '7': case '8': case '9':
975 return segmenter_parse_number__ (s, input, n, eof, type);
978 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
979 s, input, n, eof, type);
982 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
983 s, input, n, eof, type);
986 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
987 s, input, n, eof, type);
990 if (lex_uc_is_space (uc))
992 ofs = skip_spaces (input, n, eof, mblen);
996 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1000 s->substate |= SS_START_OF_LINE;
1001 *type = SEG_NEWLINE;
1010 else if (lex_uc_is_id1 (uc))
1011 return segmenter_parse_id__ (s, input, n, eof, type);
1014 *type = SEG_UNEXPECTED_CHAR;
1022 compare_commands (const void *a_, const void *b_)
1024 const char *const *ap = a_;
1025 const char *const *bp = b_;
1026 const char *a = *ap;
1027 const char *b = *bp;
1029 return c_strcasecmp (a, b);
1032 static const char **
1033 segmenter_get_command_name_candidates (unsigned char first)
1035 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1036 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1037 static const char *commands[] =
1039 #include "language/command.def"
1042 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1048 static const char **cindex[UCHAR_MAX + 1];
1056 qsort (commands, n_commands, sizeof *commands, compare_commands);
1057 for (i = 0; i < n_commands; i++)
1059 unsigned char c = c_toupper (commands[i][0]);
1060 if (cindex[c] == NULL)
1061 cindex[c] = &commands[i];
1063 for (i = 0; i <= UCHAR_MAX; i++)
1064 if (cindex[i] == NULL)
1065 cindex[i] = &commands[n_commands];
1068 return cindex[c_toupper (first)];
1072 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1075 const char **commands;
1092 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1097 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1105 if (input[ofs - 1] == '.')
1108 for (commands = segmenter_get_command_name_candidates (input[0]);
1109 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1115 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1116 &exact, &missing_words)
1117 && missing_words <= 0)
1125 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1128 return eof ? 0 : -1;
1131 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1134 return eof ? 0 : -1;
1136 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1139 return c == '\'' || c == '"' || c == '\n';
1143 segmenter_parse_start_of_line__ (struct segmenter *s,
1144 const char *input, size_t n, bool eof,
1145 enum segment_type *type)
1151 assert (s->state == S_GENERAL);
1152 assert (s->substate & SS_START_OF_LINE);
1154 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1161 ofs = skip_spaces_and_comments (input, n, eof, 1);
1166 int is_string = is_start_of_string__ (input, n, eof, ofs);
1171 /* This is punctuation that may separate pieces of a string. */
1181 *type = SEG_START_COMMAND;
1182 s->substate = SS_START_OF_COMMAND;
1186 if (lex_uc_is_space (uc))
1188 int eol = at_end_of_line (input, n, eof, 0);
1193 s->substate = SS_START_OF_COMMAND;
1194 *type = SEG_SEPARATE_COMMANDS;
1200 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1202 else if (s->mode == SEG_MODE_AUTO)
1204 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1211 assert (s->mode == SEG_MODE_BATCH);
1213 s->substate = SS_START_OF_COMMAND;
1214 *type = SEG_START_COMMAND;
1218 s->substate = SS_START_OF_COMMAND;
1219 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1223 segmenter_parse_file_label__ (struct segmenter *s,
1224 const char *input, size_t n, bool eof,
1225 enum segment_type *type)
1227 struct segmenter sub;
1231 sub.state = S_GENERAL;
1232 ofs = segmenter_push (&sub, input, n, eof, type);
1236 else if (*type == SEG_IDENTIFIER)
1240 assert (lex_id_match (ss_cstr ("LABEL"),
1241 ss_buffer ((char *) input, ofs)));
1242 result = segmenter_unquoted (input, n, eof, ofs);
1248 s->state = S_TITLE_1;
1256 s->substate = sub.substate;
1262 segmenter_subparse (struct segmenter *s,
1263 const char *input, size_t n, bool eof,
1264 enum segment_type *type)
1266 struct segmenter sub;
1270 sub.state = S_GENERAL;
1271 sub.substate = s->substate;
1272 ofs = segmenter_push (&sub, input, n, eof, type);
1273 s->substate = sub.substate;
1278 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1279 const char *input, size_t n, bool eof,
1280 enum segment_type *type)
1282 int ofs = segmenter_subparse (s, input, n, eof, type);
1286 if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
1287 s->state = S_DO_REPEAT_2;
1288 else if (*type == SEG_END_COMMAND)
1290 s->state = S_DO_REPEAT_3;
1298 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1299 const char *input, size_t n, bool eof,
1300 enum segment_type *type)
1302 int ofs = segmenter_subparse (s, input, n, eof, type);
1306 if (*type == SEG_NEWLINE)
1308 s->state = S_DO_REPEAT_3;
1316 check_repeat_command (struct segmenter *s,
1317 const char *input, size_t n, bool eof)
1324 if (input[ofs] == '+' || input[ofs] == '-')
1327 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1330 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1332 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1337 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1341 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1342 s->substate += direction;
1347 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1348 enum segment_type *type)
1350 const char *newline = memchr (input, '\n', n);
1352 return eof ? n : -1;
1354 ptrdiff_t ofs = newline - input;
1355 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1357 *type = SEG_NEWLINE;
1361 return ofs - (input[ofs - 1] == '\r');
1365 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1366 const char *input, size_t n, bool eof,
1367 enum segment_type *type)
1371 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1372 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1374 else if (!check_repeat_command (s, input, n, eof) && !eof)
1376 else if (s->substate == 0)
1378 s->state = S_GENERAL;
1379 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1380 return segmenter_push (s, input, n, eof, type);
1384 *type = SEG_DO_REPEAT_COMMAND;
1390 segmenter_parse_begin_data_1__ (struct segmenter *s,
1391 const char *input, size_t n, bool eof,
1392 enum segment_type *type)
1394 int ofs = segmenter_subparse (s, input, n, eof, type);
1398 if (*type == SEG_NEWLINE)
1399 s->state = S_BEGIN_DATA_2;
1405 segmenter_parse_begin_data_2__ (struct segmenter *s,
1406 const char *input, size_t n, bool eof,
1407 enum segment_type *type)
1409 int ofs = segmenter_subparse (s, input, n, eof, type);
1413 if (*type == SEG_NEWLINE)
1414 s->state = S_BEGIN_DATA_3;
1420 is_end_data (const char *input, size_t n)
1422 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1428 if (n < 4 || c_strncasecmp (input, "END", 3))
1432 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1433 if (!lex_uc_is_space (uc))
1437 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1444 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1451 else if (!lex_uc_is_space (uc))
1460 segmenter_parse_begin_data_3__ (struct segmenter *s,
1461 const char *input, size_t n, bool eof,
1462 enum segment_type *type)
1466 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1469 else if (is_end_data (input, ofs))
1471 s->state = S_GENERAL;
1472 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1473 return segmenter_push (s, input, n, eof, type);
1477 *type = SEG_INLINE_DATA;
1478 s->state = S_BEGIN_DATA_4;
1479 return input[ofs - 1] == '\n' ? 0 : ofs;
1484 segmenter_parse_begin_data_4__ (struct segmenter *s,
1485 const char *input, size_t n, bool eof,
1486 enum segment_type *type)
1490 ofs = segmenter_parse_newline__ (input, n, eof, type);
1494 s->state = S_BEGIN_DATA_3;
1499 segmenter_parse_title_1__ (struct segmenter *s,
1500 const char *input, size_t n, bool eof,
1501 enum segment_type *type)
1505 ofs = skip_spaces (input, n, eof, 0);
1508 s->state = S_TITLE_2;
1514 segmenter_parse_title_2__ (struct segmenter *s,
1515 const char *input, size_t n, bool eof,
1516 enum segment_type *type)
1528 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1542 if (!lex_uc_is_space (uc))
1553 s->state = S_GENERAL;
1555 *type = SEG_UNQUOTED_STRING;
1556 return endcmd >= 0 ? endcmd : ofs;
1562 /* Returns the name of segment TYPE as a string. The caller must not modify
1563 or free the returned string.
1565 This is useful only for debugging and testing. */
1567 segment_type_to_string (enum segment_type type)
1571 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1575 return "unknown segment type";
1579 /* Initializes S as a segmenter with the given syntax MODE.
1581 A segmenter does not contain any external references, so nothing needs to be
1582 done to destroy one. For the same reason, segmenters may be copied with
1583 plain struct assignment (or memcpy). */
1585 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1587 s->state = S_SHBANG;
1592 /* Returns the mode passed to segmenter_init() for S. */
1594 segmenter_get_mode (const struct segmenter *s)
1599 /* Attempts to label a prefix of S's remaining input with a segment type. The
1600 caller supplies the first N bytes of the remaining input as INPUT, which
1601 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1602 are the entire (remainder) of the input; if EOF is false, then further input
1603 is potentially available.
1605 The input may contain '\n' or '\r\n' line ends in any combination.
1607 If successful, returns the number of bytes in the segment at the beginning
1608 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1609 into *TYPE. The next call to segmenter_push() should not include those
1610 bytes as part of INPUT, because they have (figuratively) been consumed by
1613 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1614 be determined. In this case segmenter_push() returns -1. If more input is
1615 available, the caller should obtain some more, then call again with a larger
1616 N. If this is not enough, the process might need to repeat again and again.
1617 If input is exhausted, then the caller may call again setting EOF to true.
1618 segmenter_push() will never return -1 when EOF is true.
1620 The caller must not, in a sequence of calls, supply contradictory input.
1621 That is, bytes provided as part of INPUT in one call, but not consumed, must
1622 not be provided with *different* values on subsequent calls. This is
1623 because segmenter_push() must often make decisions based on looking ahead
1624 beyond the bytes that it consumes. */
1626 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1627 enum segment_type *type)
1643 return segmenter_parse_shbang__ (s, input, n, eof, type);
1646 return (s->substate & SS_START_OF_LINE
1647 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1648 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1651 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1653 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1656 return segmenter_parse_document_1__ (s, input, n, eof, type);
1658 return segmenter_parse_document_2__ (s, input, n, eof, type);
1660 return segmenter_parse_document_3__ (s, type);
1663 return segmenter_parse_file_label__ (s, input, n, eof, type);
1666 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1668 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1670 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1672 case S_BEGIN_DATA_1:
1673 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1674 case S_BEGIN_DATA_2:
1675 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1676 case S_BEGIN_DATA_3:
1677 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1678 case S_BEGIN_DATA_4:
1679 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1682 return segmenter_parse_title_1__ (s, input, n, eof, type);
1684 return segmenter_parse_title_2__ (s, input, n, eof, type);
1690 /* Returns the style of command prompt to display to an interactive user for
1691 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1692 and at the beginning of a line (that is, if segmenter_push() consumed as
1693 much as possible of the input up to a new-line). */
1695 segmenter_get_prompt (const struct segmenter *s)
1700 return PROMPT_FIRST;
1703 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1707 return PROMPT_COMMENT;
1711 return PROMPT_DOCUMENT;
1713 return PROMPT_FIRST;
1716 return PROMPT_LATER;
1720 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1722 return PROMPT_DO_REPEAT;
1724 case S_BEGIN_DATA_1:
1725 return PROMPT_FIRST;
1726 case S_BEGIN_DATA_2:
1727 return PROMPT_LATER;
1728 case S_BEGIN_DATA_3:
1729 case S_BEGIN_DATA_4:
1734 return PROMPT_FIRST;