1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
/* Bit flags kept in a segmenter's `substate' member.  Elsewhere in this file
   they are combined with `|' when assigned and tested with `&'. */
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
56 static int segmenter_detect_command_name__ (const char *input,
57 size_t n, bool eof, int ofs);
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
71 mblen = u8_mbtoucr (puc, input, n);
75 return u8_mbtouc (puc, input, n);
86 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
87 bool eof, enum segment_type *type)
95 for (int ofs = 2; ; ofs++)
102 else if (input[ofs] == '\n')
104 if (input[ofs - 1] == '\r')
110 s->state = S_GENERAL;
111 s->substate = SS_START_OF_COMMAND;
121 s->state = S_GENERAL;
122 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
123 return segmenter_push (s, input, n, eof, type);
127 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
128 const char *input, size_t n, bool eof,
129 enum segment_type *type)
131 assert (s->state == S_GENERAL);
137 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
141 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
143 for (; ofs < n; ofs++)
145 if (input[ofs] == '\n')
147 else if (input[ofs] == '*')
150 return eof ? ofs + 1 : -1;
151 else if (input[ofs + 1] == '/')
155 return eof ? ofs : -1;
159 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
166 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
173 return eof ? ofs : -1;
174 else if (input[ofs + 1] != '*')
177 ofs = skip_comment (input, n, eof, ofs + 2);
181 else if (lex_uc_is_space (uc) && uc != '\n')
187 return eof ? ofs : -1;
191 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
195 else if (input[ofs] == '\n')
197 else if (input[ofs] == '\r')
201 return input[ofs + 1] == '\n';
208 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
210 ofs = skip_spaces_and_comments (input, n, eof, ofs);
214 return is_end_of_line (input, n, eof, ofs);
218 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
219 enum segment_type *type)
223 if (input[0] == '\n')
233 assert (input[0] == '\r');
234 assert (input[1] == '\n');
243 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
250 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
254 if (!lex_uc_is_space (uc) || uc == '\n')
260 return eof ? ofs : -1;
/* Advances OFS through the N bytes of INPUT for as long as c_isdigit()
   accepts the byte at OFS.  The final visible statement yields OFS when EOF
   is true and -1 otherwise.
   NOTE(review): the statement executed when a non-digit byte is found is not
   visible in this excerpt; presumably it returns that byte's offset, which
   would make the final return the "ran out of input" case -- confirm. */
264 skip_digits (const char *input, size_t n, bool eof, int ofs)
266 for (; ofs < n; ofs++)
267 if (!c_isdigit (input[ofs]))
269 return eof ? ofs : -1;
273 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
274 bool eof, enum segment_type *type)
278 assert (s->state == S_GENERAL);
280 ofs = skip_digits (input, n, eof, 0);
290 if (input[ofs] == '.')
299 ofs = skip_digits (input, n, eof, ofs + 1);
306 if (input[ofs] == 'e' || input[ofs] == 'E')
313 goto expected_exponent;
316 if (input[ofs] == '+' || input[ofs] == '-')
323 goto expected_exponent;
327 if (!c_isdigit (input[ofs]))
328 goto expected_exponent;
330 ofs = skip_digits (input, n, eof, ofs);
335 if (input[ofs - 1] == '.')
337 int eol = at_end_of_line (input, n, eof, ofs);
350 *type = SEG_EXPECTED_EXPONENT;
/* Tests the leading bytes of S against a fixed set of words.  Each byte is
   passed through c_toupper() before comparison, so the match does not
   depend on letter case.
   NOTE(review): the statements that choose among the three groups of
   comparisons below (presumably based on N) and the result for any other
   case are not visible in this excerpt -- confirm against the full file. */
356 is_reserved_word (const char *s, int n)
360 s0 = c_toupper (s[0]);
/* Comparisons over two bytes: BY, EQ, GE, GT, LE, LT, NE, OR, TO. */
364 s1 = c_toupper (s[1]);
365 return ((s0 == 'B' && s1 == 'Y')
366 || (s0 == 'E' && s1 == 'Q')
367 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
368 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
369 || (s0 == 'N' && s1 == 'E')
370 || (s0 == 'O' && s1 == 'R')
371 || (s0 == 'T' && s1 == 'O'));
/* Comparisons over three bytes: ALL, AND, NOT. */
374 s1 = c_toupper (s[1]);
375 s2 = c_toupper (s[2]);
376 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
377 || (s1 == 'N' && s2 == 'D')))
378 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
/* Comparison over four bytes: WITH. */
381 s1 = c_toupper (s[1]);
382 s2 = c_toupper (s[2]);
383 s3 = c_toupper (s[3]);
384 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
392 segmenter_parse_comment_1__ (struct segmenter *s,
393 const char *input, size_t n, bool eof,
394 enum segment_type *type)
406 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
417 if (ofs > 1 && input[ofs - 1] == '\r')
421 /* Blank line ends comment command. */
422 s->state = S_GENERAL;
423 s->substate = SS_START_OF_COMMAND;
424 *type = SEG_SEPARATE_COMMANDS;
427 else if (endcmd >= 0)
429 /* '.' at end of line ends comment command. */
430 s->state = S_GENERAL;
432 *type = SEG_COMMENT_COMMAND;
437 /* Comment continues onto next line. */
438 *type = SEG_COMMENT_COMMAND;
439 s->state = S_COMMENT_2;
445 if (!lex_uc_is_space (uc))
456 s->state = S_GENERAL;
457 s->substate = SS_START_OF_COMMAND;
458 *type = SEG_SEPARATE_COMMANDS;
466 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
467 size_t n, bool eof, enum segment_type *type)
469 int ofs = segmenter_parse_newline__ (input, n, eof, type);
483 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
487 if (uc == '+' || uc == '-' || uc == '.')
489 else if (!lex_uc_is_space (uc))
492 case SEG_MODE_INTERACTIVE:
501 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
515 s->state = S_GENERAL;
516 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
519 s->state = S_COMMENT_1;
524 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
525 bool eof, enum segment_type *type)
537 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
548 if (ofs > 1 && input[ofs - 1] == '\r')
551 *type = SEG_DOCUMENT;
552 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
556 if (!lex_uc_is_space (uc))
565 *type = SEG_DOCUMENT;
566 s->state = S_DOCUMENT_3;
573 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
574 bool eof, enum segment_type *type)
578 ofs = segmenter_parse_newline__ (input, n, eof, type);
582 s->state = S_DOCUMENT_1;
/* Stores SEG_END_COMMAND in *TYPE and switches S to state S_GENERAL with both
   SS_START_OF_COMMAND and SS_START_OF_LINE set in its substate.  This
   function takes no input buffer.
   NOTE(review): the return statement is not visible in this excerpt. */
587 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
589 *type = SEG_END_COMMAND;
590 s->state = S_GENERAL;
591 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
596 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
599 ofs = skip_spaces_and_comments (input, n, eof, ofs);
605 return c != '\'' && c != '"' && c != '\n';
615 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
616 bool eof, int ofs, char id[], size_t id_size)
618 struct segmenter sub;
620 assert (id_size > 0);
623 sub.state = S_GENERAL;
627 enum segment_type type;
630 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
646 if (retval < id_size)
648 memcpy (id, input + ofs, retval);
656 case SEG_QUOTED_STRING:
658 case SEG_UNICODE_STRING:
659 case SEG_UNQUOTED_STRING:
660 case SEG_RESERVED_WORD:
662 case SEG_COMMENT_COMMAND:
663 case SEG_DO_REPEAT_COMMAND:
664 case SEG_INLINE_DATA:
665 case SEG_START_DOCUMENT:
667 case SEG_START_COMMAND:
668 case SEG_SEPARATE_COMMANDS:
669 case SEG_END_COMMAND:
671 case SEG_EXPECTED_QUOTE:
672 case SEG_EXPECTED_EXPONENT:
673 case SEG_UNEXPECTED_DOT:
674 case SEG_UNEXPECTED_CHAR:
682 /* Called when INPUT begins with a character that can start off an ID token. */
684 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
685 bool eof, enum segment_type *type)
691 assert (s->state == S_GENERAL);
693 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
705 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
708 else if (!lex_uc_is_idn (uc))
714 if (input[ofs - 1] == '.')
716 int eol = at_end_of_line (input, n, eof, ofs);
723 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
724 : input[0] == '!' ? SEG_MACRO_ID
727 if (s->substate & SS_START_OF_COMMAND)
729 struct substring word = ss_buffer (input, ofs);
731 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
733 s->state = S_COMMENT_1;
734 return segmenter_parse_comment_1__ (s, input, n, eof, type);
736 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
738 s->state = S_DOCUMENT_1;
739 *type = SEG_START_DOCUMENT;
742 else if (lex_id_match (ss_cstr ("TITLE"), word)
743 || lex_id_match (ss_cstr ("SUBTITLE"), word))
745 int result = segmenter_unquoted (input, n, eof, ofs);
750 s->state = S_TITLE_1;
754 else if (lex_id_match (ss_cstr ("FILE"), word))
758 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
760 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
762 s->state = S_FILE_LABEL;
767 else if (lex_id_match (ss_cstr ("DO"), word))
771 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
773 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
775 s->state = S_DO_REPEAT_1;
780 else if (lex_id_match (ss_cstr ("BEGIN"), word))
785 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
788 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
792 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
798 else if (input[ofs2] == '.')
800 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
805 eol = is_end_of_line (input, n, eof, ofs2);
810 if (memchr (input, '\n', ofs2))
811 s->state = S_BEGIN_DATA_1;
813 s->state = S_BEGIN_DATA_2;
826 segmenter_parse_string__ (enum segment_type string_type,
827 int ofs, struct segmenter *s,
828 const char *input, size_t n, bool eof,
829 enum segment_type *type)
831 int quote = input[ofs];
835 if (input[ofs] == quote)
840 if (input[ofs] == quote)
853 else if (input[ofs] == '\n')
864 *type = SEG_EXPECTED_QUOTE;
870 segmenter_maybe_parse_string__ (enum segment_type string_type,
872 const char *input, size_t n, bool eof,
873 enum segment_type *type)
880 else if (input[1] == '\'' || input[1] == '"')
881 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
883 return segmenter_parse_id__ (s, input, n, eof, type);
887 segmenter_parse_mid_command__ (struct segmenter *s,
888 const char *input, size_t n, bool eof,
889 enum segment_type *type)
895 assert (s->state == S_GENERAL);
896 assert (!(s->substate & SS_START_OF_LINE));
898 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
905 s->substate |= SS_START_OF_LINE;
915 else if (input[1] == '*')
917 ofs = skip_comment (input, n, eof, 2);
929 case '(': case ')': case ',': case '=': case '-':
930 case '[': case ']': case '&': case '|': case '+':
936 if (s->substate & SS_START_OF_COMMAND)
938 /* '*' at the beginning of a command begins a comment. */
939 s->state = S_COMMENT_1;
940 return segmenter_parse_comment_1__ (s, input, n, eof, type);
943 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
946 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
949 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
952 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
960 else if (c_isdigit (input[1]))
961 return segmenter_parse_number__ (s, input, n, eof, type);
963 int eol = at_end_of_line (input, n, eof, 1);
969 *type = SEG_END_COMMAND;
970 s->substate = SS_START_OF_COMMAND;
973 *type = SEG_UNEXPECTED_DOT;
976 case '0': case '1': case '2': case '3': case '4':
977 case '5': case '6': case '7': case '8': case '9':
978 return segmenter_parse_number__ (s, input, n, eof, type);
981 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
982 s, input, n, eof, type);
985 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
986 s, input, n, eof, type);
989 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
990 s, input, n, eof, type);
993 return segmenter_parse_id__ (s, input, n, eof, type);
996 if (lex_uc_is_space (uc))
998 ofs = skip_spaces (input, n, eof, mblen);
1002 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1006 s->substate |= SS_START_OF_LINE;
1007 *type = SEG_NEWLINE;
1016 else if (lex_uc_is_id1 (uc))
1017 return segmenter_parse_id__ (s, input, n, eof, type);
1020 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback with the signature qsort() expects.  A_ and B_ each
   point to a `const char *' array element; the strings those elements point
   to are ordered with c_strcasecmp(). */
1028 compare_commands (const void *a_, const void *b_)
1030 const char *const *ap = a_;
1031 const char *const *bp = b_;
1032 const char *a = *ap;
1033 const char *b = *bp;
1035 return c_strcasecmp (a, b);
1038 static const char **
1039 segmenter_get_command_name_candidates (unsigned char first)
1041 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1042 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1043 static const char *commands[] =
1045 #include "language/command.def"
1048 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1054 static const char **cindex[UCHAR_MAX + 1];
1062 qsort (commands, n_commands, sizeof *commands, compare_commands);
1063 for (i = 0; i < n_commands; i++)
1065 unsigned char c = c_toupper (commands[i][0]);
1066 if (cindex[c] == NULL)
1067 cindex[c] = &commands[i];
1069 for (i = 0; i <= UCHAR_MAX; i++)
1070 if (cindex[i] == NULL)
1071 cindex[i] = &commands[n_commands];
1074 return cindex[c_toupper (first)];
1078 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1081 const char **commands;
1098 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1103 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1111 if (input[ofs - 1] == '.')
1114 for (commands = segmenter_get_command_name_candidates (input[0]);
1115 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1121 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1122 &exact, &missing_words)
1123 && missing_words <= 0)
1131 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1134 return eof ? 0 : -1;
1137 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1140 return eof ? 0 : -1;
1142 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1145 return c == '\'' || c == '"' || c == '\n';
1149 segmenter_parse_start_of_line__ (struct segmenter *s,
1150 const char *input, size_t n, bool eof,
1151 enum segment_type *type)
1157 assert (s->state == S_GENERAL);
1158 assert (s->substate & SS_START_OF_LINE);
1160 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1167 ofs = skip_spaces_and_comments (input, n, eof, 1);
1172 int is_string = is_start_of_string__ (input, n, eof, ofs);
1177 /* This is punctuation that may separate pieces of a string. */
1187 *type = SEG_START_COMMAND;
1188 s->substate = SS_START_OF_COMMAND;
1192 if (lex_uc_is_space (uc))
1194 int eol = at_end_of_line (input, n, eof, 0);
1199 s->substate = SS_START_OF_COMMAND;
1200 *type = SEG_SEPARATE_COMMANDS;
1206 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1208 else if (s->mode == SEG_MODE_AUTO)
1210 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1217 assert (s->mode == SEG_MODE_BATCH);
1219 s->substate = SS_START_OF_COMMAND;
1220 *type = SEG_START_COMMAND;
1224 s->substate = SS_START_OF_COMMAND;
1225 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1229 segmenter_parse_file_label__ (struct segmenter *s,
1230 const char *input, size_t n, bool eof,
1231 enum segment_type *type)
1233 struct segmenter sub;
1237 sub.state = S_GENERAL;
1238 ofs = segmenter_push (&sub, input, n, eof, type);
1242 else if (*type == SEG_IDENTIFIER)
1246 assert (lex_id_match (ss_cstr ("LABEL"),
1247 ss_buffer ((char *) input, ofs)));
1248 result = segmenter_unquoted (input, n, eof, ofs);
1254 s->state = S_TITLE_1;
1262 s->substate = sub.substate;
/* Segments INPUT using a temporary segmenter SUB rather than S itself.  SUB
   is placed in state S_GENERAL with S's current substate, segmenter_push()
   is called on it, and SUB's resulting substate is copied back into S.  None
   of the visible statements modify S->state.
   NOTE(review): the initialization of SUB's remaining members and the
   return statement are not visible in this excerpt; presumably the result
   of segmenter_push() is returned -- confirm. */
1268 segmenter_subparse (struct segmenter *s,
1269 const char *input, size_t n, bool eof,
1270 enum segment_type *type)
1272 struct segmenter sub;
1276 sub.state = S_GENERAL;
1277 sub.substate = s->substate;
1278 ofs = segmenter_push (&sub, input, n, eof, type);
1279 s->substate = sub.substate;
1283 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1284 defines the stand-in variables (the head) before the lines of syntax to be
1285 repeated (the body). */
1287 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1288 const char *input, size_t n, bool eof,
1289 enum segment_type *type)
1291 int ofs = segmenter_subparse (s, input, n, eof, type);
1295 if (*type == SEG_SEPARATE_COMMANDS)
1297 /* We reached a blank line that separates the head from the body. */
1298 s->state = S_DO_REPEAT_2;
1300 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1302 /* We reached the body. */
1303 s->state = S_DO_REPEAT_3;
1310 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1311 separates the head from the body. */
1313 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1314 const char *input, size_t n, bool eof,
1315 enum segment_type *type)
1317 int ofs = segmenter_subparse (s, input, n, eof, type);
1321 if (*type == SEG_NEWLINE)
1323 /* We reached the body. */
1324 s->state = S_DO_REPEAT_3;
1332 check_repeat_command (struct segmenter *s,
1333 const char *input, size_t n, bool eof)
1340 if (input[ofs] == '+' || input[ofs] == '-')
1343 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1346 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1348 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1353 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1357 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1358 s->substate += direction;
/* Measures a line of text at the start of INPUT by searching its first N
   bytes for '\n'. */
1363 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1364 enum segment_type *type)
1366 const char *newline = memchr (input, '\n', n);
/* NOTE(review): the condition guarding this return is not visible in this
   excerpt; presumably it is taken when no new-line was found -- confirm. */
1368 return eof ? n : -1;
1370 ptrdiff_t ofs = newline - input;
/* The new-line is the very first byte, or is preceded only by a single
   '\r': report the line terminator itself as SEG_NEWLINE. */
1371 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1373 *type = SEG_NEWLINE;
/* Offset of the new-line, less one when the byte just before it is '\r', so
   that neither byte of a "\r\n" pair is counted. */
1377 return ofs - (input[ofs - 1] == '\r');
1380 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1381 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1383 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1384 the lines we're segmenting. s->substate counts the nesting level, starting
1387 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1388 const char *input, size_t n, bool eof,
1389 enum segment_type *type)
1393 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1394 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1396 else if (!check_repeat_command (s, input, n, eof) && !eof)
1398 else if (s->substate == 0)
1400 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1402 s->state = S_GENERAL;
1403 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1404 return segmenter_push (s, input, n, eof, type);
1408 *type = SEG_DO_REPEAT_COMMAND;
1414 segmenter_parse_begin_data_1__ (struct segmenter *s,
1415 const char *input, size_t n, bool eof,
1416 enum segment_type *type)
1418 int ofs = segmenter_subparse (s, input, n, eof, type);
1422 if (*type == SEG_NEWLINE)
1423 s->state = S_BEGIN_DATA_2;
1429 segmenter_parse_begin_data_2__ (struct segmenter *s,
1430 const char *input, size_t n, bool eof,
1431 enum segment_type *type)
1433 int ofs = segmenter_subparse (s, input, n, eof, type);
1437 if (*type == SEG_NEWLINE)
1438 s->state = S_BEGIN_DATA_3;
/* Examines the N bytes at INPUT for the words END and DATA.  The visible
   checks, in order, are: at least 4 bytes beginning with "END" (compared
   case-insensitively), a character accepted by lex_uc_is_space(), at least 4
   further bytes beginning with "DATA" (case-insensitively), and then another
   lex_uc_is_space() test on a following character.
   NOTE(review): the outcome of each check, the loops that advance OFS, and
   the handling of anything after DATA are not visible in this excerpt;
   presumably a failed check makes the function return false -- confirm. */
1444 is_end_data (const char *input, size_t n)
1446 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1452 if (n < 4 || c_strncasecmp (input, "END", 3))
1456 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1457 if (!lex_uc_is_space (uc))
1461 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1468 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1475 else if (!lex_uc_is_space (uc))
1484 segmenter_parse_begin_data_3__ (struct segmenter *s,
1485 const char *input, size_t n, bool eof,
1486 enum segment_type *type)
1490 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1493 else if (is_end_data (input, ofs))
1495 s->state = S_GENERAL;
1496 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1497 return segmenter_push (s, input, n, eof, type);
1501 *type = SEG_INLINE_DATA;
1502 s->state = S_BEGIN_DATA_4;
1503 return input[ofs - 1] == '\n' ? 0 : ofs;
1508 segmenter_parse_begin_data_4__ (struct segmenter *s,
1509 const char *input, size_t n, bool eof,
1510 enum segment_type *type)
1514 ofs = segmenter_parse_newline__ (input, n, eof, type);
1518 s->state = S_BEGIN_DATA_3;
1523 segmenter_parse_title_1__ (struct segmenter *s,
1524 const char *input, size_t n, bool eof,
1525 enum segment_type *type)
1529 ofs = skip_spaces (input, n, eof, 0);
1532 s->state = S_TITLE_2;
1538 segmenter_parse_title_2__ (struct segmenter *s,
1539 const char *input, size_t n, bool eof,
1540 enum segment_type *type)
1552 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1566 if (!lex_uc_is_space (uc))
1577 s->state = S_GENERAL;
1579 *type = SEG_UNQUOTED_STRING;
1580 return endcmd >= 0 ? endcmd : ofs;
1586 /* Returns the name of segment TYPE as a string. The caller must not modify
1587 or free the returned string.
1589 This is useful only for debugging and testing. */
1591 segment_type_to_string (enum segment_type type)
1595 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1599 return "unknown segment type";
1603 /* Initializes S as a segmenter with the given syntax MODE.
1605 A segmenter does not contain any external references, so nothing needs to be
1606 done to destroy one. For the same reason, segmenters may be copied with
1607 plain struct assignment (or memcpy). */
1609 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1611 s->state = S_SHBANG;
1616 /* Returns the mode passed to segmenter_init() for S. */
1618 segmenter_get_mode (const struct segmenter *s)
1623 /* Attempts to label a prefix of S's remaining input with a segment type. The
1624 caller supplies the first N bytes of the remaining input as INPUT, which
1625 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1626 are the entire (remainder) of the input; if EOF is false, then further input
1627 is potentially available.
1629 The input may contain '\n' or '\r\n' line ends in any combination.
1631 If successful, returns the number of bytes in the segment at the beginning
1632 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1633 into *TYPE. The next call to segmenter_push() should not include those
1634 bytes as part of INPUT, because they have (figuratively) been consumed by
1637 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1638 be determined. In this case segmenter_push() returns -1. If more input is
1639 available, the caller should obtain some more, then call again with a larger
1640 N. If this is not enough, the process might need to repeat again and again.
1641 If input is exhausted, then the caller may call again setting EOF to true.
1642 segmenter_push() will never return -1 when EOF is true.
1644 The caller must not, in a sequence of calls, supply contradictory input.
1645 That is, bytes provided as part of INPUT in one call, but not consumed, must
1646 not be provided with *different* values on subsequent calls. This is
1647 because segmenter_push() must often make decisions based on looking ahead
1648 beyond the bytes that it consumes. */
1650 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1651 enum segment_type *type)
1667 return segmenter_parse_shbang__ (s, input, n, eof, type);
1670 return (s->substate & SS_START_OF_LINE
1671 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1672 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1675 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1677 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1680 return segmenter_parse_document_1__ (s, input, n, eof, type);
1682 return segmenter_parse_document_2__ (s, input, n, eof, type);
1684 return segmenter_parse_document_3__ (s, type);
1687 return segmenter_parse_file_label__ (s, input, n, eof, type);
1690 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1692 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1694 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1696 case S_BEGIN_DATA_1:
1697 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1698 case S_BEGIN_DATA_2:
1699 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1700 case S_BEGIN_DATA_3:
1701 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1702 case S_BEGIN_DATA_4:
1703 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1706 return segmenter_parse_title_1__ (s, input, n, eof, type);
1708 return segmenter_parse_title_2__ (s, input, n, eof, type);
1714 /* Returns the style of command prompt to display to an interactive user for
1715 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1716 and at the beginning of a line (that is, if segmenter_push() consumed as
1717 much as possible of the input up to a new-line). */
1719 segmenter_get_prompt (const struct segmenter *s)
1724 return PROMPT_FIRST;
1727 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1731 return PROMPT_COMMENT;
1735 return PROMPT_DOCUMENT;
1737 return PROMPT_FIRST;
1740 return PROMPT_LATER;
1744 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1746 return PROMPT_DO_REPEAT;
1748 case S_BEGIN_DATA_1:
1749 return PROMPT_FIRST;
1750 case S_BEGIN_DATA_2:
1751 return PROMPT_LATER;
1752 case S_BEGIN_DATA_3:
1753 case S_BEGIN_DATA_4:
1758 return PROMPT_FIRST;