1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
/* Flag bits that the code below sets and tests in a segmenter's `substate'
   member, e.g. `s->substate & SS_START_OF_COMMAND'.  They are combined with
   bitwise OR when both conditions hold. */
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
56 static int segmenter_detect_command_name__ (const char *input,
57 size_t n, bool eof, int ofs);
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
71 mblen = u8_mbtoucr (puc, input, n);
75 return u8_mbtouc (puc, input, n);
86 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
87 bool eof, enum segment_type *type)
95 for (int ofs = 2; ; ofs++)
102 else if (input[ofs] == '\n')
104 if (input[ofs - 1] == '\r')
110 s->state = S_GENERAL;
111 s->substate = SS_START_OF_COMMAND;
121 s->state = S_GENERAL;
122 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
123 return segmenter_push (s, input, n, eof, type);
127 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
128 const char *input, size_t n, bool eof,
129 enum segment_type *type)
131 assert (s->state == S_GENERAL);
137 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
/* Scans the N bytes of INPUT, starting at OFS, for the end of a comment.
   Each byte is tested for a new-line and for '*'; after a '*' the following
   byte is tested for '/'.

   When the available bytes run out, the result is -1 unless EOF is set.
   NOTE(review): the values returned on a new-line or on a complete
   star-slash pair are on lines not visible here -- confirm before relying
   on them. */
141 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
143 for (; ofs < n; ofs++)
145 if (input[ofs] == '\n')
147 else if (input[ofs] == '*')
/* Presumably reached when '*' is the last byte available, so that the byte
   after it cannot be examined yet -- the guarding test is not visible. */
150 return eof ? ofs + 1 : -1;
151 else if (input[ofs + 1] == '/')
/* Ran through all N bytes without finding the end of the comment. */
155 return eof ? ofs : -1;
159 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
166 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
173 return eof ? ofs : -1;
174 else if (input[ofs + 1] != '*')
177 ofs = skip_comment (input, n, eof, ofs + 2);
181 else if (lex_uc_is_space (uc) && uc != '\n')
187 return eof ? ofs : -1;
/* Examines the byte at offset OFS in INPUT to decide whether a line ends
   there.  The visible branches recognize a '\n' directly, and a '\r' only
   when the byte after it is '\n'.
   NOTE(review): the first branch (before the `else if') and the bounds
   checks against N and EOF are not visible here. */
191 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
195 else if (input[ofs] == '\n')
197 else if (input[ofs] == '\r')
/* A carriage return counts only as part of a "\r\n" pair. */
201 return input[ofs + 1] == '\n';
/* Skips any spaces and comments in INPUT starting at OFS, using
   skip_spaces_and_comments(), and then returns what is_end_of_line() reports
   for the position reached.
   NOTE(review): how a negative result from skip_spaces_and_comments() is
   handled lies on lines not visible here. */
208 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
210 ofs = skip_spaces_and_comments (input, n, eof, ofs);
214 return is_end_of_line (input, n, eof, ofs);
/* Handles a line terminator at the very start of INPUT.  Two forms are
   accepted: a lone '\n', or a '\r' immediately followed by '\n'.  Anything
   else violates the assertions below, so callers must only invoke this when
   INPUT is known to begin with a line end.
   NOTE(review): the return values and the value stored in *TYPE are on lines
   not visible here. */
218 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
219 enum segment_type *type)
223 if (input[0] == '\n')
/* Not a bare new-line, so it must be a carriage return/new-line pair. */
233 assert (input[0] == '\r');
234 assert (input[1] == '\n');
243 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
250 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
254 if (!lex_uc_is_space (uc) || uc == '\n')
260 return eof ? ofs : -1;
/* Advances through INPUT from offset OFS while the bytes are digits
   according to c_isdigit().  If all N bytes are used up without meeting a
   non-digit, the result is OFS (which then equals N or the starting offset
   beyond it) when EOF is set, and -1 otherwise.
   NOTE(review): the statement executed on the first non-digit is not visible
   here; presumably it returns that byte's offset -- confirm. */
264 skip_digits (const char *input, size_t n, bool eof, int ofs)
266 for (; ofs < n; ofs++)
267 if (!c_isdigit (input[ofs]))
269 return eof ? ofs : -1;
273 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
274 bool eof, enum segment_type *type)
278 assert (s->state == S_GENERAL);
280 ofs = skip_digits (input, n, eof, 0);
290 if (input[ofs] == '.')
299 ofs = skip_digits (input, n, eof, ofs + 1);
306 if (input[ofs] == 'e' || input[ofs] == 'E')
313 goto expected_exponent;
316 if (input[ofs] == '+' || input[ofs] == '-')
323 goto expected_exponent;
327 if (!c_isdigit (input[ofs]))
328 goto expected_exponent;
330 ofs = skip_digits (input, n, eof, ofs);
335 if (input[ofs - 1] == '.')
337 int eol = at_end_of_line (input, n, eof, ofs);
350 *type = SEG_EXPECTED_EXPONENT;
/* Helper for recognizing reserved words.  The leading characters of S are
   upper-cased with c_toupper() and compared against fixed spellings:
   two letters (BY, EQ, GE, GT, LE, LT, NE, OR, TO), three letters (ALL, AND,
   NOT), and four letters (WITH).
   NOTE(review): the code that chooses among these groups is not visible
   here; presumably it dispatches on N, the length of S -- confirm. */
356 is_reserved_word (const char *s, int n)
360 s0 = c_toupper (s[0]);
/* Two-letter words. */
364 s1 = c_toupper (s[1]);
365 return ((s0 == 'B' && s1 == 'Y')
366 || (s0 == 'E' && s1 == 'Q')
367 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
368 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
369 || (s0 == 'N' && s1 == 'E')
370 || (s0 == 'O' && s1 == 'R')
371 || (s0 == 'T' && s1 == 'O'));
/* Three-letter words. */
374 s1 = c_toupper (s[1]);
375 s2 = c_toupper (s[2]);
376 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
377 || (s1 == 'N' && s2 == 'D')))
378 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
/* The only four-letter word. */
381 s1 = c_toupper (s[1]);
382 s2 = c_toupper (s[2]);
383 s3 = c_toupper (s[3]);
384 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
392 segmenter_parse_comment_1__ (struct segmenter *s,
393 const char *input, size_t n, bool eof,
394 enum segment_type *type)
406 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
417 if (ofs > 1 && input[ofs - 1] == '\r')
421 /* Blank line ends comment command. */
422 s->state = S_GENERAL;
423 s->substate = SS_START_OF_COMMAND;
424 *type = SEG_SEPARATE_COMMANDS;
427 else if (endcmd >= 0)
429 /* '.' at end of line ends comment command. */
430 s->state = S_GENERAL;
432 *type = SEG_COMMENT_COMMAND;
437 /* Comment continues onto next line. */
438 *type = SEG_COMMENT_COMMAND;
439 s->state = S_COMMENT_2;
445 if (!lex_uc_is_space (uc))
456 s->state = S_GENERAL;
457 s->substate = SS_START_OF_COMMAND;
458 *type = SEG_SEPARATE_COMMANDS;
466 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
467 size_t n, bool eof, enum segment_type *type)
469 int ofs = segmenter_parse_newline__ (input, n, eof, type);
483 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
487 if (uc == '+' || uc == '-' || uc == '.')
489 else if (!lex_uc_is_space (uc))
492 case SEG_MODE_INTERACTIVE:
501 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
515 s->state = S_GENERAL;
516 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
519 s->state = S_COMMENT_1;
524 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
525 bool eof, enum segment_type *type)
537 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
548 if (ofs > 1 && input[ofs - 1] == '\r')
551 *type = SEG_DOCUMENT;
552 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
556 if (!lex_uc_is_space (uc))
565 *type = SEG_DOCUMENT;
566 s->state = S_DOCUMENT_3;
573 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
574 bool eof, enum segment_type *type)
578 ofs = segmenter_parse_newline__ (input, n, eof, type);
582 s->state = S_DOCUMENT_1;
/* Takes no input text.  Reports a SEG_END_COMMAND segment in *TYPE and puts
   S back into state S_GENERAL, flagged as being at the start of both a line
   and a command.
   NOTE(review): the return value is on a line not visible here. */
587 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
589 *type = SEG_END_COMMAND;
590 s->state = S_GENERAL;
591 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
596 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
599 ofs = skip_spaces_and_comments (input, n, eof, ofs);
605 return c != '\'' && c != '"' && c != '\n';
615 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
616 bool eof, int ofs, char id[], size_t id_size)
618 struct segmenter sub;
620 assert (id_size > 0);
623 sub.state = S_GENERAL;
627 enum segment_type type;
630 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
646 if (retval < id_size)
648 memcpy (id, input + ofs, retval);
655 case SEG_QUOTED_STRING:
657 case SEG_UNICODE_STRING:
658 case SEG_UNQUOTED_STRING:
659 case SEG_RESERVED_WORD:
661 case SEG_COMMENT_COMMAND:
662 case SEG_DO_REPEAT_COMMAND:
663 case SEG_INLINE_DATA:
664 case SEG_START_DOCUMENT:
666 case SEG_START_COMMAND:
667 case SEG_SEPARATE_COMMANDS:
668 case SEG_END_COMMAND:
670 case SEG_EXPECTED_QUOTE:
671 case SEG_EXPECTED_EXPONENT:
672 case SEG_UNEXPECTED_DOT:
673 case SEG_UNEXPECTED_CHAR:
681 /* Called when INPUT begins with a character that can start off an ID token. */
683 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
684 bool eof, enum segment_type *type)
690 assert (s->state == S_GENERAL);
692 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
704 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
707 else if (!lex_uc_is_idn (uc))
713 if (input[ofs - 1] == '.')
715 int eol = at_end_of_line (input, n, eof, ofs);
722 if (is_reserved_word (input, ofs))
723 *type = SEG_RESERVED_WORD;
725 *type = SEG_IDENTIFIER;
727 if (s->substate & SS_START_OF_COMMAND)
729 struct substring word = ss_buffer (input, ofs);
731 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
733 s->state = S_COMMENT_1;
734 return segmenter_parse_comment_1__ (s, input, n, eof, type);
736 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
738 s->state = S_DOCUMENT_1;
739 *type = SEG_START_DOCUMENT;
742 else if (lex_id_match (ss_cstr ("TITLE"), word)
743 || lex_id_match (ss_cstr ("SUBTITLE"), word))
745 int result = segmenter_unquoted (input, n, eof, ofs);
750 s->state = S_TITLE_1;
754 else if (lex_id_match (ss_cstr ("FILE"), word))
758 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
760 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
762 s->state = S_FILE_LABEL;
767 else if (lex_id_match (ss_cstr ("DO"), word))
771 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
773 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
775 s->state = S_DO_REPEAT_1;
780 else if (lex_id_match (ss_cstr ("BEGIN"), word))
785 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
788 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
792 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
798 else if (input[ofs2] == '.')
800 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
805 eol = is_end_of_line (input, n, eof, ofs2);
810 if (memchr (input, '\n', ofs2))
811 s->state = S_BEGIN_DATA_1;
813 s->state = S_BEGIN_DATA_2;
826 segmenter_parse_string__ (enum segment_type string_type,
827 int ofs, struct segmenter *s,
828 const char *input, size_t n, bool eof,
829 enum segment_type *type)
831 int quote = input[ofs];
835 if (input[ofs] == quote)
840 if (input[ofs] == quote)
853 else if (input[ofs] == '\n')
864 *type = SEG_EXPECTED_QUOTE;
870 segmenter_maybe_parse_string__ (enum segment_type string_type,
872 const char *input, size_t n, bool eof,
873 enum segment_type *type)
880 else if (input[1] == '\'' || input[1] == '"')
881 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
883 return segmenter_parse_id__ (s, input, n, eof, type);
887 segmenter_parse_mid_command__ (struct segmenter *s,
888 const char *input, size_t n, bool eof,
889 enum segment_type *type)
895 assert (s->state == S_GENERAL);
896 assert (!(s->substate & SS_START_OF_LINE));
898 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
905 s->substate |= SS_START_OF_LINE;
915 else if (input[1] == '*')
917 ofs = skip_comment (input, n, eof, 2);
929 case '(': case ')': case ',': case '=': case '-':
930 case '[': case ']': case '&': case '|': case '+':
936 if (s->substate & SS_START_OF_COMMAND)
938 /* '*' at the beginning of a command begins a comment. */
939 s->state = S_COMMENT_1;
940 return segmenter_parse_comment_1__ (s, input, n, eof, type);
943 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
946 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
949 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
952 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
960 else if (c_isdigit (input[1]))
961 return segmenter_parse_number__ (s, input, n, eof, type);
963 int eol = at_end_of_line (input, n, eof, 1);
969 *type = SEG_END_COMMAND;
970 s->substate = SS_START_OF_COMMAND;
973 *type = SEG_UNEXPECTED_DOT;
976 case '0': case '1': case '2': case '3': case '4':
977 case '5': case '6': case '7': case '8': case '9':
978 return segmenter_parse_number__ (s, input, n, eof, type);
981 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
982 s, input, n, eof, type);
985 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
986 s, input, n, eof, type);
989 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
990 s, input, n, eof, type);
993 if (lex_uc_is_space (uc))
995 ofs = skip_spaces (input, n, eof, mblen);
999 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1003 s->substate |= SS_START_OF_LINE;
1004 *type = SEG_NEWLINE;
1013 else if (lex_uc_is_id1 (uc))
1014 return segmenter_parse_id__ (s, input, n, eof, type);
1017 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback in the form expected by qsort().  A_ and B_ each point
   to an array element of type `const char *'; the two strings are compared
   without regard to case using c_strcasecmp(), whose result is returned. */
1025 compare_commands (const void *a_, const void *b_)
/* Recover the element pointers, then the strings they refer to. */
1027 const char *const *ap = a_;
1028 const char *const *bp = b_;
1029 const char *a = *ap;
1030 const char *b = *bp;
1032 return c_strcasecmp (a, b);
/* Returns a pointer into a table of command names, positioned at the first
   name whose initial character, upper-cased with c_toupper(), equals FIRST
   upper-cased.  If no name begins with that character, the result points at
   element `n_commands' of the table instead.

   The table is filled from language/command.def, sorted without regard to
   case, and indexed by first character through `cindex'. */
1035 static const char **
1036 segmenter_get_command_name_candidates (unsigned char first)
/* Both kinds of entry in command.def expand to just the command's name. */
1038 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1039 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1040 static const char *commands[] =
1042 #include "language/command.def"
/* One less than the array size.  NOTE(review): presumably the array ends
   with an extra sentinel element that is not a command name; its definition
   is not visible here -- confirm. */
1045 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
/* cindex[C] points at the first entry in `commands' that starts with C. */
1051 static const char **cindex[UCHAR_MAX + 1];
1059 qsort (commands, n_commands, sizeof *commands, compare_commands);
/* Because the table is sorted, the first entry seen for each initial
   character is the start of that character's run of names. */
1060 for (i = 0; i < n_commands; i++)
1062 unsigned char c = c_toupper (commands[i][0]);
1063 if (cindex[c] == NULL)
1064 cindex[c] = &commands[i];
/* Characters that start no command are pointed at the element just past the
   last command, so that no index entry is left null. */
1066 for (i = 0; i <= UCHAR_MAX; i++)
1067 if (cindex[i] == NULL)
1068 cindex[i] = &commands[n_commands];
1071 return cindex[c_toupper (first)];
1075 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1078 const char **commands;
1095 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1100 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1108 if (input[ofs - 1] == '.')
1111 for (commands = segmenter_get_command_name_candidates (input[0]);
1112 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1118 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1119 &exact, &missing_words)
1120 && missing_words <= 0)
1128 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1131 return eof ? 0 : -1;
1134 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1137 return eof ? 0 : -1;
1139 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1142 return c == '\'' || c == '"' || c == '\n';
1146 segmenter_parse_start_of_line__ (struct segmenter *s,
1147 const char *input, size_t n, bool eof,
1148 enum segment_type *type)
1154 assert (s->state == S_GENERAL);
1155 assert (s->substate & SS_START_OF_LINE);
1157 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1164 ofs = skip_spaces_and_comments (input, n, eof, 1);
1169 int is_string = is_start_of_string__ (input, n, eof, ofs);
1174 /* This is punctuation that may separate pieces of a string. */
1184 *type = SEG_START_COMMAND;
1185 s->substate = SS_START_OF_COMMAND;
1189 if (lex_uc_is_space (uc))
1191 int eol = at_end_of_line (input, n, eof, 0);
1196 s->substate = SS_START_OF_COMMAND;
1197 *type = SEG_SEPARATE_COMMANDS;
1203 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1205 else if (s->mode == SEG_MODE_AUTO)
1207 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1214 assert (s->mode == SEG_MODE_BATCH);
1216 s->substate = SS_START_OF_COMMAND;
1217 *type = SEG_START_COMMAND;
1221 s->substate = SS_START_OF_COMMAND;
1222 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1226 segmenter_parse_file_label__ (struct segmenter *s,
1227 const char *input, size_t n, bool eof,
1228 enum segment_type *type)
1230 struct segmenter sub;
1234 sub.state = S_GENERAL;
1235 ofs = segmenter_push (&sub, input, n, eof, type);
1239 else if (*type == SEG_IDENTIFIER)
1243 assert (lex_id_match (ss_cstr ("LABEL"),
1244 ss_buffer ((char *) input, ofs)));
1245 result = segmenter_unquoted (input, n, eof, ofs);
1251 s->state = S_TITLE_1;
1259 s->substate = sub.substate;
/* Segments INPUT as ordinary syntax on behalf of a state-specific parser.
   A scratch segmenter SUB is placed in state S_GENERAL with a copy of S's
   substate, segmenter_push() is run on it, and the substate that results is
   copied back into S.  The visible lines do not modify S->state, which
   leaves the caller free to decide the next state from *TYPE.
   NOTE(review): any other initialization of SUB and the return statement are
   on lines not visible here. */
1265 segmenter_subparse (struct segmenter *s,
1266 const char *input, size_t n, bool eof,
1267 enum segment_type *type)
1269 struct segmenter sub;
1273 sub.state = S_GENERAL;
1274 sub.substate = s->substate;
1275 ofs = segmenter_push (&sub, input, n, eof, type);
1276 s->substate = sub.substate;
1280 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1281 defines the stand-in variables (the head) before the lines of syntax to be
1282 repeated (the body). */
1284 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1285 const char *input, size_t n, bool eof,
1286 enum segment_type *type)
1288 int ofs = segmenter_subparse (s, input, n, eof, type);
1292 if (*type == SEG_SEPARATE_COMMANDS)
1294 /* We reached a blank line that separates the head from the body. */
1295 s->state = S_DO_REPEAT_2;
1297 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1299 /* We reached the body. */
1300 s->state = S_DO_REPEAT_3;
1307 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1308 separates the head from the body. */
1310 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1311 const char *input, size_t n, bool eof,
1312 enum segment_type *type)
1314 int ofs = segmenter_subparse (s, input, n, eof, type);
1318 if (*type == SEG_NEWLINE)
1320 /* We reached the body. */
1321 s->state = S_DO_REPEAT_3;
1329 check_repeat_command (struct segmenter *s,
1330 const char *input, size_t n, bool eof)
1337 if (input[ofs] == '+' || input[ofs] == '-')
1340 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1343 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1345 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1350 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1354 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1355 s->substate += direction;
/* Finds the extent of the line at the start of INPUT by searching its N
   bytes for the first '\n'.

   A line that consists of nothing but its terminator ('\n' alone, or '\r'
   then '\n') is reported as SEG_NEWLINE in *TYPE.  For any other line the
   result is the offset of the terminator, so that neither the '\n' nor a
   '\r' directly before it is counted. */
1360 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1361 enum segment_type *type)
1363 const char *newline = memchr (input, '\n', n);
/* NOTE(review): presumably taken when no new-line was found (the test is not
   visible): at EOF all N bytes form the line, otherwise more input is
   needed. */
1365 return eof ? n : -1;
1367 ptrdiff_t ofs = newline - input;
/* Nothing precedes the line terminator. */
1368 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1370 *type = SEG_NEWLINE;
/* Drop a carriage return that immediately precedes the new-line. */
1374 return ofs - (input[ofs - 1] == '\r');
1377 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1378 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1380 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1381 the lines we're segmenting. s->substate counts the nesting level, starting
1384 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1385 const char *input, size_t n, bool eof,
1386 enum segment_type *type)
1390 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1391 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1393 else if (!check_repeat_command (s, input, n, eof) && !eof)
1395 else if (s->substate == 0)
1397 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1399 s->state = S_GENERAL;
1400 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1401 return segmenter_push (s, input, n, eof, type);
1405 *type = SEG_DO_REPEAT_COMMAND;
/* Handler for state S_BEGIN_DATA_1.  Passes INPUT through
   segmenter_subparse() so that it is segmented as ordinary syntax, and moves
   S on to S_BEGIN_DATA_2 once a SEG_NEWLINE segment has been produced.
   NOTE(review): the check on OFS and the return statement are on lines not
   visible here. */
1411 segmenter_parse_begin_data_1__ (struct segmenter *s,
1412 const char *input, size_t n, bool eof,
1413 enum segment_type *type)
1415 int ofs = segmenter_subparse (s, input, n, eof, type);
1419 if (*type == SEG_NEWLINE)
1420 s->state = S_BEGIN_DATA_2;
/* Handler for state S_BEGIN_DATA_2.  Passes INPUT through
   segmenter_subparse() so that it is segmented as ordinary syntax, and moves
   S on to S_BEGIN_DATA_3 once a SEG_NEWLINE segment has been produced.
   NOTE(review): the check on OFS and the return statement are on lines not
   visible here. */
1426 segmenter_parse_begin_data_2__ (struct segmenter *s,
1427 const char *input, size_t n, bool eof,
1428 enum segment_type *type)
1430 int ofs = segmenter_subparse (s, input, n, eof, type);
1434 if (*type == SEG_NEWLINE)
1435 s->state = S_BEGIN_DATA_3;
/* Tests whether the N bytes at INPUT read as an END DATA line.  The words
   "END" and "DATA" are matched without regard to case using
   c_strncasecmp(); the characters after each word are decoded with
   u8_mbtouc() and tested with lex_uc_is_space().
   NOTE(review): the loops, the values returned, and the handling of a
   trailing character that is not a space are on lines not visible here. */
1441 is_end_data (const char *input, size_t n)
1443 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
/* Must start with "END" and have at least one more byte after it. */
1449 if (n < 4 || c_strncasecmp (input, "END", 3))
1453 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1454 if (!lex_uc_is_space (uc))
/* The next word must be "DATA". */
1458 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1465 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1472 else if (!lex_uc_is_space (uc))
/* Handler for state S_BEGIN_DATA_3.  Takes one whole line from INPUT with
   segmenter_parse_full_line__() and classifies it:

   - If is_end_data() accepts the line, S returns to S_GENERAL at the start
     of a line and a command, and the same input is segmented again as
     ordinary syntax through segmenter_push().

   - Otherwise the line is reported as SEG_INLINE_DATA and S moves to
     S_BEGIN_DATA_4.

   NOTE(review): the first branch (before the `else if') is on a line not
   visible here. */
1481 segmenter_parse_begin_data_3__ (struct segmenter *s,
1482 const char *input, size_t n, bool eof,
1483 enum segment_type *type)
1487 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1490 else if (is_end_data (input, ofs))
1492 s->state = S_GENERAL;
1493 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1494 return segmenter_push (s, input, n, eof, type);
1498 *type = SEG_INLINE_DATA;
1499 s->state = S_BEGIN_DATA_4;
/* Yields a zero-length segment when the byte before OFS is a new-line. */
1500 return input[ofs - 1] == '\n' ? 0 : ofs;
1505 segmenter_parse_begin_data_4__ (struct segmenter *s,
1506 const char *input, size_t n, bool eof,
1507 enum segment_type *type)
1511 ofs = segmenter_parse_newline__ (input, n, eof, type);
1515 s->state = S_BEGIN_DATA_3;
1520 segmenter_parse_title_1__ (struct segmenter *s,
1521 const char *input, size_t n, bool eof,
1522 enum segment_type *type)
1526 ofs = skip_spaces (input, n, eof, 0);
1529 s->state = S_TITLE_2;
1535 segmenter_parse_title_2__ (struct segmenter *s,
1536 const char *input, size_t n, bool eof,
1537 enum segment_type *type)
1549 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1563 if (!lex_uc_is_space (uc))
1574 s->state = S_GENERAL;
1576 *type = SEG_UNQUOTED_STRING;
1577 return endcmd >= 0 ? endcmd : ofs;
1583 /* Returns the name of segment TYPE as a string. The caller must not modify
1584 or free the returned string.
1586 This is useful only for debugging and testing. */
1588 segment_type_to_string (enum segment_type type)
1592 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1596 return "unknown segment type";
1600 /* Initializes S as a segmenter with the given syntax MODE.
1602 A segmenter does not contain any external references, so nothing needs to be
1603 done to destroy one. For the same reason, segmenters may be copied with
1604 plain struct assignment (or memcpy). */
1606 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1608 s->state = S_SHBANG;
1613 /* Returns the mode passed to segmenter_init() for S. */
1615 segmenter_get_mode (const struct segmenter *s)
1620 /* Attempts to label a prefix of S's remaining input with a segment type. The
1621 caller supplies the first N bytes of the remaining input as INPUT, which
1622 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1623 are the entire (remainder) of the input; if EOF is false, then further input
1624 is potentially available.
1626 The input may contain '\n' or '\r\n' line ends in any combination.
1628 If successful, returns the number of bytes in the segment at the beginning
1629 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1630 into *TYPE. The next call to segmenter_push() should not include those
1631 bytes as part of INPUT, because they have (figuratively) been consumed by
1634 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1635 be determined. In this case segmenter_push() returns -1. If more input is
1636 available, the caller should obtain some more, then call again with a larger
1637 N. If this is not enough, the process might need to repeat again and again.
1638 If input is exhausted, then the caller may call again setting EOF to true.
1639 segmenter_push() will never return -1 when EOF is true.
1641 The caller must not, in a sequence of calls, supply contradictory input.
1642 That is, bytes provided as part of INPUT in one call, but not consumed, must
1643 not be provided with *different* values on subsequent calls. This is
1644 because segmenter_push() must often make decisions based on looking ahead
1645 beyond the bytes that it consumes. */
1647 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1648 enum segment_type *type)
1664 return segmenter_parse_shbang__ (s, input, n, eof, type);
1667 return (s->substate & SS_START_OF_LINE
1668 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1669 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1672 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1674 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1677 return segmenter_parse_document_1__ (s, input, n, eof, type);
1679 return segmenter_parse_document_2__ (s, input, n, eof, type);
1681 return segmenter_parse_document_3__ (s, type);
1684 return segmenter_parse_file_label__ (s, input, n, eof, type);
1687 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1689 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1691 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1693 case S_BEGIN_DATA_1:
1694 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1695 case S_BEGIN_DATA_2:
1696 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1697 case S_BEGIN_DATA_3:
1698 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1699 case S_BEGIN_DATA_4:
1700 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1703 return segmenter_parse_title_1__ (s, input, n, eof, type);
1705 return segmenter_parse_title_2__ (s, input, n, eof, type);
1711 /* Returns the style of command prompt to display to an interactive user for
1712 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1713 and at the beginning of a line (that is, if segmenter_push() consumed as
1714 much as possible of the input up to a new-line). */
1716 segmenter_get_prompt (const struct segmenter *s)
1721 return PROMPT_FIRST;
1724 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1728 return PROMPT_COMMENT;
1732 return PROMPT_DOCUMENT;
1734 return PROMPT_FIRST;
1737 return PROMPT_LATER;
1741 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1743 return PROMPT_DO_REPEAT;
1745 case S_BEGIN_DATA_1:
1746 return PROMPT_FIRST;
1747 case S_BEGIN_DATA_2:
1748 return PROMPT_LATER;
1749 case S_BEGIN_DATA_3:
1750 case S_BEGIN_DATA_4:
1755 return PROMPT_FIRST;