1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
56 static int segmenter_detect_command_name__ (const char *input,
57 size_t n, bool eof, int ofs);
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
71 mblen = u8_mbtoucr (puc, input, n);
75 return u8_mbtouc (puc, input, n);
86 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
87 bool eof, enum segment_type *type)
95 for (int ofs = 2; ; ofs++)
102 else if (input[ofs] == '\n')
104 if (input[ofs - 1] == '\r')
110 s->state = S_GENERAL;
111 s->substate = SS_START_OF_COMMAND;
121 s->state = S_GENERAL;
122 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
123 return segmenter_push (s, input, n, eof, type);
127 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
128 const char *input, size_t n, bool eof,
129 enum segment_type *type)
131 assert (s->state == S_GENERAL);
137 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
141 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
143 for (; ofs < n; ofs++)
145 if (input[ofs] == '\n')
147 else if (input[ofs] == '*')
150 return eof ? ofs + 1 : -1;
151 else if (input[ofs + 1] == '/')
155 return eof ? ofs : -1;
159 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
166 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
173 return eof ? ofs : -1;
174 else if (input[ofs + 1] != '*')
177 ofs = skip_comment (input, n, eof, ofs + 2);
181 else if (lex_uc_is_space (uc) && uc != '\n')
187 return eof ? ofs : -1;
191 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
195 else if (input[ofs] == '\n')
197 else if (input[ofs] == '\r')
201 return input[ofs + 1] == '\n';
208 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
210 ofs = skip_spaces_and_comments (input, n, eof, ofs);
214 return is_end_of_line (input, n, eof, ofs);
218 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
219 enum segment_type *type)
223 if (input[0] == '\n')
233 assert (input[0] == '\r');
234 assert (input[1] == '\n');
243 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
250 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
254 if (!lex_uc_is_space (uc) || uc == '\n')
260 return eof ? ofs : -1;
264 skip_digits (const char *input, size_t n, bool eof, int ofs)
266 for (; ofs < n; ofs++)
267 if (!c_isdigit (input[ofs]))
269 return eof ? ofs : -1;
273 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
274 bool eof, enum segment_type *type)
278 assert (s->state == S_GENERAL);
280 ofs = skip_digits (input, n, eof, 0);
290 if (input[ofs] == '.')
299 ofs = skip_digits (input, n, eof, ofs + 1);
306 if (input[ofs] == 'e' || input[ofs] == 'E')
313 goto expected_exponent;
316 if (input[ofs] == '+' || input[ofs] == '-')
323 goto expected_exponent;
327 if (!c_isdigit (input[ofs]))
328 goto expected_exponent;
330 ofs = skip_digits (input, n, eof, ofs);
335 if (input[ofs - 1] == '.')
337 int eol = at_end_of_line (input, n, eof, ofs);
350 *type = SEG_EXPECTED_EXPONENT;
356 is_reserved_word (const char *s, int n)
360 s0 = c_toupper (s[0]);
364 s1 = c_toupper (s[1]);
365 return ((s0 == 'B' && s1 == 'Y')
366 || (s0 == 'E' && s1 == 'Q')
367 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
368 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
369 || (s0 == 'N' && s1 == 'E')
370 || (s0 == 'O' && s1 == 'R')
371 || (s0 == 'T' && s1 == 'O'));
374 s1 = c_toupper (s[1]);
375 s2 = c_toupper (s[2]);
376 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
377 || (s1 == 'N' && s2 == 'D')))
378 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
381 s1 = c_toupper (s[1]);
382 s2 = c_toupper (s[2]);
383 s3 = c_toupper (s[3]);
384 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
392 segmenter_parse_comment_1__ (struct segmenter *s,
393 const char *input, size_t n, bool eof,
394 enum segment_type *type)
406 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
417 if (ofs > 1 && input[ofs - 1] == '\r')
421 /* Blank line ends comment command. */
422 s->state = S_GENERAL;
423 s->substate = SS_START_OF_COMMAND;
424 *type = SEG_SEPARATE_COMMANDS;
427 else if (endcmd >= 0)
429 /* '.' at end of line ends comment command. */
430 s->state = S_GENERAL;
432 *type = SEG_COMMENT_COMMAND;
437 /* Comment continues onto next line. */
438 *type = SEG_COMMENT_COMMAND;
439 s->state = S_COMMENT_2;
445 if (!lex_uc_is_space (uc))
456 s->state = S_GENERAL;
457 s->substate = SS_START_OF_COMMAND;
458 *type = SEG_SEPARATE_COMMANDS;
466 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
467 size_t n, bool eof, enum segment_type *type)
469 int ofs = segmenter_parse_newline__ (input, n, eof, type);
483 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
487 if (uc == '+' || uc == '-' || uc == '.')
489 else if (!lex_uc_is_space (uc))
492 case SEG_MODE_INTERACTIVE:
501 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
515 s->state = S_GENERAL;
516 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
519 s->state = S_COMMENT_1;
524 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
525 bool eof, enum segment_type *type)
537 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
548 if (ofs > 1 && input[ofs - 1] == '\r')
551 *type = SEG_DOCUMENT;
552 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
556 if (!lex_uc_is_space (uc))
565 *type = SEG_DOCUMENT;
566 s->state = S_DOCUMENT_3;
573 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
574 bool eof, enum segment_type *type)
578 ofs = segmenter_parse_newline__ (input, n, eof, type);
582 s->state = S_DOCUMENT_1;
587 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
589 *type = SEG_END_COMMAND;
590 s->state = S_GENERAL;
591 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
596 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
599 ofs = skip_spaces_and_comments (input, n, eof, ofs);
605 return c != '\'' && c != '"' && c != '\n';
615 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
616 bool eof, int ofs, char id[], size_t id_size)
618 struct segmenter sub;
620 assert (id_size > 0);
623 sub.state = S_GENERAL;
627 enum segment_type type;
630 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
646 if (retval < id_size)
648 memcpy (id, input + ofs, retval);
656 case SEG_QUOTED_STRING:
658 case SEG_UNICODE_STRING:
659 case SEG_UNQUOTED_STRING:
660 case SEG_RESERVED_WORD:
662 case SEG_COMMENT_COMMAND:
663 case SEG_DO_REPEAT_COMMAND:
664 case SEG_INLINE_DATA:
665 case SEG_START_DOCUMENT:
667 case SEG_START_COMMAND:
668 case SEG_SEPARATE_COMMANDS:
669 case SEG_END_COMMAND:
671 case SEG_EXPECTED_QUOTE:
672 case SEG_EXPECTED_EXPONENT:
673 case SEG_UNEXPECTED_DOT:
674 case SEG_UNEXPECTED_CHAR:
682 /* Called when INPUT begins with a character that can start off an ID token. */
684 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
685 bool eof, enum segment_type *type)
691 assert (s->state == S_GENERAL);
693 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
705 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
708 else if (!lex_uc_is_idn (uc))
714 if (input[ofs - 1] == '.')
716 int eol = at_end_of_line (input, n, eof, ofs);
723 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
724 : input[0] == '!' ? SEG_MACRO_ID
727 if (s->substate & SS_START_OF_COMMAND)
729 struct substring word = ss_buffer (input, ofs);
731 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
733 s->state = S_COMMENT_1;
734 return segmenter_parse_comment_1__ (s, input, n, eof, type);
736 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
738 s->state = S_DOCUMENT_1;
739 *type = SEG_START_DOCUMENT;
742 else if (lex_id_match (ss_cstr ("TITLE"), word)
743 || lex_id_match (ss_cstr ("SUBTITLE"), word))
745 int result = segmenter_unquoted (input, n, eof, ofs);
750 s->state = S_TITLE_1;
754 else if (lex_id_match (ss_cstr ("FILE"), word))
758 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
760 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
762 s->state = S_FILE_LABEL;
767 else if (lex_id_match (ss_cstr ("DO"), word))
771 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
773 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
775 s->state = S_DO_REPEAT_1;
780 else if (lex_id_match (ss_cstr ("BEGIN"), word))
785 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
788 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
792 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
798 else if (input[ofs2] == '.')
800 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
805 eol = is_end_of_line (input, n, eof, ofs2);
810 if (memchr (input, '\n', ofs2))
811 s->state = S_BEGIN_DATA_1;
813 s->state = S_BEGIN_DATA_2;
826 segmenter_parse_string__ (enum segment_type string_type,
827 int ofs, struct segmenter *s,
828 const char *input, size_t n, bool eof,
829 enum segment_type *type)
831 int quote = input[ofs];
835 if (input[ofs] == quote)
840 if (input[ofs] == quote)
853 else if (input[ofs] == '\n')
864 *type = SEG_EXPECTED_QUOTE;
870 segmenter_maybe_parse_string__ (enum segment_type string_type,
872 const char *input, size_t n, bool eof,
873 enum segment_type *type)
880 else if (input[1] == '\'' || input[1] == '"')
881 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
883 return segmenter_parse_id__ (s, input, n, eof, type);
887 segmenter_parse_mid_command__ (struct segmenter *s,
888 const char *input, size_t n, bool eof,
889 enum segment_type *type)
895 assert (s->state == S_GENERAL);
896 assert (!(s->substate & SS_START_OF_LINE));
898 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
905 s->substate |= SS_START_OF_LINE;
915 else if (input[1] == '*')
917 ofs = skip_comment (input, n, eof, 2);
929 case '(': case ')': case ',': case '=': case '-':
930 case '[': case ']': case '&': case '|': case '+':
936 if (s->substate & SS_START_OF_COMMAND)
938 /* '*' at the beginning of a command begins a comment. */
939 s->state = S_COMMENT_1;
940 return segmenter_parse_comment_1__ (s, input, n, eof, type);
943 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
946 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
949 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
952 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
960 else if (c_isdigit (input[1]))
961 return segmenter_parse_number__ (s, input, n, eof, type);
963 int eol = at_end_of_line (input, n, eof, 1);
969 *type = SEG_END_COMMAND;
970 s->substate = SS_START_OF_COMMAND;
973 *type = SEG_UNEXPECTED_DOT;
976 case '0': case '1': case '2': case '3': case '4':
977 case '5': case '6': case '7': case '8': case '9':
978 return segmenter_parse_number__ (s, input, n, eof, type);
981 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
982 s, input, n, eof, type);
985 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
986 s, input, n, eof, type);
989 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
990 s, input, n, eof, type);
993 return segmenter_parse_id__ (s, input, n, eof, type);
996 if (lex_uc_is_space (uc))
998 ofs = skip_spaces (input, n, eof, mblen);
1002 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1006 s->substate |= SS_START_OF_LINE;
1007 *type = SEG_NEWLINE;
1016 else if (lex_uc_is_id1 (uc))
1017 return segmenter_parse_id__ (s, input, n, eof, type);
1018 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1026 *type = SEG_UNEXPECTED_CHAR;
1034 compare_commands (const void *a_, const void *b_)
1036 const char *const *ap = a_;
1037 const char *const *bp = b_;
1038 const char *a = *ap;
1039 const char *b = *bp;
1041 return c_strcasecmp (a, b);
1044 static const char **
1045 segmenter_get_command_name_candidates (unsigned char first)
1047 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1048 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1049 static const char *commands[] =
1051 #include "language/command.def"
1054 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1060 static const char **cindex[UCHAR_MAX + 1];
1068 qsort (commands, n_commands, sizeof *commands, compare_commands);
1069 for (i = 0; i < n_commands; i++)
1071 unsigned char c = c_toupper (commands[i][0]);
1072 if (cindex[c] == NULL)
1073 cindex[c] = &commands[i];
1075 for (i = 0; i <= UCHAR_MAX; i++)
1076 if (cindex[i] == NULL)
1077 cindex[i] = &commands[n_commands];
1080 return cindex[c_toupper (first)];
1084 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1087 const char **commands;
1104 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1109 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1117 if (input[ofs - 1] == '.')
1120 for (commands = segmenter_get_command_name_candidates (input[0]);
1121 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1127 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1128 &exact, &missing_words)
1129 && missing_words <= 0)
1137 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1140 return eof ? 0 : -1;
1143 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1146 return eof ? 0 : -1;
1148 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1151 return c == '\'' || c == '"' || c == '\n';
1155 segmenter_parse_start_of_line__ (struct segmenter *s,
1156 const char *input, size_t n, bool eof,
1157 enum segment_type *type)
1163 assert (s->state == S_GENERAL);
1164 assert (s->substate & SS_START_OF_LINE);
1166 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1173 ofs = skip_spaces_and_comments (input, n, eof, 1);
1178 int is_string = is_start_of_string__ (input, n, eof, ofs);
1183 /* This is punctuation that may separate pieces of a string. */
1193 *type = SEG_START_COMMAND;
1194 s->substate = SS_START_OF_COMMAND;
1198 if (lex_uc_is_space (uc))
1200 int eol = at_end_of_line (input, n, eof, 0);
1205 s->substate = SS_START_OF_COMMAND;
1206 *type = SEG_SEPARATE_COMMANDS;
1212 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1214 else if (s->mode == SEG_MODE_AUTO)
1216 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1223 assert (s->mode == SEG_MODE_BATCH);
1225 s->substate = SS_START_OF_COMMAND;
1226 *type = SEG_START_COMMAND;
1230 s->substate = SS_START_OF_COMMAND;
1231 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1235 segmenter_parse_file_label__ (struct segmenter *s,
1236 const char *input, size_t n, bool eof,
1237 enum segment_type *type)
1239 struct segmenter sub;
1243 sub.state = S_GENERAL;
1244 ofs = segmenter_push (&sub, input, n, eof, type);
1248 else if (*type == SEG_IDENTIFIER)
1252 assert (lex_id_match (ss_cstr ("LABEL"),
1253 ss_buffer ((char *) input, ofs)));
1254 result = segmenter_unquoted (input, n, eof, ofs);
1260 s->state = S_TITLE_1;
1268 s->substate = sub.substate;
1274 segmenter_subparse (struct segmenter *s,
1275 const char *input, size_t n, bool eof,
1276 enum segment_type *type)
1278 struct segmenter sub;
1282 sub.state = S_GENERAL;
1283 sub.substate = s->substate;
1284 ofs = segmenter_push (&sub, input, n, eof, type);
1285 s->substate = sub.substate;
1289 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1290 defines the stand-in variables (the head) before the lines of syntax to be
1291 repeated (the body). */
1293 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1294 const char *input, size_t n, bool eof,
1295 enum segment_type *type)
1297 int ofs = segmenter_subparse (s, input, n, eof, type);
1301 if (*type == SEG_SEPARATE_COMMANDS)
1303 /* We reached a blank line that separates the head from the body. */
1304 s->state = S_DO_REPEAT_2;
1306 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1308 /* We reached the body. */
1309 s->state = S_DO_REPEAT_3;
1316 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1317 separates the head from the body. */
1319 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1320 const char *input, size_t n, bool eof,
1321 enum segment_type *type)
1323 int ofs = segmenter_subparse (s, input, n, eof, type);
1327 if (*type == SEG_NEWLINE)
1329 /* We reached the body. */
1330 s->state = S_DO_REPEAT_3;
1338 check_repeat_command (struct segmenter *s,
1339 const char *input, size_t n, bool eof)
1346 if (input[ofs] == '+' || input[ofs] == '-')
1349 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1352 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1354 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1359 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1363 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1364 s->substate += direction;
1369 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1370 enum segment_type *type)
1372 const char *newline = memchr (input, '\n', n);
1374 return eof ? n : -1;
1376 ptrdiff_t ofs = newline - input;
1377 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1379 *type = SEG_NEWLINE;
1383 return ofs - (input[ofs - 1] == '\r');
1386 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1387 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1389 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1390 the lines we're segmenting. s->substate counts the nesting level, starting
1393 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1394 const char *input, size_t n, bool eof,
1395 enum segment_type *type)
1399 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1400 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1402 else if (!check_repeat_command (s, input, n, eof) && !eof)
1404 else if (s->substate == 0)
1406 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1408 s->state = S_GENERAL;
1409 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1410 return segmenter_push (s, input, n, eof, type);
1414 *type = SEG_DO_REPEAT_COMMAND;
1420 segmenter_parse_begin_data_1__ (struct segmenter *s,
1421 const char *input, size_t n, bool eof,
1422 enum segment_type *type)
1424 int ofs = segmenter_subparse (s, input, n, eof, type);
1428 if (*type == SEG_NEWLINE)
1429 s->state = S_BEGIN_DATA_2;
1435 segmenter_parse_begin_data_2__ (struct segmenter *s,
1436 const char *input, size_t n, bool eof,
1437 enum segment_type *type)
1439 int ofs = segmenter_subparse (s, input, n, eof, type);
1443 if (*type == SEG_NEWLINE)
1444 s->state = S_BEGIN_DATA_3;
1450 is_end_data (const char *input, size_t n)
1452 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1458 if (n < 4 || c_strncasecmp (input, "END", 3))
1462 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1463 if (!lex_uc_is_space (uc))
1467 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1474 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1481 else if (!lex_uc_is_space (uc))
1490 segmenter_parse_begin_data_3__ (struct segmenter *s,
1491 const char *input, size_t n, bool eof,
1492 enum segment_type *type)
1496 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1499 else if (is_end_data (input, ofs))
1501 s->state = S_GENERAL;
1502 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1503 return segmenter_push (s, input, n, eof, type);
1507 *type = SEG_INLINE_DATA;
1508 s->state = S_BEGIN_DATA_4;
1509 return input[ofs - 1] == '\n' ? 0 : ofs;
1514 segmenter_parse_begin_data_4__ (struct segmenter *s,
1515 const char *input, size_t n, bool eof,
1516 enum segment_type *type)
1520 ofs = segmenter_parse_newline__ (input, n, eof, type);
1524 s->state = S_BEGIN_DATA_3;
1529 segmenter_parse_title_1__ (struct segmenter *s,
1530 const char *input, size_t n, bool eof,
1531 enum segment_type *type)
1535 ofs = skip_spaces (input, n, eof, 0);
1538 s->state = S_TITLE_2;
1544 segmenter_parse_title_2__ (struct segmenter *s,
1545 const char *input, size_t n, bool eof,
1546 enum segment_type *type)
1558 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1572 if (!lex_uc_is_space (uc))
1583 s->state = S_GENERAL;
1585 *type = SEG_UNQUOTED_STRING;
1586 return endcmd >= 0 ? endcmd : ofs;
1592 /* Returns the name of segment TYPE as a string. The caller must not modify
1593 or free the returned string.
1595 This is useful only for debugging and testing. */
1597 segment_type_to_string (enum segment_type type)
1601 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1605 return "unknown segment type";
1609 /* Initializes S as a segmenter with the given syntax MODE.
1611 A segmenter does not contain any external references, so nothing needs to be
1612 done to destroy one. For the same reason, segmenters may be copied with
1613 plain struct assignment (or memcpy). */
1615 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1617 s->state = S_SHBANG;
1622 /* Returns the mode passed to segmenter_init() for S. */
1624 segmenter_get_mode (const struct segmenter *s)
1629 /* Attempts to label a prefix of S's remaining input with a segment type. The
1630 caller supplies the first N bytes of the remaining input as INPUT, which
1631 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1632 are the entire remainder of the input; if EOF is false, then further input
1633 is potentially available.
1635 The input may contain '\n' or '\r\n' line ends in any combination.
1637 If successful, returns the number of bytes in the segment at the beginning
1638 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1639 into *TYPE. The next call to segmenter_push() should not include those
1640 bytes as part of INPUT, because they have (figuratively) been consumed by
1643 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1644 be determined. In this case segmenter_push() returns -1. If more input is
1645 available, the caller should obtain some more, then call again with a larger
1646 N. If this is not enough, the process might need to repeat again and again.
1647 If input is exhausted, then the caller may call again setting EOF to true.
1648 segmenter_push() will never return -1 when EOF is true.
1650 The caller must not, in a sequence of calls, supply contradictory input.
1651 That is, bytes provided as part of INPUT in one call, but not consumed, must
1652 not be provided with *different* values on subsequent calls. This is
1653 because segmenter_push() must often make decisions based on looking ahead
1654 beyond the bytes that it consumes. */
1656 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1657 enum segment_type *type)
1673 return segmenter_parse_shbang__ (s, input, n, eof, type);
1676 return (s->substate & SS_START_OF_LINE
1677 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1678 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1681 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1683 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1686 return segmenter_parse_document_1__ (s, input, n, eof, type);
1688 return segmenter_parse_document_2__ (s, input, n, eof, type);
1690 return segmenter_parse_document_3__ (s, type);
1693 return segmenter_parse_file_label__ (s, input, n, eof, type);
1696 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1698 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1700 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1702 case S_BEGIN_DATA_1:
1703 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1704 case S_BEGIN_DATA_2:
1705 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1706 case S_BEGIN_DATA_3:
1707 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1708 case S_BEGIN_DATA_4:
1709 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1712 return segmenter_parse_title_1__ (s, input, n, eof, type);
1714 return segmenter_parse_title_2__ (s, input, n, eof, type);
1720 /* Returns the style of command prompt to display to an interactive user for
1721 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1722 and at the beginning of a line (that is, if segmenter_push() consumed as
1723 much as possible of the input up to a new-line). */
1725 segmenter_get_prompt (const struct segmenter *s)
1730 return PROMPT_FIRST;
1733 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1737 return PROMPT_COMMENT;
1741 return PROMPT_DOCUMENT;
1743 return PROMPT_FIRST;
1746 return PROMPT_LATER;
1750 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1752 return PROMPT_DO_REPEAT;
1754 case S_BEGIN_DATA_1:
1755 return PROMPT_FIRST;
1756 case S_BEGIN_DATA_2:
1757 return PROMPT_LATER;
1758 case S_BEGIN_DATA_3:
1759 case S_BEGIN_DATA_4:
1764 return PROMPT_FIRST;