1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
/* Flag bits stored in a segmenter's substate member.  The S_GENERAL code in
   this file sets, tests and ORs them together. */
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration: the function is defined further down in this file but
   is called before that point. */
56 static int segmenter_detect_command_name__ (const char *input,
57 size_t n, bool eof, int ofs);
/* Decodes a character from the UTF-8 text INPUT_ into *PUC.  The bytes are
   reinterpreted as uint8_t through CHAR_CAST and decoded with u8_mbtoucr() on
   one path and u8_mbtouc() on another.
   NOTE(review): the condition that selects between the two decoders, and the
   end of the parameter list, are on lines not visible in this excerpt. */
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
71 mblen = u8_mbtoucr (puc, input, n);
75 return u8_mbtouc (puc, input, n);
/* Presumably the handler for state S_SHBANG, the state that segmenter_init()
   assigns.  The visible loop scans from offset 2 for a '\n', allowing a
   preceding '\r', and then moves to S_GENERAL at the start of a command.
   The final path instead moves to S_GENERAL at the start of both a line and a
   command and re-dispatches the same input through segmenter_push().
   NOTE(review): the tests that choose between these paths are on lines not
   visible in this excerpt. */
86 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
87 bool eof, enum segment_type *type)
95 for (int ofs = 2; ; ofs++)
102 else if (input[ofs] == '\n')
104 if (input[ofs - 1] == '\r')
110 s->state = S_GENERAL;
111 s->substate = SS_START_OF_COMMAND;
121 s->state = S_GENERAL;
122 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
123 return segmenter_push (s, input, n, eof, type);
/* Segments a one- or two-byte operator at the start of INPUT.  S must be in
   state S_GENERAL (asserted).  The visible tail of the length expression
   yields 2 when INPUT[1] is found in the string SECONDS and 1 otherwise.
   NOTE(review): the first arm of that conditional is on a line not shown. */
127 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
128 const char *input, size_t n, bool eof,
129 enum segment_type *type)
131 assert (s->state == S_GENERAL);
137 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
/* Scans INPUT from OFS looking for the end of a comment.  Each byte is tested
   for '\n' and for '*'; after a '*' the following byte is tested for '/'.
   When the available bytes run out, the visible returns yield an offset if
   EOF is true and -1 otherwise.
   NOTE(review): the values returned on '\n' and on the closing delimiter are
   on lines not shown here. */
141 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
143 for (; ofs < n; ofs++)
145 if (input[ofs] == '\n')
147 else if (input[ofs] == '*')
150 return eof ? ofs + 1 : -1;
151 else if (input[ofs + 1] == '/')
155 return eof ? ofs : -1;
/* Advances OFS over white space and comments in INPUT.  Characters are
   decoded with segmenter_u8_to_uc__().  On one branch the byte after OFS is
   checked for '*' and, if present, skip_comment() continues two bytes later.
   Characters accepted by lex_uc_is_space(), other than '\n', are also
   handled.  The visible fallback returns give OFS when EOF is true and -1
   otherwise.
   NOTE(review): the test that opens the comment branch is on a line not
   shown. */
159 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
166 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
173 return eof ? ofs : -1;
174 else if (input[ofs + 1] != '*')
177 ofs = skip_comment (input, n, eof, ofs + 2);
181 else if (lex_uc_is_space (uc) && uc != '\n')
187 return eof ? ofs : -1;
/* Examines the byte at OFS in INPUT for a line end.  The visible checks look
   for '\n' and for '\r'; for a '\r' the result is whether the next byte is
   '\n'.
   NOTE(review): the first test of the chain and the result for '\n' are on
   lines not shown. */
191 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
195 else if (input[ofs] == '\n')
197 else if (input[ofs] == '\r')
201 return input[ofs + 1] == '\n';
/* Skips spaces and comments starting at OFS with skip_spaces_and_comments(),
   then returns is_end_of_line() for the resulting offset.
   NOTE(review): any check of the intermediate offset is on lines not shown. */
208 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
210 ofs = skip_spaces_and_comments (input, n, eof, ofs);
214 return is_end_of_line (input, n, eof, ofs);
/* Handles a line end at the start of INPUT.  One path covers a lone '\n' in
   INPUT[0]; the other asserts that INPUT begins with '\r' followed by '\n'.
   NOTE(review): the value returned and the value stored in *TYPE are on
   lines not shown. */
218 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
219 enum segment_type *type)
223 if (input[0] == '\n')
233 assert (input[0] == '\r');
234 assert (input[1] == '\n');
/* Advances OFS over characters of INPUT, decoded with
   segmenter_u8_to_uc__(), stopping at a character that lex_uc_is_space()
   rejects or that is '\n'.  The visible fallback return gives OFS when EOF
   is true and -1 otherwise. */
243 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
250 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
254 if (!lex_uc_is_space (uc) || uc == '\n')
260 return eof ? ofs : -1;
/* Advances OFS over bytes of INPUT accepted by c_isdigit().  If all N bytes
   are examined without finding a non-digit, returns OFS when EOF is true and
   -1 otherwise.
   NOTE(review): the return taken at the first non-digit is on a line not
   shown; presumably it is the offset of that byte. */
264 skip_digits (const char *input, size_t n, bool eof, int ofs)
266 for (; ofs < n; ofs++)
267 if (!c_isdigit (input[ofs]))
269 return eof ? ofs : -1;
/* Segments a number at the start of INPUT while S is in S_GENERAL
   (asserted).  The visible steps are: a run of digits, a '.' followed by
   more digits, an 'e' or 'E' exponent marker, a '+' or '-' sign, and the
   exponent digits.  When a digit is required but missing the code jumps to
   the expected_exponent label; SEG_EXPECTED_EXPONENT is presumably the type
   reported there (the label line is not visible).  If the last byte scanned
   is '.', at_end_of_line() is consulted for the text that follows. */
273 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
274 bool eof, enum segment_type *type)
278 assert (s->state == S_GENERAL);
280 ofs = skip_digits (input, n, eof, 0);
290 if (input[ofs] == '.')
292 ofs = skip_digits (input, n, eof, ofs + 1);
303 if (input[ofs] == 'e' || input[ofs] == 'E')
310 goto expected_exponent;
313 if (input[ofs] == '+' || input[ofs] == '-')
320 goto expected_exponent;
324 if (!c_isdigit (input[ofs]))
325 goto expected_exponent;
327 ofs = skip_digits (input, n, eof, ofs);
332 if (input[ofs - 1] == '.')
334 int eol = at_end_of_line (input, n, eof, ofs);
347 *type = SEG_EXPECTED_EXPONENT;
/* Tests the start of S against a fixed set of words, ignoring case by
   converting each byte with c_toupper().  The visible comparisons cover the
   two-letter words BY EQ GE GT LE LT NE OR TO, the three-letter words ALL AND
   NOT, and the four-letter word WITH.
   NOTE(review): the dispatch on the length N that selects among the three
   groups is on lines not shown. */
353 is_reserved_word (const char *s, int n)
357 s0 = c_toupper (s[0]);
361 s1 = c_toupper (s[1]);
362 return ((s0 == 'B' && s1 == 'Y')
363 || (s0 == 'E' && s1 == 'Q')
364 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
365 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
366 || (s0 == 'N' && s1 == 'E')
367 || (s0 == 'O' && s1 == 'R')
368 || (s0 == 'T' && s1 == 'O'));
371 s1 = c_toupper (s[1]);
372 s2 = c_toupper (s[2]);
373 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
374 || (s1 == 'N' && s2 == 'D')))
375 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
378 s1 = c_toupper (s[1]);
379 s2 = c_toupper (s[2]);
380 s3 = c_toupper (s[3]);
381 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
/* Segments the text of a comment command (callers set S_COMMENT_1 before
   calling).  Characters are decoded with segmenter_u8_to_uc__().  As the
   inline comments below describe, three outcomes are visible at a line end:
   a blank line yields SEG_SEPARATE_COMMANDS and returns to S_GENERAL, an
   end-of-command position (ENDCMD >= 0) yields SEG_COMMENT_COMMAND and
   returns to S_GENERAL, and otherwise SEG_COMMENT_COMMAND is reported and the
   state becomes S_COMMENT_2.  The last visible statements again report
   SEG_SEPARATE_COMMANDS in S_GENERAL at the start of a command.
   NOTE(review): the condition guarding that last path is not shown. */
389 segmenter_parse_comment_1__ (struct segmenter *s,
390 const char *input, size_t n, bool eof,
391 enum segment_type *type)
403 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
414 if (ofs > 1 && input[ofs - 1] == '\r')
418 /* Blank line ends comment command. */
419 s->state = S_GENERAL;
420 s->substate = SS_START_OF_COMMAND;
421 *type = SEG_SEPARATE_COMMANDS;
424 else if (endcmd >= 0)
426 /* '.' at end of line ends comment command. */
427 s->state = S_GENERAL;
429 *type = SEG_COMMENT_COMMAND;
434 /* Comment continues onto next line. */
435 *type = SEG_COMMENT_COMMAND;
436 s->state = S_COMMENT_2;
442 if (!lex_uc_is_space (uc))
453 s->state = S_GENERAL;
454 s->substate = SS_START_OF_COMMAND;
455 *type = SEG_SEPARATE_COMMANDS;
/* Consumes a line end with segmenter_parse_newline__() and then inspects the
   start of the following line.  The visible tests look at the first
   character ('+', '-', '.', or a non-space character) and, on one path, call
   segmenter_detect_command_name__().  Afterwards the state becomes either
   S_GENERAL at the start of a line and command, or S_COMMENT_1.
   NOTE(review): how the tests and the syntax mode combine to pick the next
   state is on lines not shown. */
463 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
464 size_t n, bool eof, enum segment_type *type)
466 int ofs = segmenter_parse_newline__ (input, n, eof, type);
480 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
484 if (uc == '+' || uc == '-' || uc == '.')
486 else if (!lex_uc_is_space (uc))
489 case SEG_MODE_INTERACTIVE:
498 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
512 s->state = S_GENERAL;
513 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
516 s->state = S_COMMENT_1;
/* Segments one line of a DOCUMENT command, reporting it as SEG_DOCUMENT.
   Characters are decoded with segmenter_u8_to_uc__().  On the first visible
   path the next state is S_DOCUMENT_3 when END_CMD is set and S_DOCUMENT_2
   otherwise; the second visible path always moves to S_DOCUMENT_3.
   NOTE(review): how END_CMD is computed is on lines not shown. */
521 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
522 bool eof, enum segment_type *type)
534 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
545 if (ofs > 1 && input[ofs - 1] == '\r')
548 *type = SEG_DOCUMENT;
549 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
553 if (!lex_uc_is_space (uc))
562 *type = SEG_DOCUMENT;
563 s->state = S_DOCUMENT_3;
/* Consumes the line end between two DOCUMENT lines with
   segmenter_parse_newline__() and sets the state to S_DOCUMENT_1. */
570 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
571 bool eof, enum segment_type *type)
575 ofs = segmenter_parse_newline__ (input, n, eof, type);
579 s->state = S_DOCUMENT_1;
/* Reports SEG_END_COMMAND and returns S to S_GENERAL at the start of both a
   line and a command.  It takes no input buffer.
   NOTE(review): the return value is on a line not shown. */
584 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
586 *type = SEG_END_COMMAND;
587 s->state = S_GENERAL;
588 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
/* Skips spaces and comments from OFS, then returns whether the character C is
   none of a single quote, a double quote, or '\n'.
   NOTE(review): C is assigned on a line not shown; presumably it is the byte
   at the offset reached after skipping. */
593 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
596 ofs = skip_spaces_and_comments (input, n, eof, ofs);
602 return c != '\'' && c != '"' && c != '\n';
/* Looks ahead in INPUT from OFS using a scratch segmenter SUB placed in
   S_GENERAL, so that S itself is not advanced (S is const-qualified).  Each
   segment is obtained with segmenter_push() on INPUT + OFS.  On one path the
   RETVAL bytes of the segment are copied into ID, guarded by
   RETVAL < ID_SIZE; ID_SIZE must be positive (asserted).
   NOTE(review): which segment types stop the scan, which are skipped, and
   the return value are decided on lines not shown; only the case labels
   themselves are visible below. */
612 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
613 bool eof, int ofs, char id[], size_t id_size)
615 struct segmenter sub;
617 assert (id_size > 0);
620 sub.state = S_GENERAL;
624 enum segment_type type;
627 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
643 if (retval < id_size)
645 memcpy (id, input + ofs, retval);
652 case SEG_QUOTED_STRING:
654 case SEG_UNICODE_STRING:
655 case SEG_UNQUOTED_STRING:
656 case SEG_RESERVED_WORD:
658 case SEG_COMMENT_COMMAND:
659 case SEG_DO_REPEAT_COMMAND:
660 case SEG_INLINE_DATA:
661 case SEG_START_DOCUMENT:
663 case SEG_START_COMMAND:
664 case SEG_SEPARATE_COMMANDS:
665 case SEG_END_COMMAND:
667 case SEG_EXPECTED_QUOTE:
668 case SEG_EXPECTED_EXPONENT:
669 case SEG_UNEXPECTED_DOT:
670 case SEG_UNEXPECTED_CHAR:
678 /* Called when INPUT begins with a character that can start off an ID token. */
/* S must be in S_GENERAL (asserted).  The first character is decoded with
   u8_mbtouc() and later ones with segmenter_u8_to_uc__() until one is
   rejected by lex_uc_is_idn().  If the last byte scanned is '.',
   at_end_of_line() is consulted for the text that follows.  The token is
   reported as SEG_RESERVED_WORD when is_reserved_word() accepts it and as
   SEG_IDENTIFIER otherwise. */
680 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
681 bool eof, enum segment_type *type)
687 assert (s->state == S_GENERAL);
689 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
701 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
704 else if (!lex_uc_is_idn (uc))
710 if (input[ofs - 1] == '.')
712 int eol = at_end_of_line (input, n, eof, ofs);
719 if (is_reserved_word (input, ofs))
720 *type = SEG_RESERVED_WORD;
722 *type = SEG_IDENTIFIER;
/* At the start of a command, certain command names move the segmenter into a
   dedicated state: COMMENT, DOCUMENT, TITLE or SUBTITLE, FILE followed by
   LABEL, DO followed by REPEAT, and BEGIN followed by DATA. */
724 if (s->substate & SS_START_OF_COMMAND)
726 struct substring word = ss_buffer (input, ofs);
728 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
730 s->state = S_COMMENT_1;
731 return segmenter_parse_comment_1__ (s, input, n, eof, type);
733 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
735 s->state = S_DOCUMENT_1;
736 *type = SEG_START_DOCUMENT;
739 else if (lex_id_match (ss_cstr ("TITLE"), word)
740 || lex_id_match (ss_cstr ("SUBTITLE"), word))
742 int result = segmenter_unquoted (input, n, eof, ofs);
747 s->state = S_TITLE_1;
751 else if (lex_id_match (ss_cstr ("FILE"), word))
/* The second word of a two-word command name is fetched by lookahead with
   next_id_in_command(), which does not advance S. */
755 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
757 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
759 s->state = S_FILE_LABEL;
764 else if (lex_id_match (ss_cstr ("DO"), word))
768 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
770 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
772 s->state = S_DO_REPEAT_1;
777 else if (lex_id_match (ss_cstr ("BEGIN"), word))
782 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
785 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
789 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
795 else if (input[ofs2] == '.')
797 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
802 eol = is_end_of_line (input, n, eof, ofs2);
/* A '\n' within the first OFS2 bytes selects S_BEGIN_DATA_1; otherwise
   S_BEGIN_DATA_2 is used. */
807 if (memchr (input, '\n', ofs2))
808 s->state = S_BEGIN_DATA_1;
810 s->state = S_BEGIN_DATA_2;
/* Segments a quoted string whose opening quote character is INPUT[OFS].
   Later bytes are compared against that same quote character (twice in the
   visible code) and against '\n'.  SEG_EXPECTED_QUOTE is the type reported on
   the visible error path.
   NOTE(review): the loop structure, the use of STRING_TYPE and the return
   values are on lines not shown. */
823 segmenter_parse_string__ (enum segment_type string_type,
824 int ofs, struct segmenter *s,
825 const char *input, size_t n, bool eof,
826 enum segment_type *type)
828 int quote = input[ofs];
832 if (input[ofs] == quote)
837 if (input[ofs] == quote)
850 else if (input[ofs] == '\n')
861 *type = SEG_EXPECTED_QUOTE;
/* Decides whether INPUT starts a prefixed string or an identifier.  If
   INPUT[1] is a single or double quote, the text is handed to
   segmenter_parse_string__() with STRING_TYPE and the quote at offset 1;
   otherwise it is handed to segmenter_parse_id__().
   NOTE(review): the first test of the chain is on a line not shown. */
867 segmenter_maybe_parse_string__ (enum segment_type string_type,
869 const char *input, size_t n, bool eof,
870 enum segment_type *type)
877 else if (input[1] == '\'' || input[1] == '"')
878 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
880 return segmenter_parse_id__ (s, input, n, eof, type);
/* Segments input while S is in S_GENERAL and not at the start of a line
   (both asserted).  The first character is decoded with
   segmenter_u8_to_uc__() and the code then dispatches on it.  The visible
   branches handle comments, single-byte punctuation, two-byte operators via
   segmenter_parse_digraph__(), a '.' that ends a command or is unexpected,
   numbers, strings, white space and line ends, and identifiers.  Anything
   else is reported as SEG_UNEXPECTED_CHAR.
   NOTE(review): most case labels and return statements are on lines not
   shown, so the pairing of characters to branches is only partly visible. */
884 segmenter_parse_mid_command__ (struct segmenter *s,
885 const char *input, size_t n, bool eof,
886 enum segment_type *type)
892 assert (s->state == S_GENERAL);
893 assert (!(s->substate & SS_START_OF_LINE));
895 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
902 s->substate |= SS_START_OF_LINE;
912 else if (input[1] == '*')
914 ofs = skip_comment (input, n, eof, 2);
926 case '(': case ')': case ',': case '=': case '-':
927 case '[': case ']': case '&': case '|': case '+':
933 if (s->substate & SS_START_OF_COMMAND)
935 /* '*' at the beginning of a command begins a comment. */
936 s->state = S_COMMENT_1;
937 return segmenter_parse_comment_1__ (s, input, n, eof, type);
/* The string argument lists the bytes that may follow the current one to
   form a two-byte operator. */
940 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
943 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
946 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
949 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
957 else if (c_isdigit (input[1]))
958 return segmenter_parse_number__ (s, input, n, eof, type);
960 int eol = at_end_of_line (input, n, eof, 1);
966 *type = SEG_END_COMMAND;
967 s->substate = SS_START_OF_COMMAND;
970 *type = SEG_UNEXPECTED_DOT;
973 case '0': case '1': case '2': case '3': case '4':
974 case '5': case '6': case '7': case '8': case '9':
975 return segmenter_parse_number__ (s, input, n, eof, type);
978 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
979 s, input, n, eof, type);
982 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
983 s, input, n, eof, type);
986 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
987 s, input, n, eof, type);
990 if (lex_uc_is_space (uc))
992 ofs = skip_spaces (input, n, eof, mblen);
996 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1000 s->substate |= SS_START_OF_LINE;
1001 *type = SEG_NEWLINE;
1010 else if (lex_uc_is_id1 (uc))
1011 return segmenter_parse_id__ (s, input, n, eof, type);
1014 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback passed to qsort() for the command name table.  A_ and
   B_ each point to an element of type const char *; the two strings are
   ordered without regard to case by c_strcasecmp(). */
1022 compare_commands (const void *a_, const void *b_)
1024 const char *const *ap = a_;
1025 const char *const *bp = b_;
1026 const char *a = *ap;
1027 const char *b = *bp;
1029 return c_strcasecmp (a, b);
/* Returns a pointer into a sorted table of command names, positioned at the
   first name whose initial letter matches FIRST without regard to case.

   The table is generated from language/command.def: DEF_CMD and UNIMPL_CMD
   both expand to just the command NAME.  N_COMMANDS is one less than the
   element count, so the table has one extra trailing element (presumably a
   sentinel defined on a line not shown).  The names are sorted with qsort()
   and compare_commands(), and CINDEX maps each possible first byte to the
   first matching entry.  Bytes with no matching command map to the trailing
   element.
   NOTE(review): the guard that makes this setup run only once is on lines
   not shown. */
1032 static const char **
1033 segmenter_get_command_name_candidates (unsigned char first)
1035 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1036 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1037 static const char *commands[] =
1039 #include "language/command.def"
1042 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1048 static const char **cindex[UCHAR_MAX + 1];
1056 qsort (commands, n_commands, sizeof *commands, compare_commands);
1057 for (i = 0; i < n_commands; i++)
1059 unsigned char c = c_toupper (commands[i][0]);
1060 if (cindex[c] == NULL)
1061 cindex[c] = &commands[i];
1063 for (i = 0; i <= UCHAR_MAX; i++)
1064 if (cindex[i] == NULL)
1065 cindex[i] = &commands[n_commands];
1068 return cindex[c_toupper (first)];
/* Checks whether INPUT begins with the name of a known command.  Characters
   are decoded with segmenter_u8_to_uc__(); the scan is sensitive to
   characters that are not white space, identifier characters or '-'.  A
   trailing '.' is examined afterwards.  The scanned text is then compared
   with command_match() against each candidate from
   segmenter_get_command_name_candidates() that shares the first letter of
   INPUT without regard to case; a match counts only when MISSING_WORDS is
   not positive.
   NOTE(review): the return values are on lines not shown. */
1072 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1075 const char **commands;
1092 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1097 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1105 if (input[ofs - 1] == '.')
1108 for (commands = segmenter_get_command_name_candidates (input[0]);
1109 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1115 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1116 &exact, &missing_words)
1117 && missing_words <= 0)
/* Examines the character C at OFS.  When C is one of x, X, u or U, the result
   is whether the following byte is a single or double quote.  Otherwise the
   result is whether C itself is a single quote, a double quote, or '\n'.
   The two eof-dependent returns give 0 when EOF is true and -1 otherwise.
   NOTE(review): the conditions guarding those two returns are on lines not
   shown; presumably they fire when the needed byte lies beyond N. */
1125 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1128 return eof ? 0 : -1;
1131 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1134 return eof ? 0 : -1;
1136 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1139 return c == '\'' || c == '"' || c == '\n';
/* Segments input while S is in S_GENERAL at the start of a line (both
   asserted).  The first character is decoded with segmenter_u8_to_uc__().
   The visible branches cover a leading punctuation character that may either
   separate pieces of a string or start a command (SEG_START_COMMAND), a
   white-space-only line (SEG_SEPARATE_COMMANDS), and a mode-dependent check
   that uses segmenter_detect_command_name__() in SEG_MODE_AUTO and reports
   SEG_START_COMMAND in SEG_MODE_BATCH.  The final path sets the substate to
   SS_START_OF_COMMAND and defers to segmenter_parse_mid_command__().
   NOTE(review): the case labels and several conditions are on lines not
   shown. */
1143 segmenter_parse_start_of_line__ (struct segmenter *s,
1144 const char *input, size_t n, bool eof,
1145 enum segment_type *type)
1151 assert (s->state == S_GENERAL);
1152 assert (s->substate & SS_START_OF_LINE);
1154 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1161 ofs = skip_spaces_and_comments (input, n, eof, 1);
1166 int is_string = is_start_of_string__ (input, n, eof, ofs);
1171 /* This is punctuation that may separate pieces of a string. */
1181 *type = SEG_START_COMMAND;
1182 s->substate = SS_START_OF_COMMAND;
1186 if (lex_uc_is_space (uc))
1188 int eol = at_end_of_line (input, n, eof, 0);
1193 s->substate = SS_START_OF_COMMAND;
1194 *type = SEG_SEPARATE_COMMANDS;
1200 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1202 else if (s->mode == SEG_MODE_AUTO)
1204 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1211 assert (s->mode == SEG_MODE_BATCH);
1213 s->substate = SS_START_OF_COMMAND;
1214 *type = SEG_START_COMMAND;
1218 s->substate = SS_START_OF_COMMAND;
1219 return segmenter_parse_mid_command__ (s, input, n, eof, type);
/* Segments the text after FILE in a FILE LABEL command.  A scratch segmenter
   SUB in S_GENERAL segments the next piece of INPUT.  When that piece is a
   SEG_IDENTIFIER it must match LABEL (asserted), and segmenter_unquoted() is
   then consulted before the state becomes S_TITLE_1.  On the last visible
   path the substate of SUB is copied back into S.
   NOTE(review): the remaining fields of SUB are set on lines not shown. */
1223 segmenter_parse_file_label__ (struct segmenter *s,
1224 const char *input, size_t n, bool eof,
1225 enum segment_type *type)
1227 struct segmenter sub;
1231 sub.state = S_GENERAL;
1232 ofs = segmenter_push (&sub, input, n, eof, type);
1236 else if (*type == SEG_IDENTIFIER)
1240 assert (lex_id_match (ss_cstr ("LABEL"),
1241 ss_buffer ((char *) input, ofs)));
1242 result = segmenter_unquoted (input, n, eof, ofs);
1248 s->state = S_TITLE_1;
1256 s->substate = sub.substate;
/* Segments INPUT as ordinary syntax without changing the state of S.  A
   scratch segmenter SUB is placed in S_GENERAL with the substate of S,
   segmenter_push() runs on SUB, and the resulting substate is copied back
   into S.  The state of S is left for the caller to update.
   NOTE(review): other fields of SUB are set on lines not shown, as is the
   return statement; presumably the result of segmenter_push() is returned. */
1262 segmenter_subparse (struct segmenter *s,
1263 const char *input, size_t n, bool eof,
1264 enum segment_type *type)
1266 struct segmenter sub;
1270 sub.state = S_GENERAL;
1271 sub.substate = s->substate;
1272 ofs = segmenter_push (&sub, input, n, eof, type);
1273 s->substate = sub.substate;
1277 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1278 defines the stand-in variables (the head) before the lines of syntax to be
1279 repeated (the body). */
/* The segment itself comes from segmenter_subparse(); this function only
   chooses the next state from the segment type it reported. */
1281 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1282 const char *input, size_t n, bool eof,
1283 enum segment_type *type)
1285 int ofs = segmenter_subparse (s, input, n, eof, type);
1289 if (*type == SEG_SEPARATE_COMMANDS)
1291 /* We reached a blank line that separates the head from the body. */
1292 s->state = S_DO_REPEAT_2;
1294 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1296 /* We reached the body. */
1297 s->state = S_DO_REPEAT_3;
1304 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1305 separates the head from the body. */
/* The segment comes from segmenter_subparse(); a SEG_NEWLINE moves the state
   on to S_DO_REPEAT_3. */
1307 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1308 const char *input, size_t n, bool eof,
1309 enum segment_type *type)
1311 int ofs = segmenter_subparse (s, input, n, eof, type);
1315 if (*type == SEG_NEWLINE)
1317 /* We reached the body. */
1318 s->state = S_DO_REPEAT_3;
/* Inspects a line of INPUT for a DO REPEAT or END REPEAT command.  A '+' or
   '-' at the current offset is checked first.  The first identifier, fetched
   with next_id_in_command(), is compared with DO and END; the second is
   compared with REPEAT.  On a match the substate of S is adjusted by
   DIRECTION.
   NOTE(review): the values assigned to DIRECTION, the initial offset and the
   return values are on lines not shown. */
1326 check_repeat_command (struct segmenter *s,
1327 const char *input, size_t n, bool eof)
1334 if (input[ofs] == '+' || input[ofs] == '-')
1337 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1340 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1342 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1347 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1351 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1352 s->substate += direction;
/* Finds the extent of the first line in INPUT by searching the N bytes for
   '\n' with memchr().  One return yields N when EOF is true and -1 otherwise
   (presumably taken when no '\n' is found; the guard is not shown).  When the
   line is empty, meaning the '\n' is at offset 0 or directly follows a
   leading '\r', *TYPE is set to SEG_NEWLINE.  The final return is the offset
   of the '\n', reduced by one when the byte before it is '\r'. */
1357 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1358 enum segment_type *type)
1360 const char *newline = memchr (input, '\n', n);
1362 return eof ? n : -1;
1364 ptrdiff_t ofs = newline - input;
1365 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1367 *type = SEG_NEWLINE;
1371 return ofs - (input[ofs - 1] == '\r');
1374 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1375 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1377 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1378 the lines we're segmenting. s->substate counts the nesting level, starting
1381 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1382 const char *input, size_t n, bool eof,
1383 enum segment_type *type)
1387 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1388 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1390 else if (!check_repeat_command (s, input, n, eof) && !eof)
1392 else if (s->substate == 0)
1394 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1396 s->state = S_GENERAL;
1397 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1398 return segmenter_push (s, input, n, eof, type);
1402 *type = SEG_DO_REPEAT_COMMAND;
1408 segmenter_parse_begin_data_1__ (struct segmenter *s,
1409 const char *input, size_t n, bool eof,
1410 enum segment_type *type)
1412 int ofs = segmenter_subparse (s, input, n, eof, type);
1416 if (*type == SEG_NEWLINE)
1417 s->state = S_BEGIN_DATA_2;
1423 segmenter_parse_begin_data_2__ (struct segmenter *s,
1424 const char *input, size_t n, bool eof,
1425 enum segment_type *type)
1427 int ofs = segmenter_subparse (s, input, n, eof, type);
1431 if (*type == SEG_NEWLINE)
1432 s->state = S_BEGIN_DATA_3;
1438 is_end_data (const char *input, size_t n)
1440 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1446 if (n < 4 || c_strncasecmp (input, "END", 3))
1450 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1451 if (!lex_uc_is_space (uc))
1455 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1462 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1469 else if (!lex_uc_is_space (uc))
1478 segmenter_parse_begin_data_3__ (struct segmenter *s,
1479 const char *input, size_t n, bool eof,
1480 enum segment_type *type)
1484 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1487 else if (is_end_data (input, ofs))
1489 s->state = S_GENERAL;
1490 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1491 return segmenter_push (s, input, n, eof, type);
1495 *type = SEG_INLINE_DATA;
1496 s->state = S_BEGIN_DATA_4;
1497 return input[ofs - 1] == '\n' ? 0 : ofs;
1502 segmenter_parse_begin_data_4__ (struct segmenter *s,
1503 const char *input, size_t n, bool eof,
1504 enum segment_type *type)
1508 ofs = segmenter_parse_newline__ (input, n, eof, type);
1512 s->state = S_BEGIN_DATA_3;
1517 segmenter_parse_title_1__ (struct segmenter *s,
1518 const char *input, size_t n, bool eof,
1519 enum segment_type *type)
1523 ofs = skip_spaces (input, n, eof, 0);
1526 s->state = S_TITLE_2;
1532 segmenter_parse_title_2__ (struct segmenter *s,
1533 const char *input, size_t n, bool eof,
1534 enum segment_type *type)
1546 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1560 if (!lex_uc_is_space (uc))
1571 s->state = S_GENERAL;
1573 *type = SEG_UNQUOTED_STRING;
1574 return endcmd >= 0 ? endcmd : ofs;
1580 /* Returns the name of segment TYPE as a string. The caller must not modify
1581 or free the returned string.
1583 This is useful only for debugging and testing. */
/* SEG_TYPE expands each segment type into a case label that returns the
   stringized type name; a fixed fallback string is returned otherwise. */
1585 segment_type_to_string (enum segment_type type)
1589 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1593 return "unknown segment type";
1597 /* Initializes S as a segmenter with the given syntax MODE.
1599 A segmenter does not contain any external references, so nothing needs to be
1600 done to destroy one. For the same reason, segmenters may be copied with
1601 plain struct assignment (or memcpy). */
/* A new segmenter starts in state S_SHBANG. */
1603 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1605 s->state = S_SHBANG;
1610 /* Returns the mode passed to segmenter_init() for S. */
/* S is const-qualified, so it is not modified through this parameter. */
1612 segmenter_get_mode (const struct segmenter *s)
1617 /* Attempts to label a prefix of S's remaining input with a segment type. The
1618 caller supplies the first N bytes of the remaining input as INPUT, which
1619 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1620 are the entire (remainder) of the input; if EOF is false, then further input
1621 is potentially available.
1623 The input may contain '\n' or '\r\n' line ends in any combination.
1625 If successful, returns the number of bytes in the segment at the beginning
1626 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1627 into *TYPE. The next call to segmenter_push() should not include those
1628 bytes as part of INPUT, because they have (figuratively) been consumed by
1631 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1632 be determined. In this case segmenter_push() returns -1. If more input is
1633 available, the caller should obtain some more, then call again with a larger
1634 N. If this is not enough, the process might need to repeat again and again.
1635 If input is exhausted, then the caller may call again setting EOF to true.
1636 segmenter_push() will never return -1 when EOF is true.
1638 The caller must not, in a sequence of calls, supply contradictory input.
1639 That is, bytes provided as part of INPUT in one call, but not consumed, must
1640 not be provided with *different* values on subsequent calls. This is
1641 because segmenter_push() must often make decisions based on looking ahead
1642 beyond the bytes that it consumes. */
/* Implementation note: this function dispatches to one segmenter_parse_*__
   handler per state and returns that handler's result. */
1644 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1645 enum segment_type *type)
1661 return segmenter_parse_shbang__ (s, input, n, eof, type);
/* Here the SS_START_OF_LINE bit of the substate selects between the
   start-of-line handler and the mid-command handler. */
1664 return (s->substate & SS_START_OF_LINE
1665 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1666 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1669 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1671 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1674 return segmenter_parse_document_1__ (s, input, n, eof, type);
1676 return segmenter_parse_document_2__ (s, input, n, eof, type);
/* This handler takes no input buffer, unlike the others. */
1678 return segmenter_parse_document_3__ (s, type);
1681 return segmenter_parse_file_label__ (s, input, n, eof, type);
1684 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1686 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1688 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1690 case S_BEGIN_DATA_1:
1691 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1692 case S_BEGIN_DATA_2:
1693 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1694 case S_BEGIN_DATA_3:
1695 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1696 case S_BEGIN_DATA_4:
1697 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1700 return segmenter_parse_title_1__ (s, input, n, eof, type);
1702 return segmenter_parse_title_2__ (s, input, n, eof, type);
1708 /* Returns the style of command prompt to display to an interactive user for
1709 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1710 and at the beginning of a line (that is, if segmenter_push() consumed as
1711 much as possible of the input up to a new-line). */
/* The prompt is chosen per state.  Two of the visible returns also depend on
   the SS_START_OF_COMMAND bit of the substate, giving PROMPT_FIRST when it is
   set and PROMPT_LATER otherwise.
   NOTE(review): most case labels are on lines not shown, so the state that
   each return belongs to is only visible for the S_BEGIN_DATA_* cases. */
1713 segmenter_get_prompt (const struct segmenter *s)
1718 return PROMPT_FIRST;
1721 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1725 return PROMPT_COMMENT;
1729 return PROMPT_DOCUMENT;
1731 return PROMPT_FIRST;
1734 return PROMPT_LATER;
1738 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1740 return PROMPT_DO_REPEAT;
1742 case S_BEGIN_DATA_1:
1743 return PROMPT_FIRST;
1744 case S_BEGIN_DATA_2:
1745 return PROMPT_LATER;
1746 case S_BEGIN_DATA_3:
1747 case S_BEGIN_DATA_4:
1752 return PROMPT_FIRST;