1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
31 #include "gl/verify.h"
/* Bit flags kept in a segmenter's `substate' member.  Code in this file sets
   them with `|' and tests them with `&' (for example
   `s->substate & SS_START_OF_COMMAND'). */
58 #define SS_START_OF_LINE (1u << 0)
59 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration: this helper is called by the comment and start-of-line
   handlers before its definition further down in the file. */
61 static int segmenter_detect_command_name__ (const char *input,
62 size_t n, bool eof, int ofs);
/* Decodes one UTF-8 character from INPUT_ and stores its code point in *PUC,
   using u8_mbtoucr() or u8_mbtouc().  Callers in this file keep the return
   value in a variable named `mblen'.

   NOTE(review): only fragments of the body are visible here.  Which decoder
   is chosen in which situation (presumably depending on EOF) and what is
   returned for an incomplete sequence must be confirmed against the full
   source. */
65 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
68 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
76 mblen = u8_mbtoucr (puc, input, n);
80 return u8_mbtouc (puc, input, n);
/* Handler for the S_SHBANG state, which segmenter_init() selects when
   IS_SNIPPET is false.  The visible code scans INPUT from offset 2 for a
   '\n', looks at the byte before it for a '\r', and then puts the segmenter
   into S_GENERAL state.

   NOTE(review): the test that decides whether INPUT really begins with a
   shebang line, and the segment type reported for such a line, are not
   visible here. */
91 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
92 bool eof, enum segment_type *type)
100 for (int ofs = 2; ; ofs++)
107 else if (input[ofs] == '\n')
109 if (input[ofs - 1] == '\r')
115 s->state = S_GENERAL;
116 s->substate = SS_START_OF_COMMAND;
/* Switch to general parsing at the start of a line and of a command, and let
   segmenter_push() segment the very same input again. */
126 s->state = S_GENERAL;
127 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
128 return segmenter_push (s, input, n, eof, type);
/* Handles a punctuation token that is one or two bytes long.  SECONDS is a
   string of the bytes that, when one of them appears at input[1], make the
   token two bytes long; otherwise the token is the single byte input[0].
   Must be called in S_GENERAL state (asserted).

   NOTE(review): the first arm of the conditional expression below (presumably
   the "not enough input yet" case) and the segment type stored in *TYPE are
   not visible here. */
132 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
133 const char *input, size_t n, bool eof,
134 enum segment_type *type)
136 assert (s->state == S_GENERAL);
142 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
/* Scans the N bytes of INPUT, starting at offset OFS, for the end of a
   slash-star comment.  Both callers in this file pass the offset just past the
   two-byte comment opener.  The scan looks for a '\n' and for a '*' that is
   followed by '/'.

   When the input runs out before a decision can be made, the visible returns
   yield -1 unless EOF is true, in which case an offset is returned instead.

   NOTE(review): the values returned when '\n' or the comment terminator is
   found are not visible here. */
146 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
148 for (; ofs < n; ofs++)
150 if (input[ofs] == '\n')
152 else if (input[ofs] == '*')
155 return eof ? ofs + 1 : -1;
156 else if (input[ofs + 1] == '/')
160 return eof ? ofs : -1;
/* Advances OFS within the N bytes of INPUT past white space other than '\n'
   and past comments, which are delegated to skip_comment() starting two bytes
   after the point where a comment opener was recognized.  Characters are
   decoded with segmenter_u8_to_uc__().

   Returns -1 when the input runs out and EOF is false; with EOF true the
   current offset is returned instead.

   NOTE(review): the return taken when a non-space, non-comment character is
   reached is not visible here (presumably the offset of that character). */
164 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
171 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
178 return eof ? ofs : -1;
179 else if (input[ofs + 1] != '*')
182 ofs = skip_comment (input, n, eof, ofs + 2);
186 else if (lex_uc_is_space (uc) && uc != '\n')
192 return eof ? ofs : -1;
/* Examines the byte at offset OFS in INPUT to decide whether a line ends
   there.  A '\n' and a '\r' are treated separately; for '\r' the final visible
   return reports whether the following byte is '\n'.

   NOTE(review): the first branch (presumably the end-of-input test that uses
   N and EOF) and the values returned by the other branches are not visible
   here. */
196 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
200 else if (input[ofs] == '\n')
202 else if (input[ofs] == '\r')
206 return input[ofs + 1] == '\n';
/* Skips spaces and comments starting at offset OFS, then returns what
   is_end_of_line() reports for the resulting offset.

   NOTE(review): the handling of a negative result from
   skip_spaces_and_comments() is not visible here. */
213 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
215 ofs = skip_spaces_and_comments (input, n, eof, ofs);
219 return is_end_of_line (input, n, eof, ofs);
/* Walks the N bytes of INPUT_ one UTF-8 character at a time, testing each
   decoded character with lex_uc_is_space().

   NOTE(review): the return statements are not visible; presumably the result
   is false as soon as a non-space character is found and true otherwise.
   Note also that the loop compares an `int' offset against the `size_t'
   length N. */
223 is_all_spaces (const char *input_, size_t n)
225 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
228 for (int ofs = 0; ofs < n; ofs += mblen)
231 mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
232 if (!lex_uc_is_space (uc))
/* Handles a line ending at the very start of INPUT.  Two forms are
   recognized: a lone '\n', or '\r' immediately followed by '\n' (the latter is
   enforced with assertions).

   NOTE(review): the value returned and the segment type stored in *TYPE are
   not visible here. */
239 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
240 enum segment_type *type)
244 if (input[0] == '\n')
254 assert (input[0] == '\r');
255 assert (input[1] == '\n');
/* Advances OFS within the N bytes of INPUT past white space, stopping at the
   first character that is not a space or that is '\n'.  Characters are decoded
   with segmenter_u8_to_uc__().

   Returns -1 when the input runs out and EOF is false; with EOF true the
   current offset is returned instead.

   NOTE(review): the return taken at the stopping character is not visible
   here (presumably its offset). */
264 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
271 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
275 if (!lex_uc_is_space (uc) || uc == '\n')
281 return eof ? ofs : -1;
/* Advances OFS within the N bytes of INPUT while the bytes are ASCII digits
   according to c_isdigit().

   If every remaining byte is a digit, returns OFS (then equal to N) when EOF
   is true, or -1 when more input might still extend the run of digits.

   NOTE(review): the statement executed on the first non-digit is not visible
   here (presumably `return ofs;'). */
285 skip_digits (const char *input, size_t n, bool eof, int ofs)
287 for (; ofs < n; ofs++)
288 if (!c_isdigit (input[ofs]))
290 return eof ? ofs : -1;
/* Parses a number at the start of INPUT.  Must be called in S_GENERAL state
   (asserted).  The visible steps are:

     - a run of digits;
     - an optional '.' followed by another run of digits;
     - an optional exponent introduced by 'e' or 'E', with an optional '+' or
       '-' sign, that must contain at least one digit; otherwise control
       jumps to the `expected_exponent' label, where *TYPE is set to
       SEG_EXPECTED_EXPONENT;
     - if the byte just before the final offset is '.', at_end_of_line() is
       consulted.

   NOTE(review): the checks for running out of input, the handling of the
   at_end_of_line() result, and the success return are not visible here. */
294 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
295 bool eof, enum segment_type *type)
299 assert (s->state == S_GENERAL);
301 ofs = skip_digits (input, n, eof, 0);
311 if (input[ofs] == '.')
320 ofs = skip_digits (input, n, eof, ofs + 1);
327 if (input[ofs] == 'e' || input[ofs] == 'E')
334 goto expected_exponent;
337 if (input[ofs] == '+' || input[ofs] == '-')
344 goto expected_exponent;
348 if (!c_isdigit (input[ofs]))
349 goto expected_exponent;
351 ofs = skip_digits (input, n, eof, ofs);
356 if (input[ofs - 1] == '.')
358 int eol = at_end_of_line (input, n, eof, ofs);
371 *type = SEG_EXPECTED_EXPONENT;
/* Tests the first bytes of S against a fixed list of words, ignoring case
   (each byte is folded with c_toupper()).  The visible comparisons cover:

     two letters:   BY EQ GE GT LE LT NE OR TO
     three letters: ALL AND NOT
     four letters:  WITH

   NOTE(review): the dispatch on the length N that selects among the three
   groups, and the result for other lengths, are not visible here. */
377 is_reserved_word (const char *s, int n)
381 s0 = c_toupper (s[0]);
385 s1 = c_toupper (s[1]);
386 return ((s0 == 'B' && s1 == 'Y')
387 || (s0 == 'E' && s1 == 'Q')
388 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
389 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
390 || (s0 == 'N' && s1 == 'E')
391 || (s0 == 'O' && s1 == 'R')
392 || (s0 == 'T' && s1 == 'O'));
395 s1 = c_toupper (s[1]);
396 s2 = c_toupper (s[2]);
397 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
398 || (s1 == 'N' && s2 == 'D')))
399 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
402 s1 = c_toupper (s[1]);
403 s2 = c_toupper (s[2]);
404 s3 = c_toupper (s[3]);
405 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
/* Handler for the S_COMMENT_1 state: scans one line of a comment command,
   decoding a character at a time.  The visible outcomes are:

     - blank line: *TYPE is SEG_SEPARATE_COMMANDS and the segmenter returns to
       S_GENERAL at the start of a command;
     - '.' at the end of the line (ENDCMD >= 0): *TYPE is SEG_COMMENT_COMMAND
       and the segmenter returns to S_GENERAL;
     - otherwise: *TYPE is SEG_COMMENT_COMMAND and the state becomes
       S_COMMENT_2, so the comment continues on the next line.

   The last three visible statements are a further path that reports
   SEG_SEPARATE_COMMANDS and returns to S_GENERAL.

   NOTE(review): the conditions that select among these outcomes and the
   returned lengths are only partly visible here. */
413 segmenter_parse_comment_1__ (struct segmenter *s,
414 const char *input, size_t n, bool eof,
415 enum segment_type *type)
427 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
438 if (ofs > 1 && input[ofs - 1] == '\r')
442 /* Blank line ends comment command. */
443 s->state = S_GENERAL;
444 s->substate = SS_START_OF_COMMAND;
445 *type = SEG_SEPARATE_COMMANDS;
448 else if (endcmd >= 0)
450 /* '.' at end of line ends comment command. */
451 s->state = S_GENERAL;
453 *type = SEG_COMMENT_COMMAND;
458 /* Comment continues onto next line. */
459 *type = SEG_COMMENT_COMMAND;
460 s->state = S_COMMENT_2;
466 if (!lex_uc_is_space (uc))
477 s->state = S_GENERAL;
478 s->substate = SS_START_OF_COMMAND;
479 *type = SEG_SEPARATE_COMMANDS;
/* Handler for the S_COMMENT_2 state, entered after a comment line that did
   not end the comment command.  It first parses the newline at the start of
   INPUT, then inspects the beginning of the following line: the first
   character is compared with '+', '-' and '.', tested for being a space, and
   segmenter_detect_command_name__() may be consulted.  The segmenter then
   either returns to S_GENERAL at the start of a line and of a command, or
   goes back to S_COMMENT_1.

   NOTE(review): how the segmenter mode and these tests combine into the
   decision is not visible here. */
487 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
488 size_t n, bool eof, enum segment_type *type)
490 int ofs = segmenter_parse_newline__ (input, n, eof, type);
504 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
508 if (uc == '+' || uc == '-' || uc == '.')
510 else if (!lex_uc_is_space (uc))
513 case SEG_MODE_INTERACTIVE:
522 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
536 s->state = S_GENERAL;
537 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
540 s->state = S_COMMENT_1;
/* Handler for the S_DOCUMENT_1 state: scans one line of document text,
   decoding a character at a time, and reports it as SEG_DOCUMENT.  At the end
   of a line the next state is S_DOCUMENT_3 if END_CMD is set and S_DOCUMENT_2
   otherwise.  A second visible path also reports SEG_DOCUMENT and goes
   directly to S_DOCUMENT_3.

   NOTE(review): how END_CMD is computed and which condition selects the
   second path (presumably end of input) are not visible here. */
545 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
546 bool eof, enum segment_type *type)
558 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
569 if (ofs > 1 && input[ofs - 1] == '\r')
572 *type = SEG_DOCUMENT;
573 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
577 if (!lex_uc_is_space (uc))
586 *type = SEG_DOCUMENT;
587 s->state = S_DOCUMENT_3;
/* Handler for the S_DOCUMENT_2 state: parses the newline at the start of
   INPUT with segmenter_parse_newline__() and sets the state back to
   S_DOCUMENT_1.

   NOTE(review): the handling of a negative result from the newline parser and
   the return statement are not visible here. */
594 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
595 bool eof, enum segment_type *type)
599 ofs = segmenter_parse_newline__ (input, n, eof, type);
603 s->state = S_DOCUMENT_1;
/* Handler for the S_DOCUMENT_3 state.  It takes no input: it reports
   SEG_END_COMMAND and returns the segmenter to S_GENERAL at the start of a
   command and of a line.

   NOTE(review): the return value is not visible here (presumably 0, since no
   input is examined). */
608 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
610 *type = SEG_END_COMMAND;
611 s->state = S_GENERAL;
612 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
/* Skips spaces and comments in INPUT from offset OFS and then examines the
   next character C: the final visible return is true when C is not a single
   quote, a double quote or a newline.

   NOTE(review): how C is obtained and what is returned when the skip fails
   or the input ends are not visible here. */
617 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
620 ofs = skip_spaces_and_comments (input, n, eof, ofs);
626 return c != '\'' && c != '"' && c != '\n';
/* Looks ahead in INPUT, starting at offset OFS, using a scratch segmenter SUB
   set to S_GENERAL state, so that S itself is not modified (it is
   const-qualified).  Each segment is obtained with segmenter_push() on
   `input + ofs'.  When a segment is copied out, its bytes go into ID, but only
   if its length is smaller than ID_SIZE (asserted to be positive).

   Callers in this file treat a negative return value as failure and pass the
   result on as the next offset.

   NOTE(review): the grouping of the segment-type cases below into "skip",
   "copy into ID" and "stop" and the exact return values are not visible here;
   confirm against the full source. */
636 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
637 bool eof, int ofs, char id[], size_t id_size)
639 struct segmenter sub;
641 assert (id_size > 0);
644 sub.state = S_GENERAL;
648 enum segment_type type;
651 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
667 if (retval < id_size)
669 memcpy (id, input + ofs, retval);
676 case SEG_QUOTED_STRING:
678 case SEG_UNICODE_STRING:
679 case SEG_UNQUOTED_STRING:
680 case SEG_RESERVED_WORD:
682 case SEG_COMMENT_COMMAND:
683 case SEG_DO_REPEAT_COMMAND:
684 case SEG_INLINE_DATA:
687 case SEG_START_DOCUMENT:
689 case SEG_START_COMMAND:
690 case SEG_SEPARATE_COMMANDS:
691 case SEG_END_COMMAND:
693 case SEG_EXPECTED_QUOTE:
694 case SEG_EXPECTED_EXPONENT:
695 case SEG_UNEXPECTED_CHAR:
703 /* Called when INPUT begins with a character that can start off an ID token. */
/* Must be called in S_GENERAL state (asserted).  After the first character,
   further characters are accepted while lex_uc_is_idn() holds.  If the
   identifier ends in '.', at_end_of_line() is consulted.  The segment is then
   classified as SEG_RESERVED_WORD if is_reserved_word() matches, or as
   SEG_MACRO_ID if it begins with '!'; the remaining alternative is not
   visible here.

   At the start of a command (SS_START_OF_COMMAND set) the identifier is also
   compared with command names that need special segmentation:

     COMMENT (matched with lex_id_match_n(), 4)  -> S_COMMENT_1
     DOCUMENT                                    -> S_DOCUMENT_1
     DEFINE (matched with lex_id_match_n(), 6)   -> S_DEFINE_1
     FILE followed by LABEL                      -> S_FILE_LABEL_1
     DO followed by REPEAT                       -> S_DO_REPEAT_1
     BEGIN followed by DATA                      -> S_BEGIN_DATA_1 or _2

   For the two-word commands the second word is obtained by looking ahead with
   next_id_in_command().

   NOTE(review): the handling of insufficient input (negative look-ahead
   results) and the return values are only partly visible here. */
705 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
706 bool eof, enum segment_type *type)
712 assert (s->state == S_GENERAL);
714 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
726 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
729 else if (!lex_uc_is_idn (uc))
735 if (input[ofs - 1] == '.')
737 int eol = at_end_of_line (input, n, eof, ofs);
744 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
745 : input[0] == '!' ? SEG_MACRO_ID
748 if (s->substate & SS_START_OF_COMMAND)
750 struct substring word = ss_buffer (input, ofs);
752 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
754 s->state = S_COMMENT_1;
755 return segmenter_parse_comment_1__ (s, input, n, eof, type);
757 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
759 s->state = S_DOCUMENT_1;
760 *type = SEG_START_DOCUMENT;
763 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
765 s->state = S_DEFINE_1;
768 else if (lex_id_match (ss_cstr ("FILE"), word))
772 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
774 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
776 s->state = S_FILE_LABEL_1;
781 else if (lex_id_match (ss_cstr ("DO"), word))
785 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
787 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
789 s->state = S_DO_REPEAT_1;
794 else if (lex_id_match (ss_cstr ("BEGIN"), word))
799 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
802 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
806 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
812 else if (input[ofs2] == '.')
814 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
819 eol = is_end_of_line (input, n, eof, ofs2);
/* If the bytes from BEGIN up to OFS2 already contain a newline, state
   S_BEGIN_DATA_1 is chosen; otherwise S_BEGIN_DATA_2. */
824 if (memchr (input, '\n', ofs2))
825 s->state = S_BEGIN_DATA_1;
827 s->state = S_BEGIN_DATA_2;
/* Parses a quoted string whose opening quote character is at input[OFS]; the
   same character closes the string.  The visible code tests for the quote
   character twice in succession (presumably so that a doubled quote stands
   for a literal quote inside the string -- confirm) and for '\n'.  One visible
   path reports SEG_EXPECTED_QUOTE.

   NOTE(review): where STRING_TYPE is stored into *TYPE and the return values
   are not visible here. */
840 segmenter_parse_string__ (enum segment_type string_type,
841 int ofs, struct segmenter *s,
842 const char *input, size_t n, bool eof,
843 enum segment_type *type)
845 int quote = input[ofs];
849 if (input[ofs] == quote)
854 if (input[ofs] == quote)
867 else if (input[ofs] == '\n')
878 *type = SEG_EXPECTED_QUOTE;
/* Called when INPUT starts with a letter that may prefix a string.  If the
   second byte is a single or double quote, the input is parsed as a string of
   type STRING_TYPE whose quote is at offset 1; otherwise it is parsed as an
   identifier.

   NOTE(review): the first branch of the `if' chain (presumably the case where
   fewer than two bytes are available) is not visible here. */
884 segmenter_maybe_parse_string__ (enum segment_type string_type,
886 const char *input, size_t n, bool eof,
887 enum segment_type *type)
894 else if (input[1] == '\'' || input[1] == '"')
895 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
897 return segmenter_parse_id__ (s, input, n, eof, type);
/* Segments the token at the start of INPUT in S_GENERAL state when the
   segmenter is not at the start of a line (both conditions are asserted).
   The first character is decoded and dispatched on.  The visible cases are:

     - a newline, which sets SS_START_OF_LINE;
     - a slash-star comment, skipped with skip_comment();
     - single-byte punctuation such as ( ) , = - [ ] & | +;
     - '*', which begins a comment command at the start of a command and is
       otherwise handled as a possible two-byte token;
     - other possible two-byte tokens, via segmenter_parse_digraph__();
     - a digit after the first byte, or a leading digit, parsed as a number;
       in the former case, when no digit follows, at_end_of_line() is
       consulted and a path reporting SEG_END_COMMAND is visible;
     - string prefixes and quote characters, parsed as strings;
     - white space (collapsed with skip_spaces(), with special treatment of
       "\r\n"), identifiers, and printable ASCII other than '\\' and '^';
     - a path that reports SEG_UNEXPECTED_CHAR.

   NOTE(review): most `case' labels and return statements are not visible
   here, so the exact mapping from characters to branches must be confirmed
   against the full source. */
901 segmenter_parse_mid_command__ (struct segmenter *s,
902 const char *input, size_t n, bool eof,
903 enum segment_type *type)
909 assert (s->state == S_GENERAL);
910 assert (!(s->substate & SS_START_OF_LINE));
912 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
919 s->substate |= SS_START_OF_LINE;
929 else if (input[1] == '*')
931 ofs = skip_comment (input, n, eof, 2);
943 case '(': case ')': case ',': case '=': case '-':
944 case '[': case ']': case '&': case '|': case '+':
950 if (s->substate & SS_START_OF_COMMAND)
952 /* '*' at the beginning of a command begins a comment. */
953 s->state = S_COMMENT_1;
954 return segmenter_parse_comment_1__ (s, input, n, eof, type);
957 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
960 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
963 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
966 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
974 else if (c_isdigit (input[1]))
975 return segmenter_parse_number__ (s, input, n, eof, type);
977 int eol = at_end_of_line (input, n, eof, 1);
983 *type = SEG_END_COMMAND;
984 s->substate = SS_START_OF_COMMAND;
990 case '0': case '1': case '2': case '3': case '4':
991 case '5': case '6': case '7': case '8': case '9':
992 return segmenter_parse_number__ (s, input, n, eof, type);
995 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
996 s, input, n, eof, type);
999 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
1000 s, input, n, eof, type);
1002 case '\'': case '"':
1003 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1004 s, input, n, eof, type);
1007 return segmenter_parse_id__ (s, input, n, eof, type);
1010 if (lex_uc_is_space (uc))
1012 ofs = skip_spaces (input, n, eof, mblen);
1016 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1020 s->substate |= SS_START_OF_LINE;
1021 *type = SEG_NEWLINE;
1030 else if (lex_uc_is_id1 (uc))
1031 return segmenter_parse_id__ (s, input, n, eof, type);
1032 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1040 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback for qsort() over an array of `const char *' elements.
   A_ and B_ each point to an array element, that is, to a pointer to a
   string; the two strings are ordered case-insensitively with
   c_strcasecmp(). */
1048 compare_commands (const void *a_, const void *b_)
1050 const char *const *ap = a_;
1051 const char *const *bp = b_;
1052 const char *a = *ap;
1053 const char *b = *bp;
1055 return c_strcasecmp (a, b);
/* Returns a pointer into a sorted table of command names, positioned at the
   first name whose initial letter equals FIRST, compared after c_toupper().
   If no command begins with that letter, the returned pointer refers to the
   element just past the N_COMMANDS counted entries.

   The table is generated from "language/command.def": the DEF_CMD and
   UNIMPL_CMD macros are defined so that each entry expands to just its NAME.

   NOTE(review): the guard that makes the sort and the index construction run
   only once is not visible here. */
1058 static const char **
1059 segmenter_get_command_name_candidates (unsigned char first)
1061 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1062 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1063 static const char *commands[] =
1065 #include "language/command.def"
/* The count excludes the final array element (presumably a terminating
   sentinel -- confirm against the initializer). */
1068 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
/* cindex[C] points at the first command whose first letter, upper-cased, is
   C. */
1074 static const char **cindex[UCHAR_MAX + 1];
1082 qsort (commands, n_commands, sizeof *commands, compare_commands);
1083 for (i = 0; i < n_commands; i++)
1085 unsigned char c = c_toupper (commands[i][0]);
1086 if (cindex[c] == NULL)
1087 cindex[c] = &commands[i];
/* Letters with no command point at the element after the counted entries. */
1089 for (i = 0; i <= UCHAR_MAX; i++)
1090 if (cindex[i] == NULL)
1091 cindex[i] = &commands[n_commands];
1094 return cindex[c_toupper (first)];
/* Decides whether INPUT begins with the name of a known command.  Characters
   are scanned while they are spaces, identifier characters or '-'.  The
   scanned text is then compared, using command_match(), with each candidate
   command that shares its first letter (case-insensitively); a match counts
   only if MISSING_WORDS is not positive.

   Callers in this file store the result in an `int'.

   NOTE(review): the scan's starting offset, the treatment of a trailing '.',
   and the return values (including the "need more input" case) are not
   visible here. */
1098 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1101 const char **commands;
1118 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1123 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1131 if (input[ofs - 1] == '.')
1134 for (commands = segmenter_get_command_name_candidates (input[0]);
1135 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1141 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1142 &exact, &missing_words)
1143 && missing_words <= 0)
/* Reports whether the text at offset OFS in INPUT looks like the start of a
   string.  For a character C equal to x, X, u or U the answer depends on
   whether the next byte is a single or double quote; otherwise the answer is
   true when C itself is a single quote, a double quote or a newline.

   When a needed byte is not available, the visible returns yield 0 if EOF is
   true and -1 otherwise.

   NOTE(review): the bounds checks that lead to those returns are not visible
   here. */
1151 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1154 return eof ? 0 : -1;
1157 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1160 return eof ? 0 : -1;
1162 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1165 return c == '\'' || c == '"' || c == '\n';
/* Segments the start of a line in S_GENERAL state; SS_START_OF_LINE must be
   set (both conditions are asserted).  The visible logic covers:

     - a leading punctuation character: the text after it is examined with
       skip_spaces_and_comments() and is_start_of_string__(), because such
       punctuation may merely separate pieces of a string; one visible outcome
       reports SEG_START_COMMAND;
     - a leading space: if only spaces and comments remain on the line,
       SEG_SEPARATE_COMMANDS is reported;
     - otherwise the decision depends on the mode: interactive mode or an
       already-started command, auto mode (which calls
       segmenter_detect_command_name__()), or batch mode (asserted as the
       remaining possibility), where one visible outcome reports
       SEG_START_COMMAND.

   The final visible statements set the substate to SS_START_OF_COMMAND, which
   clears SS_START_OF_LINE, and continue with segmenter_parse_mid_command__().

   NOTE(review): the `case' labels, several conditions and most return
   statements are not visible here. */
1169 segmenter_parse_start_of_line__ (struct segmenter *s,
1170 const char *input, size_t n, bool eof,
1171 enum segment_type *type)
1177 assert (s->state == S_GENERAL);
1178 assert (s->substate & SS_START_OF_LINE);
1180 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1187 ofs = skip_spaces_and_comments (input, n, eof, 1);
1192 int is_string = is_start_of_string__ (input, n, eof, ofs);
1197 /* This is punctuation that may separate pieces of a string. */
1207 *type = SEG_START_COMMAND;
1208 s->substate = SS_START_OF_COMMAND;
1212 if (lex_uc_is_space (uc))
1214 int eol = at_end_of_line (input, n, eof, 0);
1219 s->substate = SS_START_OF_COMMAND;
1220 *type = SEG_SEPARATE_COMMANDS;
1226 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1228 else if (s->mode == SEG_MODE_AUTO)
1230 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1237 assert (s->mode == SEG_MODE_BATCH);
1239 s->substate = SS_START_OF_COMMAND;
1240 *type = SEG_START_COMMAND;
1244 s->substate = SS_START_OF_COMMAND;
1245 return segmenter_parse_mid_command__ (s, input, n, eof, type);
/* Handler for the S_FILE_LABEL_1 state.  One segment is taken from INPUT with
   a scratch segmenter SUB in S_GENERAL state.  If that segment is an
   identifier, it is asserted to match LABEL, and segmenter_unquoted() is then
   used on what follows; one visible outcome moves the segmenter to
   S_FILE_LABEL_2.  SUB's substate is copied back into S.

   NOTE(review): the initialization of SUB's other members, the handling of
   the segmenter_unquoted() result, and the return value are not visible
   here. */
1249 segmenter_parse_file_label_1__ (struct segmenter *s,
1250 const char *input, size_t n, bool eof,
1251 enum segment_type *type)
1253 struct segmenter sub;
1257 sub.state = S_GENERAL;
1258 ofs = segmenter_push (&sub, input, n, eof, type);
1262 else if (*type == SEG_IDENTIFIER)
1266 assert (lex_id_match (ss_cstr ("LABEL"),
1267 ss_buffer ((char *) input, ofs)));
1268 result = segmenter_unquoted (input, n, eof, ofs);
1274 s->state = S_FILE_LABEL_2;
1282 s->substate = sub.substate;
/* Handler for the S_FILE_LABEL_2 state: skips white space at the start of
   INPUT with skip_spaces() and moves the segmenter to S_FILE_LABEL_3.

   NOTE(review): the handling of a negative skip_spaces() result, the segment
   type reported, and the return value are not visible here. */
1288 segmenter_parse_file_label_2__ (struct segmenter *s,
1289 const char *input, size_t n, bool eof,
1290 enum segment_type *type)
1294 ofs = skip_spaces (input, n, eof, 0);
1297 s->state = S_FILE_LABEL_3;
/* Handler for the S_FILE_LABEL_3 state: scans INPUT a character at a time
   and reports the text as SEG_UNQUOTED_STRING, returning the segmenter to
   S_GENERAL.  The returned length is ENDCMD when ENDCMD is non-negative and
   the scan offset otherwise.

   NOTE(review): how ENDCMD is maintained (presumably the offset of a
   command-ending '.') and where the scan stops are not visible here. */
1303 segmenter_parse_file_label_3__ (struct segmenter *s,
1304 const char *input, size_t n, bool eof,
1305 enum segment_type *type)
1317 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1331 if (!lex_uc_is_space (uc))
1342 s->state = S_GENERAL;
1344 *type = SEG_UNQUOTED_STRING;
1345 return endcmd >= 0 ? endcmd : ofs;
/* Segments INPUT as general syntax on behalf of a state other than S_GENERAL.
   A scratch segmenter SUB is put into S_GENERAL state with S's substate,
   segmenter_push() is run on it, and SUB's resulting substate is copied back
   into S.  S's own state is left for the caller to update.

   NOTE(review): the initialization of SUB's other members and the return
   statement are not visible here; callers use the result as the segment
   length. */
1352 segmenter_subparse (struct segmenter *s,
1353 const char *input, size_t n, bool eof,
1354 enum segment_type *type)
1356 struct segmenter sub;
1360 sub.state = S_GENERAL;
1361 sub.substate = s->substate;
1362 ofs = segmenter_push (&sub, input, n, eof, type);
1363 s->substate = sub.substate;
1367 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1368 defines the stand-in variables (the head) before the lines of syntax to be
1369 repeated (the body). */
/* The head is segmented as general syntax via segmenter_subparse().  The
   type of the segment just read decides the next state, as noted below.

   NOTE(review): the handling of a negative result from segmenter_subparse()
   and the return statement are not visible here. */
1371 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1372 const char *input, size_t n, bool eof,
1373 enum segment_type *type)
1375 int ofs = segmenter_subparse (s, input, n, eof, type);
1379 if (*type == SEG_SEPARATE_COMMANDS)
1381 /* We reached a blank line that separates the head from the body. */
1382 s->state = S_DO_REPEAT_2;
1384 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1386 /* We reached the body. */
1387 s->state = S_DO_REPEAT_3;
1394 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1395 separates the head from the body. */
/* Segments continue to be read as general syntax via segmenter_subparse()
   until a SEG_NEWLINE segment is seen, at which point the body begins.

   NOTE(review): the handling of a negative result from segmenter_subparse()
   and the return statement are not visible here. */
1397 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1398 const char *input, size_t n, bool eof,
1399 enum segment_type *type)
1401 int ofs = segmenter_subparse (s, input, n, eof, type);
1405 if (*type == SEG_NEWLINE)
1407 /* We reached the body. */
1408 s->state = S_DO_REPEAT_3;
/* Examines a line in the body of DO REPEAT for a nested "DO REPEAT" or
   "END REPEAT".  A leading '+' or '-' is recognized first.  The first
   identifier, obtained with next_id_in_command(), must match DO or END; if
   the second identifier matches REPEAT, DIRECTION is added to s->substate,
   which serves as the nesting level.

   NOTE(review): the values given to DIRECTION (presumably +1 for DO and -1
   for END) and the return values are not visible here; the caller tests the
   result with `!'. */
1416 check_repeat_command (struct segmenter *s,
1417 const char *input, size_t n, bool eof)
1424 if (input[ofs] == '+' || input[ofs] == '-')
1427 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1430 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1432 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1437 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1441 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1442 s->substate += direction;
/* Measures the first line in the N bytes of INPUT.

   If INPUT contains no '\n', returns N when EOF is true and -1 otherwise.
   If the line is empty, that is, the '\n' is the first byte or is preceded
   only by a '\r', *TYPE is set to SEG_NEWLINE.  Otherwise the return value is
   the length of the line without its line ending: the offset of the '\n',
   less one if the byte before it is '\r'.

   NOTE(review): the return value for the empty-line case is not visible
   here. */
1447 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1448 enum segment_type *type)
1450 const char *newline = memchr (input, '\n', n);
1452 return eof ? n : -1;
1454 ptrdiff_t ofs = newline - input;
1455 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1457 *type = SEG_NEWLINE;
1461 return ofs - (input[ofs - 1] == '\r');
1464 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1465 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1467 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1468 the lines we're segmenting. s->substate counts the nesting level, starting
from the value in effect when the body is entered (set elsewhere; confirm).
The body ends when the level drops to 0. */
/* Handler for the S_DO_REPEAT_3 state.  One full line is measured with
   segmenter_parse_full_line__() and checked with check_repeat_command(),
   which may change the nesting level kept in s->substate.  When that level
   is 0 the segmenter returns to S_GENERAL at the start of a command and of a
   line and the same input is segmented again with segmenter_push();
   otherwise the line is reported as SEG_DO_REPEAT_COMMAND.

   NOTE(review): the return statements of the first two branches are not
   visible here. */
1471 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1472 const char *input, size_t n, bool eof,
1473 enum segment_type *type)
1477 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1478 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1480 else if (!check_repeat_command (s, input, n, eof) && !eof)
1482 else if (s->substate == 0)
1484 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1486 s->state = S_GENERAL;
1487 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1488 return segmenter_push (s, input, n, eof, type);
1492 *type = SEG_DO_REPEAT_COMMAND;
1497 /* We are segmenting a DEFINE command, which consists of:
1499 - The DEFINE keyword.
1503 - "(" followed by a sequence of tokens possibly including balanced parentheses
1506 - A sequence of any number of lines, one string per line, ending with
1507 "!ENDDEFINE". The first line is usually blank (that is, a newline follows
1508 the "("). The last line usually just has "!ENDDEFINE." on it, but it can
1509 start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
line. */
/* Handler for the S_DEFINE_1 state: segments the part of a DEFINE command
   that precedes its opening parenthesis, reading general syntax via
   segmenter_subparse().  A "(" punctuation segment moves the segmenter to
   S_DEFINE_2; a segment that ends or starts a command abandons the DEFINE
   and returns to S_GENERAL.

   NOTE(review): the handling of a negative result from segmenter_subparse(),
   any substate changes, and the return statement are not visible here. */
1513 segmenter_parse_define_1__ (struct segmenter *s,
1514 const char *input, size_t n, bool eof,
1515 enum segment_type *type)
1517 int ofs = segmenter_subparse (s, input, n, eof, type);
1521 if (*type == SEG_SEPARATE_COMMANDS
1522 || *type == SEG_END_COMMAND
1523 || *type == SEG_START_COMMAND)
1525 /* The DEFINE command is malformed because we reached its end without
1526 ever hitting a "(" token. Transition back to general parsing. */
1527 s->state = S_GENERAL;
1530 else if (*type == SEG_PUNCT && input[0] == '(')
1532 s->state = S_DEFINE_2;
/* Handler for the S_DEFINE_2 state: segments the parenthesized part of a
   DEFINE command, reading general syntax via segmenter_subparse().  "(" and
   ")" punctuation segments are distinguished (presumably to track nesting
   depth -- the bookkeeping is not visible here), and one visible outcome
   moves the segmenter to S_DEFINE_3.  A segment that ends or starts a command
   abandons the DEFINE and returns to S_GENERAL.

   NOTE(review): the handling of a negative result from segmenter_subparse()
   and the return statement are not visible here. */
1541 segmenter_parse_define_2__ (struct segmenter *s,
1542 const char *input, size_t n, bool eof,
1543 enum segment_type *type)
1545 int ofs = segmenter_subparse (s, input, n, eof, type);
1549 if (*type == SEG_SEPARATE_COMMANDS
1550 || *type == SEG_END_COMMAND
1551 || *type == SEG_START_COMMAND)
1553 /* The DEFINE command is malformed because we reached its end before
1554 closing the set of parentheses. Transition back to general
1556 s->state = S_GENERAL;
1559 else if (*type == SEG_PUNCT && input[0] == '(')
1564 else if (*type == SEG_PUNCT && input[0] == ')')
1569 s->state = S_DEFINE_3;
/* Searches INPUT for "!ENDDEFINE", compared case-insensitively with
   ss_equals_case().  Occurrences inside comments are passed over by calling
   skip_spaces_and_comments() (with EOF forced to true), and occurrences
   inside quoted strings are passed over by scanning to the matching quote
   character.

   The caller in this file compares the result with SIZE_MAX to detect "not
   found" and otherwise uses it as an offset into INPUT.

   NOTE(review): the loop structure and the return statements are not visible
   here. */
1579 find_enddefine (struct substring input)
1581 size_t n = input.length;
1582 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1585 /* Skip !ENDDEFINE in comment. */
1586 ofs = skip_spaces_and_comments (input.string, n, true, ofs);
1587 if (ofs + enddefine.length > n)
1590 char c = input.string[ofs];
1592 && ss_equals_case (ss_substr (input, ofs, enddefine.length),
1595 else if (c == '\'' || c == '"')
1597 /* Skip quoted !ENDDEFINE. */
1603 else if (input.string[ofs++] == c)
1612 /* We are in the body of a macro definition, looking for additional lines of
1613 the body or !ENDDEFINE. */
/* One line is gathered (excluding a trailing "\r\n" or "\n") and searched
   with find_enddefine().  Without !ENDDEFINE the line is reported as
   SEG_MACRO_BODY, or as SEG_SPACES if it is blank and s->substate is 0, and
   the state becomes S_DEFINE_4.  With !ENDDEFINE the segmenter returns to
   S_GENERAL, and the text before !ENDDEFINE is handled according to the
   three cases commented below.

   NOTE(review): the handling of a line with no newline yet, the substate
   updates and most return statements are not visible here. */
1615 segmenter_parse_define_3__ (struct segmenter *s,
1616 const char *input, size_t n, bool eof,
1617 enum segment_type *type)
1619 /* Gather a whole line. */
1620 const char *newline = memchr (input, '\n', n);
1621 int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
1627 /* Does the line contain !ENDDEFINE? */
1628 size_t end = find_enddefine (ss_buffer (input, ofs));
1629 if (end == SIZE_MAX)
1631 /* No !ENDDEFINE. We have a full line of macro body.
1633 The line might be blank, whether completely empty or just spaces and
1634 comments. That's OK: we need to report blank lines because they can
1637 However, if the first line of the macro body (the same line as the
1638 closing parenthesis in the argument definition) is blank, we just
1639 report it as spaces because it's not significant. */
1640 *type = (s->substate == 0 && is_all_spaces (input, ofs)
1641 ? SEG_SPACES : SEG_MACRO_BODY);
1642 s->state = S_DEFINE_4;
1648 /* Macro ends at the !ENDDEFINE on this line. */
1649 s->state = S_GENERAL;
1653 /* Line starts with !ENDDEFINE. */
1654 return segmenter_push (s, input, n, eof, type);
1658 if (is_all_spaces (input, end))
1660 /* Line starts with spaces followed by !ENDDEFINE. */
1665 /* Line starts with some content followed by !ENDDEFINE. */
1666 *type = SEG_MACRO_BODY;
/* Handler for the S_DEFINE_4 state: parses the newline that ends a line of
   macro body with segmenter_parse_newline__() and returns the segmenter to
   S_DEFINE_3 for the next line.

   NOTE(review): the handling of a negative result from the newline parser and
   the return statement are not visible here. */
1674 segmenter_parse_define_4__ (struct segmenter *s,
1675 const char *input, size_t n, bool eof,
1676 enum segment_type *type)
1678 int ofs = segmenter_parse_newline__ (input, n, eof, type);
1682 s->state = S_DEFINE_3;
/* Handler for the S_BEGIN_DATA_1 state: segments general syntax via
   segmenter_subparse() and advances to S_BEGIN_DATA_2 once a SEG_NEWLINE
   segment has been read.

   NOTE(review): the handling of a negative result from segmenter_subparse()
   and the return statement are not visible here. */
1687 segmenter_parse_begin_data_1__ (struct segmenter *s,
1688 const char *input, size_t n, bool eof,
1689 enum segment_type *type)
1691 int ofs = segmenter_subparse (s, input, n, eof, type);
1695 if (*type == SEG_NEWLINE)
1696 s->state = S_BEGIN_DATA_2;
/* Handler for the S_BEGIN_DATA_2 state: segments general syntax via
   segmenter_subparse() and advances to S_BEGIN_DATA_3, where data lines are
   read, once a SEG_NEWLINE segment has been read.

   NOTE(review): the handling of a negative result from segmenter_subparse()
   and the return statement are not visible here. */
1702 segmenter_parse_begin_data_2__ (struct segmenter *s,
1703 const char *input, size_t n, bool eof,
1704 enum segment_type *type)
1706 int ofs = segmenter_subparse (s, input, n, eof, type);
1710 if (*type == SEG_NEWLINE)
1711 s->state = S_BEGIN_DATA_3;
/* Checks whether the N bytes of INPUT form an "END DATA" line.  The visible
   tests require "END" at the start (case-insensitively, and at least 4 bytes
   in total), then examine following characters with lex_uc_is_space(), then
   require "DATA" (case-insensitively), and finally examine the characters
   after it, again with lex_uc_is_space().

   NOTE(review): the loop structure, the treatment of a trailing '.', and the
   return statements are not visible here. */
1717 is_end_data (const char *input, size_t n)
1719 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1725 if (n < 4 || c_strncasecmp (input, "END", 3))
1729 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1730 if (!lex_uc_is_space (uc))
1734 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1741 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1748 else if (!lex_uc_is_space (uc))
/* Handler for the S_BEGIN_DATA_3 state: reads one line of inline data.  The
   line is measured with segmenter_parse_full_line__().  If is_end_data()
   recognizes it, the segmenter returns to S_GENERAL at the start of a command
   and of a line and the same input is segmented again with segmenter_push().
   Otherwise the line is reported as SEG_INLINE_DATA, the state becomes
   S_BEGIN_DATA_4, and the returned length is 0 if the byte before the
   measured end is '\n' and the measured length otherwise.

   NOTE(review): the first branch of the `if' chain (presumably a negative
   result from the line parser) is not visible here. */
1757 segmenter_parse_begin_data_3__ (struct segmenter *s,
1758 const char *input, size_t n, bool eof,
1759 enum segment_type *type)
1763 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1766 else if (is_end_data (input, ofs))
1768 s->state = S_GENERAL;
1769 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1770 return segmenter_push (s, input, n, eof, type);
1774 *type = SEG_INLINE_DATA;
1775 s->state = S_BEGIN_DATA_4;
1776 return input[ofs - 1] == '\n' ? 0 : ofs;
/* Handler for the S_BEGIN_DATA_4 state: parses the newline that ends a line
   of inline data with segmenter_parse_newline__() and returns the segmenter
   to S_BEGIN_DATA_3 for the next line.

   NOTE(review): the handling of a negative result from the newline parser and
   the return statement are not visible here. */
1781 segmenter_parse_begin_data_4__ (struct segmenter *s,
1782 const char *input, size_t n, bool eof,
1783 enum segment_type *type)
1787 ofs = segmenter_parse_newline__ (input, n, eof, type);
1791 s->state = S_BEGIN_DATA_3;
1795 /* Returns the name of segment TYPE as a string. The caller must not modify
1796 or free the returned string.
1798 This is useful only for debugging and testing. */
/* The SEG_TYPE macro is defined so that each segment type expands to a
   `case' that returns the type's NAME as a string literal, without the SEG_
   prefix.  A value that matches no case yields "unknown segment type". */
1800 segment_type_to_string (enum segment_type type)
1804 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1808 return "unknown segment type";
1812 /* Returns a segmenter with the given syntax MODE.
1814 If IS_SNIPPET is false, then the segmenter will parse as if it's being given
1815 a whole file. This means, for example, that it will interpret - or + at the
1816 beginning of the syntax as a separator between commands (since - or + at the
1817 beginning of a line has this meaning).
1819 If IS_SNIPPET is true, then the segmenter will parse as if it's being given
1820 an isolated piece of syntax. This means, for example, that it will
1821 interpret - or + at the beginning of the syntax as an operator token or (if
1822 followed by a digit) as part of a number.
1824 A segmenter does not contain any external references, so nothing needs to be
1825 done to destroy one. For the same reason, segmenters may be copied with
1826 plain struct assignment (or memcpy). */
1828 segmenter_init (enum segmenter_mode mode, bool is_snippet)
/* A snippet starts directly in S_GENERAL state; whole-file input starts in
   S_SHBANG state.  NOTE(review): the remaining initializers are not visible
   here. */
1830 return (struct segmenter) {
1831 .state = is_snippet ? S_GENERAL : S_SHBANG,
1836 /* Returns the mode passed to segmenter_init() for S. */
/* S is const-qualified, so the segmenter is only read.  NOTE(review): the
   body is not visible here. */
1838 segmenter_get_mode (const struct segmenter *s)
1843 /* Attempts to label a prefix of S's remaining input with a segment type. The
1844 caller supplies the first N bytes of the remaining input as INPUT, which
1845 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1846 are the entire (remainder) of the input; if EOF is false, then further input
1847 is potentially available.
1849 The input may contain '\n' or '\r\n' line ends in any combination.
1851 If successful, returns the number of bytes in the segment at the beginning
1852 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1853 into *TYPE. The next call to segmenter_push() should not include those
1854 bytes as part of INPUT, because they have (figuratively) been consumed by
this call.
1857 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1858 be determined. In this case segmenter_push() returns -1. If more input is
1859 available, the caller should obtain some more, then call again with a larger
1860 N. If this is not enough, the process might need to repeat again and again.
1861 If input is exhausted, then the caller may call again setting EOF to true.
1862 segmenter_push() will never return -1 when EOF is true.
1864 The caller must not, in a sequence of calls, supply contradictory input.
1865 That is, bytes provided as part of INPUT in one call, but not consumed, must
1866 not be provided with *different* values on subsequent calls. This is
1867 because segmenter_push() must often make decisions based on looking ahead
1868 beyond the bytes that it consumes. */
1870 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1871 enum segment_type *type)
/* Each return below hands the input to the handler for one segmenter state.
   NOTE(review): the dispatching statement, most `case' labels, and any checks
   made before dispatching are not visible here. */
1887 return segmenter_parse_shbang__ (s, input, n, eof, type);
/* General syntax: the SS_START_OF_LINE flag selects between the
   start-of-line handler and the mid-command handler. */
1890 return (s->substate & SS_START_OF_LINE
1891 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1892 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1895 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1897 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1900 return segmenter_parse_document_1__ (s, input, n, eof, type);
1902 return segmenter_parse_document_2__ (s, input, n, eof, type);
1904 return segmenter_parse_document_3__ (s, type);
1906 case S_FILE_LABEL_1:
1907 return segmenter_parse_file_label_1__ (s, input, n, eof, type);
1908 case S_FILE_LABEL_2:
1909 return segmenter_parse_file_label_2__ (s, input, n, eof, type);
1910 case S_FILE_LABEL_3:
1911 return segmenter_parse_file_label_3__ (s, input, n, eof, type);
1914 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1916 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1918 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1921 return segmenter_parse_define_1__ (s, input, n, eof, type);
1923 return segmenter_parse_define_2__ (s, input, n, eof, type);
1925 return segmenter_parse_define_3__ (s, input, n, eof, type);
1927 return segmenter_parse_define_4__ (s, input, n, eof, type);
1929 case S_BEGIN_DATA_1:
1930 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1931 case S_BEGIN_DATA_2:
1932 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1933 case S_BEGIN_DATA_3:
1934 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1935 case S_BEGIN_DATA_4:
1936 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1942 /* Returns the style of command prompt to display to an interactive user for
1943 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1944 and at the beginning of a line (that is, if segmenter_push() consumed as
1945 much as possible of the input up to a new-line). */
1947 segmenter_get_prompt (const struct segmenter *s)
1952 return PROMPT_FIRST;
1955 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1959 return PROMPT_COMMENT;
1963 return PROMPT_DOCUMENT;
1965 return PROMPT_FIRST;
1967 case S_FILE_LABEL_1:
1968 return PROMPT_LATER;
1969 case S_FILE_LABEL_2:
1970 case S_FILE_LABEL_3:
1971 return PROMPT_FIRST;
1975 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1977 return PROMPT_DO_REPEAT;
1981 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1984 return PROMPT_DEFINE;
1986 case S_BEGIN_DATA_1:
1987 return PROMPT_FIRST;
1988 case S_BEGIN_DATA_2:
1989 return PROMPT_LATER;
1990 case S_BEGIN_DATA_3:
1991 case S_BEGIN_DATA_4: