1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
57 #define SS_START_OF_LINE (1u << 0)
58 #define SS_START_OF_COMMAND (1u << 1)
60 static int segmenter_detect_command_name__ (const char *input,
61 size_t n, bool eof, int ofs);
64 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
67 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
75 mblen = u8_mbtoucr (puc, input, n);
79 return u8_mbtouc (puc, input, n);
90 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
91 bool eof, enum segment_type *type)
99 for (int ofs = 2; ; ofs++)
106 else if (input[ofs] == '\n')
108 if (input[ofs - 1] == '\r')
114 s->state = S_GENERAL;
115 s->substate = SS_START_OF_COMMAND;
125 s->state = S_GENERAL;
126 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
127 return segmenter_push (s, input, n, eof, type);
131 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
132 const char *input, size_t n, bool eof,
133 enum segment_type *type)
135 assert (s->state == S_GENERAL);
141 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
145 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
147 for (; ofs < n; ofs++)
149 if (input[ofs] == '\n')
151 else if (input[ofs] == '*')
154 return eof ? ofs + 1 : -1;
155 else if (input[ofs + 1] == '/')
159 return eof ? ofs : -1;
163 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
170 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
177 return eof ? ofs : -1;
178 else if (input[ofs + 1] != '*')
181 ofs = skip_comment (input, n, eof, ofs + 2);
185 else if (lex_uc_is_space (uc) && uc != '\n')
191 return eof ? ofs : -1;
195 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
199 else if (input[ofs] == '\n')
201 else if (input[ofs] == '\r')
205 return input[ofs + 1] == '\n';
212 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
214 ofs = skip_spaces_and_comments (input, n, eof, ofs);
218 return is_end_of_line (input, n, eof, ofs);
222 is_all_spaces (const char *input_, size_t n)
224 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
227 for (int ofs = 0; ofs < n; ofs += mblen)
230 mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc))
238 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
239 enum segment_type *type)
243 if (input[0] == '\n')
253 assert (input[0] == '\r');
254 assert (input[1] == '\n');
263 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
270 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
274 if (!lex_uc_is_space (uc) || uc == '\n')
280 return eof ? ofs : -1;
284 skip_digits (const char *input, size_t n, bool eof, int ofs)
286 for (; ofs < n; ofs++)
287 if (!c_isdigit (input[ofs]))
289 return eof ? ofs : -1;
293 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
294 bool eof, enum segment_type *type)
298 assert (s->state == S_GENERAL);
300 ofs = skip_digits (input, n, eof, 0);
310 if (input[ofs] == '.')
319 ofs = skip_digits (input, n, eof, ofs + 1);
326 if (input[ofs] == 'e' || input[ofs] == 'E')
333 goto expected_exponent;
336 if (input[ofs] == '+' || input[ofs] == '-')
343 goto expected_exponent;
347 if (!c_isdigit (input[ofs]))
348 goto expected_exponent;
350 ofs = skip_digits (input, n, eof, ofs);
355 if (input[ofs - 1] == '.')
357 int eol = at_end_of_line (input, n, eof, ofs);
370 *type = SEG_EXPECTED_EXPONENT;
376 is_reserved_word (const char *s, int n)
380 s0 = c_toupper (s[0]);
384 s1 = c_toupper (s[1]);
385 return ((s0 == 'B' && s1 == 'Y')
386 || (s0 == 'E' && s1 == 'Q')
387 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
388 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
389 || (s0 == 'N' && s1 == 'E')
390 || (s0 == 'O' && s1 == 'R')
391 || (s0 == 'T' && s1 == 'O'));
394 s1 = c_toupper (s[1]);
395 s2 = c_toupper (s[2]);
396 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
397 || (s1 == 'N' && s2 == 'D')))
398 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
401 s1 = c_toupper (s[1]);
402 s2 = c_toupper (s[2]);
403 s3 = c_toupper (s[3]);
404 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
412 segmenter_parse_comment_1__ (struct segmenter *s,
413 const char *input, size_t n, bool eof,
414 enum segment_type *type)
426 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
437 if (ofs > 1 && input[ofs - 1] == '\r')
441 /* Blank line ends comment command. */
442 s->state = S_GENERAL;
443 s->substate = SS_START_OF_COMMAND;
444 *type = SEG_SEPARATE_COMMANDS;
447 else if (endcmd >= 0)
449 /* '.' at end of line ends comment command. */
450 s->state = S_GENERAL;
452 *type = SEG_COMMENT_COMMAND;
457 /* Comment continues onto next line. */
458 *type = SEG_COMMENT_COMMAND;
459 s->state = S_COMMENT_2;
465 if (!lex_uc_is_space (uc))
476 s->state = S_GENERAL;
477 s->substate = SS_START_OF_COMMAND;
478 *type = SEG_SEPARATE_COMMANDS;
486 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
487 size_t n, bool eof, enum segment_type *type)
489 int ofs = segmenter_parse_newline__ (input, n, eof, type);
503 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
507 if (uc == '+' || uc == '-' || uc == '.')
509 else if (!lex_uc_is_space (uc))
512 case SEG_MODE_INTERACTIVE:
521 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
535 s->state = S_GENERAL;
536 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
539 s->state = S_COMMENT_1;
544 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
545 bool eof, enum segment_type *type)
557 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
568 if (ofs > 1 && input[ofs - 1] == '\r')
571 *type = SEG_DOCUMENT;
572 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
576 if (!lex_uc_is_space (uc))
585 *type = SEG_DOCUMENT;
586 s->state = S_DOCUMENT_3;
593 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
594 bool eof, enum segment_type *type)
598 ofs = segmenter_parse_newline__ (input, n, eof, type);
602 s->state = S_DOCUMENT_1;
607 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
609 *type = SEG_END_COMMAND;
610 s->state = S_GENERAL;
611 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
616 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
619 ofs = skip_spaces_and_comments (input, n, eof, ofs);
625 return c != '\'' && c != '"' && c != '\n';
635 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
636 bool eof, int ofs, char id[], size_t id_size)
638 struct segmenter sub;
640 assert (id_size > 0);
643 sub.state = S_GENERAL;
647 enum segment_type type;
650 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
666 if (retval < id_size)
668 memcpy (id, input + ofs, retval);
675 case SEG_QUOTED_STRING:
677 case SEG_UNICODE_STRING:
678 case SEG_UNQUOTED_STRING:
679 case SEG_RESERVED_WORD:
681 case SEG_COMMENT_COMMAND:
682 case SEG_DO_REPEAT_COMMAND:
683 case SEG_INLINE_DATA:
686 case SEG_START_DOCUMENT:
688 case SEG_START_COMMAND:
689 case SEG_SEPARATE_COMMANDS:
690 case SEG_END_COMMAND:
692 case SEG_EXPECTED_QUOTE:
693 case SEG_EXPECTED_EXPONENT:
694 case SEG_UNEXPECTED_CHAR:
702 /* Called when INPUT begins with a character that can start off an ID token. */
704 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
705 bool eof, enum segment_type *type)
711 assert (s->state == S_GENERAL);
713 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
725 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
728 else if (!lex_uc_is_idn (uc))
734 if (input[ofs - 1] == '.')
736 int eol = at_end_of_line (input, n, eof, ofs);
743 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
744 : input[0] == '!' ? SEG_MACRO_ID
747 if (s->substate & SS_START_OF_COMMAND)
749 struct substring word = ss_buffer (input, ofs);
751 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
753 s->state = S_COMMENT_1;
754 return segmenter_parse_comment_1__ (s, input, n, eof, type);
756 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
758 s->state = S_DOCUMENT_1;
759 *type = SEG_START_DOCUMENT;
762 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
764 s->state = S_DEFINE_1;
767 else if (lex_id_match (ss_cstr ("FILE"), word))
771 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
773 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
775 s->state = S_FILE_LABEL_1;
780 else if (lex_id_match (ss_cstr ("DO"), word))
784 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
786 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
788 s->state = S_DO_REPEAT_1;
793 else if (lex_id_match (ss_cstr ("BEGIN"), word))
798 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
801 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
805 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
811 else if (input[ofs2] == '.')
813 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
818 eol = is_end_of_line (input, n, eof, ofs2);
823 if (memchr (input, '\n', ofs2))
824 s->state = S_BEGIN_DATA_1;
826 s->state = S_BEGIN_DATA_2;
839 segmenter_parse_string__ (enum segment_type string_type,
840 int ofs, struct segmenter *s,
841 const char *input, size_t n, bool eof,
842 enum segment_type *type)
844 int quote = input[ofs];
848 if (input[ofs] == quote)
853 if (input[ofs] == quote)
866 else if (input[ofs] == '\n')
877 *type = SEG_EXPECTED_QUOTE;
883 segmenter_maybe_parse_string__ (enum segment_type string_type,
885 const char *input, size_t n, bool eof,
886 enum segment_type *type)
893 else if (input[1] == '\'' || input[1] == '"')
894 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
896 return segmenter_parse_id__ (s, input, n, eof, type);
900 segmenter_parse_mid_command__ (struct segmenter *s,
901 const char *input, size_t n, bool eof,
902 enum segment_type *type)
908 assert (s->state == S_GENERAL);
909 assert (!(s->substate & SS_START_OF_LINE));
911 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
918 s->substate |= SS_START_OF_LINE;
928 else if (input[1] == '*')
930 ofs = skip_comment (input, n, eof, 2);
942 case '(': case ')': case ',': case '=': case '-':
943 case '[': case ']': case '&': case '|': case '+':
949 if (s->substate & SS_START_OF_COMMAND)
951 /* '*' at the beginning of a command begins a comment. */
952 s->state = S_COMMENT_1;
953 return segmenter_parse_comment_1__ (s, input, n, eof, type);
956 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
959 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
962 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
965 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
973 else if (c_isdigit (input[1]))
974 return segmenter_parse_number__ (s, input, n, eof, type);
976 int eol = at_end_of_line (input, n, eof, 1);
982 *type = SEG_END_COMMAND;
983 s->substate = SS_START_OF_COMMAND;
989 case '0': case '1': case '2': case '3': case '4':
990 case '5': case '6': case '7': case '8': case '9':
991 return segmenter_parse_number__ (s, input, n, eof, type);
994 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
995 s, input, n, eof, type);
998 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
999 s, input, n, eof, type);
1001 case '\'': case '"':
1002 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1003 s, input, n, eof, type);
1006 return segmenter_parse_id__ (s, input, n, eof, type);
1009 if (lex_uc_is_space (uc))
1011 ofs = skip_spaces (input, n, eof, mblen);
1015 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1019 s->substate |= SS_START_OF_LINE;
1020 *type = SEG_NEWLINE;
1029 else if (lex_uc_is_id1 (uc))
1030 return segmenter_parse_id__ (s, input, n, eof, type);
1031 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1039 *type = SEG_UNEXPECTED_CHAR;
1047 compare_commands (const void *a_, const void *b_)
1049 const char *const *ap = a_;
1050 const char *const *bp = b_;
1051 const char *a = *ap;
1052 const char *b = *bp;
1054 return c_strcasecmp (a, b);
1057 static const char **
1058 segmenter_get_command_name_candidates (unsigned char first)
1060 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1061 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1062 static const char *commands[] =
1064 #include "language/command.def"
1067 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1073 static const char **cindex[UCHAR_MAX + 1];
1081 qsort (commands, n_commands, sizeof *commands, compare_commands);
1082 for (i = 0; i < n_commands; i++)
1084 unsigned char c = c_toupper (commands[i][0]);
1085 if (cindex[c] == NULL)
1086 cindex[c] = &commands[i];
1088 for (i = 0; i <= UCHAR_MAX; i++)
1089 if (cindex[i] == NULL)
1090 cindex[i] = &commands[n_commands];
1093 return cindex[c_toupper (first)];
1097 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1100 const char **commands;
1117 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1122 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1130 if (input[ofs - 1] == '.')
1133 for (commands = segmenter_get_command_name_candidates (input[0]);
1134 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1140 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1141 &exact, &missing_words)
1142 && missing_words <= 0)
1150 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1153 return eof ? 0 : -1;
1156 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1159 return eof ? 0 : -1;
1161 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1164 return c == '\'' || c == '"' || c == '\n';
1168 segmenter_parse_start_of_line__ (struct segmenter *s,
1169 const char *input, size_t n, bool eof,
1170 enum segment_type *type)
1176 assert (s->state == S_GENERAL);
1177 assert (s->substate & SS_START_OF_LINE);
1179 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1186 ofs = skip_spaces_and_comments (input, n, eof, 1);
1191 int is_string = is_start_of_string__ (input, n, eof, ofs);
1196 /* This is punctuation that may separate pieces of a string. */
1206 *type = SEG_START_COMMAND;
1207 s->substate = SS_START_OF_COMMAND;
1211 if (lex_uc_is_space (uc))
1213 int eol = at_end_of_line (input, n, eof, 0);
1218 s->substate = SS_START_OF_COMMAND;
1219 *type = SEG_SEPARATE_COMMANDS;
1225 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1227 else if (s->mode == SEG_MODE_AUTO)
1229 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1236 assert (s->mode == SEG_MODE_BATCH);
1238 s->substate = SS_START_OF_COMMAND;
1239 *type = SEG_START_COMMAND;
1243 s->substate = SS_START_OF_COMMAND;
1244 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1248 segmenter_parse_file_label_1__ (struct segmenter *s,
1249 const char *input, size_t n, bool eof,
1250 enum segment_type *type)
1252 struct segmenter sub;
1256 sub.state = S_GENERAL;
1257 ofs = segmenter_push (&sub, input, n, eof, type);
1261 else if (*type == SEG_IDENTIFIER)
1265 assert (lex_id_match (ss_cstr ("LABEL"),
1266 ss_buffer ((char *) input, ofs)));
1267 result = segmenter_unquoted (input, n, eof, ofs);
1273 s->state = S_FILE_LABEL_2;
1281 s->substate = sub.substate;
1287 segmenter_parse_file_label_2__ (struct segmenter *s,
1288 const char *input, size_t n, bool eof,
1289 enum segment_type *type)
1293 ofs = skip_spaces (input, n, eof, 0);
1296 s->state = S_FILE_LABEL_3;
1302 segmenter_parse_file_label_3__ (struct segmenter *s,
1303 const char *input, size_t n, bool eof,
1304 enum segment_type *type)
1316 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1330 if (!lex_uc_is_space (uc))
1341 s->state = S_GENERAL;
1343 *type = SEG_UNQUOTED_STRING;
1344 return endcmd >= 0 ? endcmd : ofs;
1351 segmenter_subparse (struct segmenter *s,
1352 const char *input, size_t n, bool eof,
1353 enum segment_type *type)
1355 struct segmenter sub;
1359 sub.state = S_GENERAL;
1360 sub.substate = s->substate;
1361 ofs = segmenter_push (&sub, input, n, eof, type);
1362 s->substate = sub.substate;
1366 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1367 defines the stand-in variables (the head) before the lines of syntax to be
1368 repeated (the body). */
1370 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1371 const char *input, size_t n, bool eof,
1372 enum segment_type *type)
1374 int ofs = segmenter_subparse (s, input, n, eof, type);
1378 if (*type == SEG_SEPARATE_COMMANDS)
1380 /* We reached a blank line that separates the head from the body. */
1381 s->state = S_DO_REPEAT_2;
1383 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1385 /* We reached the body. */
1386 s->state = S_DO_REPEAT_3;
1393 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1394 separates the head from the body. */
1396 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1397 const char *input, size_t n, bool eof,
1398 enum segment_type *type)
1400 int ofs = segmenter_subparse (s, input, n, eof, type);
1404 if (*type == SEG_NEWLINE)
1406 /* We reached the body. */
1407 s->state = S_DO_REPEAT_3;
1415 check_repeat_command (struct segmenter *s,
1416 const char *input, size_t n, bool eof)
1423 if (input[ofs] == '+' || input[ofs] == '-')
1426 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1429 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1431 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1436 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1440 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1441 s->substate += direction;
1446 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1447 enum segment_type *type)
1449 const char *newline = memchr (input, '\n', n);
1451 return eof ? n : -1;
1453 ptrdiff_t ofs = newline - input;
1454 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1456 *type = SEG_NEWLINE;
1460 return ofs - (input[ofs - 1] == '\r');
1463 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1464 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1466 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1467 the lines we're segmenting. s->substate counts the nesting level, starting
1470 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1471 const char *input, size_t n, bool eof,
1472 enum segment_type *type)
1476 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1477 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1479 else if (!check_repeat_command (s, input, n, eof) && !eof)
1481 else if (s->substate == 0)
1483 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1485 s->state = S_GENERAL;
1486 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1487 return segmenter_push (s, input, n, eof, type);
1491 *type = SEG_DO_REPEAT_COMMAND;
1496 /* We are segmenting a DEFINE command, which consists of:
1498 - The DEFINE keyword.
1502 - "(" followed by a sequence of tokens possibly including balanced parentheses
1505 - A sequence of any number of lines, one string per line, ending with
1506 "!ENDDEFINE". The first line is usually blank (that is, a newline follows
1507 the "("). The last line usually just has "!ENDDEFINE." on it, but it can
1508 start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
1512 segmenter_parse_define_1__ (struct segmenter *s,
1513 const char *input, size_t n, bool eof,
1514 enum segment_type *type)
1516 int ofs = segmenter_subparse (s, input, n, eof, type);
1520 if (*type == SEG_SEPARATE_COMMANDS
1521 || *type == SEG_END_COMMAND
1522 || *type == SEG_START_COMMAND)
1524 /* The DEFINE command is malformed because we reached its end without
1525 ever hitting a "(" token. Transition back to general parsing. */
1526 s->state = S_GENERAL;
1529 else if (*type == SEG_PUNCT && input[0] == '(')
1531 s->state = S_DEFINE_2;
1540 segmenter_parse_define_2__ (struct segmenter *s,
1541 const char *input, size_t n, bool eof,
1542 enum segment_type *type)
1544 int ofs = segmenter_subparse (s, input, n, eof, type);
1548 if (*type == SEG_SEPARATE_COMMANDS
1549 || *type == SEG_END_COMMAND
1550 || *type == SEG_START_COMMAND)
1552 /* The DEFINE command is malformed because we reached its end before
1553 closing the set of parentheses. Transition back to general
1555 s->state = S_GENERAL;
1558 else if (*type == SEG_PUNCT && input[0] == '(')
1563 else if (*type == SEG_PUNCT && input[0] == ')')
1568 s->state = S_DEFINE_3;
1578 find_enddefine (struct substring input)
1580 size_t n = input.length;
1581 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1584 /* Skip !ENDDEFINE in comment. */
1585 ofs = skip_spaces_and_comments (input.string, n, true, ofs);
1586 if (ofs + enddefine.length > n)
1589 char c = input.string[ofs];
1591 && ss_equals_case (ss_substr (input, ofs, enddefine.length),
1594 else if (c == '\'' || c == '"')
1596 /* Skip quoted !ENDDEFINE. */
1602 else if (input.string[ofs++] == c)
1611 /* We are in the body of a macro definition, looking for additional lines of
1612 the body or !ENDDEFINE. */
1614 segmenter_parse_define_3__ (struct segmenter *s,
1615 const char *input, size_t n, bool eof,
1616 enum segment_type *type)
1618 /* Gather a whole line. */
1619 const char *newline = memchr (input, '\n', n);
1620 int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
1626 /* Does the line contain !ENDDEFINE? */
1627 size_t end = find_enddefine (ss_buffer (input, ofs));
1628 if (end == SIZE_MAX)
1630 /* No !ENDDEFINE. We have a full line of macro body.
1632 The line might be blank, whether completely empty or just spaces and
1633 comments. That's OK: we need to report blank lines because they can
1636 However, if the first line of the macro body (the same line as the
1637 closing parenthesis in the argument definition) is blank, we just
1638 report it as spaces because it's not significant. */
1639 *type = (s->substate == 0 && is_all_spaces (input, ofs)
1640 ? SEG_SPACES : SEG_MACRO_BODY);
1641 s->state = S_DEFINE_4;
1647 /* Macro ends at the !ENDDEFINE on this line. */
1648 s->state = S_GENERAL;
1652 /* Line starts with !ENDDEFINE. */
1653 return segmenter_push (s, input, n, eof, type);
1657 if (is_all_spaces (input, end))
1659 /* Line starts with spaces followed by !ENDDEFINE. */
1664 /* Line starts with some content followed by !ENDDEFINE. */
1665 *type = SEG_MACRO_BODY;
1673 segmenter_parse_define_4__ (struct segmenter *s,
1674 const char *input, size_t n, bool eof,
1675 enum segment_type *type)
1677 int ofs = segmenter_parse_newline__ (input, n, eof, type);
1681 s->state = S_DEFINE_3;
1686 segmenter_parse_begin_data_1__ (struct segmenter *s,
1687 const char *input, size_t n, bool eof,
1688 enum segment_type *type)
1690 int ofs = segmenter_subparse (s, input, n, eof, type);
1694 if (*type == SEG_NEWLINE)
1695 s->state = S_BEGIN_DATA_2;
1701 segmenter_parse_begin_data_2__ (struct segmenter *s,
1702 const char *input, size_t n, bool eof,
1703 enum segment_type *type)
1705 int ofs = segmenter_subparse (s, input, n, eof, type);
1709 if (*type == SEG_NEWLINE)
1710 s->state = S_BEGIN_DATA_3;
1716 is_end_data (const char *input, size_t n)
1718 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1724 if (n < 4 || c_strncasecmp (input, "END", 3))
1728 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1729 if (!lex_uc_is_space (uc))
1733 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1740 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1747 else if (!lex_uc_is_space (uc))
1756 segmenter_parse_begin_data_3__ (struct segmenter *s,
1757 const char *input, size_t n, bool eof,
1758 enum segment_type *type)
1762 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1765 else if (is_end_data (input, ofs))
1767 s->state = S_GENERAL;
1768 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1769 return segmenter_push (s, input, n, eof, type);
1773 *type = SEG_INLINE_DATA;
1774 s->state = S_BEGIN_DATA_4;
1775 return input[ofs - 1] == '\n' ? 0 : ofs;
1780 segmenter_parse_begin_data_4__ (struct segmenter *s,
1781 const char *input, size_t n, bool eof,
1782 enum segment_type *type)
1786 ofs = segmenter_parse_newline__ (input, n, eof, type);
1790 s->state = S_BEGIN_DATA_3;
1794 /* Returns the name of segment TYPE as a string. The caller must not modify
1795 or free the returned string.
1797 This is useful only for debugging and testing. */
1799 segment_type_to_string (enum segment_type type)
1803 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1807 return "unknown segment type";
1811 /* Returns a segmenter with the given syntax MODE.
1813 If IS_SNIPPET is false, then the segmenter will parse as if it's being given
1814 a whole file. This means, for example, that it will interpret - or + at the
1815 beginning of the syntax as a separator between commands (since - or + at the
1816 beginning of a line has this meaning).
1818 If IS_SNIPPET is true, then the segmenter will parse as if it's being given
1819 an isolated piece of syntax. This means, for example, that it will
1820 interpret - or + at the beginning of the syntax as an operator token or (if
1821 followed by a digit) as part of a number.
1823 A segmenter does not contain any external references, so nothing needs to be
1824 done to destroy one. For the same reason, segmenters may be copied with
1825 plain struct assignment (or memcpy). */
1827 segmenter_init (enum segmenter_mode mode, bool is_snippet)
1829 return (struct segmenter) {
1830 .state = is_snippet ? S_GENERAL : S_SHBANG,
1835 /* Returns the mode passed to segmenter_init() for S. */
1837 segmenter_get_mode (const struct segmenter *s)
1842 /* Attempts to label a prefix of S's remaining input with a segment type. The
1843 caller supplies the first N bytes of the remaining input as INPUT, which
1844 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1845 are the entire remainder of the input; if EOF is false, then further input
1846 is potentially available.
1848 The input may contain '\n' or '\r\n' line ends in any combination.
1850 If successful, returns the number of bytes in the segment at the beginning
1851 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1852 into *TYPE. The next call to segmenter_push() should not include those
1853 bytes as part of INPUT, because they have (figuratively) been consumed by
1856 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1857 be determined. In this case segmenter_push() returns -1. If more input is
1858 available, the caller should obtain some more, then call again with a larger
1859 N. If this is not enough, the process might need to repeat again and again.
1860 If input is exhausted, then the caller may call again setting EOF to true.
1861 segmenter_push() will never return -1 when EOF is true.
1863 The caller must not, in a sequence of calls, supply contradictory input.
1864 That is, bytes provided as part of INPUT in one call, but not consumed, must
1865 not be provided with *different* values on subsequent calls. This is
1866 because segmenter_push() must often make decisions based on looking ahead
1867 beyond the bytes that it consumes. */
1869 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1870 enum segment_type *type)
1886 return segmenter_parse_shbang__ (s, input, n, eof, type);
1889 return (s->substate & SS_START_OF_LINE
1890 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1891 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1894 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1896 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1899 return segmenter_parse_document_1__ (s, input, n, eof, type);
1901 return segmenter_parse_document_2__ (s, input, n, eof, type);
1903 return segmenter_parse_document_3__ (s, type);
1905 case S_FILE_LABEL_1:
1906 return segmenter_parse_file_label_1__ (s, input, n, eof, type);
1907 case S_FILE_LABEL_2:
1908 return segmenter_parse_file_label_2__ (s, input, n, eof, type);
1909 case S_FILE_LABEL_3:
1910 return segmenter_parse_file_label_3__ (s, input, n, eof, type);
1913 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1915 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1917 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1920 return segmenter_parse_define_1__ (s, input, n, eof, type);
1922 return segmenter_parse_define_2__ (s, input, n, eof, type);
1924 return segmenter_parse_define_3__ (s, input, n, eof, type);
1926 return segmenter_parse_define_4__ (s, input, n, eof, type);
1928 case S_BEGIN_DATA_1:
1929 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1930 case S_BEGIN_DATA_2:
1931 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1932 case S_BEGIN_DATA_3:
1933 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1934 case S_BEGIN_DATA_4:
1935 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1941 /* Returns the style of command prompt to display to an interactive user for
1942 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1943 and at the beginning of a line (that is, if segmenter_push() consumed as
1944 much as possible of the input up to a new-line). */
1946 segmenter_get_prompt (const struct segmenter *s)
1951 return PROMPT_FIRST;
1954 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1958 return PROMPT_COMMENT;
1962 return PROMPT_DOCUMENT;
1964 return PROMPT_FIRST;
1966 case S_FILE_LABEL_1:
1967 return PROMPT_LATER;
1968 case S_FILE_LABEL_2:
1969 case S_FILE_LABEL_3:
1970 return PROMPT_FIRST;
1974 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1976 return PROMPT_DO_REPEAT;
1980 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1983 return PROMPT_DEFINE;
1985 case S_BEGIN_DATA_1:
1986 return PROMPT_FIRST;
1987 case S_BEGIN_DATA_2:
1988 return PROMPT_LATER;
1989 case S_BEGIN_DATA_3:
1990 case S_BEGIN_DATA_4: