1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
58 #define SS_START_OF_LINE (1u << 0)
59 #define SS_START_OF_COMMAND (1u << 1)
61 static int segmenter_detect_command_name__ (const char *input,
62 size_t n, bool eof, int ofs);
65 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
68 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
76 mblen = u8_mbtoucr (puc, input, n);
80 return u8_mbtouc (puc, input, n);
91 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
92 bool eof, enum segment_type *type)
100 for (int ofs = 2; ; ofs++)
107 else if (input[ofs] == '\n')
109 if (input[ofs - 1] == '\r')
115 s->state = S_GENERAL;
116 s->substate = SS_START_OF_COMMAND;
126 s->state = S_GENERAL;
127 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
128 return segmenter_push (s, input, n, eof, type);
132 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
133 const char *input, size_t n, bool eof,
134 enum segment_type *type)
136 assert (s->state == S_GENERAL);
142 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
146 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
148 for (; ofs < n; ofs++)
150 if (input[ofs] == '\n')
152 else if (input[ofs] == '*')
155 return eof ? ofs + 1 : -1;
156 else if (input[ofs + 1] == '/')
160 return eof ? ofs : -1;
164 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
171 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
178 return eof ? ofs : -1;
179 else if (input[ofs + 1] != '*')
182 ofs = skip_comment (input, n, eof, ofs + 2);
186 else if (lex_uc_is_space (uc) && uc != '\n')
192 return eof ? ofs : -1;
196 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
200 else if (input[ofs] == '\n')
202 else if (input[ofs] == '\r')
206 return input[ofs + 1] == '\n';
213 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
215 ofs = skip_spaces_and_comments (input, n, eof, ofs);
219 return is_end_of_line (input, n, eof, ofs);
223 is_all_spaces (const char *input_, size_t n)
225 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
228 for (int ofs = 0; ofs < n; ofs += mblen)
231 mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
232 if (!lex_uc_is_space (uc))
239 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
240 enum segment_type *type)
244 if (input[0] == '\n')
254 assert (input[0] == '\r');
255 assert (input[1] == '\n');
264 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
271 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
275 if (!lex_uc_is_space (uc) || uc == '\n')
281 return eof ? ofs : -1;
285 skip_digits (const char *input, size_t n, bool eof, int ofs)
287 for (; ofs < n; ofs++)
288 if (!c_isdigit (input[ofs]))
290 return eof ? ofs : -1;
294 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
295 bool eof, enum segment_type *type, int ofs)
297 assert (s->state == S_GENERAL);
299 ofs = skip_digits (input, n, eof, ofs);
309 if (input[ofs] == '.')
318 ofs = skip_digits (input, n, eof, ofs + 1);
325 if (input[ofs] == 'e' || input[ofs] == 'E')
332 goto expected_exponent;
335 if (input[ofs] == '+' || input[ofs] == '-')
342 goto expected_exponent;
346 if (!c_isdigit (input[ofs]))
347 goto expected_exponent;
349 ofs = skip_digits (input, n, eof, ofs);
354 if (input[ofs - 1] == '.')
356 int eol = at_end_of_line (input, n, eof, ofs);
369 *type = SEG_EXPECTED_EXPONENT;
375 is_reserved_word (const char *s, int n)
379 s0 = c_toupper (s[0]);
383 s1 = c_toupper (s[1]);
384 return ((s0 == 'B' && s1 == 'Y')
385 || (s0 == 'E' && s1 == 'Q')
386 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
387 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
388 || (s0 == 'N' && s1 == 'E')
389 || (s0 == 'O' && s1 == 'R')
390 || (s0 == 'T' && s1 == 'O'));
393 s1 = c_toupper (s[1]);
394 s2 = c_toupper (s[2]);
395 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
396 || (s1 == 'N' && s2 == 'D')))
397 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
400 s1 = c_toupper (s[1]);
401 s2 = c_toupper (s[2]);
402 s3 = c_toupper (s[3]);
403 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
411 segmenter_parse_comment_1__ (struct segmenter *s,
412 const char *input, size_t n, bool eof,
413 enum segment_type *type)
425 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
436 if (ofs > 1 && input[ofs - 1] == '\r')
440 /* Blank line ends comment command. */
441 s->state = S_GENERAL;
442 s->substate = SS_START_OF_COMMAND;
443 *type = SEG_SEPARATE_COMMANDS;
446 else if (endcmd >= 0)
448 /* '.' at end of line ends comment command. */
449 s->state = S_GENERAL;
451 *type = SEG_COMMENT_COMMAND;
456 /* Comment continues onto next line. */
457 *type = SEG_COMMENT_COMMAND;
458 s->state = S_COMMENT_2;
464 if (!lex_uc_is_space (uc))
475 s->state = S_GENERAL;
476 s->substate = SS_START_OF_COMMAND;
477 *type = SEG_SEPARATE_COMMANDS;
485 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
486 size_t n, bool eof, enum segment_type *type)
488 int ofs = segmenter_parse_newline__ (input, n, eof, type);
502 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
506 if (uc == '+' || uc == '-' || uc == '.')
508 else if (!lex_uc_is_space (uc))
511 case SEG_MODE_INTERACTIVE:
520 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
534 s->state = S_GENERAL;
535 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
538 s->state = S_COMMENT_1;
543 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
544 bool eof, enum segment_type *type)
556 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
567 if (ofs > 1 && input[ofs - 1] == '\r')
570 *type = SEG_DOCUMENT;
571 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
575 if (!lex_uc_is_space (uc))
584 *type = SEG_DOCUMENT;
585 s->state = S_DOCUMENT_3;
592 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
593 bool eof, enum segment_type *type)
597 ofs = segmenter_parse_newline__ (input, n, eof, type);
601 s->state = S_DOCUMENT_1;
606 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
608 *type = SEG_END_COMMAND;
609 s->state = S_GENERAL;
610 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
615 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
618 ofs = skip_spaces_and_comments (input, n, eof, ofs);
624 return c != '\'' && c != '"' && c != '\n';
634 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
635 bool eof, int ofs, char id[], size_t id_size)
637 struct segmenter sub;
639 assert (id_size > 0);
642 sub.state = S_GENERAL;
646 enum segment_type type;
649 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
665 if (retval < id_size)
667 memcpy (id, input + ofs, retval);
674 case SEG_QUOTED_STRING:
676 case SEG_UNICODE_STRING:
677 case SEG_UNQUOTED_STRING:
678 case SEG_RESERVED_WORD:
680 case SEG_COMMENT_COMMAND:
681 case SEG_DO_REPEAT_COMMAND:
682 case SEG_INLINE_DATA:
686 case SEG_START_DOCUMENT:
688 case SEG_START_COMMAND:
689 case SEG_SEPARATE_COMMANDS:
690 case SEG_END_COMMAND:
692 case SEG_EXPECTED_QUOTE:
693 case SEG_EXPECTED_EXPONENT:
694 case SEG_UNEXPECTED_CHAR:
702 /* Called when INPUT begins with a character that can start off an ID token. */
704 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
705 bool eof, enum segment_type *type)
711 assert (s->state == S_GENERAL);
713 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
725 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
728 else if (!lex_uc_is_idn (uc))
734 if (input[ofs - 1] == '.')
736 int eol = at_end_of_line (input, n, eof, ofs);
743 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
744 : input[0] == '!' ? SEG_MACRO_ID
747 if (s->substate & SS_START_OF_COMMAND)
749 struct substring word = ss_buffer (input, ofs);
751 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
753 s->state = S_COMMENT_1;
754 return segmenter_parse_comment_1__ (s, input, n, eof, type);
756 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
758 s->state = S_DOCUMENT_1;
759 *type = SEG_START_DOCUMENT;
762 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
764 s->state = S_DEFINE_1;
767 else if (lex_id_match (ss_cstr ("FILE"), word))
771 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
773 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
775 s->state = S_FILE_LABEL_1;
780 else if (lex_id_match (ss_cstr ("DO"), word))
784 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
786 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
788 s->state = S_DO_REPEAT_1;
793 else if (lex_id_match (ss_cstr ("BEGIN"), word))
798 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
801 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
805 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
811 else if (input[ofs2] == '.')
813 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
818 eol = is_end_of_line (input, n, eof, ofs2);
823 if (memchr (input, '\n', ofs2))
824 s->state = S_BEGIN_DATA_1;
826 s->state = S_BEGIN_DATA_2;
839 segmenter_parse_string__ (enum segment_type string_type,
840 int ofs, struct segmenter *s,
841 const char *input, size_t n, bool eof,
842 enum segment_type *type)
844 int quote = input[ofs];
848 if (input[ofs] == quote)
853 if (input[ofs] == quote)
866 else if (input[ofs] == '\n')
877 *type = SEG_EXPECTED_QUOTE;
883 segmenter_maybe_parse_string__ (enum segment_type string_type,
885 const char *input, size_t n, bool eof,
886 enum segment_type *type)
893 else if (input[1] == '\'' || input[1] == '"')
894 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
896 return segmenter_parse_id__ (s, input, n, eof, type);
900 segmenter_parse_mid_command__ (struct segmenter *s,
901 const char *input, size_t n, bool eof,
902 enum segment_type *type)
908 assert (s->state == S_GENERAL);
909 assert (!(s->substate & SS_START_OF_LINE));
911 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
918 s->substate |= SS_START_OF_LINE;
928 else if (input[1] == '*')
930 ofs = skip_comment (input, n, eof, 2);
943 ofs = skip_spaces (input, n, eof, 1);
946 else if (ofs < n && c_isdigit (input[ofs]))
947 return segmenter_parse_number__ (s, input, n, eof, type, ofs);
948 else if (ofs < n && input[ofs] == '.')
955 else if (c_isdigit (input[ofs + 1]))
956 return segmenter_parse_number__ (s, input, n, eof, type, ofs);
959 case '(': case ')': case '{': case ',': case '=': case ';': case ':':
960 case '[': case ']': case '}': case '&': case '|': case '+':
966 if (s->substate & SS_START_OF_COMMAND)
968 /* '*' at the beginning of a command begins a comment. */
969 s->state = S_COMMENT_1;
970 return segmenter_parse_comment_1__ (s, input, n, eof, type);
973 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
976 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
979 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
982 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
990 else if (c_isdigit (input[1]))
991 return segmenter_parse_number__ (s, input, n, eof, type, 0);
993 int eol = at_end_of_line (input, n, eof, 1);
999 *type = SEG_END_COMMAND;
1000 s->substate = SS_START_OF_COMMAND;
1006 case '0': case '1': case '2': case '3': case '4':
1007 case '5': case '6': case '7': case '8': case '9':
1008 return segmenter_parse_number__ (s, input, n, eof, type, 0);
1011 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
1012 s, input, n, eof, type);
1015 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
1016 s, input, n, eof, type);
1018 case '\'': case '"':
1019 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1020 s, input, n, eof, type);
1030 else if (input[1] == '*')
1032 *type = SEG_MACRO_ID;
1036 return segmenter_parse_id__ (s, input, n, eof, type);
1039 if (lex_uc_is_space (uc))
1041 ofs = skip_spaces (input, n, eof, mblen);
1045 if (ofs < n && input[ofs - 1] == '\r' && input[ofs] == '\n')
1049 s->substate |= SS_START_OF_LINE;
1050 *type = SEG_NEWLINE;
1059 else if (lex_uc_is_id1 (uc))
1060 return segmenter_parse_id__ (s, input, n, eof, type);
1061 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1069 *type = SEG_UNEXPECTED_CHAR;
1077 compare_commands (const void *a_, const void *b_)
1079 const char *const *ap = a_;
1080 const char *const *bp = b_;
1081 const char *a = *ap;
1082 const char *b = *bp;
1084 return c_strcasecmp (a, b);
1087 static const char **
1088 segmenter_get_command_name_candidates (unsigned char first)
1090 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1091 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1092 static const char *commands[] =
1094 #include "language/command.def"
1097 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1103 static const char **cindex[UCHAR_MAX + 1];
1111 qsort (commands, n_commands, sizeof *commands, compare_commands);
1112 for (i = 0; i < n_commands; i++)
1114 unsigned char c = c_toupper (commands[i][0]);
1115 if (cindex[c] == NULL)
1116 cindex[c] = &commands[i];
1118 for (i = 0; i <= UCHAR_MAX; i++)
1119 if (cindex[i] == NULL)
1120 cindex[i] = &commands[n_commands];
1123 return cindex[c_toupper (first)];
1127 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1130 const char **commands;
1147 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1152 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1160 if (input[ofs - 1] == '.')
1163 for (commands = segmenter_get_command_name_candidates (input[0]);
1164 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1170 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1171 &exact, &missing_words)
1172 && missing_words <= 0)
1180 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1183 return eof ? 0 : -1;
1186 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1189 return eof ? 0 : -1;
1191 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1194 return c == '\'' || c == '"' || c == '\n';
1198 segmenter_parse_start_of_line__ (struct segmenter *s,
1199 const char *input, size_t n, bool eof,
1200 enum segment_type *type)
1206 assert (s->state == S_GENERAL);
1207 assert (s->substate & SS_START_OF_LINE);
1209 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1216 ofs = skip_spaces_and_comments (input, n, eof, 1);
1221 int is_string = is_start_of_string__ (input, n, eof, ofs);
1226 /* This is punctuation that may separate pieces of a string. */
1236 *type = SEG_START_COMMAND;
1237 s->substate = SS_START_OF_COMMAND;
1241 if (lex_uc_is_space (uc))
1243 int eol = at_end_of_line (input, n, eof, 0);
1248 s->substate = SS_START_OF_COMMAND;
1249 *type = SEG_SEPARATE_COMMANDS;
1255 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1257 else if (s->mode == SEG_MODE_AUTO)
1259 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1266 assert (s->mode == SEG_MODE_BATCH);
1268 s->substate = SS_START_OF_COMMAND;
1269 *type = SEG_START_COMMAND;
1273 s->substate = SS_START_OF_COMMAND;
1274 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1278 segmenter_parse_file_label_1__ (struct segmenter *s,
1279 const char *input, size_t n, bool eof,
1280 enum segment_type *type)
1282 struct segmenter sub;
1286 sub.state = S_GENERAL;
1287 ofs = segmenter_push (&sub, input, n, eof, type);
1291 else if (*type == SEG_IDENTIFIER)
1295 assert (lex_id_match (ss_cstr ("LABEL"),
1296 ss_buffer ((char *) input, ofs)));
1297 result = segmenter_unquoted (input, n, eof, ofs);
1303 s->state = S_FILE_LABEL_2;
1311 s->substate = sub.substate;
1317 segmenter_parse_file_label_2__ (struct segmenter *s,
1318 const char *input, size_t n, bool eof,
1319 enum segment_type *type)
1323 ofs = skip_spaces (input, n, eof, 0);
1326 s->state = S_FILE_LABEL_3;
1332 segmenter_parse_file_label_3__ (struct segmenter *s,
1333 const char *input, size_t n, bool eof,
1334 enum segment_type *type)
1346 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1360 if (!lex_uc_is_space (uc))
1371 s->state = S_GENERAL;
1373 *type = SEG_UNQUOTED_STRING;
1374 return endcmd >= 0 ? endcmd : ofs;
1381 segmenter_subparse (struct segmenter *s,
1382 const char *input, size_t n, bool eof,
1383 enum segment_type *type)
1385 struct segmenter sub;
1389 sub.state = S_GENERAL;
1390 sub.substate = s->substate;
1391 ofs = segmenter_push (&sub, input, n, eof, type);
1392 s->substate = sub.substate;
1396 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1397 defines the stand-in variables (the head) before the lines of syntax to be
1398 repeated (the body). */
1400 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1401 const char *input, size_t n, bool eof,
1402 enum segment_type *type)
1404 int ofs = segmenter_subparse (s, input, n, eof, type);
1408 if (*type == SEG_SEPARATE_COMMANDS)
1410 /* We reached a blank line that separates the head from the body. */
1411 s->state = S_DO_REPEAT_2;
1413 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1415 /* We reached the body. */
1416 s->state = S_DO_REPEAT_3;
1423 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1424 separates the head from the body. */
1426 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1427 const char *input, size_t n, bool eof,
1428 enum segment_type *type)
1430 int ofs = segmenter_subparse (s, input, n, eof, type);
1434 if (*type == SEG_NEWLINE)
1436 /* We reached the body. */
1437 s->state = S_DO_REPEAT_3;
1445 check_repeat_command (struct segmenter *s,
1446 const char *input, size_t n, bool eof)
1453 if (input[ofs] == '+' || input[ofs] == '-')
1456 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1459 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1461 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1466 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1470 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1471 s->substate += direction;
1476 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1477 enum segment_type *type)
1479 const char *newline = memchr (input, '\n', n);
1481 return eof ? n : -1;
1483 ptrdiff_t ofs = newline - input;
1484 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1486 *type = SEG_NEWLINE;
1490 return ofs - (input[ofs - 1] == '\r');
1493 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1494 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1496 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1497 the lines we're segmenting. s->substate counts the nesting level, starting
1500 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1501 const char *input, size_t n, bool eof,
1502 enum segment_type *type)
1506 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1507 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1509 else if (!check_repeat_command (s, input, n, eof) && !eof)
1511 else if (s->substate == 0)
1513 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1515 s->state = S_GENERAL;
1516 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1517 return segmenter_push (s, input, n, eof, type);
1521 *type = SEG_DO_REPEAT_COMMAND;
1526 /* We are segmenting a DEFINE command, which consists of:
1528 - The DEFINE keyword.
1530 - An identifier. We transform this into SEG_MACRO_NAME instead of
1531 SEG_IDENTIFIER or SEG_MACRO_ID because this identifier must never be
1536 - "(" followed by a sequence of tokens possibly including balanced parentheses
1539 - A sequence of any number of lines, one string per line, ending with
1540 "!ENDDEFINE". The first line is usually blank (that is, a newline follows
1541 the "("). The last line usually just has "!ENDDEFINE." on it, but it can
1542 start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
1546 segmenter_parse_define_1_2__ (struct segmenter *s,
1547 const char *input, size_t n, bool eof,
1548 enum segment_type *type)
1550 int ofs = segmenter_subparse (s, input, n, eof, type);
1554 if (s->state == S_DEFINE_1
1555 && (*type == SEG_IDENTIFIER || *type == SEG_MACRO_ID))
1557 *type = SEG_MACRO_NAME;
1558 s->state = S_DEFINE_2;
1560 else if (*type == SEG_SEPARATE_COMMANDS
1561 || *type == SEG_END_COMMAND
1562 || *type == SEG_START_COMMAND)
1564 /* The DEFINE command is malformed because we reached its end without
1565 ever hitting a "(" token. Transition back to general parsing. */
1566 s->state = S_GENERAL;
1569 else if (*type == SEG_PUNCT && input[0] == '(')
1571 s->state = S_DEFINE_3;
1580 segmenter_parse_define_3__ (struct segmenter *s,
1581 const char *input, size_t n, bool eof,
1582 enum segment_type *type)
1584 int ofs = segmenter_subparse (s, input, n, eof, type);
1588 if (*type == SEG_SEPARATE_COMMANDS
1589 || *type == SEG_END_COMMAND
1590 || *type == SEG_START_COMMAND)
1592 /* The DEFINE command is malformed because we reached its end before
1593 closing the set of parentheses. Transition back to general
1595 s->state = S_GENERAL;
1598 else if (*type == SEG_PUNCT && input[0] == '(')
1603 else if (*type == SEG_PUNCT && input[0] == ')')
1608 s->state = S_DEFINE_4;
1618 find_enddefine (struct substring input)
1620 size_t n = input.length;
1621 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1624 /* Skip !ENDDEFINE in comment. */
1625 ofs = skip_spaces_and_comments (input.string, n, true, ofs);
1626 if (ofs + enddefine.length > n)
1629 char c = input.string[ofs];
1631 && ss_equals_case (ss_substr (input, ofs, enddefine.length),
1634 else if (c == '\'' || c == '"')
1636 /* Skip quoted !ENDDEFINE. */
1642 else if (input.string[ofs++] == c)
1651 /* We are in the body of a macro definition, looking for additional lines of
1652 the body or !ENDDEFINE. */
1654 segmenter_parse_define_4__ (struct segmenter *s,
1655 const char *input, size_t n, bool eof,
1656 enum segment_type *type)
1658 /* Gather a whole line. */
1659 const char *newline = memchr (input, '\n', n);
1660 int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
1666 /* Does the line contain !ENDDEFINE? */
1667 size_t end = find_enddefine (ss_buffer (input, ofs));
1668 if (end == SIZE_MAX)
1670 /* No !ENDDEFINE. We have a full line of macro body.
1672 The line might be blank, whether completely empty or just spaces and
1673 comments. That's OK: we need to report blank lines because they can
1676 However, if the first line of the macro body (the same line as the
1677 closing parenthesis in the argument definition) is blank, we just
1678 report it as spaces because it's not significant. */
1679 *type = (s->substate == 0 && is_all_spaces (input, ofs)
1680 ? SEG_SPACES : SEG_MACRO_BODY);
1681 s->state = S_DEFINE_5;
1687 /* Macro ends at the !ENDDEFINE on this line. */
1688 s->state = S_GENERAL;
1692 /* Line starts with !ENDDEFINE. */
1693 return segmenter_push (s, input, n, eof, type);
1697 if (is_all_spaces (input, end))
1699 /* Line starts with spaces followed by !ENDDEFINE. */
1704 /* Line starts with some content followed by !ENDDEFINE. */
1705 *type = SEG_MACRO_BODY;
1713 segmenter_parse_define_5__ (struct segmenter *s,
1714 const char *input, size_t n, bool eof,
1715 enum segment_type *type)
1717 int ofs = segmenter_parse_newline__ (input, n, eof, type);
1721 s->state = S_DEFINE_4;
1726 segmenter_parse_begin_data_1__ (struct segmenter *s,
1727 const char *input, size_t n, bool eof,
1728 enum segment_type *type)
1730 int ofs = segmenter_subparse (s, input, n, eof, type);
1734 if (*type == SEG_NEWLINE)
1735 s->state = S_BEGIN_DATA_2;
1741 segmenter_parse_begin_data_2__ (struct segmenter *s,
1742 const char *input, size_t n, bool eof,
1743 enum segment_type *type)
1745 int ofs = segmenter_subparse (s, input, n, eof, type);
1749 if (*type == SEG_NEWLINE)
1750 s->state = S_BEGIN_DATA_3;
1756 is_end_data (const char *input, size_t n)
1758 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1764 if (n < 4 || c_strncasecmp (input, "END", 3))
1768 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1769 if (!lex_uc_is_space (uc))
1773 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1780 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1787 else if (!lex_uc_is_space (uc))
1796 segmenter_parse_begin_data_3__ (struct segmenter *s,
1797 const char *input, size_t n, bool eof,
1798 enum segment_type *type)
1802 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1805 else if (is_end_data (input, ofs))
1807 s->state = S_GENERAL;
1808 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1809 return segmenter_push (s, input, n, eof, type);
1813 *type = SEG_INLINE_DATA;
1814 s->state = S_BEGIN_DATA_4;
1815 return input[ofs - 1] == '\n' ? 0 : ofs;
1820 segmenter_parse_begin_data_4__ (struct segmenter *s,
1821 const char *input, size_t n, bool eof,
1822 enum segment_type *type)
1826 ofs = segmenter_parse_newline__ (input, n, eof, type);
1830 s->state = S_BEGIN_DATA_3;
1834 /* Returns the name of segment TYPE as a string. The caller must not modify
1835 or free the returned string.
1837 This is useful only for debugging and testing. */
1839 segment_type_to_string (enum segment_type type)
1843 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1847 return "unknown segment type";
1851 /* Returns a segmenter with the given syntax MODE.
1853 If IS_SNIPPET is false, then the segmenter will parse as if it's being given
1854 a whole file. This means, for example, that it will interpret - or + at the
1855 beginning of the syntax as a separator between commands (since - or + at the
1856 beginning of a line has this meaning).
1858 If IS_SNIPPET is true, then the segmenter will parse as if it's being given
1859 an isolated piece of syntax. This means, for example, that it will
1860 interpret - or + at the beginning of the syntax as an operator token or (if
1861 followed by a digit) as part of a number.
1863 A segmenter does not contain any external references, so nothing needs to be
1864 done to destroy one. For the same reason, segmenters may be copied with
1865 plain struct assignment (or memcpy). */
1867 segmenter_init (enum segmenter_mode mode, bool is_snippet)
1869 return (struct segmenter) {
1870 .state = is_snippet ? S_GENERAL : S_SHBANG,
1875 /* Returns the mode passed to segmenter_init() for S. */
1877 segmenter_get_mode (const struct segmenter *s)
1882 /* Attempts to label a prefix of S's remaining input with a segment type. The
1883 caller supplies the first N bytes of the remaining input as INPUT, which
1884 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1885 are the entire (remainder) of the input; if EOF is false, then further input
1886 is potentially available.
1888 The input may contain '\n' or '\r\n' line ends in any combination.
1890 If successful, returns the number of bytes in the segment at the beginning
1891 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1892 into *TYPE. The next call to segmenter_push() should not include those
1893 bytes as part of INPUT, because they have (figuratively) been consumed by
1896 Segments can have zero length, including segment types SEG_END,
1897 SEG_SEPARATE_COMMANDS, SEG_START_DOCUMENT, SEG_INLINE_DATA, and SEG_SPACES.
1899 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1900 be determined. In this case segmenter_push() returns -1. If more input is
1901 available, the caller should obtain some more, then call again with a larger
1902 N. If this is not enough, the process might need to repeat again and again.
1903 If input is exhausted, then the caller may call again setting EOF to true.
1904 segmenter_push() will never return -1 when EOF is true.
1906 The caller must not, in a sequence of calls, supply contradictory input.
1907 That is, bytes provided as part of INPUT in one call, but not consumed, must
1908 not be provided with *different* values on subsequent calls. This is
1909 because segmenter_push() must often make decisions based on looking ahead
1910 beyond the bytes that it consumes. */
1912 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1913 enum segment_type *type)
1929 return segmenter_parse_shbang__ (s, input, n, eof, type);
1932 return (s->substate & SS_START_OF_LINE
1933 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1934 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1937 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1939 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1942 return segmenter_parse_document_1__ (s, input, n, eof, type);
1944 return segmenter_parse_document_2__ (s, input, n, eof, type);
1946 return segmenter_parse_document_3__ (s, type);
1948 case S_FILE_LABEL_1:
1949 return segmenter_parse_file_label_1__ (s, input, n, eof, type);
1950 case S_FILE_LABEL_2:
1951 return segmenter_parse_file_label_2__ (s, input, n, eof, type);
1952 case S_FILE_LABEL_3:
1953 return segmenter_parse_file_label_3__ (s, input, n, eof, type);
1956 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1958 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1960 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1964 return segmenter_parse_define_1_2__ (s, input, n, eof, type);
1966 return segmenter_parse_define_3__ (s, input, n, eof, type);
1968 return segmenter_parse_define_4__ (s, input, n, eof, type);
1970 return segmenter_parse_define_5__ (s, input, n, eof, type);
1972 case S_BEGIN_DATA_1:
1973 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1974 case S_BEGIN_DATA_2:
1975 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1976 case S_BEGIN_DATA_3:
1977 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1978 case S_BEGIN_DATA_4:
1979 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1985 /* Returns the style of command prompt to display to an interactive user for
1986 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1987 and at the beginning of a line (that is, if segmenter_push() consumed as
1988 much as possible of the input up to a new-line). */
1990 segmenter_get_prompt (const struct segmenter *s)
1995 return PROMPT_FIRST;
1998 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
2002 return PROMPT_COMMENT;
2006 return PROMPT_DOCUMENT;
2008 return PROMPT_FIRST;
2010 case S_FILE_LABEL_1:
2011 return PROMPT_LATER;
2012 case S_FILE_LABEL_2:
2013 case S_FILE_LABEL_3:
2014 return PROMPT_FIRST;
2018 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
2020 return PROMPT_DO_REPEAT;
2025 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
2028 return PROMPT_DEFINE;
2030 case S_BEGIN_DATA_1:
2031 return PROMPT_FIRST;
2032 case S_BEGIN_DATA_2:
2033 return PROMPT_LATER;
2034 case S_BEGIN_DATA_3:
2035 case S_BEGIN_DATA_4: