1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
/* Flag bits that the parsers in this file OR together and store in a
   segmenter's `substate' member. */

/* Set when the next input is at the start of a line.  segmenter_push() tests
   this bit to choose segmenter_parse_start_of_line__() over
   segmenter_parse_mid_command__(). */
57 #define SS_START_OF_LINE (1u << 0)
/* Set when the next input is at the start of a command.  The parsers test
   this bit to recognize constructs that are only special there, such as a
   leading '*' that begins a comment command. */
58 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration.  The definition appears later in this file; it is
   declared here because the comment and start-of-line parsers call it before
   that point.  INPUT, N and EOF are passed through from the caller and OFS is
   the byte offset in INPUT at which to start looking for a command name. */
60 static int segmenter_detect_command_name__ (const char *input,
61 size_t n, bool eof, int ofs);
64 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
67 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
75 mblen = u8_mbtoucr (puc, input, n);
79 return u8_mbtouc (puc, input, n);
90 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
91 bool eof, enum segment_type *type)
99 for (int ofs = 2; ; ofs++)
106 else if (input[ofs] == '\n')
108 if (input[ofs - 1] == '\r')
114 s->state = S_GENERAL;
115 s->substate = SS_START_OF_COMMAND;
125 s->state = S_GENERAL;
126 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
127 return segmenter_push (s, input, n, eof, type);
131 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
132 const char *input, size_t n, bool eof,
133 enum segment_type *type)
135 assert (s->state == S_GENERAL);
141 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
145 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
147 for (; ofs < n; ofs++)
149 if (input[ofs] == '\n')
151 else if (input[ofs] == '*')
154 return eof ? ofs + 1 : -1;
155 else if (input[ofs + 1] == '/')
159 return eof ? ofs : -1;
163 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
170 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
177 return eof ? ofs : -1;
178 else if (input[ofs + 1] != '*')
181 ofs = skip_comment (input, n, eof, ofs + 2);
185 else if (lex_uc_is_space (uc) && uc != '\n')
191 return eof ? ofs : -1;
195 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
199 else if (input[ofs] == '\n')
201 else if (input[ofs] == '\r')
205 return input[ofs + 1] == '\n';
212 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
214 ofs = skip_spaces_and_comments (input, n, eof, ofs);
218 return is_end_of_line (input, n, eof, ofs);
222 is_all_spaces (const char *input_, size_t n)
224 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
227 for (int ofs = 0; ofs < n; ofs += mblen)
230 mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc))
238 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
239 enum segment_type *type)
243 if (input[0] == '\n')
253 assert (input[0] == '\r');
254 assert (input[1] == '\n');
263 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
270 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
274 if (!lex_uc_is_space (uc) || uc == '\n')
280 return eof ? ofs : -1;
284 skip_digits (const char *input, size_t n, bool eof, int ofs)
286 for (; ofs < n; ofs++)
287 if (!c_isdigit (input[ofs]))
289 return eof ? ofs : -1;
293 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
294 bool eof, enum segment_type *type, int ofs)
296 assert (s->state == S_GENERAL);
298 ofs = skip_digits (input, n, eof, ofs);
308 if (input[ofs] == '.')
317 ofs = skip_digits (input, n, eof, ofs + 1);
324 if (input[ofs] == 'e' || input[ofs] == 'E')
331 goto expected_exponent;
334 if (input[ofs] == '+' || input[ofs] == '-')
341 goto expected_exponent;
345 if (!c_isdigit (input[ofs]))
346 goto expected_exponent;
348 ofs = skip_digits (input, n, eof, ofs);
353 if (input[ofs - 1] == '.')
355 int eol = at_end_of_line (input, n, eof, ofs);
368 *type = SEG_EXPECTED_EXPONENT;
374 is_reserved_word (const char *s, int n)
378 s0 = c_toupper (s[0]);
382 s1 = c_toupper (s[1]);
383 return ((s0 == 'B' && s1 == 'Y')
384 || (s0 == 'E' && s1 == 'Q')
385 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
386 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
387 || (s0 == 'N' && s1 == 'E')
388 || (s0 == 'O' && s1 == 'R')
389 || (s0 == 'T' && s1 == 'O'));
392 s1 = c_toupper (s[1]);
393 s2 = c_toupper (s[2]);
394 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
395 || (s1 == 'N' && s2 == 'D')))
396 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
399 s1 = c_toupper (s[1]);
400 s2 = c_toupper (s[2]);
401 s3 = c_toupper (s[3]);
402 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
410 segmenter_parse_comment_1__ (struct segmenter *s,
411 const char *input, size_t n, bool eof,
412 enum segment_type *type)
424 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
435 if (ofs > 1 && input[ofs - 1] == '\r')
439 /* Blank line ends comment command. */
440 s->state = S_GENERAL;
441 s->substate = SS_START_OF_COMMAND;
442 *type = SEG_SEPARATE_COMMANDS;
445 else if (endcmd >= 0)
447 /* '.' at end of line ends comment command. */
448 s->state = S_GENERAL;
450 *type = SEG_COMMENT_COMMAND;
455 /* Comment continues onto next line. */
456 *type = SEG_COMMENT_COMMAND;
457 s->state = S_COMMENT_2;
463 if (!lex_uc_is_space (uc))
474 s->state = S_GENERAL;
475 s->substate = SS_START_OF_COMMAND;
476 *type = SEG_SEPARATE_COMMANDS;
484 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
485 size_t n, bool eof, enum segment_type *type)
487 int ofs = segmenter_parse_newline__ (input, n, eof, type);
501 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
505 if (uc == '+' || uc == '-' || uc == '.')
507 else if (!lex_uc_is_space (uc))
510 case SEG_MODE_INTERACTIVE:
519 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
533 s->state = S_GENERAL;
534 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
537 s->state = S_COMMENT_1;
542 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
543 bool eof, enum segment_type *type)
555 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
566 if (ofs > 1 && input[ofs - 1] == '\r')
569 *type = SEG_DOCUMENT;
570 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
574 if (!lex_uc_is_space (uc))
583 *type = SEG_DOCUMENT;
584 s->state = S_DOCUMENT_3;
591 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
592 bool eof, enum segment_type *type)
596 ofs = segmenter_parse_newline__ (input, n, eof, type);
600 s->state = S_DOCUMENT_1;
605 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
607 *type = SEG_END_COMMAND;
608 s->state = S_GENERAL;
609 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
614 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
617 ofs = skip_spaces_and_comments (input, n, eof, ofs);
623 return c != '\'' && c != '"' && c != '\n';
633 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
634 bool eof, int ofs, char id[], size_t id_size)
636 struct segmenter sub;
638 assert (id_size > 0);
641 sub.state = S_GENERAL;
645 enum segment_type type;
648 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
664 if (retval < id_size)
666 memcpy (id, input + ofs, retval);
673 case SEG_QUOTED_STRING:
675 case SEG_UNICODE_STRING:
676 case SEG_UNQUOTED_STRING:
677 case SEG_RESERVED_WORD:
679 case SEG_COMMENT_COMMAND:
680 case SEG_DO_REPEAT_COMMAND:
681 case SEG_INLINE_DATA:
684 case SEG_START_DOCUMENT:
686 case SEG_START_COMMAND:
687 case SEG_SEPARATE_COMMANDS:
688 case SEG_END_COMMAND:
690 case SEG_EXPECTED_QUOTE:
691 case SEG_EXPECTED_EXPONENT:
692 case SEG_UNEXPECTED_CHAR:
700 /* Called when INPUT begins with a character that can start off an ID token. */
702 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
703 bool eof, enum segment_type *type)
709 assert (s->state == S_GENERAL);
711 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
723 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
726 else if (!lex_uc_is_idn (uc))
732 if (input[ofs - 1] == '.')
734 int eol = at_end_of_line (input, n, eof, ofs);
741 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
742 : input[0] == '!' ? SEG_MACRO_ID
745 if (s->substate & SS_START_OF_COMMAND)
747 struct substring word = ss_buffer (input, ofs);
749 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
751 s->state = S_COMMENT_1;
752 return segmenter_parse_comment_1__ (s, input, n, eof, type);
754 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
756 s->state = S_DOCUMENT_1;
757 *type = SEG_START_DOCUMENT;
760 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
762 s->state = S_DEFINE_1;
765 else if (lex_id_match (ss_cstr ("FILE"), word))
769 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
771 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
773 s->state = S_FILE_LABEL_1;
778 else if (lex_id_match (ss_cstr ("DO"), word))
782 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
784 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
786 s->state = S_DO_REPEAT_1;
791 else if (lex_id_match (ss_cstr ("BEGIN"), word))
796 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
799 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
803 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
809 else if (input[ofs2] == '.')
811 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
816 eol = is_end_of_line (input, n, eof, ofs2);
821 if (memchr (input, '\n', ofs2))
822 s->state = S_BEGIN_DATA_1;
824 s->state = S_BEGIN_DATA_2;
837 segmenter_parse_string__ (enum segment_type string_type,
838 int ofs, struct segmenter *s,
839 const char *input, size_t n, bool eof,
840 enum segment_type *type)
842 int quote = input[ofs];
846 if (input[ofs] == quote)
851 if (input[ofs] == quote)
864 else if (input[ofs] == '\n')
875 *type = SEG_EXPECTED_QUOTE;
881 segmenter_maybe_parse_string__ (enum segment_type string_type,
883 const char *input, size_t n, bool eof,
884 enum segment_type *type)
891 else if (input[1] == '\'' || input[1] == '"')
892 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
894 return segmenter_parse_id__ (s, input, n, eof, type);
898 segmenter_parse_mid_command__ (struct segmenter *s,
899 const char *input, size_t n, bool eof,
900 enum segment_type *type)
906 assert (s->state == S_GENERAL);
907 assert (!(s->substate & SS_START_OF_LINE));
909 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
916 s->substate |= SS_START_OF_LINE;
926 else if (input[1] == '*')
928 ofs = skip_comment (input, n, eof, 2);
941 ofs = skip_spaces (input, n, eof, 1);
944 else if (c_isdigit (input[ofs]))
945 return segmenter_parse_number__ (s, input, n, eof, type, ofs);
946 else if (input[ofs] == '.')
953 else if (c_isdigit (input[ofs + 1]))
954 return segmenter_parse_number__ (s, input, n, eof, type, ofs);
957 case '(': case ')': case ',': case '=':
958 case '[': case ']': case '&': case '|': case '+':
964 if (s->substate & SS_START_OF_COMMAND)
966 /* '*' at the beginning of a command begins a comment. */
967 s->state = S_COMMENT_1;
968 return segmenter_parse_comment_1__ (s, input, n, eof, type);
971 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
974 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
977 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
980 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
988 else if (c_isdigit (input[1]))
989 return segmenter_parse_number__ (s, input, n, eof, type, 0);
991 int eol = at_end_of_line (input, n, eof, 1);
997 *type = SEG_END_COMMAND;
998 s->substate = SS_START_OF_COMMAND;
1004 case '0': case '1': case '2': case '3': case '4':
1005 case '5': case '6': case '7': case '8': case '9':
1006 return segmenter_parse_number__ (s, input, n, eof, type, 0);
1009 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
1010 s, input, n, eof, type);
1013 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
1014 s, input, n, eof, type);
1016 case '\'': case '"':
1017 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1018 s, input, n, eof, type);
1028 else if (input[1] == '*')
1030 *type = SEG_MACRO_ID;
1034 return segmenter_parse_id__ (s, input, n, eof, type);
1037 if (lex_uc_is_space (uc))
1039 ofs = skip_spaces (input, n, eof, mblen);
1043 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1047 s->substate |= SS_START_OF_LINE;
1048 *type = SEG_NEWLINE;
1057 else if (lex_uc_is_id1 (uc))
1058 return segmenter_parse_id__ (s, input, n, eof, type);
1059 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1067 *type = SEG_UNEXPECTED_CHAR;
1075 compare_commands (const void *a_, const void *b_)
1077 const char *const *ap = a_;
1078 const char *const *bp = b_;
1079 const char *a = *ap;
1080 const char *b = *bp;
1082 return c_strcasecmp (a, b);
1085 static const char **
1086 segmenter_get_command_name_candidates (unsigned char first)
1088 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1089 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1090 static const char *commands[] =
1092 #include "language/command.def"
1095 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1101 static const char **cindex[UCHAR_MAX + 1];
1109 qsort (commands, n_commands, sizeof *commands, compare_commands);
1110 for (i = 0; i < n_commands; i++)
1112 unsigned char c = c_toupper (commands[i][0]);
1113 if (cindex[c] == NULL)
1114 cindex[c] = &commands[i];
1116 for (i = 0; i <= UCHAR_MAX; i++)
1117 if (cindex[i] == NULL)
1118 cindex[i] = &commands[n_commands];
1121 return cindex[c_toupper (first)];
1125 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1128 const char **commands;
1145 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1150 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1158 if (input[ofs - 1] == '.')
1161 for (commands = segmenter_get_command_name_candidates (input[0]);
1162 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1168 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1169 &exact, &missing_words)
1170 && missing_words <= 0)
1178 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1181 return eof ? 0 : -1;
1184 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1187 return eof ? 0 : -1;
1189 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1192 return c == '\'' || c == '"' || c == '\n';
1196 segmenter_parse_start_of_line__ (struct segmenter *s,
1197 const char *input, size_t n, bool eof,
1198 enum segment_type *type)
1204 assert (s->state == S_GENERAL);
1205 assert (s->substate & SS_START_OF_LINE);
1207 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1214 ofs = skip_spaces_and_comments (input, n, eof, 1);
1219 int is_string = is_start_of_string__ (input, n, eof, ofs);
1224 /* This is punctuation that may separate pieces of a string. */
1234 *type = SEG_START_COMMAND;
1235 s->substate = SS_START_OF_COMMAND;
1239 if (lex_uc_is_space (uc))
1241 int eol = at_end_of_line (input, n, eof, 0);
1246 s->substate = SS_START_OF_COMMAND;
1247 *type = SEG_SEPARATE_COMMANDS;
1253 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1255 else if (s->mode == SEG_MODE_AUTO)
1257 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1264 assert (s->mode == SEG_MODE_BATCH);
1266 s->substate = SS_START_OF_COMMAND;
1267 *type = SEG_START_COMMAND;
1271 s->substate = SS_START_OF_COMMAND;
1272 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1276 segmenter_parse_file_label_1__ (struct segmenter *s,
1277 const char *input, size_t n, bool eof,
1278 enum segment_type *type)
1280 struct segmenter sub;
1284 sub.state = S_GENERAL;
1285 ofs = segmenter_push (&sub, input, n, eof, type);
1289 else if (*type == SEG_IDENTIFIER)
1293 assert (lex_id_match (ss_cstr ("LABEL"),
1294 ss_buffer ((char *) input, ofs)));
1295 result = segmenter_unquoted (input, n, eof, ofs);
1301 s->state = S_FILE_LABEL_2;
1309 s->substate = sub.substate;
1315 segmenter_parse_file_label_2__ (struct segmenter *s,
1316 const char *input, size_t n, bool eof,
1317 enum segment_type *type)
1321 ofs = skip_spaces (input, n, eof, 0);
1324 s->state = S_FILE_LABEL_3;
1330 segmenter_parse_file_label_3__ (struct segmenter *s,
1331 const char *input, size_t n, bool eof,
1332 enum segment_type *type)
1344 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1358 if (!lex_uc_is_space (uc))
1369 s->state = S_GENERAL;
1371 *type = SEG_UNQUOTED_STRING;
1372 return endcmd >= 0 ? endcmd : ofs;
1379 segmenter_subparse (struct segmenter *s,
1380 const char *input, size_t n, bool eof,
1381 enum segment_type *type)
1383 struct segmenter sub;
1387 sub.state = S_GENERAL;
1388 sub.substate = s->substate;
1389 ofs = segmenter_push (&sub, input, n, eof, type);
1390 s->substate = sub.substate;
1394 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1395 defines the stand-in variables (the head) before the lines of syntax to be
1396 repeated (the body). */
1398 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1399 const char *input, size_t n, bool eof,
1400 enum segment_type *type)
1402 int ofs = segmenter_subparse (s, input, n, eof, type);
1406 if (*type == SEG_SEPARATE_COMMANDS)
1408 /* We reached a blank line that separates the head from the body. */
1409 s->state = S_DO_REPEAT_2;
1411 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1413 /* We reached the body. */
1414 s->state = S_DO_REPEAT_3;
1421 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1422 separates the head from the body. */
1424 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1425 const char *input, size_t n, bool eof,
1426 enum segment_type *type)
1428 int ofs = segmenter_subparse (s, input, n, eof, type);
1432 if (*type == SEG_NEWLINE)
1434 /* We reached the body. */
1435 s->state = S_DO_REPEAT_3;
1443 check_repeat_command (struct segmenter *s,
1444 const char *input, size_t n, bool eof)
1451 if (input[ofs] == '+' || input[ofs] == '-')
1454 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1457 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1459 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1464 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1468 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1469 s->substate += direction;
1474 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1475 enum segment_type *type)
1477 const char *newline = memchr (input, '\n', n);
1479 return eof ? n : -1;
1481 ptrdiff_t ofs = newline - input;
1482 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1484 *type = SEG_NEWLINE;
1488 return ofs - (input[ofs - 1] == '\r');
1491 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1492 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1494 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1495 the lines we're segmenting. s->substate counts the nesting level, starting
1498 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1499 const char *input, size_t n, bool eof,
1500 enum segment_type *type)
1504 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1505 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1507 else if (!check_repeat_command (s, input, n, eof) && !eof)
1509 else if (s->substate == 0)
1511 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1513 s->state = S_GENERAL;
1514 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1515 return segmenter_push (s, input, n, eof, type);
1519 *type = SEG_DO_REPEAT_COMMAND;
1524 /* We are segmenting a DEFINE command, which consists of:
1526 - The DEFINE keyword.
1530 - "(" followed by a sequence of tokens possibly including balanced parentheses
1533 - A sequence of any number of lines, one string per line, ending with
1534 "!ENDDEFINE". The first line is usually blank (that is, a newline follows
1535 the "("). The last line usually just has "!ENDDEFINE." on it, but it can
1536 start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
1540 segmenter_parse_define_1__ (struct segmenter *s,
1541 const char *input, size_t n, bool eof,
1542 enum segment_type *type)
1544 int ofs = segmenter_subparse (s, input, n, eof, type);
1548 if (*type == SEG_SEPARATE_COMMANDS
1549 || *type == SEG_END_COMMAND
1550 || *type == SEG_START_COMMAND)
1552 /* The DEFINE command is malformed because we reached its end without
1553 ever hitting a "(" token. Transition back to general parsing. */
1554 s->state = S_GENERAL;
1557 else if (*type == SEG_PUNCT && input[0] == '(')
1559 s->state = S_DEFINE_2;
1568 segmenter_parse_define_2__ (struct segmenter *s,
1569 const char *input, size_t n, bool eof,
1570 enum segment_type *type)
1572 int ofs = segmenter_subparse (s, input, n, eof, type);
1576 if (*type == SEG_SEPARATE_COMMANDS
1577 || *type == SEG_END_COMMAND
1578 || *type == SEG_START_COMMAND)
1580 /* The DEFINE command is malformed because we reached its end before
1581 closing the set of parentheses. Transition back to general
1583 s->state = S_GENERAL;
1586 else if (*type == SEG_PUNCT && input[0] == '(')
1591 else if (*type == SEG_PUNCT && input[0] == ')')
1596 s->state = S_DEFINE_3;
1606 find_enddefine (struct substring input)
1608 size_t n = input.length;
1609 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1612 /* Skip !ENDDEFINE in comment. */
1613 ofs = skip_spaces_and_comments (input.string, n, true, ofs);
1614 if (ofs + enddefine.length > n)
1617 char c = input.string[ofs];
1619 && ss_equals_case (ss_substr (input, ofs, enddefine.length),
1622 else if (c == '\'' || c == '"')
1624 /* Skip quoted !ENDDEFINE. */
1630 else if (input.string[ofs++] == c)
1639 /* We are in the body of a macro definition, looking for additional lines of
1640 the body or !ENDDEFINE. */
1642 segmenter_parse_define_3__ (struct segmenter *s,
1643 const char *input, size_t n, bool eof,
1644 enum segment_type *type)
1646 /* Gather a whole line. */
1647 const char *newline = memchr (input, '\n', n);
1648 int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
1654 /* Does the line contain !ENDDEFINE? */
1655 size_t end = find_enddefine (ss_buffer (input, ofs));
1656 if (end == SIZE_MAX)
1658 /* No !ENDDEFINE. We have a full line of macro body.
1660 The line might be blank, whether completely empty or just spaces and
1661 comments. That's OK: we need to report blank lines because they can
1664 However, if the first line of the macro body (the same line as the
1665 closing parenthesis in the argument definition) is blank, we just
1666 report it as spaces because it's not significant. */
1667 *type = (s->substate == 0 && is_all_spaces (input, ofs)
1668 ? SEG_SPACES : SEG_MACRO_BODY);
1669 s->state = S_DEFINE_4;
1675 /* Macro ends at the !ENDDEFINE on this line. */
1676 s->state = S_GENERAL;
1680 /* Line starts with !ENDDEFINE. */
1681 return segmenter_push (s, input, n, eof, type);
1685 if (is_all_spaces (input, end))
1687 /* Line starts with spaces followed by !ENDDEFINE. */
1692 /* Line starts with some content followed by !ENDDEFINE. */
1693 *type = SEG_MACRO_BODY;
1701 segmenter_parse_define_4__ (struct segmenter *s,
1702 const char *input, size_t n, bool eof,
1703 enum segment_type *type)
1705 int ofs = segmenter_parse_newline__ (input, n, eof, type);
1709 s->state = S_DEFINE_3;
1714 segmenter_parse_begin_data_1__ (struct segmenter *s,
1715 const char *input, size_t n, bool eof,
1716 enum segment_type *type)
1718 int ofs = segmenter_subparse (s, input, n, eof, type);
1722 if (*type == SEG_NEWLINE)
1723 s->state = S_BEGIN_DATA_2;
1729 segmenter_parse_begin_data_2__ (struct segmenter *s,
1730 const char *input, size_t n, bool eof,
1731 enum segment_type *type)
1733 int ofs = segmenter_subparse (s, input, n, eof, type);
1737 if (*type == SEG_NEWLINE)
1738 s->state = S_BEGIN_DATA_3;
1744 is_end_data (const char *input, size_t n)
1746 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1752 if (n < 4 || c_strncasecmp (input, "END", 3))
1756 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1757 if (!lex_uc_is_space (uc))
1761 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1768 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1775 else if (!lex_uc_is_space (uc))
1784 segmenter_parse_begin_data_3__ (struct segmenter *s,
1785 const char *input, size_t n, bool eof,
1786 enum segment_type *type)
1790 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1793 else if (is_end_data (input, ofs))
1795 s->state = S_GENERAL;
1796 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1797 return segmenter_push (s, input, n, eof, type);
1801 *type = SEG_INLINE_DATA;
1802 s->state = S_BEGIN_DATA_4;
1803 return input[ofs - 1] == '\n' ? 0 : ofs;
1808 segmenter_parse_begin_data_4__ (struct segmenter *s,
1809 const char *input, size_t n, bool eof,
1810 enum segment_type *type)
1814 ofs = segmenter_parse_newline__ (input, n, eof, type);
1818 s->state = S_BEGIN_DATA_3;
1822 /* Returns the name of segment TYPE as a string. The caller must not modify
1823 or free the returned string.
1825 This is useful only for debugging and testing. */
1827 segment_type_to_string (enum segment_type type)
1831 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1835 return "unknown segment type";
1839 /* Returns a segmenter with the given syntax MODE.
1841 If IS_SNIPPET is false, then the segmenter will parse as if it's being given
1842 a whole file. This means, for example, that it will interpret - or + at the
1843 beginning of the syntax as a separator between commands (since - or + at the
1844 beginning of a line has this meaning).
1846 If IS_SNIPPET is true, then the segmenter will parse as if it's being given
1847 an isolated piece of syntax. This means, for example, that it will
1848 interpret - or + at the beginning of the syntax as an operator token or (if
1849 followed by a digit) as part of a number.
1851 A segmenter does not contain any external references, so nothing needs to be
1852 done to destroy one. For the same reason, segmenters may be copied with
1853 plain struct assignment (or memcpy). */
1855 segmenter_init (enum segmenter_mode mode, bool is_snippet)
1857 return (struct segmenter) {
1858 .state = is_snippet ? S_GENERAL : S_SHBANG,
1863 /* Returns the mode passed to segmenter_init() for S. */
1865 segmenter_get_mode (const struct segmenter *s)
1870 /* Attempts to label a prefix of S's remaining input with a segment type. The
1871 caller supplies the first N bytes of the remaining input as INPUT, which
1872 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1873 are the entire remainder of the input; if EOF is false, then further input
1874 is potentially available.
1876 The input may contain '\n' or '\r\n' line ends in any combination.
1878 If successful, returns the number of bytes in the segment at the beginning
1879 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1880 into *TYPE. The next call to segmenter_push() should not include those
1881 bytes as part of INPUT, because they have (figuratively) been consumed by
1884 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1885 be determined. In this case segmenter_push() returns -1. If more input is
1886 available, the caller should obtain some more, then call again with a larger
1887 N. If this is not enough, the process might need to repeat again and again.
1888 If input is exhausted, then the caller may call again setting EOF to true.
1889 segmenter_push() will never return -1 when EOF is true.
1891 The caller must not, in a sequence of calls, supply contradictory input.
1892 That is, bytes provided as part of INPUT in one call, but not consumed, must
1893 not be provided with *different* values on subsequent calls. This is
1894 because segmenter_push() must often make decisions based on looking ahead
1895 beyond the bytes that it consumes. */
1897 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1898 enum segment_type *type)
1914 return segmenter_parse_shbang__ (s, input, n, eof, type);
1917 return (s->substate & SS_START_OF_LINE
1918 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1919 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1922 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1924 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1927 return segmenter_parse_document_1__ (s, input, n, eof, type);
1929 return segmenter_parse_document_2__ (s, input, n, eof, type);
1931 return segmenter_parse_document_3__ (s, type);
1933 case S_FILE_LABEL_1:
1934 return segmenter_parse_file_label_1__ (s, input, n, eof, type);
1935 case S_FILE_LABEL_2:
1936 return segmenter_parse_file_label_2__ (s, input, n, eof, type);
1937 case S_FILE_LABEL_3:
1938 return segmenter_parse_file_label_3__ (s, input, n, eof, type);
1941 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1943 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1945 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1948 return segmenter_parse_define_1__ (s, input, n, eof, type);
1950 return segmenter_parse_define_2__ (s, input, n, eof, type);
1952 return segmenter_parse_define_3__ (s, input, n, eof, type);
1954 return segmenter_parse_define_4__ (s, input, n, eof, type);
1956 case S_BEGIN_DATA_1:
1957 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1958 case S_BEGIN_DATA_2:
1959 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1960 case S_BEGIN_DATA_3:
1961 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1962 case S_BEGIN_DATA_4:
1963 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1969 /* Returns the style of command prompt to display to an interactive user for
1970 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1971 and at the beginning of a line (that is, if segmenter_push() consumed as
1972 much as possible of the input up to a new-line). */
1974 segmenter_get_prompt (const struct segmenter *s)
1979 return PROMPT_FIRST;
1982 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1986 return PROMPT_COMMENT;
1990 return PROMPT_DOCUMENT;
1992 return PROMPT_FIRST;
1994 case S_FILE_LABEL_1:
1995 return PROMPT_LATER;
1996 case S_FILE_LABEL_2:
1997 case S_FILE_LABEL_3:
1998 return PROMPT_FIRST;
2002 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
2004 return PROMPT_DO_REPEAT;
2008 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
2011 return PROMPT_DEFINE;
2013 case S_BEGIN_DATA_1:
2014 return PROMPT_FIRST;
2015 case S_BEGIN_DATA_2:
2016 return PROMPT_LATER;
2017 case S_BEGIN_DATA_3:
2018 case S_BEGIN_DATA_4: