1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
31 #include "gl/verify.h"
58 #define SS_START_OF_LINE (1u << 0)
59 #define SS_START_OF_COMMAND (1u << 1)
61 static int segmenter_detect_command_name__ (const char *input,
62 size_t n, bool eof, int ofs);
65 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
68 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
76 mblen = u8_mbtoucr (puc, input, n);
80 return u8_mbtouc (puc, input, n);
91 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
92 bool eof, enum segment_type *type)
100 for (int ofs = 2; ; ofs++)
107 else if (input[ofs] == '\n')
109 if (input[ofs - 1] == '\r')
115 s->state = S_GENERAL;
116 s->substate = SS_START_OF_COMMAND;
126 s->state = S_GENERAL;
127 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
128 return segmenter_push (s, input, n, eof, type);
132 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
133 const char *input, size_t n, bool eof,
134 enum segment_type *type)
136 assert (s->state == S_GENERAL);
142 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
146 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
148 for (; ofs < n; ofs++)
150 if (input[ofs] == '\n')
152 else if (input[ofs] == '*')
155 return eof ? ofs + 1 : -1;
156 else if (input[ofs + 1] == '/')
160 return eof ? ofs : -1;
164 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
171 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
178 return eof ? ofs : -1;
179 else if (input[ofs + 1] != '*')
182 ofs = skip_comment (input, n, eof, ofs + 2);
186 else if (lex_uc_is_space (uc) && uc != '\n')
192 return eof ? ofs : -1;
196 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
200 else if (input[ofs] == '\n')
202 else if (input[ofs] == '\r')
206 return input[ofs + 1] == '\n';
213 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
215 ofs = skip_spaces_and_comments (input, n, eof, ofs);
219 return is_end_of_line (input, n, eof, ofs);
223 is_all_spaces (const char *input_, size_t n)
225 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
228 for (int ofs = 0; ofs < n; ofs += mblen)
231 mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
232 if (!lex_uc_is_space (uc))
239 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
240 enum segment_type *type)
244 if (input[0] == '\n')
254 assert (input[0] == '\r');
255 assert (input[1] == '\n');
264 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
271 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
275 if (!lex_uc_is_space (uc) || uc == '\n')
281 return eof ? ofs : -1;
285 skip_digits (const char *input, size_t n, bool eof, int ofs)
287 for (; ofs < n; ofs++)
288 if (!c_isdigit (input[ofs]))
290 return eof ? ofs : -1;
294 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
295 bool eof, enum segment_type *type)
299 assert (s->state == S_GENERAL);
301 ofs = skip_digits (input, n, eof, 0);
311 if (input[ofs] == '.')
320 ofs = skip_digits (input, n, eof, ofs + 1);
327 if (input[ofs] == 'e' || input[ofs] == 'E')
334 goto expected_exponent;
337 if (input[ofs] == '+' || input[ofs] == '-')
344 goto expected_exponent;
348 if (!c_isdigit (input[ofs]))
349 goto expected_exponent;
351 ofs = skip_digits (input, n, eof, ofs);
356 if (input[ofs - 1] == '.')
358 int eol = at_end_of_line (input, n, eof, ofs);
371 *type = SEG_EXPECTED_EXPONENT;
377 is_reserved_word (const char *s, int n)
381 s0 = c_toupper (s[0]);
385 s1 = c_toupper (s[1]);
386 return ((s0 == 'B' && s1 == 'Y')
387 || (s0 == 'E' && s1 == 'Q')
388 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
389 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
390 || (s0 == 'N' && s1 == 'E')
391 || (s0 == 'O' && s1 == 'R')
392 || (s0 == 'T' && s1 == 'O'));
395 s1 = c_toupper (s[1]);
396 s2 = c_toupper (s[2]);
397 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
398 || (s1 == 'N' && s2 == 'D')))
399 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
402 s1 = c_toupper (s[1]);
403 s2 = c_toupper (s[2]);
404 s3 = c_toupper (s[3]);
405 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
413 segmenter_parse_comment_1__ (struct segmenter *s,
414 const char *input, size_t n, bool eof,
415 enum segment_type *type)
427 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
438 if (ofs > 1 && input[ofs - 1] == '\r')
442 /* Blank line ends comment command. */
443 s->state = S_GENERAL;
444 s->substate = SS_START_OF_COMMAND;
445 *type = SEG_SEPARATE_COMMANDS;
448 else if (endcmd >= 0)
450 /* '.' at end of line ends comment command. */
451 s->state = S_GENERAL;
453 *type = SEG_COMMENT_COMMAND;
458 /* Comment continues onto next line. */
459 *type = SEG_COMMENT_COMMAND;
460 s->state = S_COMMENT_2;
466 if (!lex_uc_is_space (uc))
477 s->state = S_GENERAL;
478 s->substate = SS_START_OF_COMMAND;
479 *type = SEG_SEPARATE_COMMANDS;
487 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
488 size_t n, bool eof, enum segment_type *type)
490 int ofs = segmenter_parse_newline__ (input, n, eof, type);
504 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
508 if (uc == '+' || uc == '-' || uc == '.')
510 else if (!lex_uc_is_space (uc))
513 case SEG_MODE_INTERACTIVE:
522 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
536 s->state = S_GENERAL;
537 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
540 s->state = S_COMMENT_1;
545 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
546 bool eof, enum segment_type *type)
558 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
569 if (ofs > 1 && input[ofs - 1] == '\r')
572 *type = SEG_DOCUMENT;
573 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
577 if (!lex_uc_is_space (uc))
586 *type = SEG_DOCUMENT;
587 s->state = S_DOCUMENT_3;
594 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
595 bool eof, enum segment_type *type)
599 ofs = segmenter_parse_newline__ (input, n, eof, type);
603 s->state = S_DOCUMENT_1;
608 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
610 *type = SEG_END_COMMAND;
611 s->state = S_GENERAL;
612 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
617 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
620 ofs = skip_spaces_and_comments (input, n, eof, ofs);
626 return c != '\'' && c != '"' && c != '\n';
636 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
637 bool eof, int ofs, char id[], size_t id_size)
639 struct segmenter sub;
641 assert (id_size > 0);
644 sub.state = S_GENERAL;
648 enum segment_type type;
651 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
667 if (retval < id_size)
669 memcpy (id, input + ofs, retval);
676 case SEG_QUOTED_STRING:
678 case SEG_UNICODE_STRING:
679 case SEG_UNQUOTED_STRING:
680 case SEG_RESERVED_WORD:
682 case SEG_COMMENT_COMMAND:
683 case SEG_DO_REPEAT_COMMAND:
684 case SEG_INLINE_DATA:
687 case SEG_START_DOCUMENT:
689 case SEG_START_COMMAND:
690 case SEG_SEPARATE_COMMANDS:
691 case SEG_END_COMMAND:
693 case SEG_EXPECTED_QUOTE:
694 case SEG_EXPECTED_EXPONENT:
695 case SEG_UNEXPECTED_CHAR:
703 /* Called when INPUT begins with a character that can start off an ID token. */
705 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
706 bool eof, enum segment_type *type)
712 assert (s->state == S_GENERAL);
714 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
726 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
729 else if (!lex_uc_is_idn (uc))
735 if (input[ofs - 1] == '.')
737 int eol = at_end_of_line (input, n, eof, ofs);
744 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
745 : input[0] == '!' ? SEG_MACRO_ID
748 if (s->substate & SS_START_OF_COMMAND)
750 struct substring word = ss_buffer (input, ofs);
752 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
754 s->state = S_COMMENT_1;
755 return segmenter_parse_comment_1__ (s, input, n, eof, type);
757 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
759 s->state = S_DOCUMENT_1;
760 *type = SEG_START_DOCUMENT;
763 else if (lex_id_match (ss_cstr ("TITLE"), word)
764 || lex_id_match (ss_cstr ("SUBTITLE"), word))
766 int result = segmenter_unquoted (input, n, eof, ofs);
771 s->state = S_TITLE_1;
775 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
777 s->state = S_DEFINE_1;
780 else if (lex_id_match (ss_cstr ("FILE"), word))
784 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
786 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
788 s->state = S_FILE_LABEL;
793 else if (lex_id_match (ss_cstr ("DO"), word))
797 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
799 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
801 s->state = S_DO_REPEAT_1;
806 else if (lex_id_match (ss_cstr ("BEGIN"), word))
811 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
814 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
818 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
824 else if (input[ofs2] == '.')
826 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
831 eol = is_end_of_line (input, n, eof, ofs2);
836 if (memchr (input, '\n', ofs2))
837 s->state = S_BEGIN_DATA_1;
839 s->state = S_BEGIN_DATA_2;
852 segmenter_parse_string__ (enum segment_type string_type,
853 int ofs, struct segmenter *s,
854 const char *input, size_t n, bool eof,
855 enum segment_type *type)
857 int quote = input[ofs];
861 if (input[ofs] == quote)
866 if (input[ofs] == quote)
879 else if (input[ofs] == '\n')
890 *type = SEG_EXPECTED_QUOTE;
896 segmenter_maybe_parse_string__ (enum segment_type string_type,
898 const char *input, size_t n, bool eof,
899 enum segment_type *type)
906 else if (input[1] == '\'' || input[1] == '"')
907 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
909 return segmenter_parse_id__ (s, input, n, eof, type);
913 segmenter_parse_mid_command__ (struct segmenter *s,
914 const char *input, size_t n, bool eof,
915 enum segment_type *type)
921 assert (s->state == S_GENERAL);
922 assert (!(s->substate & SS_START_OF_LINE));
924 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
931 s->substate |= SS_START_OF_LINE;
941 else if (input[1] == '*')
943 ofs = skip_comment (input, n, eof, 2);
955 case '(': case ')': case ',': case '=': case '-':
956 case '[': case ']': case '&': case '|': case '+':
962 if (s->substate & SS_START_OF_COMMAND)
964 /* '*' at the beginning of a command begins a comment. */
965 s->state = S_COMMENT_1;
966 return segmenter_parse_comment_1__ (s, input, n, eof, type);
969 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
972 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
975 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
978 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
986 else if (c_isdigit (input[1]))
987 return segmenter_parse_number__ (s, input, n, eof, type);
989 int eol = at_end_of_line (input, n, eof, 1);
995 *type = SEG_END_COMMAND;
996 s->substate = SS_START_OF_COMMAND;
1002 case '0': case '1': case '2': case '3': case '4':
1003 case '5': case '6': case '7': case '8': case '9':
1004 return segmenter_parse_number__ (s, input, n, eof, type);
1007 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
1008 s, input, n, eof, type);
1011 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
1012 s, input, n, eof, type);
1014 case '\'': case '"':
1015 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1016 s, input, n, eof, type);
1019 return segmenter_parse_id__ (s, input, n, eof, type);
1022 if (lex_uc_is_space (uc))
1024 ofs = skip_spaces (input, n, eof, mblen);
1028 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1032 s->substate |= SS_START_OF_LINE;
1033 *type = SEG_NEWLINE;
1042 else if (lex_uc_is_id1 (uc))
1043 return segmenter_parse_id__ (s, input, n, eof, type);
1044 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1052 *type = SEG_UNEXPECTED_CHAR;
1060 compare_commands (const void *a_, const void *b_)
1062 const char *const *ap = a_;
1063 const char *const *bp = b_;
1064 const char *a = *ap;
1065 const char *b = *bp;
1067 return c_strcasecmp (a, b);
1070 static const char **
1071 segmenter_get_command_name_candidates (unsigned char first)
1073 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1074 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1075 static const char *commands[] =
1077 #include "language/command.def"
1080 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1086 static const char **cindex[UCHAR_MAX + 1];
1094 qsort (commands, n_commands, sizeof *commands, compare_commands);
1095 for (i = 0; i < n_commands; i++)
1097 unsigned char c = c_toupper (commands[i][0]);
1098 if (cindex[c] == NULL)
1099 cindex[c] = &commands[i];
1101 for (i = 0; i <= UCHAR_MAX; i++)
1102 if (cindex[i] == NULL)
1103 cindex[i] = &commands[n_commands];
1106 return cindex[c_toupper (first)];
1110 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1113 const char **commands;
1130 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1135 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1143 if (input[ofs - 1] == '.')
1146 for (commands = segmenter_get_command_name_candidates (input[0]);
1147 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1153 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1154 &exact, &missing_words)
1155 && missing_words <= 0)
1163 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1166 return eof ? 0 : -1;
1169 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1172 return eof ? 0 : -1;
1174 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1177 return c == '\'' || c == '"' || c == '\n';
1181 segmenter_parse_start_of_line__ (struct segmenter *s,
1182 const char *input, size_t n, bool eof,
1183 enum segment_type *type)
1189 assert (s->state == S_GENERAL);
1190 assert (s->substate & SS_START_OF_LINE);
1192 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1199 ofs = skip_spaces_and_comments (input, n, eof, 1);
1204 int is_string = is_start_of_string__ (input, n, eof, ofs);
1209 /* This is punctuation that may separate pieces of a string. */
1219 *type = SEG_START_COMMAND;
1220 s->substate = SS_START_OF_COMMAND;
1224 if (lex_uc_is_space (uc))
1226 int eol = at_end_of_line (input, n, eof, 0);
1231 s->substate = SS_START_OF_COMMAND;
1232 *type = SEG_SEPARATE_COMMANDS;
1238 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1240 else if (s->mode == SEG_MODE_AUTO)
1242 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1249 assert (s->mode == SEG_MODE_BATCH);
1251 s->substate = SS_START_OF_COMMAND;
1252 *type = SEG_START_COMMAND;
1256 s->substate = SS_START_OF_COMMAND;
1257 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1261 segmenter_parse_file_label__ (struct segmenter *s,
1262 const char *input, size_t n, bool eof,
1263 enum segment_type *type)
1265 struct segmenter sub;
1269 sub.state = S_GENERAL;
1270 ofs = segmenter_push (&sub, input, n, eof, type);
1274 else if (*type == SEG_IDENTIFIER)
1278 assert (lex_id_match (ss_cstr ("LABEL"),
1279 ss_buffer ((char *) input, ofs)));
1280 result = segmenter_unquoted (input, n, eof, ofs);
1286 s->state = S_TITLE_1;
1294 s->substate = sub.substate;
1300 segmenter_subparse (struct segmenter *s,
1301 const char *input, size_t n, bool eof,
1302 enum segment_type *type)
1304 struct segmenter sub;
1308 sub.state = S_GENERAL;
1309 sub.substate = s->substate;
1310 ofs = segmenter_push (&sub, input, n, eof, type);
1311 s->substate = sub.substate;
1315 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1316 defines the stand-in variables (the head) before the lines of syntax to be
1317 repeated (the body). */
1319 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1320 const char *input, size_t n, bool eof,
1321 enum segment_type *type)
1323 int ofs = segmenter_subparse (s, input, n, eof, type);
1327 if (*type == SEG_SEPARATE_COMMANDS)
1329 /* We reached a blank line that separates the head from the body. */
1330 s->state = S_DO_REPEAT_2;
1332 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1334 /* We reached the body. */
1335 s->state = S_DO_REPEAT_3;
1342 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1343 separates the head from the body. */
1345 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1346 const char *input, size_t n, bool eof,
1347 enum segment_type *type)
1349 int ofs = segmenter_subparse (s, input, n, eof, type);
1353 if (*type == SEG_NEWLINE)
1355 /* We reached the body. */
1356 s->state = S_DO_REPEAT_3;
1364 check_repeat_command (struct segmenter *s,
1365 const char *input, size_t n, bool eof)
1372 if (input[ofs] == '+' || input[ofs] == '-')
1375 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1378 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1380 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1385 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1389 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1390 s->substate += direction;
1395 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1396 enum segment_type *type)
1398 const char *newline = memchr (input, '\n', n);
1400 return eof ? n : -1;
1402 ptrdiff_t ofs = newline - input;
1403 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1405 *type = SEG_NEWLINE;
1409 return ofs - (input[ofs - 1] == '\r');
1412 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1413 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1415 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1416 the lines we're segmenting. s->substate counts the nesting level, starting
1419 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1420 const char *input, size_t n, bool eof,
1421 enum segment_type *type)
1425 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1426 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1428 else if (!check_repeat_command (s, input, n, eof) && !eof)
1430 else if (s->substate == 0)
1432 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1434 s->state = S_GENERAL;
1435 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1436 return segmenter_push (s, input, n, eof, type);
1440 *type = SEG_DO_REPEAT_COMMAND;
1445 /* We are segmenting a DEFINE command, which consists of:
1447 - The DEFINE keyword.
1451 - "(" followed by a sequence of tokens possibly including balanced parentheses
1454 - A sequence of any number of lines, one string per line, ending with
1455 "!ENDDEFINE". The first line is usually blank (that is, a newline follows
1456 the "("). The last line usually just has "!ENDDEFINE." on it, but it can
1457 start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
1461 segmenter_parse_define_1__ (struct segmenter *s,
1462 const char *input, size_t n, bool eof,
1463 enum segment_type *type)
1465 int ofs = segmenter_subparse (s, input, n, eof, type);
1469 if (*type == SEG_SEPARATE_COMMANDS
1470 || *type == SEG_END_COMMAND
1471 || *type == SEG_START_COMMAND)
1473 /* The DEFINE command is malformed because we reached its end without
1474 ever hitting a "(" token. Transition back to general parsing. */
1475 s->state = S_GENERAL;
1478 else if (*type == SEG_PUNCT && input[0] == '(')
1480 s->state = S_DEFINE_2;
1489 segmenter_parse_define_2__ (struct segmenter *s,
1490 const char *input, size_t n, bool eof,
1491 enum segment_type *type)
1493 int ofs = segmenter_subparse (s, input, n, eof, type);
1497 if (*type == SEG_SEPARATE_COMMANDS
1498 || *type == SEG_END_COMMAND
1499 || *type == SEG_START_COMMAND)
1501 /* The DEFINE command is malformed because we reached its end before
1502 closing the set of parentheses. Transition back to general
1504 s->state = S_GENERAL;
1507 else if (*type == SEG_PUNCT && input[0] == '(')
1512 else if (*type == SEG_PUNCT && input[0] == ')')
1517 s->state = S_DEFINE_3;
1527 find_enddefine (struct substring input)
1529 size_t n = input.length;
1530 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1531 for (size_t i = 0; i + enddefine.length <= n; i++)
1532 if (input.string[i] == '!'
1533 && ss_equals_case (ss_substr (input, i, enddefine.length), enddefine))
1538 /* We are in the body of a macro definition, looking for additional lines of
1539 the body or !ENDDEFINE. */
1541 segmenter_parse_define_3__ (struct segmenter *s,
1542 const char *input, size_t n, bool eof,
1543 enum segment_type *type)
1545 /* Gather a whole line. */
1546 const char *newline = memchr (input, '\n', n);
1547 int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
1553 /* Does the line contain !ENDDEFINE? */
1554 size_t end = find_enddefine (ss_buffer (input, ofs));
1555 if (end == SIZE_MAX)
1557 /* No !ENDDEFINE. We have a full line of macro body.
1559 The line might be blank, whether completely empty or just spaces and
1560 comments. That's OK: we need to report blank lines because they can
1563 However, if the first line of the macro body (the same line as the
1564 closing parenthesis in the argument definition) is blank, we just
1565 report it as spaces because it's not significant. */
1566 *type = (s->substate == 0 && is_all_spaces (input, ofs)
1567 ? SEG_SPACES : SEG_MACRO_BODY);
1568 s->state = S_DEFINE_4;
1574 /* Macro ends at the !ENDDEFINE on this line. */
1575 s->state = S_GENERAL;
1579 /* Line starts with !ENDDEFINE. */
1580 return segmenter_push (s, input, n, eof, type);
1584 if (is_all_spaces (input, end))
1586 /* Line starts with spaces followed by !ENDDEFINE. */
1591 /* Line starts with some content followed by !ENDDEFINE. */
1592 *type = SEG_MACRO_BODY;
1600 segmenter_parse_define_4__ (struct segmenter *s,
1601 const char *input, size_t n, bool eof,
1602 enum segment_type *type)
1604 int ofs = segmenter_parse_newline__ (input, n, eof, type);
1608 s->state = S_DEFINE_3;
1613 segmenter_parse_begin_data_1__ (struct segmenter *s,
1614 const char *input, size_t n, bool eof,
1615 enum segment_type *type)
1617 int ofs = segmenter_subparse (s, input, n, eof, type);
1621 if (*type == SEG_NEWLINE)
1622 s->state = S_BEGIN_DATA_2;
1628 segmenter_parse_begin_data_2__ (struct segmenter *s,
1629 const char *input, size_t n, bool eof,
1630 enum segment_type *type)
1632 int ofs = segmenter_subparse (s, input, n, eof, type);
1636 if (*type == SEG_NEWLINE)
1637 s->state = S_BEGIN_DATA_3;
1643 is_end_data (const char *input, size_t n)
1645 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1651 if (n < 4 || c_strncasecmp (input, "END", 3))
1655 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1656 if (!lex_uc_is_space (uc))
1660 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1667 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1674 else if (!lex_uc_is_space (uc))
1683 segmenter_parse_begin_data_3__ (struct segmenter *s,
1684 const char *input, size_t n, bool eof,
1685 enum segment_type *type)
1689 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1692 else if (is_end_data (input, ofs))
1694 s->state = S_GENERAL;
1695 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1696 return segmenter_push (s, input, n, eof, type);
1700 *type = SEG_INLINE_DATA;
1701 s->state = S_BEGIN_DATA_4;
1702 return input[ofs - 1] == '\n' ? 0 : ofs;
1707 segmenter_parse_begin_data_4__ (struct segmenter *s,
1708 const char *input, size_t n, bool eof,
1709 enum segment_type *type)
1713 ofs = segmenter_parse_newline__ (input, n, eof, type);
1717 s->state = S_BEGIN_DATA_3;
1722 segmenter_parse_title_1__ (struct segmenter *s,
1723 const char *input, size_t n, bool eof,
1724 enum segment_type *type)
1728 ofs = skip_spaces (input, n, eof, 0);
1731 s->state = S_TITLE_2;
1737 segmenter_parse_title_2__ (struct segmenter *s,
1738 const char *input, size_t n, bool eof,
1739 enum segment_type *type)
1751 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1765 if (!lex_uc_is_space (uc))
1776 s->state = S_GENERAL;
1778 *type = SEG_UNQUOTED_STRING;
1779 return endcmd >= 0 ? endcmd : ofs;
1785 /* Returns the name of segment TYPE as a string. The caller must not modify
1786 or free the returned string.
1788 This is useful only for debugging and testing. */
1790 segment_type_to_string (enum segment_type type)
1794 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1798 return "unknown segment type";
1802 /* Returns a segmenter with the given syntax MODE.
1804 If IS_SNIPPET is false, then the segmenter will parse as if it's being given
1805 a whole file. This means, for example, that it will interpret - or + at the
1806 beginning of the syntax as a separator between commands (since - or + at the
1807 beginning of a line has this meaning).
1809 If IS_SNIPPET is true, then the segmenter will parse as if it's being given
1810 an isolated piece of syntax. This means, for example, that it will
1811 interpret - or + at the beginning of the syntax as an operator token or (if
1812 followed by a digit) as part of a number.
1814 A segmenter does not contain any external references, so nothing needs to be
1815 done to destroy one. For the same reason, segmenters may be copied with
1816 plain struct assignment (or memcpy). */
1818 segmenter_init (enum segmenter_mode mode, bool is_snippet)
1820 return (struct segmenter) {
1821 .state = is_snippet ? S_GENERAL : S_SHBANG,
1826 /* Returns the mode passed to segmenter_init() for S. */
1828 segmenter_get_mode (const struct segmenter *s)
1833 /* Attempts to label a prefix of S's remaining input with a segment type. The
1834 caller supplies the first N bytes of the remaining input as INPUT, which
1835 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1836 are the entire (remainder) of the input; if EOF is false, then further input
1837 is potentially available.
1839 The input may contain '\n' or '\r\n' line ends in any combination.
1841 If successful, returns the number of bytes in the segment at the beginning
1842 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1843 into *TYPE. The next call to segmenter_push() should not include those
1844 bytes as part of INPUT, because they have (figuratively) been consumed by
1847 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1848 be determined. In this case segmenter_push() returns -1. If more input is
1849 available, the caller should obtain some more, then call again with a larger
1850 N. If this is not enough, the process might need to repeat again and again.
1851 If input is exhausted, then the caller may call again setting EOF to true.
1852 segmenter_push() will never return -1 when EOF is true.
1854 The caller must not, in a sequence of calls, supply contradictory input.
1855 That is, bytes provided as part of INPUT in one call, but not consumed, must
1856 not be provided with *different* values on subsequent calls. This is
1857 because segmenter_push() must often make decisions based on looking ahead
1858 beyond the bytes that it consumes. */
1860 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1861 enum segment_type *type)
1877 return segmenter_parse_shbang__ (s, input, n, eof, type);
1880 return (s->substate & SS_START_OF_LINE
1881 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1882 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1885 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1887 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1890 return segmenter_parse_document_1__ (s, input, n, eof, type);
1892 return segmenter_parse_document_2__ (s, input, n, eof, type);
1894 return segmenter_parse_document_3__ (s, type);
1897 return segmenter_parse_file_label__ (s, input, n, eof, type);
1900 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1902 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1904 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1907 return segmenter_parse_define_1__ (s, input, n, eof, type);
1909 return segmenter_parse_define_2__ (s, input, n, eof, type);
1911 return segmenter_parse_define_3__ (s, input, n, eof, type);
1913 return segmenter_parse_define_4__ (s, input, n, eof, type);
1915 case S_BEGIN_DATA_1:
1916 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1917 case S_BEGIN_DATA_2:
1918 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1919 case S_BEGIN_DATA_3:
1920 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1921 case S_BEGIN_DATA_4:
1922 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1925 return segmenter_parse_title_1__ (s, input, n, eof, type);
1927 return segmenter_parse_title_2__ (s, input, n, eof, type);
1933 /* Returns the style of command prompt to display to an interactive user for
1934 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1935 and at the beginning of a line (that is, if segmenter_push() consumed as
1936 much as possible of the input up to a new-line). */
1938 segmenter_get_prompt (const struct segmenter *s)
1943 return PROMPT_FIRST;
1946 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1950 return PROMPT_COMMENT;
1954 return PROMPT_DOCUMENT;
1956 return PROMPT_FIRST;
1959 return PROMPT_LATER;
1963 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1965 return PROMPT_DO_REPEAT;
1969 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1972 return PROMPT_DEFINE;
1974 case S_BEGIN_DATA_1:
1975 return PROMPT_FIRST;
1976 case S_BEGIN_DATA_2:
1977 return PROMPT_LATER;
1978 case S_BEGIN_DATA_3:
1979 case S_BEGIN_DATA_4:
1984 return PROMPT_FIRST;