1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
57 #define SS_START_OF_LINE (1u << 0)
58 #define SS_START_OF_COMMAND (1u << 1)
60 static int segmenter_detect_command_name__ (const char *input,
61 size_t n, bool eof, int ofs);
64 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
67 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
75 mblen = u8_mbtoucr (puc, input, n);
79 return u8_mbtouc (puc, input, n);
90 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
91 bool eof, enum segment_type *type)
99 for (int ofs = 2; ; ofs++)
106 else if (input[ofs] == '\n')
108 if (input[ofs - 1] == '\r')
114 s->state = S_GENERAL;
115 s->substate = SS_START_OF_COMMAND;
125 s->state = S_GENERAL;
126 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
127 return segmenter_push (s, input, n, eof, type);
131 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
132 const char *input, size_t n, bool eof,
133 enum segment_type *type)
135 assert (s->state == S_GENERAL);
141 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
145 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
147 for (; ofs < n; ofs++)
149 if (input[ofs] == '\n')
151 else if (input[ofs] == '*')
154 return eof ? ofs + 1 : -1;
155 else if (input[ofs + 1] == '/')
159 return eof ? ofs : -1;
163 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
170 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
177 return eof ? ofs : -1;
178 else if (input[ofs + 1] != '*')
181 ofs = skip_comment (input, n, eof, ofs + 2);
185 else if (lex_uc_is_space (uc) && uc != '\n')
191 return eof ? ofs : -1;
195 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
199 else if (input[ofs] == '\n')
201 else if (input[ofs] == '\r')
205 return input[ofs + 1] == '\n';
212 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
214 ofs = skip_spaces_and_comments (input, n, eof, ofs);
218 return is_end_of_line (input, n, eof, ofs);
/* Walks the first N bytes of INPUT_, treated as UTF-8, one character at a
   time and tests each character with lex_uc_is_space().

   The return statements are outside the lines visible in this excerpt;
   presumably the result is true only if every character is white space
   (the caller uses it to recognize a line that "starts with spaces followed
   by !ENDDEFINE") -- confirm against the full function. */
222 is_all_spaces (const char *input_, size_t n)
224 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
227 for (int ofs = 0; ofs < n; ofs += mblen)
/* Decode the character that starts at OFS, with only the N - OFS bytes that
   remain.  Decoding from INPUT itself would re-examine the first character
   on every iteration, so characters after the first would never be
   checked. */
230 mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc))
238 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
239 enum segment_type *type)
243 if (input[0] == '\n')
253 assert (input[0] == '\r');
254 assert (input[1] == '\n');
263 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
270 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
274 if (!lex_uc_is_space (uc) || uc == '\n')
280 return eof ? ofs : -1;
284 skip_digits (const char *input, size_t n, bool eof, int ofs)
286 for (; ofs < n; ofs++)
287 if (!c_isdigit (input[ofs]))
289 return eof ? ofs : -1;
293 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
294 bool eof, enum segment_type *type)
298 assert (s->state == S_GENERAL);
300 ofs = skip_digits (input, n, eof, 0);
310 if (input[ofs] == '.')
319 ofs = skip_digits (input, n, eof, ofs + 1);
326 if (input[ofs] == 'e' || input[ofs] == 'E')
333 goto expected_exponent;
336 if (input[ofs] == '+' || input[ofs] == '-')
343 goto expected_exponent;
347 if (!c_isdigit (input[ofs]))
348 goto expected_exponent;
350 ofs = skip_digits (input, n, eof, ofs);
355 if (input[ofs - 1] == '.')
357 int eol = at_end_of_line (input, n, eof, ofs);
370 *type = SEG_EXPECTED_EXPONENT;
376 is_reserved_word (const char *s, int n)
380 s0 = c_toupper (s[0]);
384 s1 = c_toupper (s[1]);
385 return ((s0 == 'B' && s1 == 'Y')
386 || (s0 == 'E' && s1 == 'Q')
387 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
388 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
389 || (s0 == 'N' && s1 == 'E')
390 || (s0 == 'O' && s1 == 'R')
391 || (s0 == 'T' && s1 == 'O'));
394 s1 = c_toupper (s[1]);
395 s2 = c_toupper (s[2]);
396 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
397 || (s1 == 'N' && s2 == 'D')))
398 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
401 s1 = c_toupper (s[1]);
402 s2 = c_toupper (s[2]);
403 s3 = c_toupper (s[3]);
404 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
412 segmenter_parse_comment_1__ (struct segmenter *s,
413 const char *input, size_t n, bool eof,
414 enum segment_type *type)
426 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
437 if (ofs > 1 && input[ofs - 1] == '\r')
441 /* Blank line ends comment command. */
442 s->state = S_GENERAL;
443 s->substate = SS_START_OF_COMMAND;
444 *type = SEG_SEPARATE_COMMANDS;
447 else if (endcmd >= 0)
449 /* '.' at end of line ends comment command. */
450 s->state = S_GENERAL;
452 *type = SEG_COMMENT_COMMAND;
457 /* Comment continues onto next line. */
458 *type = SEG_COMMENT_COMMAND;
459 s->state = S_COMMENT_2;
465 if (!lex_uc_is_space (uc))
476 s->state = S_GENERAL;
477 s->substate = SS_START_OF_COMMAND;
478 *type = SEG_SEPARATE_COMMANDS;
486 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
487 size_t n, bool eof, enum segment_type *type)
489 int ofs = segmenter_parse_newline__ (input, n, eof, type);
503 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
507 if (uc == '+' || uc == '-' || uc == '.')
509 else if (!lex_uc_is_space (uc))
512 case SEG_MODE_INTERACTIVE:
521 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
535 s->state = S_GENERAL;
536 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
539 s->state = S_COMMENT_1;
544 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
545 bool eof, enum segment_type *type)
557 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
568 if (ofs > 1 && input[ofs - 1] == '\r')
571 *type = SEG_DOCUMENT;
572 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
576 if (!lex_uc_is_space (uc))
585 *type = SEG_DOCUMENT;
586 s->state = S_DOCUMENT_3;
593 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
594 bool eof, enum segment_type *type)
598 ofs = segmenter_parse_newline__ (input, n, eof, type);
602 s->state = S_DOCUMENT_1;
607 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
609 *type = SEG_END_COMMAND;
610 s->state = S_GENERAL;
611 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
616 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
619 ofs = skip_spaces_and_comments (input, n, eof, ofs);
625 return c != '\'' && c != '"' && c != '\n';
635 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
636 bool eof, int ofs, char id[], size_t id_size)
638 struct segmenter sub;
640 assert (id_size > 0);
643 sub.state = S_GENERAL;
647 enum segment_type type;
650 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
666 if (retval < id_size)
668 memcpy (id, input + ofs, retval);
675 case SEG_QUOTED_STRING:
677 case SEG_UNICODE_STRING:
678 case SEG_UNQUOTED_STRING:
679 case SEG_RESERVED_WORD:
681 case SEG_COMMENT_COMMAND:
682 case SEG_DO_REPEAT_COMMAND:
683 case SEG_INLINE_DATA:
686 case SEG_START_DOCUMENT:
688 case SEG_START_COMMAND:
689 case SEG_SEPARATE_COMMANDS:
690 case SEG_END_COMMAND:
692 case SEG_EXPECTED_QUOTE:
693 case SEG_EXPECTED_EXPONENT:
694 case SEG_UNEXPECTED_CHAR:
702 /* Called when INPUT begins with a character that can start off an ID token. */
704 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
705 bool eof, enum segment_type *type)
711 assert (s->state == S_GENERAL);
713 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
725 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
728 else if (!lex_uc_is_idn (uc))
734 if (input[ofs - 1] == '.')
736 int eol = at_end_of_line (input, n, eof, ofs);
743 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
744 : input[0] == '!' ? SEG_MACRO_ID
747 if (s->substate & SS_START_OF_COMMAND)
749 struct substring word = ss_buffer (input, ofs);
751 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
753 s->state = S_COMMENT_1;
754 return segmenter_parse_comment_1__ (s, input, n, eof, type);
756 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
758 s->state = S_DOCUMENT_1;
759 *type = SEG_START_DOCUMENT;
762 else if (lex_id_match (ss_cstr ("TITLE"), word)
763 || lex_id_match (ss_cstr ("SUBTITLE"), word))
765 int result = segmenter_unquoted (input, n, eof, ofs);
770 s->state = S_TITLE_1;
774 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
776 s->state = S_DEFINE_1;
779 else if (lex_id_match (ss_cstr ("FILE"), word))
783 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
785 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
787 s->state = S_FILE_LABEL;
792 else if (lex_id_match (ss_cstr ("DO"), word))
796 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
798 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
800 s->state = S_DO_REPEAT_1;
805 else if (lex_id_match (ss_cstr ("BEGIN"), word))
810 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
813 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
817 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
823 else if (input[ofs2] == '.')
825 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
830 eol = is_end_of_line (input, n, eof, ofs2);
835 if (memchr (input, '\n', ofs2))
836 s->state = S_BEGIN_DATA_1;
838 s->state = S_BEGIN_DATA_2;
851 segmenter_parse_string__ (enum segment_type string_type,
852 int ofs, struct segmenter *s,
853 const char *input, size_t n, bool eof,
854 enum segment_type *type)
856 int quote = input[ofs];
860 if (input[ofs] == quote)
865 if (input[ofs] == quote)
878 else if (input[ofs] == '\n')
889 *type = SEG_EXPECTED_QUOTE;
895 segmenter_maybe_parse_string__ (enum segment_type string_type,
897 const char *input, size_t n, bool eof,
898 enum segment_type *type)
905 else if (input[1] == '\'' || input[1] == '"')
906 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
908 return segmenter_parse_id__ (s, input, n, eof, type);
912 segmenter_parse_mid_command__ (struct segmenter *s,
913 const char *input, size_t n, bool eof,
914 enum segment_type *type)
920 assert (s->state == S_GENERAL);
921 assert (!(s->substate & SS_START_OF_LINE));
923 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
930 s->substate |= SS_START_OF_LINE;
940 else if (input[1] == '*')
942 ofs = skip_comment (input, n, eof, 2);
954 case '(': case ')': case ',': case '=': case '-':
955 case '[': case ']': case '&': case '|': case '+':
961 if (s->substate & SS_START_OF_COMMAND)
963 /* '*' at the beginning of a command begins a comment. */
964 s->state = S_COMMENT_1;
965 return segmenter_parse_comment_1__ (s, input, n, eof, type);
968 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
971 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
974 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
977 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
985 else if (c_isdigit (input[1]))
986 return segmenter_parse_number__ (s, input, n, eof, type);
988 int eol = at_end_of_line (input, n, eof, 1);
994 *type = SEG_END_COMMAND;
995 s->substate = SS_START_OF_COMMAND;
1001 case '0': case '1': case '2': case '3': case '4':
1002 case '5': case '6': case '7': case '8': case '9':
1003 return segmenter_parse_number__ (s, input, n, eof, type);
1006 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
1007 s, input, n, eof, type);
1010 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
1011 s, input, n, eof, type);
1013 case '\'': case '"':
1014 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1015 s, input, n, eof, type);
1018 return segmenter_parse_id__ (s, input, n, eof, type);
1021 if (lex_uc_is_space (uc))
1023 ofs = skip_spaces (input, n, eof, mblen);
1027 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1031 s->substate |= SS_START_OF_LINE;
1032 *type = SEG_NEWLINE;
1041 else if (lex_uc_is_id1 (uc))
1042 return segmenter_parse_id__ (s, input, n, eof, type);
1043 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1051 *type = SEG_UNEXPECTED_CHAR;
1059 compare_commands (const void *a_, const void *b_)
1061 const char *const *ap = a_;
1062 const char *const *bp = b_;
1063 const char *a = *ap;
1064 const char *b = *bp;
1066 return c_strcasecmp (a, b);
1069 static const char **
1070 segmenter_get_command_name_candidates (unsigned char first)
1072 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1073 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1074 static const char *commands[] =
1076 #include "language/command.def"
1079 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1085 static const char **cindex[UCHAR_MAX + 1];
1093 qsort (commands, n_commands, sizeof *commands, compare_commands);
1094 for (i = 0; i < n_commands; i++)
1096 unsigned char c = c_toupper (commands[i][0]);
1097 if (cindex[c] == NULL)
1098 cindex[c] = &commands[i];
1100 for (i = 0; i <= UCHAR_MAX; i++)
1101 if (cindex[i] == NULL)
1102 cindex[i] = &commands[n_commands];
1105 return cindex[c_toupper (first)];
1109 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1112 const char **commands;
1129 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1134 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1142 if (input[ofs - 1] == '.')
1145 for (commands = segmenter_get_command_name_candidates (input[0]);
1146 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1152 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1153 &exact, &missing_words)
1154 && missing_words <= 0)
1162 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1165 return eof ? 0 : -1;
1168 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1171 return eof ? 0 : -1;
1173 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1176 return c == '\'' || c == '"' || c == '\n';
1180 segmenter_parse_start_of_line__ (struct segmenter *s,
1181 const char *input, size_t n, bool eof,
1182 enum segment_type *type)
1188 assert (s->state == S_GENERAL);
1189 assert (s->substate & SS_START_OF_LINE);
1191 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1198 ofs = skip_spaces_and_comments (input, n, eof, 1);
1203 int is_string = is_start_of_string__ (input, n, eof, ofs);
1208 /* This is punctuation that may separate pieces of a string. */
1218 *type = SEG_START_COMMAND;
1219 s->substate = SS_START_OF_COMMAND;
1223 if (lex_uc_is_space (uc))
1225 int eol = at_end_of_line (input, n, eof, 0);
1230 s->substate = SS_START_OF_COMMAND;
1231 *type = SEG_SEPARATE_COMMANDS;
1237 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1239 else if (s->mode == SEG_MODE_AUTO)
1241 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1248 assert (s->mode == SEG_MODE_BATCH);
1250 s->substate = SS_START_OF_COMMAND;
1251 *type = SEG_START_COMMAND;
1255 s->substate = SS_START_OF_COMMAND;
1256 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1260 segmenter_parse_file_label__ (struct segmenter *s,
1261 const char *input, size_t n, bool eof,
1262 enum segment_type *type)
1264 struct segmenter sub;
1268 sub.state = S_GENERAL;
1269 ofs = segmenter_push (&sub, input, n, eof, type);
1273 else if (*type == SEG_IDENTIFIER)
1277 assert (lex_id_match (ss_cstr ("LABEL"),
1278 ss_buffer ((char *) input, ofs)));
1279 result = segmenter_unquoted (input, n, eof, ofs);
1285 s->state = S_TITLE_1;
1293 s->substate = sub.substate;
1299 segmenter_subparse (struct segmenter *s,
1300 const char *input, size_t n, bool eof,
1301 enum segment_type *type)
1303 struct segmenter sub;
1307 sub.state = S_GENERAL;
1308 sub.substate = s->substate;
1309 ofs = segmenter_push (&sub, input, n, eof, type);
1310 s->substate = sub.substate;
1314 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1315 defines the stand-in variables (the head) before the lines of syntax to be
1316 repeated (the body). */
1318 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1319 const char *input, size_t n, bool eof,
1320 enum segment_type *type)
1322 int ofs = segmenter_subparse (s, input, n, eof, type);
1326 if (*type == SEG_SEPARATE_COMMANDS)
1328 /* We reached a blank line that separates the head from the body. */
1329 s->state = S_DO_REPEAT_2;
1331 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1333 /* We reached the body. */
1334 s->state = S_DO_REPEAT_3;
1341 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1342 separates the head from the body. */
1344 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1345 const char *input, size_t n, bool eof,
1346 enum segment_type *type)
1348 int ofs = segmenter_subparse (s, input, n, eof, type);
1352 if (*type == SEG_NEWLINE)
1354 /* We reached the body. */
1355 s->state = S_DO_REPEAT_3;
1363 check_repeat_command (struct segmenter *s,
1364 const char *input, size_t n, bool eof)
1371 if (input[ofs] == '+' || input[ofs] == '-')
1374 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1377 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1379 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1384 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1388 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1389 s->substate += direction;
1394 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1395 enum segment_type *type)
1397 const char *newline = memchr (input, '\n', n);
1399 return eof ? n : -1;
1401 ptrdiff_t ofs = newline - input;
1402 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1404 *type = SEG_NEWLINE;
1408 return ofs - (input[ofs - 1] == '\r');
1411 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1412 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1414 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1415 the lines we're segmenting. s->substate counts the nesting level, starting
1418 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1419 const char *input, size_t n, bool eof,
1420 enum segment_type *type)
1424 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1425 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1427 else if (!check_repeat_command (s, input, n, eof) && !eof)
1429 else if (s->substate == 0)
1431 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1433 s->state = S_GENERAL;
1434 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1435 return segmenter_push (s, input, n, eof, type);
1439 *type = SEG_DO_REPEAT_COMMAND;
1444 /* We are segmenting a DEFINE command, which consists of:
1446 - The DEFINE keyword.
1450 - "(" followed by a sequence of tokens possibly including balanced parentheses
1453 - A sequence of any number of lines, one string per line, ending with
1454 "!ENDDEFINE". The first line is usually blank (that is, a newline follows
1455 the "("). The last line usually just has "!ENDDEFINE." on it, but it can
1456 start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
1460 segmenter_parse_define_1__ (struct segmenter *s,
1461 const char *input, size_t n, bool eof,
1462 enum segment_type *type)
1464 int ofs = segmenter_subparse (s, input, n, eof, type);
1468 if (*type == SEG_SEPARATE_COMMANDS
1469 || *type == SEG_END_COMMAND
1470 || *type == SEG_START_COMMAND)
1472 /* The DEFINE command is malformed because we reached its end without
1473 ever hitting a "(" token. Transition back to general parsing. */
1474 s->state = S_GENERAL;
1477 else if (*type == SEG_PUNCT && input[0] == '(')
1479 s->state = S_DEFINE_2;
1488 segmenter_parse_define_2__ (struct segmenter *s,
1489 const char *input, size_t n, bool eof,
1490 enum segment_type *type)
1492 int ofs = segmenter_subparse (s, input, n, eof, type);
1496 if (*type == SEG_SEPARATE_COMMANDS
1497 || *type == SEG_END_COMMAND
1498 || *type == SEG_START_COMMAND)
1500 /* The DEFINE command is malformed because we reached its end before
1501 closing the set of parentheses. Transition back to general
1503 s->state = S_GENERAL;
1506 else if (*type == SEG_PUNCT && input[0] == '(')
1511 else if (*type == SEG_PUNCT && input[0] == ')')
1515 s->state = S_DEFINE_3;
1523 find_enddefine (struct substring input)
1525 size_t n = input.length;
1526 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1527 for (size_t i = 0; i + enddefine.length <= n; i++)
1528 if (input.string[i] == '!'
1529 && ss_equals_case (ss_substr (input, i, enddefine.length), enddefine))
1534 /* We are in the body of a macro definition, looking for additional lines of
1535 the body or !ENDDEFINE. */
1537 segmenter_parse_define_3__ (struct segmenter *s,
1538 const char *input, size_t n, bool eof,
1539 enum segment_type *type)
1541 /* Gather a whole line. */
1542 const char *newline = memchr (input, '\n', n);
1543 int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
1549 /* Does the line contain !ENDDEFINE? */
1550 size_t end = find_enddefine (ss_buffer (input, ofs));
1551 if (end == SIZE_MAX)
1553 /* No !ENDDEFINE. We have a full line of macro body.
1555 The line might be blank, whether completely empty or just spaces and
1556 comments. That's OK: we need to report blank lines because they can
1557 have significance. */
1558 *type = SEG_MACRO_BODY;
1559 s->state = S_DEFINE_4;
1564 /* Macro ends at the !ENDDEFINE on this line. */
1565 s->state = S_GENERAL;
1569 /* Line starts with !ENDDEFINE. */
1570 return segmenter_push (s, input, n, eof, type);
1574 if (is_all_spaces (input, end))
1576 /* Line starts with spaces followed by !ENDDEFINE. */
1581 /* Line starts with some content followed by !ENDDEFINE. */
1582 *type = SEG_MACRO_BODY;
1590 segmenter_parse_define_4__ (struct segmenter *s,
1591 const char *input, size_t n, bool eof,
1592 enum segment_type *type)
1594 int ofs = segmenter_parse_newline__ (input, n, eof, type);
1598 s->state = S_DEFINE_3;
1603 segmenter_parse_begin_data_1__ (struct segmenter *s,
1604 const char *input, size_t n, bool eof,
1605 enum segment_type *type)
1607 int ofs = segmenter_subparse (s, input, n, eof, type);
1611 if (*type == SEG_NEWLINE)
1612 s->state = S_BEGIN_DATA_2;
1618 segmenter_parse_begin_data_2__ (struct segmenter *s,
1619 const char *input, size_t n, bool eof,
1620 enum segment_type *type)
1622 int ofs = segmenter_subparse (s, input, n, eof, type);
1626 if (*type == SEG_NEWLINE)
1627 s->state = S_BEGIN_DATA_3;
1633 is_end_data (const char *input, size_t n)
1635 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1641 if (n < 4 || c_strncasecmp (input, "END", 3))
1645 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1646 if (!lex_uc_is_space (uc))
1650 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1657 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1664 else if (!lex_uc_is_space (uc))
1673 segmenter_parse_begin_data_3__ (struct segmenter *s,
1674 const char *input, size_t n, bool eof,
1675 enum segment_type *type)
1679 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1682 else if (is_end_data (input, ofs))
1684 s->state = S_GENERAL;
1685 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1686 return segmenter_push (s, input, n, eof, type);
1690 *type = SEG_INLINE_DATA;
1691 s->state = S_BEGIN_DATA_4;
1692 return input[ofs - 1] == '\n' ? 0 : ofs;
1697 segmenter_parse_begin_data_4__ (struct segmenter *s,
1698 const char *input, size_t n, bool eof,
1699 enum segment_type *type)
1703 ofs = segmenter_parse_newline__ (input, n, eof, type);
1707 s->state = S_BEGIN_DATA_3;
1712 segmenter_parse_title_1__ (struct segmenter *s,
1713 const char *input, size_t n, bool eof,
1714 enum segment_type *type)
1718 ofs = skip_spaces (input, n, eof, 0);
1721 s->state = S_TITLE_2;
1727 segmenter_parse_title_2__ (struct segmenter *s,
1728 const char *input, size_t n, bool eof,
1729 enum segment_type *type)
1741 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1755 if (!lex_uc_is_space (uc))
1766 s->state = S_GENERAL;
1768 *type = SEG_UNQUOTED_STRING;
1769 return endcmd >= 0 ? endcmd : ofs;
1775 /* Returns the name of segment TYPE as a string. The caller must not modify
1776 or free the returned string.
1778 This is useful only for debugging and testing. */
1780 segment_type_to_string (enum segment_type type)
1784 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1788 return "unknown segment type";
1792 /* Initializes S as a segmenter with the given syntax MODE.
1794 A segmenter does not contain any external references, so nothing needs to be
1795 done to destroy one. For the same reason, segmenters may be copied with
1796 plain struct assignment (or memcpy). */
1798 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1800 s->state = S_SHBANG;
1805 /* Returns the mode passed to segmenter_init() for S. */
1807 segmenter_get_mode (const struct segmenter *s)
1812 /* Attempts to label a prefix of S's remaining input with a segment type. The
1813 caller supplies the first N bytes of the remaining input as INPUT, which
1814 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1815 are the entire (remainder) of the input; if EOF is false, then further input
1816 is potentially available.
1818 The input may contain '\n' or '\r\n' line ends in any combination.
1820 If successful, returns the number of bytes in the segment at the beginning
1821 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1822 into *TYPE. The next call to segmenter_push() should not include those
1823 bytes as part of INPUT, because they have (figuratively) been consumed by
1826 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1827 be determined. In this case segmenter_push() returns -1. If more input is
1828 available, the caller should obtain some more, then call again with a larger
1829 N. If this is not enough, the process might need to repeat again and again.
1830 If input is exhausted, then the caller may call again setting EOF to true.
1831 segmenter_push() will never return -1 when EOF is true.
1833 The caller must not, in a sequence of calls, supply contradictory input.
1834 That is, bytes provided as part of INPUT in one call, but not consumed, must
1835 not be provided with *different* values on subsequent calls. This is
1836 because segmenter_push() must often make decisions based on looking ahead
1837 beyond the bytes that it consumes. */
1839 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1840 enum segment_type *type)
1856 return segmenter_parse_shbang__ (s, input, n, eof, type);
1859 return (s->substate & SS_START_OF_LINE
1860 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1861 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1864 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1866 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1869 return segmenter_parse_document_1__ (s, input, n, eof, type);
1871 return segmenter_parse_document_2__ (s, input, n, eof, type);
1873 return segmenter_parse_document_3__ (s, type);
1876 return segmenter_parse_file_label__ (s, input, n, eof, type);
1879 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1881 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1883 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1886 return segmenter_parse_define_1__ (s, input, n, eof, type);
1888 return segmenter_parse_define_2__ (s, input, n, eof, type);
1890 return segmenter_parse_define_3__ (s, input, n, eof, type);
1892 return segmenter_parse_define_4__ (s, input, n, eof, type);
1894 case S_BEGIN_DATA_1:
1895 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1896 case S_BEGIN_DATA_2:
1897 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1898 case S_BEGIN_DATA_3:
1899 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1900 case S_BEGIN_DATA_4:
1901 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1904 return segmenter_parse_title_1__ (s, input, n, eof, type);
1906 return segmenter_parse_title_2__ (s, input, n, eof, type);
1912 /* Returns the style of command prompt to display to an interactive user for
1913 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1914 and at the beginning of a line (that is, if segmenter_push() consumed as
1915 much as possible of the input up to a new-line). */
1917 segmenter_get_prompt (const struct segmenter *s)
1922 return PROMPT_FIRST;
1925 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1929 return PROMPT_COMMENT;
1933 return PROMPT_DOCUMENT;
1935 return PROMPT_FIRST;
1938 return PROMPT_LATER;
1942 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1944 return PROMPT_DO_REPEAT;
1948 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1951 return PROMPT_DEFINE;
1953 case S_BEGIN_DATA_1:
1954 return PROMPT_FIRST;
1955 case S_BEGIN_DATA_2:
1956 return PROMPT_LATER;
1957 case S_BEGIN_DATA_3:
1958 case S_BEGIN_DATA_4:
1963 return PROMPT_FIRST;