1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
58 #define SS_START_OF_LINE (1u << 0)
59 #define SS_START_OF_COMMAND (1u << 1)
61 static int segmenter_detect_command_name__ (const char *input,
62 size_t n, bool eof, int ofs);
65 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
68 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
76 mblen = u8_mbtoucr (puc, input, n);
80 return u8_mbtouc (puc, input, n);
91 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
92 bool eof, enum segment_type *type)
100 for (int ofs = 2; ; ofs++)
107 else if (input[ofs] == '\n')
109 if (input[ofs - 1] == '\r')
115 s->state = S_GENERAL;
116 s->substate = SS_START_OF_COMMAND;
126 s->state = S_GENERAL;
127 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
128 return segmenter_push (s, input, n, eof, type);
132 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
133 const char *input, size_t n, bool eof,
134 enum segment_type *type)
136 assert (s->state == S_GENERAL);
142 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
146 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
148 for (; ofs < n; ofs++)
150 if (input[ofs] == '\n')
152 else if (input[ofs] == '*')
155 return eof ? ofs + 1 : -1;
156 else if (input[ofs + 1] == '/')
160 return eof ? ofs : -1;
164 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
171 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
178 return eof ? ofs : -1;
179 else if (input[ofs + 1] != '*')
182 ofs = skip_comment (input, n, eof, ofs + 2);
186 else if (lex_uc_is_space (uc) && uc != '\n')
192 return eof ? ofs : -1;
196 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
200 else if (input[ofs] == '\n')
202 else if (input[ofs] == '\r')
206 return input[ofs + 1] == '\n';
213 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
215 ofs = skip_spaces_and_comments (input, n, eof, ofs);
219 return is_end_of_line (input, n, eof, ofs);
223 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
224 enum segment_type *type)
228 if (input[0] == '\n')
238 assert (input[0] == '\r');
239 assert (input[1] == '\n');
248 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
255 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
259 if (!lex_uc_is_space (uc) || uc == '\n')
265 return eof ? ofs : -1;
269 skip_digits (const char *input, size_t n, bool eof, int ofs)
271 for (; ofs < n; ofs++)
272 if (!c_isdigit (input[ofs]))
274 return eof ? ofs : -1;
278 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
279 bool eof, enum segment_type *type)
283 assert (s->state == S_GENERAL);
285 ofs = skip_digits (input, n, eof, 0);
295 if (input[ofs] == '.')
304 ofs = skip_digits (input, n, eof, ofs + 1);
311 if (input[ofs] == 'e' || input[ofs] == 'E')
318 goto expected_exponent;
321 if (input[ofs] == '+' || input[ofs] == '-')
328 goto expected_exponent;
332 if (!c_isdigit (input[ofs]))
333 goto expected_exponent;
335 ofs = skip_digits (input, n, eof, ofs);
340 if (input[ofs - 1] == '.')
342 int eol = at_end_of_line (input, n, eof, ofs);
355 *type = SEG_EXPECTED_EXPONENT;
/* Returns true if the N bytes starting at S spell one of the reserved words
   BY, EQ, GE, GT, LE, LT, NE, OR, TO, ALL, AND, NOT, or WITH, compared
   without regard to ASCII case.  Returns false for anything else, including
   any N outside the range 2...4. */
static bool
is_reserved_word (const char *s, int n)
{
  static const char *const words[] =
    {
      "BY", "EQ", "GE", "GT", "LE", "LT", "NE", "OR", "TO",
      "ALL", "AND", "NOT",
      "WITH",
    };
  char upper[4];

  /* Every reserved word is 2 to 4 bytes long. */
  if (n < 2 || n > (int) sizeof upper)
    return false;

  /* Fold the candidate to upper case once, then compare it byte-for-byte
     against each reserved word of the same length. */
  for (int i = 0; i < n; i++)
    upper[i] = c_toupper (s[i]);

  for (size_t i = 0; i < sizeof words / sizeof *words; i++)
    if (strlen (words[i]) == (size_t) n && !memcmp (upper, words[i], n))
      return true;

  return false;
}
397 segmenter_parse_comment_1__ (struct segmenter *s,
398 const char *input, size_t n, bool eof,
399 enum segment_type *type)
411 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
422 if (ofs > 1 && input[ofs - 1] == '\r')
426 /* Blank line ends comment command. */
427 s->state = S_GENERAL;
428 s->substate = SS_START_OF_COMMAND;
429 *type = SEG_SEPARATE_COMMANDS;
432 else if (endcmd >= 0)
434 /* '.' at end of line ends comment command. */
435 s->state = S_GENERAL;
437 *type = SEG_COMMENT_COMMAND;
442 /* Comment continues onto next line. */
443 *type = SEG_COMMENT_COMMAND;
444 s->state = S_COMMENT_2;
450 if (!lex_uc_is_space (uc))
461 s->state = S_GENERAL;
462 s->substate = SS_START_OF_COMMAND;
463 *type = SEG_SEPARATE_COMMANDS;
471 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
472 size_t n, bool eof, enum segment_type *type)
474 int ofs = segmenter_parse_newline__ (input, n, eof, type);
488 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
492 if (uc == '+' || uc == '-' || uc == '.')
494 else if (!lex_uc_is_space (uc))
497 case SEG_MODE_INTERACTIVE:
506 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
520 s->state = S_GENERAL;
521 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
524 s->state = S_COMMENT_1;
529 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
530 bool eof, enum segment_type *type)
542 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
553 if (ofs > 1 && input[ofs - 1] == '\r')
556 *type = SEG_DOCUMENT;
557 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
561 if (!lex_uc_is_space (uc))
570 *type = SEG_DOCUMENT;
571 s->state = S_DOCUMENT_3;
578 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
579 bool eof, enum segment_type *type)
583 ofs = segmenter_parse_newline__ (input, n, eof, type);
587 s->state = S_DOCUMENT_1;
592 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
594 *type = SEG_END_COMMAND;
595 s->state = S_GENERAL;
596 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
601 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
604 ofs = skip_spaces_and_comments (input, n, eof, ofs);
610 return c != '\'' && c != '"' && c != '\n';
620 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
621 bool eof, int ofs, char id[], size_t id_size)
623 struct segmenter sub;
625 assert (id_size > 0);
628 sub.state = S_GENERAL;
632 enum segment_type type;
635 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
651 if (retval < id_size)
653 memcpy (id, input + ofs, retval);
660 case SEG_QUOTED_STRING:
662 case SEG_UNICODE_STRING:
663 case SEG_UNQUOTED_STRING:
664 case SEG_RESERVED_WORD:
666 case SEG_COMMENT_COMMAND:
667 case SEG_DO_REPEAT_COMMAND:
668 case SEG_INLINE_DATA:
671 case SEG_START_DOCUMENT:
673 case SEG_START_COMMAND:
674 case SEG_SEPARATE_COMMANDS:
675 case SEG_END_COMMAND:
677 case SEG_EXPECTED_QUOTE:
678 case SEG_EXPECTED_EXPONENT:
679 case SEG_UNEXPECTED_DOT:
680 case SEG_UNEXPECTED_CHAR:
688 /* Called when INPUT begins with a character that can start off an ID token. */
690 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
691 bool eof, enum segment_type *type)
697 assert (s->state == S_GENERAL);
699 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
711 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
714 else if (!lex_uc_is_idn (uc))
720 if (input[ofs - 1] == '.')
722 int eol = at_end_of_line (input, n, eof, ofs);
729 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
730 : input[0] == '!' ? SEG_MACRO_ID
733 if (s->substate & SS_START_OF_COMMAND)
735 struct substring word = ss_buffer (input, ofs);
737 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
739 s->state = S_COMMENT_1;
740 return segmenter_parse_comment_1__ (s, input, n, eof, type);
742 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
744 s->state = S_DOCUMENT_1;
745 *type = SEG_START_DOCUMENT;
748 else if (lex_id_match (ss_cstr ("TITLE"), word)
749 || lex_id_match (ss_cstr ("SUBTITLE"), word))
751 int result = segmenter_unquoted (input, n, eof, ofs);
756 s->state = S_TITLE_1;
760 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
762 s->state = S_DEFINE_1;
765 else if (lex_id_match (ss_cstr ("FILE"), word))
769 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
771 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
773 s->state = S_FILE_LABEL;
778 else if (lex_id_match (ss_cstr ("DO"), word))
782 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
784 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
786 s->state = S_DO_REPEAT_1;
791 else if (lex_id_match (ss_cstr ("BEGIN"), word))
796 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
799 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
803 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
809 else if (input[ofs2] == '.')
811 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
816 eol = is_end_of_line (input, n, eof, ofs2);
821 if (memchr (input, '\n', ofs2))
822 s->state = S_BEGIN_DATA_1;
824 s->state = S_BEGIN_DATA_2;
837 segmenter_parse_string__ (enum segment_type string_type,
838 int ofs, struct segmenter *s,
839 const char *input, size_t n, bool eof,
840 enum segment_type *type)
842 int quote = input[ofs];
846 if (input[ofs] == quote)
851 if (input[ofs] == quote)
864 else if (input[ofs] == '\n')
875 *type = SEG_EXPECTED_QUOTE;
881 segmenter_maybe_parse_string__ (enum segment_type string_type,
883 const char *input, size_t n, bool eof,
884 enum segment_type *type)
891 else if (input[1] == '\'' || input[1] == '"')
892 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
894 return segmenter_parse_id__ (s, input, n, eof, type);
898 segmenter_parse_mid_command__ (struct segmenter *s,
899 const char *input, size_t n, bool eof,
900 enum segment_type *type)
906 assert (s->state == S_GENERAL);
907 assert (!(s->substate & SS_START_OF_LINE));
909 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
916 s->substate |= SS_START_OF_LINE;
926 else if (input[1] == '*')
928 ofs = skip_comment (input, n, eof, 2);
940 case '(': case ')': case ',': case '=': case '-':
941 case '[': case ']': case '&': case '|': case '+':
947 if (s->substate & SS_START_OF_COMMAND)
949 /* '*' at the beginning of a command begins a comment. */
950 s->state = S_COMMENT_1;
951 return segmenter_parse_comment_1__ (s, input, n, eof, type);
954 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
957 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
960 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
963 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
971 else if (c_isdigit (input[1]))
972 return segmenter_parse_number__ (s, input, n, eof, type);
974 int eol = at_end_of_line (input, n, eof, 1);
980 *type = SEG_END_COMMAND;
981 s->substate = SS_START_OF_COMMAND;
984 *type = SEG_UNEXPECTED_DOT;
987 case '0': case '1': case '2': case '3': case '4':
988 case '5': case '6': case '7': case '8': case '9':
989 return segmenter_parse_number__ (s, input, n, eof, type);
992 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
993 s, input, n, eof, type);
996 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
997 s, input, n, eof, type);
1000 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1001 s, input, n, eof, type);
1004 return segmenter_parse_id__ (s, input, n, eof, type);
1007 if (lex_uc_is_space (uc))
1009 ofs = skip_spaces (input, n, eof, mblen);
1013 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1017 s->substate |= SS_START_OF_LINE;
1018 *type = SEG_NEWLINE;
1027 else if (lex_uc_is_id1 (uc))
1028 return segmenter_parse_id__ (s, input, n, eof, type);
1029 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1037 *type = SEG_UNEXPECTED_CHAR;
/* qsort() comparison callback for an array of `const char *' strings.  A_ and
   B_ each point to one array element.  Orders the strings with
   c_strcasecmp(), so upper and lower case compare equal. */
static int
compare_commands (const void *a_, const void *b_)
{
  const char *lhs = *(const char *const *) a_;
  const char *rhs = *(const char *const *) b_;

  return c_strcasecmp (lhs, rhs);
}
1055 static const char **
1056 segmenter_get_command_name_candidates (unsigned char first)
1058 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1059 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1060 static const char *commands[] =
1062 #include "language/command.def"
1065 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1071 static const char **cindex[UCHAR_MAX + 1];
1079 qsort (commands, n_commands, sizeof *commands, compare_commands);
1080 for (i = 0; i < n_commands; i++)
1082 unsigned char c = c_toupper (commands[i][0]);
1083 if (cindex[c] == NULL)
1084 cindex[c] = &commands[i];
1086 for (i = 0; i <= UCHAR_MAX; i++)
1087 if (cindex[i] == NULL)
1088 cindex[i] = &commands[n_commands];
1091 return cindex[c_toupper (first)];
1095 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1098 const char **commands;
1115 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1120 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1128 if (input[ofs - 1] == '.')
1131 for (commands = segmenter_get_command_name_candidates (input[0]);
1132 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1138 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1139 &exact, &missing_words)
1140 && missing_words <= 0)
1148 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1151 return eof ? 0 : -1;
1154 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1157 return eof ? 0 : -1;
1159 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1162 return c == '\'' || c == '"' || c == '\n';
1166 segmenter_parse_start_of_line__ (struct segmenter *s,
1167 const char *input, size_t n, bool eof,
1168 enum segment_type *type)
1174 assert (s->state == S_GENERAL);
1175 assert (s->substate & SS_START_OF_LINE);
1177 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1184 ofs = skip_spaces_and_comments (input, n, eof, 1);
1189 int is_string = is_start_of_string__ (input, n, eof, ofs);
1194 /* This is punctuation that may separate pieces of a string. */
1204 *type = SEG_START_COMMAND;
1205 s->substate = SS_START_OF_COMMAND;
1209 if (lex_uc_is_space (uc))
1211 int eol = at_end_of_line (input, n, eof, 0);
1216 s->substate = SS_START_OF_COMMAND;
1217 *type = SEG_SEPARATE_COMMANDS;
1223 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1225 else if (s->mode == SEG_MODE_AUTO)
1227 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1234 assert (s->mode == SEG_MODE_BATCH);
1236 s->substate = SS_START_OF_COMMAND;
1237 *type = SEG_START_COMMAND;
1241 s->substate = SS_START_OF_COMMAND;
1242 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1246 segmenter_parse_file_label__ (struct segmenter *s,
1247 const char *input, size_t n, bool eof,
1248 enum segment_type *type)
1250 struct segmenter sub;
1254 sub.state = S_GENERAL;
1255 ofs = segmenter_push (&sub, input, n, eof, type);
1259 else if (*type == SEG_IDENTIFIER)
1263 assert (lex_id_match (ss_cstr ("LABEL"),
1264 ss_buffer ((char *) input, ofs)));
1265 result = segmenter_unquoted (input, n, eof, ofs);
1271 s->state = S_TITLE_1;
1279 s->substate = sub.substate;
1285 segmenter_subparse (struct segmenter *s,
1286 const char *input, size_t n, bool eof,
1287 enum segment_type *type)
1289 struct segmenter sub;
1293 sub.state = S_GENERAL;
1294 sub.substate = s->substate;
1295 ofs = segmenter_push (&sub, input, n, eof, type);
1296 s->substate = sub.substate;
1300 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1301 defines the stand-in variables (the head) before the lines of syntax to be
1302 repeated (the body). */
1304 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1305 const char *input, size_t n, bool eof,
1306 enum segment_type *type)
1308 int ofs = segmenter_subparse (s, input, n, eof, type);
1312 if (*type == SEG_SEPARATE_COMMANDS)
1314 /* We reached a blank line that separates the head from the body. */
1315 s->state = S_DO_REPEAT_2;
1317 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1319 /* We reached the body. */
1320 s->state = S_DO_REPEAT_3;
1327 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1328 separates the head from the body. */
1330 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1331 const char *input, size_t n, bool eof,
1332 enum segment_type *type)
1334 int ofs = segmenter_subparse (s, input, n, eof, type);
1338 if (*type == SEG_NEWLINE)
1340 /* We reached the body. */
1341 s->state = S_DO_REPEAT_3;
1349 check_repeat_command (struct segmenter *s,
1350 const char *input, size_t n, bool eof)
1357 if (input[ofs] == '+' || input[ofs] == '-')
1360 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1363 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1365 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1370 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1374 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1375 s->substate += direction;
1380 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1381 enum segment_type *type)
1383 const char *newline = memchr (input, '\n', n);
1385 return eof ? n : -1;
1387 ptrdiff_t ofs = newline - input;
1388 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1390 *type = SEG_NEWLINE;
1394 return ofs - (input[ofs - 1] == '\r');
1397 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1398 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1400 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1401 the lines we're segmenting. s->substate counts the nesting level, starting
1404 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1405 const char *input, size_t n, bool eof,
1406 enum segment_type *type)
1410 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1411 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1413 else if (!check_repeat_command (s, input, n, eof) && !eof)
1415 else if (s->substate == 0)
1417 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1419 s->state = S_GENERAL;
1420 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1421 return segmenter_push (s, input, n, eof, type);
1425 *type = SEG_DO_REPEAT_COMMAND;
1430 /* We are segmenting a DEFINE command, which consists of:
1432 - The DEFINE keyword.
1436 - "(" followed by a sequence of tokens possibly including balanced parentheses
1441 - A sequence of lines that don't start with "!ENDDEFINE", one string per line,
1442 each ending in a newline.
1448 segmenter_parse_define_1__ (struct segmenter *s,
1449 const char *input, size_t n, bool eof,
1450 enum segment_type *type)
1452 int ofs = segmenter_subparse (s, input, n, eof, type);
1456 if (*type == SEG_SEPARATE_COMMANDS
1457 || *type == SEG_END_COMMAND
1458 || *type == SEG_START_COMMAND)
1460 /* The DEFINE command is malformed because we reached its end without
1461 ever hitting a "(" token. Transition back to general parsing. */
1462 s->state = S_GENERAL;
1465 else if (*type == SEG_PUNCT && input[0] == '(')
1467 s->state = S_DEFINE_2;
1476 segmenter_parse_define_2__ (struct segmenter *s,
1477 const char *input, size_t n, bool eof,
1478 enum segment_type *type)
1480 int ofs = segmenter_subparse (s, input, n, eof, type);
1484 if (*type == SEG_SEPARATE_COMMANDS
1485 || *type == SEG_END_COMMAND
1486 || *type == SEG_START_COMMAND)
1488 /* The DEFINE command is malformed because we reached its end before
1489 closing the set of parentheses. Transition back to general
1491 s->state = S_GENERAL;
1494 else if (*type == SEG_PUNCT && input[0] == '(')
1499 else if (*type == SEG_PUNCT && input[0] == ')')
1503 s->state = S_DEFINE_3;
1511 segmenter_parse_define_3__ (struct segmenter *s,
1512 const char *input, size_t n, bool eof,
1513 enum segment_type *type)
1515 int ofs = segmenter_subparse (s, input, n, eof, type);
1519 if (*type == SEG_END_COMMAND)
1521 /* The DEFINE command is malformed because there was a command terminator
1522 before the first line of the body. Transition back to general
1524 s->state = S_GENERAL;
1527 else if (*type == SEG_NEWLINE)
1528 s->state = S_DEFINE_4;
1534 is_enddefine (const char *input, size_t n)
1536 int ofs = skip_spaces_and_comments (input, n, true, 0);
1539 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1540 if (n - ofs < enddefine.length)
1543 if (!ss_equals_case (ss_buffer (input + ofs, enddefine.length), enddefine))
1546 if (ofs + enddefine.length >= n)
1549 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1551 u8_mbtouc (&uc, u_input + ofs, n - ofs);
1552 return uc == '.' || !lex_uc_is_idn (uc);
1556 segmenter_parse_define_4__ (struct segmenter *s,
1557 const char *input, size_t n, bool eof,
1558 enum segment_type *type)
1562 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1565 else if (is_enddefine (input, ofs))
1567 s->state = S_GENERAL;
1568 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1569 return segmenter_push (s, input, n, eof, type);
1573 *type = SEG_MACRO_BODY;
1574 s->state = S_DEFINE_5;
1575 return input[ofs - 1] == '\n' ? 0 : ofs;
1580 segmenter_parse_define_5__ (struct segmenter *s,
1581 const char *input, size_t n, bool eof,
1582 enum segment_type *type)
1586 ofs = segmenter_parse_newline__ (input, n, eof, type);
1590 s->state = S_DEFINE_4;
1595 segmenter_parse_begin_data_1__ (struct segmenter *s,
1596 const char *input, size_t n, bool eof,
1597 enum segment_type *type)
1599 int ofs = segmenter_subparse (s, input, n, eof, type);
1603 if (*type == SEG_NEWLINE)
1604 s->state = S_BEGIN_DATA_2;
1610 segmenter_parse_begin_data_2__ (struct segmenter *s,
1611 const char *input, size_t n, bool eof,
1612 enum segment_type *type)
1614 int ofs = segmenter_subparse (s, input, n, eof, type);
1618 if (*type == SEG_NEWLINE)
1619 s->state = S_BEGIN_DATA_3;
1625 is_end_data (const char *input, size_t n)
1627 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1633 if (n < 4 || c_strncasecmp (input, "END", 3))
1637 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1638 if (!lex_uc_is_space (uc))
1642 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1649 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1656 else if (!lex_uc_is_space (uc))
1665 segmenter_parse_begin_data_3__ (struct segmenter *s,
1666 const char *input, size_t n, bool eof,
1667 enum segment_type *type)
1671 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1674 else if (is_end_data (input, ofs))
1676 s->state = S_GENERAL;
1677 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1678 return segmenter_push (s, input, n, eof, type);
1682 *type = SEG_INLINE_DATA;
1683 s->state = S_BEGIN_DATA_4;
1684 return input[ofs - 1] == '\n' ? 0 : ofs;
1689 segmenter_parse_begin_data_4__ (struct segmenter *s,
1690 const char *input, size_t n, bool eof,
1691 enum segment_type *type)
1695 ofs = segmenter_parse_newline__ (input, n, eof, type);
1699 s->state = S_BEGIN_DATA_3;
1704 segmenter_parse_title_1__ (struct segmenter *s,
1705 const char *input, size_t n, bool eof,
1706 enum segment_type *type)
1710 ofs = skip_spaces (input, n, eof, 0);
1713 s->state = S_TITLE_2;
1719 segmenter_parse_title_2__ (struct segmenter *s,
1720 const char *input, size_t n, bool eof,
1721 enum segment_type *type)
1733 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1747 if (!lex_uc_is_space (uc))
1758 s->state = S_GENERAL;
1760 *type = SEG_UNQUOTED_STRING;
1761 return endcmd >= 0 ? endcmd : ofs;
1767 /* Returns the name of segment TYPE as a string. The caller must not modify
1768 or free the returned string.
1770 This is useful only for debugging and testing. */
1772 segment_type_to_string (enum segment_type type)
1776 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1780 return "unknown segment type";
1784 /* Initializes S as a segmenter with the given syntax MODE.
1786 A segmenter does not contain any external references, so nothing needs to be
1787 done to destroy one. For the same reason, segmenters may be copied with
1788 plain struct assignment (or memcpy). */
1790 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1792 s->state = S_SHBANG;
1797 /* Returns the mode passed to segmenter_init() for S. */
1799 segmenter_get_mode (const struct segmenter *s)
1804 /* Attempts to label a prefix of S's remaining input with a segment type. The
1805 caller supplies the first N bytes of the remaining input as INPUT, which
1806 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1807 are the entire (remainder) of the input; if EOF is false, then further input
1808 is potentially available.
1810 The input may contain '\n' or '\r\n' line ends in any combination.
1812 If successful, returns the number of bytes in the segment at the beginning
1813 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1814 into *TYPE. The next call to segmenter_push() should not include those
1815 bytes as part of INPUT, because they have (figuratively) been consumed by the segmenter.
1818 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1819 be determined. In this case segmenter_push() returns -1. If more input is
1820 available, the caller should obtain some more, then call again with a larger
1821 N. If this is not enough, the process might need to repeat again and again.
1822 If input is exhausted, then the caller may call again setting EOF to true.
1823 segmenter_push() will never return -1 when EOF is true.
1825 The caller must not, in a sequence of calls, supply contradictory input.
1826 That is, bytes provided as part of INPUT in one call, but not consumed, must
1827 not be provided with *different* values on subsequent calls. This is
1828 because segmenter_push() must often make decisions based on looking ahead
1829 beyond the bytes that it consumes. */
1831 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1832 enum segment_type *type)
1848 return segmenter_parse_shbang__ (s, input, n, eof, type);
1851 return (s->substate & SS_START_OF_LINE
1852 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1853 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1856 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1858 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1861 return segmenter_parse_document_1__ (s, input, n, eof, type);
1863 return segmenter_parse_document_2__ (s, input, n, eof, type);
1865 return segmenter_parse_document_3__ (s, type);
1868 return segmenter_parse_file_label__ (s, input, n, eof, type);
1871 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1873 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1875 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1878 return segmenter_parse_define_1__ (s, input, n, eof, type);
1880 return segmenter_parse_define_2__ (s, input, n, eof, type);
1882 return segmenter_parse_define_3__ (s, input, n, eof, type);
1884 return segmenter_parse_define_4__ (s, input, n, eof, type);
1886 return segmenter_parse_define_5__ (s, input, n, eof, type);
1888 case S_BEGIN_DATA_1:
1889 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1890 case S_BEGIN_DATA_2:
1891 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1892 case S_BEGIN_DATA_3:
1893 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1894 case S_BEGIN_DATA_4:
1895 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1898 return segmenter_parse_title_1__ (s, input, n, eof, type);
1900 return segmenter_parse_title_2__ (s, input, n, eof, type);
1906 /* Returns the style of command prompt to display to an interactive user for
1907 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1908 and at the beginning of a line (that is, if segmenter_push() consumed as
1909 much as possible of the input up to a new-line). */
1911 segmenter_get_prompt (const struct segmenter *s)
1916 return PROMPT_FIRST;
1919 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1923 return PROMPT_COMMENT;
1927 return PROMPT_DOCUMENT;
1929 return PROMPT_FIRST;
1932 return PROMPT_LATER;
1936 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1938 return PROMPT_DO_REPEAT;
1943 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1946 return PROMPT_DEFINE;
1948 case S_BEGIN_DATA_1:
1949 return PROMPT_FIRST;
1950 case S_BEGIN_DATA_2:
1951 return PROMPT_LATER;
1952 case S_BEGIN_DATA_3:
1953 case S_BEGIN_DATA_4:
1958 return PROMPT_FIRST;