1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
58 #define SS_START_OF_LINE (1u << 0)
59 #define SS_START_OF_COMMAND (1u << 1)
61 static int segmenter_detect_command_name__ (const char *input,
62 size_t n, bool eof, int ofs);
65 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
68 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
76 mblen = u8_mbtoucr (puc, input, n);
80 return u8_mbtouc (puc, input, n);
91 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
92 bool eof, enum segment_type *type)
100 for (int ofs = 2; ; ofs++)
107 else if (input[ofs] == '\n')
109 if (input[ofs - 1] == '\r')
115 s->state = S_GENERAL;
116 s->substate = SS_START_OF_COMMAND;
126 s->state = S_GENERAL;
127 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
128 return segmenter_push (s, input, n, eof, type);
132 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
133 const char *input, size_t n, bool eof,
134 enum segment_type *type)
136 assert (s->state == S_GENERAL);
142 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
146 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
148 for (; ofs < n; ofs++)
150 if (input[ofs] == '\n')
152 else if (input[ofs] == '*')
155 return eof ? ofs + 1 : -1;
156 else if (input[ofs + 1] == '/')
160 return eof ? ofs : -1;
164 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
171 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
178 return eof ? ofs : -1;
179 else if (input[ofs + 1] != '*')
182 ofs = skip_comment (input, n, eof, ofs + 2);
186 else if (lex_uc_is_space (uc) && uc != '\n')
192 return eof ? ofs : -1;
196 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
200 else if (input[ofs] == '\n')
202 else if (input[ofs] == '\r')
206 return input[ofs + 1] == '\n';
213 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
215 ofs = skip_spaces_and_comments (input, n, eof, ofs);
219 return is_end_of_line (input, n, eof, ofs);
223 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
224 enum segment_type *type)
228 if (input[0] == '\n')
238 assert (input[0] == '\r');
239 assert (input[1] == '\n');
248 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
255 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
259 if (!lex_uc_is_space (uc) || uc == '\n')
265 return eof ? ofs : -1;
269 skip_digits (const char *input, size_t n, bool eof, int ofs)
271 for (; ofs < n; ofs++)
272 if (!c_isdigit (input[ofs]))
274 return eof ? ofs : -1;
278 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
279 bool eof, enum segment_type *type)
283 assert (s->state == S_GENERAL);
285 ofs = skip_digits (input, n, eof, 0);
295 if (input[ofs] == '.')
304 ofs = skip_digits (input, n, eof, ofs + 1);
311 if (input[ofs] == 'e' || input[ofs] == 'E')
318 goto expected_exponent;
321 if (input[ofs] == '+' || input[ofs] == '-')
328 goto expected_exponent;
332 if (!c_isdigit (input[ofs]))
333 goto expected_exponent;
335 ofs = skip_digits (input, n, eof, ofs);
340 if (input[ofs - 1] == '.')
342 int eol = at_end_of_line (input, n, eof, ofs);
355 *type = SEG_EXPECTED_EXPONENT;
361 is_reserved_word (const char *s, int n)
365 s0 = c_toupper (s[0]);
369 s1 = c_toupper (s[1]);
370 return ((s0 == 'B' && s1 == 'Y')
371 || (s0 == 'E' && s1 == 'Q')
372 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
373 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
374 || (s0 == 'N' && s1 == 'E')
375 || (s0 == 'O' && s1 == 'R')
376 || (s0 == 'T' && s1 == 'O'));
379 s1 = c_toupper (s[1]);
380 s2 = c_toupper (s[2]);
381 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
382 || (s1 == 'N' && s2 == 'D')))
383 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
386 s1 = c_toupper (s[1]);
387 s2 = c_toupper (s[2]);
388 s3 = c_toupper (s[3]);
389 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
397 segmenter_parse_comment_1__ (struct segmenter *s,
398 const char *input, size_t n, bool eof,
399 enum segment_type *type)
411 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
422 if (ofs > 1 && input[ofs - 1] == '\r')
426 /* Blank line ends comment command. */
427 s->state = S_GENERAL;
428 s->substate = SS_START_OF_COMMAND;
429 *type = SEG_SEPARATE_COMMANDS;
432 else if (endcmd >= 0)
434 /* '.' at end of line ends comment command. */
435 s->state = S_GENERAL;
437 *type = SEG_COMMENT_COMMAND;
442 /* Comment continues onto next line. */
443 *type = SEG_COMMENT_COMMAND;
444 s->state = S_COMMENT_2;
450 if (!lex_uc_is_space (uc))
461 s->state = S_GENERAL;
462 s->substate = SS_START_OF_COMMAND;
463 *type = SEG_SEPARATE_COMMANDS;
471 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
472 size_t n, bool eof, enum segment_type *type)
474 int ofs = segmenter_parse_newline__ (input, n, eof, type);
488 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
492 if (uc == '+' || uc == '-' || uc == '.')
494 else if (!lex_uc_is_space (uc))
497 case SEG_MODE_INTERACTIVE:
506 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
520 s->state = S_GENERAL;
521 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
524 s->state = S_COMMENT_1;
529 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
530 bool eof, enum segment_type *type)
542 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
553 if (ofs > 1 && input[ofs - 1] == '\r')
556 *type = SEG_DOCUMENT;
557 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
561 if (!lex_uc_is_space (uc))
570 *type = SEG_DOCUMENT;
571 s->state = S_DOCUMENT_3;
578 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
579 bool eof, enum segment_type *type)
583 ofs = segmenter_parse_newline__ (input, n, eof, type);
587 s->state = S_DOCUMENT_1;
592 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
594 *type = SEG_END_COMMAND;
595 s->state = S_GENERAL;
596 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
601 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
604 ofs = skip_spaces_and_comments (input, n, eof, ofs);
610 return c != '\'' && c != '"' && c != '\n';
620 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
621 bool eof, int ofs, char id[], size_t id_size)
623 struct segmenter sub;
625 assert (id_size > 0);
628 sub.state = S_GENERAL;
632 enum segment_type type;
635 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
651 if (retval < id_size)
653 memcpy (id, input + ofs, retval);
660 case SEG_QUOTED_STRING:
662 case SEG_UNICODE_STRING:
663 case SEG_UNQUOTED_STRING:
664 case SEG_RESERVED_WORD:
666 case SEG_COMMENT_COMMAND:
667 case SEG_DO_REPEAT_COMMAND:
668 case SEG_INLINE_DATA:
671 case SEG_START_DOCUMENT:
673 case SEG_START_COMMAND:
674 case SEG_SEPARATE_COMMANDS:
675 case SEG_END_COMMAND:
677 case SEG_EXPECTED_QUOTE:
678 case SEG_EXPECTED_EXPONENT:
679 case SEG_UNEXPECTED_DOT:
680 case SEG_UNEXPECTED_CHAR:
688 /* Called when INPUT begins with a character that can start off an ID token. */
690 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
691 bool eof, enum segment_type *type)
697 assert (s->state == S_GENERAL);
699 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
711 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
714 else if (!lex_uc_is_idn (uc))
720 if (input[ofs - 1] == '.')
722 int eol = at_end_of_line (input, n, eof, ofs);
729 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
730 : input[0] == '!' ? SEG_MACRO_ID
733 if (s->substate & SS_START_OF_COMMAND)
735 struct substring word = ss_buffer (input, ofs);
737 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
739 s->state = S_COMMENT_1;
740 return segmenter_parse_comment_1__ (s, input, n, eof, type);
742 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
744 s->state = S_DOCUMENT_1;
745 *type = SEG_START_DOCUMENT;
748 else if (lex_id_match (ss_cstr ("TITLE"), word)
749 || lex_id_match (ss_cstr ("SUBTITLE"), word))
751 int result = segmenter_unquoted (input, n, eof, ofs);
756 s->state = S_TITLE_1;
760 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
762 s->state = S_DEFINE_1;
765 else if (lex_id_match (ss_cstr ("FILE"), word))
769 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
771 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
773 s->state = S_FILE_LABEL;
778 else if (lex_id_match (ss_cstr ("DO"), word))
782 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
784 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
786 s->state = S_DO_REPEAT_1;
791 else if (lex_id_match (ss_cstr ("BEGIN"), word))
796 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
799 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
803 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
809 else if (input[ofs2] == '.')
811 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
816 eol = is_end_of_line (input, n, eof, ofs2);
821 if (memchr (input, '\n', ofs2))
822 s->state = S_BEGIN_DATA_1;
824 s->state = S_BEGIN_DATA_2;
837 segmenter_parse_string__ (enum segment_type string_type,
838 int ofs, struct segmenter *s,
839 const char *input, size_t n, bool eof,
840 enum segment_type *type)
842 int quote = input[ofs];
846 if (input[ofs] == quote)
851 if (input[ofs] == quote)
864 else if (input[ofs] == '\n')
875 *type = SEG_EXPECTED_QUOTE;
881 segmenter_maybe_parse_string__ (enum segment_type string_type,
883 const char *input, size_t n, bool eof,
884 enum segment_type *type)
891 else if (input[1] == '\'' || input[1] == '"')
892 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
894 return segmenter_parse_id__ (s, input, n, eof, type);
898 segmenter_parse_mid_command__ (struct segmenter *s,
899 const char *input, size_t n, bool eof,
900 enum segment_type *type)
906 assert (s->state == S_GENERAL);
907 assert (!(s->substate & SS_START_OF_LINE));
909 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
916 s->substate |= SS_START_OF_LINE;
926 else if (input[1] == '*')
928 ofs = skip_comment (input, n, eof, 2);
940 case '(': case ')': case ',': case '=': case '-':
941 case '[': case ']': case '&': case '|': case '+':
947 if (s->substate & SS_START_OF_COMMAND)
949 /* '*' at the beginning of a command begins a comment. */
950 s->state = S_COMMENT_1;
951 return segmenter_parse_comment_1__ (s, input, n, eof, type);
954 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
957 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
960 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
963 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
971 else if (c_isdigit (input[1]))
972 return segmenter_parse_number__ (s, input, n, eof, type);
974 int eol = at_end_of_line (input, n, eof, 1);
980 *type = SEG_END_COMMAND;
981 s->substate = SS_START_OF_COMMAND;
984 *type = SEG_UNEXPECTED_DOT;
987 case '0': case '1': case '2': case '3': case '4':
988 case '5': case '6': case '7': case '8': case '9':
989 return segmenter_parse_number__ (s, input, n, eof, type);
992 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
993 s, input, n, eof, type);
996 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
997 s, input, n, eof, type);
1000 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1001 s, input, n, eof, type);
1004 return segmenter_parse_id__ (s, input, n, eof, type);
1007 if (lex_uc_is_space (uc))
1009 ofs = skip_spaces (input, n, eof, mblen);
1013 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1017 s->substate |= SS_START_OF_LINE;
1018 *type = SEG_NEWLINE;
1027 else if (lex_uc_is_id1 (uc))
1028 return segmenter_parse_id__ (s, input, n, eof, type);
1029 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1037 *type = SEG_UNEXPECTED_CHAR;
1045 compare_commands (const void *a_, const void *b_)
1047 const char *const *ap = a_;
1048 const char *const *bp = b_;
1049 const char *a = *ap;
1050 const char *b = *bp;
1052 return c_strcasecmp (a, b);
1055 static const char **
1056 segmenter_get_command_name_candidates (unsigned char first)
1058 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1059 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1060 static const char *commands[] =
1062 #include "language/command.def"
1065 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1071 static const char **cindex[UCHAR_MAX + 1];
1079 qsort (commands, n_commands, sizeof *commands, compare_commands);
1080 for (i = 0; i < n_commands; i++)
1082 unsigned char c = c_toupper (commands[i][0]);
1083 if (cindex[c] == NULL)
1084 cindex[c] = &commands[i];
1086 for (i = 0; i <= UCHAR_MAX; i++)
1087 if (cindex[i] == NULL)
1088 cindex[i] = &commands[n_commands];
1091 return cindex[c_toupper (first)];
1095 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1098 const char **commands;
1115 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1120 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1128 if (input[ofs - 1] == '.')
1131 for (commands = segmenter_get_command_name_candidates (input[0]);
1132 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1138 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1139 &exact, &missing_words)
1140 && missing_words <= 0)
1148 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1151 return eof ? 0 : -1;
1154 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1157 return eof ? 0 : -1;
1159 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1162 return c == '\'' || c == '"' || c == '\n';
1166 segmenter_parse_start_of_line__ (struct segmenter *s,
1167 const char *input, size_t n, bool eof,
1168 enum segment_type *type)
1174 assert (s->state == S_GENERAL);
1175 assert (s->substate & SS_START_OF_LINE);
1177 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1184 ofs = skip_spaces_and_comments (input, n, eof, 1);
1189 int is_string = is_start_of_string__ (input, n, eof, ofs);
1194 /* This is punctuation that may separate pieces of a string. */
1204 *type = SEG_START_COMMAND;
1205 s->substate = SS_START_OF_COMMAND;
1209 if (lex_uc_is_space (uc))
1211 int eol = at_end_of_line (input, n, eof, 0);
1216 s->substate = SS_START_OF_COMMAND;
1217 *type = SEG_SEPARATE_COMMANDS;
1223 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1225 else if (s->mode == SEG_MODE_AUTO)
1227 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1234 assert (s->mode == SEG_MODE_BATCH);
1236 s->substate = SS_START_OF_COMMAND;
1237 *type = SEG_START_COMMAND;
1241 s->substate = SS_START_OF_COMMAND;
1242 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1246 segmenter_parse_file_label__ (struct segmenter *s,
1247 const char *input, size_t n, bool eof,
1248 enum segment_type *type)
1250 struct segmenter sub;
1254 sub.state = S_GENERAL;
1255 ofs = segmenter_push (&sub, input, n, eof, type);
1259 else if (*type == SEG_IDENTIFIER)
1263 assert (lex_id_match (ss_cstr ("LABEL"),
1264 ss_buffer ((char *) input, ofs)));
1265 result = segmenter_unquoted (input, n, eof, ofs);
1271 s->state = S_TITLE_1;
1279 s->substate = sub.substate;
1285 segmenter_subparse (struct segmenter *s,
1286 const char *input, size_t n, bool eof,
1287 enum segment_type *type)
1289 struct segmenter sub;
1293 sub.state = S_GENERAL;
1294 sub.substate = s->substate;
1295 ofs = segmenter_push (&sub, input, n, eof, type);
1296 s->substate = sub.substate;
1300 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1301 defines the stand-in variables (the head) before the lines of syntax to be
1302 repeated (the body). */
1304 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1305 const char *input, size_t n, bool eof,
1306 enum segment_type *type)
1308 int ofs = segmenter_subparse (s, input, n, eof, type);
1312 if (*type == SEG_SEPARATE_COMMANDS)
1314 /* We reached a blank line that separates the head from the body. */
1315 s->state = S_DO_REPEAT_2;
1317 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1319 /* We reached the body. */
1320 s->state = S_DO_REPEAT_3;
1327 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1328 separates the head from the body. */
1330 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1331 const char *input, size_t n, bool eof,
1332 enum segment_type *type)
1334 int ofs = segmenter_subparse (s, input, n, eof, type);
1338 if (*type == SEG_NEWLINE)
1340 /* We reached the body. */
1341 s->state = S_DO_REPEAT_3;
1349 check_repeat_command (struct segmenter *s,
1350 const char *input, size_t n, bool eof)
1357 if (input[ofs] == '+' || input[ofs] == '-')
1360 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1363 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1365 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1370 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1374 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1375 s->substate += direction;
1380 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1381 enum segment_type *type)
1383 const char *newline = memchr (input, '\n', n);
1385 return eof ? n : -1;
1387 ptrdiff_t ofs = newline - input;
1388 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1390 *type = SEG_NEWLINE;
1394 return ofs - (input[ofs - 1] == '\r');
1397 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1398 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1400 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1401 the lines we're segmenting. s->substate counts the nesting level, starting
1404 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1405 const char *input, size_t n, bool eof,
1406 enum segment_type *type)
1410 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1411 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1413 else if (!check_repeat_command (s, input, n, eof) && !eof)
1415 else if (s->substate == 0)
1417 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1419 s->state = S_GENERAL;
1420 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1421 return segmenter_push (s, input, n, eof, type);
1425 *type = SEG_DO_REPEAT_COMMAND;
1430 /* We are segmenting a DEFINE command, which consists of:
1432 - The DEFINE keyword.
1436 - "(" followed by a sequence of tokens possibly including balanced parentheses
1441 - A sequence of lines that don't start with "!ENDDEFINE", one string per line,
1442 each ending in a newline.
1448 segmenter_parse_define_1__ (struct segmenter *s,
1449 const char *input, size_t n, bool eof,
1450 enum segment_type *type)
1452 int ofs = segmenter_subparse (s, input, n, eof, type);
1456 if (*type == SEG_SEPARATE_COMMANDS
1457 || *type == SEG_END_COMMAND
1458 || *type == SEG_START_COMMAND)
1460 /* The DEFINE command is malformed because we reached its end without
1461 ever hitting a "(" token. Transition back to general parsing. */
1462 s->state = S_GENERAL;
1465 else if (*type == SEG_PUNCT && input[0] == '(')
1467 s->state = S_DEFINE_2;
1476 segmenter_parse_define_2__ (struct segmenter *s,
1477 const char *input, size_t n, bool eof,
1478 enum segment_type *type)
1480 int ofs = segmenter_subparse (s, input, n, eof, type);
1484 if (*type == SEG_SEPARATE_COMMANDS
1485 || *type == SEG_END_COMMAND
1486 || *type == SEG_START_COMMAND)
1488 /* The DEFINE command is malformed because we reached its end before
1489 closing the set of parentheses. Transition back to general
1491 s->state = S_GENERAL;
1494 else if (*type == SEG_PUNCT && input[0] == '(')
1499 else if (*type == SEG_PUNCT && input[0] == ')')
1503 s->state = S_DEFINE_3;
1511 segmenter_parse_define_3__ (struct segmenter *s,
1512 const char *input, size_t n, bool eof,
1513 enum segment_type *type)
1515 int ofs = segmenter_subparse (s, input, n, eof, type);
1519 if (*type == SEG_NEWLINE)
1520 s->state = S_DEFINE_4;
1526 is_enddefine (const char *input, size_t n)
1528 int ofs = skip_spaces_and_comments (input, n, true, 0);
1531 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1532 if (n - ofs < enddefine.length)
1535 if (!ss_equals_case (ss_buffer (input + ofs, enddefine.length), enddefine))
1538 if (ofs + enddefine.length >= n)
1541 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1543 u8_mbtouc (&uc, u_input + ofs, n - ofs);
1544 return uc == '.' || !lex_uc_is_idn (uc);
1548 segmenter_parse_define_4__ (struct segmenter *s,
1549 const char *input, size_t n, bool eof,
1550 enum segment_type *type)
1554 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1557 else if (is_enddefine (input, ofs))
1559 s->state = S_GENERAL;
1560 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1561 return segmenter_push (s, input, n, eof, type);
1565 *type = SEG_MACRO_BODY;
1566 s->state = S_DEFINE_5;
1567 return input[ofs - 1] == '\n' ? 0 : ofs;
1572 segmenter_parse_define_5__ (struct segmenter *s,
1573 const char *input, size_t n, bool eof,
1574 enum segment_type *type)
1578 ofs = segmenter_parse_newline__ (input, n, eof, type);
1582 s->state = S_DEFINE_4;
1587 segmenter_parse_begin_data_1__ (struct segmenter *s,
1588 const char *input, size_t n, bool eof,
1589 enum segment_type *type)
1591 int ofs = segmenter_subparse (s, input, n, eof, type);
1595 if (*type == SEG_NEWLINE)
1596 s->state = S_BEGIN_DATA_2;
1602 segmenter_parse_begin_data_2__ (struct segmenter *s,
1603 const char *input, size_t n, bool eof,
1604 enum segment_type *type)
1606 int ofs = segmenter_subparse (s, input, n, eof, type);
1610 if (*type == SEG_NEWLINE)
1611 s->state = S_BEGIN_DATA_3;
1617 is_end_data (const char *input, size_t n)
1619 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1625 if (n < 4 || c_strncasecmp (input, "END", 3))
1629 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1630 if (!lex_uc_is_space (uc))
1634 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1641 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1648 else if (!lex_uc_is_space (uc))
1657 segmenter_parse_begin_data_3__ (struct segmenter *s,
1658 const char *input, size_t n, bool eof,
1659 enum segment_type *type)
1663 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1666 else if (is_end_data (input, ofs))
1668 s->state = S_GENERAL;
1669 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1670 return segmenter_push (s, input, n, eof, type);
1674 *type = SEG_INLINE_DATA;
1675 s->state = S_BEGIN_DATA_4;
1676 return input[ofs - 1] == '\n' ? 0 : ofs;
1681 segmenter_parse_begin_data_4__ (struct segmenter *s,
1682 const char *input, size_t n, bool eof,
1683 enum segment_type *type)
1687 ofs = segmenter_parse_newline__ (input, n, eof, type);
1691 s->state = S_BEGIN_DATA_3;
1696 segmenter_parse_title_1__ (struct segmenter *s,
1697 const char *input, size_t n, bool eof,
1698 enum segment_type *type)
1702 ofs = skip_spaces (input, n, eof, 0);
1705 s->state = S_TITLE_2;
1711 segmenter_parse_title_2__ (struct segmenter *s,
1712 const char *input, size_t n, bool eof,
1713 enum segment_type *type)
1725 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1739 if (!lex_uc_is_space (uc))
1750 s->state = S_GENERAL;
1752 *type = SEG_UNQUOTED_STRING;
1753 return endcmd >= 0 ? endcmd : ofs;
1759 /* Returns the name of segment TYPE as a string. The caller must not modify
1760 or free the returned string.
1762 This is useful only for debugging and testing. */
1764 segment_type_to_string (enum segment_type type)
1768 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1772 return "unknown segment type";
1776 /* Initializes S as a segmenter with the given syntax MODE.
1778 A segmenter does not contain any external references, so nothing needs to be
1779 done to destroy one. For the same reason, segmenters may be copied with
1780 plain struct assignment (or memcpy). */
1782 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1784 s->state = S_SHBANG;
1789 /* Returns the mode passed to segmenter_init() for S. */
1791 segmenter_get_mode (const struct segmenter *s)
1796 /* Attempts to label a prefix of S's remaining input with a segment type. The
1797 caller supplies the first N bytes of the remaining input as INPUT, which
1798 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1799 are the entire remainder of the input; if EOF is false, then further input
1800 is potentially available.
1802 The input may contain '\n' or '\r\n' line ends in any combination.
1804 If successful, returns the number of bytes in the segment at the beginning
1805 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1806 into *TYPE. The next call to segmenter_push() should not include those
1807 bytes as part of INPUT, because they have (figuratively) been consumed by
1810 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1811 be determined. In this case segmenter_push() returns -1. If more input is
1812 available, the caller should obtain some more, then call again with a larger
1813 N. If this is not enough, the process might need to repeat again and again.
1814 If input is exhausted, then the caller may call again setting EOF to true.
1815 segmenter_push() will never return -1 when EOF is true.
1817 The caller must not, in a sequence of calls, supply contradictory input.
1818 That is, bytes provided as part of INPUT in one call, but not consumed, must
1819 not be provided with *different* values on subsequent calls. This is
1820 because segmenter_push() must often make decisions based on looking ahead
1821 beyond the bytes that it consumes. */
1823 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1824 enum segment_type *type)
1840 return segmenter_parse_shbang__ (s, input, n, eof, type);
1843 return (s->substate & SS_START_OF_LINE
1844 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1845 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1848 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1850 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1853 return segmenter_parse_document_1__ (s, input, n, eof, type);
1855 return segmenter_parse_document_2__ (s, input, n, eof, type);
1857 return segmenter_parse_document_3__ (s, type);
1860 return segmenter_parse_file_label__ (s, input, n, eof, type);
1863 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1865 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1867 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1870 return segmenter_parse_define_1__ (s, input, n, eof, type);
1872 return segmenter_parse_define_2__ (s, input, n, eof, type);
1874 return segmenter_parse_define_3__ (s, input, n, eof, type);
1876 return segmenter_parse_define_4__ (s, input, n, eof, type);
1878 return segmenter_parse_define_5__ (s, input, n, eof, type);
1880 case S_BEGIN_DATA_1:
1881 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1882 case S_BEGIN_DATA_2:
1883 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1884 case S_BEGIN_DATA_3:
1885 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1886 case S_BEGIN_DATA_4:
1887 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1890 return segmenter_parse_title_1__ (s, input, n, eof, type);
1892 return segmenter_parse_title_2__ (s, input, n, eof, type);
1898 /* Returns the style of command prompt to display to an interactive user for
1899 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1900 and at the beginning of a line (that is, if segmenter_push() consumed as
1901 much as possible of the input up to a new-line). */
1903 segmenter_get_prompt (const struct segmenter *s)
1908 return PROMPT_FIRST;
1911 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1915 return PROMPT_COMMENT;
1919 return PROMPT_DOCUMENT;
1921 return PROMPT_FIRST;
1924 return PROMPT_LATER;
1928 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1930 return PROMPT_DO_REPEAT;
1935 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1938 return PROMPT_DEFINE;
1940 case S_BEGIN_DATA_1:
1941 return PROMPT_FIRST;
1942 case S_BEGIN_DATA_2:
1943 return PROMPT_LATER;
1944 case S_BEGIN_DATA_3:
1945 case S_BEGIN_DATA_4:
1950 return PROMPT_FIRST;