1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
58 #define SS_START_OF_LINE (1u << 0)
59 #define SS_START_OF_COMMAND (1u << 1)
61 static int segmenter_detect_command_name__ (const char *input,
62 size_t n, bool eof, int ofs);
65 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
68 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
76 mblen = u8_mbtoucr (puc, input, n);
80 return u8_mbtouc (puc, input, n);
91 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
92 bool eof, enum segment_type *type)
100 for (int ofs = 2; ; ofs++)
107 else if (input[ofs] == '\n')
109 if (input[ofs - 1] == '\r')
115 s->state = S_GENERAL;
116 s->substate = SS_START_OF_COMMAND;
126 s->state = S_GENERAL;
127 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
128 return segmenter_push (s, input, n, eof, type);
132 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
133 const char *input, size_t n, bool eof,
134 enum segment_type *type)
136 assert (s->state == S_GENERAL);
142 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
146 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
148 for (; ofs < n; ofs++)
150 if (input[ofs] == '\n')
152 else if (input[ofs] == '*')
155 return eof ? ofs + 1 : -1;
156 else if (input[ofs + 1] == '/')
160 return eof ? ofs : -1;
164 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
171 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
178 return eof ? ofs : -1;
179 else if (input[ofs + 1] != '*')
182 ofs = skip_comment (input, n, eof, ofs + 2);
186 else if (lex_uc_is_space (uc) && uc != '\n')
192 return eof ? ofs : -1;
196 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
200 else if (input[ofs] == '\n')
202 else if (input[ofs] == '\r')
206 return input[ofs + 1] == '\n';
213 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
215 ofs = skip_spaces_and_comments (input, n, eof, ofs);
219 return is_end_of_line (input, n, eof, ofs);
223 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
224 enum segment_type *type)
228 if (input[0] == '\n')
238 assert (input[0] == '\r');
239 assert (input[1] == '\n');
248 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
255 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
259 if (!lex_uc_is_space (uc) || uc == '\n')
265 return eof ? ofs : -1;
269 skip_digits (const char *input, size_t n, bool eof, int ofs)
271 for (; ofs < n; ofs++)
272 if (!c_isdigit (input[ofs]))
274 return eof ? ofs : -1;
278 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
279 bool eof, enum segment_type *type)
283 assert (s->state == S_GENERAL);
285 ofs = skip_digits (input, n, eof, 0);
295 if (input[ofs] == '.')
304 ofs = skip_digits (input, n, eof, ofs + 1);
311 if (input[ofs] == 'e' || input[ofs] == 'E')
318 goto expected_exponent;
321 if (input[ofs] == '+' || input[ofs] == '-')
328 goto expected_exponent;
332 if (!c_isdigit (input[ofs]))
333 goto expected_exponent;
335 ofs = skip_digits (input, n, eof, ofs);
340 if (input[ofs - 1] == '.')
342 int eol = at_end_of_line (input, n, eof, ofs);
355 *type = SEG_EXPECTED_EXPONENT;
361 is_reserved_word (const char *s, int n)
365 s0 = c_toupper (s[0]);
369 s1 = c_toupper (s[1]);
370 return ((s0 == 'B' && s1 == 'Y')
371 || (s0 == 'E' && s1 == 'Q')
372 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
373 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
374 || (s0 == 'N' && s1 == 'E')
375 || (s0 == 'O' && s1 == 'R')
376 || (s0 == 'T' && s1 == 'O'));
379 s1 = c_toupper (s[1]);
380 s2 = c_toupper (s[2]);
381 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
382 || (s1 == 'N' && s2 == 'D')))
383 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
386 s1 = c_toupper (s[1]);
387 s2 = c_toupper (s[2]);
388 s3 = c_toupper (s[3]);
389 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
397 segmenter_parse_comment_1__ (struct segmenter *s,
398 const char *input, size_t n, bool eof,
399 enum segment_type *type)
411 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
422 if (ofs > 1 && input[ofs - 1] == '\r')
426 /* Blank line ends comment command. */
427 s->state = S_GENERAL;
428 s->substate = SS_START_OF_COMMAND;
429 *type = SEG_SEPARATE_COMMANDS;
432 else if (endcmd >= 0)
434 /* '.' at end of line ends comment command. */
435 s->state = S_GENERAL;
437 *type = SEG_COMMENT_COMMAND;
442 /* Comment continues onto next line. */
443 *type = SEG_COMMENT_COMMAND;
444 s->state = S_COMMENT_2;
450 if (!lex_uc_is_space (uc))
461 s->state = S_GENERAL;
462 s->substate = SS_START_OF_COMMAND;
463 *type = SEG_SEPARATE_COMMANDS;
471 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
472 size_t n, bool eof, enum segment_type *type)
474 int ofs = segmenter_parse_newline__ (input, n, eof, type);
488 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
492 if (uc == '+' || uc == '-' || uc == '.')
494 else if (!lex_uc_is_space (uc))
497 case SEG_MODE_INTERACTIVE:
506 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
520 s->state = S_GENERAL;
521 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
524 s->state = S_COMMENT_1;
529 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
530 bool eof, enum segment_type *type)
542 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
553 if (ofs > 1 && input[ofs - 1] == '\r')
556 *type = SEG_DOCUMENT;
557 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
561 if (!lex_uc_is_space (uc))
570 *type = SEG_DOCUMENT;
571 s->state = S_DOCUMENT_3;
578 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
579 bool eof, enum segment_type *type)
583 ofs = segmenter_parse_newline__ (input, n, eof, type);
587 s->state = S_DOCUMENT_1;
592 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
594 *type = SEG_END_COMMAND;
595 s->state = S_GENERAL;
596 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
601 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
604 ofs = skip_spaces_and_comments (input, n, eof, ofs);
610 return c != '\'' && c != '"' && c != '\n';
620 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
621 bool eof, int ofs, char id[], size_t id_size)
623 struct segmenter sub;
625 assert (id_size > 0);
628 sub.state = S_GENERAL;
632 enum segment_type type;
635 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
651 if (retval < id_size)
653 memcpy (id, input + ofs, retval);
660 case SEG_QUOTED_STRING:
662 case SEG_UNICODE_STRING:
663 case SEG_UNQUOTED_STRING:
664 case SEG_RESERVED_WORD:
666 case SEG_COMMENT_COMMAND:
667 case SEG_DO_REPEAT_COMMAND:
668 case SEG_INLINE_DATA:
671 case SEG_START_DOCUMENT:
673 case SEG_START_COMMAND:
674 case SEG_SEPARATE_COMMANDS:
675 case SEG_END_COMMAND:
677 case SEG_EXPECTED_QUOTE:
678 case SEG_EXPECTED_EXPONENT:
679 case SEG_UNEXPECTED_CHAR:
687 /* Called when INPUT begins with a character that can start off an ID token. */
689 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
690 bool eof, enum segment_type *type)
696 assert (s->state == S_GENERAL);
698 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
710 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
713 else if (!lex_uc_is_idn (uc))
719 if (input[ofs - 1] == '.')
721 int eol = at_end_of_line (input, n, eof, ofs);
728 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
729 : input[0] == '!' ? SEG_MACRO_ID
732 if (s->substate & SS_START_OF_COMMAND)
734 struct substring word = ss_buffer (input, ofs);
736 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
738 s->state = S_COMMENT_1;
739 return segmenter_parse_comment_1__ (s, input, n, eof, type);
741 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
743 s->state = S_DOCUMENT_1;
744 *type = SEG_START_DOCUMENT;
747 else if (lex_id_match (ss_cstr ("TITLE"), word)
748 || lex_id_match (ss_cstr ("SUBTITLE"), word))
750 int result = segmenter_unquoted (input, n, eof, ofs);
755 s->state = S_TITLE_1;
759 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
761 s->state = S_DEFINE_1;
764 else if (lex_id_match (ss_cstr ("FILE"), word))
768 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
770 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
772 s->state = S_FILE_LABEL;
777 else if (lex_id_match (ss_cstr ("DO"), word))
781 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
783 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
785 s->state = S_DO_REPEAT_1;
790 else if (lex_id_match (ss_cstr ("BEGIN"), word))
795 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
798 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
802 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
808 else if (input[ofs2] == '.')
810 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
815 eol = is_end_of_line (input, n, eof, ofs2);
820 if (memchr (input, '\n', ofs2))
821 s->state = S_BEGIN_DATA_1;
823 s->state = S_BEGIN_DATA_2;
836 segmenter_parse_string__ (enum segment_type string_type,
837 int ofs, struct segmenter *s,
838 const char *input, size_t n, bool eof,
839 enum segment_type *type)
841 int quote = input[ofs];
845 if (input[ofs] == quote)
850 if (input[ofs] == quote)
863 else if (input[ofs] == '\n')
874 *type = SEG_EXPECTED_QUOTE;
880 segmenter_maybe_parse_string__ (enum segment_type string_type,
882 const char *input, size_t n, bool eof,
883 enum segment_type *type)
890 else if (input[1] == '\'' || input[1] == '"')
891 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
893 return segmenter_parse_id__ (s, input, n, eof, type);
897 segmenter_parse_mid_command__ (struct segmenter *s,
898 const char *input, size_t n, bool eof,
899 enum segment_type *type)
905 assert (s->state == S_GENERAL);
906 assert (!(s->substate & SS_START_OF_LINE));
908 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
915 s->substate |= SS_START_OF_LINE;
925 else if (input[1] == '*')
927 ofs = skip_comment (input, n, eof, 2);
939 case '(': case ')': case ',': case '=': case '-':
940 case '[': case ']': case '&': case '|': case '+':
946 if (s->substate & SS_START_OF_COMMAND)
948 /* '*' at the beginning of a command begins a comment. */
949 s->state = S_COMMENT_1;
950 return segmenter_parse_comment_1__ (s, input, n, eof, type);
953 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
956 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
959 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
962 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
970 else if (c_isdigit (input[1]))
971 return segmenter_parse_number__ (s, input, n, eof, type);
973 int eol = at_end_of_line (input, n, eof, 1);
979 *type = SEG_END_COMMAND;
980 s->substate = SS_START_OF_COMMAND;
986 case '0': case '1': case '2': case '3': case '4':
987 case '5': case '6': case '7': case '8': case '9':
988 return segmenter_parse_number__ (s, input, n, eof, type);
991 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
992 s, input, n, eof, type);
995 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
996 s, input, n, eof, type);
999 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1000 s, input, n, eof, type);
1003 return segmenter_parse_id__ (s, input, n, eof, type);
1006 if (lex_uc_is_space (uc))
1008 ofs = skip_spaces (input, n, eof, mblen);
1012 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1016 s->substate |= SS_START_OF_LINE;
1017 *type = SEG_NEWLINE;
1026 else if (lex_uc_is_id1 (uc))
1027 return segmenter_parse_id__ (s, input, n, eof, type);
1028 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1036 *type = SEG_UNEXPECTED_CHAR;
1044 compare_commands (const void *a_, const void *b_)
1046 const char *const *ap = a_;
1047 const char *const *bp = b_;
1048 const char *a = *ap;
1049 const char *b = *bp;
1051 return c_strcasecmp (a, b);
1054 static const char **
1055 segmenter_get_command_name_candidates (unsigned char first)
1057 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1058 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1059 static const char *commands[] =
1061 #include "language/command.def"
1064 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1070 static const char **cindex[UCHAR_MAX + 1];
1078 qsort (commands, n_commands, sizeof *commands, compare_commands);
1079 for (i = 0; i < n_commands; i++)
1081 unsigned char c = c_toupper (commands[i][0]);
1082 if (cindex[c] == NULL)
1083 cindex[c] = &commands[i];
1085 for (i = 0; i <= UCHAR_MAX; i++)
1086 if (cindex[i] == NULL)
1087 cindex[i] = &commands[n_commands];
1090 return cindex[c_toupper (first)];
1094 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1097 const char **commands;
1114 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1119 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1127 if (input[ofs - 1] == '.')
1130 for (commands = segmenter_get_command_name_candidates (input[0]);
1131 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1137 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1138 &exact, &missing_words)
1139 && missing_words <= 0)
1147 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1150 return eof ? 0 : -1;
1153 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1156 return eof ? 0 : -1;
1158 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1161 return c == '\'' || c == '"' || c == '\n';
1165 segmenter_parse_start_of_line__ (struct segmenter *s,
1166 const char *input, size_t n, bool eof,
1167 enum segment_type *type)
1173 assert (s->state == S_GENERAL);
1174 assert (s->substate & SS_START_OF_LINE);
1176 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1183 ofs = skip_spaces_and_comments (input, n, eof, 1);
1188 int is_string = is_start_of_string__ (input, n, eof, ofs);
1193 /* This is punctuation that may separate pieces of a string. */
1203 *type = SEG_START_COMMAND;
1204 s->substate = SS_START_OF_COMMAND;
1208 if (lex_uc_is_space (uc))
1210 int eol = at_end_of_line (input, n, eof, 0);
1215 s->substate = SS_START_OF_COMMAND;
1216 *type = SEG_SEPARATE_COMMANDS;
1222 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1224 else if (s->mode == SEG_MODE_AUTO)
1226 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1233 assert (s->mode == SEG_MODE_BATCH);
1235 s->substate = SS_START_OF_COMMAND;
1236 *type = SEG_START_COMMAND;
1240 s->substate = SS_START_OF_COMMAND;
1241 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1245 segmenter_parse_file_label__ (struct segmenter *s,
1246 const char *input, size_t n, bool eof,
1247 enum segment_type *type)
1249 struct segmenter sub;
1253 sub.state = S_GENERAL;
1254 ofs = segmenter_push (&sub, input, n, eof, type);
1258 else if (*type == SEG_IDENTIFIER)
1262 assert (lex_id_match (ss_cstr ("LABEL"),
1263 ss_buffer ((char *) input, ofs)));
1264 result = segmenter_unquoted (input, n, eof, ofs);
1270 s->state = S_TITLE_1;
1278 s->substate = sub.substate;
1284 segmenter_subparse (struct segmenter *s,
1285 const char *input, size_t n, bool eof,
1286 enum segment_type *type)
1288 struct segmenter sub;
1292 sub.state = S_GENERAL;
1293 sub.substate = s->substate;
1294 ofs = segmenter_push (&sub, input, n, eof, type);
1295 s->substate = sub.substate;
1299 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1300 defines the stand-in variables (the head) before the lines of syntax to be
1301 repeated (the body). */
1303 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1304 const char *input, size_t n, bool eof,
1305 enum segment_type *type)
1307 int ofs = segmenter_subparse (s, input, n, eof, type);
1311 if (*type == SEG_SEPARATE_COMMANDS)
1313 /* We reached a blank line that separates the head from the body. */
1314 s->state = S_DO_REPEAT_2;
1316 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1318 /* We reached the body. */
1319 s->state = S_DO_REPEAT_3;
1326 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1327 separates the head from the body. */
1329 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1330 const char *input, size_t n, bool eof,
1331 enum segment_type *type)
1333 int ofs = segmenter_subparse (s, input, n, eof, type);
1337 if (*type == SEG_NEWLINE)
1339 /* We reached the body. */
1340 s->state = S_DO_REPEAT_3;
1348 check_repeat_command (struct segmenter *s,
1349 const char *input, size_t n, bool eof)
1356 if (input[ofs] == '+' || input[ofs] == '-')
1359 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1362 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1364 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1369 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1373 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1374 s->substate += direction;
1379 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1380 enum segment_type *type)
1382 const char *newline = memchr (input, '\n', n);
1384 return eof ? n : -1;
1386 ptrdiff_t ofs = newline - input;
1387 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1389 *type = SEG_NEWLINE;
1393 return ofs - (input[ofs - 1] == '\r');
1396 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1397 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1399 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1400 the lines we're segmenting. s->substate counts the nesting level, starting
1403 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1404 const char *input, size_t n, bool eof,
1405 enum segment_type *type)
1409 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1410 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1412 else if (!check_repeat_command (s, input, n, eof) && !eof)
1414 else if (s->substate == 0)
1416 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1418 s->state = S_GENERAL;
1419 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1420 return segmenter_push (s, input, n, eof, type);
1424 *type = SEG_DO_REPEAT_COMMAND;
1429 /* We are segmenting a DEFINE command, which consists of:
1431 - The DEFINE keyword.
1435 - "(" followed by a sequence of tokens possibly including balanced parentheses
1440 - A sequence of lines that don't start with "!ENDDEFINE", one string per line,
1441 each ending in a newline.
1447 segmenter_parse_define_1__ (struct segmenter *s,
1448 const char *input, size_t n, bool eof,
1449 enum segment_type *type)
1451 int ofs = segmenter_subparse (s, input, n, eof, type);
1455 if (*type == SEG_SEPARATE_COMMANDS
1456 || *type == SEG_END_COMMAND
1457 || *type == SEG_START_COMMAND)
1459 /* The DEFINE command is malformed because we reached its end without
1460 ever hitting a "(" token. Transition back to general parsing. */
1461 s->state = S_GENERAL;
1464 else if (*type == SEG_PUNCT && input[0] == '(')
1466 s->state = S_DEFINE_2;
1475 segmenter_parse_define_2__ (struct segmenter *s,
1476 const char *input, size_t n, bool eof,
1477 enum segment_type *type)
1479 int ofs = segmenter_subparse (s, input, n, eof, type);
1483 if (*type == SEG_SEPARATE_COMMANDS
1484 || *type == SEG_END_COMMAND
1485 || *type == SEG_START_COMMAND)
1487 /* The DEFINE command is malformed because we reached its end before
1488 closing the set of parentheses. Transition back to general
1490 s->state = S_GENERAL;
1493 else if (*type == SEG_PUNCT && input[0] == '(')
1498 else if (*type == SEG_PUNCT && input[0] == ')')
1502 s->state = S_DEFINE_3;
1510 segmenter_parse_define_3__ (struct segmenter *s,
1511 const char *input, size_t n, bool eof,
1512 enum segment_type *type)
1514 int ofs = segmenter_subparse (s, input, n, eof, type);
1518 if (*type == SEG_END_COMMAND)
1520 /* The DEFINE command is malformed because there was a command terminator
1521 before the first line of the body. Transition back to general
1523 s->state = S_GENERAL;
1526 else if (*type == SEG_NEWLINE)
1527 s->state = S_DEFINE_4;
/* Tests whether the first N bytes of INPUT begin, after any leading spaces
   and comments, with the keyword "!ENDDEFINE".  segmenter_parse_define_4__()
   calls this on each candidate line of a DEFINE body to decide whether the
   body has ended.

   INPUT: the bytes of the line to examine.
   N: the number of bytes of INPUT to consider.

   The final return is true when the code point right after the keyword is
   '.' or is rejected by lex_uc_is_idn() (presumably: cannot continue an
   identifier -- confirm against data/identifier.h), so that a longer word
   that merely starts with "!ENDDEFINE" does not count as the keyword.

   NOTE(review): the statements controlled by the three `if' tests below are
   not visible in this view of the file; they are left untouched. */
1533 is_enddefine (const char *input, size_t n)
/* Passing EOF as true treats the N bytes as all the input there is for this
   check. */
1535 int ofs = skip_spaces_and_comments (input, n, true, 0);
1538 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
/* Fewer bytes remain than the keyword is long. */
1539 if (n - ofs < enddefine.length)
/* The bytes at OFS are not the keyword according to ss_equals_case()
   (presumably a case-insensitive comparison -- confirm). */
1542 if (!ss_equals_case (ss_buffer (input + ofs, enddefine.length), enddefine))
/* The keyword runs right up to the end of the N bytes, so there is no
   following code point to inspect. */
1545 if (ofs + enddefine.length >= n)
1548 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
/* Decode the code point that FOLLOWS the keyword.  Decoding at OFS itself
   would always yield the keyword's own leading '!', which makes the test
   below independent of what actually comes after "!ENDDEFINE".  The test on
   OFS + enddefine.length above guarantees at least one byte remains here. */
1550 u8_mbtouc (&uc, u_input + ofs + enddefine.length, n - ofs - enddefine.length);
1551 return uc == '.' || !lex_uc_is_idn (uc);
1555 segmenter_parse_define_4__ (struct segmenter *s,
1556 const char *input, size_t n, bool eof,
1557 enum segment_type *type)
1561 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1564 else if (is_enddefine (input, ofs))
1566 s->state = S_GENERAL;
1567 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1568 return segmenter_push (s, input, n, eof, type);
1572 *type = SEG_MACRO_BODY;
1573 s->state = S_DEFINE_5;
1574 return input[ofs - 1] == '\n' ? 0 : ofs;
1579 segmenter_parse_define_5__ (struct segmenter *s,
1580 const char *input, size_t n, bool eof,
1581 enum segment_type *type)
1585 ofs = segmenter_parse_newline__ (input, n, eof, type);
1589 s->state = S_DEFINE_4;
1594 segmenter_parse_begin_data_1__ (struct segmenter *s,
1595 const char *input, size_t n, bool eof,
1596 enum segment_type *type)
1598 int ofs = segmenter_subparse (s, input, n, eof, type);
1602 if (*type == SEG_NEWLINE)
1603 s->state = S_BEGIN_DATA_2;
1609 segmenter_parse_begin_data_2__ (struct segmenter *s,
1610 const char *input, size_t n, bool eof,
1611 enum segment_type *type)
1613 int ofs = segmenter_subparse (s, input, n, eof, type);
1617 if (*type == SEG_NEWLINE)
1618 s->state = S_BEGIN_DATA_3;
1624 is_end_data (const char *input, size_t n)
1626 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1632 if (n < 4 || c_strncasecmp (input, "END", 3))
1636 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1637 if (!lex_uc_is_space (uc))
1641 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1648 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1655 else if (!lex_uc_is_space (uc))
1664 segmenter_parse_begin_data_3__ (struct segmenter *s,
1665 const char *input, size_t n, bool eof,
1666 enum segment_type *type)
1670 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1673 else if (is_end_data (input, ofs))
1675 s->state = S_GENERAL;
1676 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1677 return segmenter_push (s, input, n, eof, type);
1681 *type = SEG_INLINE_DATA;
1682 s->state = S_BEGIN_DATA_4;
1683 return input[ofs - 1] == '\n' ? 0 : ofs;
1688 segmenter_parse_begin_data_4__ (struct segmenter *s,
1689 const char *input, size_t n, bool eof,
1690 enum segment_type *type)
1694 ofs = segmenter_parse_newline__ (input, n, eof, type);
1698 s->state = S_BEGIN_DATA_3;
1703 segmenter_parse_title_1__ (struct segmenter *s,
1704 const char *input, size_t n, bool eof,
1705 enum segment_type *type)
1709 ofs = skip_spaces (input, n, eof, 0);
1712 s->state = S_TITLE_2;
1718 segmenter_parse_title_2__ (struct segmenter *s,
1719 const char *input, size_t n, bool eof,
1720 enum segment_type *type)
1732 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1746 if (!lex_uc_is_space (uc))
1757 s->state = S_GENERAL;
1759 *type = SEG_UNQUOTED_STRING;
1760 return endcmd >= 0 ? endcmd : ofs;
1766 /* Returns the name of segment TYPE as a string. The caller must not modify
1767 or free the returned string.
1769 This is useful only for debugging and testing. */
1771 segment_type_to_string (enum segment_type type)
1775 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1779 return "unknown segment type";
1783 /* Initializes S as a segmenter with the given syntax MODE.
1785 A segmenter does not contain any external references, so nothing needs to be
1786 done to destroy one. For the same reason, segmenters may be copied with
1787 plain struct assignment (or memcpy). */
1789 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1791 s->state = S_SHBANG;
1796 /* Returns the mode passed to segmenter_init() for S. */
1798 segmenter_get_mode (const struct segmenter *s)
1803 /* Attempts to label a prefix of S's remaining input with a segment type. The
1804 caller supplies the first N bytes of the remaining input as INPUT, which
1805 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1806 are the entire (remainder of the) input; if EOF is false, then further input
1807 is potentially available.
1809 The input may contain '\n' or '\r\n' line ends in any combination.
1811 If successful, returns the number of bytes in the segment at the beginning
1812 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1813 into *TYPE. The next call to segmenter_push() should not include those
1814 bytes as part of INPUT, because they have (figuratively) been consumed by
1817 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1818 be determined. In this case segmenter_push() returns -1. If more input is
1819 available, the caller should obtain some more, then call again with a larger
1820 N. If this is not enough, the process might need to repeat again and again.
1821 If input is exhausted, then the caller may call again setting EOF to true.
1822 segmenter_push() will never return -1 when EOF is true.
1824 The caller must not, in a sequence of calls, supply contradictory input.
1825 That is, bytes provided as part of INPUT in one call, but not consumed, must
1826 not be provided with *different* values on subsequent calls. This is
1827 because segmenter_push() must often make decisions based on looking ahead
1828 beyond the bytes that it consumes. */
1830 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1831 enum segment_type *type)
1847 return segmenter_parse_shbang__ (s, input, n, eof, type);
1850 return (s->substate & SS_START_OF_LINE
1851 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1852 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1855 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1857 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1860 return segmenter_parse_document_1__ (s, input, n, eof, type);
1862 return segmenter_parse_document_2__ (s, input, n, eof, type);
1864 return segmenter_parse_document_3__ (s, type);
1867 return segmenter_parse_file_label__ (s, input, n, eof, type);
1870 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1872 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1874 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1877 return segmenter_parse_define_1__ (s, input, n, eof, type);
1879 return segmenter_parse_define_2__ (s, input, n, eof, type);
1881 return segmenter_parse_define_3__ (s, input, n, eof, type);
1883 return segmenter_parse_define_4__ (s, input, n, eof, type);
1885 return segmenter_parse_define_5__ (s, input, n, eof, type);
1887 case S_BEGIN_DATA_1:
1888 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1889 case S_BEGIN_DATA_2:
1890 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1891 case S_BEGIN_DATA_3:
1892 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1893 case S_BEGIN_DATA_4:
1894 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1897 return segmenter_parse_title_1__ (s, input, n, eof, type);
1899 return segmenter_parse_title_2__ (s, input, n, eof, type);
1905 /* Returns the style of command prompt to display to an interactive user for
1906 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1907 and at the beginning of a line (that is, if segmenter_push() consumed as
1908 much as possible of the input up to a new-line). */
1910 segmenter_get_prompt (const struct segmenter *s)
1915 return PROMPT_FIRST;
1918 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1922 return PROMPT_COMMENT;
1926 return PROMPT_DOCUMENT;
1928 return PROMPT_FIRST;
1931 return PROMPT_LATER;
1935 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1937 return PROMPT_DO_REPEAT;
1942 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1945 return PROMPT_DEFINE;
1947 case S_BEGIN_DATA_1:
1948 return PROMPT_FIRST;
1949 case S_BEGIN_DATA_2:
1950 return PROMPT_LATER;
1951 case S_BEGIN_DATA_3:
1952 case S_BEGIN_DATA_4:
1957 return PROMPT_FIRST;