1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
/* Flag bits kept in a segmenter's `substate` member; the code below tests
   and sets them with `&` and `|` while the state is S_GENERAL. */
53 #define SS_START_OF_LINE (1u << 0)
54 #define SS_START_OF_COMMAND (1u << 1)
/* Forward declaration: the definition appears later in this file, after
   its first callers. */
56 static int segmenter_detect_command_name__ (const char *input,
57 size_t n, bool eof, int ofs);
/* Decodes a UTF-8 character from INPUT_ (viewed as unsigned bytes) and stores
   its code point in *PUC.  Both u8_mbtoucr() and u8_mbtouc() are used, on
   different paths.
   NOTE(review): presumably EOF selects between the two decoders and a
   negative result means "need more input", as in the other helpers in this
   file -- confirm. */
60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
71 mblen = u8_mbtoucr (puc, input, n);
75 return u8_mbtouc (puc, input, n);
/* Handles state S_SHBANG, the state that segmenter_init() installs.  Scans
   from offset 2 for the '\n' that ends the first line, allowing for a
   preceding '\r'.  If the N bytes run out without a new-line, the result is
   OFS at EOF and -1 (more input needed) otherwise.
   NOTE(review): offset 2 presumably skips a leading "#!" -- confirm. */
86 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
87 bool eof, enum segment_type *type)
97 for (ofs = 2; ofs < n; ofs++)
98 if (input[ofs] == '\n')
100 if (input[ofs] == '\n' && input[ofs - 1] == '\r')
103 s->state = S_GENERAL;
104 s->substate = SS_START_OF_COMMAND;
109 return eof ? ofs : -1;
/* Other path: switch to general state at the start of a line and command,
   then let segmenter_push() segment the same INPUT. */
116 s->state = S_GENERAL;
117 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
118 return segmenter_push (s, input, n, eof, type);
/* Segments a punctuator that is one or two bytes long.  SECONDS lists the
   characters that may follow input[0] to form a two-byte token: the length
   is 2 when input[1] occurs in SECONDS and 1 otherwise.  May be called only
   in state S_GENERAL (asserted). */
122 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
123 const char *input, size_t n, bool eof,
124 enum segment_type *type)
126 assert (s->state == S_GENERAL);
132 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
/* Scans INPUT forward from OFS through the body of a slash-star comment.
   The loop reacts to a new-line and to a '*' that is immediately followed
   by '/'.  When the N bytes run out first, the result is an offset if EOF
   is true and -1 (more input needed) otherwise. */
136 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
138 for (; ofs < n; ofs++)
140 if (input[ofs] == '\n')
142 else if (input[ofs] == '*')
145 return eof ? ofs + 1 : -1;
146 else if (input[ofs + 1] == '/')
150 return eof ? ofs : -1;
/* Advances OFS past white space other than new-line and past slash-star
   comments (via skip_comment()).  Returns -1 when more input is needed and
   EOF is false. */
154 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
161 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
168 return eof ? ofs : -1;
169 else if (input[ofs + 1] != '*')
/* The comment body starts two bytes on, after the opening slash-star. */
172 ofs = skip_comment (input, n, eof, ofs + 2);
176 else if (lex_uc_is_space (uc) && uc != '\n')
182 return eof ? ofs : -1;
/* Tests whether a line end begins at offset OFS in INPUT: either '\n', or
   '\r' immediately followed by '\n'. */
186 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
190 else if (input[ofs] == '\n')
192 else if (input[ofs] == '\r')
196 return input[ofs + 1] == '\n';
/* Like is_end_of_line(), but first skips any white space and comments that
   start at OFS. */
203 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
205 ofs = skip_spaces_and_comments (input, n, eof, ofs);
209 return is_end_of_line (input, n, eof, ofs);
/* Segments the line end at the start of INPUT.  INPUT must begin with
   either '\n' or "\r\n"; the second form is enforced by assertions. */
213 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
214 enum segment_type *type)
218 if (input[0] == '\n')
228 assert (input[0] == '\r');
229 assert (input[1] == '\n');
/* Advances OFS past white space, stopping at the first character that is
   not a space or that is a new-line.  When the N bytes run out first, the
   result is OFS at EOF and -1 (more input needed) otherwise. */
238 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
245 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
249 if (!lex_uc_is_space (uc) || uc == '\n')
255 return eof ? ofs : -1;
/* Advances OFS past a run of ASCII decimal digits.  When all N bytes are
   digits, the result is OFS at EOF and -1 (more input needed) otherwise. */
259 skip_digits (const char *input, size_t n, bool eof, int ofs)
261 for (; ofs < n; ofs++)
262 if (!c_isdigit (input[ofs]))
264 return eof ? ofs : -1;
/* Segments a number at the start of INPUT; valid only in state S_GENERAL
   (asserted).  Scans in order: integer digits, an optional '.' and fraction
   digits, then an optional exponent made of 'e' or 'E', an optional sign,
   and digits.  An exponent marker without a following digit jumps to
   `expected_exponent', which reports SEG_EXPECTED_EXPONENT. */
268 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
269 bool eof, enum segment_type *type)
273 assert (s->state == S_GENERAL);
275 ofs = skip_digits (input, n, eof, 0);
285 if (input[ofs] == '.')
287 ofs = skip_digits (input, n, eof, ofs + 1);
298 if (input[ofs] == 'e' || input[ofs] == 'E')
305 goto expected_exponent;
308 if (input[ofs] == '+' || input[ofs] == '-')
315 goto expected_exponent;
319 if (!c_isdigit (input[ofs]))
320 goto expected_exponent;
322 ofs = skip_digits (input, n, eof, ofs);
/* A trailing '.' needs a look at the rest of the line.
   NOTE(review): presumably to tell a decimal point from an end-of-command
   dot -- confirm. */
327 if (input[ofs - 1] == '.')
329 int eol = at_end_of_line (input, n, eof, ofs);
342 *type = SEG_EXPECTED_EXPONENT;
/* Tests whether the N bytes at S spell a reserved word, ignoring ASCII case.
   The comparisons below cover the two-letter words BY, EQ, GE, GT, LE, LT,
   NE, OR and TO, the three-letter words ALL, AND and NOT, and the
   four-letter word WITH. */
348 is_reserved_word (const char *s, int n)
352 s0 = c_toupper (s[0]);
/* Two-letter words. */
356 s1 = c_toupper (s[1]);
357 return ((s0 == 'B' && s1 == 'Y')
358 || (s0 == 'E' && s1 == 'Q')
359 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
360 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
361 || (s0 == 'N' && s1 == 'E')
362 || (s0 == 'O' && s1 == 'R')
363 || (s0 == 'T' && s1 == 'O'));
/* Three-letter words. */
366 s1 = c_toupper (s[1]);
367 s2 = c_toupper (s[2]);
368 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
369 || (s1 == 'N' && s2 == 'D')))
370 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
/* Four-letter word. */
373 s1 = c_toupper (s[1]);
374 s2 = c_toupper (s[2]);
375 s3 = c_toupper (s[3]);
376 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
/* Handles state S_COMMENT_1: segments one line of a comment command.  Scans
   INPUT one character at a time.  At the line end it excludes a '\r' that
   precedes the new-line, then chooses among the three outcomes documented
   inline below. */
384 segmenter_parse_comment_1__ (struct segmenter *s,
385 const char *input, size_t n, bool eof,
386 enum segment_type *type)
398 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
409 if (ofs > 1 && input[ofs - 1] == '\r')
413 /* Blank line ends comment command. */
414 s->state = S_GENERAL;
415 s->substate = SS_START_OF_COMMAND;
416 *type = SEG_SEPARATE_COMMANDS;
419 else if (endcmd >= 0)
421 /* '.' at end of line ends comment command. */
422 s->state = S_GENERAL;
424 *type = SEG_COMMENT_COMMAND;
429 /* Comment continues onto next line. */
430 *type = SEG_COMMENT_COMMAND;
431 s->state = S_COMMENT_2;
437 if (!lex_uc_is_space (uc))
/* Remaining path: return to general state and report
   SEG_SEPARATE_COMMANDS. */
448 s->state = S_GENERAL;
449 s->substate = SS_START_OF_COMMAND;
450 *type = SEG_SEPARATE_COMMANDS;
/* Handles state S_COMMENT_2: consumes the new-line that follows a comment
   line, then looks at the start of the next line to choose the next state.
   It checks for a leading '+', '-' or '.', for a non-space character, for
   the syntax mode, and for a command name (via
   segmenter_detect_command_name__()).  The next state is either S_GENERAL
   at the start of a line and command, or S_COMMENT_1. */
458 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
459 size_t n, bool eof, enum segment_type *type)
461 int ofs = segmenter_parse_newline__ (input, n, eof, type);
475 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
479 if (uc == '+' || uc == '-' || uc == '.')
481 else if (!lex_uc_is_space (uc))
484 case SEG_MODE_INTERACTIVE:
493 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
507 s->state = S_GENERAL;
508 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
511 s->state = S_COMMENT_1;
/* Handles state S_DOCUMENT_1: segments one line of document text as
   SEG_DOCUMENT.  At the line end a '\r' before the new-line is excluded,
   and the next state is S_DOCUMENT_3 if END_CMD is set, S_DOCUMENT_2
   otherwise. */
516 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
517 bool eof, enum segment_type *type)
529 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
540 if (ofs > 1 && input[ofs - 1] == '\r')
543 *type = SEG_DOCUMENT;
544 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
548 if (!lex_uc_is_space (uc))
/* Remaining path: report SEG_DOCUMENT and go straight to S_DOCUMENT_3. */
557 *type = SEG_DOCUMENT;
558 s->state = S_DOCUMENT_3;
/* Handles state S_DOCUMENT_2: consumes the new-line between document lines
   and goes back to S_DOCUMENT_1 for the next line. */
565 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
566 bool eof, enum segment_type *type)
570 ofs = segmenter_parse_newline__ (input, n, eof, type);
574 s->state = S_DOCUMENT_1;
/* Handles state S_DOCUMENT_3: reports SEG_END_COMMAND and returns the
   segmenter to S_GENERAL at the start of a line and of a command.  It takes
   no INPUT argument, so it does not inspect any input bytes. */
579 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
581 *type = SEG_END_COMMAND;
582 s->state = S_GENERAL;
583 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
/* Skips white space and comments from OFS, then reports whether the
   character C found there is something other than a single quote, a double
   quote, or a new-line. */
588 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
591 ofs = skip_spaces_and_comments (input, n, eof, ofs);
597 return c != '\'' && c != '"' && c != '\n';
/* Looks ahead from OFS for the next identifier, using a scratch segmenter
   SUB in state S_GENERAL so that S itself is not modified (S is const).
   Segments are pulled with segmenter_push() and classified by the switch
   below.  An identifier is copied into ID only if it is shorter than
   ID_SIZE, which must be positive (asserted). */
607 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
608 bool eof, int ofs, char id[], size_t id_size)
610 struct segmenter sub;
612 assert (id_size > 0);
615 sub.state = S_GENERAL;
619 enum segment_type type;
622 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
/* Copy only when the identifier is shorter than the buffer. */
638 if (retval < id_size)
640 memcpy (id, input + ofs, retval);
647 case SEG_QUOTED_STRING:
649 case SEG_UNICODE_STRING:
650 case SEG_UNQUOTED_STRING:
651 case SEG_RESERVED_WORD:
653 case SEG_COMMENT_COMMAND:
654 case SEG_DO_REPEAT_COMMAND:
655 case SEG_INLINE_DATA:
656 case SEG_START_DOCUMENT:
658 case SEG_START_COMMAND:
659 case SEG_SEPARATE_COMMANDS:
660 case SEG_END_COMMAND:
662 case SEG_EXPECTED_QUOTE:
663 case SEG_EXPECTED_EXPONENT:
664 case SEG_UNEXPECTED_DOT:
665 case SEG_UNEXPECTED_CHAR:
673 /* Called when INPUT begins with a character that can start off an ID token. */
/* Valid only in state S_GENERAL (asserted).  Scans identifier characters,
   checks a trailing '.' against the end of the line, and labels the word
   SEG_RESERVED_WORD or SEG_IDENTIFIER.  At the start of a command it also
   recognizes words that switch the segmenter into another state: COMMENT,
   DOCUMENT, TITLE and SUBTITLE, FILE followed by LABEL, DO followed by
   REPEAT, and BEGIN followed by DATA. */
675 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
676 bool eof, enum segment_type *type)
682 assert (s->state == S_GENERAL);
684 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
696 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
699 else if (!lex_uc_is_idn (uc))
705 if (input[ofs - 1] == '.')
707 int eol = at_end_of_line (input, n, eof, ofs);
714 if (is_reserved_word (input, ofs))
715 *type = SEG_RESERVED_WORD;
717 *type = SEG_IDENTIFIER;
719 if (s->substate & SS_START_OF_COMMAND)
721 struct substring word = ss_buffer (input, ofs);
/* NOTE(review): the 4 presumably sets a minimum abbreviation length for
   COMMENT -- confirm against lex_id_match_n(). */
723 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
725 s->state = S_COMMENT_1;
726 return segmenter_parse_comment_1__ (s, input, n, eof, type);
728 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
730 s->state = S_DOCUMENT_1;
731 *type = SEG_START_DOCUMENT;
734 else if (lex_id_match (ss_cstr ("TITLE"), word)
735 || lex_id_match (ss_cstr ("SUBTITLE"), word))
737 int result = segmenter_unquoted (input, n, eof, ofs);
742 s->state = S_TITLE_1;
746 else if (lex_id_match (ss_cstr ("FILE"), word))
750 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
752 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
754 s->state = S_FILE_LABEL;
759 else if (lex_id_match (ss_cstr ("DO"), word))
763 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
765 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
767 s->state = S_DO_REPEAT_1;
772 else if (lex_id_match (ss_cstr ("BEGIN"), word))
777 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
780 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
784 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
790 else if (input[ofs2] == '.')
792 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
797 eol = is_end_of_line (input, n, eof, ofs2);
/* The next state depends on whether a new-line occurs within the first OFS2
   bytes of INPUT. */
802 if (memchr (input, '\n', ofs2))
803 s->state = S_BEGIN_DATA_1;
805 s->state = S_BEGIN_DATA_2;
/* Segments a quoted string whose opening quote character is at input[OFS];
   that character is remembered in QUOTE and later bytes are compared with
   it.  A new-line inside the string is also tested for, and one path
   reports SEG_EXPECTED_QUOTE.
   NOTE(review): the two successive comparisons with QUOTE presumably handle
   a doubled quote inside the string -- confirm. */
818 segmenter_parse_string__ (enum segment_type string_type,
819 int ofs, struct segmenter *s,
820 const char *input, size_t n, bool eof,
821 enum segment_type *type)
823 int quote = input[ofs];
827 if (input[ofs] == quote)
832 if (input[ofs] == quote)
845 else if (input[ofs] == '\n')
856 *type = SEG_EXPECTED_QUOTE;
/* Decides whether INPUT starts a prefixed string or an identifier.  If
   input[1] is a single or double quote, the text is segmented as a string
   of STRING_TYPE whose quote is at offset 1; otherwise it is handed to
   segmenter_parse_id__(). */
862 segmenter_maybe_parse_string__ (enum segment_type string_type,
864 const char *input, size_t n, bool eof,
865 enum segment_type *type)
872 else if (input[1] == '\'' || input[1] == '"')
873 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
875 return segmenter_parse_id__ (s, input, n, eof, type);
/* Segments INPUT in state S_GENERAL when SS_START_OF_LINE is clear (both
   asserted).  Decodes the first character and dispatches on it:
   slash-star comments are skipped with skip_comment(); punctuation is
   reported directly or through segmenter_parse_digraph__(); a '*' at the
   start of a command begins a comment command; digits begin numbers;
   quotes and string prefixes begin strings; white space is skipped with
   skip_spaces(); identifier-start characters go to segmenter_parse_id__();
   other characters are reported as SEG_UNEXPECTED_CHAR.  Paths that reach a
   new-line set SS_START_OF_LINE again. */
879 segmenter_parse_mid_command__ (struct segmenter *s,
880 const char *input, size_t n, bool eof,
881 enum segment_type *type)
887 assert (s->state == S_GENERAL);
888 assert (!(s->substate & SS_START_OF_LINE));
890 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
897 s->substate |= SS_START_OF_LINE;
907 else if (input[1] == '*')
909 ofs = skip_comment (input, n, eof, 2);
921 case '(': case ')': case ',': case '=': case '-':
922 case '[': case ']': case '&': case '|': case '+':
928 if (s->substate & SS_START_OF_COMMAND)
930 /* '*' at the beginning of a command begins a comment. */
931 s->state = S_COMMENT_1;
932 return segmenter_parse_comment_1__ (s, input, n, eof, type);
935 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
938 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
941 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
944 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
/* A digit in input[1] makes this a number; otherwise the rest of the line
   decides between SEG_END_COMMAND and SEG_UNEXPECTED_DOT. */
952 else if (c_isdigit (input[1]))
953 return segmenter_parse_number__ (s, input, n, eof, type);
955 int eol = at_end_of_line (input, n, eof, 1);
961 *type = SEG_END_COMMAND;
962 s->substate = SS_START_OF_COMMAND;
965 *type = SEG_UNEXPECTED_DOT;
968 case '0': case '1': case '2': case '3': case '4':
969 case '5': case '6': case '7': case '8': case '9':
970 return segmenter_parse_number__ (s, input, n, eof, type);
973 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
974 s, input, n, eof, type);
977 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
978 s, input, n, eof, type);
981 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
982 s, input, n, eof, type);
985 if (lex_uc_is_space (uc))
987 ofs = skip_spaces (input, n, eof, mblen);
/* The run of spaces ended at a new-line preceded by '\r'. */
991 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
995 s->substate |= SS_START_OF_LINE;
1005 else if (lex_uc_is_id1 (uc))
1006 return segmenter_parse_id__ (s, input, n, eof, type);
1009 *type = SEG_UNEXPECTED_CHAR;
/* Comparison callback passed to qsort() for the array of command names.
   A_ and B_ each point to an element of that array, that is, to a
   `const char *'.  Orders the names with c_strcasecmp(), so ASCII case is
   ignored. */
1017 compare_commands (const void *a_, const void *b_)
1019 const char *const *ap = a_;
1020 const char *const *bp = b_;
1021 const char *a = *ap;
1022 const char *b = *bp;
1024 return c_strcasecmp (a, b);
/* Returns a pointer into a sorted, static array of command names, at the
   first name whose initial letter equals FIRST (compared in upper case).

   The array is built by expanding DEF_CMD and UNIMPL_CMD to just the NAME
   field while including "language/command.def".  It is sorted without
   regard to case, then CINDEX records, for each possible first byte, where
   that byte's names begin.  Bytes that begin no command name point at
   commands[n_commands], one past the last counted name.
   NOTE(review): N_COMMANDS is the array size minus 1, so the final element
   is presumably a terminator -- confirm. */
1027 static const char **
1028 segmenter_get_command_name_candidates (unsigned char first)
1030 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1031 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1032 static const char *commands[] =
1034 #include "language/command.def"
1037 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1043 static const char **cindex[UCHAR_MAX + 1];
1051 qsort (commands, n_commands, sizeof *commands, compare_commands);
/* Record only the first name for each initial letter. */
1052 for (i = 0; i < n_commands; i++)
1054 unsigned char c = c_toupper (commands[i][0]);
1055 if (cindex[c] == NULL)
1056 cindex[c] = &commands[i];
/* Give every remaining slot a non-null value. */
1058 for (i = 0; i <= UCHAR_MAX; i++)
1059 if (cindex[i] == NULL)
1060 cindex[i] = &commands[n_commands];
1063 return cindex[c_toupper (first)];
/* Checks whether the text being examined begins with a command name.  Scans
   a run of characters that are white space, identifier characters, or '-',
   and treats a trailing '.' specially.  The scanned text is then compared
   with command_match() against each candidate name that shares its first
   letter (ignoring case).  A candidate counts only when command_match()
   succeeds with MISSING_WORDS no greater than zero. */
1067 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1070 const char **commands;
1087 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1092 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1100 if (input[ofs - 1] == '.')
/* Walk the sorted candidates for as long as the first letter still
   matches. */
1103 for (commands = segmenter_get_command_name_candidates (input[0]);
1104 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1110 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1111 &exact, &missing_words)
1112 && missing_words <= 0)
/* Three-way test of whether a string (or a new-line) starts at offset OFS.
   Returns -1 if more input is needed and EOF is false, 0 if the input ends
   first at EOF, and otherwise the result of the character test.  For a
   leading 'x', 'X', 'u' or 'U' the following byte must be a quote; any
   other character C must itself be a single quote, a double quote, or a
   new-line. */
1120 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1123 return eof ? 0 : -1;
1126 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1129 return eof ? 0 : -1;
1131 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1134 return c == '\'' || c == '"' || c == '\n';
/* Segments INPUT in state S_GENERAL when SS_START_OF_LINE is set (both
   asserted).  Decides whether the new line starts a new command: one branch
   reports SEG_START_COMMAND, a blank line reports SEG_SEPARATE_COMMANDS,
   and the remaining tests depend on the syntax mode.  On the final path the
   substate is set to SS_START_OF_COMMAND and the rest of the line is
   handled by segmenter_parse_mid_command__(). */
1138 segmenter_parse_start_of_line__ (struct segmenter *s,
1139 const char *input, size_t n, bool eof,
1140 enum segment_type *type)
1146 assert (s->state == S_GENERAL);
1147 assert (s->substate & SS_START_OF_LINE);
1149 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1156 ofs = skip_spaces_and_comments (input, n, eof, 1);
1161 int is_string = is_start_of_string__ (input, n, eof, ofs);
1166 /* This is punctuation that may separate pieces of a string. */
1176 *type = SEG_START_COMMAND;
1177 s->substate = SS_START_OF_COMMAND;
1181 if (lex_uc_is_space (uc))
1183 int eol = at_end_of_line (input, n, eof, 0);
1188 s->substate = SS_START_OF_COMMAND;
1189 *type = SEG_SEPARATE_COMMANDS;
1195 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1197 else if (s->mode == SEG_MODE_AUTO)
1199 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
/* With interactive and auto modes handled above, only batch mode is left. */
1206 assert (s->mode == SEG_MODE_BATCH);
1208 s->substate = SS_START_OF_COMMAND;
1209 *type = SEG_START_COMMAND;
1213 s->substate = SS_START_OF_COMMAND;
1214 return segmenter_parse_mid_command__ (s, input, n, eof, type);
/* Handles state S_FILE_LABEL.  Segments INPUT with a scratch segmenter SUB
   in state S_GENERAL.  If the segment is an identifier, it must be LABEL
   (asserted); segmenter_unquoted() then checks what follows, and one
   outcome moves S to S_TITLE_1.  SUB's substate is copied back into S. */
1218 segmenter_parse_file_label__ (struct segmenter *s,
1219 const char *input, size_t n, bool eof,
1220 enum segment_type *type)
1222 struct segmenter sub;
1226 sub.state = S_GENERAL;
1227 ofs = segmenter_push (&sub, input, n, eof, type);
1231 else if (*type == SEG_IDENTIFIER)
1235 assert (lex_id_match (ss_cstr ("LABEL"),
1236 ss_buffer ((char *) input, ofs)));
1237 result = segmenter_unquoted (input, n, eof, ofs);
1243 s->state = S_TITLE_1;
1251 s->substate = sub.substate;
/* Segments INPUT as it would be segmented in state S_GENERAL, without
   changing S's own state.  A scratch segmenter SUB gets S's substate, is
   run through segmenter_push(), and its resulting substate is copied back
   into S. */
1257 segmenter_subparse (struct segmenter *s,
1258 const char *input, size_t n, bool eof,
1259 enum segment_type *type)
1261 struct segmenter sub;
1265 sub.state = S_GENERAL;
1266 sub.substate = s->substate;
1267 ofs = segmenter_push (&sub, input, n, eof, type);
1268 s->substate = sub.substate;
/* Handles state S_DO_REPEAT_1.  Segments INPUT with segmenter_subparse()
   and picks the next state from the segment type: SEG_START_COMMAND or
   SEG_SEPARATE_COMMANDS leads to S_DO_REPEAT_2, SEG_END_COMMAND leads to
   S_DO_REPEAT_3. */
1273 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1274 const char *input, size_t n, bool eof,
1275 enum segment_type *type)
1277 int ofs = segmenter_subparse (s, input, n, eof, type);
1281 if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
1282 s->state = S_DO_REPEAT_2;
1283 else if (*type == SEG_END_COMMAND)
1285 s->state = S_DO_REPEAT_3;
/* Handles state S_DO_REPEAT_2.  Segments INPUT with segmenter_subparse();
   a SEG_NEWLINE segment leads to S_DO_REPEAT_3. */
1293 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1294 const char *input, size_t n, bool eof,
1295 enum segment_type *type)
1297 int ofs = segmenter_subparse (s, input, n, eof, type);
1301 if (*type == SEG_NEWLINE)
1303 s->state = S_DO_REPEAT_3;
/* Checks whether the line in INPUT is a DO REPEAT or END REPEAT command and
   adjusts S's substate accordingly.  A leading '+' or '-' is tested for
   first.  The first identifier must match DO or END and the second must
   match REPEAT; on a match DIRECTION is added to the substate.
   NOTE(review): the substate presumably serves as a DO REPEAT nesting
   counter here, with DIRECTION positive for DO and negative for END --
   confirm. */
1311 check_repeat_command (struct segmenter *s,
1312 const char *input, size_t n, bool eof)
1319 if (input[ofs] == '+' || input[ofs] == '-')
1322 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1325 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1327 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1332 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1336 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1337 s->substate += direction;
/* Finds the extent of the line at the start of INPUT.  With no '\n' in the
   N bytes, the result is N at EOF and -1 (more input needed) otherwise.  An
   empty line, consisting of just "\n" or "\r\n", is reported as
   SEG_NEWLINE.  Otherwise the result is the length of the line's content,
   not counting a '\r' that precedes the new-line. */
1342 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1343 enum segment_type *type)
1345 const char *newline = memchr (input, '\n', n);
1347 return eof ? n : -1;
1349 ptrdiff_t ofs = newline - input;
1350 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1352 *type = SEG_NEWLINE;
/* Subtract 1 when the byte before the new-line is '\r'. */
1356 return ofs - (input[ofs - 1] == '\r');
/* Handles state S_DO_REPEAT_3.  Takes one full line at a time and runs
   check_repeat_command() on it.  When the substate reaches 0 the segmenter
   returns to S_GENERAL at the start of a line and command and re-segments
   the same INPUT through segmenter_push(); otherwise the line is reported
   as SEG_DO_REPEAT_COMMAND. */
1360 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1361 const char *input, size_t n, bool eof,
1362 enum segment_type *type)
1366 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1367 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1369 else if (!check_repeat_command (s, input, n, eof) && !eof)
1371 else if (s->substate == 0)
1373 s->state = S_GENERAL;
1374 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1375 return segmenter_push (s, input, n, eof, type);
1379 *type = SEG_DO_REPEAT_COMMAND;
/* Handles state S_BEGIN_DATA_1.  Segments INPUT with segmenter_subparse();
   a SEG_NEWLINE segment advances the state to S_BEGIN_DATA_2. */
1385 segmenter_parse_begin_data_1__ (struct segmenter *s,
1386 const char *input, size_t n, bool eof,
1387 enum segment_type *type)
1389 int ofs = segmenter_subparse (s, input, n, eof, type);
1393 if (*type == SEG_NEWLINE)
1394 s->state = S_BEGIN_DATA_2;
/* Handles state S_BEGIN_DATA_2.  Segments INPUT with segmenter_subparse();
   a SEG_NEWLINE segment advances the state to S_BEGIN_DATA_3. */
1400 segmenter_parse_begin_data_2__ (struct segmenter *s,
1401 const char *input, size_t n, bool eof,
1402 enum segment_type *type)
1404 int ofs = segmenter_subparse (s, input, n, eof, type);
1408 if (*type == SEG_NEWLINE)
1409 s->state = S_BEGIN_DATA_3;
/* Tests whether the N bytes at INPUT form an END DATA line.  The checks are
   made in order, ignoring ASCII case for the words: "END", then a
   white-space character, then "DATA", then further characters that are
   tested for being white space.  Too few bytes for a word fails the
   test. */
1415 is_end_data (const char *input, size_t n)
1417 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1423 if (n < 4 || c_strncasecmp (input, "END", 3))
1427 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1428 if (!lex_uc_is_space (uc))
1432 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1439 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1446 else if (!lex_uc_is_space (uc))
/* Handles state S_BEGIN_DATA_3: classifies one full line.  If the line is
   END DATA, the segmenter returns to S_GENERAL at the start of a line and
   command and re-segments the same INPUT through segmenter_push().
   Otherwise the line is SEG_INLINE_DATA and the next state is
   S_BEGIN_DATA_4. */
1455 segmenter_parse_begin_data_3__ (struct segmenter *s,
1456 const char *input, size_t n, bool eof,
1457 enum segment_type *type)
1461 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1464 else if (is_end_data (input, ofs))
1466 s->state = S_GENERAL;
1467 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1468 return segmenter_push (s, input, n, eof, type);
1472 *type = SEG_INLINE_DATA;
1473 s->state = S_BEGIN_DATA_4;
/* The data segment has length 0 when the byte just before OFS is '\n'. */
1474 return input[ofs - 1] == '\n' ? 0 : ofs;
/* Handles state S_BEGIN_DATA_4: consumes the new-line that follows a line
   of inline data and returns to S_BEGIN_DATA_3 for the next line. */
1479 segmenter_parse_begin_data_4__ (struct segmenter *s,
1480 const char *input, size_t n, bool eof,
1481 enum segment_type *type)
1485 ofs = segmenter_parse_newline__ (input, n, eof, type);
1489 s->state = S_BEGIN_DATA_3;
/* Handles state S_TITLE_1: skips the white space at the start of INPUT and
   moves on to S_TITLE_2. */
1494 segmenter_parse_title_1__ (struct segmenter *s,
1495 const char *input, size_t n, bool eof,
1496 enum segment_type *type)
1500 ofs = skip_spaces (input, n, eof, 0);
1503 s->state = S_TITLE_2;
/* Handles state S_TITLE_2: segments the rest of the line as
   SEG_UNQUOTED_STRING and returns to S_GENERAL.  The segment ends at ENDCMD
   when that is nonnegative and at OFS otherwise.
   NOTE(review): ENDCMD presumably records where a command-ending '.' was
   seen -- confirm. */
1509 segmenter_parse_title_2__ (struct segmenter *s,
1510 const char *input, size_t n, bool eof,
1511 enum segment_type *type)
1523 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1537 if (!lex_uc_is_space (uc))
1548 s->state = S_GENERAL;
1550 *type = SEG_UNQUOTED_STRING;
1551 return endcmd >= 0 ? endcmd : ofs;
1557 /* Returns the name of segment TYPE as a string. The caller must not modify
1558 or free the returned string.
1560 This is useful only for debugging and testing. */
1562 segment_type_to_string (enum segment_type type)
/* Each use of SEG_TYPE expands to a `case' that returns the segment
   type's name without its SEG_ prefix, via the preprocessor's #
   operator. */
1566 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
/* Returned for a TYPE that no generated case handles. */
1570 return "unknown segment type";
1574 /* Initializes S as a segmenter with the given syntax MODE.
1576 A segmenter does not contain any external references, so nothing needs to be
1577 done to destroy one. For the same reason, segmenters may be copied with
1578 plain struct assignment (or memcpy). */
1580 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
/* A new segmenter starts in S_SHBANG, so the first segmenter_push() call is
   handled by segmenter_parse_shbang__(). */
1582 s->state = S_SHBANG;
1587 /* Returns the mode passed to segmenter_init() for S. */
/* S is const, so this accessor cannot modify the segmenter. */
1589 segmenter_get_mode (const struct segmenter *s)
1594 /* Attempts to label a prefix of S's remaining input with a segment type. The
1595 caller supplies the first N bytes of the remaining input as INPUT, which
1596 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1597 are the entire (remainder) of the input; if EOF is false, then further input
1598 is potentially available.
1600 The input may contain '\n' or '\r\n' line ends in any combination.
1602 If successful, returns the number of bytes in the segment at the beginning
1603 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1604 into *TYPE. The next call to segmenter_push() should not include those
1605 bytes as part of INPUT, because they have (figuratively) been consumed by
1608 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1609 be determined. In this case segmenter_push() returns -1. If more input is
1610 available, the caller should obtain some more, then call again with a larger
1611 N. If this is not enough, the process might need to repeat again and again.
1612 If input is exhausted, then the caller may call again setting EOF to true.
1613 segmenter_push() will never return -1 when EOF is true.
1615 The caller must not, in a sequence of calls, supply contradictory input.
1616 That is, bytes provided as part of INPUT in one call, but not consumed, must
1617 not be provided with *different* values on subsequent calls. This is
1618 because segmenter_push() must often make decisions based on looking ahead
1619 beyond the bytes that it consumes. */
/* Dispatches to the parsing function for S's current state.  In the general
   state the SS_START_OF_LINE bit of the substate selects between the
   start-of-line and mid-command parsers.  Every other state has one handler
   of its own. */
1621 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1622 enum segment_type *type)
1638 return segmenter_parse_shbang__ (s, input, n, eof, type);
1641 return (s->substate & SS_START_OF_LINE
1642 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1643 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1646 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1648 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1651 return segmenter_parse_document_1__ (s, input, n, eof, type);
1653 return segmenter_parse_document_2__ (s, input, n, eof, type);
/* This handler takes no input arguments. */
1655 return segmenter_parse_document_3__ (s, type);
1658 return segmenter_parse_file_label__ (s, input, n, eof, type);
1661 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1663 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1665 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1667 case S_BEGIN_DATA_1:
1668 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1669 case S_BEGIN_DATA_2:
1670 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1671 case S_BEGIN_DATA_3:
1672 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1673 case S_BEGIN_DATA_4:
1674 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1677 return segmenter_parse_title_1__ (s, input, n, eof, type);
1679 return segmenter_parse_title_2__ (s, input, n, eof, type);
1685 /* Returns the style of command prompt to display to an interactive user for
1686 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1687 and at the beginning of a line (that is, if segmenter_push() consumed as
1688 much as possible of the input up to a new-line). */
1690 segmenter_get_prompt (const struct segmenter *s)
1695 return PROMPT_FIRST;
/* PROMPT_FIRST when SS_START_OF_COMMAND is set in the substate,
   PROMPT_LATER otherwise. */
1698 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1702 return PROMPT_COMMENT;
1706 return PROMPT_DOCUMENT;
1708 return PROMPT_FIRST;
1711 return PROMPT_LATER;
1715 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1717 return PROMPT_DO_REPEAT;
1719 case S_BEGIN_DATA_1:
1720 return PROMPT_FIRST;
1721 case S_BEGIN_DATA_2:
1722 return PROMPT_LATER;
1723 case S_BEGIN_DATA_3:
1724 case S_BEGIN_DATA_4:
1729 return PROMPT_FIRST;